ahmedheakl's picture
Add llama-3b checkpoints
1ee0294 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.436860068259386,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.010921501706484642,
"f1_execute": 0.5142857432365417,
"f1_repeat": 0.2222222238779068,
"f1_skip": 0.0,
"grad_norm": 31.125,
"learning_rate": 2e-06,
"loss": 2.8198,
"macro_f1": 0.24550265073776245,
"num_tokens": 3507.0,
"repeat_count": 1.0,
"routers_loss": 1.076732873916626,
"skip_count": 2.0,
"step": 2,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 0.021843003412969283,
"f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 38.5,
"learning_rate": 6e-06,
"loss": 3.125,
"macro_f1": 0.222222238779068,
"num_tokens": 7330.0,
"repeat_count": 0.0,
"routers_loss": 4.3143134117126465,
"skip_count": 0.0,
"step": 4,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 0.032764505119453925,
"f1_execute": 0.5999999642372131,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 33.75,
"learning_rate": 1e-05,
"loss": 3.0713,
"macro_f1": 0.19999998807907104,
"num_tokens": 11360.0,
"repeat_count": 0.0,
"routers_loss": 1.8818678855895996,
"skip_count": 0.0,
"step": 6,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 0.04368600682593857,
"f1_execute": 0.5789473652839661,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 37.25,
"learning_rate": 1.4e-05,
"loss": 2.992,
"macro_f1": 0.19298246502876282,
"num_tokens": 14241.0,
"repeat_count": 1.0,
"routers_loss": 2.340613603591919,
"skip_count": 1.0,
"step": 8,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 0.05460750853242321,
"f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 34.5,
"learning_rate": 1.8e-05,
"loss": 3.0072,
"macro_f1": 0.222222238779068,
"num_tokens": 17520.0,
"repeat_count": 0.0,
"routers_loss": 1.7916433811187744,
"skip_count": 0.0,
"step": 10,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.06552901023890785,
"f1_execute": 0.6315789818763733,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 41.25,
"learning_rate": 2.2e-05,
"loss": 3.2227,
"macro_f1": 0.21052633225917816,
"num_tokens": 20401.0,
"repeat_count": 1.0,
"routers_loss": 2.2361459732055664,
"skip_count": 1.0,
"step": 12,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 0.07645051194539249,
"f1_execute": 0.5789473652839661,
"f1_repeat": 0.0,
"f1_skip": 0.20000000298023224,
"grad_norm": 31.875,
"learning_rate": 2.6e-05,
"loss": 3.1809,
"macro_f1": 0.2596491277217865,
"num_tokens": 23722.0,
"repeat_count": 1.0,
"routers_loss": 2.6635637283325195,
"skip_count": 2.0,
"step": 14,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 0.08737201365187713,
"f1_execute": 0.6341463327407837,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 40.25,
"learning_rate": 3e-05,
"loss": 3.2606,
"macro_f1": 0.21138212084770203,
"num_tokens": 26754.0,
"repeat_count": 0.0,
"routers_loss": 1.967104196548462,
"skip_count": 0.0,
"step": 16,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 25.0,
"epoch": 0.09829351535836177,
"f1_execute": 0.5405405163764954,
"f1_repeat": 0.0,
"f1_skip": 0.1666666567325592,
"grad_norm": 39.5,
"learning_rate": 3.4000000000000007e-05,
"loss": 2.9096,
"macro_f1": 0.23573574423789978,
"num_tokens": 29878.0,
"repeat_count": 0.0,
"routers_loss": 0.6965824365615845,
"skip_count": 2.0,
"step": 18,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 0.10921501706484642,
"f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 40.75,
"learning_rate": 3.8e-05,
"loss": 3.2996,
"macro_f1": 0.222222238779068,
"num_tokens": 32410.0,
"repeat_count": 0.0,
"routers_loss": 7.038887977600098,
"skip_count": 0.0,
"step": 20,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 0.12013651877133105,
"f1_execute": 0.5641025900840759,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 32.5,
"learning_rate": 4.2000000000000004e-05,
"loss": 2.7437,
"macro_f1": 0.18803420662879944,
"num_tokens": 35122.0,
"repeat_count": 1.0,
"routers_loss": 4.3931450843811035,
"skip_count": 2.0,
"step": 22,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 0.1310580204778157,
"f1_execute": 0.6341463327407837,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 44.0,
"learning_rate": 4.6e-05,
"loss": 2.9583,
"macro_f1": 0.21138212084770203,
"num_tokens": 38647.0,
"repeat_count": 0.0,
"routers_loss": 5.246743202209473,
"skip_count": 2.0,
"step": 24,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 0.14197952218430035,
"f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 36.0,
"learning_rate": 5e-05,
"loss": 2.0258,
"macro_f1": 0.222222238779068,
"num_tokens": 41759.0,
"repeat_count": 0.0,
"routers_loss": 4.385664463043213,
"skip_count": 0.0,
"step": 26,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 0.15290102389078497,
"f1_execute": 0.6666666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 22.75,
"learning_rate": 5.4e-05,
"loss": 1.8932,
"macro_f1": 0.222222238779068,
"num_tokens": 45255.0,
"repeat_count": 1.0,
"routers_loss": 2.442974090576172,
"skip_count": 2.0,
"step": 28,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 0.16382252559726962,
"f1_execute": 0.7272726893424988,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 20.5,
"learning_rate": 5.800000000000001e-05,
"loss": 1.5961,
"macro_f1": 0.24242423474788666,
"num_tokens": 48765.0,
"repeat_count": 0.0,
"routers_loss": 1.319467306137085,
"skip_count": 3.0,
"step": 30,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.17474402730375427,
"f1_execute": 0.782608687877655,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 21.875,
"learning_rate": 6.2e-05,
"loss": 1.7529,
"macro_f1": 0.260869562625885,
"num_tokens": 51973.0,
"repeat_count": 0.0,
"routers_loss": 1.2047386169433594,
"skip_count": 2.0,
"step": 32,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.18566552901023892,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 26.875,
"learning_rate": 6.6e-05,
"loss": 1.4983,
"macro_f1": 0.29333335161209106,
"num_tokens": 54972.0,
"repeat_count": 0.0,
"routers_loss": 0.8216792345046997,
"skip_count": 0.0,
"step": 34,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.19658703071672354,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 20.75,
"learning_rate": 7.000000000000001e-05,
"loss": 1.2751,
"macro_f1": 0.3076923191547394,
"num_tokens": 58134.0,
"repeat_count": 0.0,
"routers_loss": 0.6534898281097412,
"skip_count": 0.0,
"step": 36,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.2075085324232082,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 17.75,
"learning_rate": 7.4e-05,
"loss": 0.9561,
"macro_f1": 0.29333335161209106,
"num_tokens": 61291.0,
"repeat_count": 0.0,
"routers_loss": 0.6772168278694153,
"skip_count": 2.0,
"step": 38,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.21843003412969283,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 15.875,
"learning_rate": 7.8e-05,
"loss": 0.6809,
"macro_f1": 0.307692289352417,
"num_tokens": 64406.0,
"repeat_count": 0.0,
"routers_loss": 0.7885609865188599,
"skip_count": 1.0,
"step": 40,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.22935153583617748,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 17.0,
"learning_rate": 8.2e-05,
"loss": 0.587,
"macro_f1": 0.3205128312110901,
"num_tokens": 67402.0,
"repeat_count": 1.0,
"routers_loss": 0.31721553206443787,
"skip_count": 0.0,
"step": 42,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.2402730375426621,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 12.625,
"learning_rate": 8.599999999999999e-05,
"loss": 0.4996,
"macro_f1": 0.32098764181137085,
"num_tokens": 70935.0,
"repeat_count": 0.0,
"routers_loss": 0.13094936311244965,
"skip_count": 0.0,
"step": 44,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.25119453924914675,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 12.5625,
"learning_rate": 8.999999999999999e-05,
"loss": 0.4226,
"macro_f1": 0.29333335161209106,
"num_tokens": 73716.0,
"repeat_count": 2.0,
"routers_loss": 0.48597365617752075,
"skip_count": 3.0,
"step": 46,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.2621160409556314,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.1875,
"learning_rate": 9.400000000000001e-05,
"loss": 0.2499,
"macro_f1": 0.31446540355682373,
"num_tokens": 76662.0,
"repeat_count": 0.0,
"routers_loss": 0.7850716710090637,
"skip_count": 1.0,
"step": 48,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.27303754266211605,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.5625,
"learning_rate": 9.800000000000001e-05,
"loss": 0.3029,
"macro_f1": 0.3144654333591461,
"num_tokens": 80080.0,
"repeat_count": 2.0,
"routers_loss": 1.4728330373764038,
"skip_count": 1.0,
"step": 50,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.2839590443686007,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.0625,
"learning_rate": 0.000102,
"loss": 0.2549,
"macro_f1": 0.32098764181137085,
"num_tokens": 82942.0,
"repeat_count": 0.0,
"routers_loss": 0.16784702241420746,
"skip_count": 2.0,
"step": 52,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.29488054607508535,
"f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.5625,
"learning_rate": 0.000106,
"loss": 0.2782,
"macro_f1": 0.2857142686843872,
"num_tokens": 85928.0,
"repeat_count": 1.0,
"routers_loss": 0.25518977642059326,
"skip_count": 4.0,
"step": 54,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.30580204778156994,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.75,
"learning_rate": 0.00011,
"loss": 0.2309,
"macro_f1": 0.307692289352417,
"num_tokens": 88804.0,
"repeat_count": 0.0,
"routers_loss": 0.21613653004169464,
"skip_count": 3.0,
"step": 56,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.3167235494880546,
"f1_execute": 0.8571429252624512,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.625,
"learning_rate": 0.000114,
"loss": 0.1319,
"macro_f1": 0.285714328289032,
"num_tokens": 91674.0,
"repeat_count": 1.0,
"routers_loss": 0.4971294403076172,
"skip_count": 5.0,
"step": 58,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.32764505119453924,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.625,
"learning_rate": 0.000118,
"loss": 0.1637,
"macro_f1": 0.3333333432674408,
"num_tokens": 94858.0,
"repeat_count": 0.0,
"routers_loss": 0.01838197372853756,
"skip_count": 0.0,
"step": 60,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.3385665529010239,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.859375,
"learning_rate": 0.000122,
"loss": 0.1888,
"macro_f1": 0.31446540355682373,
"num_tokens": 97538.0,
"repeat_count": 1.0,
"routers_loss": 0.5383598804473877,
"skip_count": 1.0,
"step": 62,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 31.0,
"epoch": 0.34948805460750854,
"f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.5,
"learning_rate": 0.000126,
"loss": 0.2176,
"macro_f1": 0.2857142686843872,
"num_tokens": 101249.0,
"repeat_count": 1.0,
"routers_loss": 0.2093856781721115,
"skip_count": 1.0,
"step": 64,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.3604095563139932,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.625,
"learning_rate": 0.00013000000000000002,
"loss": 0.1568,
"macro_f1": 0.3333333432674408,
"num_tokens": 104398.0,
"repeat_count": 0.0,
"routers_loss": 0.015723152086138725,
"skip_count": 0.0,
"step": 66,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.37133105802047783,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.34375,
"learning_rate": 0.000134,
"loss": 0.2764,
"macro_f1": 0.3333333432674408,
"num_tokens": 107538.0,
"repeat_count": 0.0,
"routers_loss": 0.019146224483847618,
"skip_count": 0.0,
"step": 68,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.3822525597269625,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.25,
"learning_rate": 0.00013800000000000002,
"loss": 0.2035,
"macro_f1": 0.3144654333591461,
"num_tokens": 110689.0,
"repeat_count": 3.0,
"routers_loss": 0.6408394575119019,
"skip_count": 0.0,
"step": 70,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.3931740614334471,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.6875,
"learning_rate": 0.00014199999999999998,
"loss": 0.1986,
"macro_f1": 0.32098764181137085,
"num_tokens": 114205.0,
"repeat_count": 0.0,
"routers_loss": 0.04342689737677574,
"skip_count": 0.0,
"step": 72,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 0.4040955631399317,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.0625,
"learning_rate": 0.000146,
"loss": 0.1412,
"macro_f1": 0.307692289352417,
"num_tokens": 117140.0,
"repeat_count": 0.0,
"routers_loss": 0.12777170538902283,
"skip_count": 1.0,
"step": 74,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.4150170648464164,
"f1_execute": 0.8571428060531616,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.5,
"learning_rate": 0.00015,
"loss": 0.1273,
"macro_f1": 0.2857142686843872,
"num_tokens": 120355.0,
"repeat_count": 0.0,
"routers_loss": 0.2570268511772156,
"skip_count": 5.0,
"step": 76,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.425938566552901,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.03125,
"learning_rate": 0.000154,
"loss": 0.1169,
"macro_f1": 0.3333333432674408,
"num_tokens": 123542.0,
"repeat_count": 0.0,
"routers_loss": 0.019178830087184906,
"skip_count": 0.0,
"step": 78,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.43686006825938567,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.5,
"learning_rate": 0.000158,
"loss": 0.1702,
"macro_f1": 0.3006536066532135,
"num_tokens": 126444.0,
"repeat_count": 0.0,
"routers_loss": 0.40678197145462036,
"skip_count": 4.0,
"step": 80,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.4477815699658703,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.40625,
"learning_rate": 0.000162,
"loss": 0.207,
"macro_f1": 0.3333333432674408,
"num_tokens": 129208.0,
"repeat_count": 0.0,
"routers_loss": 0.016020173206925392,
"skip_count": 0.0,
"step": 82,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.45870307167235497,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.0,
"learning_rate": 0.00016600000000000002,
"loss": 0.1469,
"macro_f1": 0.3333333432674408,
"num_tokens": 132692.0,
"repeat_count": 0.0,
"routers_loss": 0.015191584825515747,
"skip_count": 0.0,
"step": 84,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.4696245733788396,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.9375,
"learning_rate": 0.00017,
"loss": 0.1883,
"macro_f1": 0.307692289352417,
"num_tokens": 135433.0,
"repeat_count": 1.0,
"routers_loss": 0.29757800698280334,
"skip_count": 2.0,
"step": 86,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.1111111119389534,
"avg_layers": 27.0,
"epoch": 0.4805460750853242,
"f1_execute": 0.7142857313156128,
"f1_repeat": 0.0,
"f1_skip": 0.1818181872367859,
"grad_norm": 4.21875,
"learning_rate": 0.000174,
"loss": 0.2656,
"macro_f1": 0.29870131611824036,
"num_tokens": 139019.0,
"repeat_count": 2.0,
"routers_loss": 0.5406635403633118,
"skip_count": 9.0,
"step": 88,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.49146757679180886,
"f1_execute": 0.8571429252624512,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.46875,
"learning_rate": 0.000178,
"loss": 0.2149,
"macro_f1": 0.285714328289032,
"num_tokens": 142156.0,
"repeat_count": 3.0,
"routers_loss": 0.9084331393241882,
"skip_count": 3.0,
"step": 90,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 25.0,
"epoch": 0.5023890784982935,
"f1_execute": 0.8979592323303223,
"f1_repeat": 0.0,
"f1_skip": 0.3333333432674408,
"grad_norm": 6.15625,
"learning_rate": 0.000182,
"loss": 0.1461,
"macro_f1": 0.4104308784008026,
"num_tokens": 144866.0,
"repeat_count": 1.0,
"routers_loss": 0.298293799161911,
"skip_count": 3.0,
"step": 92,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.5133105802047782,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.59375,
"learning_rate": 0.000186,
"loss": 0.1432,
"macro_f1": 0.32098764181137085,
"num_tokens": 148029.0,
"repeat_count": 1.0,
"routers_loss": 0.13971005380153656,
"skip_count": 1.0,
"step": 94,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.5242320819112628,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.53125,
"learning_rate": 0.00019,
"loss": 0.1566,
"macro_f1": 0.32098764181137085,
"num_tokens": 151076.0,
"repeat_count": 0.0,
"routers_loss": 0.2203323394060135,
"skip_count": 2.0,
"step": 96,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.5351535836177475,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 10.25,
"learning_rate": 0.000194,
"loss": 0.3221,
"macro_f1": 0.32098764181137085,
"num_tokens": 153825.0,
"repeat_count": 0.0,
"routers_loss": 0.22957128286361694,
"skip_count": 2.0,
"step": 98,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 0.5460750853242321,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.46875,
"learning_rate": 0.00019800000000000002,
"loss": 0.1445,
"macro_f1": 0.3272727429866791,
"num_tokens": 157200.0,
"repeat_count": 0.0,
"routers_loss": 0.0985352173447609,
"skip_count": 0.0,
"step": 100,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.5569965870307167,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.125,
"learning_rate": 0.000202,
"loss": 0.2346,
"macro_f1": 0.3144654333591461,
"num_tokens": 161171.0,
"repeat_count": 1.0,
"routers_loss": 0.5728805065155029,
"skip_count": 2.0,
"step": 102,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 0.5679180887372014,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 4.65625,
"learning_rate": 0.000206,
"loss": 0.1532,
"macro_f1": 0.4871794879436493,
"num_tokens": 165319.0,
"repeat_count": 0.0,
"routers_loss": 0.08763546496629715,
"skip_count": 2.0,
"step": 104,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.578839590443686,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.375,
"learning_rate": 0.00021,
"loss": 0.1183,
"macro_f1": 0.3272727429866791,
"num_tokens": 168259.0,
"repeat_count": 0.0,
"routers_loss": 0.11700262129306793,
"skip_count": 1.0,
"step": 106,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.5897610921501707,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.1875,
"learning_rate": 0.000214,
"loss": 0.1856,
"macro_f1": 0.3144654333591461,
"num_tokens": 171640.0,
"repeat_count": 1.0,
"routers_loss": 0.2897156774997711,
"skip_count": 2.0,
"step": 108,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.6006825938566553,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.84375,
"learning_rate": 0.000218,
"loss": 0.1379,
"macro_f1": 0.3006536066532135,
"num_tokens": 174452.0,
"repeat_count": 0.0,
"routers_loss": 0.20764203369617462,
"skip_count": 4.0,
"step": 110,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.6116040955631399,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.9375,
"learning_rate": 0.000222,
"loss": 0.14,
"macro_f1": 0.32098764181137085,
"num_tokens": 177034.0,
"repeat_count": 0.0,
"routers_loss": 0.07773401588201523,
"skip_count": 0.0,
"step": 112,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 0.6225255972696245,
"f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.375,
"learning_rate": 0.00022600000000000002,
"loss": 0.1327,
"macro_f1": 0.2857142984867096,
"num_tokens": 180310.0,
"repeat_count": 2.0,
"routers_loss": 0.3696478605270386,
"skip_count": 2.0,
"step": 114,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.6334470989761092,
"f1_execute": 0.8333333730697632,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.984375,
"learning_rate": 0.00023,
"loss": 0.155,
"macro_f1": 0.2777777910232544,
"num_tokens": 182835.0,
"repeat_count": 3.0,
"routers_loss": 0.5024136304855347,
"skip_count": 5.0,
"step": 116,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.6443686006825938,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.71875,
"learning_rate": 0.00023400000000000002,
"loss": 0.1566,
"macro_f1": 0.3333333432674408,
"num_tokens": 186508.0,
"repeat_count": 0.0,
"routers_loss": 0.02631981112062931,
"skip_count": 0.0,
"step": 118,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.6552901023890785,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.03125,
"learning_rate": 0.00023799999999999998,
"loss": 0.1503,
"macro_f1": 0.32098764181137085,
"num_tokens": 190380.0,
"repeat_count": 0.0,
"routers_loss": 0.036612559109926224,
"skip_count": 0.0,
"step": 120,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.6662116040955631,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.28125,
"learning_rate": 0.000242,
"loss": 0.181,
"macro_f1": 0.3076923191547394,
"num_tokens": 193279.0,
"repeat_count": 1.0,
"routers_loss": 0.37753066420555115,
"skip_count": 1.0,
"step": 122,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.6771331058020478,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.75,
"learning_rate": 0.000246,
"loss": 0.1187,
"macro_f1": 0.32098767161369324,
"num_tokens": 196711.0,
"repeat_count": 0.0,
"routers_loss": 0.08419940620660782,
"skip_count": 1.0,
"step": 124,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 0.6880546075085324,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 4.0,
"learning_rate": 0.00025,
"loss": 0.1184,
"macro_f1": 0.5492662787437439,
"num_tokens": 199715.0,
"repeat_count": 0.0,
"routers_loss": 0.043020736426115036,
"skip_count": 2.0,
"step": 126,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.6989761092150171,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.9375,
"learning_rate": 0.000254,
"loss": 0.1421,
"macro_f1": 0.32098767161369324,
"num_tokens": 204217.0,
"repeat_count": 0.0,
"routers_loss": 0.0802314504981041,
"skip_count": 1.0,
"step": 128,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.7098976109215017,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.0,
"learning_rate": 0.00025800000000000004,
"loss": 0.1719,
"macro_f1": 0.32098764181137085,
"num_tokens": 206777.0,
"repeat_count": 1.0,
"routers_loss": 0.09076520055532455,
"skip_count": 1.0,
"step": 130,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.7208191126279864,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.9375,
"learning_rate": 0.000262,
"loss": 0.1423,
"macro_f1": 0.3272727429866791,
"num_tokens": 210838.0,
"repeat_count": 0.0,
"routers_loss": 0.024340573698282242,
"skip_count": 0.0,
"step": 132,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.731740614334471,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.875,
"learning_rate": 0.000266,
"loss": 0.1,
"macro_f1": 0.3333333432674408,
"num_tokens": 213498.0,
"repeat_count": 0.0,
"routers_loss": 0.016322199255228043,
"skip_count": 0.0,
"step": 134,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.7426621160409557,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.34375,
"learning_rate": 0.00027,
"loss": 0.1408,
"macro_f1": 0.3272727429866791,
"num_tokens": 216998.0,
"repeat_count": 0.0,
"routers_loss": 0.042806077748537064,
"skip_count": 1.0,
"step": 136,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.7535836177474403,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.6875,
"learning_rate": 0.00027400000000000005,
"loss": 0.1012,
"macro_f1": 0.32098764181137085,
"num_tokens": 219952.0,
"repeat_count": 0.0,
"routers_loss": 0.12166574597358704,
"skip_count": 2.0,
"step": 138,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.764505119453925,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.59375,
"learning_rate": 0.00027800000000000004,
"loss": 0.1576,
"macro_f1": 0.32098767161369324,
"num_tokens": 223326.0,
"repeat_count": 0.0,
"routers_loss": 0.12389889359474182,
"skip_count": 1.0,
"step": 140,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.7754266211604095,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.46875,
"learning_rate": 0.00028199999999999997,
"loss": 0.1554,
"macro_f1": 0.31446540355682373,
"num_tokens": 226179.0,
"repeat_count": 0.0,
"routers_loss": 0.1315135806798935,
"skip_count": 2.0,
"step": 142,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.7863481228668942,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.625,
"learning_rate": 0.00028599999999999996,
"loss": 0.1188,
"macro_f1": 0.3272727429866791,
"num_tokens": 228782.0,
"repeat_count": 0.0,
"routers_loss": 0.08095238357782364,
"skip_count": 1.0,
"step": 144,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.7972696245733788,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.5,
"learning_rate": 0.00029,
"loss": 0.1616,
"macro_f1": 0.3076923191547394,
"num_tokens": 231771.0,
"repeat_count": 0.0,
"routers_loss": 0.13997994363307953,
"skip_count": 4.0,
"step": 146,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.8081911262798634,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.0,
"learning_rate": 0.000294,
"loss": 0.1868,
"macro_f1": 0.3333333432674408,
"num_tokens": 234517.0,
"repeat_count": 0.0,
"routers_loss": 0.03245344012975693,
"skip_count": 0.0,
"step": 148,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 0.8191126279863481,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.375,
"learning_rate": 0.000298,
"loss": 0.148,
"macro_f1": 0.3006536066532135,
"num_tokens": 237324.0,
"repeat_count": 1.0,
"routers_loss": 0.36887046694755554,
"skip_count": 2.0,
"step": 150,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.8300341296928327,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.734375,
"learning_rate": 0.000302,
"loss": 0.1759,
"macro_f1": 0.3272727429866791,
"num_tokens": 240657.0,
"repeat_count": 1.0,
"routers_loss": 0.1363309770822525,
"skip_count": 0.0,
"step": 152,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.8409556313993174,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.65625,
"learning_rate": 0.000306,
"loss": 0.2043,
"macro_f1": 0.3333333432674408,
"num_tokens": 243741.0,
"repeat_count": 0.0,
"routers_loss": 0.024881718680262566,
"skip_count": 0.0,
"step": 154,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 27.0,
"epoch": 0.851877133105802,
"f1_execute": 0.8979592323303223,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 4.5625,
"learning_rate": 0.00031,
"loss": 0.1777,
"macro_f1": 0.4326530694961548,
"num_tokens": 246879.0,
"repeat_count": 1.0,
"routers_loss": 0.25227662920951843,
"skip_count": 3.0,
"step": 156,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 27.0,
"epoch": 0.8627986348122867,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 5.28125,
"learning_rate": 0.000314,
"loss": 0.1641,
"macro_f1": 0.47333335876464844,
"num_tokens": 249880.0,
"repeat_count": 2.0,
"routers_loss": 0.3088915944099426,
"skip_count": 3.0,
"step": 158,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 25.0,
"epoch": 0.8737201365187713,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.3333333432674408,
"grad_norm": 6.59375,
"learning_rate": 0.00031800000000000003,
"loss": 0.1687,
"macro_f1": 0.41777777671813965,
"num_tokens": 252725.0,
"repeat_count": 0.0,
"routers_loss": 0.11272747814655304,
"skip_count": 3.0,
"step": 160,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 0.884641638225256,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.90625,
"learning_rate": 0.000322,
"loss": 0.1408,
"macro_f1": 0.3144654333591461,
"num_tokens": 255951.0,
"repeat_count": 0.0,
"routers_loss": 0.05064187943935394,
"skip_count": 0.0,
"step": 162,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.8955631399317406,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.65625,
"learning_rate": 0.000326,
"loss": 0.1509,
"macro_f1": 0.3076923191547394,
"num_tokens": 259469.0,
"repeat_count": 0.0,
"routers_loss": 0.21262036263942719,
"skip_count": 2.0,
"step": 164,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 26.0,
"epoch": 0.9064846416382253,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 5.25,
"learning_rate": 0.00033,
"loss": 0.1578,
"macro_f1": 0.4400000274181366,
"num_tokens": 262272.0,
"repeat_count": 1.0,
"routers_loss": 0.1725386530160904,
"skip_count": 3.0,
"step": 166,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 0.9174061433447099,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.4375,
"learning_rate": 0.00033400000000000004,
"loss": 0.1471,
"macro_f1": 0.3272727429866791,
"num_tokens": 266415.0,
"repeat_count": 0.0,
"routers_loss": 0.02629087306559086,
"skip_count": 0.0,
"step": 168,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.9283276450511946,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.296875,
"learning_rate": 0.00033800000000000003,
"loss": 0.1185,
"macro_f1": 0.32098767161369324,
"num_tokens": 269700.0,
"repeat_count": 0.0,
"routers_loss": 0.05510875955224037,
"skip_count": 1.0,
"step": 170,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.9392491467576792,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.75,
"learning_rate": 0.000342,
"loss": 0.1637,
"macro_f1": 0.3006536066532135,
"num_tokens": 272587.0,
"repeat_count": 1.0,
"routers_loss": 0.27733829617500305,
"skip_count": 3.0,
"step": 172,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 0.9501706484641638,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.4375,
"learning_rate": 0.000346,
"loss": 0.2034,
"macro_f1": 0.32098764181137085,
"num_tokens": 277005.0,
"repeat_count": 0.0,
"routers_loss": 0.14457301795482635,
"skip_count": 2.0,
"step": 174,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 0.9610921501706484,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 9.125,
"learning_rate": 0.00035,
"loss": 0.154,
"macro_f1": 0.4871794879436493,
"num_tokens": 279607.0,
"repeat_count": 0.0,
"routers_loss": 0.07571296393871307,
"skip_count": 2.0,
"step": 176,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 0.9720136518771331,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.84375,
"learning_rate": 0.000354,
"loss": 0.1894,
"macro_f1": 0.32098767161369324,
"num_tokens": 282547.0,
"repeat_count": 1.0,
"routers_loss": 0.5549371838569641,
"skip_count": 0.0,
"step": 178,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 0.9829351535836177,
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
"grad_norm": 4.9375,
"learning_rate": 0.000358,
"loss": 0.1226,
"macro_f1": 0.5359477400779724,
"num_tokens": 286081.0,
"repeat_count": 2.0,
"routers_loss": 0.2509016990661621,
"skip_count": 2.0,
"step": 180,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 0.9938566552901024,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.9375,
"learning_rate": 0.000362,
"loss": 0.1795,
"macro_f1": 0.3272727429866791,
"num_tokens": 289224.0,
"repeat_count": 0.0,
"routers_loss": 0.017457736656069756,
"skip_count": 0.0,
"step": 182,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.0,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.390625,
"learning_rate": 0.000366,
"loss": 0.1471,
"macro_f1": 0.3272727429866791,
"num_tokens": 290916.0,
"repeat_count": 0.0,
"routers_loss": 0.05112108215689659,
"skip_count": 0.0,
"step": 184,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.0109215017064845,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.6875,
"learning_rate": 0.00037,
"loss": 0.1459,
"macro_f1": 0.3076923191547394,
"num_tokens": 294182.0,
"repeat_count": 3.0,
"routers_loss": 0.5592358708381653,
"skip_count": 1.0,
"step": 186,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.0218430034129693,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.890625,
"learning_rate": 0.000374,
"loss": 0.1446,
"macro_f1": 0.3333333432674408,
"num_tokens": 296702.0,
"repeat_count": 0.0,
"routers_loss": 0.006012737285345793,
"skip_count": 0.0,
"step": 188,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.0327645051194538,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.96875,
"learning_rate": 0.000378,
"loss": 0.1394,
"macro_f1": 0.31446540355682373,
"num_tokens": 300348.0,
"repeat_count": 0.0,
"routers_loss": 0.06094537675380707,
"skip_count": 2.0,
"step": 190,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.0436860068259386,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.625,
"learning_rate": 0.000382,
"loss": 0.0995,
"macro_f1": 0.3272727429866791,
"num_tokens": 303466.0,
"repeat_count": 0.0,
"routers_loss": 0.08475696295499802,
"skip_count": 1.0,
"step": 192,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.0546075085324231,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.1875,
"learning_rate": 0.000386,
"loss": 0.1749,
"macro_f1": 0.3333333432674408,
"num_tokens": 306160.0,
"repeat_count": 0.0,
"routers_loss": 0.010187637060880661,
"skip_count": 0.0,
"step": 194,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.065529010238908,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.28125,
"learning_rate": 0.00039000000000000005,
"loss": 0.1692,
"macro_f1": 0.3076923191547394,
"num_tokens": 309453.0,
"repeat_count": 1.0,
"routers_loss": 0.20142780244350433,
"skip_count": 1.0,
"step": 196,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.0764505119453924,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.90625,
"learning_rate": 0.00039400000000000004,
"loss": 0.1283,
"macro_f1": 0.3333333432674408,
"num_tokens": 312138.0,
"repeat_count": 0.0,
"routers_loss": 0.015577984042465687,
"skip_count": 0.0,
"step": 198,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 1.0873720136518772,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.46875,
"learning_rate": 0.000398,
"loss": 0.1061,
"macro_f1": 0.4803921580314636,
"num_tokens": 315833.0,
"repeat_count": 0.0,
"routers_loss": 0.1465342938899994,
"skip_count": 2.0,
"step": 200,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 1.0982935153583617,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 10.5625,
"learning_rate": 0.000402,
"loss": 0.1879,
"macro_f1": 0.32098764181137085,
"num_tokens": 318690.0,
"repeat_count": 0.0,
"routers_loss": 0.09964372962713242,
"skip_count": 0.0,
"step": 202,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 30.0,
"epoch": 1.1092150170648465,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.25,
"learning_rate": 0.00040600000000000006,
"loss": 0.1226,
"macro_f1": 0.32098764181137085,
"num_tokens": 322294.0,
"repeat_count": 0.0,
"routers_loss": 0.030282732099294662,
"skip_count": 0.0,
"step": 204,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 1.120136518771331,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.125,
"learning_rate": 0.00041,
"loss": 0.1582,
"macro_f1": 0.32098767161369324,
"num_tokens": 325029.0,
"repeat_count": 0.0,
"routers_loss": 0.24788229167461395,
"skip_count": 1.0,
"step": 206,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 25.0,
"epoch": 1.1310580204778158,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 5.9375,
"learning_rate": 0.000414,
"loss": 0.2048,
"macro_f1": 0.4871794879436493,
"num_tokens": 328178.0,
"repeat_count": 0.0,
"routers_loss": 0.031264692544937134,
"skip_count": 1.0,
"step": 208,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 25.0,
"epoch": 1.1419795221843003,
"f1_execute": 0.9166666269302368,
"f1_repeat": 0.0,
"f1_skip": 0.5714285373687744,
"grad_norm": 6.8125,
"learning_rate": 0.00041799999999999997,
"loss": 0.1756,
"macro_f1": 0.4960317313671112,
"num_tokens": 331351.0,
"repeat_count": 1.0,
"routers_loss": 0.343823105096817,
"skip_count": 4.0,
"step": 210,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.1529010238907849,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.84375,
"learning_rate": 0.000422,
"loss": 0.1246,
"macro_f1": 0.3333333432674408,
"num_tokens": 335297.0,
"repeat_count": 0.0,
"routers_loss": 0.014860679395496845,
"skip_count": 0.0,
"step": 212,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.1638225255972696,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.46875,
"learning_rate": 0.000426,
"loss": 0.1537,
"macro_f1": 0.3006536066532135,
"num_tokens": 338427.0,
"repeat_count": 1.0,
"routers_loss": 0.33231568336486816,
"skip_count": 3.0,
"step": 214,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.1747440273037544,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.59375,
"learning_rate": 0.00043,
"loss": 0.1546,
"macro_f1": 0.3333333432674408,
"num_tokens": 341158.0,
"repeat_count": 0.0,
"routers_loss": 0.007448212709277868,
"skip_count": 0.0,
"step": 216,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 1.185665529010239,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.28125,
"learning_rate": 0.00043400000000000003,
"loss": 0.1468,
"macro_f1": 0.3272727429866791,
"num_tokens": 344329.0,
"repeat_count": 0.0,
"routers_loss": 0.02311822399497032,
"skip_count": 0.0,
"step": 218,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.1965870307167235,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.15625,
"learning_rate": 0.000438,
"loss": 0.1307,
"macro_f1": 0.32098767161369324,
"num_tokens": 348948.0,
"repeat_count": 0.0,
"routers_loss": 0.02867077849805355,
"skip_count": 1.0,
"step": 220,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 1.2075085324232082,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 4.96875,
"learning_rate": 0.000442,
"loss": 0.2046,
"macro_f1": 0.5492662787437439,
"num_tokens": 351741.0,
"repeat_count": 0.0,
"routers_loss": 0.03160649910569191,
"skip_count": 2.0,
"step": 222,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.2184300341296928,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.5,
"learning_rate": 0.000446,
"loss": 0.2074,
"macro_f1": 0.3272727429866791,
"num_tokens": 354852.0,
"repeat_count": 1.0,
"routers_loss": 0.1611160784959793,
"skip_count": 0.0,
"step": 224,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 0.3333333432674408,
"avg_layers": 29.0,
"epoch": 1.2293515358361775,
"f1_execute": 0.8695651888847351,
"f1_repeat": 0.4000000059604645,
"f1_skip": 0.4000000059604645,
"grad_norm": 3.328125,
"learning_rate": 0.00045000000000000004,
"loss": 0.118,
"macro_f1": 0.5565217733383179,
"num_tokens": 357431.0,
"repeat_count": 2.0,
"routers_loss": 0.7632720470428467,
"skip_count": 3.0,
"step": 226,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.240273037542662,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.59375,
"learning_rate": 0.00045400000000000003,
"loss": 0.0965,
"macro_f1": 0.32098767161369324,
"num_tokens": 360192.0,
"repeat_count": 0.0,
"routers_loss": 0.08349918574094772,
"skip_count": 1.0,
"step": 228,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 1.2511945392491468,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 9.9375,
"learning_rate": 0.000458,
"loss": 0.1714,
"macro_f1": 0.4871794879436493,
"num_tokens": 363209.0,
"repeat_count": 0.0,
"routers_loss": 0.06626693904399872,
"skip_count": 2.0,
"step": 230,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.2621160409556313,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.25,
"learning_rate": 0.000462,
"loss": 0.1859,
"macro_f1": 0.3272727429866791,
"num_tokens": 368262.0,
"repeat_count": 0.0,
"routers_loss": 0.03743857145309448,
"skip_count": 0.0,
"step": 232,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 1.273037542662116,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.34375,
"learning_rate": 0.00046600000000000005,
"loss": 0.2281,
"macro_f1": 0.31446540355682373,
"num_tokens": 370737.0,
"repeat_count": 1.0,
"routers_loss": 0.12340149283409119,
"skip_count": 0.0,
"step": 234,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 1.2839590443686006,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.8125,
"learning_rate": 0.00047,
"loss": 0.1535,
"macro_f1": 0.32098764181137085,
"num_tokens": 373272.0,
"repeat_count": 0.0,
"routers_loss": 0.04501926526427269,
"skip_count": 0.0,
"step": 236,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 1.2948805460750854,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.625,
"learning_rate": 0.000474,
"loss": 0.1701,
"macro_f1": 0.3076923191547394,
"num_tokens": 376924.0,
"repeat_count": 1.0,
"routers_loss": 0.3543643057346344,
"skip_count": 1.0,
"step": 238,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.25,
"avg_layers": 27.0,
"epoch": 1.30580204778157,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 5.78125,
"learning_rate": 0.00047799999999999996,
"loss": 0.1553,
"macro_f1": 0.4400000274181366,
"num_tokens": 380034.0,
"repeat_count": 1.0,
"routers_loss": 0.1332877278327942,
"skip_count": 4.0,
"step": 240,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.3167235494880547,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.125,
"learning_rate": 0.000482,
"loss": 0.0874,
"macro_f1": 0.3333333432674408,
"num_tokens": 382846.0,
"repeat_count": 0.0,
"routers_loss": 0.013933669775724411,
"skip_count": 0.0,
"step": 242,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.3276450511945392,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.765625,
"learning_rate": 0.000486,
"loss": 0.1505,
"macro_f1": 0.3272727429866791,
"num_tokens": 385916.0,
"repeat_count": 0.0,
"routers_loss": 0.11566327512264252,
"skip_count": 1.0,
"step": 244,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.3385665529010238,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.9375,
"learning_rate": 0.00049,
"loss": 0.1634,
"macro_f1": 0.3272727429866791,
"num_tokens": 388768.0,
"repeat_count": 0.0,
"routers_loss": 0.015394577756524086,
"skip_count": 0.0,
"step": 246,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 1.3494880546075085,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.78125,
"learning_rate": 0.000494,
"loss": 0.1493,
"macro_f1": 0.32098764181137085,
"num_tokens": 391699.0,
"repeat_count": 0.0,
"routers_loss": 0.05529753863811493,
"skip_count": 0.0,
"step": 248,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.3604095563139933,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.75,
"learning_rate": 0.000498,
"loss": 0.2545,
"macro_f1": 0.31446540355682373,
"num_tokens": 395380.0,
"repeat_count": 1.0,
"routers_loss": 0.15498189628124237,
"skip_count": 1.0,
"step": 250,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.3713310580204778,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.8125,
"learning_rate": 0.0005020000000000001,
"loss": 0.1998,
"macro_f1": 0.31446540355682373,
"num_tokens": 398414.0,
"repeat_count": 0.0,
"routers_loss": 0.053408559411764145,
"skip_count": 2.0,
"step": 252,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 1.3822525597269624,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.4375,
"learning_rate": 0.000506,
"loss": 0.1761,
"macro_f1": 0.31446540355682373,
"num_tokens": 401690.0,
"repeat_count": 0.0,
"routers_loss": 0.15143637359142303,
"skip_count": 1.0,
"step": 254,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.3931740614334471,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.796875,
"learning_rate": 0.00051,
"loss": 0.1638,
"macro_f1": 0.3272727429866791,
"num_tokens": 404533.0,
"repeat_count": 0.0,
"routers_loss": 0.036931805312633514,
"skip_count": 1.0,
"step": 256,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 1.4040955631399317,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 7.21875,
"learning_rate": 0.000514,
"loss": 0.1765,
"macro_f1": 0.5427350401878357,
"num_tokens": 408175.0,
"repeat_count": 1.0,
"routers_loss": 0.16898785531520844,
"skip_count": 2.0,
"step": 258,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 25.0,
"epoch": 1.4150170648464164,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.875,
"learning_rate": 0.000518,
"loss": 0.2172,
"macro_f1": 0.4871794879436493,
"num_tokens": 411160.0,
"repeat_count": 0.0,
"routers_loss": 0.05883602425456047,
"skip_count": 1.0,
"step": 260,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.425938566552901,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.453125,
"learning_rate": 0.000522,
"loss": 0.1121,
"macro_f1": 0.31446540355682373,
"num_tokens": 414391.0,
"repeat_count": 0.0,
"routers_loss": 0.14810606837272644,
"skip_count": 2.0,
"step": 262,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.4368600682593857,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.46875,
"learning_rate": 0.000526,
"loss": 0.1772,
"macro_f1": 0.3272727429866791,
"num_tokens": 417763.0,
"repeat_count": 1.0,
"routers_loss": 0.20452100038528442,
"skip_count": 0.0,
"step": 264,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 26.0,
"epoch": 1.4477815699658703,
"f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 3.5,
"learning_rate": 0.0005300000000000001,
"loss": 0.1446,
"macro_f1": 0.4326530694961548,
"num_tokens": 421881.0,
"repeat_count": 2.0,
"routers_loss": 0.32300108671188354,
"skip_count": 3.0,
"step": 266,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.20000000298023224,
"avg_layers": 27.0,
"epoch": 1.458703071672355,
"f1_execute": 0.8260869383811951,
"f1_repeat": 0.0,
"f1_skip": 0.2857142984867096,
"grad_norm": 3.96875,
"learning_rate": 0.0005340000000000001,
"loss": 0.1377,
"macro_f1": 0.3706004321575165,
"num_tokens": 424938.0,
"repeat_count": 2.0,
"routers_loss": 0.5530142784118652,
"skip_count": 5.0,
"step": 268,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.4696245733788396,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.5625,
"learning_rate": 0.0005380000000000001,
"loss": 0.1457,
"macro_f1": 0.307692289352417,
"num_tokens": 427555.0,
"repeat_count": 0.0,
"routers_loss": 0.10682675242424011,
"skip_count": 3.0,
"step": 270,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.480546075085324,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.90625,
"learning_rate": 0.0005420000000000001,
"loss": 0.174,
"macro_f1": 0.3144654333591461,
"num_tokens": 430168.0,
"repeat_count": 1.0,
"routers_loss": 0.9753395318984985,
"skip_count": 2.0,
"step": 272,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.4914675767918089,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.75,
"learning_rate": 0.000546,
"loss": 0.1441,
"macro_f1": 0.3333333432674408,
"num_tokens": 433358.0,
"repeat_count": 0.0,
"routers_loss": 0.021224403753876686,
"skip_count": 0.0,
"step": 274,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.5023890784982936,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.78125,
"learning_rate": 0.00055,
"loss": 0.1624,
"macro_f1": 0.32098764181137085,
"num_tokens": 436460.0,
"repeat_count": 0.0,
"routers_loss": 0.08185791224241257,
"skip_count": 2.0,
"step": 276,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 1.5133105802047782,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.40625,
"learning_rate": 0.000554,
"loss": 0.1677,
"macro_f1": 0.3144654333591461,
"num_tokens": 439531.0,
"repeat_count": 0.0,
"routers_loss": 0.037240445613861084,
"skip_count": 0.0,
"step": 278,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.5242320819112627,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.765625,
"learning_rate": 0.000558,
"loss": 0.2688,
"macro_f1": 0.3006536066532135,
"num_tokens": 442521.0,
"repeat_count": 1.0,
"routers_loss": 0.3406132459640503,
"skip_count": 3.0,
"step": 280,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.5351535836177475,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.5625,
"learning_rate": 0.0005620000000000001,
"loss": 0.0875,
"macro_f1": 0.3333333432674408,
"num_tokens": 444942.0,
"repeat_count": 0.0,
"routers_loss": 0.006758399773389101,
"skip_count": 0.0,
"step": 282,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.5460750853242322,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.90625,
"learning_rate": 0.000566,
"loss": 0.1597,
"macro_f1": 0.3144654333591461,
"num_tokens": 448193.0,
"repeat_count": 0.0,
"routers_loss": 0.06801790744066238,
"skip_count": 0.0,
"step": 284,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.20000000298023224,
"avg_layers": 27.0,
"epoch": 1.5569965870307167,
"f1_execute": 0.8510637879371643,
"f1_repeat": 0.0,
"f1_skip": 0.3333333134651184,
"grad_norm": 4.78125,
"learning_rate": 0.00057,
"loss": 0.2027,
"macro_f1": 0.39479905366897583,
"num_tokens": 451293.0,
"repeat_count": 3.0,
"routers_loss": 0.23832914233207703,
"skip_count": 5.0,
"step": 286,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.5679180887372013,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.40625,
"learning_rate": 0.000574,
"loss": 0.1361,
"macro_f1": 0.3272727429866791,
"num_tokens": 454069.0,
"repeat_count": 1.0,
"routers_loss": 0.14267782866954803,
"skip_count": 0.0,
"step": 288,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.578839590443686,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.65625,
"learning_rate": 0.000578,
"loss": 0.1921,
"macro_f1": 0.31446540355682373,
"num_tokens": 457308.0,
"repeat_count": 0.0,
"routers_loss": 0.3219856917858124,
"skip_count": 2.0,
"step": 290,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.5897610921501708,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.90625,
"learning_rate": 0.0005819999999999999,
"loss": 0.2214,
"macro_f1": 0.31446540355682373,
"num_tokens": 460138.0,
"repeat_count": 1.0,
"routers_loss": 0.4478992521762848,
"skip_count": 1.0,
"step": 292,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.6006825938566553,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.9375,
"learning_rate": 0.0005859999999999999,
"loss": 0.2102,
"macro_f1": 0.3333333432674408,
"num_tokens": 464029.0,
"repeat_count": 0.0,
"routers_loss": 0.019972749054431915,
"skip_count": 0.0,
"step": 294,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.6116040955631399,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.5,
"learning_rate": 0.00059,
"loss": 0.1164,
"macro_f1": 0.3076923191547394,
"num_tokens": 467500.0,
"repeat_count": 1.0,
"routers_loss": 0.14752870798110962,
"skip_count": 3.0,
"step": 296,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.6225255972696244,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.75,
"learning_rate": 0.000594,
"loss": 0.1434,
"macro_f1": 0.32098764181137085,
"num_tokens": 470734.0,
"repeat_count": 1.0,
"routers_loss": 0.30419600009918213,
"skip_count": 1.0,
"step": 298,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 1.6334470989761092,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.375,
"learning_rate": 0.000598,
"loss": 0.2077,
"macro_f1": 0.31446540355682373,
"num_tokens": 474514.0,
"repeat_count": 0.0,
"routers_loss": 0.06921514868736267,
"skip_count": 2.0,
"step": 300,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.644368600682594,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.125,
"learning_rate": 0.000602,
"loss": 0.1566,
"macro_f1": 0.3076923191547394,
"num_tokens": 477393.0,
"repeat_count": 0.0,
"routers_loss": 0.2468976378440857,
"skip_count": 2.0,
"step": 302,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.6552901023890785,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.375,
"learning_rate": 0.000606,
"loss": 0.1649,
"macro_f1": 0.3272727429866791,
"num_tokens": 480381.0,
"repeat_count": 0.0,
"routers_loss": 0.020447812974452972,
"skip_count": 0.0,
"step": 304,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.666211604095563,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.5,
"learning_rate": 0.00061,
"loss": 0.1423,
"macro_f1": 0.31446540355682373,
"num_tokens": 483502.0,
"repeat_count": 0.0,
"routers_loss": 0.05023586004972458,
"skip_count": 1.0,
"step": 306,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 1.6771331058020478,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.0,
"learning_rate": 0.000614,
"loss": 0.2042,
"macro_f1": 0.3144654333591461,
"num_tokens": 488006.0,
"repeat_count": 0.0,
"routers_loss": 0.049936871975660324,
"skip_count": 0.0,
"step": 308,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.6880546075085325,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.46875,
"learning_rate": 0.0006180000000000001,
"loss": 0.2121,
"macro_f1": 0.3272727429866791,
"num_tokens": 491611.0,
"repeat_count": 1.0,
"routers_loss": 0.20010031759738922,
"skip_count": 0.0,
"step": 310,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.698976109215017,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.09375,
"learning_rate": 0.000622,
"loss": 0.2415,
"macro_f1": 0.3333333432674408,
"num_tokens": 494903.0,
"repeat_count": 0.0,
"routers_loss": 0.01630268059670925,
"skip_count": 0.0,
"step": 312,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.7098976109215016,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.90625,
"learning_rate": 0.000626,
"loss": 0.2042,
"macro_f1": 0.32098767161369324,
"num_tokens": 497949.0,
"repeat_count": 0.0,
"routers_loss": 0.2674679160118103,
"skip_count": 1.0,
"step": 314,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.5,
"avg_layers": 28.0,
"epoch": 1.7208191126279864,
"f1_execute": 0.9803921580314636,
"f1_repeat": 1.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 2.5,
"learning_rate": 0.00063,
"loss": 0.1844,
"macro_f1": 0.8823530077934265,
"num_tokens": 501082.0,
"repeat_count": 1.0,
"routers_loss": 0.1621737778186798,
"skip_count": 2.0,
"step": 316,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.7317406143344711,
"f1_execute": 0.8979592323303223,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
"grad_norm": 9.125,
"learning_rate": 0.000634,
"loss": 0.1708,
"macro_f1": 0.5215420126914978,
"num_tokens": 504131.0,
"repeat_count": 2.0,
"routers_loss": 0.6877225041389465,
"skip_count": 2.0,
"step": 318,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 1.7426621160409557,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.6875,
"learning_rate": 0.000638,
"loss": 0.1874,
"macro_f1": 0.29333335161209106,
"num_tokens": 507012.0,
"repeat_count": 0.0,
"routers_loss": 0.14521881937980652,
"skip_count": 2.0,
"step": 320,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 23.0,
"epoch": 1.7535836177474402,
"f1_execute": 0.8936170339584351,
"f1_repeat": 0.0,
"f1_skip": 0.444444477558136,
"grad_norm": 4.46875,
"learning_rate": 0.000642,
"loss": 0.1489,
"macro_f1": 0.44602054357528687,
"num_tokens": 509950.0,
"repeat_count": 0.0,
"routers_loss": 0.15650968253612518,
"skip_count": 4.0,
"step": 322,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 1.764505119453925,
"f1_execute": 0.8333333730697632,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.96875,
"learning_rate": 0.000646,
"loss": 0.163,
"macro_f1": 0.2777777910232544,
"num_tokens": 512900.0,
"repeat_count": 2.0,
"routers_loss": 0.3924711048603058,
"skip_count": 3.0,
"step": 324,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 1.7754266211604095,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 2.546875,
"learning_rate": 0.0006500000000000001,
"loss": 0.1452,
"macro_f1": 0.5492662787437439,
"num_tokens": 516233.0,
"repeat_count": 0.0,
"routers_loss": 0.038907092064619064,
"skip_count": 2.0,
"step": 326,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.7863481228668943,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.796875,
"learning_rate": 0.0006540000000000001,
"loss": 0.1641,
"macro_f1": 0.3333333432674408,
"num_tokens": 519636.0,
"repeat_count": 0.0,
"routers_loss": 0.0022514634765684605,
"skip_count": 0.0,
"step": 328,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 28.0,
"epoch": 1.7972696245733788,
"f1_execute": 0.9166666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 7.03125,
"learning_rate": 0.0006580000000000001,
"loss": 0.2761,
"macro_f1": 0.4722222685813904,
"num_tokens": 522992.0,
"repeat_count": 2.0,
"routers_loss": 0.4415050148963928,
"skip_count": 2.0,
"step": 330,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 1.8081911262798633,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.4375,
"learning_rate": 0.000662,
"loss": 0.1657,
"macro_f1": 0.32098767161369324,
"num_tokens": 526843.0,
"repeat_count": 0.0,
"routers_loss": 0.06788615882396698,
"skip_count": 1.0,
"step": 332,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 1.819112627986348,
"f1_execute": 0.9811320900917053,
"f1_repeat": 1.0,
"f1_skip": 0.0,
"grad_norm": 5.78125,
"learning_rate": 0.000666,
"loss": 0.1996,
"macro_f1": 0.6603773832321167,
"num_tokens": 530177.0,
"repeat_count": 1.0,
"routers_loss": 0.06985973566770554,
"skip_count": 1.0,
"step": 334,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.8300341296928329,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.46875,
"learning_rate": 0.00067,
"loss": 0.1877,
"macro_f1": 0.307692289352417,
"num_tokens": 533183.0,
"repeat_count": 1.0,
"routers_loss": 0.33230671286582947,
"skip_count": 2.0,
"step": 336,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.8409556313993174,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.6875,
"learning_rate": 0.000674,
"loss": 0.1249,
"macro_f1": 0.3076923191547394,
"num_tokens": 536858.0,
"repeat_count": 0.0,
"routers_loss": 0.15104004740715027,
"skip_count": 2.0,
"step": 338,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.851877133105802,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.625,
"learning_rate": 0.0006780000000000001,
"loss": 0.1885,
"macro_f1": 0.3272727429866791,
"num_tokens": 540769.0,
"repeat_count": 0.0,
"routers_loss": 0.032123174518346786,
"skip_count": 0.0,
"step": 340,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.8627986348122867,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.96875,
"learning_rate": 0.0006820000000000001,
"loss": 0.1809,
"macro_f1": 0.3272727429866791,
"num_tokens": 543783.0,
"repeat_count": 0.0,
"routers_loss": 0.05651572719216347,
"skip_count": 1.0,
"step": 342,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 1.8737201365187715,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.6875,
"learning_rate": 0.0006860000000000001,
"loss": 0.1804,
"macro_f1": 0.3076923191547394,
"num_tokens": 547125.0,
"repeat_count": 0.0,
"routers_loss": 0.13617995381355286,
"skip_count": 2.0,
"step": 344,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.884641638225256,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.65625,
"learning_rate": 0.00069,
"loss": 0.204,
"macro_f1": 0.3272727429866791,
"num_tokens": 550591.0,
"repeat_count": 0.0,
"routers_loss": 0.023369189351797104,
"skip_count": 0.0,
"step": 346,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.8955631399317405,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.625,
"learning_rate": 0.000694,
"loss": 0.2275,
"macro_f1": 0.3272727429866791,
"num_tokens": 553785.0,
"repeat_count": 0.0,
"routers_loss": 0.09765879064798355,
"skip_count": 1.0,
"step": 348,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.9064846416382253,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 10.5,
"learning_rate": 0.0006979999999999999,
"loss": 0.4191,
"macro_f1": 0.3333333432674408,
"num_tokens": 556135.0,
"repeat_count": 0.0,
"routers_loss": 0.011158714070916176,
"skip_count": 0.0,
"step": 350,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.91740614334471,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.53125,
"learning_rate": 0.0007019999999999999,
"loss": 0.1557,
"macro_f1": 0.3272727429866791,
"num_tokens": 558980.0,
"repeat_count": 0.0,
"routers_loss": 0.036593515425920486,
"skip_count": 0.0,
"step": 352,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.25,
"avg_layers": 26.0,
"epoch": 1.9283276450511946,
"f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
"f1_skip": 0.3333333432674408,
"grad_norm": 4.1875,
"learning_rate": 0.0007059999999999999,
"loss": 0.183,
"macro_f1": 0.4104308485984802,
"num_tokens": 562187.0,
"repeat_count": 1.0,
"routers_loss": 0.48064568638801575,
"skip_count": 4.0,
"step": 354,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 1.9392491467576791,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.0,
"learning_rate": 0.00071,
"loss": 0.1982,
"macro_f1": 0.32098767161369324,
"num_tokens": 565278.0,
"repeat_count": 0.0,
"routers_loss": 0.13826458156108856,
"skip_count": 1.0,
"step": 356,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 1.9501706484641637,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.90625,
"learning_rate": 0.000714,
"loss": 0.2709,
"macro_f1": 0.3333333432674408,
"num_tokens": 567869.0,
"repeat_count": 0.0,
"routers_loss": 0.01589345932006836,
"skip_count": 0.0,
"step": 358,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 1.9610921501706484,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.09375,
"learning_rate": 0.000718,
"loss": 0.1902,
"macro_f1": 0.3272727429866791,
"num_tokens": 571069.0,
"repeat_count": 0.0,
"routers_loss": 0.029062755405902863,
"skip_count": 0.0,
"step": 360,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 1.9720136518771332,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.125,
"learning_rate": 0.000722,
"loss": 0.2125,
"macro_f1": 0.3076923191547394,
"num_tokens": 573838.0,
"repeat_count": 1.0,
"routers_loss": 0.3241157531738281,
"skip_count": 1.0,
"step": 362,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 1.9829351535836177,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.46875,
"learning_rate": 0.000726,
"loss": 0.2176,
"macro_f1": 0.3272727429866791,
"num_tokens": 576554.0,
"repeat_count": 0.0,
"routers_loss": 0.03469887003302574,
"skip_count": 0.0,
"step": 364,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 25.0,
"epoch": 1.9938566552901023,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 7.34375,
"learning_rate": 0.00073,
"loss": 0.182,
"macro_f1": 0.4803921580314636,
"num_tokens": 579653.0,
"repeat_count": 1.0,
"routers_loss": 0.11800751090049744,
"skip_count": 1.0,
"step": 366,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.0,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 15.125,
"learning_rate": 0.000734,
"loss": 0.3307,
"macro_f1": 0.3333333432674408,
"num_tokens": 581832.0,
"repeat_count": 0.0,
"routers_loss": 0.014465595595538616,
"skip_count": 0.0,
"step": 368,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 2.0109215017064845,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.375,
"learning_rate": 0.000738,
"loss": 0.1482,
"macro_f1": 0.3272727429866791,
"num_tokens": 585207.0,
"repeat_count": 0.0,
"routers_loss": 0.030198052525520325,
"skip_count": 0.0,
"step": 370,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.021843003412969,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.78125,
"learning_rate": 0.000742,
"loss": 0.0906,
"macro_f1": 0.32098767161369324,
"num_tokens": 588893.0,
"repeat_count": 0.0,
"routers_loss": 0.04226446524262428,
"skip_count": 1.0,
"step": 372,
"text_loss": 0.0
},
{
"acc_repeat": 0.6666666865348816,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 2.032764505119454,
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.800000011920929,
"f1_skip": 1.0,
"grad_norm": 8.0625,
"learning_rate": 0.000746,
"loss": 0.2092,
"macro_f1": 0.9259259104728699,
"num_tokens": 592246.0,
"repeat_count": 3.0,
"routers_loss": 0.05995782092213631,
"skip_count": 3.0,
"step": 374,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 2.0436860068259386,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.03125,
"learning_rate": 0.00075,
"loss": 0.1724,
"macro_f1": 0.3006536066532135,
"num_tokens": 594777.0,
"repeat_count": 0.0,
"routers_loss": 0.14366891980171204,
"skip_count": 3.0,
"step": 376,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.054607508532423,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 1.90625,
"learning_rate": 0.000754,
"loss": 0.0803,
"macro_f1": 0.3333333432674408,
"num_tokens": 597931.0,
"repeat_count": 0.0,
"routers_loss": 0.0027963866014033556,
"skip_count": 0.0,
"step": 378,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 29.0,
"epoch": 2.0655290102389077,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 6.28125,
"learning_rate": 0.000758,
"loss": 0.2873,
"macro_f1": 0.5359477400779724,
"num_tokens": 601227.0,
"repeat_count": 0.0,
"routers_loss": 0.15012779831886292,
"skip_count": 2.0,
"step": 380,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 2.0764505119453927,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
"grad_norm": 4.96875,
"learning_rate": 0.000762,
"loss": 0.1602,
"macro_f1": 0.5427350401878357,
"num_tokens": 604297.0,
"repeat_count": 2.0,
"routers_loss": 0.0708698183298111,
"skip_count": 1.0,
"step": 382,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.20000000298023224,
"avg_layers": 28.0,
"epoch": 2.087372013651877,
"f1_execute": 0.8510638475418091,
"f1_repeat": 0.0,
"f1_skip": 0.3333333134651184,
"grad_norm": 8.25,
"learning_rate": 0.0007660000000000001,
"loss": 0.1786,
"macro_f1": 0.3947990834712982,
"num_tokens": 607137.0,
"repeat_count": 2.0,
"routers_loss": 0.46035754680633545,
"skip_count": 5.0,
"step": 384,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 2.0982935153583617,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 9.8125,
"learning_rate": 0.0007700000000000001,
"loss": 0.1415,
"macro_f1": 0.4871794879436493,
"num_tokens": 610067.0,
"repeat_count": 0.0,
"routers_loss": 0.04594701901078224,
"skip_count": 2.0,
"step": 386,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.25,
"avg_layers": 26.0,
"epoch": 2.1092150170648463,
"f1_execute": 0.9387754797935486,
"f1_repeat": 0.0,
"f1_skip": 0.3333333432674408,
"grad_norm": 5.59375,
"learning_rate": 0.0007740000000000001,
"loss": 0.1453,
"macro_f1": 0.42403626441955566,
"num_tokens": 613020.0,
"repeat_count": 1.0,
"routers_loss": 0.21872307360172272,
"skip_count": 4.0,
"step": 388,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.1201365187713312,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.71875,
"learning_rate": 0.000778,
"loss": 0.2459,
"macro_f1": 0.3006536066532135,
"num_tokens": 615777.0,
"repeat_count": 0.0,
"routers_loss": 0.17068128287792206,
"skip_count": 3.0,
"step": 390,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 2.131058020477816,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 3.40625,
"learning_rate": 0.000782,
"loss": 0.1734,
"macro_f1": 0.5492662787437439,
"num_tokens": 618883.0,
"repeat_count": 0.0,
"routers_loss": 0.06883871555328369,
"skip_count": 2.0,
"step": 392,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 2.1419795221843003,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 4.4375,
"learning_rate": 0.000786,
"loss": 0.1822,
"macro_f1": 0.4871794879436493,
"num_tokens": 621785.0,
"repeat_count": 0.0,
"routers_loss": 0.021629702299833298,
"skip_count": 2.0,
"step": 394,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 2.152901023890785,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 10.4375,
"learning_rate": 0.00079,
"loss": 0.2188,
"macro_f1": 0.4871794879436493,
"num_tokens": 624497.0,
"repeat_count": 0.0,
"routers_loss": 0.02989846095442772,
"skip_count": 2.0,
"step": 396,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.1638225255972694,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.03125,
"learning_rate": 0.0007940000000000001,
"loss": 0.2,
"macro_f1": 0.3333333432674408,
"num_tokens": 627530.0,
"repeat_count": 0.0,
"routers_loss": 0.0030090075451880693,
"skip_count": 0.0,
"step": 398,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.1747440273037544,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.625,
"learning_rate": 0.0007980000000000001,
"loss": 0.1503,
"macro_f1": 0.3272727429866791,
"num_tokens": 630816.0,
"repeat_count": 0.0,
"routers_loss": 0.02026674523949623,
"skip_count": 0.0,
"step": 400,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.185665529010239,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.6875,
"learning_rate": 0.0008020000000000001,
"loss": 0.1285,
"macro_f1": 0.3272727429866791,
"num_tokens": 633715.0,
"repeat_count": 1.0,
"routers_loss": 0.08777285367250443,
"skip_count": 0.0,
"step": 402,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.1965870307167235,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.59375,
"learning_rate": 0.0008060000000000001,
"loss": 0.186,
"macro_f1": 0.3272727429866791,
"num_tokens": 636871.0,
"repeat_count": 0.0,
"routers_loss": 0.049915000796318054,
"skip_count": 1.0,
"step": 404,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 2.207508532423208,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 5.34375,
"learning_rate": 0.0008100000000000001,
"loss": 0.1592,
"macro_f1": 0.5492662787437439,
"num_tokens": 639784.0,
"repeat_count": 0.0,
"routers_loss": 0.05443386733531952,
"skip_count": 2.0,
"step": 406,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 2.218430034129693,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.90625,
"learning_rate": 0.0008139999999999999,
"loss": 0.1947,
"macro_f1": 0.3272727429866791,
"num_tokens": 642682.0,
"repeat_count": 0.0,
"routers_loss": 0.021953796967864037,
"skip_count": 0.0,
"step": 408,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.2293515358361775,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.625,
"learning_rate": 0.0008179999999999999,
"loss": 0.2197,
"macro_f1": 0.3333333432674408,
"num_tokens": 645962.0,
"repeat_count": 0.0,
"routers_loss": 0.010657553561031818,
"skip_count": 0.0,
"step": 410,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.240273037542662,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.421875,
"learning_rate": 0.0008219999999999999,
"loss": 0.2091,
"macro_f1": 0.3333333432674408,
"num_tokens": 649180.0,
"repeat_count": 0.0,
"routers_loss": 0.013879667967557907,
"skip_count": 0.0,
"step": 412,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.2511945392491466,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.0625,
"learning_rate": 0.000826,
"loss": 0.1555,
"macro_f1": 0.31446540355682373,
"num_tokens": 653015.0,
"repeat_count": 0.0,
"routers_loss": 0.12807206809520721,
"skip_count": 2.0,
"step": 414,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.6666666865348816,
"avg_layers": 25.0,
"epoch": 2.2621160409556316,
"f1_execute": 0.9166666269302368,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 3.453125,
"learning_rate": 0.00083,
"loss": 0.1335,
"macro_f1": 0.5277777910232544,
"num_tokens": 655892.0,
"repeat_count": 2.0,
"routers_loss": 0.8250671625137329,
"skip_count": 3.0,
"step": 416,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 2.273037542662116,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 5.8125,
"learning_rate": 0.000834,
"loss": 0.1831,
"macro_f1": 0.5492662787437439,
"num_tokens": 658426.0,
"repeat_count": 0.0,
"routers_loss": 0.03139641508460045,
"skip_count": 2.0,
"step": 418,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.2839590443686006,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
"grad_norm": 3.40625,
"learning_rate": 0.000838,
"loss": 0.1345,
"macro_f1": 0.5427350401878357,
"num_tokens": 661809.0,
"repeat_count": 2.0,
"routers_loss": 0.0441780611872673,
"skip_count": 0.0,
"step": 420,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.294880546075085,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.84375,
"learning_rate": 0.000842,
"loss": 0.1127,
"macro_f1": 0.3272727429866791,
"num_tokens": 664874.0,
"repeat_count": 0.0,
"routers_loss": 0.44332680106163025,
"skip_count": 1.0,
"step": 422,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.3058020477815697,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.96875,
"learning_rate": 0.000846,
"loss": 0.1225,
"macro_f1": 0.3272727429866791,
"num_tokens": 668325.0,
"repeat_count": 0.0,
"routers_loss": 0.059455983340740204,
"skip_count": 0.0,
"step": 424,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 2.3167235494880547,
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 6.8125,
"learning_rate": 0.00085,
"loss": 0.1816,
"macro_f1": 0.5359477400779724,
"num_tokens": 671097.0,
"repeat_count": 2.0,
"routers_loss": 0.3154633641242981,
"skip_count": 2.0,
"step": 426,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 25.0,
"epoch": 2.3276450511945392,
"f1_execute": 0.8979592323303223,
"f1_repeat": 0.0,
"f1_skip": 0.3333333432674408,
"grad_norm": 5.65625,
"learning_rate": 0.000854,
"loss": 0.122,
"macro_f1": 0.4104308784008026,
"num_tokens": 674042.0,
"repeat_count": 1.0,
"routers_loss": 0.4580267667770386,
"skip_count": 3.0,
"step": 428,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.3385665529010238,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.21875,
"learning_rate": 0.000858,
"loss": 0.1113,
"macro_f1": 0.3272727429866791,
"num_tokens": 677016.0,
"repeat_count": 0.0,
"routers_loss": 0.015222650021314621,
"skip_count": 0.0,
"step": 430,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.3494880546075088,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.78125,
"learning_rate": 0.000862,
"loss": 0.1379,
"macro_f1": 0.3333333432674408,
"num_tokens": 679990.0,
"repeat_count": 1.0,
"routers_loss": 0.24279196560382843,
"skip_count": 0.0,
"step": 432,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.3604095563139933,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.28125,
"learning_rate": 0.000866,
"loss": 0.1476,
"macro_f1": 0.3333333432674408,
"num_tokens": 682786.0,
"repeat_count": 1.0,
"routers_loss": 0.1684337556362152,
"skip_count": 0.0,
"step": 434,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.371331058020478,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.0,
"learning_rate": 0.00087,
"loss": 0.1204,
"macro_f1": 0.3272727429866791,
"num_tokens": 685882.0,
"repeat_count": 1.0,
"routers_loss": 0.19464725255966187,
"skip_count": 0.0,
"step": 436,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.3822525597269624,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.71875,
"learning_rate": 0.000874,
"loss": 0.1124,
"macro_f1": 0.32098764181137085,
"num_tokens": 689570.0,
"repeat_count": 0.0,
"routers_loss": 0.05968143790960312,
"skip_count": 2.0,
"step": 438,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.393174061433447,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.84375,
"learning_rate": 0.000878,
"loss": 0.1528,
"macro_f1": 0.3333333432674408,
"num_tokens": 693559.0,
"repeat_count": 0.0,
"routers_loss": 0.004517437424510717,
"skip_count": 0.0,
"step": 440,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.404095563139932,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.125,
"learning_rate": 0.000882,
"loss": 0.1353,
"macro_f1": 0.3006536066532135,
"num_tokens": 696374.0,
"repeat_count": 0.0,
"routers_loss": 0.26632770895957947,
"skip_count": 2.0,
"step": 442,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.4150170648464164,
"f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.75,
"learning_rate": 0.0008860000000000001,
"loss": 0.1874,
"macro_f1": 0.2857142984867096,
"num_tokens": 699954.0,
"repeat_count": 1.0,
"routers_loss": 0.3751397728919983,
"skip_count": 3.0,
"step": 444,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.425938566552901,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.28125,
"learning_rate": 0.0008900000000000001,
"loss": 0.2139,
"macro_f1": 0.32098764181137085,
"num_tokens": 703477.0,
"repeat_count": 0.0,
"routers_loss": 0.2166936844587326,
"skip_count": 2.0,
"step": 446,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.4368600682593855,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.0625,
"learning_rate": 0.000894,
"loss": 0.3078,
"macro_f1": 0.3333333432674408,
"num_tokens": 706342.0,
"repeat_count": 0.0,
"routers_loss": 0.004165076185017824,
"skip_count": 0.0,
"step": 448,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.4477815699658705,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.5625,
"learning_rate": 0.000898,
"loss": 0.3248,
"macro_f1": 0.307692289352417,
"num_tokens": 709048.0,
"repeat_count": 0.0,
"routers_loss": 0.11787679046392441,
"skip_count": 1.0,
"step": 450,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.458703071672355,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.875,
"learning_rate": 0.000902,
"loss": 0.2151,
"macro_f1": 0.31446540355682373,
"num_tokens": 712168.0,
"repeat_count": 2.0,
"routers_loss": 0.24694015085697174,
"skip_count": 0.0,
"step": 452,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 2.4696245733788396,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 9.0625,
"learning_rate": 0.000906,
"loss": 0.1899,
"macro_f1": 0.5492662787437439,
"num_tokens": 715867.0,
"repeat_count": 0.0,
"routers_loss": 0.14055466651916504,
"skip_count": 2.0,
"step": 454,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.480546075085324,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.84375,
"learning_rate": 0.00091,
"loss": 0.136,
"macro_f1": 0.32098764181137085,
"num_tokens": 718940.0,
"repeat_count": 0.0,
"routers_loss": 0.2996567487716675,
"skip_count": 2.0,
"step": 456,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 2.491467576791809,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 8.5625,
"learning_rate": 0.0009140000000000001,
"loss": 0.2439,
"macro_f1": 0.5492662787437439,
"num_tokens": 721407.0,
"repeat_count": 0.0,
"routers_loss": 0.032011453062295914,
"skip_count": 2.0,
"step": 458,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 2.5023890784982936,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 11.0,
"learning_rate": 0.0009180000000000001,
"loss": 0.2592,
"macro_f1": 0.3144654333591461,
"num_tokens": 726056.0,
"repeat_count": 0.0,
"routers_loss": 0.06647517532110214,
"skip_count": 0.0,
"step": 460,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.513310580204778,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.875,
"learning_rate": 0.0009220000000000001,
"loss": 0.1904,
"macro_f1": 0.32098764181137085,
"num_tokens": 729038.0,
"repeat_count": 0.0,
"routers_loss": 0.08919267356395721,
"skip_count": 0.0,
"step": 462,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 2.5242320819112627,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.46875,
"learning_rate": 0.0009260000000000001,
"loss": 0.1969,
"macro_f1": 0.3006536066532135,
"num_tokens": 732172.0,
"repeat_count": 0.0,
"routers_loss": 0.4903416037559509,
"skip_count": 2.0,
"step": 464,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 2.5351535836177472,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 5.5,
"learning_rate": 0.00093,
"loss": 0.1957,
"macro_f1": 0.6666666865348816,
"num_tokens": 735282.0,
"repeat_count": 0.0,
"routers_loss": 0.025489339604973793,
"skip_count": 2.0,
"step": 466,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.546075085324232,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.28125,
"learning_rate": 0.000934,
"loss": 0.2198,
"macro_f1": 0.3333333432674408,
"num_tokens": 739208.0,
"repeat_count": 0.0,
"routers_loss": 0.013121264986693859,
"skip_count": 0.0,
"step": 468,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.5569965870307167,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.15625,
"learning_rate": 0.0009379999999999999,
"loss": 0.3641,
"macro_f1": 0.32098764181137085,
"num_tokens": 741980.0,
"repeat_count": 0.0,
"routers_loss": 0.45740270614624023,
"skip_count": 2.0,
"step": 470,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.5679180887372013,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.90625,
"learning_rate": 0.000942,
"loss": 0.1668,
"macro_f1": 0.31446540355682373,
"num_tokens": 745551.0,
"repeat_count": 0.0,
"routers_loss": 0.1244814470410347,
"skip_count": 2.0,
"step": 472,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 2.5788395904436863,
"f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 12.25,
"learning_rate": 0.000946,
"loss": 0.2807,
"macro_f1": 0.2857142984867096,
"num_tokens": 748488.0,
"repeat_count": 1.0,
"routers_loss": 0.3303976058959961,
"skip_count": 3.0,
"step": 474,
"text_loss": 0.0
},
{
"acc_repeat": 0.3333333432674408,
"acc_skip": 0.0,
"avg_layers": 30.0,
"epoch": 2.589761092150171,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.4000000059604645,
"f1_skip": 0.0,
"grad_norm": 3.640625,
"learning_rate": 0.00095,
"loss": 0.1353,
"macro_f1": 0.44705885648727417,
"num_tokens": 752865.0,
"repeat_count": 3.0,
"routers_loss": 0.24396798014640808,
"skip_count": 0.0,
"step": 476,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 27.0,
"epoch": 2.6006825938566553,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 5.59375,
"learning_rate": 0.000954,
"loss": 0.1584,
"macro_f1": 0.4400000274181366,
"num_tokens": 755653.0,
"repeat_count": 0.0,
"routers_loss": 0.09343712776899338,
"skip_count": 3.0,
"step": 478,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.61160409556314,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.4375,
"learning_rate": 0.000958,
"loss": 0.2014,
"macro_f1": 0.3272727429866791,
"num_tokens": 758567.0,
"repeat_count": 0.0,
"routers_loss": 0.03879999741911888,
"skip_count": 1.0,
"step": 480,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.6225255972696244,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.5625,
"learning_rate": 0.000962,
"loss": 0.2174,
"macro_f1": 0.32098764181137085,
"num_tokens": 762013.0,
"repeat_count": 0.0,
"routers_loss": 0.13902239501476288,
"skip_count": 2.0,
"step": 482,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.6334470989761094,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.125,
"learning_rate": 0.000966,
"loss": 0.2322,
"macro_f1": 0.3272727429866791,
"num_tokens": 764820.0,
"repeat_count": 0.0,
"routers_loss": 0.0281832292675972,
"skip_count": 0.0,
"step": 484,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 2.644368600682594,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 11.25,
"learning_rate": 0.0009699999999999999,
"loss": 0.178,
"macro_f1": 0.29333335161209106,
"num_tokens": 767962.0,
"repeat_count": 0.0,
"routers_loss": 0.3387240767478943,
"skip_count": 2.0,
"step": 486,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 2.6552901023890785,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.4375,
"learning_rate": 0.000974,
"loss": 0.1818,
"macro_f1": 0.32098764181137085,
"num_tokens": 771189.0,
"repeat_count": 0.0,
"routers_loss": 0.033774666488170624,
"skip_count": 0.0,
"step": 488,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.666211604095563,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.875,
"learning_rate": 0.000978,
"loss": 0.2071,
"macro_f1": 0.3333333432674408,
"num_tokens": 774073.0,
"repeat_count": 0.0,
"routers_loss": 0.009604716673493385,
"skip_count": 0.0,
"step": 490,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.6771331058020476,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.46875,
"learning_rate": 0.000982,
"loss": 0.1853,
"macro_f1": 0.3333333432674408,
"num_tokens": 776722.0,
"repeat_count": 0.0,
"routers_loss": 0.0034638401120901108,
"skip_count": 0.0,
"step": 492,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.6880546075085325,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.6875,
"learning_rate": 0.0009860000000000001,
"loss": 0.2882,
"macro_f1": 0.32098764181137085,
"num_tokens": 780051.0,
"repeat_count": 0.0,
"routers_loss": 0.08520562946796417,
"skip_count": 0.0,
"step": 494,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.698976109215017,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.875,
"learning_rate": 0.00099,
"loss": 0.1995,
"macro_f1": 0.3272727429866791,
"num_tokens": 782813.0,
"repeat_count": 0.0,
"routers_loss": 0.16369783878326416,
"skip_count": 1.0,
"step": 496,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 2.7098976109215016,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.859375,
"learning_rate": 0.000994,
"loss": 0.1725,
"macro_f1": 0.3006536066532135,
"num_tokens": 785376.0,
"repeat_count": 0.0,
"routers_loss": 0.17243081331253052,
"skip_count": 2.0,
"step": 498,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.25,
"avg_layers": 26.0,
"epoch": 2.7208191126279866,
"f1_execute": 0.8749999403953552,
"f1_repeat": 0.0,
"f1_skip": 0.3333333432674408,
"grad_norm": 9.9375,
"learning_rate": 0.000998,
"loss": 0.1842,
"macro_f1": 0.402777761220932,
"num_tokens": 788030.0,
"repeat_count": 2.0,
"routers_loss": 0.15272235870361328,
"skip_count": 4.0,
"step": 500,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.731740614334471,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.4375,
"learning_rate": 0.0009999999674012276,
"loss": 0.1709,
"macro_f1": 0.32098764181137085,
"num_tokens": 791099.0,
"repeat_count": 0.0,
"routers_loss": 0.02299564890563488,
"skip_count": 0.0,
"step": 502,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 2.7426621160409557,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.96875,
"learning_rate": 0.000999999706611075,
"loss": 0.1858,
"macro_f1": 0.3144654333591461,
"num_tokens": 794155.0,
"repeat_count": 0.0,
"routers_loss": 0.0592501275241375,
"skip_count": 0.0,
"step": 504,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 2.75358361774744,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.71875,
"learning_rate": 0.0009999991850309056,
"loss": 0.1347,
"macro_f1": 0.307692289352417,
"num_tokens": 797457.0,
"repeat_count": 0.0,
"routers_loss": 0.07785549014806747,
"skip_count": 1.0,
"step": 506,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 2.7645051194539247,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 9.25,
"learning_rate": 0.0009999984026609918,
"loss": 0.1448,
"macro_f1": 0.4803921580314636,
"num_tokens": 800614.0,
"repeat_count": 0.0,
"routers_loss": 0.32612788677215576,
"skip_count": 2.0,
"step": 508,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.7754266211604097,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.84375,
"learning_rate": 0.0009999973595017412,
"loss": 0.2566,
"macro_f1": 0.3272727429866791,
"num_tokens": 804027.0,
"repeat_count": 0.0,
"routers_loss": 0.03253546729683876,
"skip_count": 0.0,
"step": 510,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 28.0,
"epoch": 2.7863481228668943,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 4.6875,
"learning_rate": 0.0009999960555536983,
"loss": 0.1271,
"macro_f1": 0.5359477400779724,
"num_tokens": 807662.0,
"repeat_count": 1.0,
"routers_loss": 0.16023527085781097,
"skip_count": 2.0,
"step": 512,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.797269624573379,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.375,
"learning_rate": 0.0009999944908175428,
"loss": 0.1876,
"macro_f1": 0.3272727429866791,
"num_tokens": 810905.0,
"repeat_count": 0.0,
"routers_loss": 0.022885220125317574,
"skip_count": 0.0,
"step": 514,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.8081911262798633,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.078125,
"learning_rate": 0.0009999926652940912,
"loss": 0.1309,
"macro_f1": 0.3333333432674408,
"num_tokens": 814110.0,
"repeat_count": 0.0,
"routers_loss": 0.007647325750440359,
"skip_count": 0.0,
"step": 516,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.819112627986348,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.375,
"learning_rate": 0.0009999905789842955,
"loss": 0.2302,
"macro_f1": 0.32098767161369324,
"num_tokens": 816905.0,
"repeat_count": 1.0,
"routers_loss": 0.0514276959002018,
"skip_count": 0.0,
"step": 518,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.830034129692833,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.875,
"learning_rate": 0.0009999882318892442,
"loss": 0.2078,
"macro_f1": 0.31446540355682373,
"num_tokens": 819821.0,
"repeat_count": 2.0,
"routers_loss": 0.3009680211544037,
"skip_count": 0.0,
"step": 520,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 2.8409556313993174,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.859375,
"learning_rate": 0.000999985624010161,
"loss": 0.1296,
"macro_f1": 0.32098767161369324,
"num_tokens": 822580.0,
"repeat_count": 0.0,
"routers_loss": 0.05273444578051567,
"skip_count": 1.0,
"step": 522,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.851877133105802,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.5625,
"learning_rate": 0.0009999827553484064,
"loss": 0.2293,
"macro_f1": 0.3333333432674408,
"num_tokens": 825874.0,
"repeat_count": 0.0,
"routers_loss": 0.008311637677252293,
"skip_count": 0.0,
"step": 524,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 2.862798634812287,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.921875,
"learning_rate": 0.0009999796259054763,
"loss": 0.1759,
"macro_f1": 0.29333335161209106,
"num_tokens": 829040.0,
"repeat_count": 3.0,
"routers_loss": 1.207849383354187,
"skip_count": 2.0,
"step": 526,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.8737201365187715,
"f1_execute": 0.9019608497619629,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.65625,
"learning_rate": 0.0009999762356830036,
"loss": 0.2089,
"macro_f1": 0.3006536364555359,
"num_tokens": 834261.0,
"repeat_count": 2.0,
"routers_loss": 0.5721967220306396,
"skip_count": 3.0,
"step": 528,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 2.884641638225256,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.6875,
"learning_rate": 0.000999972584682756,
"loss": 0.2308,
"macro_f1": 0.29333335161209106,
"num_tokens": 837501.0,
"repeat_count": 0.0,
"routers_loss": 0.09908123314380646,
"skip_count": 2.0,
"step": 530,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 2.8955631399317405,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.6875,
"learning_rate": 0.0009999686729066381,
"loss": 0.1818,
"macro_f1": 0.32098764181137085,
"num_tokens": 840390.0,
"repeat_count": 0.0,
"routers_loss": 0.04153004288673401,
"skip_count": 0.0,
"step": 532,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 26.0,
"epoch": 2.906484641638225,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 5.09375,
"learning_rate": 0.0009999645003566902,
"loss": 0.1759,
"macro_f1": 0.4400000274181366,
"num_tokens": 843327.0,
"repeat_count": 1.0,
"routers_loss": 0.37754446268081665,
"skip_count": 3.0,
"step": 534,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 2.91740614334471,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.953125,
"learning_rate": 0.0009999600670350882,
"loss": 0.1873,
"macro_f1": 0.4871794879436493,
"num_tokens": 847028.0,
"repeat_count": 0.0,
"routers_loss": 0.03440186381340027,
"skip_count": 2.0,
"step": 536,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 2.9283276450511946,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 12.875,
"learning_rate": 0.000999955372944145,
"loss": 0.342,
"macro_f1": 0.29333335161209106,
"num_tokens": 850735.0,
"repeat_count": 1.0,
"routers_loss": 0.18292225897312164,
"skip_count": 0.0,
"step": 538,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.939249146757679,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.421875,
"learning_rate": 0.0009999504180863087,
"loss": 0.1714,
"macro_f1": 0.32098764181137085,
"num_tokens": 854731.0,
"repeat_count": 1.0,
"routers_loss": 0.31060779094696045,
"skip_count": 1.0,
"step": 540,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.9501706484641637,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.65625,
"learning_rate": 0.0009999452024641636,
"loss": 0.1744,
"macro_f1": 0.3144654333591461,
"num_tokens": 858249.0,
"repeat_count": 1.0,
"routers_loss": 0.09356094151735306,
"skip_count": 2.0,
"step": 542,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.961092150170648,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.4375,
"learning_rate": 0.0009999397260804302,
"loss": 0.1456,
"macro_f1": 0.3333333432674408,
"num_tokens": 860901.0,
"repeat_count": 0.0,
"routers_loss": 0.006649349816143513,
"skip_count": 0.0,
"step": 544,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 2.972013651877133,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.0,
"learning_rate": 0.0009999339889379647,
"loss": 0.191,
"macro_f1": 0.3272727429866791,
"num_tokens": 863756.0,
"repeat_count": 0.0,
"routers_loss": 0.024081196635961533,
"skip_count": 0.0,
"step": 546,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 2.9829351535836177,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.328125,
"learning_rate": 0.0009999279910397597,
"loss": 0.1806,
"macro_f1": 0.4871794879436493,
"num_tokens": 867242.0,
"repeat_count": 0.0,
"routers_loss": 0.06612888723611832,
"skip_count": 2.0,
"step": 548,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 2.9938566552901023,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.09375,
"learning_rate": 0.000999921732388943,
"loss": 0.1438,
"macro_f1": 0.32098764181137085,
"num_tokens": 870235.0,
"repeat_count": 0.0,
"routers_loss": 0.02564089559018612,
"skip_count": 0.0,
"step": 550,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 3.0,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 11.0,
"learning_rate": 0.0009999152129887801,
"loss": 0.1395,
"macro_f1": 0.3006536066532135,
"num_tokens": 872748.0,
"repeat_count": 1.0,
"routers_loss": 0.31180688738822937,
"skip_count": 2.0,
"step": 552,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.8333333134651184,
"avg_layers": 25.0,
"epoch": 3.0109215017064845,
"f1_execute": 0.9523809552192688,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.9090909361839294,
"grad_norm": 7.8125,
"learning_rate": 0.0009999084328426704,
"loss": 0.1243,
"macro_f1": 0.8427128791809082,
"num_tokens": 876257.0,
"repeat_count": 1.0,
"routers_loss": 0.06441941112279892,
"skip_count": 6.0,
"step": 554,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.021843003412969,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.4375,
"learning_rate": 0.0009999013919541506,
"loss": 0.2276,
"macro_f1": 0.32098764181137085,
"num_tokens": 879189.0,
"repeat_count": 0.0,
"routers_loss": 0.1297590732574463,
"skip_count": 2.0,
"step": 556,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 3.032764505119454,
"f1_execute": 0.95652174949646,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.5714285373687744,
"grad_norm": 2.953125,
"learning_rate": 0.0009998940903268932,
"loss": 0.1034,
"macro_f1": 0.7315390110015869,
"num_tokens": 882626.0,
"repeat_count": 2.0,
"routers_loss": 0.40159890055656433,
"skip_count": 4.0,
"step": 558,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.0436860068259386,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.96875,
"learning_rate": 0.0009998865279647066,
"loss": 0.1627,
"macro_f1": 0.307692289352417,
"num_tokens": 885572.0,
"repeat_count": 0.0,
"routers_loss": 0.05809749290347099,
"skip_count": 3.0,
"step": 560,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.054607508532423,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.8125,
"learning_rate": 0.0009998787048715349,
"loss": 0.1533,
"macro_f1": 0.31446540355682373,
"num_tokens": 889088.0,
"repeat_count": 0.0,
"routers_loss": 0.4470720589160919,
"skip_count": 2.0,
"step": 562,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.0655290102389077,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.078125,
"learning_rate": 0.0009998706210514589,
"loss": 0.167,
"macro_f1": 0.3272727429866791,
"num_tokens": 892449.0,
"repeat_count": 0.0,
"routers_loss": 0.017404144629836082,
"skip_count": 0.0,
"step": 564,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 24.0,
"epoch": 3.0764505119453927,
"f1_execute": 0.8749999403953552,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.5,
"learning_rate": 0.0009998622765086946,
"loss": 0.1492,
"macro_f1": 0.2916666567325592,
"num_tokens": 895586.0,
"repeat_count": 1.0,
"routers_loss": 0.3639675974845886,
"skip_count": 1.0,
"step": 566,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 23.0,
"epoch": 3.087372013651877,
"f1_execute": 0.8979591727256775,
"f1_repeat": 0.0,
"f1_skip": 0.3333333134651184,
"grad_norm": 9.0625,
"learning_rate": 0.0009998536712475944,
"loss": 0.2095,
"macro_f1": 0.4104308485984802,
"num_tokens": 898285.0,
"repeat_count": 1.0,
"routers_loss": 0.16401837766170502,
"skip_count": 1.0,
"step": 568,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 3.0982935153583617,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 3.921875,
"learning_rate": 0.0009998448052726467,
"loss": 0.1679,
"macro_f1": 0.5427350401878357,
"num_tokens": 901345.0,
"repeat_count": 1.0,
"routers_loss": 0.2740897238254547,
"skip_count": 1.0,
"step": 570,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.1092150170648463,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.921875,
"learning_rate": 0.000999835678588476,
"loss": 0.1513,
"macro_f1": 0.3333333432674408,
"num_tokens": 904674.0,
"repeat_count": 0.0,
"routers_loss": 0.004289933945983648,
"skip_count": 0.0,
"step": 572,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 28.0,
"epoch": 3.1201365187713312,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 5.125,
"learning_rate": 0.0009998262911998423,
"loss": 0.2076,
"macro_f1": 0.47333335876464844,
"num_tokens": 908392.0,
"repeat_count": 1.0,
"routers_loss": 0.6915572881698608,
"skip_count": 3.0,
"step": 574,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 25.0,
"epoch": 3.131058020477816,
"f1_execute": 0.9387754797935486,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 5.65625,
"learning_rate": 0.000999816643111642,
"loss": 0.166,
"macro_f1": 0.47959184646606445,
"num_tokens": 911574.0,
"repeat_count": 3.0,
"routers_loss": 0.27853959798812866,
"skip_count": 1.0,
"step": 576,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 3.1419795221843003,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.6875,
"learning_rate": 0.0009998067343289074,
"loss": 0.2197,
"macro_f1": 0.3076923191547394,
"num_tokens": 914726.0,
"repeat_count": 1.0,
"routers_loss": 0.39462774991989136,
"skip_count": 1.0,
"step": 578,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.152901023890785,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.515625,
"learning_rate": 0.0009997965648568066,
"loss": 0.1345,
"macro_f1": 0.3333333432674408,
"num_tokens": 918249.0,
"repeat_count": 0.0,
"routers_loss": 0.0032140507828444242,
"skip_count": 0.0,
"step": 580,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.1638225255972694,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.1875,
"learning_rate": 0.000999786134700644,
"loss": 0.1132,
"macro_f1": 0.3333333432674408,
"num_tokens": 921025.0,
"repeat_count": 0.0,
"routers_loss": 0.0016512145521119237,
"skip_count": 0.0,
"step": 582,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 3.1747440273037544,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.59375,
"learning_rate": 0.0009997754438658595,
"loss": 0.0915,
"macro_f1": 0.3006536066532135,
"num_tokens": 924102.0,
"repeat_count": 0.0,
"routers_loss": 0.6956021785736084,
"skip_count": 2.0,
"step": 584,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 28.0,
"epoch": 3.185665529010239,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 9.1875,
"learning_rate": 0.0009997644923580293,
"loss": 0.1437,
"macro_f1": 0.5359477400779724,
"num_tokens": 927662.0,
"repeat_count": 1.0,
"routers_loss": 0.32544562220573425,
"skip_count": 2.0,
"step": 586,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.1965870307167235,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.25,
"learning_rate": 0.0009997532801828658,
"loss": 0.1488,
"macro_f1": 0.3333333432674408,
"num_tokens": 930556.0,
"repeat_count": 0.0,
"routers_loss": 0.00869440846145153,
"skip_count": 0.0,
"step": 588,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.207508532423208,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.65625,
"learning_rate": 0.0009997418073462167,
"loss": 0.1584,
"macro_f1": 0.32098764181137085,
"num_tokens": 933435.0,
"repeat_count": 0.0,
"routers_loss": 0.08498232066631317,
"skip_count": 2.0,
"step": 590,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.218430034129693,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.015625,
"learning_rate": 0.0009997300738540662,
"loss": 0.1075,
"macro_f1": 0.32098764181137085,
"num_tokens": 936478.0,
"repeat_count": 0.0,
"routers_loss": 0.19423364102840424,
"skip_count": 2.0,
"step": 592,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 3.2293515358361775,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 6.03125,
"learning_rate": 0.000999718079712534,
"loss": 0.1615,
"macro_f1": 0.5492662787437439,
"num_tokens": 939400.0,
"repeat_count": 0.0,
"routers_loss": 0.02402239292860031,
"skip_count": 1.0,
"step": 594,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 3.240273037542662,
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
"grad_norm": 4.875,
"learning_rate": 0.0009997058249278763,
"loss": 0.221,
"macro_f1": 0.6666666865348816,
"num_tokens": 943300.0,
"repeat_count": 1.0,
"routers_loss": 0.0028402789030224085,
"skip_count": 0.0,
"step": 596,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.2511945392491466,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.09375,
"learning_rate": 0.0009996933095064847,
"loss": 0.1423,
"macro_f1": 0.3144654333591461,
"num_tokens": 947399.0,
"repeat_count": 1.0,
"routers_loss": 0.2962486445903778,
"skip_count": 2.0,
"step": 598,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.2621160409556316,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.625,
"learning_rate": 0.0009996805334548872,
"loss": 0.1535,
"macro_f1": 0.29333335161209106,
"num_tokens": 950094.0,
"repeat_count": 0.0,
"routers_loss": 0.47425299882888794,
"skip_count": 4.0,
"step": 600,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.4000000059604645,
"avg_layers": 24.0,
"epoch": 3.273037542662116,
"f1_execute": 0.8636363744735718,
"f1_repeat": 0.0,
"f1_skip": 0.444444477558136,
"grad_norm": 4.71875,
"learning_rate": 0.0009996674967797476,
"loss": 0.1282,
"macro_f1": 0.43602699041366577,
"num_tokens": 953673.0,
"repeat_count": 3.0,
"routers_loss": 0.3788261115550995,
"skip_count": 5.0,
"step": 602,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.2839590443686006,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.5625,
"learning_rate": 0.0009996541994878655,
"loss": 0.1239,
"macro_f1": 0.3272727429866791,
"num_tokens": 956885.0,
"repeat_count": 1.0,
"routers_loss": 0.13212358951568604,
"skip_count": 0.0,
"step": 604,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 3.294880546075085,
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 3.828125,
"learning_rate": 0.0009996406415861763,
"loss": 0.0874,
"macro_f1": 0.6601307392120361,
"num_tokens": 959794.0,
"repeat_count": 0.0,
"routers_loss": 0.0332571342587471,
"skip_count": 2.0,
"step": 606,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.3058020477815697,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.5625,
"learning_rate": 0.0009996268230817518,
"loss": 0.1068,
"macro_f1": 0.3333333432674408,
"num_tokens": 963516.0,
"repeat_count": 0.0,
"routers_loss": 0.007200752384960651,
"skip_count": 0.0,
"step": 608,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.3167235494880547,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.75,
"learning_rate": 0.0009996127439817993,
"loss": 0.1237,
"macro_f1": 0.3272727429866791,
"num_tokens": 966363.0,
"repeat_count": 0.0,
"routers_loss": 0.23764896392822266,
"skip_count": 1.0,
"step": 610,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.3276450511945392,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.484375,
"learning_rate": 0.0009995984042936621,
"loss": 0.1411,
"macro_f1": 0.3333333432674408,
"num_tokens": 969265.0,
"repeat_count": 0.0,
"routers_loss": 0.0006030416116118431,
"skip_count": 0.0,
"step": 612,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 3.3385665529010238,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 4.8125,
"learning_rate": 0.0009995838040248197,
"loss": 0.1516,
"macro_f1": 0.5492662787437439,
"num_tokens": 972024.0,
"repeat_count": 0.0,
"routers_loss": 0.029178157448768616,
"skip_count": 1.0,
"step": 614,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 24.0,
"epoch": 3.3494880546075088,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.3333333432674408,
"grad_norm": 6.0625,
"learning_rate": 0.0009995689431828872,
"loss": 0.132,
"macro_f1": 0.41777777671813965,
"num_tokens": 974328.0,
"repeat_count": 0.0,
"routers_loss": 0.41580793261528015,
"skip_count": 2.0,
"step": 616,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.3604095563139933,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.625,
"learning_rate": 0.000999553821775616,
"loss": 0.1495,
"macro_f1": 0.307692289352417,
"num_tokens": 977628.0,
"repeat_count": 0.0,
"routers_loss": 0.26905494928359985,
"skip_count": 3.0,
"step": 618,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.371331058020478,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.375,
"learning_rate": 0.0009995384398108927,
"loss": 0.1372,
"macro_f1": 0.3333333432674408,
"num_tokens": 980458.0,
"repeat_count": 0.0,
"routers_loss": 0.007225328590720892,
"skip_count": 0.0,
"step": 620,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 3.3822525597269624,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 2.984375,
"learning_rate": 0.0009995227972967404,
"loss": 0.1104,
"macro_f1": 0.6603773832321167,
"num_tokens": 983776.0,
"repeat_count": 1.0,
"routers_loss": 0.09698990732431412,
"skip_count": 1.0,
"step": 622,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.393174061433447,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.40625,
"learning_rate": 0.000999506894241318,
"loss": 0.1211,
"macro_f1": 0.32098764181137085,
"num_tokens": 986625.0,
"repeat_count": 0.0,
"routers_loss": 0.028710627928376198,
"skip_count": 0.0,
"step": 624,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 3.404095563139932,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 5.53125,
"learning_rate": 0.0009994907306529201,
"loss": 0.186,
"macro_f1": 0.5427350401878357,
"num_tokens": 989896.0,
"repeat_count": 1.0,
"routers_loss": 0.18436689674854279,
"skip_count": 2.0,
"step": 626,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 3.4150170648464164,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 5.71875,
"learning_rate": 0.0009994743065399776,
"loss": 0.1819,
"macro_f1": 0.6666666865348816,
"num_tokens": 992963.0,
"repeat_count": 0.0,
"routers_loss": 0.011628196574747562,
"skip_count": 2.0,
"step": 628,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.425938566552901,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.5625,
"learning_rate": 0.0009994576219110565,
"loss": 0.2279,
"macro_f1": 0.3272727429866791,
"num_tokens": 995486.0,
"repeat_count": 0.0,
"routers_loss": 0.03694930672645569,
"skip_count": 0.0,
"step": 630,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 3.4368600682593855,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.71875,
"learning_rate": 0.0009994406767748596,
"loss": 0.2908,
"macro_f1": 0.3076923191547394,
"num_tokens": 998880.0,
"repeat_count": 1.0,
"routers_loss": 0.3335764706134796,
"skip_count": 1.0,
"step": 632,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 3.4477815699658705,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 5.3125,
"learning_rate": 0.000999423471140225,
"loss": 0.1652,
"macro_f1": 0.4871794879436493,
"num_tokens": 1001623.0,
"repeat_count": 0.0,
"routers_loss": 0.03843867778778076,
"skip_count": 2.0,
"step": 634,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.458703071672355,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.15625,
"learning_rate": 0.0009994060050161268,
"loss": 0.1534,
"macro_f1": 0.307692289352417,
"num_tokens": 1004900.0,
"repeat_count": 2.0,
"routers_loss": 0.26561209559440613,
"skip_count": 1.0,
"step": 636,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 28.0,
"epoch": 3.4696245733788396,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 6.40625,
"learning_rate": 0.0009993882784116752,
"loss": 0.147,
"macro_f1": 0.4803921580314636,
"num_tokens": 1008732.0,
"repeat_count": 0.0,
"routers_loss": 0.3012487590312958,
"skip_count": 3.0,
"step": 638,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.480546075085324,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.15625,
"learning_rate": 0.0009993702913361155,
"loss": 0.1252,
"macro_f1": 0.3333333432674408,
"num_tokens": 1011699.0,
"repeat_count": 0.0,
"routers_loss": 0.012646762654185295,
"skip_count": 0.0,
"step": 640,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 27.0,
"epoch": 3.491467576791809,
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.90625,
"learning_rate": 0.0009993520437988302,
"loss": 0.1487,
"macro_f1": 0.480392187833786,
"num_tokens": 1014406.0,
"repeat_count": 1.0,
"routers_loss": 0.1068505123257637,
"skip_count": 3.0,
"step": 642,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 3.5023890784982936,
"f1_execute": 0.8085106015205383,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.34375,
"learning_rate": 0.000999333535809336,
"loss": 0.1731,
"macro_f1": 0.26950353384017944,
"num_tokens": 1017801.0,
"repeat_count": 2.0,
"routers_loss": 2.2939841747283936,
"skip_count": 5.0,
"step": 644,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.513310580204778,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.8125,
"learning_rate": 0.0009993147673772868,
"loss": 0.1609,
"macro_f1": 0.3272727429866791,
"num_tokens": 1021185.0,
"repeat_count": 0.0,
"routers_loss": 0.02110578864812851,
"skip_count": 0.0,
"step": 646,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 3.5242320819112627,
"f1_execute": 0.9600000381469727,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 4.90625,
"learning_rate": 0.000999295738512472,
"loss": 0.124,
"macro_f1": 0.4533333480358124,
"num_tokens": 1025108.0,
"repeat_count": 0.0,
"routers_loss": 0.15021832287311554,
"skip_count": 2.0,
"step": 648,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.5351535836177472,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.0,
"learning_rate": 0.0009992764492248163,
"loss": 0.2309,
"macro_f1": 0.3333333432674408,
"num_tokens": 1028805.0,
"repeat_count": 0.0,
"routers_loss": 0.002900304039940238,
"skip_count": 0.0,
"step": 650,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 26.0,
"epoch": 3.546075085324232,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 5.0,
"learning_rate": 0.0009992568995243808,
"loss": 0.1452,
"macro_f1": 0.44705885648727417,
"num_tokens": 1032069.0,
"repeat_count": 0.0,
"routers_loss": 0.2886044383049011,
"skip_count": 3.0,
"step": 652,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.5569965870307167,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.0625,
"learning_rate": 0.0009992370894213623,
"loss": 0.1319,
"macro_f1": 0.3144654333591461,
"num_tokens": 1035634.0,
"repeat_count": 1.0,
"routers_loss": 0.42971259355545044,
"skip_count": 2.0,
"step": 654,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 27.0,
"epoch": 3.5679180887372013,
"f1_execute": 0.9387754797935486,
"f1_repeat": 1.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 7.375,
"learning_rate": 0.000999217018926093,
"loss": 0.1152,
"macro_f1": 0.7795917987823486,
"num_tokens": 1039948.0,
"repeat_count": 1.0,
"routers_loss": 0.07567094266414642,
"skip_count": 3.0,
"step": 656,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.5788395904436863,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.46875,
"learning_rate": 0.0009991966880490417,
"loss": 0.1425,
"macro_f1": 0.3333333432674408,
"num_tokens": 1043710.0,
"repeat_count": 0.0,
"routers_loss": 0.001569207408465445,
"skip_count": 0.0,
"step": 658,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.589761092150171,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.453125,
"learning_rate": 0.0009991760968008124,
"loss": 0.1177,
"macro_f1": 0.3333333432674408,
"num_tokens": 1047211.0,
"repeat_count": 0.0,
"routers_loss": 0.014489148743450642,
"skip_count": 0.0,
"step": 660,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.6006825938566553,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.34375,
"learning_rate": 0.0009991552451921453,
"loss": 0.104,
"macro_f1": 0.32098767161369324,
"num_tokens": 1050220.0,
"repeat_count": 0.0,
"routers_loss": 0.052834026515483856,
"skip_count": 1.0,
"step": 662,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 3.61160409556314,
"f1_execute": 0.875,
"f1_repeat": 1.0,
"f1_skip": 0.0,
"grad_norm": 4.84375,
"learning_rate": 0.0009991341332339157,
"loss": 0.1706,
"macro_f1": 0.625,
"num_tokens": 1053982.0,
"repeat_count": 1.0,
"routers_loss": 0.2865705192089081,
"skip_count": 3.0,
"step": 664,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 3.6225255972696244,
"f1_execute": 0.923076868057251,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.25,
"learning_rate": 0.0009991127609371357,
"loss": 0.1275,
"macro_f1": 0.307692289352417,
"num_tokens": 1056846.0,
"repeat_count": 1.0,
"routers_loss": 0.32878634333610535,
"skip_count": 0.0,
"step": 666,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 1.0,
"avg_layers": 25.0,
"epoch": 3.6334470989761094,
"f1_execute": 0.9777777791023254,
"f1_repeat": 0.6666666865348816,
"f1_skip": 1.0,
"grad_norm": 3.328125,
"learning_rate": 0.0009990911283129524,
"loss": 0.1348,
"macro_f1": 0.8814815282821655,
"num_tokens": 1059648.0,
"repeat_count": 2.0,
"routers_loss": 0.10558832436800003,
"skip_count": 4.0,
"step": 668,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 3.644368600682594,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 1.90625,
"learning_rate": 0.0009990692353726489,
"loss": 0.0572,
"macro_f1": 0.6666666865348816,
"num_tokens": 1062290.0,
"repeat_count": 0.0,
"routers_loss": 0.0071791489608585835,
"skip_count": 2.0,
"step": 670,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.6552901023890785,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.0625,
"learning_rate": 0.0009990470821276442,
"loss": 0.156,
"macro_f1": 0.3272727429866791,
"num_tokens": 1065212.0,
"repeat_count": 0.0,
"routers_loss": 0.028384100645780563,
"skip_count": 0.0,
"step": 672,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 3.666211604095563,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 7.4375,
"learning_rate": 0.0009990246685894933,
"loss": 0.1457,
"macro_f1": 0.4871794879436493,
"num_tokens": 1068029.0,
"repeat_count": 0.0,
"routers_loss": 0.03461477532982826,
"skip_count": 2.0,
"step": 674,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.6771331058020476,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.78125,
"learning_rate": 0.0009990019947698863,
"loss": 0.1055,
"macro_f1": 0.3333333432674408,
"num_tokens": 1071229.0,
"repeat_count": 0.0,
"routers_loss": 0.004003713373094797,
"skip_count": 0.0,
"step": 676,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.6666666865348816,
"avg_layers": 26.0,
"epoch": 3.6880546075085325,
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
"grad_norm": 2.015625,
"learning_rate": 0.0009989790606806494,
"loss": 0.1026,
"macro_f1": 0.5934640765190125,
"num_tokens": 1074046.0,
"repeat_count": 0.0,
"routers_loss": 0.03134514391422272,
"skip_count": 3.0,
"step": 678,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 3.698976109215017,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 2.71875,
"learning_rate": 0.0009989558663337447,
"loss": 0.1402,
"macro_f1": 0.6666666865348816,
"num_tokens": 1076635.0,
"repeat_count": 0.0,
"routers_loss": 0.00439166184514761,
"skip_count": 1.0,
"step": 680,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 3.7098976109215016,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.09375,
"learning_rate": 0.0009989324117412699,
"loss": 0.1021,
"macro_f1": 0.31446540355682373,
"num_tokens": 1079958.0,
"repeat_count": 0.0,
"routers_loss": 0.12589046359062195,
"skip_count": 2.0,
"step": 682,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.7208191126279866,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.890625,
"learning_rate": 0.0009989086969154587,
"loss": 0.1762,
"macro_f1": 0.3333333432674408,
"num_tokens": 1082589.0,
"repeat_count": 0.0,
"routers_loss": 0.01050520222634077,
"skip_count": 0.0,
"step": 684,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.731740614334471,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.90625,
"learning_rate": 0.0009988847218686796,
"loss": 0.1527,
"macro_f1": 0.3272727429866791,
"num_tokens": 1085634.0,
"repeat_count": 0.0,
"routers_loss": 0.08884720504283905,
"skip_count": 1.0,
"step": 686,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 3.7426621160409557,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.5625,
"learning_rate": 0.0009988604866134384,
"loss": 0.196,
"macro_f1": 0.29333335161209106,
"num_tokens": 1088501.0,
"repeat_count": 1.0,
"routers_loss": 0.3627224862575531,
"skip_count": 2.0,
"step": 688,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.75358361774744,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.9375,
"learning_rate": 0.0009988359911623748,
"loss": 0.2456,
"macro_f1": 0.3272727429866791,
"num_tokens": 1091083.0,
"repeat_count": 0.0,
"routers_loss": 0.025369791314005852,
"skip_count": 0.0,
"step": 690,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.7645051194539247,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.46875,
"learning_rate": 0.000998811235528266,
"loss": 0.1186,
"macro_f1": 0.3272727429866791,
"num_tokens": 1095673.0,
"repeat_count": 0.0,
"routers_loss": 0.023373540490865707,
"skip_count": 0.0,
"step": 692,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.7754266211604097,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.03125,
"learning_rate": 0.0009987862197240237,
"loss": 0.1518,
"macro_f1": 0.3272727429866791,
"num_tokens": 1098519.0,
"repeat_count": 0.0,
"routers_loss": 0.014006087556481361,
"skip_count": 0.0,
"step": 694,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 3.7863481228668943,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.59375,
"learning_rate": 0.0009987609437626954,
"loss": 0.2149,
"macro_f1": 0.31446540355682373,
"num_tokens": 1101510.0,
"repeat_count": 0.0,
"routers_loss": 0.057559430599212646,
"skip_count": 1.0,
"step": 696,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.797269624573379,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.765625,
"learning_rate": 0.0009987354076574648,
"loss": 0.1507,
"macro_f1": 0.3333333432674408,
"num_tokens": 1104637.0,
"repeat_count": 0.0,
"routers_loss": 0.001837484072893858,
"skip_count": 0.0,
"step": 698,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.8081911262798633,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.53125,
"learning_rate": 0.0009987096114216511,
"loss": 0.1046,
"macro_f1": 0.3272727429866791,
"num_tokens": 1107964.0,
"repeat_count": 0.0,
"routers_loss": 0.3758608400821686,
"skip_count": 1.0,
"step": 700,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 25.0,
"epoch": 3.819112627986348,
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
"grad_norm": 4.375,
"learning_rate": 0.000998683555068709,
"loss": 0.1269,
"macro_f1": 0.5934640765190125,
"num_tokens": 1111541.0,
"repeat_count": 0.0,
"routers_loss": 0.02019377611577511,
"skip_count": 2.0,
"step": 702,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.830034129692833,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.671875,
"learning_rate": 0.000998657238612229,
"loss": 0.1522,
"macro_f1": 0.3272727429866791,
"num_tokens": 1114819.0,
"repeat_count": 0.0,
"routers_loss": 0.019685756415128708,
"skip_count": 0.0,
"step": 704,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.8409556313993174,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.484375,
"learning_rate": 0.0009986306620659374,
"loss": 0.1104,
"macro_f1": 0.3333333432674408,
"num_tokens": 1117888.0,
"repeat_count": 0.0,
"routers_loss": 0.0059326752088963985,
"skip_count": 0.0,
"step": 706,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 3.851877133105802,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.125,
"learning_rate": 0.0009986038254436956,
"loss": 0.1038,
"macro_f1": 0.32098764181137085,
"num_tokens": 1120946.0,
"repeat_count": 0.0,
"routers_loss": 0.022552471607923508,
"skip_count": 0.0,
"step": 708,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 3.862798634812287,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.625,
"learning_rate": 0.0009985767287595015,
"loss": 0.1433,
"macro_f1": 0.4871794879436493,
"num_tokens": 1124013.0,
"repeat_count": 0.0,
"routers_loss": 0.03914980590343475,
"skip_count": 2.0,
"step": 710,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 3.8737201365187715,
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 1.0,
"grad_norm": 5.0625,
"learning_rate": 0.0009985493720274879,
"loss": 0.1663,
"macro_f1": 1.0,
"num_tokens": 1127662.0,
"repeat_count": 1.0,
"routers_loss": 0.01359120849519968,
"skip_count": 2.0,
"step": 712,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 3.884641638225256,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.96875,
"learning_rate": 0.0009985217552619236,
"loss": 0.1134,
"macro_f1": 0.3272727429866791,
"num_tokens": 1130742.0,
"repeat_count": 0.0,
"routers_loss": 0.0699341893196106,
"skip_count": 0.0,
"step": 714,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.8955631399317405,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.75,
"learning_rate": 0.000998493878477213,
"loss": 0.1643,
"macro_f1": 0.3333333432674408,
"num_tokens": 1133386.0,
"repeat_count": 0.0,
"routers_loss": 0.006396451499313116,
"skip_count": 0.0,
"step": 716,
"text_loss": 0.0
},
{
"acc_repeat": 0.3333333432674408,
"acc_skip": 0.5,
"avg_layers": 28.0,
"epoch": 3.906484641638225,
"f1_execute": 0.8292683362960815,
"f1_repeat": 0.3333333432674408,
"f1_skip": 0.6666666865348816,
"grad_norm": 4.75,
"learning_rate": 0.0009984657416878962,
"loss": 0.1396,
"macro_f1": 0.6097561120986938,
"num_tokens": 1136071.0,
"repeat_count": 3.0,
"routers_loss": 0.23587316274642944,
"skip_count": 6.0,
"step": 718,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.91740614334471,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 9.625,
"learning_rate": 0.0009984373449086485,
"loss": 0.1686,
"macro_f1": 0.3076923191547394,
"num_tokens": 1139061.0,
"repeat_count": 0.0,
"routers_loss": 0.23841485381126404,
"skip_count": 2.0,
"step": 720,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 3.9283276450511946,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
"grad_norm": 2.8125,
"learning_rate": 0.0009984086881542815,
"loss": 0.1112,
"macro_f1": 0.5288889408111572,
"num_tokens": 1141926.0,
"repeat_count": 2.0,
"routers_loss": 0.37492331862449646,
"skip_count": 3.0,
"step": 722,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 3.939249146757679,
"f1_execute": 0.9166666865348816,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.4000000059604645,
"grad_norm": 4.375,
"learning_rate": 0.0009983797714397415,
"loss": 0.1395,
"macro_f1": 0.6611111164093018,
"num_tokens": 1145302.0,
"repeat_count": 2.0,
"routers_loss": 0.5061943531036377,
"skip_count": 2.0,
"step": 724,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.9501706484641637,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 11.5625,
"learning_rate": 0.0009983505947801115,
"loss": 0.327,
"macro_f1": 0.3272727429866791,
"num_tokens": 1148991.0,
"repeat_count": 0.0,
"routers_loss": 0.030050436034798622,
"skip_count": 0.0,
"step": 726,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 3.961092150170648,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 4.84375,
"learning_rate": 0.0009983211581906088,
"loss": 0.2311,
"macro_f1": 0.5492662787437439,
"num_tokens": 1151711.0,
"repeat_count": 0.0,
"routers_loss": 0.04163246229290962,
"skip_count": 2.0,
"step": 728,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 3.972013651877133,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.0625,
"learning_rate": 0.0009982914616865875,
"loss": 0.1956,
"macro_f1": 0.3333333432674408,
"num_tokens": 1155061.0,
"repeat_count": 0.0,
"routers_loss": 0.002654903568327427,
"skip_count": 0.0,
"step": 730,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 3.9829351535836177,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.46875,
"learning_rate": 0.0009982615052835364,
"loss": 0.1239,
"macro_f1": 0.31446540355682373,
"num_tokens": 1158043.0,
"repeat_count": 0.0,
"routers_loss": 0.18476539850234985,
"skip_count": 2.0,
"step": 732,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 3.9938566552901023,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.625,
"learning_rate": 0.0009982312889970804,
"loss": 0.211,
"macro_f1": 0.31446540355682373,
"num_tokens": 1161487.0,
"repeat_count": 2.0,
"routers_loss": 0.33558642864227295,
"skip_count": 0.0,
"step": 734,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.0,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.5625,
"learning_rate": 0.0009982008128429794,
"loss": 0.14,
"macro_f1": 0.3272727429866791,
"num_tokens": 1163664.0,
"repeat_count": 0.0,
"routers_loss": 0.010565636679530144,
"skip_count": 0.0,
"step": 736,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.010921501706485,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.75,
"learning_rate": 0.0009981700768371296,
"loss": 0.0823,
"macro_f1": 0.3333333432674408,
"num_tokens": 1166461.0,
"repeat_count": 0.0,
"routers_loss": 0.001561413868330419,
"skip_count": 0.0,
"step": 738,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 4.021843003412969,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 5.125,
"learning_rate": 0.000998139080995562,
"loss": 0.1766,
"macro_f1": 0.6666666865348816,
"num_tokens": 1170134.0,
"repeat_count": 0.0,
"routers_loss": 0.010665918700397015,
"skip_count": 2.0,
"step": 740,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 4.032764505119454,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.578125,
"learning_rate": 0.0009981078253344432,
"loss": 0.1177,
"macro_f1": 0.3333333432674408,
"num_tokens": 1173075.0,
"repeat_count": 0.0,
"routers_loss": 0.047345057129859924,
"skip_count": 1.0,
"step": 742,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 4.043686006825938,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 2.1875,
"learning_rate": 0.000998076309870076,
"loss": 0.0517,
"macro_f1": 0.6666666865348816,
"num_tokens": 1176281.0,
"repeat_count": 0.0,
"routers_loss": 0.0033105311449617147,
"skip_count": 1.0,
"step": 744,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 4.054607508532423,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.0625,
"learning_rate": 0.000998044534618898,
"loss": 0.0864,
"macro_f1": 0.32098764181137085,
"num_tokens": 1179403.0,
"repeat_count": 0.0,
"routers_loss": 0.033084314316511154,
"skip_count": 0.0,
"step": 746,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.065529010238908,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.234375,
"learning_rate": 0.0009980124995974827,
"loss": 0.0925,
"macro_f1": 0.3006536066532135,
"num_tokens": 1182596.0,
"repeat_count": 1.0,
"routers_loss": 0.21827591955661774,
"skip_count": 3.0,
"step": 748,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 27.0,
"epoch": 4.076450511945392,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.125,
"learning_rate": 0.0009979802048225388,
"loss": 0.1244,
"macro_f1": 0.4871794879436493,
"num_tokens": 1186303.0,
"repeat_count": 0.0,
"routers_loss": 0.18225915729999542,
"skip_count": 3.0,
"step": 750,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
"avg_layers": 30.0,
"epoch": 4.087372013651877,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
"grad_norm": 3.984375,
"learning_rate": 0.0009979476503109107,
"loss": 0.0728,
"macro_f1": 0.5492662787437439,
"num_tokens": 1189299.0,
"repeat_count": 1.0,
"routers_loss": 0.03163563460111618,
"skip_count": 0.0,
"step": 752,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 24.0,
"epoch": 4.098293515358362,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.3333333432674408,
"grad_norm": 6.34375,
"learning_rate": 0.000997914836079578,
"loss": 0.148,
"macro_f1": 0.41777777671813965,
"num_tokens": 1192694.0,
"repeat_count": 0.0,
"routers_loss": 0.28674715757369995,
"skip_count": 2.0,
"step": 754,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.109215017064846,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.34375,
"learning_rate": 0.0009978817621456562,
"loss": 0.0869,
"macro_f1": 0.31446540355682373,
"num_tokens": 1196319.0,
"repeat_count": 0.0,
"routers_loss": 0.05852695554494858,
"skip_count": 1.0,
"step": 756,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 4.120136518771331,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 1.6484375,
"learning_rate": 0.000997848428526396,
"loss": 0.0648,
"macro_f1": 0.5492662787437439,
"num_tokens": 1199844.0,
"repeat_count": 0.0,
"routers_loss": 0.06834150850772858,
"skip_count": 2.0,
"step": 758,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.131058020477815,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.375,
"learning_rate": 0.0009978148352391835,
"loss": 0.0801,
"macro_f1": 0.3333333432674408,
"num_tokens": 1202876.0,
"repeat_count": 0.0,
"routers_loss": 0.0058227707631886005,
"skip_count": 0.0,
"step": 760,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 27.0,
"epoch": 4.1419795221843,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.296875,
"learning_rate": 0.00099778098230154,
"loss": 0.1094,
"macro_f1": 0.4871794879436493,
"num_tokens": 1206870.0,
"repeat_count": 0.0,
"routers_loss": 0.079805389046669,
"skip_count": 3.0,
"step": 762,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 4.152901023890785,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.296875,
"learning_rate": 0.0009977468697311232,
"loss": 0.0902,
"macro_f1": 0.3076923191547394,
"num_tokens": 1209825.0,
"repeat_count": 0.0,
"routers_loss": 0.21695999801158905,
"skip_count": 2.0,
"step": 764,
"text_loss": 0.0
},
{
"acc_repeat": 0.5,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.163822525597269,
"f1_execute": 0.8749999403953552,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.0,
"grad_norm": 3.265625,
"learning_rate": 0.0009977124975457249,
"loss": 0.1244,
"macro_f1": 0.5138888955116272,
"num_tokens": 1213093.0,
"repeat_count": 2.0,
"routers_loss": 0.12744387984275818,
"skip_count": 4.0,
"step": 766,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 28.0,
"epoch": 4.174744027303754,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 2.34375,
"learning_rate": 0.0009976778657632733,
"loss": 0.0783,
"macro_f1": 0.5427350401878357,
"num_tokens": 1216291.0,
"repeat_count": 0.0,
"routers_loss": 0.07573267817497253,
"skip_count": 2.0,
"step": 768,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.1856655290102385,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.296875,
"learning_rate": 0.0009976429744018313,
"loss": 0.0752,
"macro_f1": 0.3333333432674408,
"num_tokens": 1219537.0,
"repeat_count": 0.0,
"routers_loss": 0.0009250715957023203,
"skip_count": 0.0,
"step": 770,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.1965870307167235,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.234375,
"learning_rate": 0.0009976078234795983,
"loss": 0.1114,
"macro_f1": 0.3333333432674408,
"num_tokens": 1222736.0,
"repeat_count": 0.0,
"routers_loss": 0.00175693747587502,
"skip_count": 0.0,
"step": 772,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 4.207508532423208,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 3.390625,
"learning_rate": 0.0009975724130149076,
"loss": 0.0918,
"macro_f1": 0.5492662787437439,
"num_tokens": 1226120.0,
"repeat_count": 0.0,
"routers_loss": 0.027441009879112244,
"skip_count": 2.0,
"step": 774,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.2184300341296925,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.46875,
"learning_rate": 0.0009975367430262287,
"loss": 0.0992,
"macro_f1": 0.3272727429866791,
"num_tokens": 1228810.0,
"repeat_count": 0.0,
"routers_loss": 0.027025407180190086,
"skip_count": 0.0,
"step": 776,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.2293515358361775,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.453125,
"learning_rate": 0.0009975008135321667,
"loss": 0.0931,
"macro_f1": 0.3333333432674408,
"num_tokens": 1231669.0,
"repeat_count": 0.0,
"routers_loss": 0.00917113944888115,
"skip_count": 0.0,
"step": 778,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.2402730375426625,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.890625,
"learning_rate": 0.0009974646245514615,
"loss": 0.0505,
"macro_f1": 0.3333333432674408,
"num_tokens": 1234476.0,
"repeat_count": 0.0,
"routers_loss": 0.010482276789844036,
"skip_count": 0.0,
"step": 780,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 4.251194539249147,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 4.28125,
"learning_rate": 0.0009974281761029886,
"loss": 0.0675,
"macro_f1": 0.6666666865348816,
"num_tokens": 1237748.0,
"repeat_count": 0.0,
"routers_loss": 0.009005382657051086,
"skip_count": 1.0,
"step": 782,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 4.262116040955632,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 2.9375,
"learning_rate": 0.0009973914682057587,
"loss": 0.1734,
"macro_f1": 0.4871794879436493,
"num_tokens": 1240362.0,
"repeat_count": 0.0,
"routers_loss": 0.09049399197101593,
"skip_count": 2.0,
"step": 784,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.273037542662116,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.984375,
"learning_rate": 0.0009973545008789182,
"loss": 0.1156,
"macro_f1": 0.3333333432674408,
"num_tokens": 1244147.0,
"repeat_count": 0.0,
"routers_loss": 0.0037465172354131937,
"skip_count": 0.0,
"step": 786,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.283959044368601,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.84375,
"learning_rate": 0.000997317274141748,
"loss": 0.1302,
"macro_f1": 0.3333333432674408,
"num_tokens": 1247058.0,
"repeat_count": 0.0,
"routers_loss": 0.002100529847666621,
"skip_count": 0.0,
"step": 788,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 24.0,
"epoch": 4.294880546075086,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.3333333432674408,
"grad_norm": 3.03125,
"learning_rate": 0.0009972797880136654,
"loss": 0.0771,
"macro_f1": 0.41777777671813965,
"num_tokens": 1250331.0,
"repeat_count": 0.0,
"routers_loss": 0.08377297967672348,
"skip_count": 2.0,
"step": 790,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 4.30580204778157,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.421875,
"learning_rate": 0.0009972420425142224,
"loss": 0.0782,
"macro_f1": 0.4871794879436493,
"num_tokens": 1253848.0,
"repeat_count": 0.0,
"routers_loss": 0.06583717465400696,
"skip_count": 2.0,
"step": 792,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.316723549488055,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.28125,
"learning_rate": 0.0009972040376631057,
"loss": 0.1235,
"macro_f1": 0.32098767161369324,
"num_tokens": 1257122.0,
"repeat_count": 0.0,
"routers_loss": 0.12353084981441498,
"skip_count": 1.0,
"step": 794,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.327645051194539,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.171875,
"learning_rate": 0.0009971657734801384,
"loss": 0.0899,
"macro_f1": 0.3333333432674408,
"num_tokens": 1261136.0,
"repeat_count": 0.0,
"routers_loss": 0.004150724504143,
"skip_count": 0.0,
"step": 796,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.338566552901024,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.5625,
"learning_rate": 0.0009971272499852784,
"loss": 0.1815,
"macro_f1": 0.3272727429866791,
"num_tokens": 1264211.0,
"repeat_count": 0.0,
"routers_loss": 0.02800264209508896,
"skip_count": 0.0,
"step": 798,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 4.349488054607509,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 3.125,
"learning_rate": 0.0009970884671986187,
"loss": 0.1118,
"macro_f1": 0.5492662787437439,
"num_tokens": 1266964.0,
"repeat_count": 0.0,
"routers_loss": 0.05382822826504707,
"skip_count": 1.0,
"step": 800,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.360409556313993,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.1875,
"learning_rate": 0.0009970494251403874,
"loss": 0.1015,
"macro_f1": 0.31446540355682373,
"num_tokens": 1269856.0,
"repeat_count": 0.0,
"routers_loss": 0.20994320511817932,
"skip_count": 2.0,
"step": 802,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 4.371331058020478,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.296875,
"learning_rate": 0.000997010123830948,
"loss": 0.1095,
"macro_f1": 0.31446540355682373,
"num_tokens": 1272945.0,
"repeat_count": 0.0,
"routers_loss": 0.07841377705335617,
"skip_count": 1.0,
"step": 804,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
"avg_layers": 30.0,
"epoch": 4.382252559726963,
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
"grad_norm": 6.5625,
"learning_rate": 0.0009969705632907999,
"loss": 0.1242,
"macro_f1": 0.6666666865348816,
"num_tokens": 1276127.0,
"repeat_count": 2.0,
"routers_loss": 0.008330464363098145,
"skip_count": 0.0,
"step": 806,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.393174061433447,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.4375,
"learning_rate": 0.0009969307435405766,
"loss": 0.1688,
"macro_f1": 0.3333333432674408,
"num_tokens": 1279056.0,
"repeat_count": 0.0,
"routers_loss": 0.004059277940541506,
"skip_count": 0.0,
"step": 808,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.404095563139932,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.3125,
"learning_rate": 0.0009968906646010474,
"loss": 0.1232,
"macro_f1": 0.3333333432674408,
"num_tokens": 1282092.0,
"repeat_count": 0.0,
"routers_loss": 0.005245010834187269,
"skip_count": 0.0,
"step": 810,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 4.415017064846416,
"f1_execute": 0.9411765336990356,
"f1_repeat": 1.0,
"f1_skip": 0.0,
"grad_norm": 7.3125,
"learning_rate": 0.0009968503264931167,
"loss": 0.0964,
"macro_f1": 0.6470588445663452,
"num_tokens": 1285759.0,
"repeat_count": 1.0,
"routers_loss": 0.04135916382074356,
"skip_count": 0.0,
"step": 812,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 4.425938566552901,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.0,
"learning_rate": 0.0009968097292378244,
"loss": 0.1636,
"macro_f1": 0.32098767161369324,
"num_tokens": 1288141.0,
"repeat_count": 0.0,
"routers_loss": 0.11239507049322128,
"skip_count": 1.0,
"step": 814,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.436860068259386,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.71875,
"learning_rate": 0.0009967688728563446,
"loss": 0.1044,
"macro_f1": 0.32098767161369324,
"num_tokens": 1291293.0,
"repeat_count": 1.0,
"routers_loss": 0.3831826150417328,
"skip_count": 0.0,
"step": 816,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 4.44778156996587,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.875,
"learning_rate": 0.0009967277573699875,
"loss": 0.1445,
"macro_f1": 0.32098764181137085,
"num_tokens": 1293847.0,
"repeat_count": 0.0,
"routers_loss": 0.054437290877103806,
"skip_count": 0.0,
"step": 818,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.458703071672355,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 1.859375,
"learning_rate": 0.000996686382800198,
"loss": 0.0712,
"macro_f1": 0.3333333432674408,
"num_tokens": 1296724.0,
"repeat_count": 0.0,
"routers_loss": 0.012091469950973988,
"skip_count": 0.0,
"step": 820,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.75,
"avg_layers": 24.0,
"epoch": 4.46962457337884,
"f1_execute": 0.936170220375061,
"f1_repeat": 0.0,
"f1_skip": 0.75,
"grad_norm": 4.4375,
"learning_rate": 0.000996644749168557,
"loss": 0.1332,
"macro_f1": 0.5620567798614502,
"num_tokens": 1299674.0,
"repeat_count": 1.0,
"routers_loss": 0.06590834259986877,
"skip_count": 4.0,
"step": 822,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 25.0,
"epoch": 4.480546075085324,
"f1_execute": 0.9200000166893005,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 3.265625,
"learning_rate": 0.0009966028564967785,
"loss": 0.1285,
"macro_f1": 0.4400000274181366,
"num_tokens": 1302843.0,
"repeat_count": 1.0,
"routers_loss": 0.06902799010276794,
"skip_count": 2.0,
"step": 824,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 25.0,
"epoch": 4.491467576791809,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 7.4375,
"learning_rate": 0.0009965607048067137,
"loss": 0.1249,
"macro_f1": 0.44705885648727417,
"num_tokens": 1305575.0,
"repeat_count": 0.0,
"routers_loss": 0.08320864289999008,
"skip_count": 2.0,
"step": 826,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 4.502389078498293,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.65625,
"learning_rate": 0.0009965182941203481,
"loss": 0.1834,
"macro_f1": 0.32098767161369324,
"num_tokens": 1308244.0,
"repeat_count": 0.0,
"routers_loss": 0.12352414429187775,
"skip_count": 1.0,
"step": 828,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.513310580204778,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.9375,
"learning_rate": 0.0009964756244598021,
"loss": 0.0915,
"macro_f1": 0.3333333432674408,
"num_tokens": 1311314.0,
"repeat_count": 0.0,
"routers_loss": 0.014358235523104668,
"skip_count": 0.0,
"step": 830,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.524232081911263,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.65625,
"learning_rate": 0.0009964326958473316,
"loss": 0.102,
"macro_f1": 0.3272727429866791,
"num_tokens": 1315495.0,
"repeat_count": 0.0,
"routers_loss": 0.008667540736496449,
"skip_count": 0.0,
"step": 832,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.535153583617747,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.484375,
"learning_rate": 0.000996389508305327,
"loss": 0.0822,
"macro_f1": 0.3272727429866791,
"num_tokens": 1319132.0,
"repeat_count": 0.0,
"routers_loss": 0.018217027187347412,
"skip_count": 0.0,
"step": 834,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 4.546075085324232,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 10.8125,
"learning_rate": 0.000996346061856314,
"loss": 0.2215,
"macro_f1": 0.31446540355682373,
"num_tokens": 1321294.0,
"repeat_count": 0.0,
"routers_loss": 0.1659325808286667,
"skip_count": 1.0,
"step": 836,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.556996587030717,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.625,
"learning_rate": 0.0009963023565229536,
"loss": 0.1108,
"macro_f1": 0.3272727429866791,
"num_tokens": 1324186.0,
"repeat_count": 0.0,
"routers_loss": 0.11435546725988388,
"skip_count": 0.0,
"step": 838,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.567918088737201,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.34375,
"learning_rate": 0.0009962583923280419,
"loss": 0.1153,
"macro_f1": 0.3333333432674408,
"num_tokens": 1327215.0,
"repeat_count": 0.0,
"routers_loss": 0.001215719268657267,
"skip_count": 0.0,
"step": 840,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.578839590443686,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.5625,
"learning_rate": 0.0009962141692945092,
"loss": 0.1181,
"macro_f1": 0.3272727429866791,
"num_tokens": 1330394.0,
"repeat_count": 1.0,
"routers_loss": 0.05636778846383095,
"skip_count": 0.0,
"step": 842,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 25.0,
"epoch": 4.58976109215017,
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 0.800000011920929,
"grad_norm": 5.53125,
"learning_rate": 0.0009961696874454219,
"loss": 0.0985,
"macro_f1": 0.5934640765190125,
"num_tokens": 1333840.0,
"repeat_count": 0.0,
"routers_loss": 0.17423874139785767,
"skip_count": 2.0,
"step": 844,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 4.600682593856655,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.375,
"learning_rate": 0.0009961249468039806,
"loss": 0.1442,
"macro_f1": 0.3272727429866791,
"num_tokens": 1337481.0,
"repeat_count": 0.0,
"routers_loss": 0.08344361186027527,
"skip_count": 0.0,
"step": 846,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 4.611604095563139,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 8.1875,
"learning_rate": 0.0009960799473935212,
"loss": 0.1287,
"macro_f1": 0.29333335161209106,
"num_tokens": 1340525.0,
"repeat_count": 1.0,
"routers_loss": 0.10816935449838638,
"skip_count": 2.0,
"step": 848,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.622525597269624,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.703125,
"learning_rate": 0.0009960346892375143,
"loss": 0.1476,
"macro_f1": 0.3272727429866791,
"num_tokens": 1344963.0,
"repeat_count": 0.0,
"routers_loss": 0.02773604914546013,
"skip_count": 0.0,
"step": 850,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.633447098976109,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.25,
"learning_rate": 0.000995989172359566,
"loss": 0.074,
"macro_f1": 0.3144654333591461,
"num_tokens": 1347911.0,
"repeat_count": 0.0,
"routers_loss": 0.07946910709142685,
"skip_count": 3.0,
"step": 852,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.6443686006825935,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.5625,
"learning_rate": 0.0009959433967834167,
"loss": 0.0946,
"macro_f1": 0.3272727429866791,
"num_tokens": 1352093.0,
"repeat_count": 0.0,
"routers_loss": 0.20672957599163055,
"skip_count": 1.0,
"step": 854,
"text_loss": 0.0
},
{
"acc_repeat": 0.6666666865348816,
"acc_skip": 0.5,
"avg_layers": 28.0,
"epoch": 4.6552901023890785,
"f1_execute": 0.8780487775802612,
"f1_repeat": 0.6666666865348816,
"f1_skip": 0.6666666865348816,
"grad_norm": 3.109375,
"learning_rate": 0.0009958973625329424,
"loss": 0.1035,
"macro_f1": 0.737127423286438,
"num_tokens": 1355052.0,
"repeat_count": 3.0,
"routers_loss": 0.14273089170455933,
"skip_count": 6.0,
"step": 856,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 4.6662116040955635,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.15625,
"learning_rate": 0.0009958510696321532,
"loss": 0.1217,
"macro_f1": 0.32098764181137085,
"num_tokens": 1358739.0,
"repeat_count": 0.0,
"routers_loss": 0.03209677338600159,
"skip_count": 0.0,
"step": 858,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.6771331058020476,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.609375,
"learning_rate": 0.000995804518105195,
"loss": 0.1511,
"macro_f1": 0.3272727429866791,
"num_tokens": 1361816.0,
"repeat_count": 0.0,
"routers_loss": 0.016142090782523155,
"skip_count": 0.0,
"step": 860,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.6880546075085325,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.03125,
"learning_rate": 0.0009957577079763478,
"loss": 0.1588,
"macro_f1": 0.3333333432674408,
"num_tokens": 1365188.0,
"repeat_count": 0.0,
"routers_loss": 0.005357397720217705,
"skip_count": 0.0,
"step": 862,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.6989761092150175,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.53125,
"learning_rate": 0.0009957106392700272,
"loss": 0.0981,
"macro_f1": 0.3333333432674408,
"num_tokens": 1368207.0,
"repeat_count": 0.0,
"routers_loss": 0.005774896126240492,
"skip_count": 0.0,
"step": 864,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.709897610921502,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.859375,
"learning_rate": 0.000995663312010783,
"loss": 0.1432,
"macro_f1": 0.3333333432674408,
"num_tokens": 1370949.0,
"repeat_count": 0.0,
"routers_loss": 0.0034105523955076933,
"skip_count": 0.0,
"step": 866,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.720819112627987,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.6875,
"learning_rate": 0.0009956157262233003,
"loss": 0.1171,
"macro_f1": 0.3272727429866791,
"num_tokens": 1373855.0,
"repeat_count": 0.0,
"routers_loss": 0.00975721050053835,
"skip_count": 0.0,
"step": 868,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 27.0,
"epoch": 4.731740614334471,
"f1_execute": 0.8979592323303223,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 9.8125,
"learning_rate": 0.000995567881932399,
"loss": 0.1658,
"macro_f1": 0.4326530694961548,
"num_tokens": 1376396.0,
"repeat_count": 1.0,
"routers_loss": 0.3017057776451111,
"skip_count": 3.0,
"step": 870,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.742662116040956,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.5625,
"learning_rate": 0.0009955197791630336,
"loss": 0.141,
"macro_f1": 0.3333333432674408,
"num_tokens": 1379027.0,
"repeat_count": 0.0,
"routers_loss": 0.008239896968007088,
"skip_count": 0.0,
"step": 872,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.753583617747441,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.53125,
"learning_rate": 0.0009954714179402936,
"loss": 0.1144,
"macro_f1": 0.3333333432674408,
"num_tokens": 1382288.0,
"repeat_count": 0.0,
"routers_loss": 0.010364998131990433,
"skip_count": 0.0,
"step": 874,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 4.764505119453925,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 6.53125,
"learning_rate": 0.0009954227982894035,
"loss": 0.1795,
"macro_f1": 0.5492662787437439,
"num_tokens": 1385672.0,
"repeat_count": 0.0,
"routers_loss": 0.15057335793972015,
"skip_count": 1.0,
"step": 876,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 4.77542662116041,
"f1_execute": 0.8799999952316284,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.90625,
"learning_rate": 0.0009953739202357217,
"loss": 0.1139,
"macro_f1": 0.29333335161209106,
"num_tokens": 1389206.0,
"repeat_count": 1.0,
"routers_loss": 0.42493173480033875,
"skip_count": 3.0,
"step": 878,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.786348122866894,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.46875,
"learning_rate": 0.0009953247838047428,
"loss": 0.1882,
"macro_f1": 0.3333333432674408,
"num_tokens": 1392492.0,
"repeat_count": 0.0,
"routers_loss": 0.005968689452856779,
"skip_count": 0.0,
"step": 880,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.797269624573379,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.046875,
"learning_rate": 0.0009952753890220948,
"loss": 0.1183,
"macro_f1": 0.3272727429866791,
"num_tokens": 1395478.0,
"repeat_count": 0.0,
"routers_loss": 0.14635904133319855,
"skip_count": 1.0,
"step": 882,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 4.808191126279864,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.34375,
"learning_rate": 0.0009952257359135417,
"loss": 0.1388,
"macro_f1": 0.3006536066532135,
"num_tokens": 1398518.0,
"repeat_count": 0.0,
"routers_loss": 0.1135154739022255,
"skip_count": 2.0,
"step": 884,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 28.0,
"epoch": 4.819112627986348,
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 4.65625,
"learning_rate": 0.0009951758245049808,
"loss": 0.179,
"macro_f1": 0.5359477400779724,
"num_tokens": 1401259.0,
"repeat_count": 0.0,
"routers_loss": 0.18914444744586945,
"skip_count": 1.0,
"step": 886,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 4.830034129692833,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 2.5,
"learning_rate": 0.0009951256548224455,
"loss": 0.0913,
"macro_f1": 0.6603773832321167,
"num_tokens": 1404149.0,
"repeat_count": 1.0,
"routers_loss": 0.04007445275783539,
"skip_count": 1.0,
"step": 888,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.840955631399318,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.15625,
"learning_rate": 0.000995075226892103,
"loss": 0.129,
"macro_f1": 0.32098767161369324,
"num_tokens": 1406960.0,
"repeat_count": 0.0,
"routers_loss": 0.4282263517379761,
"skip_count": 1.0,
"step": 890,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.5714285969734192,
"avg_layers": 27.0,
"epoch": 4.851877133105802,
"f1_execute": 0.8999999761581421,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.7272727489471436,
"grad_norm": 5.40625,
"learning_rate": 0.0009950245407402557,
"loss": 0.2196,
"macro_f1": 0.8090909719467163,
"num_tokens": 1409634.0,
"repeat_count": 2.0,
"routers_loss": 0.3470841348171234,
"skip_count": 7.0,
"step": 892,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 24.0,
"epoch": 4.862798634812287,
"f1_execute": 0.9795917868614197,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 2.3125,
"learning_rate": 0.0009949735963933404,
"loss": 0.115,
"macro_f1": 0.5487528443336487,
"num_tokens": 1413390.0,
"repeat_count": 1.0,
"routers_loss": 0.05957069247961044,
"skip_count": 2.0,
"step": 894,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.873720136518771,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.53125,
"learning_rate": 0.0009949223938779286,
"loss": 0.0754,
"macro_f1": 0.3333333432674408,
"num_tokens": 1416605.0,
"repeat_count": 0.0,
"routers_loss": 0.002007940784096718,
"skip_count": 0.0,
"step": 896,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 4.884641638225256,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 8.1875,
"learning_rate": 0.000994870933220727,
"loss": 0.1282,
"macro_f1": 0.4803921580314636,
"num_tokens": 1420764.0,
"repeat_count": 0.0,
"routers_loss": 0.08513174206018448,
"skip_count": 2.0,
"step": 898,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.895563139931741,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.09375,
"learning_rate": 0.0009948192144485757,
"loss": 0.0972,
"macro_f1": 0.32098767161369324,
"num_tokens": 1424182.0,
"repeat_count": 0.0,
"routers_loss": 0.03853657469153404,
"skip_count": 1.0,
"step": 900,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 4.906484641638225,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 5.03125,
"learning_rate": 0.0009947672375884506,
"loss": 0.1737,
"macro_f1": 0.6666666865348816,
"num_tokens": 1426986.0,
"repeat_count": 0.0,
"routers_loss": 0.008192243054509163,
"skip_count": 1.0,
"step": 902,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 4.91740614334471,
"f1_execute": 0.9795917868614197,
"f1_repeat": 1.0,
"f1_skip": 0.800000011920929,
"grad_norm": 4.875,
"learning_rate": 0.0009947150026674621,
"loss": 0.0577,
"macro_f1": 0.9265305995941162,
"num_tokens": 1429981.0,
"repeat_count": 1.0,
"routers_loss": 0.06954901665449142,
"skip_count": 2.0,
"step": 904,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.928327645051194,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.234375,
"learning_rate": 0.0009946625097128543,
"loss": 0.168,
"macro_f1": 0.32098767161369324,
"num_tokens": 1432902.0,
"repeat_count": 0.0,
"routers_loss": 0.0880909413099289,
"skip_count": 1.0,
"step": 906,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 4.939249146757679,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 1.9921875,
"learning_rate": 0.000994609758752007,
"loss": 0.1445,
"macro_f1": 0.3272727429866791,
"num_tokens": 1436788.0,
"repeat_count": 1.0,
"routers_loss": 0.5064544081687927,
"skip_count": 0.0,
"step": 908,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 4.950170648464164,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 4.0625,
"learning_rate": 0.0009945567498124339,
"loss": 0.1658,
"macro_f1": 0.5492662787437439,
"num_tokens": 1439507.0,
"repeat_count": 0.0,
"routers_loss": 0.019065011292696,
"skip_count": 2.0,
"step": 910,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.961092150170648,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.40625,
"learning_rate": 0.0009945034829217832,
"loss": 0.0968,
"macro_f1": 0.3272727429866791,
"num_tokens": 1442860.0,
"repeat_count": 0.0,
"routers_loss": 0.018776487559080124,
"skip_count": 0.0,
"step": 912,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 4.972013651877133,
"f1_execute": 0.9230769276618958,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.71875,
"learning_rate": 0.0009944499581078382,
"loss": 0.1252,
"macro_f1": 0.3076923191547394,
"num_tokens": 1446637.0,
"repeat_count": 0.0,
"routers_loss": 0.1531504988670349,
"skip_count": 2.0,
"step": 914,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 4.982935153583618,
"f1_execute": 0.943396270275116,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.21875,
"learning_rate": 0.000994396175398516,
"loss": 0.0992,
"macro_f1": 0.3144654333591461,
"num_tokens": 1450238.0,
"repeat_count": 0.0,
"routers_loss": 0.1735955774784088,
"skip_count": 0.0,
"step": 916,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 4.993856655290102,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 7.6875,
"learning_rate": 0.000994342134821869,
"loss": 0.1523,
"macro_f1": 0.3272727429866791,
"num_tokens": 1453160.0,
"repeat_count": 0.0,
"routers_loss": 0.15269255638122559,
"skip_count": 0.0,
"step": 918,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 5.0,
"f1_execute": 0.9433962106704712,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 12.4375,
"learning_rate": 0.0009942878364060837,
"loss": 0.1131,
"macro_f1": 0.31446540355682373,
"num_tokens": 1454580.0,
"repeat_count": 1.0,
"routers_loss": 0.2639358341693878,
"skip_count": 0.0,
"step": 920,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 5.010921501706485,
"f1_execute": 0.9803921580314636,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 5.09375,
"learning_rate": 0.0009942332801794807,
"loss": 0.1702,
"macro_f1": 0.6601307392120361,
"num_tokens": 1457292.0,
"repeat_count": 0.0,
"routers_loss": 0.043732915073633194,
"skip_count": 2.0,
"step": 922,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 28.0,
"epoch": 5.021843003412969,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 2.8125,
"learning_rate": 0.000994178466170516,
"loss": 0.1107,
"macro_f1": 0.6538461446762085,
"num_tokens": 1460434.0,
"repeat_count": 1.0,
"routers_loss": 0.36936479806900024,
"skip_count": 1.0,
"step": 924,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 5.032764505119454,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 2.09375,
"learning_rate": 0.0009941233944077788,
"loss": 0.0547,
"macro_f1": 0.6666666865348816,
"num_tokens": 1463373.0,
"repeat_count": 0.0,
"routers_loss": 0.0019650806207209826,
"skip_count": 1.0,
"step": 926,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.043686006825938,
"f1_execute": 0.9629629254341125,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 1.125,
"learning_rate": 0.000994068064919994,
"loss": 0.0665,
"macro_f1": 0.32098764181137085,
"num_tokens": 1466927.0,
"repeat_count": 1.0,
"routers_loss": 0.06489580124616623,
"skip_count": 1.0,
"step": 928,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 27.0,
"epoch": 5.054607508532423,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.203125,
"learning_rate": 0.0009940124777360203,
"loss": 0.0898,
"macro_f1": 0.3272727429866791,
"num_tokens": 1469834.0,
"repeat_count": 0.0,
"routers_loss": 0.013250669464468956,
"skip_count": 0.0,
"step": 930,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.065529010238908,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.421875,
"learning_rate": 0.0009939566328848507,
"loss": 0.0616,
"macro_f1": 0.3272727429866791,
"num_tokens": 1472714.0,
"repeat_count": 0.0,
"routers_loss": 0.03642500564455986,
"skip_count": 1.0,
"step": 932,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 5.076450511945392,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 2.015625,
"learning_rate": 0.000993900530395613,
"loss": 0.0672,
"macro_f1": 0.5492662787437439,
"num_tokens": 1476458.0,
"repeat_count": 0.0,
"routers_loss": 0.019950609654188156,
"skip_count": 2.0,
"step": 934,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 5.087372013651877,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 4.34375,
"learning_rate": 0.0009938441702975688,
"loss": 0.0714,
"macro_f1": 0.5492662787437439,
"num_tokens": 1479499.0,
"repeat_count": 0.0,
"routers_loss": 0.05769496411085129,
"skip_count": 2.0,
"step": 936,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 26.0,
"epoch": 5.098293515358362,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 3.09375,
"learning_rate": 0.000993787552620115,
"loss": 0.0647,
"macro_f1": 0.6666666865348816,
"num_tokens": 1482112.0,
"repeat_count": 0.0,
"routers_loss": 0.006518410053104162,
"skip_count": 2.0,
"step": 938,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 5.109215017064846,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 3.359375,
"learning_rate": 0.0009937306773927816,
"loss": 0.0569,
"macro_f1": 0.5492662787437439,
"num_tokens": 1485128.0,
"repeat_count": 0.0,
"routers_loss": 0.16481046378612518,
"skip_count": 2.0,
"step": 940,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.120136518771331,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.375,
"learning_rate": 0.0009936735446452341,
"loss": 0.0689,
"macro_f1": 0.3333333432674408,
"num_tokens": 1487854.0,
"repeat_count": 0.0,
"routers_loss": 0.00462290458381176,
"skip_count": 0.0,
"step": 942,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.131058020477815,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.0625,
"learning_rate": 0.0009936161544072716,
"loss": 0.0596,
"macro_f1": 0.3333333432674408,
"num_tokens": 1490795.0,
"repeat_count": 0.0,
"routers_loss": 0.0042699906043708324,
"skip_count": 0.0,
"step": 944,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 5.1419795221843,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 2.78125,
"learning_rate": 0.0009935585067088275,
"loss": 0.1091,
"macro_f1": 0.5492662787437439,
"num_tokens": 1494150.0,
"repeat_count": 0.0,
"routers_loss": 0.01713154837489128,
"skip_count": 2.0,
"step": 946,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.152901023890785,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.640625,
"learning_rate": 0.0009935006015799703,
"loss": 0.0893,
"macro_f1": 0.3333333432674408,
"num_tokens": 1497517.0,
"repeat_count": 0.0,
"routers_loss": 0.014775852672755718,
"skip_count": 0.0,
"step": 948,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 5.163822525597269,
"f1_execute": 0.9629629850387573,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.828125,
"learning_rate": 0.0009934424390509017,
"loss": 0.1128,
"macro_f1": 0.32098767161369324,
"num_tokens": 1500944.0,
"repeat_count": 0.0,
"routers_loss": 0.08066675066947937,
"skip_count": 1.0,
"step": 950,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.25,
"avg_layers": 27.0,
"epoch": 5.174744027303754,
"f1_execute": 0.9411765336990356,
"f1_repeat": 0.0,
"f1_skip": 0.4000000059604645,
"grad_norm": 1.421875,
"learning_rate": 0.0009933840191519584,
"loss": 0.0536,
"macro_f1": 0.44705885648727417,
"num_tokens": 1504267.0,
"repeat_count": 0.0,
"routers_loss": 0.10788286477327347,
"skip_count": 4.0,
"step": 952,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.3333333432674408,
"avg_layers": 28.0,
"epoch": 5.1856655290102385,
"f1_execute": 0.9600000381469727,
"f1_repeat": 1.0,
"f1_skip": 0.5,
"grad_norm": 2.1875,
"learning_rate": 0.0009933253419136107,
"loss": 0.0582,
"macro_f1": 0.8200000524520874,
"num_tokens": 1507688.0,
"repeat_count": 1.0,
"routers_loss": 0.088263139128685,
"skip_count": 3.0,
"step": 954,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.1965870307167235,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.90625,
"learning_rate": 0.000993266407366464,
"loss": 0.0989,
"macro_f1": 0.3333333432674408,
"num_tokens": 1510658.0,
"repeat_count": 0.0,
"routers_loss": 0.005081284325569868,
"skip_count": 0.0,
"step": 956,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 5.207508532423208,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 1.59375,
"learning_rate": 0.000993207215541257,
"loss": 0.0562,
"macro_f1": 0.5492662787437439,
"num_tokens": 1515152.0,
"repeat_count": 0.0,
"routers_loss": 0.025190535932779312,
"skip_count": 2.0,
"step": 958,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
"avg_layers": 29.0,
"epoch": 5.2184300341296925,
"f1_execute": 1.0,
"f1_repeat": 1.0,
"f1_skip": 0.0,
"grad_norm": 2.3125,
"learning_rate": 0.000993147766468863,
"loss": 0.0672,
"macro_f1": 0.6666666865348816,
"num_tokens": 1518790.0,
"repeat_count": 1.0,
"routers_loss": 0.007869229651987553,
"skip_count": 0.0,
"step": 960,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 5.2293515358361775,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 1.609375,
"learning_rate": 0.0009930880601802898,
"loss": 0.0658,
"macro_f1": 0.5427350401878357,
"num_tokens": 1522153.0,
"repeat_count": 1.0,
"routers_loss": 0.15375611186027527,
"skip_count": 2.0,
"step": 962,
"text_loss": 0.0
},
{
"acc_repeat": 0.6666666865348816,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.2402730375426625,
"f1_execute": 0.8444444537162781,
"f1_repeat": 0.800000011920929,
"f1_skip": 0.0,
"grad_norm": 5.15625,
"learning_rate": 0.0009930280967066787,
"loss": 0.1698,
"macro_f1": 0.5481481552124023,
"num_tokens": 1525054.0,
"repeat_count": 3.0,
"routers_loss": 0.3285106122493744,
"skip_count": 4.0,
"step": 964,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 5.251194539249147,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 2.71875,
"learning_rate": 0.0009929678760793057,
"loss": 0.0853,
"macro_f1": 0.4871794879436493,
"num_tokens": 1528654.0,
"repeat_count": 0.0,
"routers_loss": 0.06668563932180405,
"skip_count": 2.0,
"step": 966,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 5.262116040955632,
"f1_execute": 0.9166666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 2.734375,
"learning_rate": 0.0009929073983295804,
"loss": 0.0927,
"macro_f1": 0.5277777910232544,
"num_tokens": 1531379.0,
"repeat_count": 2.0,
"routers_loss": 0.2843759059906006,
"skip_count": 4.0,
"step": 968,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 5.273037542662116,
"f1_execute": 0.936170220375061,
"f1_repeat": 0.0,
"f1_skip": 0.5714285373687744,
"grad_norm": 2.265625,
"learning_rate": 0.0009928466634890473,
"loss": 0.0759,
"macro_f1": 0.502532958984375,
"num_tokens": 1534519.0,
"repeat_count": 1.0,
"routers_loss": 0.061425577849149704,
"skip_count": 4.0,
"step": 970,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 5.283959044368601,
"f1_execute": 0.9615384340286255,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 3.859375,
"learning_rate": 0.0009927856715893839,
"loss": 0.1502,
"macro_f1": 0.4871794879436493,
"num_tokens": 1537641.0,
"repeat_count": 0.0,
"routers_loss": 0.12876227498054504,
"skip_count": 2.0,
"step": 972,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 5.294880546075086,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.5,
"grad_norm": 2.703125,
"learning_rate": 0.0009927244226624029,
"loss": 0.0589,
"macro_f1": 0.4803921580314636,
"num_tokens": 1540885.0,
"repeat_count": 1.0,
"routers_loss": 0.24013344943523407,
"skip_count": 2.0,
"step": 974,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 5.30580204778157,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 1.0,
"grad_norm": 2.578125,
"learning_rate": 0.00099266291674005,
"loss": 0.1553,
"macro_f1": 0.6666666865348816,
"num_tokens": 1545093.0,
"repeat_count": 0.0,
"routers_loss": 0.008588392287492752,
"skip_count": 1.0,
"step": 976,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.316723549488055,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 5.0625,
"learning_rate": 0.000992601153854406,
"loss": 0.0732,
"macro_f1": 0.3272727429866791,
"num_tokens": 1547669.0,
"repeat_count": 0.0,
"routers_loss": 0.1047874391078949,
"skip_count": 1.0,
"step": 978,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 26.0,
"epoch": 5.327645051194539,
"f1_execute": 0.8571428656578064,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 6.15625,
"learning_rate": 0.000992539134037685,
"loss": 0.1686,
"macro_f1": 0.2857142984867096,
"num_tokens": 1550684.0,
"repeat_count": 1.0,
"routers_loss": 0.3830685019493103,
"skip_count": 2.0,
"step": 980,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.338566552901024,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.296875,
"learning_rate": 0.0009924768573222353,
"loss": 0.0979,
"macro_f1": 0.3333333432674408,
"num_tokens": 1553458.0,
"repeat_count": 0.0,
"routers_loss": 0.0034001434687525034,
"skip_count": 0.0,
"step": 982,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.349488054607509,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.515625,
"learning_rate": 0.0009924143237405392,
"loss": 0.0553,
"macro_f1": 0.3333333432674408,
"num_tokens": 1557067.0,
"repeat_count": 0.0,
"routers_loss": 0.0015051440568640828,
"skip_count": 0.0,
"step": 984,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 25.0,
"epoch": 5.360409556313993,
"f1_execute": 0.9019607901573181,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 4.375,
"learning_rate": 0.0009923515333252128,
"loss": 0.0821,
"macro_f1": 0.3006536066532135,
"num_tokens": 1560210.0,
"repeat_count": 0.0,
"routers_loss": 0.38080108165740967,
"skip_count": 2.0,
"step": 986,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 1.0,
"avg_layers": 27.0,
"epoch": 5.371331058020478,
"f1_execute": 0.9411764740943909,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 5.34375,
"learning_rate": 0.0009922884861090068,
"loss": 0.104,
"macro_f1": 0.5359477400779724,
"num_tokens": 1563164.0,
"repeat_count": 1.0,
"routers_loss": 0.15402451157569885,
"skip_count": 1.0,
"step": 988,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.382252559726963,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 2.6875,
"learning_rate": 0.0009922251821248053,
"loss": 0.0596,
"macro_f1": 0.3333333432674408,
"num_tokens": 1566178.0,
"repeat_count": 0.0,
"routers_loss": 0.0008378620259463787,
"skip_count": 0.0,
"step": 990,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.393174061433447,
"f1_execute": 0.9818181991577148,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 3.5,
"learning_rate": 0.0009921616214056258,
"loss": 0.0858,
"macro_f1": 0.3272727429866791,
"num_tokens": 1568705.0,
"repeat_count": 0.0,
"routers_loss": 0.1363816112279892,
"skip_count": 1.0,
"step": 992,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 26.0,
"epoch": 5.404095563139932,
"f1_execute": 0.9166666865348816,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 3.125,
"learning_rate": 0.000992097803984621,
"loss": 0.0683,
"macro_f1": 0.5277777910232544,
"num_tokens": 1571934.0,
"repeat_count": 2.0,
"routers_loss": 0.15122386813163757,
"skip_count": 4.0,
"step": 994,
"text_loss": 0.0
},
{
"acc_repeat": 1.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.415017064846416,
"f1_execute": 0.9615384340286255,
"f1_repeat": 1.0,
"f1_skip": 0.0,
"grad_norm": 2.328125,
"learning_rate": 0.0009920337298950765,
"loss": 0.12,
"macro_f1": 0.6538461446762085,
"num_tokens": 1574947.0,
"repeat_count": 1.0,
"routers_loss": 0.16266369819641113,
"skip_count": 1.0,
"step": 996,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.0,
"avg_layers": 28.0,
"epoch": 5.425938566552901,
"f1_execute": 1.0,
"f1_repeat": 0.0,
"f1_skip": 0.0,
"grad_norm": 1.3203125,
"learning_rate": 0.0009919693991704123,
"loss": 0.0627,
"macro_f1": 0.3333333432674408,
"num_tokens": 1577895.0,
"repeat_count": 0.0,
"routers_loss": 0.002958054654300213,
"skip_count": 0.0,
"step": 998,
"text_loss": 0.0
},
{
"acc_repeat": 0.0,
"acc_skip": 0.5,
"avg_layers": 27.0,
"epoch": 5.436860068259386,
"f1_execute": 0.9811320900917053,
"f1_repeat": 0.0,
"f1_skip": 0.6666666865348816,
"grad_norm": 3.703125,
"learning_rate": 0.0009919048118441818,
"loss": 0.1173,
"macro_f1": 0.5492662787437439,
"num_tokens": 1581513.0,
"repeat_count": 0.0,
"routers_loss": 0.08616811782121658,
"skip_count": 2.0,
"step": 1000,
"text_loss": 0.0
}
],
"logging_steps": 2,
"max_steps": 9200,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.7215681060599736e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}