{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 46.94863516289991, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.009392427355444672, "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.25, "learning_rate": 2e-06, "loss": 0.4974, "macro_f1": 0.23255813121795654, "num_tokens": 3175.0, "repeat_count": 0.0, "routers_loss": 0.4339469373226166, "skip_count": 0.0, "step": 2, "text_loss": 0.3330848515033722 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 23.0, "epoch": 0.018784854710889344, "f1_execute": 0.7272726893424988, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8359375, "learning_rate": 6e-06, "loss": 0.4988, "macro_f1": 0.24242423474788666, "num_tokens": 5816.0, "repeat_count": 0.0, "routers_loss": 0.4511934816837311, "skip_count": 1.0, "step": 4, "text_loss": 0.4571273922920227 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.02817728206633402, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.234375, "learning_rate": 1e-05, "loss": 0.5113, "macro_f1": 0.222222238779068, "num_tokens": 9739.0, "repeat_count": 0.0, "routers_loss": 0.49306994676589966, "skip_count": 0.0, "step": 6, "text_loss": 0.41060560941696167 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.03756970942177869, "f1_execute": 0.5641025900840759, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7265625, "learning_rate": 1.4e-05, "loss": 0.4766, "macro_f1": 0.18803420662879944, "num_tokens": 12869.0, "repeat_count": 1.0, "routers_loss": 0.48872503638267517, "skip_count": 2.0, "step": 8, "text_loss": 0.36678561568260193 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.046962136777223364, "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.78125, "learning_rate": 1.8e-05, "loss": 0.4806, "macro_f1": 0.23255813121795654, "num_tokens": 15845.0, "repeat_count": 0.0, "routers_loss": 0.45077216625213623, "skip_count": 0.0, "step": 10, "text_loss": 0.5597779154777527 }, { "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 0.05635456413266804, "f1_execute": 0.7179487347602844, "f1_repeat": 0.2857142984867096, "f1_skip": 0.20000000298023224, "grad_norm": 1.5390625, "learning_rate": 2.2e-05, "loss": 0.4557, "macro_f1": 0.40122103691101074, "num_tokens": 19353.0, "repeat_count": 2.0, "routers_loss": 0.4130440056324005, "skip_count": 3.0, "step": 12, "text_loss": 0.2056603729724884 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.06574699148811271, "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.4375, "learning_rate": 2.6e-05, "loss": 0.5129, "macro_f1": 0.23255813121795654, "num_tokens": 22675.0, "repeat_count": 0.0, "routers_loss": 0.4582902193069458, "skip_count": 0.0, "step": 14, "text_loss": 0.32989829778671265 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 0.07513941884355738, "f1_execute": 0.6829268336296082, "f1_repeat": 0.0, "f1_skip": 0.2222222238779068, "grad_norm": 1.7421875, "learning_rate": 3e-05, "loss": 0.4729, "macro_f1": 0.3017163574695587, "num_tokens": 26022.0, "repeat_count": 0.0, "routers_loss": 0.42910993099212646, "skip_count": 1.0, "step": 16, "text_loss": 0.1353905349969864 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.08453184619900206, "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4765625, "learning_rate": 3.4000000000000007e-05, "loss": 0.4274, "macro_f1": 0.2518518567085266, "num_tokens": 29251.0, "repeat_count": 0.0, "routers_loss": 0.3990713059902191, "skip_count": 0.0, "step": 18, "text_loss": 0.3806765377521515 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.09392427355444673, "f1_execute": 0.6829268336296082, "f1_repeat": 0.2857142984867096, "f1_skip": 0.0, "grad_norm": 1.3125, "learning_rate": 3.8e-05, "loss": 0.4261, "macro_f1": 0.3228803873062134, "num_tokens": 32545.0, "repeat_count": 1.0, "routers_loss": 0.40146592259407043, "skip_count": 0.0, "step": 20, "text_loss": 0.25648367404937744 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.1033167009098914, "f1_execute": 0.7272727489471436, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.625, "learning_rate": 4.2000000000000004e-05, "loss": 0.404, "macro_f1": 0.24242424964904785, "num_tokens": 36560.0, "repeat_count": 0.0, "routers_loss": 0.372715026140213, "skip_count": 0.0, "step": 22, "text_loss": 0.2799522578716278 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.11270912826533608, "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6328125, "learning_rate": 4.6e-05, "loss": 0.4218, "macro_f1": 0.2518518567085266, "num_tokens": 39597.0, "repeat_count": 0.0, "routers_loss": 0.4504941403865814, "skip_count": 0.0, "step": 24, "text_loss": 0.6635695695877075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.12210155562078075, "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7109375, "learning_rate": 5e-05, "loss": 0.3886, "macro_f1": 0.26950353384017944, "num_tokens": 43080.0, "repeat_count": 0.0, "routers_loss": 0.3498791456222534, "skip_count": 0.0, "step": 26, "text_loss": 0.7035041451454163 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.13149398297622542, "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.34375, "learning_rate": 5.4e-05, "loss": 0.3724, "macro_f1": 0.26950353384017944, "num_tokens": 46406.0, "repeat_count": 0.0, "routers_loss": 0.31265875697135925, "skip_count": 0.0, "step": 28, "text_loss": 0.6388277411460876 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.1408864103316701, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2578125, "learning_rate": 5.800000000000001e-05, "loss": 0.341, "macro_f1": 0.2857142686843872, "num_tokens": 49966.0, "repeat_count": 0.0, "routers_loss": 0.3200918138027191, "skip_count": 2.0, "step": 30, "text_loss": 0.17372547090053558 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.15027883768711475, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4140625, "learning_rate": 6.2e-05, "loss": 0.3207, "macro_f1": 0.2857142686843872, "num_tokens": 53378.0, "repeat_count": 1.0, "routers_loss": 0.32304447889328003, "skip_count": 1.0, "step": 32, "text_loss": 0.18196581304073334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.15967126504255943, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.46875, "learning_rate": 6.6e-05, "loss": 0.3304, "macro_f1": 0.3006536364555359, "num_tokens": 56933.0, "repeat_count": 0.0, "routers_loss": 0.24814388155937195, "skip_count": 0.0, "step": 34, "text_loss": 0.28823015093803406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.16906369239800412, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1171875, "learning_rate": 7.000000000000001e-05, "loss": 0.2778, "macro_f1": 0.3006536066532135, "num_tokens": 60744.0, "repeat_count": 1.0, "routers_loss": 0.22411039471626282, "skip_count": 0.0, "step": 36, "text_loss": 0.5260357856750488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.17845611975344877, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.484375, "learning_rate": 7.4e-05, "loss": 0.2738, "macro_f1": 0.2857142984867096, "num_tokens": 64900.0, "repeat_count": 0.0, "routers_loss": 0.44355395436286926, "skip_count": 0.0, "step": 38, "text_loss": 0.5382097363471985 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.18784854710889345, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3828125, "learning_rate": 7.8e-05, "loss": 0.2137, "macro_f1": 0.3076923191547394, "num_tokens": 68000.0, "repeat_count": 0.0, "routers_loss": 0.202330082654953, "skip_count": 0.0, "step": 40, "text_loss": 0.5946118831634521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.19724097446433814, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.78125, "learning_rate": 8.2e-05, "loss": 0.21, "macro_f1": 0.3144654333591461, "num_tokens": 70529.0, "repeat_count": 0.0, "routers_loss": 0.18023855984210968, "skip_count": 0.0, "step": 42, "text_loss": 0.5550904273986816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2066334018197828, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.609375, "learning_rate": 8.599999999999999e-05, "loss": 0.1918, "macro_f1": 0.32098764181137085, "num_tokens": 73427.0, "repeat_count": 2.0, "routers_loss": 0.2101590931415558, "skip_count": 0.0, "step": 44, "text_loss": 0.4636923372745514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.21602582917522747, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.53125, "learning_rate": 8.999999999999999e-05, "loss": 0.1881, "macro_f1": 0.3333333432674408, "num_tokens": 76472.0, "repeat_count": 0.0, "routers_loss": 0.11800424009561539, "skip_count": 0.0, "step": 46, "text_loss": 0.4187001883983612 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.22541825653067216, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.953125, "learning_rate": 9.400000000000001e-05, "loss": 0.1446, "macro_f1": 0.3272727429866791, "num_tokens": 79124.0, "repeat_count": 1.0, "routers_loss": 0.11632519960403442, "skip_count": 0.0, "step": 48, "text_loss": 0.2253919243812561 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.2348106838861168, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.58984375, "learning_rate": 9.800000000000001e-05, "loss": 0.1543, "macro_f1": 0.32098767161369324, "num_tokens": 81980.0, "repeat_count": 1.0, "routers_loss": 0.09669367223978043, "skip_count": 0.0, "step": 50, "text_loss": 0.6053179502487183 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.2442031112415615, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8515625, "learning_rate": 0.000102, "loss": 0.1393, "macro_f1": 0.32098764181137085, "num_tokens": 85236.0, "repeat_count": 0.0, "routers_loss": 0.12471720576286316, "skip_count": 0.0, "step": 52, "text_loss": 0.6027331948280334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2535955385970062, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.421875, "learning_rate": 0.000106, "loss": 0.1473, "macro_f1": 0.32098764181137085, "num_tokens": 88238.0, "repeat_count": 0.0, "routers_loss": 0.1376056969165802, "skip_count": 2.0, "step": 54, "text_loss": 0.2861751616001129 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.26298796595245083, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.35546875, "learning_rate": 0.00011, "loss": 0.1082, "macro_f1": 0.3333333432674408, "num_tokens": 91056.0, "repeat_count": 0.0, "routers_loss": 0.07449393719434738, "skip_count": 0.0, "step": 56, "text_loss": 0.48106974363327026 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.2723803933078955, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.271484375, "learning_rate": 0.000114, "loss": 0.1123, "macro_f1": 0.32098764181137085, "num_tokens": 94987.0, "repeat_count": 0.0, "routers_loss": 0.07064720243215561, "skip_count": 0.0, "step": 58, "text_loss": 0.3554874658584595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2817728206633402, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5390625, "learning_rate": 0.000118, "loss": 0.1234, "macro_f1": 0.32098764181137085, "num_tokens": 97909.0, "repeat_count": 0.0, "routers_loss": 0.16835889220237732, "skip_count": 2.0, "step": 60, "text_loss": 0.5475804805755615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.29116524801878485, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.000122, "loss": 0.1224, "macro_f1": 0.3333333432674408, "num_tokens": 101043.0, "repeat_count": 0.0, "routers_loss": 0.06127442046999931, "skip_count": 0.0, "step": 62, "text_loss": 0.5966938734054565 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3005576753742295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.000126, "loss": 0.0931, "macro_f1": 0.3333333432674408, "num_tokens": 104103.0, "repeat_count": 0.0, "routers_loss": 0.047825805842876434, "skip_count": 0.0, "step": 64, "text_loss": 0.5480486750602722 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3099501027296742, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.00013000000000000002, "loss": 0.1088, "macro_f1": 0.3006536364555359, "num_tokens": 107009.0, "repeat_count": 1.0, "routers_loss": 0.275174081325531, "skip_count": 4.0, "step": 66, "text_loss": 0.41714492440223694 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.31934253008511887, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.000134, "loss": 0.1123, "macro_f1": 0.3333333432674408, "num_tokens": 110486.0, "repeat_count": 0.0, "routers_loss": 0.029025178402662277, "skip_count": 0.0, "step": 68, "text_loss": 0.6775627732276917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3287349574405635, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.314453125, "learning_rate": 0.00013800000000000002, "loss": 0.1049, "macro_f1": 0.3272727429866791, "num_tokens": 113878.0, "repeat_count": 0.0, "routers_loss": 0.10141710191965103, "skip_count": 1.0, "step": 70, "text_loss": 0.6678873896598816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.33812738479600823, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.248046875, "learning_rate": 0.00014199999999999998, "loss": 0.1119, "macro_f1": 0.3272727429866791, "num_tokens": 116989.0, "repeat_count": 0.0, "routers_loss": 0.08002066612243652, "skip_count": 1.0, "step": 72, "text_loss": 0.405692994594574 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3475198121514529, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1787109375, "learning_rate": 0.000146, "loss": 0.0944, "macro_f1": 0.3144654333591461, "num_tokens": 119883.0, "repeat_count": 0.0, "routers_loss": 0.1867009848356247, "skip_count": 3.0, "step": 74, "text_loss": 0.44616150856018066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.35691223950689754, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.333984375, "learning_rate": 0.00015, "loss": 0.1003, "macro_f1": 0.32098764181137085, "num_tokens": 123325.0, "repeat_count": 0.0, "routers_loss": 0.07042168825864792, "skip_count": 2.0, "step": 76, "text_loss": 0.11340200901031494 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.36630466686234225, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26171875, "learning_rate": 0.000154, "loss": 0.1066, "macro_f1": 0.32098764181137085, "num_tokens": 126131.0, "repeat_count": 0.0, "routers_loss": 0.11535373330116272, "skip_count": 2.0, "step": 78, "text_loss": 0.3269135355949402 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3756970942177869, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.000158, "loss": 0.0891, "macro_f1": 0.3272727429866791, "num_tokens": 130349.0, "repeat_count": 0.0, "routers_loss": 0.09497501701116562, "skip_count": 1.0, "step": 80, "text_loss": 0.15273472666740417 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.38508952157323156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.000162, "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 133607.0, "repeat_count": 0.0, "routers_loss": 0.030639523640275, "skip_count": 0.0, "step": 82, "text_loss": 0.282884806394577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3944819489286763, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.00016600000000000002, "loss": 0.1254, "macro_f1": 0.3272727429866791, "num_tokens": 136694.0, "repeat_count": 0.0, "routers_loss": 0.07906441390514374, "skip_count": 1.0, "step": 84, "text_loss": 0.459094375371933 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.40387437628412093, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.00017, "loss": 0.1071, "macro_f1": 0.3144654333591461, "num_tokens": 139966.0, "repeat_count": 1.0, "routers_loss": 0.1124570444226265, "skip_count": 2.0, "step": 86, "text_loss": 0.29985448718070984 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4132668036395656, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25390625, "learning_rate": 0.000174, "loss": 0.1031, "macro_f1": 0.32098764181137085, "num_tokens": 142788.0, "repeat_count": 2.0, "routers_loss": 0.1966402679681778, "skip_count": 0.0, "step": 88, "text_loss": 0.6435291767120361 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4226592309950103, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.349609375, "learning_rate": 0.000178, "loss": 0.0963, "macro_f1": 0.3333333432674408, "num_tokens": 146192.0, "repeat_count": 0.0, "routers_loss": 0.0325632207095623, "skip_count": 0.0, "step": 90, "text_loss": 0.35170626640319824 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.43205165835045495, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2265625, "learning_rate": 0.000182, "loss": 0.1073, "macro_f1": 0.32098764181137085, "num_tokens": 149792.0, "repeat_count": 1.0, "routers_loss": 0.15115146338939667, "skip_count": 1.0, "step": 92, "text_loss": 0.83159339427948 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4414440857058996, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.205078125, "learning_rate": 0.000186, "loss": 0.1073, "macro_f1": 0.3333333432674408, "num_tokens": 152766.0, "repeat_count": 0.0, "routers_loss": 0.043313540518283844, "skip_count": 0.0, "step": 94, "text_loss": 0.49707934260368347 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4508365130613443, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.00019, "loss": 0.0947, "macro_f1": 0.3333333432674408, "num_tokens": 156112.0, "repeat_count": 0.0, "routers_loss": 0.032021280378103256, "skip_count": 0.0, "step": 96, "text_loss": 0.27608928084373474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.46022894041678897, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2099609375, "learning_rate": 0.000194, "loss": 0.0846, "macro_f1": 0.3076923191547394, "num_tokens": 159454.0, "repeat_count": 2.0, "routers_loss": 0.24473154544830322, "skip_count": 2.0, "step": 98, "text_loss": 0.6026689410209656 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4696213677722336, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.271484375, "learning_rate": 0.00019800000000000002, "loss": 0.1028, "macro_f1": 0.32098764181137085, "num_tokens": 163661.0, "repeat_count": 0.0, "routers_loss": 0.11468276381492615, "skip_count": 2.0, "step": 100, "text_loss": 0.46733155846595764 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.47901379512767833, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.000202, "loss": 0.1089, "macro_f1": 0.3333333432674408, "num_tokens": 167134.0, "repeat_count": 0.0, "routers_loss": 0.021144939586520195, "skip_count": 0.0, "step": 102, "text_loss": 0.6362994909286499 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.488406222483123, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.000206, "loss": 0.0621, "macro_f1": 0.3272727429866791, "num_tokens": 170433.0, "repeat_count": 0.0, "routers_loss": 0.06594710797071457, "skip_count": 1.0, "step": 104, "text_loss": 0.4515477120876312 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.49779864983856764, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.00021, "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 173387.0, "repeat_count": 0.0, "routers_loss": 0.032923027873039246, "skip_count": 0.0, "step": 106, "text_loss": 0.6638453006744385 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5071910771940124, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.240234375, "learning_rate": 0.000214, "loss": 0.0883, "macro_f1": 0.3272727429866791, "num_tokens": 176170.0, "repeat_count": 1.0, "routers_loss": 0.08034781366586685, "skip_count": 0.0, "step": 108, "text_loss": 1.186936855316162 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.516583504549457, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000218, "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 179877.0, "repeat_count": 0.0, "routers_loss": 0.07814185321331024, "skip_count": 1.0, "step": 110, "text_loss": 0.5488709211349487 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5259759319049017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.000222, "loss": 0.0946, "macro_f1": 0.3333333432674408, "num_tokens": 182726.0, "repeat_count": 0.0, "routers_loss": 0.01884695515036583, "skip_count": 0.0, "step": 112, "text_loss": 0.5195863842964172 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5353683592603463, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19921875, "learning_rate": 0.00022600000000000002, "loss": 0.0974, "macro_f1": 0.32098764181137085, "num_tokens": 185624.0, "repeat_count": 0.0, "routers_loss": 0.09657823294401169, "skip_count": 2.0, "step": 114, "text_loss": 0.43858134746551514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.544760786615791, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3046875, "learning_rate": 0.00023, "loss": 0.0753, "macro_f1": 0.3333333432674408, "num_tokens": 188155.0, "repeat_count": 0.0, "routers_loss": 0.01463601179420948, "skip_count": 0.0, "step": 116, "text_loss": 0.392981618642807 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5541532139712357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.439453125, "learning_rate": 0.00023400000000000002, "loss": 0.0843, "macro_f1": 0.3333333432674408, "num_tokens": 190970.0, "repeat_count": 0.0, "routers_loss": 0.03859659656882286, "skip_count": 0.0, "step": 118, "text_loss": 0.309179425239563 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5635456413266804, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2255859375, "learning_rate": 0.00023799999999999998, "loss": 0.053, "macro_f1": 0.3333333432674408, "num_tokens": 193988.0, "repeat_count": 0.0, "routers_loss": 0.019092386588454247, "skip_count": 0.0, "step": 120, "text_loss": 0.48543134331703186 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.572938068682125, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.35546875, "learning_rate": 0.000242, "loss": 0.1203, "macro_f1": 0.3272727429866791, "num_tokens": 196475.0, "repeat_count": 0.0, "routers_loss": 0.0619138665497303, "skip_count": 1.0, "step": 122, "text_loss": 0.4615364074707031 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5823304960375697, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1875, "learning_rate": 0.000246, "loss": 0.1002, "macro_f1": 0.3272727429866791, "num_tokens": 200045.0, "repeat_count": 1.0, "routers_loss": 0.09752107411623001, "skip_count": 0.0, "step": 124, "text_loss": 0.15802054107189178 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5917229233930144, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.00025, "loss": 0.0773, "macro_f1": 0.3333333432674408, "num_tokens": 203214.0, "repeat_count": 0.0, "routers_loss": 0.02896115928888321, "skip_count": 0.0, "step": 126, "text_loss": 0.4543360471725464 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.601115350748459, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4296875, "learning_rate": 0.000254, "loss": 0.0973, "macro_f1": 0.3333333432674408, "num_tokens": 206168.0, "repeat_count": 0.0, "routers_loss": 0.011423567309975624, "skip_count": 0.0, "step": 128, "text_loss": 0.4730179011821747 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6105077781039038, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.365234375, "learning_rate": 0.00025800000000000004, "loss": 0.099, "macro_f1": 0.3333333432674408, "num_tokens": 209907.0, "repeat_count": 0.0, "routers_loss": 0.01957600563764572, "skip_count": 0.0, "step": 130, "text_loss": 0.45122358202934265 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6199002054593484, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2060546875, "learning_rate": 0.000262, "loss": 0.0868, "macro_f1": 0.3272727429866791, "num_tokens": 213521.0, "repeat_count": 0.0, "routers_loss": 0.04882373288273811, "skip_count": 1.0, "step": 132, "text_loss": 0.4341491758823395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6292926328147931, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.000266, "loss": 0.0834, "macro_f1": 0.3333333432674408, "num_tokens": 216484.0, "repeat_count": 0.0, "routers_loss": 0.016083380207419395, "skip_count": 0.0, "step": 134, "text_loss": 0.46990111470222473 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6386850601702377, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.220703125, "learning_rate": 0.00027, "loss": 0.0863, "macro_f1": 0.3333333432674408, "num_tokens": 219398.0, "repeat_count": 0.0, "routers_loss": 0.01733536459505558, "skip_count": 0.0, "step": 136, "text_loss": 0.4455361068248749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6480774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.00027400000000000005, "loss": 0.0997, "macro_f1": 0.3333333432674408, "num_tokens": 222430.0, "repeat_count": 0.0, "routers_loss": 0.01332803163677454, "skip_count": 0.0, "step": 138, "text_loss": 0.47699397802352905 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.657469914881127, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.333984375, "learning_rate": 0.00027800000000000004, "loss": 0.0922, "macro_f1": 0.3144654333591461, "num_tokens": 225458.0, "repeat_count": 1.0, "routers_loss": 0.14924728870391846, "skip_count": 2.0, "step": 140, "text_loss": 0.5858222842216492 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6668623422365718, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25, "learning_rate": 0.00028199999999999997, "loss": 0.0798, "macro_f1": 0.3144654333591461, "num_tokens": 229365.0, "repeat_count": 1.0, "routers_loss": 0.1860177218914032, "skip_count": 2.0, "step": 142, "text_loss": 0.5003137588500977 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6762547695920165, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.00028599999999999996, "loss": 0.054, "macro_f1": 0.32098764181137085, "num_tokens": 231787.0, "repeat_count": 1.0, "routers_loss": 0.16498211026191711, "skip_count": 1.0, "step": 144, "text_loss": 0.5026470422744751 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6856471969474611, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.00029, "loss": 0.0936, "macro_f1": 0.32098764181137085, "num_tokens": 235014.0, "repeat_count": 1.0, "routers_loss": 0.11801310628652573, "skip_count": 1.0, "step": 146, "text_loss": 0.611888587474823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6950396243029058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.000294, "loss": 0.0878, "macro_f1": 0.3333333432674408, "num_tokens": 238210.0, "repeat_count": 0.0, "routers_loss": 0.02422776259481907, "skip_count": 0.0, "step": 148, "text_loss": 0.2876914143562317 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7044320516583504, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.000298, "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 241582.0, "repeat_count": 0.0, "routers_loss": 0.07282499223947525, "skip_count": 2.0, "step": 150, "text_loss": 0.3919292390346527 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7138244790137951, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.37890625, "learning_rate": 0.000302, "loss": 0.0797, "macro_f1": 0.32098764181137085, "num_tokens": 244621.0, "repeat_count": 1.0, "routers_loss": 0.20659038424491882, "skip_count": 1.0, "step": 152, "text_loss": 0.4294498860836029 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7232169063692399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1787109375, "learning_rate": 0.000306, "loss": 0.072, "macro_f1": 0.3333333432674408, "num_tokens": 247833.0, "repeat_count": 0.0, "routers_loss": 0.02428400330245495, "skip_count": 0.0, "step": 154, "text_loss": 0.5930765867233276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7326093337246845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.00031, "loss": 0.0772, "macro_f1": 0.3333333432674408, "num_tokens": 251349.0, "repeat_count": 0.0, "routers_loss": 0.0167869683355093, "skip_count": 0.0, "step": 156, "text_loss": 0.41063904762268066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7420017610801292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.000314, "loss": 0.0821, "macro_f1": 0.3333333432674408, "num_tokens": 254886.0, "repeat_count": 0.0, "routers_loss": 0.02531604655086994, "skip_count": 0.0, "step": 158, "text_loss": 0.6739020347595215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7513941884355738, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.201171875, "learning_rate": 0.00031800000000000003, "loss": 0.09, "macro_f1": 0.3333333432674408, "num_tokens": 258260.0, "repeat_count": 0.0, "routers_loss": 0.017772775143384933, "skip_count": 0.0, "step": 160, "text_loss": 0.46873849630355835 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7607866157910185, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.224609375, "learning_rate": 0.000322, "loss": 0.0893, "macro_f1": 0.3272727429866791, "num_tokens": 261846.0, "repeat_count": 0.0, "routers_loss": 0.034902360290288925, "skip_count": 1.0, "step": 162, "text_loss": 0.3727971017360687 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7701790431464631, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.000326, "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 264348.0, "repeat_count": 0.0, "routers_loss": 0.013553355820477009, "skip_count": 0.0, "step": 164, "text_loss": 0.5798237323760986 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7795714705019078, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.408203125, "learning_rate": 0.00033, "loss": 0.0926, "macro_f1": 0.32098764181137085, "num_tokens": 267479.0, "repeat_count": 1.0, "routers_loss": 0.13571743667125702, "skip_count": 1.0, "step": 166, "text_loss": 0.8084776997566223 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7889638978573525, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2431640625, "learning_rate": 0.00033400000000000004, "loss": 0.0817, "macro_f1": 0.32098764181137085, "num_tokens": 270268.0, "repeat_count": 2.0, "routers_loss": 0.19884146749973297, "skip_count": 0.0, "step": 168, "text_loss": 0.7366134524345398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7983563252127972, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.00033800000000000003, "loss": 0.1022, "macro_f1": 0.32098764181137085, "num_tokens": 273518.0, "repeat_count": 1.0, "routers_loss": 0.15469175577163696, "skip_count": 1.0, "step": 170, "text_loss": 0.27204006910324097 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8077487525682419, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.000342, "loss": 0.0865, "macro_f1": 0.32098764181137085, "num_tokens": 277210.0, "repeat_count": 0.0, "routers_loss": 0.08603330701589584, "skip_count": 2.0, "step": 172, "text_loss": 0.7137667536735535 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8171411799236865, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000346, "loss": 0.0902, "macro_f1": 0.3076923191547394, "num_tokens": 280389.0, "repeat_count": 0.0, "routers_loss": 0.17851492762565613, "skip_count": 4.0, "step": 174, "text_loss": 0.5148105621337891 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8265336072791312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.00035, "loss": 0.0853, "macro_f1": 0.3333333432674408, "num_tokens": 283501.0, "repeat_count": 0.0, "routers_loss": 0.021331604570150375, "skip_count": 0.0, "step": 176, "text_loss": 0.301013320684433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8359260346345758, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.000354, "loss": 0.0911, "macro_f1": 0.32098764181137085, "num_tokens": 287154.0, "repeat_count": 0.0, "routers_loss": 0.057273946702480316, "skip_count": 2.0, "step": 178, "text_loss": 0.4740981459617615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8453184619900206, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.240234375, "learning_rate": 0.000358, "loss": 0.0904, "macro_f1": 0.3272727429866791, "num_tokens": 289929.0, "repeat_count": 0.0, "routers_loss": 0.04116598889231682, "skip_count": 1.0, "step": 180, "text_loss": 0.4838573932647705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8547108893454652, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.000362, "loss": 0.0991, "macro_f1": 0.3333333432674408, "num_tokens": 294293.0, "repeat_count": 0.0, "routers_loss": 0.027111956849694252, "skip_count": 0.0, "step": 182, "text_loss": 0.7495553493499756 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8641033167009099, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.158203125, "learning_rate": 0.000366, "loss": 0.1038, "macro_f1": 0.3333333432674408, "num_tokens": 297730.0, "repeat_count": 0.0, "routers_loss": 0.019166452810168266, "skip_count": 0.0, "step": 184, "text_loss": 0.534831166267395 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 0.8734957440563546, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2236328125, "learning_rate": 0.00037, "loss": 0.0784, "macro_f1": 0.5427350401878357, "num_tokens": 300593.0, "repeat_count": 1.0, "routers_loss": 0.2349659502506256, "skip_count": 2.0, "step": 186, "text_loss": 0.3549048602581024 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8828881714117992, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2041015625, "learning_rate": 0.000374, "loss": 0.0827, "macro_f1": 0.3076923191547394, "num_tokens": 303456.0, "repeat_count": 2.0, "routers_loss": 0.22502389550209045, "skip_count": 2.0, "step": 188, "text_loss": 0.8837642073631287 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8922805987672439, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.271484375, "learning_rate": 0.000378, "loss": 0.1085, "macro_f1": 0.3272727429866791, "num_tokens": 306241.0, "repeat_count": 1.0, "routers_loss": 0.12291611731052399, "skip_count": 0.0, "step": 190, "text_loss": 0.73353511095047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9016730261226886, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000382, "loss": 0.0969, "macro_f1": 0.3272727429866791, "num_tokens": 310606.0, "repeat_count": 0.0, "routers_loss": 0.055988848209381104, "skip_count": 1.0, "step": 192, "text_loss": 0.6261917352676392 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9110654534781333, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.34375, "learning_rate": 0.000386, "loss": 0.1055, "macro_f1": 0.3144654333591461, "num_tokens": 313564.0, "repeat_count": 0.0, "routers_loss": 0.12363404780626297, "skip_count": 3.0, "step": 194, "text_loss": 0.2790874242782593 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9204578808335779, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.27734375, "learning_rate": 0.00039000000000000005, "loss": 0.0964, "macro_f1": 0.3076923191547394, "num_tokens": 316958.0, "repeat_count": 2.0, "routers_loss": 0.2718356251716614, "skip_count": 2.0, "step": 196, "text_loss": 0.14428086578845978 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9298503081890226, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2021484375, "learning_rate": 0.00039400000000000004, "loss": 0.0917, "macro_f1": 0.32098764181137085, "num_tokens": 320103.0, "repeat_count": 0.0, "routers_loss": 0.07188102602958679, "skip_count": 2.0, "step": 198, "text_loss": 0.27155816555023193 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9392427355444672, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.201171875, "learning_rate": 0.000398, "loss": 0.0809, "macro_f1": 0.32098764181137085, "num_tokens": 323566.0, "repeat_count": 1.0, "routers_loss": 0.18038256466388702, "skip_count": 1.0, "step": 200, "text_loss": 0.8453494310379028 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9486351628999119, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.000402, "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 326385.0, "repeat_count": 0.0, "routers_loss": 0.014639763161540031, "skip_count": 0.0, "step": 202, "text_loss": 0.5733131766319275 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9580275902553567, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21875, "learning_rate": 0.00040600000000000006, "loss": 0.104, "macro_f1": 0.3333333432674408, "num_tokens": 329266.0, "repeat_count": 0.0, "routers_loss": 0.015269627794623375, "skip_count": 0.0, "step": 204, "text_loss": 0.7355639934539795 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9674200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.27734375, "learning_rate": 0.00041, "loss": 0.0833, "macro_f1": 0.3333333432674408, "num_tokens": 332984.0, "repeat_count": 0.0, "routers_loss": 0.018046971410512924, "skip_count": 0.0, "step": 206, "text_loss": 0.587641179561615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.976812444966246, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.185546875, "learning_rate": 0.000414, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 335739.0, "repeat_count": 1.0, "routers_loss": 0.12791286408901215, "skip_count": 0.0, "step": 208, "text_loss": 0.6538406610488892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9862048723216906, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.24609375, "learning_rate": 0.00041799999999999997, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 338966.0, "repeat_count": 0.0, "routers_loss": 0.050490595400333405, "skip_count": 1.0, "step": 210, "text_loss": 0.4188295602798462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9955972996771353, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.271484375, "learning_rate": 0.000422, "loss": 0.0588, "macro_f1": 0.3144654333591461, "num_tokens": 342063.0, "repeat_count": 0.0, "routers_loss": 0.11652113497257233, "skip_count": 3.0, "step": 212, "text_loss": 0.21822240948677063 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0046962136777224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2060546875, "learning_rate": 0.000426, "loss": 0.0621, "macro_f1": 0.3333333432674408, "num_tokens": 344887.0, "repeat_count": 0.0, "routers_loss": 0.023898238316178322, "skip_count": 0.0, "step": 214, "text_loss": 0.24692800641059875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.014088641033167, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3671875, "learning_rate": 0.00043, "loss": 0.1005, "macro_f1": 0.3272727429866791, "num_tokens": 348700.0, "repeat_count": 1.0, "routers_loss": 0.06414655596017838, "skip_count": 0.0, "step": 216, "text_loss": 0.4744548797607422 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0234810683886117, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.00043400000000000003, "loss": 0.0753, "macro_f1": 0.32098764181137085, "num_tokens": 351507.0, "repeat_count": 1.0, "routers_loss": 0.11702914535999298, "skip_count": 1.0, "step": 218, "text_loss": 0.5614864826202393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0328734957440564, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000438, "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 354484.0, "repeat_count": 0.0, "routers_loss": 0.014991643838584423, "skip_count": 0.0, "step": 220, "text_loss": 0.47209832072257996 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.042265923099501, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.251953125, "learning_rate": 0.000442, "loss": 0.106, "macro_f1": 0.3272727429866791, "num_tokens": 357954.0, "repeat_count": 0.0, "routers_loss": 0.04747112840414047, "skip_count": 1.0, "step": 222, "text_loss": 0.2968728244304657 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0516583504549457, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.40234375, "learning_rate": 0.000446, "loss": 0.0853, "macro_f1": 0.32098764181137085, "num_tokens": 360547.0, "repeat_count": 0.0, "routers_loss": 0.06754162162542343, "skip_count": 2.0, "step": 224, "text_loss": 0.2364148646593094 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0610507778103904, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2412109375, "learning_rate": 0.00045000000000000004, "loss": 0.1016, "macro_f1": 0.3272727429866791, "num_tokens": 364529.0, "repeat_count": 0.0, "routers_loss": 0.07830183953046799, "skip_count": 1.0, "step": 226, "text_loss": 0.4787476360797882 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.070443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.00045400000000000003, "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 367683.0, "repeat_count": 0.0, "routers_loss": 0.015735948458313942, "skip_count": 0.0, "step": 228, "text_loss": 0.37148505449295044 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25, "learning_rate": 0.000458, "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 371402.0, "repeat_count": 0.0, "routers_loss": 0.013354359194636345, "skip_count": 0.0, "step": 230, "text_loss": 0.7464763522148132 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.000462, "loss": 0.0731, "macro_f1": 0.3333333432674408, "num_tokens": 374587.0, "repeat_count": 0.0, "routers_loss": 0.013763721100986004, "skip_count": 0.0, "step": 232, "text_loss": 0.8754443526268005 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.098620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3984375, "learning_rate": 0.00046600000000000005, "loss": 0.0861, "macro_f1": 0.3333333432674408, "num_tokens": 377513.0, "repeat_count": 0.0, "routers_loss": 0.010075435042381287, "skip_count": 0.0, "step": 234, "text_loss": 0.31534913182258606 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1080129145876136, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.00047, "loss": 0.0791, "macro_f1": 0.3272727429866791, "num_tokens": 380736.0, "repeat_count": 0.0, "routers_loss": 0.059825167059898376, "skip_count": 1.0, "step": 236, "text_loss": 0.5936337113380432 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1174053419430585, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000474, "loss": 0.0514, "macro_f1": 0.32098764181137085, "num_tokens": 383236.0, "repeat_count": 0.0, "routers_loss": 0.09134846180677414, "skip_count": 2.0, "step": 238, "text_loss": 0.5976157784461975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1267977692985032, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.208984375, "learning_rate": 0.00047799999999999996, "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 385778.0, "repeat_count": 1.0, "routers_loss": 0.11989791691303253, "skip_count": 1.0, "step": 240, "text_loss": 0.3554210960865021 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1361901966539478, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.000482, "loss": 0.0734, "macro_f1": 0.3333333432674408, "num_tokens": 388777.0, "repeat_count": 0.0, "routers_loss": 0.013591105118393898, "skip_count": 0.0, "step": 242, "text_loss": 0.4829460382461548 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1455826240093925, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12060546875, "learning_rate": 0.000486, "loss": 0.0625, "macro_f1": 0.32098764181137085, "num_tokens": 391797.0, "repeat_count": 0.0, "routers_loss": 0.0920003354549408, "skip_count": 2.0, "step": 244, "text_loss": 0.3085818886756897 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1549750513648371, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.00049, "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 396485.0, "repeat_count": 0.0, "routers_loss": 0.0129330949857831, "skip_count": 0.0, "step": 246, "text_loss": 0.42803969979286194 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1643674787202818, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.296875, "learning_rate": 0.000494, "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 399923.0, "repeat_count": 0.0, "routers_loss": 0.10677755624055862, "skip_count": 3.0, "step": 248, "text_loss": 0.2908555567264557 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1737599060757264, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.203125, "learning_rate": 0.000498, "loss": 0.0812, "macro_f1": 0.3144654333591461, "num_tokens": 403647.0, "repeat_count": 0.0, "routers_loss": 0.1504337340593338, "skip_count": 3.0, "step": 250, "text_loss": 0.333095908164978 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.183152333431171, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.22265625, "learning_rate": 0.0005020000000000001, "loss": 0.0828, "macro_f1": 0.32098764181137085, "num_tokens": 409147.0, "repeat_count": 0.0, "routers_loss": 0.06503184884786606, "skip_count": 2.0, "step": 252, "text_loss": 0.16117942333221436 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1925447607866158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.287109375, "learning_rate": 0.000506, "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 412072.0, "repeat_count": 0.0, "routers_loss": 0.016280122101306915, "skip_count": 0.0, "step": 254, "text_loss": 0.4217492640018463 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2019371881420604, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21484375, "learning_rate": 0.00051, "loss": 0.0803, "macro_f1": 0.3144654333591461, "num_tokens": 415052.0, "repeat_count": 2.0, "routers_loss": 0.2117508500814438, "skip_count": 1.0, "step": 256, "text_loss": 0.5795308947563171 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.211329615497505, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2421875, "learning_rate": 0.000514, "loss": 0.0668, "macro_f1": 0.3272727429866791, "num_tokens": 418099.0, "repeat_count": 1.0, "routers_loss": 0.15002092719078064, "skip_count": 0.0, "step": 258, "text_loss": 0.4840938448905945 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2207220428529497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.000518, "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 422526.0, "repeat_count": 0.0, "routers_loss": 0.012834074907004833, "skip_count": 0.0, "step": 260, "text_loss": 0.36141225695610046 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2301144702083944, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.000522, "loss": 0.085, "macro_f1": 0.3076923191547394, "num_tokens": 425765.0, "repeat_count": 2.0, "routers_loss": 0.23808011412620544, "skip_count": 2.0, "step": 262, "text_loss": 0.27572691440582275 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2395068975638392, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.000526, "loss": 0.0708, "macro_f1": 0.3272727429866791, "num_tokens": 429048.0, "repeat_count": 0.0, "routers_loss": 0.055687375366687775, "skip_count": 1.0, "step": 264, "text_loss": 0.37020301818847656 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.248899324919284, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2080078125, "learning_rate": 0.0005300000000000001, "loss": 0.0839, "macro_f1": 0.3272727429866791, "num_tokens": 431784.0, "repeat_count": 0.0, "routers_loss": 0.0872957780957222, "skip_count": 1.0, "step": 266, "text_loss": 0.5937283039093018 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2582917522747286, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.0005340000000000001, "loss": 0.0733, "macro_f1": 0.32098764181137085, "num_tokens": 434297.0, "repeat_count": 2.0, "routers_loss": 0.23507654666900635, "skip_count": 0.0, "step": 268, "text_loss": 0.3367372453212738 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2676841796301732, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2431640625, "learning_rate": 0.0005380000000000001, "loss": 0.0708, "macro_f1": 0.32098764181137085, "num_tokens": 437586.0, "repeat_count": 0.0, "routers_loss": 0.12860390543937683, "skip_count": 2.0, "step": 270, "text_loss": 0.7149854302406311 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2770766069856179, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2451171875, "learning_rate": 0.0005420000000000001, "loss": 0.1072, "macro_f1": 0.3272727429866791, "num_tokens": 440649.0, "repeat_count": 0.0, "routers_loss": 0.044308312237262726, "skip_count": 1.0, "step": 272, "text_loss": 0.26778292655944824 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2864690343410625, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.44921875, "learning_rate": 0.000546, "loss": 0.0938, "macro_f1": 0.3144654333591461, "num_tokens": 443907.0, "repeat_count": 0.0, "routers_loss": 0.11514109373092651, "skip_count": 3.0, "step": 274, "text_loss": 0.23578761518001556 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.2958614616965072, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2578125, "learning_rate": 0.00055, "loss": 0.0932, "macro_f1": 0.5492662787437439, "num_tokens": 447147.0, "repeat_count": 0.0, "routers_loss": 0.055705297738313675, "skip_count": 2.0, "step": 276, "text_loss": 0.2513524889945984 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3052538890519518, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.29296875, "learning_rate": 0.000554, "loss": 0.0667, "macro_f1": 0.32098764181137085, "num_tokens": 450032.0, "repeat_count": 0.0, "routers_loss": 0.13778971135616302, "skip_count": 2.0, "step": 278, "text_loss": 0.4857243597507477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3146463164073965, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.185546875, "learning_rate": 0.000558, "loss": 0.0672, "macro_f1": 0.3272727429866791, "num_tokens": 453195.0, "repeat_count": 1.0, "routers_loss": 0.0700262188911438, "skip_count": 0.0, "step": 280, "text_loss": 0.7589789628982544 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3240387437628411, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25, "learning_rate": 0.0005620000000000001, "loss": 0.0603, "macro_f1": 0.3144654333591461, "num_tokens": 455942.0, "repeat_count": 1.0, "routers_loss": 0.11706235259771347, "skip_count": 2.0, "step": 282, "text_loss": 0.4783432185649872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3334311711182858, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.265625, "learning_rate": 0.000566, "loss": 0.0793, "macro_f1": 0.3272727429866791, "num_tokens": 458932.0, "repeat_count": 0.0, "routers_loss": 0.07073967158794403, "skip_count": 1.0, "step": 284, "text_loss": 0.7117193937301636 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3428235984737307, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.00057, "loss": 0.0915, "macro_f1": 0.3272727429866791, "num_tokens": 462650.0, "repeat_count": 0.0, "routers_loss": 0.05301115661859512, "skip_count": 1.0, "step": 286, "text_loss": 0.4175460636615753 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.352216025829175, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.000574, "loss": 0.0675, "macro_f1": 0.3272727429866791, "num_tokens": 466290.0, "repeat_count": 0.0, "routers_loss": 0.06356479972600937, "skip_count": 1.0, "step": 288, "text_loss": 0.5832946300506592 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.36160845318462, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28515625, "learning_rate": 0.000578, "loss": 0.0805, "macro_f1": 0.3006536066532135, "num_tokens": 469296.0, "repeat_count": 1.0, "routers_loss": 0.21032999455928802, "skip_count": 3.0, "step": 290, "text_loss": 0.36023473739624023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3710008805400646, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.27734375, "learning_rate": 0.0005819999999999999, "loss": 0.0685, "macro_f1": 0.32098764181137085, "num_tokens": 472272.0, "repeat_count": 1.0, "routers_loss": 0.08062280714511871, "skip_count": 1.0, "step": 292, "text_loss": 0.37197956442832947 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3803933078955093, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28125, "learning_rate": 0.0005859999999999999, "loss": 0.0878, "macro_f1": 0.32098764181137085, "num_tokens": 475864.0, "repeat_count": 0.0, "routers_loss": 0.05023600533604622, "skip_count": 2.0, "step": 294, "text_loss": 0.4765273630619049 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.389785735250954, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2177734375, "learning_rate": 0.00059, "loss": 0.0728, "macro_f1": 0.3333333432674408, "num_tokens": 478916.0, "repeat_count": 0.0, "routers_loss": 0.011689410544931889, "skip_count": 0.0, "step": 296, "text_loss": 0.5878773927688599 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3991781626063986, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000594, "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 482369.0, "repeat_count": 0.0, "routers_loss": 0.010772093199193478, "skip_count": 0.0, "step": 298, "text_loss": 0.4424116313457489 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4085705899618433, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.181640625, "learning_rate": 0.000598, "loss": 0.0787, "macro_f1": 0.3076923191547394, "num_tokens": 486049.0, "repeat_count": 2.0, "routers_loss": 0.23482851684093475, "skip_count": 2.0, "step": 300, "text_loss": 0.21217775344848633 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.417963017317288, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2080078125, "learning_rate": 0.000602, "loss": 0.073, "macro_f1": 0.3076923191547394, "num_tokens": 488683.0, "repeat_count": 1.0, "routers_loss": 0.18843084573745728, "skip_count": 3.0, "step": 302, "text_loss": 0.2109498232603073 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4273554446727326, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.279296875, "learning_rate": 0.000606, "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 492010.0, "repeat_count": 0.0, "routers_loss": 0.17861786484718323, "skip_count": 3.0, "step": 304, "text_loss": 0.8446305394172668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4367478720281772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.00061, "loss": 0.0827, "macro_f1": 0.3333333432674408, "num_tokens": 494764.0, "repeat_count": 0.0, "routers_loss": 0.014124520123004913, "skip_count": 0.0, "step": 306, "text_loss": 0.742735743522644 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4461402993836219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26953125, "learning_rate": 0.000614, "loss": 0.1071, "macro_f1": 0.3333333432674408, "num_tokens": 497820.0, "repeat_count": 0.0, "routers_loss": 0.017968112602829933, "skip_count": 0.0, "step": 308, "text_loss": 0.28305482864379883 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4555327267390665, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1689453125, "learning_rate": 0.0006180000000000001, "loss": 0.0775, "macro_f1": 0.32098764181137085, "num_tokens": 500694.0, "repeat_count": 0.0, "routers_loss": 0.08593655377626419, "skip_count": 2.0, "step": 310, "text_loss": 0.3496848940849304 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4649251540945114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19140625, "learning_rate": 0.000622, "loss": 0.061, "macro_f1": 0.3333333432674408, "num_tokens": 503871.0, "repeat_count": 0.0, "routers_loss": 0.016449492424726486, "skip_count": 0.0, "step": 312, "text_loss": 0.6691372990608215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4743175814499558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.205078125, "learning_rate": 0.000626, "loss": 0.0815, "macro_f1": 0.3333333432674408, "num_tokens": 506730.0, "repeat_count": 0.0, "routers_loss": 0.014532964676618576, "skip_count": 0.0, "step": 314, "text_loss": 0.6118118166923523 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4837100088054007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2216796875, "learning_rate": 0.00063, "loss": 0.0742, "macro_f1": 0.3333333432674408, "num_tokens": 510323.0, "repeat_count": 0.0, "routers_loss": 0.013093139044940472, "skip_count": 0.0, "step": 316, "text_loss": 0.38126271963119507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4931024361608454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.400390625, "learning_rate": 0.000634, "loss": 0.0915, "macro_f1": 0.3333333432674408, "num_tokens": 514075.0, "repeat_count": 0.0, "routers_loss": 0.008627045899629593, "skip_count": 0.0, "step": 318, "text_loss": 0.5983037948608398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.50249486351629, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.000638, "loss": 0.1008, "macro_f1": 0.3272727429866791, "num_tokens": 517418.0, "repeat_count": 0.0, "routers_loss": 0.04561378434300423, "skip_count": 1.0, "step": 320, "text_loss": 0.767257034778595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.5118872908717347, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.259765625, "learning_rate": 0.000642, "loss": 0.0926, "macro_f1": 0.3272727429866791, "num_tokens": 520443.0, "repeat_count": 0.0, "routers_loss": 0.024372953921556473, "skip_count": 0.0, "step": 322, "text_loss": 0.6572105884552002 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5212797182271793, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30078125, "learning_rate": 0.000646, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 523317.0, "repeat_count": 1.0, "routers_loss": 0.08099937438964844, "skip_count": 0.0, "step": 324, "text_loss": 0.205499529838562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.530672145582624, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.0006500000000000001, "loss": 0.0809, "macro_f1": 0.32098767161369324, "num_tokens": 526355.0, "repeat_count": 0.0, "routers_loss": 0.0657225176692009, "skip_count": 1.0, "step": 326, "text_loss": 0.2587239742279053 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5400645729380686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.111328125, "learning_rate": 0.0006540000000000001, "loss": 0.0779, "macro_f1": 0.3333333432674408, "num_tokens": 529689.0, "repeat_count": 0.0, "routers_loss": 0.01849208027124405, "skip_count": 0.0, "step": 328, "text_loss": 0.2172023057937622 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5494570002935135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1845703125, "learning_rate": 0.0006580000000000001, "loss": 0.0758, "macro_f1": 0.3333333432674408, "num_tokens": 532603.0, "repeat_count": 0.0, "routers_loss": 0.016184113919734955, "skip_count": 0.0, "step": 330, "text_loss": 0.5980568528175354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.220703125, "learning_rate": 0.000662, "loss": 0.0439, "macro_f1": 0.3333333432674408, "num_tokens": 536056.0, "repeat_count": 0.0, "routers_loss": 0.01303898449987173, "skip_count": 0.0, "step": 332, "text_loss": 0.5421966314315796 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 1.5682418550044028, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.296875, "learning_rate": 0.000666, "loss": 0.0963, "macro_f1": 0.465986430644989, "num_tokens": 539231.0, "repeat_count": 3.0, "routers_loss": 0.3075675964355469, "skip_count": 3.0, "step": 334, "text_loss": 0.19719554483890533 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5776342823598473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.00067, "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 542038.0, "repeat_count": 0.0, "routers_loss": 0.009116224013268948, "skip_count": 0.0, "step": 336, "text_loss": 0.3407036066055298 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5870267097152921, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2421875, "learning_rate": 0.000674, "loss": 0.0768, "macro_f1": 0.3333333432674408, "num_tokens": 545019.0, "repeat_count": 0.0, "routers_loss": 0.021463042125105858, "skip_count": 0.0, "step": 338, "text_loss": 0.24486012756824493 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5964191370707366, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.0006780000000000001, "loss": 0.0889, "macro_f1": 0.3333333432674408, "num_tokens": 548036.0, "repeat_count": 0.0, "routers_loss": 0.01857556402683258, "skip_count": 0.0, "step": 340, "text_loss": 0.28140124678611755 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6058115644261814, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.0006820000000000001, "loss": 0.0617, "macro_f1": 0.3006536364555359, "num_tokens": 551419.0, "repeat_count": 2.0, "routers_loss": 0.27090007066726685, "skip_count": 3.0, "step": 342, "text_loss": 0.20690307021141052 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.615203991781626, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3046875, "learning_rate": 0.0006860000000000001, "loss": 0.1047, "macro_f1": 0.32098764181137085, "num_tokens": 554037.0, "repeat_count": 0.0, "routers_loss": 0.09231195598840714, "skip_count": 2.0, "step": 344, "text_loss": 0.4479128420352936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6245964191370708, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.00069, "loss": 0.0883, "macro_f1": 0.3333333432674408, "num_tokens": 556672.0, "repeat_count": 0.0, "routers_loss": 0.00935924518853426, "skip_count": 0.0, "step": 346, "text_loss": 0.6377320289611816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6339888464925154, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.000694, "loss": 0.0781, "macro_f1": 0.32098764181137085, "num_tokens": 559756.0, "repeat_count": 0.0, "routers_loss": 0.17641772329807281, "skip_count": 2.0, "step": 348, "text_loss": 0.6097636222839355 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.64338127384796, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.30078125, "learning_rate": 0.0006979999999999999, "loss": 0.0616, "macro_f1": 0.5492662787437439, "num_tokens": 563415.0, "repeat_count": 0.0, "routers_loss": 0.06240406632423401, "skip_count": 2.0, "step": 350, "text_loss": 0.5291631817817688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6527737012034047, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.296875, "learning_rate": 0.0007019999999999999, "loss": 0.1026, "macro_f1": 0.3333333432674408, "num_tokens": 566357.0, "repeat_count": 0.0, "routers_loss": 0.012269247323274612, "skip_count": 0.0, "step": 352, "text_loss": 0.5170195698738098 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6621661285588494, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0007059999999999999, "loss": 0.0815, "macro_f1": 0.32098764181137085, "num_tokens": 569449.0, "repeat_count": 0.0, "routers_loss": 0.07515309751033783, "skip_count": 2.0, "step": 354, "text_loss": 0.34507250785827637 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6715585559142943, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.00071, "loss": 0.0791, "macro_f1": 0.3144654333591461, "num_tokens": 572761.0, "repeat_count": 1.0, "routers_loss": 0.20768006145954132, "skip_count": 2.0, "step": 356, "text_loss": 0.3158532381057739 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6809509832697387, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.000714, "loss": 0.0682, "macro_f1": 0.3333333432674408, "num_tokens": 575909.0, "repeat_count": 0.0, "routers_loss": 0.025329967960715294, "skip_count": 0.0, "step": 358, "text_loss": 0.21455390751361847 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.6903434106251836, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21484375, "learning_rate": 0.000718, "loss": 0.0775, "macro_f1": 0.32098767161369324, "num_tokens": 579186.0, "repeat_count": 1.0, "routers_loss": 0.07676175981760025, "skip_count": 0.0, "step": 360, "text_loss": 0.61895352602005 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.699735837980628, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.197265625, "learning_rate": 0.000722, "loss": 0.0781, "macro_f1": 0.32098767161369324, "num_tokens": 582437.0, "repeat_count": 0.0, "routers_loss": 0.08070661872625351, "skip_count": 1.0, "step": 362, "text_loss": 0.20557661354541779 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7091282653360729, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2216796875, "learning_rate": 0.000726, "loss": 0.11, "macro_f1": 0.3333333432674408, "num_tokens": 586096.0, "repeat_count": 0.0, "routers_loss": 0.015891313552856445, "skip_count": 0.0, "step": 364, "text_loss": 0.597991943359375 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7185206926915173, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.00073, "loss": 0.0573, "macro_f1": 0.3076923191547394, "num_tokens": 589520.0, "repeat_count": 1.0, "routers_loss": 0.12844261527061462, "skip_count": 3.0, "step": 366, "text_loss": 0.2944789230823517 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7279131200469622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.000734, "loss": 0.1005, "macro_f1": 0.3333333432674408, "num_tokens": 592691.0, "repeat_count": 0.0, "routers_loss": 0.02382199838757515, "skip_count": 0.0, "step": 368, "text_loss": 0.23989969491958618 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7373055474024068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.000738, "loss": 0.0661, "macro_f1": 0.3333333432674408, "num_tokens": 596004.0, "repeat_count": 0.0, "routers_loss": 0.018812084570527077, "skip_count": 0.0, "step": 370, "text_loss": 0.22111408412456512 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7466979747578515, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2412109375, "learning_rate": 0.000742, "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 599087.0, "repeat_count": 0.0, "routers_loss": 0.08290331065654755, "skip_count": 1.0, "step": 372, "text_loss": 0.2567356526851654 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7560904021132961, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2412109375, "learning_rate": 0.000746, "loss": 0.0941, "macro_f1": 0.32098764181137085, "num_tokens": 602330.0, "repeat_count": 1.0, "routers_loss": 0.11482042074203491, "skip_count": 1.0, "step": 374, "text_loss": 0.7217292785644531 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7654828294687408, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2265625, "learning_rate": 0.00075, "loss": 0.0728, "macro_f1": 0.3272727429866791, "num_tokens": 605503.0, "repeat_count": 1.0, "routers_loss": 0.11849870532751083, "skip_count": 0.0, "step": 376, "text_loss": 0.5122153759002686 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.7748752568241855, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2333984375, "learning_rate": 0.000754, "loss": 0.0835, "macro_f1": 0.32098767161369324, "num_tokens": 608505.0, "repeat_count": 0.0, "routers_loss": 0.07090992480516434, "skip_count": 1.0, "step": 378, "text_loss": 0.2204965502023697 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.78426768417963, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.000758, "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 611193.0, "repeat_count": 0.0, "routers_loss": 0.03812089189887047, "skip_count": 1.0, "step": 380, "text_loss": 0.44909021258354187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.793660111535075, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1689453125, "learning_rate": 0.000762, "loss": 0.0882, "macro_f1": 0.3272727429866791, "num_tokens": 614231.0, "repeat_count": 1.0, "routers_loss": 0.10270529240369797, "skip_count": 0.0, "step": 382, "text_loss": 0.13624964654445648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8030525388905194, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.330078125, "learning_rate": 0.0007660000000000001, "loss": 0.1107, "macro_f1": 0.32098764181137085, "num_tokens": 617090.0, "repeat_count": 1.0, "routers_loss": 0.11624004691839218, "skip_count": 1.0, "step": 384, "text_loss": 0.7314052581787109 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8124449662459643, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0007700000000000001, "loss": 0.0628, "macro_f1": 0.32098764181137085, "num_tokens": 620596.0, "repeat_count": 0.0, "routers_loss": 0.07114322483539581, "skip_count": 2.0, "step": 386, "text_loss": 0.503322958946228 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8218373936014087, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.0007740000000000001, "loss": 0.0829, "macro_f1": 0.32098764181137085, "num_tokens": 624108.0, "repeat_count": 0.0, "routers_loss": 0.06061873584985733, "skip_count": 2.0, "step": 388, "text_loss": 0.11481904983520508 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8312298209568536, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2099609375, "learning_rate": 0.000778, "loss": 0.0791, "macro_f1": 0.3006536364555359, "num_tokens": 626895.0, "repeat_count": 1.0, "routers_loss": 0.2921771705150604, "skip_count": 4.0, "step": 390, "text_loss": 0.3069624602794647 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8406222483122983, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30859375, "learning_rate": 0.000782, "loss": 0.0605, "macro_f1": 0.3076923191547394, "num_tokens": 630204.0, "repeat_count": 0.0, "routers_loss": 0.202707901597023, "skip_count": 4.0, "step": 392, "text_loss": 0.6022785305976868 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.850014675667743, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.29296875, "learning_rate": 0.000786, "loss": 0.0877, "macro_f1": 0.3333333432674408, "num_tokens": 634373.0, "repeat_count": 0.0, "routers_loss": 0.0221510399132967, "skip_count": 0.0, "step": 394, "text_loss": 0.26787394285202026 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8594071030231876, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.37890625, "learning_rate": 0.00079, "loss": 0.0805, "macro_f1": 0.32098764181137085, "num_tokens": 637442.0, "repeat_count": 2.0, "routers_loss": 0.12636390328407288, "skip_count": 0.0, "step": 396, "text_loss": 0.2799781560897827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8687995303786322, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2080078125, "learning_rate": 0.0007940000000000001, "loss": 0.0724, "macro_f1": 0.32098764181137085, "num_tokens": 641231.0, "repeat_count": 0.0, "routers_loss": 0.07933453470468521, "skip_count": 2.0, "step": 398, "text_loss": 0.2507784366607666 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8781919577340769, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.0007980000000000001, "loss": 0.0909, "macro_f1": 0.3272727429866791, "num_tokens": 644560.0, "repeat_count": 1.0, "routers_loss": 0.10324911028146744, "skip_count": 0.0, "step": 400, "text_loss": 0.7756280303001404 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8875843850895215, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0008020000000000001, "loss": 0.0783, "macro_f1": 0.3144654333591461, "num_tokens": 647393.0, "repeat_count": 1.0, "routers_loss": 0.18546262383460999, "skip_count": 2.0, "step": 402, "text_loss": 0.5013328194618225 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8969768124449664, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0008060000000000001, "loss": 0.0787, "macro_f1": 0.2857142984867096, "num_tokens": 650355.0, "repeat_count": 3.0, "routers_loss": 0.3280293643474579, "skip_count": 4.0, "step": 404, "text_loss": 0.2842077314853668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9063692398004108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.0008100000000000001, "loss": 0.0901, "macro_f1": 0.3333333432674408, "num_tokens": 654280.0, "repeat_count": 0.0, "routers_loss": 0.02623247355222702, "skip_count": 0.0, "step": 406, "text_loss": 0.46742817759513855 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9157616671558557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.0008139999999999999, "loss": 0.0945, "macro_f1": 0.3333333432674408, "num_tokens": 657568.0, "repeat_count": 0.0, "routers_loss": 0.009744114242494106, "skip_count": 0.0, "step": 408, "text_loss": 0.7168047428131104 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9251540945113002, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.0008179999999999999, "loss": 0.1065, "macro_f1": 0.32098764181137085, "num_tokens": 660593.0, "repeat_count": 0.0, "routers_loss": 0.07591600716114044, "skip_count": 2.0, "step": 410, "text_loss": 0.449823260307312 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0008219999999999999, "loss": 0.0795, "macro_f1": 0.3333333432674408, "num_tokens": 663916.0, "repeat_count": 0.0, "routers_loss": 0.02076602540910244, "skip_count": 0.0, "step": 412, "text_loss": 0.4764713943004608 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9439389492221895, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.000826, "loss": 0.0836, "macro_f1": 0.3272727429866791, "num_tokens": 667502.0, "repeat_count": 0.0, "routers_loss": 0.049170155078172684, "skip_count": 1.0, "step": 414, "text_loss": 0.30333325266838074 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9533313765776343, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1513671875, "learning_rate": 0.00083, "loss": 0.1021, "macro_f1": 0.3272727429866791, "num_tokens": 670510.0, "repeat_count": 1.0, "routers_loss": 0.15554003417491913, "skip_count": 0.0, "step": 416, "text_loss": 0.3691870868206024 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.962723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000834, "loss": 0.1013, "macro_f1": 0.3333333432674408, "num_tokens": 674761.0, "repeat_count": 0.0, "routers_loss": 0.024516675621271133, "skip_count": 0.0, "step": 418, "text_loss": 0.32850381731987 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9721162312885236, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.000838, "loss": 0.0649, "macro_f1": 0.3333333432674408, "num_tokens": 678055.0, "repeat_count": 0.0, "routers_loss": 0.011026890948414803, "skip_count": 0.0, "step": 420, "text_loss": 0.6637290716171265 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9815086586439683, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000842, "loss": 0.0771, "macro_f1": 0.3272727429866791, "num_tokens": 680979.0, "repeat_count": 0.0, "routers_loss": 0.07451887428760529, "skip_count": 1.0, "step": 422, "text_loss": 0.27131685614585876 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.990901085999413, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1318359375, "learning_rate": 0.000846, "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 684144.0, "repeat_count": 1.0, "routers_loss": 0.11341800540685654, "skip_count": 1.0, "step": 424, "text_loss": 0.652126669883728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.00085, "loss": 0.0754, "macro_f1": 0.3272727429866791, "num_tokens": 687004.0, "repeat_count": 1.0, "routers_loss": 0.08985847979784012, "skip_count": 0.0, "step": 426, "text_loss": 0.2589428424835205 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.009392427355445, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23828125, "learning_rate": 0.000854, "loss": 0.0866, "macro_f1": 0.3333333432674408, "num_tokens": 689702.0, "repeat_count": 0.0, "routers_loss": 0.011355436407029629, "skip_count": 0.0, "step": 428, "text_loss": 0.8909716010093689 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0187848547108893, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.000858, "loss": 0.0623, "macro_f1": 0.3333333432674408, "num_tokens": 692698.0, "repeat_count": 0.0, "routers_loss": 0.013788948766887188, "skip_count": 0.0, "step": 430, "text_loss": 0.19141142070293427 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.028177282066334, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.000862, "loss": 0.0499, "macro_f1": 0.32098764181137085, "num_tokens": 696007.0, "repeat_count": 0.0, "routers_loss": 0.07998392730951309, "skip_count": 2.0, "step": 432, "text_loss": 0.1611809879541397 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0375697094217786, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.000866, "loss": 0.0541, "macro_f1": 0.32098764181137085, "num_tokens": 700271.0, "repeat_count": 0.0, "routers_loss": 0.06988382339477539, "skip_count": 2.0, "step": 434, "text_loss": 0.37254223227500916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0469621367772235, "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.00087, "loss": 0.0834, "macro_f1": 0.2777777910232544, "num_tokens": 703519.0, "repeat_count": 3.0, "routers_loss": 0.28240787982940674, "skip_count": 5.0, "step": 436, "text_loss": 0.29636648297309875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.056354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.423828125, "learning_rate": 0.000874, "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 706826.0, "repeat_count": 0.0, "routers_loss": 0.013924967497587204, "skip_count": 0.0, "step": 438, "text_loss": 0.20867908000946045 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.065746991488113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.000878, "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 710530.0, "repeat_count": 0.0, "routers_loss": 0.01170142088085413, "skip_count": 0.0, "step": 440, "text_loss": 0.7273373007774353 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0751394188435572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.000882, "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 713503.0, "repeat_count": 0.0, "routers_loss": 0.011930872686207294, "skip_count": 0.0, "step": 442, "text_loss": 0.39314430952072144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.084531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.0008860000000000001, "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 716582.0, "repeat_count": 0.0, "routers_loss": 0.008630385622382164, "skip_count": 0.0, "step": 444, "text_loss": 0.5925271511077881 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.0939242735544465, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23046875, "learning_rate": 0.0008900000000000001, "loss": 0.0811, "macro_f1": 0.3006536066532135, "num_tokens": 719941.0, "repeat_count": 3.0, "routers_loss": 0.3015584945678711, "skip_count": 1.0, "step": 446, "text_loss": 0.5059905052185059 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.1033167009098914, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.203125, "learning_rate": 0.000894, "loss": 0.0822, "macro_f1": 0.31446540355682373, "num_tokens": 723113.0, "repeat_count": 1.0, "routers_loss": 0.10897493362426758, "skip_count": 1.0, "step": 448, "text_loss": 0.19616436958312988 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.112709128265336, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.33984375, "learning_rate": 0.000898, "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 726193.0, "repeat_count": 0.0, "routers_loss": 0.07236456125974655, "skip_count": 2.0, "step": 450, "text_loss": 0.1773054152727127 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1221015556207807, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3203125, "learning_rate": 0.000902, "loss": 0.058, "macro_f1": 0.3272727429866791, "num_tokens": 729275.0, "repeat_count": 1.0, "routers_loss": 0.08184371143579483, "skip_count": 0.0, "step": 452, "text_loss": 0.4927310049533844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1314939829762256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.000906, "loss": 0.0607, "macro_f1": 0.3333333432674408, "num_tokens": 731948.0, "repeat_count": 0.0, "routers_loss": 0.014033539220690727, "skip_count": 0.0, "step": 454, "text_loss": 0.4745742678642273 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.00091, "loss": 0.0651, "macro_f1": 0.3333333432674408, "num_tokens": 735351.0, "repeat_count": 0.0, "routers_loss": 0.0071774693205952644, "skip_count": 0.0, "step": 456, "text_loss": 0.18523462116718292 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.150278837687115, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.400390625, "learning_rate": 0.0009140000000000001, "loss": 0.0738, "macro_f1": 0.5492662787437439, "num_tokens": 738587.0, "repeat_count": 0.0, "routers_loss": 0.07781517505645752, "skip_count": 2.0, "step": 458, "text_loss": 0.3459635376930237 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.1596712650425594, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28125, "learning_rate": 0.0009180000000000001, "loss": 0.0723, "macro_f1": 0.3076923191547394, "num_tokens": 741779.0, "repeat_count": 0.0, "routers_loss": 0.09529037028551102, "skip_count": 2.0, "step": 460, "text_loss": 0.20197433233261108 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1690636923980042, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1865234375, "learning_rate": 0.0009220000000000001, "loss": 0.0519, "macro_f1": 0.3333333432674408, "num_tokens": 745355.0, "repeat_count": 0.0, "routers_loss": 0.009765669703483582, "skip_count": 0.0, "step": 462, "text_loss": 0.7031404376029968 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1784561197534487, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0009260000000000001, "loss": 0.0527, "macro_f1": 0.3272727429866791, "num_tokens": 748628.0, "repeat_count": 0.0, "routers_loss": 0.03344850242137909, "skip_count": 1.0, "step": 464, "text_loss": 0.21274663507938385 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1878485471088935, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.00093, "loss": 0.0534, "macro_f1": 0.3076923191547394, "num_tokens": 751472.0, "repeat_count": 2.0, "routers_loss": 0.1354292333126068, "skip_count": 2.0, "step": 466, "text_loss": 0.5350717306137085 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.197240974464338, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.000934, "loss": 0.0598, "macro_f1": 0.3272727429866791, "num_tokens": 754479.0, "repeat_count": 0.0, "routers_loss": 0.056420840322971344, "skip_count": 1.0, "step": 468, "text_loss": 0.28153330087661743 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.206633401819783, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.234375, "learning_rate": 0.0009379999999999999, "loss": 0.0597, "macro_f1": 0.31446540355682373, "num_tokens": 757872.0, "repeat_count": 1.0, "routers_loss": 0.1622387170791626, "skip_count": 1.0, "step": 470, "text_loss": 0.22956843674182892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2160258291752273, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5, "learning_rate": 0.000942, "loss": 0.0953, "macro_f1": 0.32098764181137085, "num_tokens": 760468.0, "repeat_count": 0.0, "routers_loss": 0.05146972835063934, "skip_count": 2.0, "step": 472, "text_loss": 0.4513966739177704 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.225418256530672, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.000946, "loss": 0.0592, "macro_f1": 0.3272727429866791, "num_tokens": 763519.0, "repeat_count": 1.0, "routers_loss": 0.09022669494152069, "skip_count": 0.0, "step": 474, "text_loss": 0.25758957862854004 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.234810683886117, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.00095, "loss": 0.0498, "macro_f1": 0.3272727429866791, "num_tokens": 767391.0, "repeat_count": 0.0, "routers_loss": 0.03044828027486801, "skip_count": 1.0, "step": 476, "text_loss": 0.21366681158542633 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2442031112415615, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.291015625, "learning_rate": 0.000954, "loss": 0.0802, "macro_f1": 0.3272727429866791, "num_tokens": 770338.0, "repeat_count": 0.0, "routers_loss": 0.10397060960531235, "skip_count": 1.0, "step": 478, "text_loss": 1.0396177768707275 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.2535955385970063, "f1_execute": 0.8571429252624512, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000958, "loss": 0.1099, "macro_f1": 0.285714328289032, "num_tokens": 773699.0, "repeat_count": 2.0, "routers_loss": 0.22604143619537354, "skip_count": 4.0, "step": 480, "text_loss": 0.2570283114910126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.2629879659524508, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.000962, "loss": 0.0667, "macro_f1": 0.32098767161369324, "num_tokens": 777473.0, "repeat_count": 0.0, "routers_loss": 0.048258859664201736, "skip_count": 1.0, "step": 482, "text_loss": 0.2540103495121002 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2723803933078957, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.197265625, "learning_rate": 0.000966, "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 780833.0, "repeat_count": 0.0, "routers_loss": 0.023018671199679375, "skip_count": 0.0, "step": 484, "text_loss": 0.38524550199508667 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.28177282066334, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.314453125, "learning_rate": 0.0009699999999999999, "loss": 0.0709, "macro_f1": 0.3272727429866791, "num_tokens": 783656.0, "repeat_count": 0.0, "routers_loss": 0.044845327734947205, "skip_count": 1.0, "step": 486, "text_loss": 0.5859048366546631 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000974, "loss": 0.0615, "macro_f1": 0.3333333432674408, "num_tokens": 787173.0, "repeat_count": 0.0, "routers_loss": 0.010898692533373833, "skip_count": 0.0, "step": 488, "text_loss": 0.3456067442893982 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3005576753742294, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000978, "loss": 0.0796, "macro_f1": 0.32098764181137085, "num_tokens": 790395.0, "repeat_count": 0.0, "routers_loss": 0.06497956812381744, "skip_count": 2.0, "step": 490, "text_loss": 0.3751123249530792 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3099501027296743, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.000982, "loss": 0.0772, "macro_f1": 0.3272727429866791, "num_tokens": 793137.0, "repeat_count": 0.0, "routers_loss": 0.07763728499412537, "skip_count": 1.0, "step": 492, "text_loss": 0.43296709656715393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3193425300851187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009860000000000001, "loss": 0.0819, "macro_f1": 0.3333333432674408, "num_tokens": 796497.0, "repeat_count": 0.0, "routers_loss": 0.02127906307578087, "skip_count": 0.0, "step": 494, "text_loss": 0.4841311275959015 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3287349574405636, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.00099, "loss": 0.073, "macro_f1": 0.3272727429866791, "num_tokens": 799361.0, "repeat_count": 1.0, "routers_loss": 0.09518691152334213, "skip_count": 0.0, "step": 496, "text_loss": 0.5094487071037292 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.3381273847960085, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.130859375, "learning_rate": 0.000994, "loss": 0.0789, "macro_f1": 0.5492662787437439, "num_tokens": 802629.0, "repeat_count": 0.0, "routers_loss": 0.0563947930932045, "skip_count": 2.0, "step": 498, "text_loss": 0.42783617973327637 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.347519812151453, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1865234375, "learning_rate": 0.000998, "loss": 0.0476, "macro_f1": 0.3272727429866791, "num_tokens": 805881.0, "repeat_count": 1.0, "routers_loss": 0.10570426285266876, "skip_count": 0.0, "step": 500, "text_loss": 0.28395503759384155 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.3569122395068973, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2275390625, "learning_rate": 0.0009999999760498814, "loss": 0.0849, "macro_f1": 0.5492662787437439, "num_tokens": 809283.0, "repeat_count": 0.0, "routers_loss": 0.031202208250761032, "skip_count": 2.0, "step": 502, "text_loss": 0.32970911264419556 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.366304666862342, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0009999997844489475, "loss": 0.0574, "macro_f1": 0.3272727429866791, "num_tokens": 812440.0, "repeat_count": 0.0, "routers_loss": 0.07647835463285446, "skip_count": 1.0, "step": 504, "text_loss": 0.4901447296142578 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.375697094217787, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25, "learning_rate": 0.000999999401247153, "loss": 0.0668, "macro_f1": 0.32098764181137085, "num_tokens": 815716.0, "repeat_count": 0.0, "routers_loss": 0.08515176922082901, "skip_count": 2.0, "step": 506, "text_loss": 0.6157599687576294 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3850895215732315, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25390625, "learning_rate": 0.0009999988264446445, "loss": 0.0686, "macro_f1": 0.3333333432674408, "num_tokens": 819086.0, "repeat_count": 0.0, "routers_loss": 0.00946938619017601, "skip_count": 0.0, "step": 508, "text_loss": 0.5053519010543823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3944819489286764, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1640625, "learning_rate": 0.0009999980600416424, "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 822268.0, "repeat_count": 0.0, "routers_loss": 0.01058756373822689, "skip_count": 0.0, "step": 510, "text_loss": 0.5570021867752075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.403874376284121, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.000999997102038441, "loss": 0.0678, "macro_f1": 0.3333333432674408, "num_tokens": 825728.0, "repeat_count": 0.0, "routers_loss": 0.008705209009349346, "skip_count": 0.0, "step": 512, "text_loss": 0.6519040465354919 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.4132668036395657, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.220703125, "learning_rate": 0.0009999959524354064, "loss": 0.083, "macro_f1": 0.3272727429866791, "num_tokens": 829459.0, "repeat_count": 0.0, "routers_loss": 0.04024193435907364, "skip_count": 1.0, "step": 514, "text_loss": 0.5290043950080872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25390625, "learning_rate": 0.00099999461123298, "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 832291.0, "repeat_count": 0.0, "routers_loss": 0.015742862597107887, "skip_count": 0.0, "step": 516, "text_loss": 0.7910057902336121 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.432051658350455, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.000999993078431675, "loss": 0.0759, "macro_f1": 0.3076923191547394, "num_tokens": 835399.0, "repeat_count": 1.0, "routers_loss": 0.16753782331943512, "skip_count": 3.0, "step": 518, "text_loss": 0.45196083188056946 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.4414440857058994, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.236328125, "learning_rate": 0.0009999913540320792, "loss": 0.0968, "macro_f1": 0.31446540355682373, "num_tokens": 838993.0, "repeat_count": 0.0, "routers_loss": 0.09357143193483353, "skip_count": 2.0, "step": 520, "text_loss": 0.5499435663223267 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.4508365130613443, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2451171875, "learning_rate": 0.0009999894380348536, "loss": 0.0821, "macro_f1": 0.5492662787437439, "num_tokens": 842652.0, "repeat_count": 0.0, "routers_loss": 0.056803856045007706, "skip_count": 2.0, "step": 522, "text_loss": 0.197520449757576 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 2.4602289404167887, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.2333984375, "learning_rate": 0.000999987330440732, "loss": 0.0725, "macro_f1": 0.4871794879436493, "num_tokens": 847061.0, "repeat_count": 0.0, "routers_loss": 0.08962195366621017, "skip_count": 3.0, "step": 524, "text_loss": 0.27509039640426636 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.4696213677722336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000999985031250522, "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 850780.0, "repeat_count": 0.0, "routers_loss": 0.022930558770895004, "skip_count": 0.0, "step": 526, "text_loss": 0.13291706144809723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.4790137951276785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.197265625, "learning_rate": 0.0009999825404651053, "loss": 0.0614, "macro_f1": 0.3333333432674408, "num_tokens": 853886.0, "repeat_count": 0.0, "routers_loss": 0.017097990959882736, "skip_count": 0.0, "step": 528, "text_loss": 0.21706295013427734 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.0009999798580854356, "loss": 0.0724, "macro_f1": 0.3333333432674408, "num_tokens": 857364.0, "repeat_count": 0.0, "routers_loss": 0.02831801027059555, "skip_count": 0.0, "step": 530, "text_loss": 0.9035662412643433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.497798649838568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.000999976984112541, "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 860661.0, "repeat_count": 0.0, "routers_loss": 0.019671892747282982, "skip_count": 0.0, "step": 532, "text_loss": 0.8354863524436951 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 2.5071910771940122, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.2890625, "learning_rate": 0.0009999739185475231, "loss": 0.0963, "macro_f1": 0.47333335876464844, "num_tokens": 864124.0, "repeat_count": 2.0, "routers_loss": 0.21383361518383026, "skip_count": 3.0, "step": 534, "text_loss": 0.23422949016094208 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.516583504549457, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.0009999706613915565, "loss": 0.0598, "macro_f1": 0.32098767161369324, "num_tokens": 866976.0, "repeat_count": 0.0, "routers_loss": 0.07158871740102768, "skip_count": 1.0, "step": 536, "text_loss": 0.11800774186849594 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.5259759319049016, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26953125, "learning_rate": 0.0009999672126458894, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 870549.0, "repeat_count": 0.0, "routers_loss": 0.08185924589633942, "skip_count": 1.0, "step": 538, "text_loss": 0.19232480227947235 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.5353683592603464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.000999963572311843, "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 873733.0, "repeat_count": 0.0, "routers_loss": 0.01633382774889469, "skip_count": 0.0, "step": 540, "text_loss": 0.3725031912326813 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.544760786615791, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0009999597403908128, "loss": 0.0761, "macro_f1": 0.3272727429866791, "num_tokens": 877099.0, "repeat_count": 0.0, "routers_loss": 0.0782657191157341, "skip_count": 1.0, "step": 542, "text_loss": 0.17589199542999268 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.5541532139712357, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2177734375, "learning_rate": 0.0009999557168842669, "loss": 0.0716, "macro_f1": 0.5492662787437439, "num_tokens": 879883.0, "repeat_count": 0.0, "routers_loss": 0.05275818333029747, "skip_count": 2.0, "step": 544, "text_loss": 0.26448264718055725 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.56354564132668, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.0009999515017937468, "loss": 0.071, "macro_f1": 0.32098764181137085, "num_tokens": 882223.0, "repeat_count": 0.0, "routers_loss": 0.09335892647504807, "skip_count": 2.0, "step": 546, "text_loss": 0.208544060587883 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.572938068682125, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.376953125, "learning_rate": 0.0009999470951208684, "loss": 0.0855, "macro_f1": 0.32098764181137085, "num_tokens": 885241.0, "repeat_count": 2.0, "routers_loss": 0.22983254492282867, "skip_count": 0.0, "step": 548, "text_loss": 0.6612338423728943 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.58233049603757, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.00099994249686732, "loss": 0.0786, "macro_f1": 0.3272727429866791, "num_tokens": 887897.0, "repeat_count": 1.0, "routers_loss": 0.12858282029628754, "skip_count": 0.0, "step": 550, "text_loss": 0.4673548936843872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.5917229233930144, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0009999377070348638, "loss": 0.0944, "macro_f1": 0.3333333432674408, "num_tokens": 891224.0, "repeat_count": 0.0, "routers_loss": 0.017421770840883255, "skip_count": 0.0, "step": 552, "text_loss": 0.6419258117675781 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.601115350748459, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000999932725625335, "loss": 0.0791, "macro_f1": 0.32098764181137085, "num_tokens": 894578.0, "repeat_count": 0.0, "routers_loss": 0.07890026271343231, "skip_count": 2.0, "step": 554, "text_loss": 0.5970752239227295 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.6105077781039037, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.0009999275526406427, "loss": 0.0796, "macro_f1": 0.31446540355682373, "num_tokens": 897145.0, "repeat_count": 1.0, "routers_loss": 0.09836960583925247, "skip_count": 1.0, "step": 556, "text_loss": 0.752425491809845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6199002054593485, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1875, "learning_rate": 0.0009999221880827693, "loss": 0.0882, "macro_f1": 0.3333333432674408, "num_tokens": 900565.0, "repeat_count": 0.0, "routers_loss": 0.017694659531116486, "skip_count": 0.0, "step": 558, "text_loss": 0.195619136095047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.629292632814793, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2021484375, "learning_rate": 0.0009999166319537703, "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 903506.0, "repeat_count": 0.0, "routers_loss": 0.019375264644622803, "skip_count": 0.0, "step": 560, "text_loss": 0.4603337347507477 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 2.638685060170238, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.146484375, "learning_rate": 0.0009999108842557748, "loss": 0.0953, "macro_f1": 0.4871794879436493, "num_tokens": 906380.0, "repeat_count": 0.0, "routers_loss": 0.12013207376003265, "skip_count": 3.0, "step": 562, "text_loss": 0.6279402375221252 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6480774875256823, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.0009999049449909854, "loss": 0.0799, "macro_f1": 0.3272727429866791, "num_tokens": 909116.0, "repeat_count": 0.0, "routers_loss": 0.06441342830657959, "skip_count": 1.0, "step": 564, "text_loss": 0.23741699755191803 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.657469914881127, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0009998988141616781, "loss": 0.064, "macro_f1": 0.32098767161369324, "num_tokens": 912189.0, "repeat_count": 0.0, "routers_loss": 0.08309414982795715, "skip_count": 1.0, "step": 566, "text_loss": 0.27780941128730774 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6668623422365716, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009998924917702023, "loss": 0.0876, "macro_f1": 0.3272727429866791, "num_tokens": 916279.0, "repeat_count": 1.0, "routers_loss": 0.07197169959545135, "skip_count": 0.0, "step": 568, "text_loss": 0.6371755599975586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6762547695920165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2255859375, "learning_rate": 0.0009998859778189806, "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 919490.0, "repeat_count": 0.0, "routers_loss": 0.008022273890674114, "skip_count": 0.0, "step": 570, "text_loss": 0.6028938889503479 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6856471969474613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.000999879272310509, "loss": 0.084, "macro_f1": 0.3333333432674408, "num_tokens": 923694.0, "repeat_count": 0.0, "routers_loss": 0.01634674146771431, "skip_count": 0.0, "step": 572, "text_loss": 0.7177054286003113 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.695039624302906, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.0009998723752473574, "loss": 0.0716, "macro_f1": 0.3272727429866791, "num_tokens": 926933.0, "repeat_count": 0.0, "routers_loss": 0.060559045523405075, "skip_count": 1.0, "step": 574, "text_loss": 0.5203254818916321 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.185546875, "learning_rate": 0.0009998652866321687, "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 929832.0, "repeat_count": 0.0, "routers_loss": 0.011485611088573933, "skip_count": 0.0, "step": 576, "text_loss": 0.6147452592849731 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.713824479013795, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.000999858006467659, "loss": 0.0649, "macro_f1": 0.29333335161209106, "num_tokens": 933266.0, "repeat_count": 2.0, "routers_loss": 0.2929030954837799, "skip_count": 4.0, "step": 578, "text_loss": 0.1720666140317917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.72321690636924, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.24609375, "learning_rate": 0.0009998505347566186, "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 937545.0, "repeat_count": 0.0, "routers_loss": 0.053780000656843185, "skip_count": 2.0, "step": 580, "text_loss": 0.3258405327796936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7326093337246844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.00099984287150191, "loss": 0.0582, "macro_f1": 0.3333333432674408, "num_tokens": 941001.0, "repeat_count": 0.0, "routers_loss": 0.02637636847794056, "skip_count": 0.0, "step": 582, "text_loss": 0.23762771487236023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7420017610801293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009998350167064705, "loss": 0.0672, "macro_f1": 0.3333333432674408, "num_tokens": 943989.0, "repeat_count": 0.0, "routers_loss": 0.01637580618262291, "skip_count": 0.0, "step": 584, "text_loss": 0.7460582852363586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7513941884355737, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0009998269703733096, "loss": 0.0686, "macro_f1": 0.3272727429866791, "num_tokens": 947245.0, "repeat_count": 1.0, "routers_loss": 0.13934117555618286, "skip_count": 0.0, "step": 586, "text_loss": 0.5284690260887146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7607866157910186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.13671875, "learning_rate": 0.0009998187325055106, "loss": 0.0667, "macro_f1": 0.3333333432674408, "num_tokens": 950116.0, "repeat_count": 0.0, "routers_loss": 0.02138397842645645, "skip_count": 0.0, "step": 588, "text_loss": 0.3920256197452545 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0009998103031062305, "loss": 0.0778, "macro_f1": 0.3333333432674408, "num_tokens": 953277.0, "repeat_count": 0.0, "routers_loss": 0.007098200265318155, "skip_count": 0.0, "step": 590, "text_loss": 0.7472905516624451 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.779571470501908, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.318359375, "learning_rate": 0.0009998016821786994, "loss": 0.0872, "macro_f1": 0.32098764181137085, "num_tokens": 958229.0, "repeat_count": 1.0, "routers_loss": 0.07946522533893585, "skip_count": 1.0, "step": 592, "text_loss": 0.5506448745727539 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7889638978573528, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.000999792869726221, "loss": 0.0523, "macro_f1": 0.3272727429866791, "num_tokens": 961016.0, "repeat_count": 0.0, "routers_loss": 0.0850791186094284, "skip_count": 1.0, "step": 594, "text_loss": 0.3824431002140045 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.0009997838657521717, "loss": 0.0632, "macro_f1": 0.3333333432674408, "num_tokens": 963847.0, "repeat_count": 0.0, "routers_loss": 0.016370445489883423, "skip_count": 0.0, "step": 596, "text_loss": 0.2139475792646408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.8077487525682416, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009997746702600026, "loss": 0.0702, "macro_f1": 0.307692289352417, "num_tokens": 966619.0, "repeat_count": 0.0, "routers_loss": 0.1310746818780899, "skip_count": 3.0, "step": 598, "text_loss": 0.3651018440723419 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8171411799236865, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23828125, "learning_rate": 0.0009997652832532372, "loss": 0.0792, "macro_f1": 0.3272727429866791, "num_tokens": 970418.0, "repeat_count": 1.0, "routers_loss": 0.14303378760814667, "skip_count": 0.0, "step": 600, "text_loss": 0.7094736099243164 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8265336072791314, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009997557047354722, "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 973491.0, "repeat_count": 0.0, "routers_loss": 0.03334212675690651, "skip_count": 1.0, "step": 602, "text_loss": 0.4812237024307251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.835926034634576, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.0009997459347103783, "loss": 0.0956, "macro_f1": 0.3272727429866791, "num_tokens": 976672.0, "repeat_count": 0.0, "routers_loss": 0.02831871062517166, "skip_count": 0.0, "step": 604, "text_loss": 0.21737146377563477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8453184619900207, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0009997359731816998, "loss": 0.0646, "macro_f1": 0.3333333432674408, "num_tokens": 979898.0, "repeat_count": 0.0, "routers_loss": 0.017968013882637024, "skip_count": 0.0, "step": 606, "text_loss": 0.5458008050918579 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.854710889345465, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.224609375, "learning_rate": 0.0009997258201532536, "loss": 0.0751, "macro_f1": 0.3333333432674408, "num_tokens": 982811.0, "repeat_count": 0.0, "routers_loss": 0.016256732866168022, "skip_count": 0.0, "step": 608, "text_loss": 0.8643257021903992 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0009997154756289303, "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 985245.0, "repeat_count": 0.0, "routers_loss": 0.021214161068201065, "skip_count": 0.0, "step": 610, "text_loss": 0.2204967886209488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8734957440563544, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.000999704939612694, "loss": 0.0636, "macro_f1": 0.3006536364555359, "num_tokens": 988539.0, "repeat_count": 3.0, "routers_loss": 0.23249399662017822, "skip_count": 2.0, "step": 612, "text_loss": 0.32489025592803955 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8828881714117993, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009996942121085824, "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 991660.0, "repeat_count": 0.0, "routers_loss": 0.010706410743296146, "skip_count": 0.0, "step": 614, "text_loss": 0.4551754891872406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8922805987672437, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3671875, "learning_rate": 0.000999683293120706, "loss": 0.1016, "macro_f1": 0.3333333432674408, "num_tokens": 994828.0, "repeat_count": 0.0, "routers_loss": 0.006676184479147196, "skip_count": 0.0, "step": 616, "text_loss": 0.6212068200111389 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9016730261226886, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.408203125, "learning_rate": 0.0009996721826532491, "loss": 0.0976, "macro_f1": 0.3076923191547394, "num_tokens": 997951.0, "repeat_count": 2.0, "routers_loss": 0.2148125320672989, "skip_count": 2.0, "step": 618, "text_loss": 0.26514527201652527 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.911065453478133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1904296875, "learning_rate": 0.000999660880710469, "loss": 0.0909, "macro_f1": 0.3333333432674408, "num_tokens": 1001139.0, "repeat_count": 0.0, "routers_loss": 0.022332455962896347, "skip_count": 0.0, "step": 620, "text_loss": 0.26131340861320496 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.920457880833578, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009996493872966971, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 1003678.0, "repeat_count": 1.0, "routers_loss": 0.08348730951547623, "skip_count": 0.0, "step": 622, "text_loss": 0.19151706993579865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.0009996377024163374, "loss": 0.0822, "macro_f1": 0.3333333432674408, "num_tokens": 1007082.0, "repeat_count": 0.0, "routers_loss": 0.028577150776982307, "skip_count": 0.0, "step": 624, "text_loss": 0.305387407541275 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9392427355444672, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.0009996258260738676, "loss": 0.0892, "macro_f1": 0.3272727429866791, "num_tokens": 1010064.0, "repeat_count": 1.0, "routers_loss": 0.08312026411294937, "skip_count": 0.0, "step": 626, "text_loss": 0.49436143040657043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9486351628999117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009996137582738388, "loss": 0.0591, "macro_f1": 0.3333333432674408, "num_tokens": 1013462.0, "repeat_count": 0.0, "routers_loss": 0.013337327167391777, "skip_count": 0.0, "step": 628, "text_loss": 0.6515294313430786 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9580275902553566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.000999601499020875, "loss": 0.0537, "macro_f1": 0.3333333432674408, "num_tokens": 1016246.0, "repeat_count": 0.0, "routers_loss": 0.029126765206456184, "skip_count": 0.0, "step": 630, "text_loss": 0.18834827840328217 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9674200176108014, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009995890483196746, "loss": 0.0602, "macro_f1": 0.3272727429866791, "num_tokens": 1019286.0, "repeat_count": 0.0, "routers_loss": 0.054844800382852554, "skip_count": 1.0, "step": 632, "text_loss": 0.6988179087638855 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.322265625, "learning_rate": 0.0009995764061750086, "loss": 0.0767, "macro_f1": 0.3333333432674408, "num_tokens": 1022207.0, "repeat_count": 0.0, "routers_loss": 0.010095693171024323, "skip_count": 0.0, "step": 634, "text_loss": 0.558451771736145 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9862048723216907, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.000999563572591721, "loss": 0.0521, "macro_f1": 0.32098764181137085, "num_tokens": 1025319.0, "repeat_count": 1.0, "routers_loss": 0.0698433518409729, "skip_count": 1.0, "step": 636, "text_loss": 0.5961872935295105 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.995597299677135, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11083984375, "learning_rate": 0.0009995505475747302, "loss": 0.0849, "macro_f1": 0.3272727429866791, "num_tokens": 1028362.0, "repeat_count": 0.0, "routers_loss": 0.040211405605077744, "skip_count": 1.0, "step": 638, "text_loss": 0.546863317489624 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.004696213677722, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.119140625, "learning_rate": 0.0009995373311290272, "loss": 0.0709, "macro_f1": 0.3144654333591461, "num_tokens": 1032199.0, "repeat_count": 2.0, "routers_loss": 0.1457643061876297, "skip_count": 1.0, "step": 640, "text_loss": 0.2137298285961151 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0009995239232596764, "loss": 0.0545, "macro_f1": 0.3333333432674408, "num_tokens": 1035801.0, "repeat_count": 0.0, "routers_loss": 0.011394930072128773, "skip_count": 0.0, "step": 642, "text_loss": 0.43054503202438354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0234810683886115, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0009995103239718163, "loss": 0.0665, "macro_f1": 0.3333333432674408, "num_tokens": 1039223.0, "repeat_count": 0.0, "routers_loss": 0.00997432041913271, "skip_count": 0.0, "step": 644, "text_loss": 0.7749615907669067 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0328734957440564, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0009994965332706573, "loss": 0.0755, "macro_f1": 0.3144654333591461, "num_tokens": 1042154.0, "repeat_count": 3.0, "routers_loss": 0.10589150339365005, "skip_count": 0.0, "step": 646, "text_loss": 0.7812211513519287 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.042265923099501, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.0009994825511614846, "loss": 0.0383, "macro_f1": 0.3272727429866791, "num_tokens": 1045250.0, "repeat_count": 0.0, "routers_loss": 0.0748734176158905, "skip_count": 1.0, "step": 648, "text_loss": 0.844803512096405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0516583504549457, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1220703125, "learning_rate": 0.0009994683776496562, "loss": 0.0433, "macro_f1": 0.3272727429866791, "num_tokens": 1048446.0, "repeat_count": 0.0, "routers_loss": 0.03742415830492973, "skip_count": 1.0, "step": 650, "text_loss": 0.2098839282989502 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0610507778103906, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009994540127406034, "loss": 0.0591, "macro_f1": 0.32098764181137085, "num_tokens": 1051840.0, "repeat_count": 0.0, "routers_loss": 0.06025516986846924, "skip_count": 2.0, "step": 652, "text_loss": 0.27727583050727844 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.070443205165835, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.181640625, "learning_rate": 0.0009994394564398306, "loss": 0.0519, "macro_f1": 0.521541953086853, "num_tokens": 1055142.0, "repeat_count": 4.0, "routers_loss": 0.22807340323925018, "skip_count": 2.0, "step": 654, "text_loss": 0.9672397971153259 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.0009994247087529158, "loss": 0.0618, "macro_f1": 0.3333333432674408, "num_tokens": 1057698.0, "repeat_count": 0.0, "routers_loss": 0.01348950993269682, "skip_count": 0.0, "step": 656, "text_loss": 0.6375506520271301 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.0009994097696855106, "loss": 0.0412, "macro_f1": 0.3333333432674408, "num_tokens": 1060624.0, "repeat_count": 0.0, "routers_loss": 0.009649243205785751, "skip_count": 0.0, "step": 658, "text_loss": 0.5315385460853577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.098620487232169, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2041015625, "learning_rate": 0.0009993946392433395, "loss": 0.0609, "macro_f1": 0.307692289352417, "num_tokens": 1065076.0, "repeat_count": 0.0, "routers_loss": 0.1250980943441391, "skip_count": 3.0, "step": 660, "text_loss": 0.25780341029167175 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1080129145876136, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1640625, "learning_rate": 0.0009993793174322006, "loss": 0.0471, "macro_f1": 0.3333333432674408, "num_tokens": 1068365.0, "repeat_count": 0.0, "routers_loss": 0.011544390581548214, "skip_count": 0.0, "step": 662, "text_loss": 0.34876301884651184 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1174053419430585, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009993638042579654, "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1071693.0, "repeat_count": 0.0, "routers_loss": 0.03777370601892471, "skip_count": 1.0, "step": 664, "text_loss": 0.21811571717262268 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.126797769298503, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.203125, "learning_rate": 0.0009993480997265783, "loss": 0.0475, "macro_f1": 0.5492662787437439, "num_tokens": 1074733.0, "repeat_count": 0.0, "routers_loss": 0.049949806183576584, "skip_count": 2.0, "step": 666, "text_loss": 0.38410288095474243 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.136190196653948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10302734375, "learning_rate": 0.0009993322038440572, "loss": 0.0605, "macro_f1": 0.3333333432674408, "num_tokens": 1077993.0, "repeat_count": 0.0, "routers_loss": 0.0247171800583601, "skip_count": 0.0, "step": 668, "text_loss": 0.25576895475387573 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1455826240093923, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.000999316116616494, "loss": 0.0619, "macro_f1": 0.3333333432674408, "num_tokens": 1080491.0, "repeat_count": 0.0, "routers_loss": 0.008118715137243271, "skip_count": 0.0, "step": 670, "text_loss": 0.6269792914390564 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.154975051364837, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.0009992998380500527, "loss": 0.0462, "macro_f1": 0.3272727429866791, "num_tokens": 1083817.0, "repeat_count": 0.0, "routers_loss": 0.03366057574748993, "skip_count": 1.0, "step": 672, "text_loss": 0.26891493797302246 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1643674787202816, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009992833681509716, "loss": 0.0529, "macro_f1": 0.3333333432674408, "num_tokens": 1087368.0, "repeat_count": 0.0, "routers_loss": 0.020552074536681175, "skip_count": 0.0, "step": 674, "text_loss": 0.14421936869621277 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.1737599060757264, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.18359375, "learning_rate": 0.0009992667069255619, "loss": 0.0696, "macro_f1": 0.31446540355682373, "num_tokens": 1090452.0, "repeat_count": 0.0, "routers_loss": 0.06937336176633835, "skip_count": 2.0, "step": 676, "text_loss": 0.24999259412288666 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1831523334311713, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.0009992498543802085, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 1093996.0, "repeat_count": 1.0, "routers_loss": 0.0380021296441555, "skip_count": 0.0, "step": 678, "text_loss": 0.42473849654197693 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 3.1925447607866158, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.2119140625, "learning_rate": 0.0009992328105213688, "loss": 0.0411, "macro_f1": 0.4400000274181366, "num_tokens": 1096837.0, "repeat_count": 1.0, "routers_loss": 0.20885063707828522, "skip_count": 4.0, "step": 680, "text_loss": 0.3829527199268341 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.2019371881420606, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1474609375, "learning_rate": 0.0009992155753555747, "loss": 0.0722, "macro_f1": 0.5492662787437439, "num_tokens": 1100320.0, "repeat_count": 0.0, "routers_loss": 0.018230699002742767, "skip_count": 2.0, "step": 682, "text_loss": 0.6190969944000244 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.211329615497505, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30859375, "learning_rate": 0.0009991981488894303, "loss": 0.0681, "macro_f1": 0.32098767161369324, "num_tokens": 1103682.0, "repeat_count": 0.0, "routers_loss": 0.05550144240260124, "skip_count": 1.0, "step": 684, "text_loss": 0.44418027997016907 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.22072204285295, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.0009991805311296133, "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1106427.0, "repeat_count": 0.0, "routers_loss": 0.07990608364343643, "skip_count": 2.0, "step": 686, "text_loss": 0.5577231645584106 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2301144702083944, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009991627220828753, "loss": 0.0568, "macro_f1": 0.32098764181137085, "num_tokens": 1109314.0, "repeat_count": 0.0, "routers_loss": 0.05167485028505325, "skip_count": 2.0, "step": 688, "text_loss": 0.27325430512428284 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.2395068975638392, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10693359375, "learning_rate": 0.0009991447217560408, "loss": 0.0521, "macro_f1": 0.5492662787437439, "num_tokens": 1112748.0, "repeat_count": 0.0, "routers_loss": 0.04621964320540428, "skip_count": 2.0, "step": 690, "text_loss": 0.5288321375846863 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.2488993249192837, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.000999126530156007, "loss": 0.0499, "macro_f1": 0.307692289352417, "num_tokens": 1116965.0, "repeat_count": 1.0, "routers_loss": 0.11950276792049408, "skip_count": 2.0, "step": 692, "text_loss": 0.14215624332427979 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2582917522747286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.0009991081472897454, "loss": 0.0722, "macro_f1": 0.3333333432674408, "num_tokens": 1120570.0, "repeat_count": 0.0, "routers_loss": 0.01905500330030918, "skip_count": 0.0, "step": 694, "text_loss": 0.41862696409225464 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.267684179630173, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.0009990895731643002, "loss": 0.0464, "macro_f1": 0.3272727429866791, "num_tokens": 1124009.0, "repeat_count": 1.0, "routers_loss": 0.06974572688341141, "skip_count": 0.0, "step": 696, "text_loss": 0.41160130500793457 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.277076606985618, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.000999070807786789, "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 1127370.0, "repeat_count": 1.0, "routers_loss": 0.07055293023586273, "skip_count": 0.0, "step": 698, "text_loss": 0.48068273067474365 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2864690343410627, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.197265625, "learning_rate": 0.000999051851164403, "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1130234.0, "repeat_count": 1.0, "routers_loss": 0.12506946921348572, "skip_count": 1.0, "step": 700, "text_loss": 0.47925490140914917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.000999032703304406, "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 1132874.0, "repeat_count": 0.0, "routers_loss": 0.00809287466108799, "skip_count": 0.0, "step": 702, "text_loss": 0.47433632612228394 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.305253889051952, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1064453125, "learning_rate": 0.0009990133642141358, "loss": 0.0497, "macro_f1": 0.5492662787437439, "num_tokens": 1136011.0, "repeat_count": 0.0, "routers_loss": 0.0319170281291008, "skip_count": 2.0, "step": 704, "text_loss": 0.6574832201004028 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3146463164073965, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.33984375, "learning_rate": 0.000998993833901003, "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1139674.0, "repeat_count": 0.0, "routers_loss": 0.09850362688302994, "skip_count": 2.0, "step": 706, "text_loss": 0.7660127282142639 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3240387437628414, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12158203125, "learning_rate": 0.0009989741123724919, "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 1143558.0, "repeat_count": 0.0, "routers_loss": 0.006673311349004507, "skip_count": 0.0, "step": 708, "text_loss": 0.5976111888885498 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.333431171118286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009989541996361594, "loss": 0.045, "macro_f1": 0.3333333432674408, "num_tokens": 1146122.0, "repeat_count": 0.0, "routers_loss": 0.004988791421055794, "skip_count": 0.0, "step": 710, "text_loss": 0.5256119966506958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3428235984737307, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1044921875, "learning_rate": 0.0009989340956996367, "loss": 0.0528, "macro_f1": 0.3333333432674408, "num_tokens": 1149546.0, "repeat_count": 0.0, "routers_loss": 0.0067769973538815975, "skip_count": 0.0, "step": 712, "text_loss": 0.5040497779846191 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.352216025829175, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26953125, "learning_rate": 0.0009989138005706273, "loss": 0.0735, "macro_f1": 0.32098764181137085, "num_tokens": 1153195.0, "repeat_count": 0.0, "routers_loss": 0.09899546951055527, "skip_count": 2.0, "step": 714, "text_loss": 0.20803412795066833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.000998893314256908, "loss": 0.064, "macro_f1": 0.3333333432674408, "num_tokens": 1157081.0, "repeat_count": 0.0, "routers_loss": 0.010492355562746525, "skip_count": 0.0, "step": 716, "text_loss": 0.23077639937400818 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3710008805400644, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0009988726367663298, "loss": 0.0539, "macro_f1": 0.3333333432674408, "num_tokens": 1160079.0, "repeat_count": 0.0, "routers_loss": 0.01063773687928915, "skip_count": 0.0, "step": 718, "text_loss": 0.6085864901542664 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3803933078955093, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1640625, "learning_rate": 0.0009988517681068163, "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1163249.0, "repeat_count": 1.0, "routers_loss": 0.05981874838471413, "skip_count": 0.0, "step": 720, "text_loss": 0.4047050476074219 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3897857352509537, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.0009988307082863638, "loss": 0.0361, "macro_f1": 0.3333333432674408, "num_tokens": 1166259.0, "repeat_count": 0.0, "routers_loss": 0.009750043973326683, "skip_count": 0.0, "step": 722, "text_loss": 0.5306474566459656 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.3991781626063986, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.240234375, "learning_rate": 0.0009988094573130434, "loss": 0.063, "macro_f1": 0.5359477400779724, "num_tokens": 1168887.0, "repeat_count": 2.0, "routers_loss": 0.18601104617118835, "skip_count": 2.0, "step": 724, "text_loss": 0.53528892993927 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.408570589961843, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009987880151949974, "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1172625.0, "repeat_count": 0.0, "routers_loss": 0.02845010720193386, "skip_count": 1.0, "step": 726, "text_loss": 0.4760453701019287 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.417963017317288, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2177734375, "learning_rate": 0.0009987663819404434, "loss": 0.06, "macro_f1": 0.5492662787437439, "num_tokens": 1176580.0, "repeat_count": 0.0, "routers_loss": 0.017596980556845665, "skip_count": 2.0, "step": 728, "text_loss": 0.5146099328994751 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.427355444672733, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1318359375, "learning_rate": 0.000998744557557671, "loss": 0.0484, "macro_f1": 0.3272727429866791, "num_tokens": 1179804.0, "repeat_count": 0.0, "routers_loss": 0.0625474750995636, "skip_count": 1.0, "step": 730, "text_loss": 0.27738022804260254 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.436747872028177, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.203125, "learning_rate": 0.0009987225420550433, "loss": 0.0796, "macro_f1": 0.307692289352417, "num_tokens": 1182658.0, "repeat_count": 1.0, "routers_loss": 0.16188351809978485, "skip_count": 2.0, "step": 732, "text_loss": 0.23231445252895355 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.446140299383622, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2001953125, "learning_rate": 0.0009987003354409965, "loss": 0.0626, "macro_f1": 0.3272727429866791, "num_tokens": 1185451.0, "repeat_count": 0.0, "routers_loss": 0.02391529455780983, "skip_count": 0.0, "step": 734, "text_loss": 0.4496627151966095 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.4555327267390665, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.234375, "learning_rate": 0.0009986779377240405, "loss": 0.0513, "macro_f1": 0.32098767161369324, "num_tokens": 1188666.0, "repeat_count": 0.0, "routers_loss": 0.08435963839292526, "skip_count": 1.0, "step": 736, "text_loss": 0.4950787127017975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.4649251540945114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1220703125, "learning_rate": 0.000998655348912758, "loss": 0.0515, "macro_f1": 0.3333333432674408, "num_tokens": 1193035.0, "repeat_count": 0.0, "routers_loss": 0.01648722216486931, "skip_count": 0.0, "step": 738, "text_loss": 0.24761848151683807 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1513671875, "learning_rate": 0.0009986325690158051, "loss": 0.0435, "macro_f1": 0.3333333432674408, "num_tokens": 1196840.0, "repeat_count": 0.0, "routers_loss": 0.013143910095095634, "skip_count": 0.0, "step": 740, "text_loss": 0.15662719309329987 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.4837100088054007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009986095980419113, "loss": 0.0757, "macro_f1": 0.3333333432674408, "num_tokens": 1200573.0, "repeat_count": 0.0, "routers_loss": 0.026706280186772346, "skip_count": 0.0, "step": 742, "text_loss": 0.16725164651870728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.493102436160845, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1982421875, "learning_rate": 0.0009985864359998787, "loss": 0.0795, "macro_f1": 0.3006536364555359, "num_tokens": 1203589.0, "repeat_count": 2.0, "routers_loss": 0.28607678413391113, "skip_count": 3.0, "step": 744, "text_loss": 0.6350882053375244 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.50249486351629, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009985630828985835, "loss": 0.0572, "macro_f1": 0.3272727429866791, "num_tokens": 1206422.0, "repeat_count": 0.0, "routers_loss": 0.05685260891914368, "skip_count": 1.0, "step": 746, "text_loss": 0.33779552578926086 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.5118872908717345, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.09814453125, "learning_rate": 0.0009985395387469742, "loss": 0.0458, "macro_f1": 0.5492662787437439, "num_tokens": 1211588.0, "repeat_count": 0.0, "routers_loss": 0.0437830351293087, "skip_count": 2.0, "step": 748, "text_loss": 0.28664472699165344 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5212797182271793, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.0009985158035540735, "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 1214580.0, "repeat_count": 2.0, "routers_loss": 0.07074898481369019, "skip_count": 0.0, "step": 750, "text_loss": 0.3939313292503357 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21484375, "learning_rate": 0.0009984918773289762, "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1217388.0, "repeat_count": 0.0, "routers_loss": 0.009757856838405132, "skip_count": 0.0, "step": 752, "text_loss": 0.37641215324401855 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5400645729380686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.0009984677600808512, "loss": 0.054, "macro_f1": 0.3333333432674408, "num_tokens": 1219960.0, "repeat_count": 0.0, "routers_loss": 0.02515069581568241, "skip_count": 0.0, "step": 754, "text_loss": 0.155938982963562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5494570002935135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30078125, "learning_rate": 0.0009984434518189405, "loss": 0.0764, "macro_f1": 0.3333333432674408, "num_tokens": 1223234.0, "repeat_count": 0.0, "routers_loss": 0.025766927748918533, "skip_count": 0.0, "step": 756, "text_loss": 0.691118061542511 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.558849427648958, "f1_execute": 0.9411765336990356, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009984189525525584, "loss": 0.0451, "macro_f1": 0.5359477400779724, "num_tokens": 1225764.0, "repeat_count": 2.0, "routers_loss": 0.1782722771167755, "skip_count": 2.0, "step": 758, "text_loss": 0.3592209219932556 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.568241855004403, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.0009983942622910935, "loss": 0.0659, "macro_f1": 0.3333333432674408, "num_tokens": 1230097.0, "repeat_count": 0.0, "routers_loss": 0.00825568474829197, "skip_count": 0.0, "step": 760, "text_loss": 0.4646475315093994 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5776342823598473, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009983693810440074, "loss": 0.0477, "macro_f1": 0.32098764181137085, "num_tokens": 1233140.0, "repeat_count": 0.0, "routers_loss": 0.04156976938247681, "skip_count": 2.0, "step": 762, "text_loss": 0.298682302236557 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.587026709715292, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3515625, "learning_rate": 0.000998344308820834, "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 1236305.0, "repeat_count": 0.0, "routers_loss": 0.05697929114103317, "skip_count": 1.0, "step": 764, "text_loss": 0.5249121189117432 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5964191370707366, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.18359375, "learning_rate": 0.0009983190456311817, "loss": 0.0592, "macro_f1": 0.3144654333591461, "num_tokens": 1239673.0, "repeat_count": 0.0, "routers_loss": 0.09547408670186996, "skip_count": 3.0, "step": 766, "text_loss": 0.41277334094047546 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.6058115644261814, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.185546875, "learning_rate": 0.000998293591484731, "loss": 0.0484, "macro_f1": 0.5492662787437439, "num_tokens": 1242292.0, "repeat_count": 0.0, "routers_loss": 0.030693158507347107, "skip_count": 2.0, "step": 768, "text_loss": 0.1583656519651413 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.615203991781626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.000998267946391236, "loss": 0.051, "macro_f1": 0.3333333432674408, "num_tokens": 1244661.0, "repeat_count": 0.0, "routers_loss": 0.01211300864815712, "skip_count": 0.0, "step": 770, "text_loss": 0.4629349112510681 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6245964191370708, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.0009982421103605238, "loss": 0.0441, "macro_f1": 0.32098764181137085, "num_tokens": 1248688.0, "repeat_count": 0.0, "routers_loss": 0.0665968507528305, "skip_count": 2.0, "step": 772, "text_loss": 0.4019293785095215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6339888464925156, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.000998216083402495, "loss": 0.0613, "macro_f1": 0.32098764181137085, "num_tokens": 1251395.0, "repeat_count": 0.0, "routers_loss": 0.07186859846115112, "skip_count": 2.0, "step": 774, "text_loss": 0.4659276604652405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.302734375, "learning_rate": 0.0009981898655271235, "loss": 0.0488, "macro_f1": 0.3333333432674408, "num_tokens": 1254888.0, "repeat_count": 0.0, "routers_loss": 0.007823926396667957, "skip_count": 0.0, "step": 776, "text_loss": 0.5160359740257263 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 3.6527737012034045, "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.11962890625, "learning_rate": 0.0009981634567444557, "loss": 0.0775, "macro_f1": 0.590062141418457, "num_tokens": 1258250.0, "repeat_count": 3.0, "routers_loss": 0.24624499678611755, "skip_count": 4.0, "step": 778, "text_loss": 0.29319918155670166 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6621661285588494, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.0009981368570646115, "loss": 0.0885, "macro_f1": 0.3272727429866791, "num_tokens": 1260916.0, "repeat_count": 0.0, "routers_loss": 0.030730176717042923, "skip_count": 1.0, "step": 780, "text_loss": 0.624981164932251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6715585559142943, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009981100664977838, "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1264004.0, "repeat_count": 0.0, "routers_loss": 0.006829176563769579, "skip_count": 0.0, "step": 782, "text_loss": 0.6137266159057617 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6809509832697387, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009980830850542391, "loss": 0.058, "macro_f1": 0.3333333432674408, "num_tokens": 1267130.0, "repeat_count": 0.0, "routers_loss": 0.018471000716090202, "skip_count": 0.0, "step": 784, "text_loss": 0.15213175117969513 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6903434106251836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.0009980559127443166, "loss": 0.052, "macro_f1": 0.3333333432674408, "num_tokens": 1271129.0, "repeat_count": 0.0, "routers_loss": 0.007903140969574451, "skip_count": 0.0, "step": 786, "text_loss": 0.5768613219261169 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.699735837980628, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.000998028549578429, "loss": 0.0719, "macro_f1": 0.307692289352417, "num_tokens": 1274232.0, "repeat_count": 0.0, "routers_loss": 0.06737866252660751, "skip_count": 3.0, "step": 788, "text_loss": 0.2877073585987091 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.709128265336073, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009980009955670615, "loss": 0.0698, "macro_f1": 0.3144654333591461, "num_tokens": 1277193.0, "repeat_count": 0.0, "routers_loss": 0.10194934904575348, "skip_count": 3.0, "step": 790, "text_loss": 0.11860492825508118 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.7185206926915173, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.000997973250720773, "loss": 0.0552, "macro_f1": 0.32098764181137085, "num_tokens": 1280960.0, "repeat_count": 0.0, "routers_loss": 0.10297708213329315, "skip_count": 2.0, "step": 792, "text_loss": 0.13477706909179688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.727913120046962, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009979453150501954, "loss": 0.0663, "macro_f1": 0.32098764181137085, "num_tokens": 1284611.0, "repeat_count": 1.0, "routers_loss": 0.06122037023305893, "skip_count": 1.0, "step": 794, "text_loss": 0.40569379925727844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.737305547402407, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.000997917188566034, "loss": 0.062, "macro_f1": 0.32098764181137085, "num_tokens": 1287834.0, "repeat_count": 0.0, "routers_loss": 0.061135001480579376, "skip_count": 2.0, "step": 796, "text_loss": 0.2829287648200989 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.7466979747578515, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.109375, "learning_rate": 0.0009978888712790664, "loss": 0.0654, "macro_f1": 0.3272727429866791, "num_tokens": 1291666.0, "repeat_count": 0.0, "routers_loss": 0.04841872677206993, "skip_count": 1.0, "step": 798, "text_loss": 1.011757254600525 }, { "acc_repeat": 0.0, "acc_skip": 0.20000000298023224, "avg_layers": 27.0, "epoch": 3.756090402113296, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.3333333134651184, "grad_norm": 0.14453125, "learning_rate": 0.0009978603632001444, "loss": 0.0636, "macro_f1": 0.4104308485984802, "num_tokens": 1294627.0, "repeat_count": 1.0, "routers_loss": 0.15698759257793427, "skip_count": 5.0, "step": 800, "text_loss": 0.4457623362541199 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.765482829468741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0009978316643401916, "loss": 0.0688, "macro_f1": 0.3333333432674408, "num_tokens": 1297711.0, "repeat_count": 0.0, "routers_loss": 0.018952010199427605, "skip_count": 0.0, "step": 802, "text_loss": 0.2069481462240219 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.7748752568241857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.0009978027747102062, "loss": 0.0479, "macro_f1": 0.3333333432674408, "num_tokens": 1300569.0, "repeat_count": 0.0, "routers_loss": 0.014538386836647987, "skip_count": 0.0, "step": 804, "text_loss": 0.4983852505683899 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.78426768417963, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2109375, "learning_rate": 0.0009977736943212584, "loss": 0.0721, "macro_f1": 0.32098764181137085, "num_tokens": 1303969.0, "repeat_count": 0.0, "routers_loss": 0.11164087057113647, "skip_count": 2.0, "step": 806, "text_loss": 0.2910642921924591 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.793660111535075, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.000997744423184492, "loss": 0.0424, "macro_f1": 0.3272727429866791, "num_tokens": 1307263.0, "repeat_count": 0.0, "routers_loss": 0.06073406711220741, "skip_count": 1.0, "step": 808, "text_loss": 0.18831779062747955 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 3.8030525388905194, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.26171875, "learning_rate": 0.0009977149613111236, "loss": 0.0486, "macro_f1": 0.4400000274181366, "num_tokens": 1309953.0, "repeat_count": 1.0, "routers_loss": 0.11035524308681488, "skip_count": 4.0, "step": 810, "text_loss": 0.7872759699821472 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8124449662459643, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.0009976853087124433, "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1313243.0, "repeat_count": 0.0, "routers_loss": 0.021804286167025566, "skip_count": 0.0, "step": 812, "text_loss": 0.22349292039871216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.8218373936014087, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28125, "learning_rate": 0.0009976554653998138, "loss": 0.0612, "macro_f1": 0.31446540355682373, "num_tokens": 1316165.0, "repeat_count": 0.0, "routers_loss": 0.10715524107217789, "skip_count": 2.0, "step": 814, "text_loss": 0.18035532534122467 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8312298209568536, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.000997625431384671, "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1319206.0, "repeat_count": 0.0, "routers_loss": 0.007173649035394192, "skip_count": 0.0, "step": 816, "text_loss": 0.48928648233413696 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8406222483122985, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.0009975952066785243, "loss": 0.0655, "macro_f1": 0.3006536364555359, "num_tokens": 1322549.0, "repeat_count": 1.0, "routers_loss": 0.22308112680912018, "skip_count": 4.0, "step": 818, "text_loss": 0.5211259722709656 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.850014675667743, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.0009975647912929557, "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1325213.0, "repeat_count": 0.0, "routers_loss": 0.00998698640614748, "skip_count": 0.0, "step": 820, "text_loss": 0.7117052674293518 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8594071030231873, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0009975341852396205, "loss": 0.0723, "macro_f1": 0.32098764181137085, "num_tokens": 1328383.0, "repeat_count": 0.0, "routers_loss": 0.07454588264226913, "skip_count": 2.0, "step": 822, "text_loss": 0.34539610147476196 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8687995303786322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009975033885302469, "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 1331406.0, "repeat_count": 0.0, "routers_loss": 0.009157589636743069, "skip_count": 0.0, "step": 824, "text_loss": 0.7484824657440186 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.878191957734077, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.0009974724011766363, "loss": 0.0474, "macro_f1": 0.3272727429866791, "num_tokens": 1334410.0, "repeat_count": 1.0, "routers_loss": 0.17149391770362854, "skip_count": 0.0, "step": 826, "text_loss": 0.5913820266723633 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8875843850895215, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0009974412231906632, "loss": 0.058, "macro_f1": 0.32098764181137085, "num_tokens": 1337653.0, "repeat_count": 1.0, "routers_loss": 0.09743282198905945, "skip_count": 1.0, "step": 828, "text_loss": 0.2505693733692169 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8969768124449664, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0009974098545842748, "loss": 0.0638, "macro_f1": 0.3272727429866791, "num_tokens": 1340860.0, "repeat_count": 0.0, "routers_loss": 0.041490405797958374, "skip_count": 1.0, "step": 830, "text_loss": 0.5585370063781738 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.906369239800411, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.193359375, "learning_rate": 0.0009973782953694918, "loss": 0.0746, "macro_f1": 0.3006536066532135, "num_tokens": 1344232.0, "repeat_count": 1.0, "routers_loss": 0.16080693900585175, "skip_count": 3.0, "step": 832, "text_loss": 0.4782734513282776 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9157616671558557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.000997346545558408, "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1347667.0, "repeat_count": 0.0, "routers_loss": 0.01173500344157219, "skip_count": 0.0, "step": 834, "text_loss": 0.25036177039146423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.0009973146051631895, "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1350707.0, "repeat_count": 0.0, "routers_loss": 0.011477196589112282, "skip_count": 0.0, "step": 836, "text_loss": 0.5482863187789917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.0009972824741960764, "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1353704.0, "repeat_count": 0.0, "routers_loss": 0.010528896935284138, "skip_count": 0.0, "step": 838, "text_loss": 0.6732596158981323 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9439389492221895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1181640625, "learning_rate": 0.000997250152669381, "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1356608.0, "repeat_count": 0.0, "routers_loss": 0.010678744874894619, "skip_count": 0.0, "step": 840, "text_loss": 0.5479338765144348 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9533313765776343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.181640625, "learning_rate": 0.000997217640595489, "loss": 0.0631, "macro_f1": 0.3333333432674408, "num_tokens": 1359809.0, "repeat_count": 0.0, "routers_loss": 0.00835978239774704, "skip_count": 0.0, "step": 842, "text_loss": 0.42543259263038635 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9627238039330788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.0009971849379868593, "loss": 0.0653, "macro_f1": 0.3333333432674408, "num_tokens": 1362201.0, "repeat_count": 0.0, "routers_loss": 0.009930923581123352, "skip_count": 0.0, "step": 844, "text_loss": 0.720462441444397 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9721162312885236, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.0009971520448560235, "loss": 0.0615, "macro_f1": 0.3272727429866791, "num_tokens": 1365790.0, "repeat_count": 0.0, "routers_loss": 0.06344373524188995, "skip_count": 1.0, "step": 846, "text_loss": 0.8423607349395752 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 3.9815086586439685, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.16796875, "learning_rate": 0.000997118961215586, "loss": 0.0674, "macro_f1": 0.4533333480358124, "num_tokens": 1368387.0, "repeat_count": 1.0, "routers_loss": 0.14688406884670258, "skip_count": 3.0, "step": 848, "text_loss": 0.3933577537536621 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000997085687078225, "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1371189.0, "repeat_count": 0.0, "routers_loss": 0.009953443892300129, "skip_count": 0.0, "step": 850, "text_loss": 0.41469162702560425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.0, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.0009970522224566909, "loss": 0.0555, "macro_f1": 0.32098767161369324, "num_tokens": 1374008.0, "repeat_count": 0.0, "routers_loss": 0.048870690166950226, "skip_count": 1.0, "step": 852, "text_loss": 0.613615870475769 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.009392427355444, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0009970185673638075, "loss": 0.0629, "macro_f1": 0.32098764181137085, "num_tokens": 1376662.0, "repeat_count": 1.0, "routers_loss": 0.06865929812192917, "skip_count": 1.0, "step": 854, "text_loss": 0.4392736256122589 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.01878485471089, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.162109375, "learning_rate": 0.0009969847218124716, "loss": 0.0506, "macro_f1": 0.5492662787437439, "num_tokens": 1380049.0, "repeat_count": 0.0, "routers_loss": 0.02382219396531582, "skip_count": 1.0, "step": 856, "text_loss": 0.19115346670150757 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.028177282066334, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0009969506858156527, "loss": 0.0344, "macro_f1": 0.3272727429866791, "num_tokens": 1383008.0, "repeat_count": 0.0, "routers_loss": 0.03907281160354614, "skip_count": 1.0, "step": 858, "text_loss": 0.34842637181282043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.037569709421779, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12060546875, "learning_rate": 0.0009969164593863935, "loss": 0.0365, "macro_f1": 0.3333333432674408, "num_tokens": 1387051.0, "repeat_count": 0.0, "routers_loss": 0.007645803038030863, "skip_count": 0.0, "step": 860, "text_loss": 0.3810436725616455 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.046962136777223, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0009968820425378098, "loss": 0.0463, "macro_f1": 0.3272727429866791, "num_tokens": 1390244.0, "repeat_count": 1.0, "routers_loss": 0.04435238987207413, "skip_count": 0.0, "step": 862, "text_loss": 0.34853485226631165 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.056354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28515625, "learning_rate": 0.00099684743528309, "loss": 0.0424, "macro_f1": 0.3333333432674408, "num_tokens": 1392976.0, "repeat_count": 0.0, "routers_loss": 0.006071661598980427, "skip_count": 0.0, "step": 864, "text_loss": 0.6395178437232971 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.065746991488113, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0810546875, "learning_rate": 0.0009968126376354958, "loss": 0.0477, "macro_f1": 0.5492662787437439, "num_tokens": 1396061.0, "repeat_count": 0.0, "routers_loss": 0.05011235550045967, "skip_count": 2.0, "step": 866, "text_loss": 0.09103966504335403 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.075139418843557, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009967776496083616, "loss": 0.0509, "macro_f1": 0.3272727429866791, "num_tokens": 1398993.0, "repeat_count": 1.0, "routers_loss": 0.03979124873876572, "skip_count": 0.0, "step": 868, "text_loss": 0.27257058024406433 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.084531846199002, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.14453125, "learning_rate": 0.000996742471215095, "loss": 0.0516, "macro_f1": 0.5492662787437439, "num_tokens": 1402080.0, "repeat_count": 0.0, "routers_loss": 0.030823837965726852, "skip_count": 2.0, "step": 870, "text_loss": 0.7047103047370911 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.093924273554447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009967071024691763, "loss": 0.0461, "macro_f1": 0.3333333432674408, "num_tokens": 1404890.0, "repeat_count": 0.0, "routers_loss": 0.009721715934574604, "skip_count": 0.0, "step": 872, "text_loss": 0.959106981754303 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1142578125, "learning_rate": 0.000996671543384159, "loss": 0.05, "macro_f1": 0.3333333432674408, "num_tokens": 1407853.0, "repeat_count": 0.0, "routers_loss": 0.006025883834809065, "skip_count": 0.0, "step": 874, "text_loss": 0.47571972012519836 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.112709128265336, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09765625, "learning_rate": 0.0009966357939736692, "loss": 0.0416, "macro_f1": 0.3272727429866791, "num_tokens": 1410723.0, "repeat_count": 0.0, "routers_loss": 0.025964925065636635, "skip_count": 0.0, "step": 876, "text_loss": 0.4964611530303955 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.122101555620781, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.0009965998542514065, "loss": 0.0415, "macro_f1": 0.32098764181137085, "num_tokens": 1414008.0, "repeat_count": 0.0, "routers_loss": 0.09509637206792831, "skip_count": 2.0, "step": 878, "text_loss": 0.621494710445404 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 4.131493982976226, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11083984375, "learning_rate": 0.0009965637242311427, "loss": 0.0472, "macro_f1": 0.542222261428833, "num_tokens": 1417447.0, "repeat_count": 0.0, "routers_loss": 0.02520318515598774, "skip_count": 4.0, "step": 880, "text_loss": 0.40209758281707764 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 4.14088641033167, "f1_execute": 0.936170220375061, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.263671875, "learning_rate": 0.000996527403926723, "loss": 0.0495, "macro_f1": 0.5342789888381958, "num_tokens": 1419905.0, "repeat_count": 0.0, "routers_loss": 0.13183781504631042, "skip_count": 6.0, "step": 882, "text_loss": 0.642185389995575 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.1502788376871145, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1201171875, "learning_rate": 0.0009964908933520655, "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 1423436.0, "repeat_count": 0.0, "routers_loss": 0.009429510682821274, "skip_count": 0.0, "step": 884, "text_loss": 0.48232755064964294 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.15967126504256, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1669921875, "learning_rate": 0.0009964541925211613, "loss": 0.0349, "macro_f1": 0.32098764181137085, "num_tokens": 1426842.0, "repeat_count": 0.0, "routers_loss": 0.07629609107971191, "skip_count": 2.0, "step": 886, "text_loss": 0.16620934009552002 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.169063692398004, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0927734375, "learning_rate": 0.0009964173014480738, "loss": 0.0348, "macro_f1": 0.5492662787437439, "num_tokens": 1430430.0, "repeat_count": 0.0, "routers_loss": 0.036814019083976746, "skip_count": 2.0, "step": 888, "text_loss": 0.4866008758544922 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.178456119753449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.0009963802201469398, "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1433821.0, "repeat_count": 0.0, "routers_loss": 0.0041250260546803474, "skip_count": 0.0, "step": 890, "text_loss": 0.578216552734375 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.187848547108893, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2373046875, "learning_rate": 0.0009963429486319693, "loss": 0.0463, "macro_f1": 0.32098764181137085, "num_tokens": 1436976.0, "repeat_count": 0.0, "routers_loss": 0.06213559955358505, "skip_count": 2.0, "step": 892, "text_loss": 0.221701517701149 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 4.197240974464338, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.361328125, "learning_rate": 0.0009963054869174446, "loss": 0.0313, "macro_f1": 0.4871794879436493, "num_tokens": 1440397.0, "repeat_count": 0.0, "routers_loss": 0.07532428950071335, "skip_count": 2.0, "step": 894, "text_loss": 0.6922838091850281 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.206633401819783, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.0009962678350177209, "loss": 0.0472, "macro_f1": 0.3272727429866791, "num_tokens": 1443604.0, "repeat_count": 0.0, "routers_loss": 0.0419243648648262, "skip_count": 1.0, "step": 896, "text_loss": 0.22092342376708984 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.216025829175227, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0009962299929472268, "loss": 0.034, "macro_f1": 0.32098764181137085, "num_tokens": 1446257.0, "repeat_count": 2.0, "routers_loss": 0.10849297791719437, "skip_count": 0.0, "step": 898, "text_loss": 0.26394811272621155 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.225418256530672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.000996191960720463, "loss": 0.0394, "macro_f1": 0.3333333432674408, "num_tokens": 1449669.0, "repeat_count": 0.0, "routers_loss": 0.0092767970636487, "skip_count": 0.0, "step": 900, "text_loss": 0.5338577628135681 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.234810683886117, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009961537383520042, "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1452450.0, "repeat_count": 1.0, "routers_loss": 0.02985367365181446, "skip_count": 0.0, "step": 902, "text_loss": 0.5875228047370911 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.2442031112415615, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.0009961153258564966, "loss": 0.0378, "macro_f1": 0.3144654333591461, "num_tokens": 1456909.0, "repeat_count": 0.0, "routers_loss": 0.06794842332601547, "skip_count": 3.0, "step": 904, "text_loss": 0.40959444642066956 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.253595538597006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009960767232486604, "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1461712.0, "repeat_count": 0.0, "routers_loss": 0.0023562447167932987, "skip_count": 0.0, "step": 906, "text_loss": 0.3932875096797943 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.262987965952451, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.000996037930543288, "loss": 0.0505, "macro_f1": 0.3272727429866791, "num_tokens": 1464817.0, "repeat_count": 0.0, "routers_loss": 0.03880339860916138, "skip_count": 1.0, "step": 908, "text_loss": 0.17482402920722961 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.272380393307896, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.000995998947755245, "loss": 0.0479, "macro_f1": 0.3272727429866791, "num_tokens": 1467810.0, "repeat_count": 0.0, "routers_loss": 0.01736828312277794, "skip_count": 1.0, "step": 910, "text_loss": 0.4140470325946808 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009959597748994695, "loss": 0.0752, "macro_f1": 0.3333333432674408, "num_tokens": 1470802.0, "repeat_count": 0.0, "routers_loss": 0.011824851855635643, "skip_count": 0.0, "step": 912, "text_loss": 0.7153383493423462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.2911652480187845, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0009959204119909726, "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1474539.0, "repeat_count": 0.0, "routers_loss": 0.025456594303250313, "skip_count": 0.0, "step": 914, "text_loss": 0.42812058329582214 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009958808590448385, "loss": 0.0489, "macro_f1": 0.3333333432674408, "num_tokens": 1477552.0, "repeat_count": 0.0, "routers_loss": 0.006795851048082113, "skip_count": 0.0, "step": 916, "text_loss": 0.5402814149856567 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.309950102729674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0009958411160762234, "loss": 0.039, "macro_f1": 0.3333333432674408, "num_tokens": 1482547.0, "repeat_count": 0.0, "routers_loss": 0.015615932643413544, "skip_count": 0.0, "step": 918, "text_loss": 0.3836168050765991 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.319342530085119, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08984375, "learning_rate": 0.0009958011831003577, "loss": 0.0448, "macro_f1": 0.3272727429866791, "num_tokens": 1485807.0, "repeat_count": 0.0, "routers_loss": 0.043541423976421356, "skip_count": 1.0, "step": 920, "text_loss": 0.4333936274051666 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.328734957440563, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.000995761060132543, "loss": 0.0418, "macro_f1": 0.6538461446762085, "num_tokens": 1488941.0, "repeat_count": 1.0, "routers_loss": 0.05866432189941406, "skip_count": 2.0, "step": 922, "text_loss": 0.4106994867324829 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.3381273847960085, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1630859375, "learning_rate": 0.0009957207471881552, "loss": 0.0531, "macro_f1": 0.5492662787437439, "num_tokens": 1492026.0, "repeat_count": 0.0, "routers_loss": 0.02714901603758335, "skip_count": 2.0, "step": 924, "text_loss": 0.542091429233551 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.347519812151453, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.0009956802442826415, "loss": 0.0386, "macro_f1": 0.3272727429866791, "num_tokens": 1494543.0, "repeat_count": 1.0, "routers_loss": 0.0563737191259861, "skip_count": 0.0, "step": 926, "text_loss": 0.47209203243255615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.356912239506897, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0009956395514315235, "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1497831.0, "repeat_count": 1.0, "routers_loss": 0.03285066783428192, "skip_count": 0.0, "step": 928, "text_loss": 0.6628931164741516 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.366304666862343, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009955986686503943, "loss": 0.0466, "macro_f1": 0.3272727429866791, "num_tokens": 1501375.0, "repeat_count": 0.0, "routers_loss": 0.024297121912240982, "skip_count": 1.0, "step": 930, "text_loss": 0.495676189661026 }, { "acc_repeat": 1.0, "acc_skip": 0.25, "avg_layers": 28.0, "epoch": 4.375697094217787, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.11181640625, "learning_rate": 0.0009955575959549202, "loss": 0.0424, "macro_f1": 0.7795917987823486, "num_tokens": 1504363.0, "repeat_count": 1.0, "routers_loss": 0.12196464836597443, "skip_count": 4.0, "step": 932, "text_loss": 0.26123273372650146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.3850895215732315, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.0009955163333608408, "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1507178.0, "repeat_count": 0.0, "routers_loss": 0.012947078794240952, "skip_count": 0.0, "step": 934, "text_loss": 0.32552677392959595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.394481948928676, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009954748808839674, "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 1509910.0, "repeat_count": 0.0, "routers_loss": 0.008946365676820278, "skip_count": 0.0, "step": 936, "text_loss": 0.533141016960144 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.403874376284121, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.000995433238540185, "loss": 0.0466, "macro_f1": 0.6538461446762085, "num_tokens": 1512826.0, "repeat_count": 1.0, "routers_loss": 0.029975678771734238, "skip_count": 1.0, "step": 938, "text_loss": 0.2953577935695648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.413266803639566, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.0009953914063454512, "loss": 0.0497, "macro_f1": 0.3144654333591461, "num_tokens": 1517230.0, "repeat_count": 1.0, "routers_loss": 0.0889134630560875, "skip_count": 2.0, "step": 940, "text_loss": 0.5368834733963013 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.193359375, "learning_rate": 0.000995349384315796, "loss": 0.0413, "macro_f1": 0.3333333432674408, "num_tokens": 1519876.0, "repeat_count": 0.0, "routers_loss": 0.013458753935992718, "skip_count": 0.0, "step": 942, "text_loss": 0.2005518227815628 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.432051658350455, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.000995307172467322, "loss": 0.0444, "macro_f1": 0.31446540355682373, "num_tokens": 1522998.0, "repeat_count": 1.0, "routers_loss": 0.08850377053022385, "skip_count": 1.0, "step": 944, "text_loss": 0.227926567196846 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.4414440857059, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009952647708162054, "loss": 0.0503, "macro_f1": 0.3272727429866791, "num_tokens": 1527100.0, "repeat_count": 0.0, "routers_loss": 0.03199794515967369, "skip_count": 1.0, "step": 946, "text_loss": 0.4859686493873596 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.450836513061344, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009952221793786942, "loss": 0.0354, "macro_f1": 0.3333333432674408, "num_tokens": 1530028.0, "repeat_count": 0.0, "routers_loss": 0.006507779937237501, "skip_count": 0.0, "step": 948, "text_loss": 0.6855354905128479 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.460228940416789, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.0009951793981711097, "loss": 0.0584, "macro_f1": 0.6538461446762085, "num_tokens": 1533254.0, "repeat_count": 1.0, "routers_loss": 0.06175103038549423, "skip_count": 1.0, "step": 950, "text_loss": 0.7590400576591492 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.469621367772234, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1025390625, "learning_rate": 0.0009951364272098458, "loss": 0.0295, "macro_f1": 0.5492662787437439, "num_tokens": 1536239.0, "repeat_count": 0.0, "routers_loss": 0.03773383051156998, "skip_count": 2.0, "step": 952, "text_loss": 0.669784665107727 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.4790137951276785, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009950932665113688, "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1539682.0, "repeat_count": 0.0, "routers_loss": 0.07280613481998444, "skip_count": 2.0, "step": 954, "text_loss": 0.3365570902824402 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.0009950499160922184, "loss": 0.0541, "macro_f1": 0.3333333432674408, "num_tokens": 1542875.0, "repeat_count": 0.0, "routers_loss": 0.01770266517996788, "skip_count": 0.0, "step": 956, "text_loss": 0.0921545997262001 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.497798649838567, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09375, "learning_rate": 0.000995006375969006, "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1547135.0, "repeat_count": 1.0, "routers_loss": 0.07672002166509628, "skip_count": 0.0, "step": 958, "text_loss": 0.5887606739997864 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.507191077194013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009949626461584165, "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 1550100.0, "repeat_count": 0.0, "routers_loss": 0.006247182376682758, "skip_count": 0.0, "step": 960, "text_loss": 0.5777931213378906 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.516583504549457, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.119140625, "learning_rate": 0.0009949187266772076, "loss": 0.0366, "macro_f1": 0.5492662787437439, "num_tokens": 1553192.0, "repeat_count": 0.0, "routers_loss": 0.030319908633828163, "skip_count": 2.0, "step": 962, "text_loss": 0.2370252162218094 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.5259759319049016, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009948746175422088, "loss": 0.0511, "macro_f1": 0.3333333432674408, "num_tokens": 1556318.0, "repeat_count": 0.0, "routers_loss": 0.006004320923238993, "skip_count": 0.0, "step": 964, "text_loss": 0.6271032094955444 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.000994830318770323, "loss": 0.0514, "macro_f1": 0.3333333432674408, "num_tokens": 1559195.0, "repeat_count": 0.0, "routers_loss": 0.011544366367161274, "skip_count": 0.0, "step": 966, "text_loss": 0.47256720066070557 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.544760786615791, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.171875, "learning_rate": 0.0009947858303785255, "loss": 0.0374, "macro_f1": 0.6603773832321167, "num_tokens": 1561813.0, "repeat_count": 1.0, "routers_loss": 0.05258861929178238, "skip_count": 1.0, "step": 968, "text_loss": 0.7703132629394531 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.554153213971236, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1142578125, "learning_rate": 0.0009947411523838648, "loss": 0.0453, "macro_f1": 0.3333333432674408, "num_tokens": 1564634.0, "repeat_count": 0.0, "routers_loss": 0.011216280050575733, "skip_count": 0.0, "step": 970, "text_loss": 0.4666804075241089 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0009946962848034608, "loss": 0.0696, "macro_f1": 0.3333333432674408, "num_tokens": 1567959.0, "repeat_count": 0.0, "routers_loss": 0.009387624450027943, "skip_count": 0.0, "step": 972, "text_loss": 0.4067264199256897 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.5729380686821255, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.203125, "learning_rate": 0.0009946512276545075, "loss": 0.0397, "macro_f1": 0.3272727429866791, "num_tokens": 1571221.0, "repeat_count": 1.0, "routers_loss": 0.041713520884513855, "skip_count": 0.0, "step": 974, "text_loss": 0.5242366194725037 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 4.58233049603757, "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 0.228515625, "learning_rate": 0.0009946059809542705, "loss": 0.0487, "macro_f1": 0.7644445300102234, "num_tokens": 1575033.0, "repeat_count": 2.0, "routers_loss": 0.05748331546783447, "skip_count": 2.0, "step": 976, "text_loss": 0.5704690217971802 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.591722923393014, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0009945605447200887, "loss": 0.0445, "macro_f1": 0.3272727429866791, "num_tokens": 1579050.0, "repeat_count": 0.0, "routers_loss": 0.016765203326940536, "skip_count": 0.0, "step": 978, "text_loss": 0.4804173707962036 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.601115350748459, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1337890625, "learning_rate": 0.0009945149189693732, "loss": 0.0406, "macro_f1": 0.5492662787437439, "num_tokens": 1582967.0, "repeat_count": 0.0, "routers_loss": 0.021518222987651825, "skip_count": 2.0, "step": 980, "text_loss": 0.4138598144054413 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.610507778103904, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.0009944691037196078, "loss": 0.0456, "macro_f1": 0.3333333432674408, "num_tokens": 1586282.0, "repeat_count": 0.0, "routers_loss": 0.012246460653841496, "skip_count": 0.0, "step": 982, "text_loss": 0.22561736404895782 }, { "acc_repeat": 0.5, "acc_skip": 0.800000011920929, "avg_layers": 24.0, "epoch": 4.6199002054593485, "f1_execute": 0.930232584476471, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8000000715255737, "grad_norm": 0.1455078125, "learning_rate": 0.0009944230989883491, "loss": 0.0456, "macro_f1": 0.7989664077758789, "num_tokens": 1589279.0, "repeat_count": 2.0, "routers_loss": 0.09344895929098129, "skip_count": 5.0, "step": 984, "text_loss": 0.4416656494140625 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.629292632814793, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.111328125, "learning_rate": 0.0009943769047932264, "loss": 0.0404, "macro_f1": 0.5359477400779724, "num_tokens": 1592398.0, "repeat_count": 2.0, "routers_loss": 0.08916857838630676, "skip_count": 2.0, "step": 986, "text_loss": 0.5536438822746277 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.638685060170237, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.000994330521151941, "loss": 0.039, "macro_f1": 0.32098764181137085, "num_tokens": 1596213.0, "repeat_count": 1.0, "routers_loss": 0.06114347651600838, "skip_count": 1.0, "step": 988, "text_loss": 0.5835405588150024 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.000994283948082267, "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1598827.0, "repeat_count": 0.0, "routers_loss": 0.0017335431184619665, "skip_count": 0.0, "step": 990, "text_loss": 0.5857380032539368 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.657469914881127, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10693359375, "learning_rate": 0.0009942371856020522, "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1602915.0, "repeat_count": 0.0, "routers_loss": 0.014606470242142677, "skip_count": 0.0, "step": 992, "text_loss": 0.6939892768859863 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 4.666862342236572, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.0009941902337292155, "loss": 0.06, "macro_f1": 0.6598639488220215, "num_tokens": 1605776.0, "repeat_count": 3.0, "routers_loss": 0.06297315657138824, "skip_count": 1.0, "step": 994, "text_loss": 0.37616831064224243 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.676254769592017, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1083984375, "learning_rate": 0.0009941430924817487, "loss": 0.0572, "macro_f1": 0.5492662787437439, "num_tokens": 1609856.0, "repeat_count": 0.0, "routers_loss": 0.03297794610261917, "skip_count": 2.0, "step": 996, "text_loss": 0.2098303586244583 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.685647196947461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.000994095761877717, "loss": 0.0499, "macro_f1": 0.3333333432674408, "num_tokens": 1612904.0, "repeat_count": 0.0, "routers_loss": 0.012901155278086662, "skip_count": 0.0, "step": 998, "text_loss": 0.20103533565998077 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.695039624302906, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.259765625, "learning_rate": 0.000994048241935257, "loss": 0.0535, "macro_f1": 0.3272727429866791, "num_tokens": 1615540.0, "repeat_count": 0.0, "routers_loss": 0.020434845238924026, "skip_count": 0.0, "step": 1000, "text_loss": 0.32709044218063354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.70443205165835, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1669921875, "learning_rate": 0.0009940005326725789, "loss": 0.0453, "macro_f1": 0.32098764181137085, "num_tokens": 1618786.0, "repeat_count": 0.0, "routers_loss": 0.07831378281116486, "skip_count": 2.0, "step": 1002, "text_loss": 0.5789632797241211 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.713824479013795, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21875, "learning_rate": 0.0009939526341079647, "loss": 0.0511, "macro_f1": 0.32098764181137085, "num_tokens": 1621736.0, "repeat_count": 2.0, "routers_loss": 0.04863874986767769, "skip_count": 0.0, "step": 1004, "text_loss": 0.6128849387168884 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009939045462597693, "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1624649.0, "repeat_count": 0.0, "routers_loss": 0.00677989237010479, "skip_count": 0.0, "step": 1006, "text_loss": 0.6168264150619507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.732609333724684, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009938562691464202, "loss": 0.0524, "macro_f1": 0.3333333432674408, "num_tokens": 1627700.0, "repeat_count": 0.0, "routers_loss": 0.019490402191877365, "skip_count": 0.0, "step": 1008, "text_loss": 0.17463822662830353 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.742001761080129, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.000993807802786417, "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1630714.0, "repeat_count": 0.0, "routers_loss": 0.0019022391643375158, "skip_count": 0.0, "step": 1010, "text_loss": 0.5675593018531799 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 4.751394188435574, "f1_execute": 0.9599999785423279, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 0.1640625, "learning_rate": 0.0009937591471983322, "loss": 0.0501, "macro_f1": 0.7644444704055786, "num_tokens": 1633770.0, "repeat_count": 1.0, "routers_loss": 0.042485643178224564, "skip_count": 2.0, "step": 1012, "text_loss": 0.42387229204177856 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.760786615791019, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0009937103024008109, "loss": 0.0545, "macro_f1": 0.3272727429866791, "num_tokens": 1637120.0, "repeat_count": 0.0, "routers_loss": 0.09427817165851593, "skip_count": 1.0, "step": 1014, "text_loss": 0.49511051177978516 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009936612684125702, "loss": 0.0503, "macro_f1": 0.3333333432674408, "num_tokens": 1640165.0, "repeat_count": 0.0, "routers_loss": 0.005106127820909023, "skip_count": 0.0, "step": 1016, "text_loss": 0.5398799180984497 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.7795714705019074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2734375, "learning_rate": 0.0009936120452524004, "loss": 0.0506, "macro_f1": 0.3333333432674408, "num_tokens": 1643251.0, "repeat_count": 0.0, "routers_loss": 0.016914300620555878, "skip_count": 0.0, "step": 1018, "text_loss": 0.20882178843021393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.788963897857353, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009935626329391637, "loss": 0.0537, "macro_f1": 0.32098764181137085, "num_tokens": 1646560.0, "repeat_count": 0.0, "routers_loss": 0.13481520116329193, "skip_count": 2.0, "step": 1020, "text_loss": 0.5719883441925049 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.798356325212797, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1552734375, "learning_rate": 0.0009935130314917948, "loss": 0.0602, "macro_f1": 0.5492662787437439, "num_tokens": 1649538.0, "repeat_count": 0.0, "routers_loss": 0.07700438797473907, "skip_count": 2.0, "step": 1022, "text_loss": 0.1303367167711258 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.807748752568242, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0009934632409293015, "loss": 0.0611, "macro_f1": 0.32098764181137085, "num_tokens": 1652397.0, "repeat_count": 1.0, "routers_loss": 0.11416907608509064, "skip_count": 1.0, "step": 1024, "text_loss": 0.24076920747756958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.817141179923686, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.0009934132612707631, "loss": 0.0507, "macro_f1": 0.31446540355682373, "num_tokens": 1654938.0, "repeat_count": 0.0, "routers_loss": 0.09484589844942093, "skip_count": 2.0, "step": 1026, "text_loss": 0.1652517318725586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009933630925353324, "loss": 0.0395, "macro_f1": 0.3333333432674408, "num_tokens": 1658536.0, "repeat_count": 0.0, "routers_loss": 0.00741987070068717, "skip_count": 0.0, "step": 1028, "text_loss": 0.49296700954437256 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.835926034634576, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1845703125, "learning_rate": 0.0009933127347422337, "loss": 0.0602, "macro_f1": 0.32098764181137085, "num_tokens": 1661446.0, "repeat_count": 0.0, "routers_loss": 0.08399344235658646, "skip_count": 2.0, "step": 1030, "text_loss": 0.22363591194152832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.158203125, "learning_rate": 0.0009932621879107648, "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1664612.0, "repeat_count": 0.0, "routers_loss": 0.0031781597062945366, "skip_count": 0.0, "step": 1032, "text_loss": 0.36083245277404785 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.854710889345466, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.000993211452060295, "loss": 0.042, "macro_f1": 0.3272727429866791, "num_tokens": 1667467.0, "repeat_count": 0.0, "routers_loss": 0.03595469892024994, "skip_count": 1.0, "step": 1034, "text_loss": 0.16372856497764587 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.86410331670091, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000993160527210266, "loss": 0.061, "macro_f1": 0.3144654333591461, "num_tokens": 1670675.0, "repeat_count": 3.0, "routers_loss": 0.1597205102443695, "skip_count": 0.0, "step": 1036, "text_loss": 0.6049913763999939 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2197265625, "learning_rate": 0.000993109413380193, "loss": 0.0562, "macro_f1": 0.3333333432674408, "num_tokens": 1673477.0, "repeat_count": 0.0, "routers_loss": 0.009756010957062244, "skip_count": 0.0, "step": 1038, "text_loss": 0.7034620642662048 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.882888171411799, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.0009930581105896624, "loss": 0.0559, "macro_f1": 0.3272727429866791, "num_tokens": 1676809.0, "repeat_count": 0.0, "routers_loss": 0.020718922838568687, "skip_count": 0.0, "step": 1040, "text_loss": 0.2814720571041107 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.892280598767244, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.0009930066188583338, "loss": 0.0445, "macro_f1": 0.32098764181137085, "num_tokens": 1679398.0, "repeat_count": 1.0, "routers_loss": 0.04755603149533272, "skip_count": 1.0, "step": 1042, "text_loss": 0.5445759296417236 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.901673026122689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.0009929549382059388, "loss": 0.0509, "macro_f1": 0.3333333432674408, "num_tokens": 1682269.0, "repeat_count": 0.0, "routers_loss": 0.01040949858725071, "skip_count": 0.0, "step": 1044, "text_loss": 0.2876914143562317 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.911065453478133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0009929030686522816, "loss": 0.0363, "macro_f1": 0.3333333432674408, "num_tokens": 1685428.0, "repeat_count": 0.0, "routers_loss": 0.008158888667821884, "skip_count": 0.0, "step": 1046, "text_loss": 0.49053525924682617 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.9204578808335775, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009928510102172386, "loss": 0.0498, "macro_f1": 0.3333333432674408, "num_tokens": 1688252.0, "repeat_count": 0.0, "routers_loss": 0.005102572031319141, "skip_count": 0.0, "step": 1048, "text_loss": 0.5274341106414795 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0009927987629207587, "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1691289.0, "repeat_count": 0.0, "routers_loss": 0.016768503934144974, "skip_count": 0.0, "step": 1050, "text_loss": 0.9935035109519958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.939242735544467, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0009927463267828634, "loss": 0.0488, "macro_f1": 0.3333333432674408, "num_tokens": 1694148.0, "repeat_count": 0.0, "routers_loss": 0.010905829258263111, "skip_count": 0.0, "step": 1052, "text_loss": 0.20895758271217346 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.948635162899912, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.000992693701823646, "loss": 0.0624, "macro_f1": 0.3272727429866791, "num_tokens": 1698543.0, "repeat_count": 1.0, "routers_loss": 0.10533971339464188, "skip_count": 0.0, "step": 1054, "text_loss": 0.5776236653327942 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.958027590255357, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.0009926408880632726, "loss": 0.0556, "macro_f1": 0.3272727429866791, "num_tokens": 1702460.0, "repeat_count": 0.0, "routers_loss": 0.026313411071896553, "skip_count": 1.0, "step": 1056, "text_loss": 0.34990596771240234 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.967420017610801, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.099609375, "learning_rate": 0.0009925878855219818, "loss": 0.0391, "macro_f1": 0.3333333432674408, "num_tokens": 1705686.0, "repeat_count": 0.0, "routers_loss": 0.007763393223285675, "skip_count": 0.0, "step": 1058, "text_loss": 0.4980163276195526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.976812444966246, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.177734375, "learning_rate": 0.000992534694220084, "loss": 0.0613, "macro_f1": 0.3272727429866791, "num_tokens": 1708739.0, "repeat_count": 0.0, "routers_loss": 0.03998444974422455, "skip_count": 1.0, "step": 1060, "text_loss": 0.29092350602149963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.98620487232169, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.000992481314177962, "loss": 0.0312, "macro_f1": 0.32098764181137085, "num_tokens": 1711903.0, "repeat_count": 1.0, "routers_loss": 0.06966045498847961, "skip_count": 1.0, "step": 1062, "text_loss": 0.6267179250717163 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.995597299677136, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.244140625, "learning_rate": 0.0009924277454160717, "loss": 0.0548, "macro_f1": 0.3272727429866791, "num_tokens": 1715974.0, "repeat_count": 0.0, "routers_loss": 0.05536063387989998, "skip_count": 1.0, "step": 1064, "text_loss": 0.5813798904418945 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.004696213677723, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0009923739879549402, "loss": 0.0423, "macro_f1": 0.3333333432674408, "num_tokens": 1718828.0, "repeat_count": 0.0, "routers_loss": 0.020993782207369804, "skip_count": 0.0, "step": 1066, "text_loss": 0.22665327787399292 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0888671875, "learning_rate": 0.0009923200418151677, "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 1722419.0, "repeat_count": 0.0, "routers_loss": 0.007351701147854328, "skip_count": 0.0, "step": 1068, "text_loss": 0.5796169638633728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.0234810683886115, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009922659070174264, "loss": 0.0452, "macro_f1": 0.3272727429866791, "num_tokens": 1725663.0, "repeat_count": 1.0, "routers_loss": 0.026033315807580948, "skip_count": 0.0, "step": 1070, "text_loss": 0.25742828845977783 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009922115835824612, "loss": 0.041, "macro_f1": 0.3333333432674408, "num_tokens": 1729239.0, "repeat_count": 0.0, "routers_loss": 0.0118600158020854, "skip_count": 0.0, "step": 1072, "text_loss": 0.21630282700061798 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.042265923099501, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12158203125, "learning_rate": 0.0009921570715310884, "loss": 0.0364, "macro_f1": 0.6666666865348816, "num_tokens": 1732507.0, "repeat_count": 1.0, "routers_loss": 0.016118815168738365, "skip_count": 0.0, "step": 1074, "text_loss": 0.5639925003051758 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.051658350454946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009921023708841974, "loss": 0.0407, "macro_f1": 0.3333333432674408, "num_tokens": 1736182.0, "repeat_count": 0.0, "routers_loss": 0.004275390412658453, "skip_count": 0.0, "step": 1076, "text_loss": 0.5758615136146545 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.0009920474816627496, "loss": 0.037, "macro_f1": 0.3333333432674408, "num_tokens": 1739559.0, "repeat_count": 0.0, "routers_loss": 0.01299292128533125, "skip_count": 0.0, "step": 1078, "text_loss": 0.18221625685691833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.0704432051658355, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009919924038877788, "loss": 0.0343, "macro_f1": 0.32098764181137085, "num_tokens": 1742890.0, "repeat_count": 0.0, "routers_loss": 0.038295745849609375, "skip_count": 2.0, "step": 1080, "text_loss": 0.17354349792003632 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 29.0, "epoch": 5.07983563252128, "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.1884765625, "learning_rate": 0.0009919371375803905, "loss": 0.0455, "macro_f1": 0.8194444179534912, "num_tokens": 1746433.0, "repeat_count": 2.0, "routers_loss": 0.04052971675992012, "skip_count": 3.0, "step": 1082, "text_loss": 0.2250112146139145 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009918816827617632, "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 1750802.0, "repeat_count": 0.0, "routers_loss": 0.009114136919379234, "skip_count": 0.0, "step": 1084, "text_loss": 0.2526719272136688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.098620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.000991826039453147, "loss": 0.0392, "macro_f1": 0.3333333432674408, "num_tokens": 1754272.0, "repeat_count": 0.0, "routers_loss": 0.004904678091406822, "skip_count": 0.0, "step": 1086, "text_loss": 0.7308789491653442 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 5.108012914587614, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.138671875, "learning_rate": 0.000991770207675865, "loss": 0.0327, "macro_f1": 0.6666666865348816, "num_tokens": 1757231.0, "repeat_count": 0.0, "routers_loss": 0.02129189297556877, "skip_count": 2.0, "step": 1088, "text_loss": 0.21764220297336578 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.1174053419430585, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009917141874513113, "loss": 0.0315, "macro_f1": 0.3333333432674408, "num_tokens": 1760003.0, "repeat_count": 0.0, "routers_loss": 0.01310618408024311, "skip_count": 0.0, "step": 1090, "text_loss": 0.33892181515693665 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.126797769298503, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.171875, "learning_rate": 0.0009916579788009537, "loss": 0.0457, "macro_f1": 0.5492662787437439, "num_tokens": 1763052.0, "repeat_count": 0.0, "routers_loss": 0.02059309557080269, "skip_count": 2.0, "step": 1092, "text_loss": 0.6551769375801086 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.136190196653947, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10546875, "learning_rate": 0.0009916015817463312, "loss": 0.0385, "macro_f1": 0.5492662787437439, "num_tokens": 1766655.0, "repeat_count": 0.0, "routers_loss": 0.0274797435849905, "skip_count": 2.0, "step": 1094, "text_loss": 0.3984372019767761 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.000991544996309055, "loss": 0.0271, "macro_f1": 0.3333333432674408, "num_tokens": 1769997.0, "repeat_count": 0.0, "routers_loss": 0.01437368243932724, "skip_count": 0.0, "step": 1096, "text_loss": 0.4203338921070099 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.154975051364837, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.000991488222510809, "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 1773130.0, "repeat_count": 0.0, "routers_loss": 0.001382062560878694, "skip_count": 0.0, "step": 1098, "text_loss": 0.43132516741752625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.164367478720282, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.000991431260373349, "loss": 0.0329, "macro_f1": 0.3144654333591461, "num_tokens": 1775682.0, "repeat_count": 1.0, "routers_loss": 0.1115434318780899, "skip_count": 2.0, "step": 1100, "text_loss": 0.3218227028846741 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.111328125, "learning_rate": 0.000991374109918503, "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 1778407.0, "repeat_count": 0.0, "routers_loss": 0.009529678151011467, "skip_count": 0.0, "step": 1102, "text_loss": 0.17183731496334076 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.183152333431171, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1142578125, "learning_rate": 0.000991316771168171, "loss": 0.044, "macro_f1": 0.5492662787437439, "num_tokens": 1781518.0, "repeat_count": 0.0, "routers_loss": 0.018668074160814285, "skip_count": 2.0, "step": 1104, "text_loss": 1.1324785947799683 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.192544760786616, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.125, "learning_rate": 0.0009912592441443258, "loss": 0.0411, "macro_f1": 0.3272727429866791, "num_tokens": 1784878.0, "repeat_count": 0.0, "routers_loss": 0.04145100712776184, "skip_count": 1.0, "step": 1106, "text_loss": 0.6082063317298889 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.20193718814206, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08984375, "learning_rate": 0.0009912015288690112, "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1788978.0, "repeat_count": 0.0, "routers_loss": 0.021450644358992577, "skip_count": 1.0, "step": 1108, "text_loss": 0.5597621202468872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.2113296154975055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.083984375, "learning_rate": 0.0009911436253643444, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 1792321.0, "repeat_count": 0.0, "routers_loss": 0.017405325546860695, "skip_count": 0.0, "step": 1110, "text_loss": 0.2560598850250244 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.0009910855336525137, "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1795182.0, "repeat_count": 0.0, "routers_loss": 0.007162237539887428, "skip_count": 0.0, "step": 1112, "text_loss": 0.3438240587711334 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 5.230114470208394, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.115234375, "learning_rate": 0.00099102725375578, "loss": 0.0326, "macro_f1": 0.480392187833786, "num_tokens": 1798987.0, "repeat_count": 1.0, "routers_loss": 0.11149197816848755, "skip_count": 3.0, "step": 1114, "text_loss": 0.20455503463745117 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.239506897563839, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.0009909687856964767, "loss": 0.035, "macro_f1": 0.3006536364555359, "num_tokens": 1802064.0, "repeat_count": 2.0, "routers_loss": 0.12679415941238403, "skip_count": 3.0, "step": 1116, "text_loss": 0.11996729671955109 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.248899324919284, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.12451171875, "learning_rate": 0.0009909101294970082, "loss": 0.0365, "macro_f1": 0.5492662787437439, "num_tokens": 1805412.0, "repeat_count": 0.0, "routers_loss": 0.05108053982257843, "skip_count": 2.0, "step": 1118, "text_loss": 0.13224145770072937 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.258291752274729, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.123046875, "learning_rate": 0.0009908512851798522, "loss": 0.0455, "macro_f1": 0.6603773832321167, "num_tokens": 1808196.0, "repeat_count": 1.0, "routers_loss": 0.02131766639649868, "skip_count": 1.0, "step": 1120, "text_loss": 0.7824069261550903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.0009907922527675576, "loss": 0.0405, "macro_f1": 0.3333333432674408, "num_tokens": 1811622.0, "repeat_count": 0.0, "routers_loss": 0.006226244382560253, "skip_count": 0.0, "step": 1122, "text_loss": 0.5419743061065674 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.277076606985618, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.12890625, "learning_rate": 0.000990733032282746, "loss": 0.0535, "macro_f1": 0.5492662787437439, "num_tokens": 1814628.0, "repeat_count": 0.0, "routers_loss": 0.03088250942528248, "skip_count": 2.0, "step": 1124, "text_loss": 0.37100958824157715 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.286469034341063, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0810546875, "learning_rate": 0.000990673623748111, "loss": 0.0348, "macro_f1": 0.32098767161369324, "num_tokens": 1817205.0, "repeat_count": 0.0, "routers_loss": 0.05495348572731018, "skip_count": 1.0, "step": 1126, "text_loss": 0.20241330564022064 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.295861461696507, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.0927734375, "learning_rate": 0.0009906140271864173, "loss": 0.0433, "macro_f1": 0.4871794879436493, "num_tokens": 1820141.0, "repeat_count": 0.0, "routers_loss": 0.037809282541275024, "skip_count": 2.0, "step": 1128, "text_loss": 0.32965806126594543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.305253889051952, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0908203125, "learning_rate": 0.0009905542426205032, "loss": 0.0348, "macro_f1": 0.32098767161369324, "num_tokens": 1824011.0, "repeat_count": 0.0, "routers_loss": 0.03320181369781494, "skip_count": 1.0, "step": 1130, "text_loss": 0.36329755187034607 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.314646316407397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009904942700732777, "loss": 0.0335, "macro_f1": 0.3333333432674408, "num_tokens": 1826873.0, "repeat_count": 0.0, "routers_loss": 0.004102326463907957, "skip_count": 0.0, "step": 1132, "text_loss": 0.6692602038383484 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.324038743762841, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08544921875, "learning_rate": 0.0009904341095677226, "loss": 0.03, "macro_f1": 0.29333335161209106, "num_tokens": 1830103.0, "repeat_count": 2.0, "routers_loss": 0.2376193106174469, "skip_count": 4.0, "step": 1134, "text_loss": 0.19212862849235535 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.333431171118286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.119140625, "learning_rate": 0.0009903737611268919, "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 1833201.0, "repeat_count": 0.0, "routers_loss": 0.005253395065665245, "skip_count": 0.0, "step": 1136, "text_loss": 0.6773360371589661 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.34282359847373, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009903132247739107, "loss": 0.0305, "macro_f1": 0.3076923191547394, "num_tokens": 1836045.0, "repeat_count": 1.0, "routers_loss": 0.14382585883140564, "skip_count": 3.0, "step": 1138, "text_loss": 0.2882297933101654 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.3522160258291755, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.150390625, "learning_rate": 0.0009902525005319766, "loss": 0.04, "macro_f1": 0.5427350401878357, "num_tokens": 1839721.0, "repeat_count": 1.0, "routers_loss": 0.04033960774540901, "skip_count": 2.0, "step": 1140, "text_loss": 0.7172559499740601 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.36160845318462, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12109375, "learning_rate": 0.0009901915884243597, "loss": 0.0351, "macro_f1": 0.6666666865348816, "num_tokens": 1842614.0, "repeat_count": 1.0, "routers_loss": 0.005162308923900127, "skip_count": 0.0, "step": 1142, "text_loss": 0.42892804741859436 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.371000880540064, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009901304884744014, "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1845444.0, "repeat_count": 1.0, "routers_loss": 0.10117656737565994, "skip_count": 2.0, "step": 1144, "text_loss": 0.20806430280208588 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.380393307895509, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.0009900692007055152, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 1848558.0, "repeat_count": 0.0, "routers_loss": 0.014107038266956806, "skip_count": 0.0, "step": 1146, "text_loss": 0.5355974435806274 }, { "acc_repeat": 0.25, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 5.389785735250954, "f1_execute": 0.9166666865348816, "f1_repeat": 0.4000000059604645, "f1_skip": 0.6666666865348816, "grad_norm": 0.16015625, "learning_rate": 0.000990007725141187, "loss": 0.0449, "macro_f1": 0.6611111164093018, "num_tokens": 1852723.0, "repeat_count": 4.0, "routers_loss": 0.15537866950035095, "skip_count": 2.0, "step": 1148, "text_loss": 0.6388513445854187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.399178162606399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1181640625, "learning_rate": 0.0009899460618049741, "loss": 0.0397, "macro_f1": 0.3333333432674408, "num_tokens": 1856181.0, "repeat_count": 0.0, "routers_loss": 0.011800912208855152, "skip_count": 0.0, "step": 1150, "text_loss": 0.6113069653511047 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 5.408570589961843, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.000989884210720506, "loss": 0.0331, "macro_f1": 0.6666666865348816, "num_tokens": 1859685.0, "repeat_count": 2.0, "routers_loss": 0.022900646552443504, "skip_count": 0.0, "step": 1152, "text_loss": 0.25718021392822266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.4179630173172875, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009898221719114844, "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1862505.0, "repeat_count": 0.0, "routers_loss": 0.026814989745616913, "skip_count": 1.0, "step": 1154, "text_loss": 0.5426549911499023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.427355444672733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0009897599454016823, "loss": 0.0401, "macro_f1": 0.3333333432674408, "num_tokens": 1866266.0, "repeat_count": 0.0, "routers_loss": 0.0032623792067170143, "skip_count": 0.0, "step": 1156, "text_loss": 0.37752896547317505 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.436747872028177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07080078125, "learning_rate": 0.0009896975312149454, "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 1870216.0, "repeat_count": 0.0, "routers_loss": 0.015617577359080315, "skip_count": 0.0, "step": 1158, "text_loss": 0.18207129836082458 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.446140299383622, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009896349293751906, "loss": 0.0423, "macro_f1": 0.3272727429866791, "num_tokens": 1873338.0, "repeat_count": 0.0, "routers_loss": 0.02250153198838234, "skip_count": 1.0, "step": 1160, "text_loss": 0.548884391784668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.455532726739067, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0009895721399064072, "loss": 0.0388, "macro_f1": 0.32098764181137085, "num_tokens": 1876470.0, "repeat_count": 1.0, "routers_loss": 0.055204521864652634, "skip_count": 1.0, "step": 1162, "text_loss": 0.48052409291267395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.464925154094511, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.0009895091628326564, "loss": 0.0293, "macro_f1": 0.3333333432674408, "num_tokens": 1879354.0, "repeat_count": 0.0, "routers_loss": 0.009093789383769035, "skip_count": 0.0, "step": 1164, "text_loss": 0.3908069431781769 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.474317581449956, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.000989445998178071, "loss": 0.0323, "macro_f1": 0.3272727429866791, "num_tokens": 1881941.0, "repeat_count": 0.0, "routers_loss": 0.015086972154676914, "skip_count": 1.0, "step": 1166, "text_loss": 0.4884725511074066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.4837100088054, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0009893826459668558, "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1885374.0, "repeat_count": 0.0, "routers_loss": 0.06587666273117065, "skip_count": 3.0, "step": 1168, "text_loss": 0.12760137021541595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0009893191062232873, "loss": 0.0322, "macro_f1": 0.3333333432674408, "num_tokens": 1888612.0, "repeat_count": 0.0, "routers_loss": 0.006088624242693186, "skip_count": 0.0, "step": 1170, "text_loss": 0.4821319580078125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0009892553789717143, "loss": 0.0389, "macro_f1": 0.3333333432674408, "num_tokens": 1891463.0, "repeat_count": 0.0, "routers_loss": 0.010113578289747238, "skip_count": 0.0, "step": 1172, "text_loss": 0.3613642454147339 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.5118872908717345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0009891914642365573, "loss": 0.0404, "macro_f1": 0.3333333432674408, "num_tokens": 1894230.0, "repeat_count": 0.0, "routers_loss": 0.004947459790855646, "skip_count": 0.0, "step": 1174, "text_loss": 0.5037549138069153 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.521279718227179, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.0009891273620423083, "loss": 0.0428, "macro_f1": 0.3272727429866791, "num_tokens": 1897294.0, "repeat_count": 1.0, "routers_loss": 0.026075217872858047, "skip_count": 0.0, "step": 1176, "text_loss": 0.32558977603912354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.530672145582624, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12158203125, "learning_rate": 0.0009890630724135314, "loss": 0.0351, "macro_f1": 0.3272727429866791, "num_tokens": 1901553.0, "repeat_count": 0.0, "routers_loss": 0.06650999188423157, "skip_count": 1.0, "step": 1178, "text_loss": 0.23473620414733887 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.540064572938069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009889985953748625, "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 1904556.0, "repeat_count": 0.0, "routers_loss": 0.010361116379499435, "skip_count": 1.0, "step": 1180, "text_loss": 0.6927042007446289 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.549457000293513, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.103515625, "learning_rate": 0.0009889339309510094, "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 1908053.0, "repeat_count": 0.0, "routers_loss": 0.013286533765494823, "skip_count": 0.0, "step": 1182, "text_loss": 0.19977325201034546 }, { "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 5.558849427648958, "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, "grad_norm": 0.058837890625, "learning_rate": 0.0009888690791667518, "loss": 0.0204, "macro_f1": 0.7018141150474548, "num_tokens": 1911754.0, "repeat_count": 2.0, "routers_loss": 0.11920545995235443, "skip_count": 3.0, "step": 1184, "text_loss": 0.4072858691215515 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.568241855004403, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11083984375, "learning_rate": 0.0009888040400469408, "loss": 0.0391, "macro_f1": 0.3272727429866791, "num_tokens": 1914862.0, "repeat_count": 0.0, "routers_loss": 0.03652849420905113, "skip_count": 1.0, "step": 1186, "text_loss": 0.2654043138027191 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.577634282359847, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1689453125, "learning_rate": 0.0009887388136164996, "loss": 0.0336, "macro_f1": 0.5492662787437439, "num_tokens": 1918542.0, "repeat_count": 0.0, "routers_loss": 0.03991910070180893, "skip_count": 2.0, "step": 1188, "text_loss": 0.21130657196044922 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.587026709715292, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09521484375, "learning_rate": 0.000988673399900423, "loss": 0.0429, "macro_f1": 0.3272727429866791, "num_tokens": 1921589.0, "repeat_count": 0.0, "routers_loss": 0.014900135807693005, "skip_count": 0.0, "step": 1190, "text_loss": 0.5519335865974426 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.596419137070737, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0009886077989237777, "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 1924320.0, "repeat_count": 0.0, "routers_loss": 0.06271552294492722, "skip_count": 1.0, "step": 1192, "text_loss": 0.213813915848732 }, { "acc_repeat": 0.0, "acc_skip": 0.800000011920929, "avg_layers": 24.0, "epoch": 5.6058115644261814, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 0.1875, "learning_rate": 0.000988542010711702, "loss": 0.0342, "macro_f1": 0.6225374937057495, "num_tokens": 1927178.0, "repeat_count": 0.0, "routers_loss": 0.03081391751766205, "skip_count": 5.0, "step": 1194, "text_loss": 0.7524349093437195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.615203991781626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.0009884760352894064, "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1930216.0, "repeat_count": 0.0, "routers_loss": 0.008556773886084557, "skip_count": 0.0, "step": 1196, "text_loss": 0.28230375051498413 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.62459641913707, "f1_execute": 0.9615384340286255, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 0.1064453125, "learning_rate": 0.0009884098726821726, "loss": 0.0472, "macro_f1": 0.4871794879436493, "num_tokens": 1933312.0, "repeat_count": 3.0, "routers_loss": 0.05344727262854576, "skip_count": 0.0, "step": 1198, "text_loss": 0.5509607195854187 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 5.633988846492516, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.1298828125, "learning_rate": 0.000988343522915354, "loss": 0.0441, "macro_f1": 0.480392187833786, "num_tokens": 1936160.0, "repeat_count": 1.0, "routers_loss": 0.07324771583080292, "skip_count": 3.0, "step": 1200, "text_loss": 0.30565372109413147 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 25.0, "epoch": 5.64338127384796, "f1_execute": 0.8936169743537903, "f1_repeat": 0.0, "f1_skip": 0.444444477558136, "grad_norm": 0.2470703125, "learning_rate": 0.0009882769860143764, "loss": 0.0317, "macro_f1": 0.4460204839706421, "num_tokens": 1939266.0, "repeat_count": 0.0, "routers_loss": 0.18620699644088745, "skip_count": 6.0, "step": 1202, "text_loss": 0.976121723651886 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.6527737012034045, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1318359375, "learning_rate": 0.000988210262004737, "loss": 0.0474, "macro_f1": 0.6666666865348816, "num_tokens": 1942173.0, "repeat_count": 0.0, "routers_loss": 0.007703613489866257, "skip_count": 1.0, "step": 1204, "text_loss": 0.5647401809692383 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.66216612855885, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1484375, "learning_rate": 0.0009881433509120036, "loss": 0.0376, "macro_f1": 0.5492662787437439, "num_tokens": 1945071.0, "repeat_count": 0.0, "routers_loss": 0.02162683941423893, "skip_count": 2.0, "step": 1206, "text_loss": 0.24229218065738678 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.671558555914294, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0966796875, "learning_rate": 0.0009880762527618176, "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1949060.0, "repeat_count": 0.0, "routers_loss": 0.017667081207036972, "skip_count": 0.0, "step": 1208, "text_loss": 0.4035970866680145 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009880089675798908, "loss": 0.0367, "macro_f1": 0.3333333432674408, "num_tokens": 1951698.0, "repeat_count": 0.0, "routers_loss": 0.006405784282833338, "skip_count": 0.0, "step": 1210, "text_loss": 0.5319879055023193 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.690343410625183, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009879414953920071, "loss": 0.0294, "macro_f1": 0.3333333432674408, "num_tokens": 1955266.0, "repeat_count": 0.0, "routers_loss": 0.009859707206487656, "skip_count": 0.0, "step": 1212, "text_loss": 0.6687407493591309 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.699735837980628, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.130859375, "learning_rate": 0.0009878738362240219, "loss": 0.045, "macro_f1": 0.5492662787437439, "num_tokens": 1958538.0, "repeat_count": 0.0, "routers_loss": 0.030890554189682007, "skip_count": 2.0, "step": 1214, "text_loss": 0.20820017158985138 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 5.709128265336073, "f1_execute": 0.9200000166893005, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.000987805990101862, "loss": 0.0317, "macro_f1": 0.47333335876464844, "num_tokens": 1961419.0, "repeat_count": 2.0, "routers_loss": 0.10383198410272598, "skip_count": 2.0, "step": 1216, "text_loss": 0.8664976358413696 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.718520692691517, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009877379570515268, "loss": 0.0366, "macro_f1": 0.3333333432674408, "num_tokens": 1964836.0, "repeat_count": 0.0, "routers_loss": 0.013376163318753242, "skip_count": 0.0, "step": 1218, "text_loss": 0.4223395884037018 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.727913120046962, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0859375, "learning_rate": 0.0009876697370990865, "loss": 0.0343, "macro_f1": 0.3333333432674408, "num_tokens": 1967620.0, "repeat_count": 0.0, "routers_loss": 0.008577900938689709, "skip_count": 0.0, "step": 1220, "text_loss": 0.4789901375770569 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.0009876013302706828, "loss": 0.049, "macro_f1": 0.3333333432674408, "num_tokens": 1971100.0, "repeat_count": 0.0, "routers_loss": 0.004730266984552145, "skip_count": 0.0, "step": 1222, "text_loss": 0.6799837946891785 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.7466979747578515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009875327365925295, "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1974408.0, "repeat_count": 0.0, "routers_loss": 0.010849526152014732, "skip_count": 0.0, "step": 1224, "text_loss": 0.18967926502227783 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.756090402113296, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009874639560909118, "loss": 0.0498, "macro_f1": 0.32098767161369324, "num_tokens": 1977046.0, "repeat_count": 0.0, "routers_loss": 0.04841252416372299, "skip_count": 1.0, "step": 1226, "text_loss": 0.6133310198783875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.765482829468741, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1318359375, "learning_rate": 0.0009873949887921867, "loss": 0.0402, "macro_f1": 0.3272727429866791, "num_tokens": 1980330.0, "repeat_count": 0.0, "routers_loss": 0.029638588428497314, "skip_count": 1.0, "step": 1228, "text_loss": 0.15649555623531342 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.774875256824186, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.0009873258347227823, "loss": 0.0331, "macro_f1": 0.3272727429866791, "num_tokens": 1983173.0, "repeat_count": 0.0, "routers_loss": 0.009955910965800285, "skip_count": 0.0, "step": 1230, "text_loss": 0.4741005599498749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009872564939091989, "loss": 0.0342, "macro_f1": 0.3333333432674408, "num_tokens": 1986825.0, "repeat_count": 0.0, "routers_loss": 0.010205300524830818, "skip_count": 0.0, "step": 1232, "text_loss": 0.5315462350845337 }, { "acc_repeat": 1.0, "acc_skip": 0.5714285969734192, "avg_layers": 25.0, "epoch": 5.7936601115350745, "f1_execute": 0.9302325248718262, "f1_repeat": 1.0, "f1_skip": 0.7272727489471436, "grad_norm": 0.11865234375, "learning_rate": 0.0009871869663780077, "loss": 0.0336, "macro_f1": 0.8858351111412048, "num_tokens": 1990448.0, "repeat_count": 1.0, "routers_loss": 0.09120134264230728, "skip_count": 7.0, "step": 1234, "text_loss": 0.6187508702278137 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.125, "learning_rate": 0.0009871172521558522, "loss": 0.0475, "macro_f1": 0.6666666865348816, "num_tokens": 1993474.0, "repeat_count": 0.0, "routers_loss": 0.016188839450478554, "skip_count": 1.0, "step": 1236, "text_loss": 0.20783066749572754 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 5.812444966245964, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.216796875, "learning_rate": 0.0009870473512694465, "loss": 0.0373, "macro_f1": 0.5934640765190125, "num_tokens": 1996536.0, "repeat_count": 0.0, "routers_loss": 0.05046704784035683, "skip_count": 3.0, "step": 1238, "text_loss": 0.247748002409935 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.821837393601409, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.09033203125, "learning_rate": 0.0009869772637455772, "loss": 0.0251, "macro_f1": 0.4871794879436493, "num_tokens": 1999530.0, "repeat_count": 0.0, "routers_loss": 0.044926248490810394, "skip_count": 2.0, "step": 1240, "text_loss": 0.26001980900764465 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.831229820956853, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1513671875, "learning_rate": 0.000986906989611102, "loss": 0.0446, "macro_f1": 0.3272727429866791, "num_tokens": 2002782.0, "repeat_count": 0.0, "routers_loss": 0.025911526754498482, "skip_count": 0.0, "step": 1242, "text_loss": 0.9009982943534851 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.8406222483122985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0009868365288929492, "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2005331.0, "repeat_count": 0.0, "routers_loss": 0.0043760035187006, "skip_count": 0.0, "step": 1244, "text_loss": 0.5547386407852173 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.850014675667743, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.0009867658816181206, "loss": 0.0374, "macro_f1": 0.3333333432674408, "num_tokens": 2008115.0, "repeat_count": 0.0, "routers_loss": 0.009227181784808636, "skip_count": 0.0, "step": 1246, "text_loss": 1.0067731142044067 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.859407103023187, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.000986695047813688, "loss": 0.0261, "macro_f1": 0.3272727429866791, "num_tokens": 2011137.0, "repeat_count": 1.0, "routers_loss": 0.023822437971830368, "skip_count": 0.0, "step": 1248, "text_loss": 0.30058956146240234 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 5.868799530378633, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.1044921875, "learning_rate": 0.0009866240275067948, "loss": 0.044, "macro_f1": 0.47333335876464844, "num_tokens": 2014159.0, "repeat_count": 2.0, "routers_loss": 0.21523773670196533, "skip_count": 3.0, "step": 1250, "text_loss": 0.39072203636169434 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.878191957734077, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1201171875, "learning_rate": 0.0009865528207246563, "loss": 0.0351, "macro_f1": 0.5492662787437439, "num_tokens": 2017731.0, "repeat_count": 0.0, "routers_loss": 0.06184682995080948, "skip_count": 2.0, "step": 1252, "text_loss": 0.35751575231552124 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.8875843850895215, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.166015625, "learning_rate": 0.000986481427494559, "loss": 0.0336, "macro_f1": 0.3333333432674408, "num_tokens": 2020485.0, "repeat_count": 0.0, "routers_loss": 0.007573372684419155, "skip_count": 0.0, "step": 1254, "text_loss": 0.4061077833175659 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.896976812444966, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.000986409847843861, "loss": 0.0382, "macro_f1": 0.3272727429866791, "num_tokens": 2024149.0, "repeat_count": 1.0, "routers_loss": 0.07447971403598785, "skip_count": 0.0, "step": 1256, "text_loss": 0.41876497864723206 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.906369239800411, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.000986338081799992, "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 2026545.0, "repeat_count": 0.0, "routers_loss": 0.006609147880226374, "skip_count": 0.0, "step": 1258, "text_loss": 0.4673794209957123 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.915761667155856, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.0009862661293904523, "loss": 0.0498, "macro_f1": 0.32098764181137085, "num_tokens": 2029581.0, "repeat_count": 0.0, "routers_loss": 0.10624702274799347, "skip_count": 2.0, "step": 1260, "text_loss": 0.3483233153820038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1201171875, "learning_rate": 0.0009861939906428145, "loss": 0.0525, "macro_f1": 0.3333333432674408, "num_tokens": 2033936.0, "repeat_count": 0.0, "routers_loss": 0.007944886572659016, "skip_count": 0.0, "step": 1262, "text_loss": 0.16362667083740234 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.934546521866745, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009861216655847225, "loss": 0.0376, "macro_f1": 0.6666666865348816, "num_tokens": 2037876.0, "repeat_count": 1.0, "routers_loss": 0.007004092447459698, "skip_count": 0.0, "step": 1264, "text_loss": 0.43228110671043396 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.94393894922219, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.0009860491542438912, "loss": 0.047, "macro_f1": 0.3272727429866791, "num_tokens": 2040842.0, "repeat_count": 0.0, "routers_loss": 0.026916226372122765, "skip_count": 1.0, "step": 1266, "text_loss": 0.5901188850402832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.953331376577634, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.000985976456648107, "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 2043890.0, "repeat_count": 0.0, "routers_loss": 0.007325216196477413, "skip_count": 0.0, "step": 1268, "text_loss": 0.8780109882354736 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.962723803933079, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.10205078125, "learning_rate": 0.000985903572825228, "loss": 0.0306, "macro_f1": 0.4871794879436493, "num_tokens": 2048848.0, "repeat_count": 0.0, "routers_loss": 0.05007527023553848, "skip_count": 2.0, "step": 1270, "text_loss": 0.5863722562789917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.972116231288524, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.000985830502803183, "loss": 0.0396, "macro_f1": 0.3272727429866791, "num_tokens": 2051561.0, "repeat_count": 0.0, "routers_loss": 0.023995524272322655, "skip_count": 0.0, "step": 1272, "text_loss": 0.7460709810256958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.9815086586439685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.0009857572466099732, "loss": 0.0431, "macro_f1": 0.3333333432674408, "num_tokens": 2054752.0, "repeat_count": 0.0, "routers_loss": 0.006928362417966127, "skip_count": 0.0, "step": 1274, "text_loss": 0.5130293369293213 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.162109375, "learning_rate": 0.0009856838042736698, "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 2058151.0, "repeat_count": 0.0, "routers_loss": 0.006969396956264973, "skip_count": 0.0, "step": 1276, "text_loss": 0.5911393761634827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.0009856101758224166, "loss": 0.0441, "macro_f1": 0.3333333432674408, "num_tokens": 2061012.0, "repeat_count": 0.0, "routers_loss": 0.003499418031424284, "skip_count": 0.0, "step": 1278, "text_loss": 0.25347545742988586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.000985536361284428, "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2064597.0, "repeat_count": 0.0, "routers_loss": 0.007856054231524467, "skip_count": 0.0, "step": 1280, "text_loss": 0.7476963400840759 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.01878485471089, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0888671875, "learning_rate": 0.0009854623606879898, "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2067972.0, "repeat_count": 0.0, "routers_loss": 0.02617792971432209, "skip_count": 1.0, "step": 1282, "text_loss": 0.5775872468948364 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.028177282066334, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09033203125, "learning_rate": 0.000985388174061459, "loss": 0.0356, "macro_f1": 0.32098767161369324, "num_tokens": 2071812.0, "repeat_count": 0.0, "routers_loss": 0.035979997366666794, "skip_count": 1.0, "step": 1284, "text_loss": 0.2933400869369507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.037569709421779, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08447265625, "learning_rate": 0.0009853138014332646, "loss": 0.0273, "macro_f1": 0.3333333432674408, "num_tokens": 2074868.0, "repeat_count": 0.0, "routers_loss": 0.005142854526638985, "skip_count": 0.0, "step": 1286, "text_loss": 0.29085102677345276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.046962136777223, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09033203125, "learning_rate": 0.0009852392428319058, "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 2078225.0, "repeat_count": 0.0, "routers_loss": 0.0032799106556922197, "skip_count": 0.0, "step": 1288, "text_loss": 0.7293626070022583 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 6.056354564132668, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.08935546875, "learning_rate": 0.0009851644982859537, "loss": 0.0273, "macro_f1": 0.480392187833786, "num_tokens": 2081495.0, "repeat_count": 1.0, "routers_loss": 0.12224318832159042, "skip_count": 3.0, "step": 1290, "text_loss": 0.26125892996788025 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.065746991488113, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009850895678240508, "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2084390.0, "repeat_count": 1.0, "routers_loss": 0.010662888176739216, "skip_count": 0.0, "step": 1292, "text_loss": 0.3510764539241791 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.075139418843557, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1689453125, "learning_rate": 0.0009850144514749104, "loss": 0.0332, "macro_f1": 0.5492662787437439, "num_tokens": 2087210.0, "repeat_count": 0.0, "routers_loss": 0.01979079470038414, "skip_count": 2.0, "step": 1294, "text_loss": 0.40202176570892334 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.084531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11669921875, "learning_rate": 0.000984939149267317, "loss": 0.0253, "macro_f1": 0.6666666865348816, "num_tokens": 2090777.0, "repeat_count": 0.0, "routers_loss": 0.005172552540898323, "skip_count": 1.0, "step": 1296, "text_loss": 0.5275651216506958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.093924273554447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009848636612301272, "loss": 0.0299, "macro_f1": 0.3333333432674408, "num_tokens": 2094248.0, "repeat_count": 0.0, "routers_loss": 0.0029599082190543413, "skip_count": 0.0, "step": 1298, "text_loss": 0.4517653286457062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23046875, "learning_rate": 0.0009847879873922675, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2097139.0, "repeat_count": 0.0, "routers_loss": 0.011455860920250416, "skip_count": 0.0, "step": 1300, "text_loss": 0.16888445615768433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.112709128265336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09619140625, "learning_rate": 0.0009847121277827366, "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2100415.0, "repeat_count": 0.0, "routers_loss": 0.008091195486485958, "skip_count": 0.0, "step": 1302, "text_loss": 0.40061676502227783 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.122101555620781, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.000984636082430604, "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2103285.0, "repeat_count": 0.0, "routers_loss": 0.009593960829079151, "skip_count": 0.0, "step": 1304, "text_loss": 0.7211073637008667 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.131493982976226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.107421875, "learning_rate": 0.0009845598513650103, "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2106255.0, "repeat_count": 0.0, "routers_loss": 0.0023068038281053305, "skip_count": 0.0, "step": 1306, "text_loss": 0.7077119946479797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.0009844834346151674, "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 2109305.0, "repeat_count": 0.0, "routers_loss": 0.007703019306063652, "skip_count": 0.0, "step": 1308, "text_loss": 0.3534316122531891 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.1502788376871145, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0009844068322103585, "loss": 0.0287, "macro_f1": 0.3272727429866791, "num_tokens": 2112216.0, "repeat_count": 0.0, "routers_loss": 0.023549847304821014, "skip_count": 1.0, "step": 1310, "text_loss": 0.6792599558830261 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.0009843300441799378, "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 2114925.0, "repeat_count": 0.0, "routers_loss": 0.007605871185660362, "skip_count": 0.0, "step": 1312, "text_loss": 0.1571389138698578 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.169063692398004, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0009842530705533304, "loss": 0.0253, "macro_f1": 0.3272727429866791, "num_tokens": 2117744.0, "repeat_count": 0.0, "routers_loss": 0.014964760281145573, "skip_count": 0.0, "step": 1314, "text_loss": 0.7840361595153809 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.178456119753449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.000984175911360033, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2120848.0, "repeat_count": 0.0, "routers_loss": 0.004663798492401838, "skip_count": 0.0, "step": 1316, "text_loss": 0.536246120929718 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.187848547108893, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1201171875, "learning_rate": 0.000984098566629613, "loss": 0.0288, "macro_f1": 0.5492662787437439, "num_tokens": 2123651.0, "repeat_count": 0.0, "routers_loss": 0.022852955386042595, "skip_count": 2.0, "step": 1318, "text_loss": 0.43372172117233276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.197240974464338, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0009840210363917087, "loss": 0.0216, "macro_f1": 0.3333333432674408, "num_tokens": 2128011.0, "repeat_count": 0.0, "routers_loss": 0.012578422203660011, "skip_count": 0.0, "step": 1320, "text_loss": 0.28190380334854126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.0009839433206760306, "loss": 0.0204, "macro_f1": 0.3333333432674408, "num_tokens": 2131035.0, "repeat_count": 0.0, "routers_loss": 0.006863643880933523, "skip_count": 0.0, "step": 1322, "text_loss": 0.6340444087982178 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.216025829175227, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.0009838654195123589, "loss": 0.0243, "macro_f1": 0.3333333432674408, "num_tokens": 2133856.0, "repeat_count": 0.0, "routers_loss": 0.00468854233622551, "skip_count": 0.0, "step": 1324, "text_loss": 0.5138425827026367 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.225418256530672, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0009837873329305458, "loss": 0.0396, "macro_f1": 0.6666666865348816, "num_tokens": 2136451.0, "repeat_count": 1.0, "routers_loss": 0.005731126759201288, "skip_count": 0.0, "step": 1326, "text_loss": 0.742124617099762 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.000983709060960514, "loss": 0.0416, "macro_f1": 0.3333333432674408, "num_tokens": 2139496.0, "repeat_count": 0.0, "routers_loss": 0.0056343949399888515, "skip_count": 0.0, "step": 1328, "text_loss": 0.7317464351654053 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.2442031112415615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.0009836306036322576, "loss": 0.0312, "macro_f1": 0.3333333432674408, "num_tokens": 2143120.0, "repeat_count": 0.0, "routers_loss": 0.005127966403961182, "skip_count": 0.0, "step": 1330, "text_loss": 0.538652241230011 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 6.253595538597006, "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.11083984375, "learning_rate": 0.0009835519609758415, "loss": 0.0301, "macro_f1": 0.590062141418457, "num_tokens": 2145807.0, "repeat_count": 3.0, "routers_loss": 0.1673707216978073, "skip_count": 4.0, "step": 1332, "text_loss": 0.3498198091983795 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.262987965952451, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009834731330214017, "loss": 0.0293, "macro_f1": 0.3272727429866791, "num_tokens": 2148397.0, "repeat_count": 1.0, "routers_loss": 0.04026653990149498, "skip_count": 0.0, "step": 1334, "text_loss": 0.8153424859046936 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 27.0, "epoch": 6.272380393307896, "f1_execute": 0.8999999761581421, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8000000715255737, "grad_norm": 0.16015625, "learning_rate": 0.0009833941197991455, "loss": 0.0329, "macro_f1": 0.7888889312744141, "num_tokens": 2152226.0, "repeat_count": 2.0, "routers_loss": 0.05481519177556038, "skip_count": 5.0, "step": 1336, "text_loss": 0.7802760004997253 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.28177282066334, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009833149213393506, "loss": 0.0304, "macro_f1": 0.3272727429866791, "num_tokens": 2156023.0, "repeat_count": 0.0, "routers_loss": 0.01760484278202057, "skip_count": 0.0, "step": 1338, "text_loss": 0.19721226394176483 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.2911652480187845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.000983235537672366, "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2160037.0, "repeat_count": 0.0, "routers_loss": 0.013206037692725658, "skip_count": 0.0, "step": 1340, "text_loss": 0.5003817081451416 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.30055767537423, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.000983155968828612, "loss": 0.0315, "macro_f1": 0.6666666865348816, "num_tokens": 2163910.0, "repeat_count": 1.0, "routers_loss": 0.01256406120955944, "skip_count": 0.0, "step": 1342, "text_loss": 0.5996923446655273 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.309950102729674, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11962890625, "learning_rate": 0.0009830762148385793, "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2166921.0, "repeat_count": 0.0, "routers_loss": 0.015086234547197819, "skip_count": 1.0, "step": 1344, "text_loss": 0.45356282591819763 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.319342530085119, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08447265625, "learning_rate": 0.0009829962757328297, "loss": 0.0223, "macro_f1": 0.32098764181137085, "num_tokens": 2170135.0, "repeat_count": 0.0, "routers_loss": 0.07909081131219864, "skip_count": 2.0, "step": 1346, "text_loss": 0.2874644994735718 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 6.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009829161515419959, "loss": 0.0246, "macro_f1": 0.6666666865348816, "num_tokens": 2173029.0, "repeat_count": 0.0, "routers_loss": 0.013569854199886322, "skip_count": 2.0, "step": 1348, "text_loss": 0.25533875823020935 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.3381273847960085, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0009828358422967823, "loss": 0.0226, "macro_f1": 0.32098764181137085, "num_tokens": 2176605.0, "repeat_count": 1.0, "routers_loss": 0.08111091703176498, "skip_count": 1.0, "step": 1350, "text_loss": 0.32827726006507874 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 6.347519812151453, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.091796875, "learning_rate": 0.0009827553480279627, "loss": 0.03, "macro_f1": 0.5427350401878357, "num_tokens": 2179406.0, "repeat_count": 0.0, "routers_loss": 0.026550088077783585, "skip_count": 2.0, "step": 1352, "text_loss": 0.2966301143169403 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009826746687663832, "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2182353.0, "repeat_count": 0.0, "routers_loss": 0.003914554137736559, "skip_count": 0.0, "step": 1354, "text_loss": 0.7596251964569092 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 6.366304666862343, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0859375, "learning_rate": 0.0009825938045429602, "loss": 0.0324, "macro_f1": 0.5866667032241821, "num_tokens": 2185786.0, "repeat_count": 1.0, "routers_loss": 0.059612665325403214, "skip_count": 3.0, "step": 1356, "text_loss": 0.12325898557901382 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.375697094217787, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10009765625, "learning_rate": 0.0009825127553886807, "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 2190157.0, "repeat_count": 0.0, "routers_loss": 0.0071132429875433445, "skip_count": 0.0, "step": 1358, "text_loss": 0.9287898540496826 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.3850895215732315, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0009824315213346033, "loss": 0.0348, "macro_f1": 0.3333333432674408, "num_tokens": 2193077.0, "repeat_count": 0.0, "routers_loss": 0.009611099027097225, "skip_count": 0.0, "step": 1360, "text_loss": 0.20427259802818298 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.394481948928676, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.0009823501024118569, "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2196494.0, "repeat_count": 0.0, "routers_loss": 0.006913455203175545, "skip_count": 0.0, "step": 1362, "text_loss": 0.574759840965271 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.403874376284121, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009822684986516411, "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 2199839.0, "repeat_count": 0.0, "routers_loss": 0.009208920411765575, "skip_count": 0.0, "step": 1364, "text_loss": 0.42422571778297424 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.413266803639566, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.000982186710085227, "loss": 0.0208, "macro_f1": 0.32098764181137085, "num_tokens": 2203212.0, "repeat_count": 1.0, "routers_loss": 0.059975091367959976, "skip_count": 1.0, "step": 1366, "text_loss": 0.29213017225265503 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 6.42265923099501, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.181640625, "learning_rate": 0.0009821047367439561, "loss": 0.0358, "macro_f1": 0.44705885648727417, "num_tokens": 2206240.0, "repeat_count": 0.0, "routers_loss": 0.048244867473840714, "skip_count": 4.0, "step": 1368, "text_loss": 0.3072395324707031 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.432051658350455, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.0009820225786592405, "loss": 0.0375, "macro_f1": 0.3272727429866791, "num_tokens": 2209903.0, "repeat_count": 1.0, "routers_loss": 0.026068156585097313, "skip_count": 0.0, "step": 1370, "text_loss": 0.5961400270462036 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.4414440857059, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.109375, "learning_rate": 0.0009819402358625634, "loss": 0.0366, "macro_f1": 0.3272727429866791, "num_tokens": 2213439.0, "repeat_count": 0.0, "routers_loss": 0.022615568712353706, "skip_count": 1.0, "step": 1372, "text_loss": 0.19375644624233246 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.450836513061344, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.000981857708385479, "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2216457.0, "repeat_count": 0.0, "routers_loss": 0.005855285096913576, "skip_count": 0.0, "step": 1374, "text_loss": 0.5123368501663208 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.460228940416789, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.0009817749962596114, "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2219975.0, "repeat_count": 1.0, "routers_loss": 0.0651634931564331, "skip_count": 0.0, "step": 1376, "text_loss": 0.5999220609664917 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.469621367772234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0009816920995166568, "loss": 0.0371, "macro_f1": 0.6666666865348816, "num_tokens": 2222833.0, "repeat_count": 1.0, "routers_loss": 0.011408994905650616, "skip_count": 0.0, "step": 1378, "text_loss": 0.5323230624198914 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.4790137951276785, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.205078125, "learning_rate": 0.0009816090181883807, "loss": 0.0313, "macro_f1": 0.32098764181137085, "num_tokens": 2225842.0, "repeat_count": 0.0, "routers_loss": 0.039720915257930756, "skip_count": 2.0, "step": 1380, "text_loss": 0.23363439738750458 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.0009815257523066204, "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 2229430.0, "repeat_count": 0.0, "routers_loss": 0.002765297656878829, "skip_count": 0.0, "step": 1382, "text_loss": 0.718977689743042 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.497798649838567, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.0009814423019032835, "loss": 0.0396, "macro_f1": 0.5492662787437439, "num_tokens": 2232594.0, "repeat_count": 2.0, "routers_loss": 0.05362323671579361, "skip_count": 0.0, "step": 1384, "text_loss": 0.6392166614532471 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.507191077194013, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.0009813586670103483, "loss": 0.0426, "macro_f1": 0.6603773832321167, "num_tokens": 2236327.0, "repeat_count": 1.0, "routers_loss": 0.031728316098451614, "skip_count": 1.0, "step": 1386, "text_loss": 0.5951619148254395 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.516583504549457, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.126953125, "learning_rate": 0.0009812748476598638, "loss": 0.031, "macro_f1": 0.5492662787437439, "num_tokens": 2239746.0, "repeat_count": 0.0, "routers_loss": 0.03981253132224083, "skip_count": 2.0, "step": 1388, "text_loss": 0.22756551206111908 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.5259759319049016, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.12451171875, "learning_rate": 0.0009811908438839498, "loss": 0.0331, "macro_f1": 0.5492662787437439, "num_tokens": 2242786.0, "repeat_count": 0.0, "routers_loss": 0.04617162421345711, "skip_count": 2.0, "step": 1390, "text_loss": 0.3233799934387207 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.535368359260346, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.000981106655714797, "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2245696.0, "repeat_count": 0.0, "routers_loss": 0.046828847378492355, "skip_count": 1.0, "step": 1392, "text_loss": 0.24273279309272766 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.544760786615791, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.07373046875, "learning_rate": 0.0009810222831846656, "loss": 0.0307, "macro_f1": 0.5492662787437439, "num_tokens": 2249326.0, "repeat_count": 0.0, "routers_loss": 0.010921589098870754, "skip_count": 2.0, "step": 1394, "text_loss": 0.3921460807323456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.554153213971236, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.0009809377263258882, "loss": 0.0315, "macro_f1": 0.32098767161369324, "num_tokens": 2253393.0, "repeat_count": 0.0, "routers_loss": 0.04564022272825241, "skip_count": 1.0, "step": 1396, "text_loss": 0.582602858543396 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.56354564132668, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.103515625, "learning_rate": 0.000980852985170867, "loss": 0.0328, "macro_f1": 0.3272727429866791, "num_tokens": 2256626.0, "repeat_count": 0.0, "routers_loss": 0.013289985246956348, "skip_count": 0.0, "step": 1398, "text_loss": 0.41031694412231445 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.5729380686821255, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.0009807680597520745, "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2259326.0, "repeat_count": 0.0, "routers_loss": 0.0065213534981012344, "skip_count": 0.0, "step": 1400, "text_loss": 0.2888098657131195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.58233049603757, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23046875, "learning_rate": 0.0009806829501020546, "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2262344.0, "repeat_count": 0.0, "routers_loss": 0.04199840500950813, "skip_count": 1.0, "step": 1402, "text_loss": 0.31973034143447876 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.591722923393014, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08935546875, "learning_rate": 0.0009805976562534215, "loss": 0.0317, "macro_f1": 0.6603773832321167, "num_tokens": 2266354.0, "repeat_count": 1.0, "routers_loss": 0.015434930101037025, "skip_count": 1.0, "step": 1404, "text_loss": 0.508630633354187 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 6.601115350748459, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.0009805121782388599, "loss": 0.0339, "macro_f1": 0.6533333659172058, "num_tokens": 2269660.0, "repeat_count": 2.0, "routers_loss": 0.0720924660563469, "skip_count": 2.0, "step": 1406, "text_loss": 0.40927737951278687 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.610507778103904, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0634765625, "learning_rate": 0.0009804265160911253, "loss": 0.0266, "macro_f1": 0.5492662787437439, "num_tokens": 2273335.0, "repeat_count": 0.0, "routers_loss": 0.02400495670735836, "skip_count": 2.0, "step": 1408, "text_loss": 0.1777762621641159 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.6199002054593485, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2314453125, "learning_rate": 0.0009803406698430433, "loss": 0.0371, "macro_f1": 0.3272727429866791, "num_tokens": 2277107.0, "repeat_count": 0.0, "routers_loss": 0.02560107782483101, "skip_count": 1.0, "step": 1410, "text_loss": 0.17955881357192993 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.629292632814793, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009802546395275104, "loss": 0.0349, "macro_f1": 0.3333333432674408, "num_tokens": 2281638.0, "repeat_count": 0.0, "routers_loss": 0.006655813194811344, "skip_count": 0.0, "step": 1412, "text_loss": 0.20882295072078705 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 6.638685060170237, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.08740234375, "learning_rate": 0.000980168425177494, "loss": 0.0342, "macro_f1": 0.8200000524520874, "num_tokens": 2284876.0, "repeat_count": 1.0, "routers_loss": 0.06325097382068634, "skip_count": 3.0, "step": 1414, "text_loss": 0.26035264134407043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.648077487525683, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.000980082026826031, "loss": 0.0315, "macro_f1": 0.3272727429866791, "num_tokens": 2288938.0, "repeat_count": 1.0, "routers_loss": 0.013436575420200825, "skip_count": 0.0, "step": 1416, "text_loss": 0.5502325892448425 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.657469914881127, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.0009799954445062296, "loss": 0.0193, "macro_f1": 0.6603773832321167, "num_tokens": 2292317.0, "repeat_count": 1.0, "routers_loss": 0.011264479719102383, "skip_count": 1.0, "step": 1418, "text_loss": 0.48075684905052185 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.666862342236572, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1611328125, "learning_rate": 0.0009799086782512686, "loss": 0.0292, "macro_f1": 0.5492662787437439, "num_tokens": 2295935.0, "repeat_count": 0.0, "routers_loss": 0.02833271212875843, "skip_count": 2.0, "step": 1420, "text_loss": 0.18221206963062286 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09375, "learning_rate": 0.0009798217280943967, "loss": 0.0356, "macro_f1": 0.6666666865348816, "num_tokens": 2298927.0, "repeat_count": 0.0, "routers_loss": 0.009208574891090393, "skip_count": 1.0, "step": 1422, "text_loss": 0.48686322569847107 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.685647196947461, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.0009797345940689335, "loss": 0.0267, "macro_f1": 0.3272727429866791, "num_tokens": 2301541.0, "repeat_count": 0.0, "routers_loss": 0.015011847950518131, "skip_count": 0.0, "step": 1424, "text_loss": 0.49446266889572144 }, { "acc_repeat": 0.0, "acc_skip": 0.4000000059604645, "avg_layers": 26.0, "epoch": 6.695039624302906, "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, "f1_skip": 0.5714285969734192, "grad_norm": 0.1337890625, "learning_rate": 0.0009796472762082687, "loss": 0.0338, "macro_f1": 0.5034013986587524, "num_tokens": 2304589.0, "repeat_count": 0.0, "routers_loss": 0.05912091210484505, "skip_count": 5.0, "step": 1426, "text_loss": 0.23945684731006622 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.70443205165835, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09765625, "learning_rate": 0.000979559774545863, "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 2307860.0, "repeat_count": 0.0, "routers_loss": 0.021242303773760796, "skip_count": 1.0, "step": 1428, "text_loss": 0.531273365020752 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.713824479013795, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.000979472089115247, "loss": 0.0276, "macro_f1": 0.32098764181137085, "num_tokens": 2311581.0, "repeat_count": 0.0, "routers_loss": 0.02768544852733612, "skip_count": 2.0, "step": 1430, "text_loss": 0.2497459501028061 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.000979384219950022, "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2314639.0, "repeat_count": 0.0, "routers_loss": 0.008678150363266468, "skip_count": 0.0, "step": 1432, "text_loss": 0.6579355001449585 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.732609333724684, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08056640625, "learning_rate": 0.0009792961670838595, "loss": 0.0362, "macro_f1": 0.3272727429866791, "num_tokens": 2317927.0, "repeat_count": 1.0, "routers_loss": 0.03325597569346428, "skip_count": 0.0, "step": 1434, "text_loss": 0.5209436416625977 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.742001761080129, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0009792079305505016, "loss": 0.0306, "macro_f1": 0.3272727429866791, "num_tokens": 2321065.0, "repeat_count": 1.0, "routers_loss": 0.019228918477892876, "skip_count": 0.0, "step": 1436, "text_loss": 0.41087067127227783 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.751394188435574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.000979119510383761, "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2323714.0, "repeat_count": 0.0, "routers_loss": 0.017071325331926346, "skip_count": 0.0, "step": 1438, "text_loss": 0.21490029990673065 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.760786615791019, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2060546875, "learning_rate": 0.00097903090661752, "loss": 0.0309, "macro_f1": 0.3333333432674408, "num_tokens": 2326454.0, "repeat_count": 0.0, "routers_loss": 0.00991755723953247, "skip_count": 0.0, "step": 1440, "text_loss": 0.23847346007823944 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.770179043146463, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.232421875, "learning_rate": 0.000978942119285732, "loss": 0.0404, "macro_f1": 0.3272727429866791, "num_tokens": 2329462.0, "repeat_count": 0.0, "routers_loss": 0.04908733069896698, "skip_count": 1.0, "step": 1442, "text_loss": 0.23343028128147125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.7795714705019074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1044921875, "learning_rate": 0.0009788531484224204, "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2332146.0, "repeat_count": 0.0, "routers_loss": 0.0032628148328512907, "skip_count": 0.0, "step": 1444, "text_loss": 0.47423800826072693 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 29.0, "epoch": 6.788963897857353, "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, "grad_norm": 0.10693359375, "learning_rate": 0.0009787639940616788, "loss": 0.0405, "macro_f1": 0.7018141150474548, "num_tokens": 2335738.0, "repeat_count": 1.0, "routers_loss": 0.14336998760700226, "skip_count": 3.0, "step": 1446, "text_loss": 0.21837592124938965 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.189453125, "learning_rate": 0.0009786746562376717, "loss": 0.0241, "macro_f1": 0.6666666865348816, "num_tokens": 2338488.0, "repeat_count": 0.0, "routers_loss": 0.010542908683419228, "skip_count": 1.0, "step": 1448, "text_loss": 1.0614757537841797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.807748752568242, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.0009785851349846334, "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2342074.0, "repeat_count": 0.0, "routers_loss": 0.005998016335070133, "skip_count": 0.0, "step": 1450, "text_loss": 0.4269719421863556 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 6.817141179923686, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.1083984375, "learning_rate": 0.0009784954303368686, "loss": 0.0384, "macro_f1": 0.44705885648727417, "num_tokens": 2345838.0, "repeat_count": 0.0, "routers_loss": 0.0959126204252243, "skip_count": 3.0, "step": 1452, "text_loss": 0.3315916955471039 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.0009784055423287521, "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 2348939.0, "repeat_count": 0.0, "routers_loss": 0.0025467623490840197, "skip_count": 0.0, "step": 1454, "text_loss": 0.6162732839584351 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.835926034634576, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0009783154709947293, "loss": 0.0256, "macro_f1": 0.3272727429866791, "num_tokens": 2352232.0, "repeat_count": 0.0, "routers_loss": 0.01860538125038147, "skip_count": 1.0, "step": 1456, "text_loss": 0.23928768932819366 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.84531846199002, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0009782252163693158, "loss": 0.0201, "macro_f1": 0.3272727429866791, "num_tokens": 2355159.0, "repeat_count": 0.0, "routers_loss": 0.04412713274359703, "skip_count": 1.0, "step": 1458, "text_loss": 0.3371323347091675 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21484375, "learning_rate": 0.0009781347784870973, "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 2358175.0, "repeat_count": 0.0, "routers_loss": 0.006809141952544451, "skip_count": 0.0, "step": 1460, "text_loss": 0.547267735004425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.86410331670091, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009780441573827296, "loss": 0.03, "macro_f1": 0.3076923191547394, "num_tokens": 2360991.0, "repeat_count": 0.0, "routers_loss": 0.08924390375614166, "skip_count": 4.0, "step": 1462, "text_loss": 0.7026563882827759 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1865234375, "learning_rate": 0.000977953353090939, "loss": 0.0272, "macro_f1": 0.3333333432674408, "num_tokens": 2363894.0, "repeat_count": 0.0, "routers_loss": 0.021858472377061844, "skip_count": 0.0, "step": 1464, "text_loss": 0.2718065083026886 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.882888171411799, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.0009778623656465219, "loss": 0.0338, "macro_f1": 0.32098764181137085, "num_tokens": 2367265.0, "repeat_count": 0.0, "routers_loss": 0.044781096279621124, "skip_count": 0.0, "step": 1466, "text_loss": 0.5008095502853394 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.892280598767244, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009777711950843448, "loss": 0.0212, "macro_f1": 0.3333333432674408, "num_tokens": 2370186.0, "repeat_count": 0.0, "routers_loss": 0.0040459707379341125, "skip_count": 0.0, "step": 1468, "text_loss": 0.5242461562156677 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 6.901673026122689, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.134765625, "learning_rate": 0.0009776798414393446, "loss": 0.0279, "macro_f1": 0.6598639488220215, "num_tokens": 2373314.0, "repeat_count": 1.0, "routers_loss": 0.0708528608083725, "skip_count": 3.0, "step": 1470, "text_loss": 0.2821732461452484 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.911065453478133, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1328125, "learning_rate": 0.0009775883047465279, "loss": 0.0414, "macro_f1": 0.31446540355682373, "num_tokens": 2376435.0, "repeat_count": 1.0, "routers_loss": 0.0290578193962574, "skip_count": 1.0, "step": 1472, "text_loss": 0.8438440561294556 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.9204578808335775, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10546875, "learning_rate": 0.000977496585040972, "loss": 0.0373, "macro_f1": 0.3333333432674408, "num_tokens": 2380244.0, "repeat_count": 0.0, "routers_loss": 0.010360375046730042, "skip_count": 0.0, "step": 1474, "text_loss": 0.4356135427951813 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.929850308189023, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.000977404682357824, "loss": 0.0294, "macro_f1": 0.3272727429866791, "num_tokens": 2383498.0, "repeat_count": 0.0, "routers_loss": 0.023518972098827362, "skip_count": 0.0, "step": 1476, "text_loss": 0.25195425748825073 }, { "acc_repeat": 0.800000011920929, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 6.939242735544467, "f1_execute": 0.9743589162826538, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, "grad_norm": 0.11181640625, "learning_rate": 0.000977312596732301, "loss": 0.0375, "macro_f1": 0.9544159770011902, "num_tokens": 2386414.0, "repeat_count": 5.0, "routers_loss": 0.08190606534481049, "skip_count": 4.0, "step": 1478, "text_loss": 0.6586798429489136 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.948635162899912, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10546875, "learning_rate": 0.0009772203281996905, "loss": 0.0336, "macro_f1": 1.0, "num_tokens": 2389399.0, "repeat_count": 1.0, "routers_loss": 0.016441475600004196, "skip_count": 2.0, "step": 1480, "text_loss": 0.3671986758708954 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.958027590255357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009771278767953502, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2392400.0, "repeat_count": 0.0, "routers_loss": 0.019211363047361374, "skip_count": 0.0, "step": 1482, "text_loss": 0.27418580651283264 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.967420017610801, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0947265625, "learning_rate": 0.0009770352425547072, "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 2395123.0, "repeat_count": 0.0, "routers_loss": 0.015800386667251587, "skip_count": 0.0, "step": 1484, "text_loss": 0.19896622002124786 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.976812444966246, "f1_execute": 0.9615384340286255, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009769424255132596, "loss": 0.0256, "macro_f1": 0.4871794879436493, "num_tokens": 2397359.0, "repeat_count": 3.0, "routers_loss": 0.06670158356428146, "skip_count": 0.0, "step": 1486, "text_loss": 0.4229799509048462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.98620487232169, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1162109375, "learning_rate": 0.0009768494257065747, "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 2400387.0, "repeat_count": 0.0, "routers_loss": 0.011144762858748436, "skip_count": 1.0, "step": 1488, "text_loss": 0.4264226257801056 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.995597299677136, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12353515625, "learning_rate": 0.0009767562431702904, "loss": 0.0387, "macro_f1": 0.3006536364555359, "num_tokens": 2403241.0, "repeat_count": 2.0, "routers_loss": 0.12339717149734497, "skip_count": 3.0, "step": 1490, "text_loss": 0.2850193977355957 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.004696213677723, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07177734375, "learning_rate": 0.0009766628779401142, "loss": 0.0215, "macro_f1": 0.6666666865348816, "num_tokens": 2406087.0, "repeat_count": 0.0, "routers_loss": 0.008174685761332512, "skip_count": 1.0, "step": 1492, "text_loss": 0.6756544709205627 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.000976569330051824, "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 2409312.0, "repeat_count": 0.0, "routers_loss": 0.0021256296895444393, "skip_count": 0.0, "step": 1494, "text_loss": 0.4789894223213196 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.0234810683886115, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.053955078125, "learning_rate": 0.0009764755995412677, "loss": 0.0193, "macro_f1": 0.3333333432674408, "num_tokens": 2412758.0, "repeat_count": 0.0, "routers_loss": 0.003944927826523781, "skip_count": 0.0, "step": 1496, "text_loss": 0.5157490968704224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.032873495744056, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009763816864443627, "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2416079.0, "repeat_count": 1.0, "routers_loss": 0.03893325850367546, "skip_count": 0.0, "step": 1498, "text_loss": 0.28045418858528137 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.042265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0009762875907970968, "loss": 0.0199, "macro_f1": 0.3333333432674408, "num_tokens": 2420340.0, "repeat_count": 0.0, "routers_loss": 0.0017725443467497826, "skip_count": 0.0, "step": 1500, "text_loss": 0.35550856590270996 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.051658350454946, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06298828125, "learning_rate": 0.0009761933126355277, "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2424735.0, "repeat_count": 0.0, "routers_loss": 0.01393749937415123, "skip_count": 1.0, "step": 1502, "text_loss": 0.38840189576148987 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009760988519957828, "loss": 0.0249, "macro_f1": 0.6666666865348816, "num_tokens": 2428132.0, "repeat_count": 0.0, "routers_loss": 0.01687910407781601, "skip_count": 2.0, "step": 1504, "text_loss": 0.3031681478023529 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.0704432051658355, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0009760042089140598, "loss": 0.0193, "macro_f1": 0.3144654333591461, "num_tokens": 2431592.0, "repeat_count": 1.0, "routers_loss": 0.04704280197620392, "skip_count": 2.0, "step": 1506, "text_loss": 0.16355200111865997 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0009759093834266259, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2434236.0, "repeat_count": 0.0, "routers_loss": 0.0016075772000476718, "skip_count": 0.0, "step": 1508, "text_loss": 0.6080073118209839 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0009758143755698186, "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 2437170.0, "repeat_count": 0.0, "routers_loss": 0.008451299741864204, "skip_count": 0.0, "step": 1510, "text_loss": 0.22100484371185303 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 7.098620487232169, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.06689453125, "learning_rate": 0.0009757191853800449, "loss": 0.0227, "macro_f1": 0.5866667032241821, "num_tokens": 2441187.0, "repeat_count": 1.0, "routers_loss": 0.046565692871809006, "skip_count": 3.0, "step": 1512, "text_loss": 0.25098952651023865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.108012914587614, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.000975623812893782, "loss": 0.0276, "macro_f1": 0.3272727429866791, "num_tokens": 2444664.0, "repeat_count": 0.0, "routers_loss": 0.02872578240931034, "skip_count": 1.0, "step": 1514, "text_loss": 0.4952253997325897 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.1174053419430585, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1142578125, "learning_rate": 0.0009755282581475768, "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2447748.0, "repeat_count": 0.0, "routers_loss": 0.002055214950814843, "skip_count": 0.0, "step": 1516, "text_loss": 0.7465500831604004 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.126797769298503, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10302734375, "learning_rate": 0.000975432521178046, "loss": 0.0216, "macro_f1": 0.3272727429866791, "num_tokens": 2450834.0, "repeat_count": 1.0, "routers_loss": 0.04498551785945892, "skip_count": 0.0, "step": 1518, "text_loss": 0.28144413232803345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0009753366020218763, "loss": 0.0234, "macro_f1": 0.3333333432674408, "num_tokens": 2454233.0, "repeat_count": 0.0, "routers_loss": 0.003669742727652192, "skip_count": 0.0, "step": 1520, "text_loss": 0.5667551755905151 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009752405007158238, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2457331.0, "repeat_count": 0.0, "routers_loss": 0.010455607436597347, "skip_count": 0.0, "step": 1522, "text_loss": 0.19575810432434082 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 7.154975051364837, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0009751442172967151, "loss": 0.0193, "macro_f1": 0.8823530077934265, "num_tokens": 2459935.0, "repeat_count": 2.0, "routers_loss": 0.025189083069562912, "skip_count": 1.0, "step": 1524, "text_loss": 0.45453405380249023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.164367478720282, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.000975047751801446, "loss": 0.0187, "macro_f1": 0.3272727429866791, "num_tokens": 2463008.0, "repeat_count": 0.0, "routers_loss": 0.012297490611672401, "skip_count": 0.0, "step": 1526, "text_loss": 0.31437572836875916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1044921875, "learning_rate": 0.0009749511042669823, "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2466475.0, "repeat_count": 0.0, "routers_loss": 0.011026266030967236, "skip_count": 0.0, "step": 1528, "text_loss": 0.46604859828948975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.183152333431171, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009748542747303595, "loss": 0.0182, "macro_f1": 0.3272727429866791, "num_tokens": 2469320.0, "repeat_count": 0.0, "routers_loss": 0.011934996582567692, "skip_count": 1.0, "step": 1530, "text_loss": 0.7764923572540283 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.192544760786616, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0966796875, "learning_rate": 0.0009747572632286827, "loss": 0.0203, "macro_f1": 0.3333333432674408, "num_tokens": 2472468.0, "repeat_count": 0.0, "routers_loss": 0.005786920432001352, "skip_count": 0.0, "step": 1532, "text_loss": 0.3555782437324524 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.20193718814206, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009746600697991271, "loss": 0.02, "macro_f1": 0.6666666865348816, "num_tokens": 2475736.0, "repeat_count": 1.0, "routers_loss": 0.0026990731712430716, "skip_count": 0.0, "step": 1534, "text_loss": 0.49561792612075806 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 7.2113296154975055, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0556640625, "learning_rate": 0.0009745626944789375, "loss": 0.0204, "macro_f1": 0.8823530077934265, "num_tokens": 2478887.0, "repeat_count": 1.0, "routers_loss": 0.020221207290887833, "skip_count": 2.0, "step": 1536, "text_loss": 0.5375416278839111 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.22072204285295, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12158203125, "learning_rate": 0.0009744651373054279, "loss": 0.0286, "macro_f1": 0.3272727429866791, "num_tokens": 2481293.0, "repeat_count": 0.0, "routers_loss": 0.03131086751818657, "skip_count": 1.0, "step": 1538, "text_loss": 0.5241039395332336 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 7.230114470208394, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.08984375, "learning_rate": 0.0009743673983159828, "loss": 0.0241, "macro_f1": 0.6122449040412903, "num_tokens": 2484403.0, "repeat_count": 0.0, "routers_loss": 0.04448170214891434, "skip_count": 4.0, "step": 1540, "text_loss": 0.7465724349021912 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08935546875, "learning_rate": 0.0009742694775480557, "loss": 0.0265, "macro_f1": 0.6666666865348816, "num_tokens": 2487952.0, "repeat_count": 0.0, "routers_loss": 0.007171491626650095, "skip_count": 1.0, "step": 1542, "text_loss": 0.2877117097377777 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.248899324919284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0009741713750391703, "loss": 0.0171, "macro_f1": 0.6666666865348816, "num_tokens": 2490815.0, "repeat_count": 1.0, "routers_loss": 0.004559285007417202, "skip_count": 0.0, "step": 1544, "text_loss": 0.6097800135612488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.258291752274729, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.0009740730908269193, "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 2494727.0, "repeat_count": 0.0, "routers_loss": 0.005271553061902523, "skip_count": 0.0, "step": 1546, "text_loss": 0.5431114435195923 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0703125, "learning_rate": 0.0009739746249489658, "loss": 0.0239, "macro_f1": 0.3333333432674408, "num_tokens": 2499266.0, "repeat_count": 0.0, "routers_loss": 0.0015409323386847973, "skip_count": 0.0, "step": 1548, "text_loss": 0.4702678322792053 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.277076606985618, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1171875, "learning_rate": 0.0009738759774430417, "loss": 0.0216, "macro_f1": 0.32098764181137085, "num_tokens": 2502273.0, "repeat_count": 1.0, "routers_loss": 0.030183158814907074, "skip_count": 1.0, "step": 1550, "text_loss": 0.3239189088344574 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.286469034341063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0009737771483469493, "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2507624.0, "repeat_count": 0.0, "routers_loss": 0.005410848651081324, "skip_count": 0.0, "step": 1552, "text_loss": 0.4014642834663391 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009736781376985598, "loss": 0.0168, "macro_f1": 0.6666666865348816, "num_tokens": 2510366.0, "repeat_count": 0.0, "routers_loss": 0.0066976165398955345, "skip_count": 1.0, "step": 1554, "text_loss": 0.5924848914146423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.305253889051952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.13671875, "learning_rate": 0.0009735789455358144, "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2513317.0, "repeat_count": 0.0, "routers_loss": 0.002763477386906743, "skip_count": 0.0, "step": 1556, "text_loss": 0.3222943842411041 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.314646316407397, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11767578125, "learning_rate": 0.0009734795718967237, "loss": 0.0283, "macro_f1": 0.32098764181137085, "num_tokens": 2516628.0, "repeat_count": 0.0, "routers_loss": 0.061566028743982315, "skip_count": 2.0, "step": 1558, "text_loss": 0.3249334692955017 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.324038743762841, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.095703125, "learning_rate": 0.0009733800168193679, "loss": 0.0228, "macro_f1": 1.0, "num_tokens": 2519424.0, "repeat_count": 2.0, "routers_loss": 0.017976421862840652, "skip_count": 4.0, "step": 1560, "text_loss": 0.3341919481754303 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.333431171118286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.0009732802803418966, "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2522922.0, "repeat_count": 0.0, "routers_loss": 0.002525332849472761, "skip_count": 0.0, "step": 1562, "text_loss": 0.3176332712173462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.34282359847373, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07861328125, "learning_rate": 0.0009731803625025292, "loss": 0.0196, "macro_f1": 0.3272727429866791, "num_tokens": 2525811.0, "repeat_count": 0.0, "routers_loss": 0.015524424612522125, "skip_count": 1.0, "step": 1564, "text_loss": 0.532774031162262 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.3522160258291755, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10205078125, "learning_rate": 0.0009730802633395541, "loss": 0.0257, "macro_f1": 0.6603773832321167, "num_tokens": 2529157.0, "repeat_count": 1.0, "routers_loss": 0.08138631284236908, "skip_count": 1.0, "step": 1566, "text_loss": 0.529487133026123 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009729799828913298, "loss": 0.0223, "macro_f1": 0.3333333432674408, "num_tokens": 2532249.0, "repeat_count": 0.0, "routers_loss": 0.0035867292899638414, "skip_count": 0.0, "step": 1568, "text_loss": 0.503160297870636 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.371000880540064, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06884765625, "learning_rate": 0.0009728795211962838, "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2535904.0, "repeat_count": 0.0, "routers_loss": 0.02987455204129219, "skip_count": 2.0, "step": 1570, "text_loss": 0.9170270562171936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.380393307895509, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11865234375, "learning_rate": 0.0009727788782929131, "loss": 0.0273, "macro_f1": 0.3272727429866791, "num_tokens": 2538943.0, "repeat_count": 1.0, "routers_loss": 0.04676021635532379, "skip_count": 0.0, "step": 1572, "text_loss": 0.29146310687065125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.389785735250954, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0009726780542197844, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 2541805.0, "repeat_count": 0.0, "routers_loss": 0.002127803163602948, "skip_count": 0.0, "step": 1574, "text_loss": 1.0126502513885498 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.399178162606399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009725770490155338, "loss": 0.0262, "macro_f1": 0.3333333432674408, "num_tokens": 2546213.0, "repeat_count": 0.0, "routers_loss": 0.007609677035361528, "skip_count": 0.0, "step": 1576, "text_loss": 0.190168559551239 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.408570589961843, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.083984375, "learning_rate": 0.0009724758627188665, "loss": 0.0356, "macro_f1": 0.3272727429866791, "num_tokens": 2549554.0, "repeat_count": 0.0, "routers_loss": 0.033554721623659134, "skip_count": 1.0, "step": 1578, "text_loss": 0.2977406084537506 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.4179630173172875, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.0009723744953685572, "loss": 0.028, "macro_f1": 0.3272727429866791, "num_tokens": 2552785.0, "repeat_count": 1.0, "routers_loss": 0.027864238247275352, "skip_count": 0.0, "step": 1580, "text_loss": 0.2700682580471039 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.427355444672733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19921875, "learning_rate": 0.0009722729470034503, "loss": 0.0224, "macro_f1": 0.3333333432674408, "num_tokens": 2556550.0, "repeat_count": 0.0, "routers_loss": 0.004798175301402807, "skip_count": 0.0, "step": 1582, "text_loss": 0.6559903025627136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.436747872028177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.078125, "learning_rate": 0.0009721712176624591, "loss": 0.0242, "macro_f1": 0.3333333432674408, "num_tokens": 2559862.0, "repeat_count": 0.0, "routers_loss": 0.013764148578047752, "skip_count": 0.0, "step": 1584, "text_loss": 0.2257535308599472 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.446140299383622, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10986328125, "learning_rate": 0.0009720693073845667, "loss": 0.032, "macro_f1": 0.5492662787437439, "num_tokens": 2562766.0, "repeat_count": 0.0, "routers_loss": 0.01937069371342659, "skip_count": 2.0, "step": 1586, "text_loss": 0.178413525223732 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.455532726739067, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.0009719672162088252, "loss": 0.0306, "macro_f1": 0.32098767161369324, "num_tokens": 2566583.0, "repeat_count": 1.0, "routers_loss": 0.06224144622683525, "skip_count": 0.0, "step": 1588, "text_loss": 0.3992367684841156 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 7.464925154094511, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.185546875, "learning_rate": 0.0009718649441743559, "loss": 0.0239, "macro_f1": 0.9449735879898071, "num_tokens": 2569516.0, "repeat_count": 2.0, "routers_loss": 0.06937911361455917, "skip_count": 4.0, "step": 1590, "text_loss": 0.1945122629404068 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.00097176249132035, "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2572418.0, "repeat_count": 0.0, "routers_loss": 0.0034326619934290648, "skip_count": 0.0, "step": 1592, "text_loss": 0.6259906888008118 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.4837100088054, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08642578125, "learning_rate": 0.0009716598576860676, "loss": 0.0278, "macro_f1": 0.6666666865348816, "num_tokens": 2575235.0, "repeat_count": 1.0, "routers_loss": 0.004557516425848007, "skip_count": 0.0, "step": 1594, "text_loss": 0.6638736724853516 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 7.493102436160846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.193359375, "learning_rate": 0.0009715570433108378, "loss": 0.0198, "macro_f1": 1.0, "num_tokens": 2578157.0, "repeat_count": 1.0, "routers_loss": 0.015363055281341076, "skip_count": 1.0, "step": 1596, "text_loss": 0.6530464887619019 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.50249486351629, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0009714540482340595, "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 2581801.0, "repeat_count": 1.0, "routers_loss": 0.01257144846022129, "skip_count": 0.0, "step": 1598, "text_loss": 0.5916110277175903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.5118872908717345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058837890625, "learning_rate": 0.0009713508724952006, "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2585204.0, "repeat_count": 0.0, "routers_loss": 0.003175645601004362, "skip_count": 0.0, "step": 1600, "text_loss": 0.27901601791381836 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12353515625, "learning_rate": 0.0009712475161337981, "loss": 0.0261, "macro_f1": 0.3333333432674408, "num_tokens": 2588286.0, "repeat_count": 0.0, "routers_loss": 0.004122321493923664, "skip_count": 0.0, "step": 1602, "text_loss": 0.42420244216918945 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009711439791894585, "loss": 0.0341, "macro_f1": 0.6666666865348816, "num_tokens": 2591476.0, "repeat_count": 0.0, "routers_loss": 0.011215819045901299, "skip_count": 1.0, "step": 1604, "text_loss": 0.5549933910369873 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.540064572938069, "f1_execute": 0.9599999785423279, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.0703125, "learning_rate": 0.0009710402617018574, "loss": 0.0172, "macro_f1": 0.8200000524520874, "num_tokens": 2594336.0, "repeat_count": 1.0, "routers_loss": 0.02916567400097847, "skip_count": 2.0, "step": 1606, "text_loss": 0.3263779282569885 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.549457000293513, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009709363637107393, "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 2597462.0, "repeat_count": 0.0, "routers_loss": 0.015897957608103752, "skip_count": 1.0, "step": 1608, "text_loss": 0.20917139947414398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009708322852559184, "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2601543.0, "repeat_count": 0.0, "routers_loss": 0.002211357234045863, "skip_count": 0.0, "step": 1610, "text_loss": 0.450550377368927 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.568241855004403, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1728515625, "learning_rate": 0.0009707280263772776, "loss": 0.0277, "macro_f1": 0.6666666865348816, "num_tokens": 2604462.0, "repeat_count": 0.0, "routers_loss": 0.01615734025835991, "skip_count": 2.0, "step": 1612, "text_loss": 0.6908381581306458 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.577634282359847, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0888671875, "learning_rate": 0.0009706235871147688, "loss": 0.0241, "macro_f1": 0.5492662787437439, "num_tokens": 2607484.0, "repeat_count": 0.0, "routers_loss": 0.022048067301511765, "skip_count": 2.0, "step": 1614, "text_loss": 0.36691340804100037 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.587026709715292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10546875, "learning_rate": 0.0009705189675084138, "loss": 0.0176, "macro_f1": 0.6666666865348816, "num_tokens": 2610204.0, "repeat_count": 0.0, "routers_loss": 0.008503952994942665, "skip_count": 1.0, "step": 1616, "text_loss": 0.5226598381996155 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.596419137070737, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009704141675983029, "loss": 0.0248, "macro_f1": 0.3333333432674408, "num_tokens": 2613128.0, "repeat_count": 0.0, "routers_loss": 0.0019020626787096262, "skip_count": 0.0, "step": 1618, "text_loss": 0.6465088725090027 }, { "acc_repeat": 0.0, "acc_skip": 0.5714285969734192, "avg_layers": 24.0, "epoch": 7.6058115644261814, "f1_execute": 0.9333333373069763, "f1_repeat": 0.0, "f1_skip": 0.7272727489471436, "grad_norm": 0.107421875, "learning_rate": 0.0009703091874245956, "loss": 0.032, "macro_f1": 0.5535354018211365, "num_tokens": 2616360.0, "repeat_count": 0.0, "routers_loss": 0.11837691068649292, "skip_count": 7.0, "step": 1620, "text_loss": 0.2987039089202881 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.615203991781626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009702040270275204, "loss": 0.0181, "macro_f1": 0.3333333432674408, "num_tokens": 2619606.0, "repeat_count": 0.0, "routers_loss": 0.0065958453342318535, "skip_count": 0.0, "step": 1622, "text_loss": 0.6262096166610718 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.103515625, "learning_rate": 0.000970098686447375, "loss": 0.0257, "macro_f1": 0.6666666865348816, "num_tokens": 2622499.0, "repeat_count": 0.0, "routers_loss": 0.013632026500999928, "skip_count": 1.0, "step": 1624, "text_loss": 0.2392602562904358 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.633988846492516, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.125, "learning_rate": 0.0009699931657245264, "loss": 0.0245, "macro_f1": 0.5492662787437439, "num_tokens": 2626002.0, "repeat_count": 0.0, "routers_loss": 0.012147823348641396, "skip_count": 2.0, "step": 1626, "text_loss": 0.4742976129055023 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.64338127384796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009698874648994098, "loss": 0.0285, "macro_f1": 1.0, "num_tokens": 2629847.0, "repeat_count": 1.0, "routers_loss": 0.010692884214222431, "skip_count": 3.0, "step": 1628, "text_loss": 0.5090685486793518 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.6527737012034045, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009697815840125304, "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2633529.0, "repeat_count": 0.0, "routers_loss": 0.011442207731306553, "skip_count": 0.0, "step": 1630, "text_loss": 0.1874329298734665 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.66216612855885, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.0009696755231044618, "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2636321.0, "repeat_count": 0.0, "routers_loss": 0.0026681360322982073, "skip_count": 0.0, "step": 1632, "text_loss": 0.7650400400161743 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.671558555914294, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10498046875, "learning_rate": 0.0009695692822158466, "loss": 0.0242, "macro_f1": 0.3272727429866791, "num_tokens": 2638840.0, "repeat_count": 1.0, "routers_loss": 0.033965807408094406, "skip_count": 0.0, "step": 1634, "text_loss": 0.6175784468650818 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0009694628613873968, "loss": 0.018, "macro_f1": 0.3333333432674408, "num_tokens": 2641886.0, "repeat_count": 0.0, "routers_loss": 0.007568214554339647, "skip_count": 0.0, "step": 1636, "text_loss": 0.43139931559562683 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.690343410625183, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.193359375, "learning_rate": 0.0009693562606598929, "loss": 0.025, "macro_f1": 0.3333333432674408, "num_tokens": 2645028.0, "repeat_count": 0.0, "routers_loss": 0.004973865579813719, "skip_count": 0.0, "step": 1638, "text_loss": 0.6430339217185974 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.699735837980628, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06982421875, "learning_rate": 0.0009692494800741844, "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2648209.0, "repeat_count": 1.0, "routers_loss": 0.049863800406455994, "skip_count": 0.0, "step": 1640, "text_loss": 0.28138160705566406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.709128265336073, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08544921875, "learning_rate": 0.0009691425196711901, "loss": 0.0398, "macro_f1": 0.3272727429866791, "num_tokens": 2651171.0, "repeat_count": 0.0, "routers_loss": 0.02112230286002159, "skip_count": 0.0, "step": 1642, "text_loss": 0.3745322525501251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.718520692691517, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0703125, "learning_rate": 0.0009690353794918971, "loss": 0.0275, "macro_f1": 0.3333333432674408, "num_tokens": 2654093.0, "repeat_count": 0.0, "routers_loss": 0.0024304776452481747, "skip_count": 0.0, "step": 1644, "text_loss": 0.4275154173374176 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.727913120046962, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.000968928059577362, "loss": 0.0244, "macro_f1": 0.6666666865348816, "num_tokens": 2657079.0, "repeat_count": 0.0, "routers_loss": 0.009320619516074657, "skip_count": 1.0, "step": 1646, "text_loss": 0.46650025248527527 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.737305547402407, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009688205599687099, "loss": 0.0209, "macro_f1": 0.3272727429866791, "num_tokens": 2660951.0, "repeat_count": 0.0, "routers_loss": 0.011913162656128407, "skip_count": 0.0, "step": 1648, "text_loss": 0.46644100546836853 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.7466979747578515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0009687128807071347, "loss": 0.0284, "macro_f1": 0.3333333432674408, "num_tokens": 2663823.0, "repeat_count": 0.0, "routers_loss": 0.013754756189882755, "skip_count": 0.0, "step": 1650, "text_loss": 0.40808847546577454 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.103515625, "learning_rate": 0.0009686050218338996, "loss": 0.0286, "macro_f1": 0.3333333432674408, "num_tokens": 2667079.0, "repeat_count": 0.0, "routers_loss": 0.009099726565182209, "skip_count": 0.0, "step": 1652, "text_loss": 0.2389989197254181 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.765482829468741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08837890625, "learning_rate": 0.0009684969833903359, "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2670162.0, "repeat_count": 0.0, "routers_loss": 0.0034928603563457727, "skip_count": 1.0, "step": 1654, "text_loss": 0.6930749416351318 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.774875256824186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0009683887654178445, "loss": 0.0261, "macro_f1": 0.6666666865348816, "num_tokens": 2673031.0, "repeat_count": 0.0, "routers_loss": 0.008340462110936642, "skip_count": 1.0, "step": 1656, "text_loss": 0.277752548456192 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06884765625, "learning_rate": 0.0009682803679578947, "loss": 0.0259, "macro_f1": 0.3333333432674408, "num_tokens": 2676092.0, "repeat_count": 0.0, "routers_loss": 0.004337446764111519, "skip_count": 0.0, "step": 1658, "text_loss": 0.5176776051521301 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.7936601115350745, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009681717910520244, "loss": 0.0242, "macro_f1": 0.32098764181137085, "num_tokens": 2679479.0, "repeat_count": 0.0, "routers_loss": 0.034611742943525314, "skip_count": 2.0, "step": 1660, "text_loss": 0.21485982835292816 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.80305253889052, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.07958984375, "learning_rate": 0.0009680630347418406, "loss": 0.022, "macro_f1": 0.5492662787437439, "num_tokens": 2683289.0, "repeat_count": 0.0, "routers_loss": 0.03297121450304985, "skip_count": 2.0, "step": 1662, "text_loss": 0.33801013231277466 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.812444966245964, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.000967954099069019, "loss": 0.0411, "macro_f1": 0.32098764181137085, "num_tokens": 2685879.0, "repeat_count": 1.0, "routers_loss": 0.04551183059811592, "skip_count": 1.0, "step": 1664, "text_loss": 0.41123488545417786 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.821837393601409, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009678449840753038, "loss": 0.0324, "macro_f1": 0.32098764181137085, "num_tokens": 2688910.0, "repeat_count": 0.0, "routers_loss": 0.05866450071334839, "skip_count": 2.0, "step": 1666, "text_loss": 0.1740892380475998 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009677356898025082, "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2691680.0, "repeat_count": 0.0, "routers_loss": 0.009243223816156387, "skip_count": 0.0, "step": 1668, "text_loss": 0.2512350380420685 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.8406222483122985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09619140625, "learning_rate": 0.000967626216292514, "loss": 0.0195, "macro_f1": 0.3333333432674408, "num_tokens": 2694895.0, "repeat_count": 0.0, "routers_loss": 0.005576452240347862, "skip_count": 0.0, "step": 1670, "text_loss": 0.43294376134872437 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 7.850014675667743, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.09130859375, "learning_rate": 0.0009675165635872715, "loss": 0.0306, "macro_f1": 0.44705885648727417, "num_tokens": 2697806.0, "repeat_count": 0.0, "routers_loss": 0.05372785031795502, "skip_count": 3.0, "step": 1672, "text_loss": 0.1614082306623459 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.859407103023187, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009674067317288, "loss": 0.0296, "macro_f1": 0.6666666865348816, "num_tokens": 2700529.0, "repeat_count": 1.0, "routers_loss": 0.018131591379642487, "skip_count": 0.0, "step": 1674, "text_loss": 0.2093173861503601 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.868799530378633, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0009672967207591869, "loss": 0.0257, "macro_f1": 0.3272727429866791, "num_tokens": 2703650.0, "repeat_count": 0.0, "routers_loss": 0.0673515796661377, "skip_count": 1.0, "step": 1676, "text_loss": 0.3029400110244751 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.878191957734077, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009671865307205892, "loss": 0.021, "macro_f1": 0.32098767161369324, "num_tokens": 2707615.0, "repeat_count": 0.0, "routers_loss": 0.03821169584989548, "skip_count": 1.0, "step": 1678, "text_loss": 0.2262786477804184 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 25.0, "epoch": 7.8875843850895215, "f1_execute": 0.9756097793579102, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.1396484375, "learning_rate": 0.0009670761616552315, "loss": 0.0465, "macro_f1": 0.9615669250488281, "num_tokens": 2710894.0, "repeat_count": 2.0, "routers_loss": 0.042625464498996735, "skip_count": 6.0, "step": 1680, "text_loss": 0.29623574018478394 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.896976812444966, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009669656136054074, "loss": 0.0289, "macro_f1": 0.3333333432674408, "num_tokens": 2714330.0, "repeat_count": 0.0, "routers_loss": 0.0037571541033685207, "skip_count": 0.0, "step": 1682, "text_loss": 0.7510389089584351 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.906369239800411, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07421875, "learning_rate": 0.0009668548866134795, "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2717176.0, "repeat_count": 0.0, "routers_loss": 0.004142968449741602, "skip_count": 0.0, "step": 1684, "text_loss": 0.3273485600948334 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07373046875, "learning_rate": 0.0009667439807218783, "loss": 0.0233, "macro_f1": 0.6666666865348816, "num_tokens": 2720628.0, "repeat_count": 0.0, "routers_loss": 0.008753842674195766, "skip_count": 2.0, "step": 1686, "text_loss": 0.4314708709716797 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.9251540945113, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0732421875, "learning_rate": 0.0009666328959731033, "loss": 0.0211, "macro_f1": 0.6603773832321167, "num_tokens": 2723739.0, "repeat_count": 1.0, "routers_loss": 0.022674910724163055, "skip_count": 1.0, "step": 1688, "text_loss": 0.25734150409698486 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 7.934546521866745, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1552734375, "learning_rate": 0.0009665216324097222, "loss": 0.0324, "macro_f1": 0.5934640765190125, "num_tokens": 2726644.0, "repeat_count": 0.0, "routers_loss": 0.03932750225067139, "skip_count": 3.0, "step": 1690, "text_loss": 0.24511034786701202 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.94393894922219, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09765625, "learning_rate": 0.0009664101900743714, "loss": 0.0255, "macro_f1": 0.3272727429866791, "num_tokens": 2729662.0, "repeat_count": 0.0, "routers_loss": 0.012672754004597664, "skip_count": 1.0, "step": 1692, "text_loss": 0.39431414008140564 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.953331376577634, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.076171875, "learning_rate": 0.000966298569009756, "loss": 0.0231, "macro_f1": 0.5492662787437439, "num_tokens": 2732578.0, "repeat_count": 0.0, "routers_loss": 0.01548632513731718, "skip_count": 2.0, "step": 1694, "text_loss": 0.12439999729394913 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.962723803933079, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009661867692586494, "loss": 0.0153, "macro_f1": 0.32098764181137085, "num_tokens": 2735887.0, "repeat_count": 0.0, "routers_loss": 0.05622401833534241, "skip_count": 2.0, "step": 1696, "text_loss": 0.29024389386177063 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.972116231288524, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.0009660747908638933, "loss": 0.0205, "macro_f1": 0.3272727429866791, "num_tokens": 2739293.0, "repeat_count": 0.0, "routers_loss": 0.041060201823711395, "skip_count": 1.0, "step": 1698, "text_loss": 0.39461007714271545 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.9815086586439685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1767578125, "learning_rate": 0.0009659626338683981, "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 2742468.0, "repeat_count": 0.0, "routers_loss": 0.007251353468745947, "skip_count": 0.0, "step": 1700, "text_loss": 0.2751767635345459 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.990901085999413, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009658502983151427, "loss": 0.0186, "macro_f1": 0.3272727429866791, "num_tokens": 2745123.0, "repeat_count": 0.0, "routers_loss": 0.012847424484789371, "skip_count": 1.0, "step": 1702, "text_loss": 0.4756404757499695 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11767578125, "learning_rate": 0.0009657377842471742, "loss": 0.0313, "macro_f1": 0.6666666865348816, "num_tokens": 2748016.0, "repeat_count": 0.0, "routers_loss": 0.007060411386191845, "skip_count": 1.0, "step": 1704, "text_loss": 0.9571210145950317 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.009392427355445, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10009765625, "learning_rate": 0.0009656250917076081, "loss": 0.0188, "macro_f1": 0.5492662787437439, "num_tokens": 2750717.0, "repeat_count": 0.0, "routers_loss": 0.016748681664466858, "skip_count": 2.0, "step": 1706, "text_loss": 0.14542843401432037 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.018784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.0009655122207396285, "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2753635.0, "repeat_count": 0.0, "routers_loss": 0.013607042841613293, "skip_count": 0.0, "step": 1708, "text_loss": 0.21836471557617188 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0009653991713864878, "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2756643.0, "repeat_count": 0.0, "routers_loss": 0.0012097888393327594, "skip_count": 0.0, "step": 1710, "text_loss": 0.635187029838562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1171875, "learning_rate": 0.0009652859436915066, "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2759432.0, "repeat_count": 0.0, "routers_loss": 0.006196760106831789, "skip_count": 0.0, "step": 1712, "text_loss": 0.5629420876502991 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.046962136777223, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0009651725376980743, "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2762538.0, "repeat_count": 0.0, "routers_loss": 0.0042513771913945675, "skip_count": 0.0, "step": 1714, "text_loss": 0.39522525668144226 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 29.0, "epoch": 8.056354564132668, "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.1494140625, "learning_rate": 0.0009650589534496479, "loss": 0.0194, "macro_f1": 0.8194444179534912, "num_tokens": 2765571.0, "repeat_count": 2.0, "routers_loss": 0.03596706688404083, "skip_count": 3.0, "step": 1716, "text_loss": 0.6252416968345642 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04833984375, "learning_rate": 0.0009649451909897532, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 2769206.0, "repeat_count": 0.0, "routers_loss": 0.0025788163766264915, "skip_count": 0.0, "step": 1718, "text_loss": 0.8851634860038757 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.075139418843557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.0009648312503619843, "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2772488.0, "repeat_count": 0.0, "routers_loss": 0.004443451762199402, "skip_count": 0.0, "step": 1720, "text_loss": 0.8568580746650696 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 8.084531846199003, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1552734375, "learning_rate": 0.0009647171316100034, "loss": 0.0265, "macro_f1": 0.9265305995941162, "num_tokens": 2776482.0, "repeat_count": 1.0, "routers_loss": 0.022948263213038445, "skip_count": 3.0, "step": 1722, "text_loss": 0.13431036472320557 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1572265625, "learning_rate": 0.0009646028347775409, "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 2778966.0, "repeat_count": 0.0, "routers_loss": 0.011328035034239292, "skip_count": 1.0, "step": 1724, "text_loss": 0.2085491120815277 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08984375, "learning_rate": 0.0009644883599083958, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2781968.0, "repeat_count": 0.0, "routers_loss": 0.002208018908277154, "skip_count": 0.0, "step": 1726, "text_loss": 0.4948323965072632 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.112709128265337, "f1_execute": 0.9411764740943909, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009643737070464349, "loss": 0.0158, "macro_f1": 0.6470588445663452, "num_tokens": 2784666.0, "repeat_count": 1.0, "routers_loss": 0.04391832649707794, "skip_count": 2.0, "step": 1728, "text_loss": 0.39060094952583313 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046630859375, "learning_rate": 0.0009642588762355935, "loss": 0.0212, "macro_f1": 0.6666666865348816, "num_tokens": 2787558.0, "repeat_count": 0.0, "routers_loss": 0.004497280344367027, "skip_count": 1.0, "step": 1730, "text_loss": 0.34908708930015564 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.131493982976226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0009641438675198748, "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2790474.0, "repeat_count": 0.0, "routers_loss": 0.00583475548774004, "skip_count": 0.0, "step": 1732, "text_loss": 0.5720033049583435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.140886410331671, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08154296875, "learning_rate": 0.0009640286809433508, "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2793272.0, "repeat_count": 0.0, "routers_loss": 0.007826375775039196, "skip_count": 0.0, "step": 1734, "text_loss": 0.32181721925735474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.0009639133165501606, "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2797726.0, "repeat_count": 0.0, "routers_loss": 0.0019055595621466637, "skip_count": 0.0, "step": 1736, "text_loss": 0.620936393737793 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0009637977743845124, "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2800706.0, "repeat_count": 0.0, "routers_loss": 0.0028302327264100313, "skip_count": 0.0, "step": 1738, "text_loss": 0.6473138332366943 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.169063692398003, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009636820544906823, "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 2803847.0, "repeat_count": 1.0, "routers_loss": 0.01105099730193615, "skip_count": 2.0, "step": 1740, "text_loss": 0.4401201903820038 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 8.178456119753449, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1455078125, "learning_rate": 0.0009635661569130141, "loss": 0.0195, "macro_f1": 0.5934640765190125, "num_tokens": 2807235.0, "repeat_count": 0.0, "routers_loss": 0.02619045600295067, "skip_count": 3.0, "step": 1742, "text_loss": 0.459264874458313 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.187848547108894, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0009634500816959202, "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2810396.0, "repeat_count": 0.0, "routers_loss": 0.007915694266557693, "skip_count": 2.0, "step": 1744, "text_loss": 0.5084020495414734 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.197240974464338, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009633338288838805, "loss": 0.0271, "macro_f1": 0.5492662787437439, "num_tokens": 2813215.0, "repeat_count": 2.0, "routers_loss": 0.08364596217870712, "skip_count": 0.0, "step": 1746, "text_loss": 0.27681824564933777 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 30.0, "epoch": 8.206633401819783, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.051025390625, "learning_rate": 0.0009632173985214438, "loss": 0.0156, "macro_f1": 0.8817967176437378, "num_tokens": 2816452.0, "repeat_count": 3.0, "routers_loss": 0.028805451467633247, "skip_count": 2.0, "step": 1748, "text_loss": 0.4678419530391693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.216025829175228, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0625, "learning_rate": 0.000963100790653226, "loss": 0.0188, "macro_f1": 0.3272727429866791, "num_tokens": 2819364.0, "repeat_count": 0.0, "routers_loss": 0.03056817688047886, "skip_count": 1.0, "step": 1750, "text_loss": 0.3078109920024872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.225418256530672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009629840053239116, "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2823469.0, "repeat_count": 0.0, "routers_loss": 0.0019477814203128219, "skip_count": 0.0, "step": 1752, "text_loss": 0.45501336455345154 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057373046875, "learning_rate": 0.000962867042578253, "loss": 0.0173, "macro_f1": 0.3333333432674408, "num_tokens": 2826716.0, "repeat_count": 0.0, "routers_loss": 0.0032963966950774193, "skip_count": 0.0, "step": 1754, "text_loss": 0.49234694242477417 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.244203111241562, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0947265625, "learning_rate": 0.0009627499024610707, "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2829733.0, "repeat_count": 0.0, "routers_loss": 0.010289114899933338, "skip_count": 1.0, "step": 1756, "text_loss": 0.22335539758205414 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.253595538597006, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0888671875, "learning_rate": 0.0009626325850172527, "loss": 0.0174, "macro_f1": 0.3272727429866791, "num_tokens": 2833350.0, "repeat_count": 0.0, "routers_loss": 0.03249066323041916, "skip_count": 1.0, "step": 1758, "text_loss": 0.6581931114196777 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.262987965952451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0703125, "learning_rate": 0.0009625150902917555, "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 2836558.0, "repeat_count": 0.0, "routers_loss": 0.00870000571012497, "skip_count": 0.0, "step": 1760, "text_loss": 0.22938725352287292 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0009623974183296031, "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2840560.0, "repeat_count": 0.0, "routers_loss": 0.007767196744680405, "skip_count": 0.0, "step": 1762, "text_loss": 0.24473799765110016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009622795691758876, "loss": 0.0244, "macro_f1": 0.3333333432674408, "num_tokens": 2843548.0, "repeat_count": 0.0, "routers_loss": 0.0021693643648177385, "skip_count": 0.0, "step": 1764, "text_loss": 0.3084608018398285 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0009621615428757693, "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 2847076.0, "repeat_count": 0.0, "routers_loss": 0.0024727333802729845, "skip_count": 0.0, "step": 1766, "text_loss": 0.5251734852790833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.300557675374229, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.000962043339474476, "loss": 0.0194, "macro_f1": 0.3333333432674408, "num_tokens": 2849751.0, "repeat_count": 0.0, "routers_loss": 0.005174890160560608, "skip_count": 0.0, "step": 1768, "text_loss": 0.4410129189491272 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.309950102729674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06103515625, "learning_rate": 0.0009619249590173032, "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 2853916.0, "repeat_count": 0.0, "routers_loss": 0.006785830482840538, "skip_count": 2.0, "step": 1770, "text_loss": 0.550076425075531 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 8.31934253008512, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.06591796875, "learning_rate": 0.0009618064015496149, "loss": 0.0192, "macro_f1": 0.5934640765190125, "num_tokens": 2857372.0, "repeat_count": 0.0, "routers_loss": 0.021370256319642067, "skip_count": 3.0, "step": 1772, "text_loss": 0.1988629847764969 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.072265625, "learning_rate": 0.0009616876671168423, "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2861028.0, "repeat_count": 0.0, "routers_loss": 0.004313841462135315, "skip_count": 1.0, "step": 1774, "text_loss": 0.42581331729888916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.338127384796008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.0009615687557644847, "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2864847.0, "repeat_count": 0.0, "routers_loss": 0.0025742491707205772, "skip_count": 0.0, "step": 1776, "text_loss": 0.46510905027389526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0009614496675381093, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2867392.0, "repeat_count": 0.0, "routers_loss": 0.0016813480760902166, "skip_count": 0.0, "step": 1778, "text_loss": 0.5922174453735352 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0810546875, "learning_rate": 0.0009613304024833507, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 2871273.0, "repeat_count": 0.0, "routers_loss": 0.004948933608829975, "skip_count": 0.0, "step": 1780, "text_loss": 0.6776977777481079 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.366304666862343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009612109606459117, "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 2874172.0, "repeat_count": 1.0, "routers_loss": 0.016950147226452827, "skip_count": 2.0, "step": 1782, "text_loss": 0.48758944869041443 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.375697094217786, "f1_execute": 0.9599999785423279, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 0.08251953125, "learning_rate": 0.0009610913420715623, "loss": 0.0237, "macro_f1": 0.7644444704055786, "num_tokens": 2877528.0, "repeat_count": 2.0, "routers_loss": 0.04880943149328232, "skip_count": 1.0, "step": 1784, "text_loss": 0.4404778480529785 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.385089521573232, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0009609715468061411, "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2880627.0, "repeat_count": 0.0, "routers_loss": 0.004678630735725164, "skip_count": 0.0, "step": 1786, "text_loss": 0.7295402884483337 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.394481948928677, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0009608515748955535, "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2883333.0, "repeat_count": 0.0, "routers_loss": 0.0026695074047893286, "skip_count": 0.0, "step": 1788, "text_loss": 0.9697831273078918 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 8.40387437628412, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.107421875, "learning_rate": 0.000960731426385773, "loss": 0.0157, "macro_f1": 0.4871794879436493, "num_tokens": 2887444.0, "repeat_count": 0.0, "routers_loss": 0.029743613675236702, "skip_count": 2.0, "step": 1790, "text_loss": 0.4737568199634552 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.0009606111013228407, "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2890221.0, "repeat_count": 0.0, "routers_loss": 0.0016153788892552257, "skip_count": 0.0, "step": 1792, "text_loss": 0.6693558096885681 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.422659230995011, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009604905997528655, "loss": 0.02, "macro_f1": 0.3272727429866791, "num_tokens": 2893262.0, "repeat_count": 0.0, "routers_loss": 0.01965433731675148, "skip_count": 1.0, "step": 1794, "text_loss": 0.45227760076522827 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.432051658350455, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08642578125, "learning_rate": 0.0009603699217220239, "loss": 0.0117, "macro_f1": 0.6601307392120361, "num_tokens": 2896823.0, "repeat_count": 1.0, "routers_loss": 0.024017298594117165, "skip_count": 2.0, "step": 1796, "text_loss": 0.48865509033203125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.0009602490672765597, "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 2899707.0, "repeat_count": 0.0, "routers_loss": 0.0012420224957168102, "skip_count": 0.0, "step": 1798, "text_loss": 0.43292415142059326 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07861328125, "learning_rate": 0.0009601280364627848, "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2902795.0, "repeat_count": 0.0, "routers_loss": 0.0020389219280332327, "skip_count": 0.0, "step": 1800, "text_loss": 0.41021591424942017 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.460228940416789, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009600068293270783, "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 2905769.0, "repeat_count": 0.0, "routers_loss": 0.002006303984671831, "skip_count": 0.0, "step": 1802, "text_loss": 0.46892106533050537 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.000959885445915887, "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2909475.0, "repeat_count": 0.0, "routers_loss": 0.003734810510650277, "skip_count": 0.0, "step": 1804, "text_loss": 0.45364710688591003 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 8.479013795127678, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11669921875, "learning_rate": 0.0009597638862757254, "loss": 0.0182, "macro_f1": 0.8823530077934265, "num_tokens": 2914348.0, "repeat_count": 1.0, "routers_loss": 0.038971323519945145, "skip_count": 2.0, "step": 1806, "text_loss": 0.42913779616355896 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.488406222483123, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.0009596421504531751, "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2917467.0, "repeat_count": 1.0, "routers_loss": 0.04800829663872719, "skip_count": 0.0, "step": 1808, "text_loss": 0.17332297563552856 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.497798649838568, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0009595202384948858, "loss": 0.0227, "macro_f1": 0.6666666865348816, "num_tokens": 2920223.0, "repeat_count": 1.0, "routers_loss": 0.009164143353700638, "skip_count": 0.0, "step": 1810, "text_loss": 0.33740702271461487 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0947265625, "learning_rate": 0.0009593981504475742, "loss": 0.0275, "macro_f1": 0.6666666865348816, "num_tokens": 2923780.0, "repeat_count": 0.0, "routers_loss": 0.011236993595957756, "skip_count": 2.0, "step": 1812, "text_loss": 0.1609916388988495 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.516583504549457, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10595703125, "learning_rate": 0.0009592758863580248, "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2926259.0, "repeat_count": 0.0, "routers_loss": 0.019026532769203186, "skip_count": 2.0, "step": 1814, "text_loss": 0.6460903882980347 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.525975931904902, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009591534462730894, "loss": 0.0206, "macro_f1": 0.5492662787437439, "num_tokens": 2929173.0, "repeat_count": 2.0, "routers_loss": 0.0608333982527256, "skip_count": 0.0, "step": 1816, "text_loss": 0.476126492023468 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.000959030830239687, "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2932703.0, "repeat_count": 0.0, "routers_loss": 0.0093300249427557, "skip_count": 0.0, "step": 1818, "text_loss": 0.5471875667572021 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.544760786615791, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2001953125, "learning_rate": 0.0009589080383048048, "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2936195.0, "repeat_count": 0.0, "routers_loss": 0.010434109717607498, "skip_count": 0.0, "step": 1820, "text_loss": 0.5068115592002869 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0009587850705154964, "loss": 0.0291, "macro_f1": 0.3333333432674408, "num_tokens": 2939412.0, "repeat_count": 0.0, "routers_loss": 0.004347751382738352, "skip_count": 0.0, "step": 1822, "text_loss": 0.4241984784603119 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.56354564132668, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0859375, "learning_rate": 0.0009586619269188836, "loss": 0.0224, "macro_f1": 0.32098767161369324, "num_tokens": 2942318.0, "repeat_count": 0.0, "routers_loss": 0.034238871186971664, "skip_count": 1.0, "step": 1824, "text_loss": 0.2328975349664688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.572938068682125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.0009585386075621553, "loss": 0.027, "macro_f1": 0.3333333432674408, "num_tokens": 2945731.0, "repeat_count": 0.0, "routers_loss": 0.006097695790231228, "skip_count": 0.0, "step": 1826, "text_loss": 0.22816994786262512 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.582330496037569, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0908203125, "learning_rate": 0.0009584151124925676, "loss": 0.0208, "macro_f1": 0.3272727429866791, "num_tokens": 2948944.0, "repeat_count": 0.0, "routers_loss": 0.007790776435285807, "skip_count": 1.0, "step": 1828, "text_loss": 0.5009413361549377 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07275390625, "learning_rate": 0.0009582914417574438, "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 2951723.0, "repeat_count": 0.0, "routers_loss": 0.009144559502601624, "skip_count": 2.0, "step": 1830, "text_loss": 0.1402502954006195 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.60111535074846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0009581675954041751, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 2954726.0, "repeat_count": 1.0, "routers_loss": 0.006593191530555487, "skip_count": 0.0, "step": 1832, "text_loss": 0.4871736466884613 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.610507778103903, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009580435734802196, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2957853.0, "repeat_count": 0.0, "routers_loss": 0.01241068821400404, "skip_count": 0.0, "step": 1834, "text_loss": 0.30100154876708984 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.619900205459349, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0009579193760331027, "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2960783.0, "repeat_count": 0.0, "routers_loss": 0.002219218760728836, "skip_count": 0.0, "step": 1836, "text_loss": 0.4961516559123993 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.629292632814794, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12255859375, "learning_rate": 0.0009577950031104169, "loss": 0.0166, "macro_f1": 0.6601307392120361, "num_tokens": 2963328.0, "repeat_count": 1.0, "routers_loss": 0.029363535344600677, "skip_count": 2.0, "step": 1838, "text_loss": 0.42814353108406067 }, { "acc_repeat": 1.0, "acc_skip": 0.25, "avg_layers": 28.0, "epoch": 8.638685060170237, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.1044921875, "learning_rate": 0.0009576704547598226, "loss": 0.0257, "macro_f1": 0.7795917987823486, "num_tokens": 2966108.0, "repeat_count": 1.0, "routers_loss": 0.0579402856528759, "skip_count": 4.0, "step": 1840, "text_loss": 0.20523512363433838 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.648077487525683, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0625, "learning_rate": 0.0009575457310290463, "loss": 0.0121, "macro_f1": 0.3272727429866791, "num_tokens": 2969137.0, "repeat_count": 0.0, "routers_loss": 0.008810589089989662, "skip_count": 0.0, "step": 1842, "text_loss": 0.6199528574943542 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009574208319658831, "loss": 0.0208, "macro_f1": 0.6666666865348816, "num_tokens": 2972407.0, "repeat_count": 0.0, "routers_loss": 0.0012295129708945751, "skip_count": 1.0, "step": 1844, "text_loss": 0.66938316822052 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 8.666862342236572, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.1474609375, "learning_rate": 0.000957295757618194, "loss": 0.0152, "macro_f1": 0.4871794879436493, "num_tokens": 2976045.0, "repeat_count": 0.0, "routers_loss": 0.06162935495376587, "skip_count": 2.0, "step": 1846, "text_loss": 0.5381782650947571 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009571705080339079, "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 2979025.0, "repeat_count": 0.0, "routers_loss": 0.003950524143874645, "skip_count": 0.0, "step": 1848, "text_loss": 0.5831671357154846 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11376953125, "learning_rate": 0.0009570450832610208, "loss": 0.0209, "macro_f1": 0.3333333432674408, "num_tokens": 2982276.0, "repeat_count": 0.0, "routers_loss": 0.010354886762797832, "skip_count": 0.0, "step": 1850, "text_loss": 0.27448201179504395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.695039624302906, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061279296875, "learning_rate": 0.0009569194833475956, "loss": 0.0199, "macro_f1": 0.3272727429866791, "num_tokens": 2985691.0, "repeat_count": 0.0, "routers_loss": 0.010167439468204975, "skip_count": 0.0, "step": 1852, "text_loss": 0.5264663696289062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.704432051658351, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1328125, "learning_rate": 0.0009567937083417624, "loss": 0.0194, "macro_f1": 0.3272727429866791, "num_tokens": 2989126.0, "repeat_count": 0.0, "routers_loss": 0.0371871180832386, "skip_count": 1.0, "step": 1854, "text_loss": 0.2008018046617508 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.0009566677582917185, "loss": 0.0184, "macro_f1": 0.3333333432674408, "num_tokens": 2992814.0, "repeat_count": 0.0, "routers_loss": 0.010190588422119617, "skip_count": 0.0, "step": 1856, "text_loss": 0.749717116355896 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.72321690636924, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.0009565416332457282, "loss": 0.0132, "macro_f1": 0.6538461446762085, "num_tokens": 2995729.0, "repeat_count": 1.0, "routers_loss": 0.022285036742687225, "skip_count": 1.0, "step": 1858, "text_loss": 0.5870219469070435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.732609333724685, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009564153332521228, "loss": 0.0224, "macro_f1": 0.3272727429866791, "num_tokens": 2998812.0, "repeat_count": 0.0, "routers_loss": 0.011050296947360039, "skip_count": 1.0, "step": 1860, "text_loss": 0.8444408774375916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.742001761080129, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06005859375, "learning_rate": 0.0009562888583593005, "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3001799.0, "repeat_count": 0.0, "routers_loss": 0.007125461008399725, "skip_count": 0.0, "step": 1862, "text_loss": 0.41510361433029175 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.751394188435574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06884765625, "learning_rate": 0.0009561622086157272, "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3005088.0, "repeat_count": 0.0, "routers_loss": 0.0049054501578211784, "skip_count": 0.0, "step": 1864, "text_loss": 0.3801248073577881 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.760786615791018, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.054443359375, "learning_rate": 0.000956035384069935, "loss": 0.0238, "macro_f1": 1.0, "num_tokens": 3008178.0, "repeat_count": 1.0, "routers_loss": 0.005162427201867104, "skip_count": 1.0, "step": 1866, "text_loss": 0.2687684893608093 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.770179043146463, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10400390625, "learning_rate": 0.0009559083847705233, "loss": 0.0214, "macro_f1": 0.3272727429866791, "num_tokens": 3010923.0, "repeat_count": 0.0, "routers_loss": 0.028984658420085907, "skip_count": 1.0, "step": 1868, "text_loss": 0.6277349591255188 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.779571470501908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009557812107661584, "loss": 0.0208, "macro_f1": 1.0, "num_tokens": 3015030.0, "repeat_count": 1.0, "routers_loss": 0.012200530618429184, "skip_count": 1.0, "step": 1870, "text_loss": 0.6293368339538574 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.788963897857352, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11962890625, "learning_rate": 0.0009556538621055739, "loss": 0.0268, "macro_f1": 0.3272727429866791, "num_tokens": 3019067.0, "repeat_count": 0.0, "routers_loss": 0.06365182995796204, "skip_count": 1.0, "step": 1872, "text_loss": 0.39046618342399597 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.115234375, "learning_rate": 0.0009555263388375699, "loss": 0.014, "macro_f1": 0.6666666865348816, "num_tokens": 3022166.0, "repeat_count": 0.0, "routers_loss": 0.0041703456081449986, "skip_count": 1.0, "step": 1874, "text_loss": 0.42232340574264526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.807748752568243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11572265625, "learning_rate": 0.0009553986410110134, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3025865.0, "repeat_count": 0.0, "routers_loss": 0.005841755773872137, "skip_count": 0.0, "step": 1876, "text_loss": 0.37600573897361755 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.817141179923686, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009552707686748388, "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3029950.0, "repeat_count": 0.0, "routers_loss": 0.05165952071547508, "skip_count": 1.0, "step": 1878, "text_loss": 0.33717799186706543 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009551427218780467, "loss": 0.0219, "macro_f1": 0.6666666865348816, "num_tokens": 3033649.0, "repeat_count": 0.0, "routers_loss": 0.020680008456110954, "skip_count": 2.0, "step": 1880, "text_loss": 0.5011783838272095 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.835926034634575, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.0009550145006697048, "loss": 0.0217, "macro_f1": 0.32098764181137085, "num_tokens": 3036847.0, "repeat_count": 0.0, "routers_loss": 0.07626450061798096, "skip_count": 2.0, "step": 1882, "text_loss": 0.3066408336162567 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.84531846199002, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.056396484375, "learning_rate": 0.0009548861050989482, "loss": 0.0136, "macro_f1": 1.0, "num_tokens": 3040353.0, "repeat_count": 1.0, "routers_loss": 0.010884666815400124, "skip_count": 1.0, "step": 1884, "text_loss": 0.49779415130615234 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0908203125, "learning_rate": 0.0009547575352149778, "loss": 0.0213, "macro_f1": 0.6666666865348816, "num_tokens": 3043504.0, "repeat_count": 0.0, "routers_loss": 0.006704333238303661, "skip_count": 2.0, "step": 1886, "text_loss": 0.12284614145755768 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.86410331670091, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11474609375, "learning_rate": 0.0009546287910670621, "loss": 0.0211, "macro_f1": 0.5427350401878357, "num_tokens": 3046422.0, "repeat_count": 1.0, "routers_loss": 0.04799000173807144, "skip_count": 2.0, "step": 1888, "text_loss": 0.1824081838130951 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0009544998727045361, "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 3049819.0, "repeat_count": 0.0, "routers_loss": 0.008139612153172493, "skip_count": 0.0, "step": 1890, "text_loss": 0.18929053843021393 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 8.8828881714118, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.09375, "learning_rate": 0.0009543707801768015, "loss": 0.0175, "macro_f1": 0.5934640765190125, "num_tokens": 3052766.0, "repeat_count": 0.0, "routers_loss": 0.02966771461069584, "skip_count": 3.0, "step": 1892, "text_loss": 0.247748002409935 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 8.892280598767243, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.06689453125, "learning_rate": 0.0009542415135333267, "loss": 0.0193, "macro_f1": 0.44705885648727417, "num_tokens": 3056427.0, "repeat_count": 0.0, "routers_loss": 0.03637036308646202, "skip_count": 2.0, "step": 1894, "text_loss": 0.2583999037742615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.901673026122689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0595703125, "learning_rate": 0.0009541120728236472, "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3059497.0, "repeat_count": 0.0, "routers_loss": 0.007026574574410915, "skip_count": 0.0, "step": 1896, "text_loss": 0.5222375988960266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.076171875, "learning_rate": 0.0009539824580973646, "loss": 0.0219, "macro_f1": 0.3333333432674408, "num_tokens": 3062187.0, "repeat_count": 0.0, "routers_loss": 0.003449335927143693, "skip_count": 0.0, "step": 1898, "text_loss": 0.5736427307128906 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0009538526694041477, "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3066100.0, "repeat_count": 0.0, "routers_loss": 0.0035463871899992228, "skip_count": 0.0, "step": 1900, "text_loss": 0.5471583604812622 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.929850308189023, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.080078125, "learning_rate": 0.0009537227067937318, "loss": 0.0233, "macro_f1": 1.0, "num_tokens": 3068737.0, "repeat_count": 3.0, "routers_loss": 0.00597514258697629, "skip_count": 3.0, "step": 1902, "text_loss": 0.36644190549850464 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.939242735544468, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.166015625, "learning_rate": 0.0009535925703159186, "loss": 0.0301, "macro_f1": 0.32098764181137085, "num_tokens": 3071686.0, "repeat_count": 0.0, "routers_loss": 0.025420479476451874, "skip_count": 2.0, "step": 1904, "text_loss": 0.535789966583252 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.948635162899912, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009534622600205769, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3074954.0, "repeat_count": 0.0, "routers_loss": 0.014377486892044544, "skip_count": 0.0, "step": 1906, "text_loss": 0.19009549915790558 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.958027590255357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11083984375, "learning_rate": 0.0009533317759576416, "loss": 0.0197, "macro_f1": 0.3333333432674408, "num_tokens": 3077540.0, "repeat_count": 0.0, "routers_loss": 0.004848944488912821, "skip_count": 0.0, "step": 1908, "text_loss": 0.5022001266479492 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009532011181771148, "loss": 0.0217, "macro_f1": 0.6666666865348816, "num_tokens": 3080445.0, "repeat_count": 0.0, "routers_loss": 0.009480170905590057, "skip_count": 2.0, "step": 1910, "text_loss": 0.35135936737060547 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10400390625, "learning_rate": 0.0009530702867290644, "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 3083657.0, "repeat_count": 0.0, "routers_loss": 0.0019353039097040892, "skip_count": 0.0, "step": 1912, "text_loss": 0.5123994946479797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.986204872321691, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0009529392816636256, "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 3086837.0, "repeat_count": 0.0, "routers_loss": 0.0010921972570940852, "skip_count": 0.0, "step": 1914, "text_loss": 0.44477662444114685 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.995597299677135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19140625, "learning_rate": 0.0009528081030309995, "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 3089892.0, "repeat_count": 0.0, "routers_loss": 0.0018027103506028652, "skip_count": 0.0, "step": 1916, "text_loss": 0.7356183528900146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.004696213677722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009526767508814542, "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3093058.0, "repeat_count": 0.0, "routers_loss": 0.003243023296818137, "skip_count": 0.0, "step": 1918, "text_loss": 0.48823556303977966 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.0009525452252653239, "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 3096404.0, "repeat_count": 0.0, "routers_loss": 0.009360014460980892, "skip_count": 0.0, "step": 1920, "text_loss": 0.21498437225818634 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 9.023481068388612, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.140625, "learning_rate": 0.0009524135262330098, "loss": 0.0224, "macro_f1": 0.9265305995941162, "num_tokens": 3099520.0, "repeat_count": 1.0, "routers_loss": 0.017444295808672905, "skip_count": 3.0, "step": 1922, "text_loss": 0.27608850598335266 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.032873495744056, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.050537109375, "learning_rate": 0.0009522816538349789, "loss": 0.0162, "macro_f1": 0.5492662787437439, "num_tokens": 3102956.0, "repeat_count": 0.0, "routers_loss": 0.06424452364444733, "skip_count": 2.0, "step": 1924, "text_loss": 0.21558666229248047 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.042265923099501, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0009521496081217651, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3106565.0, "repeat_count": 1.0, "routers_loss": 0.002270506462082267, "skip_count": 0.0, "step": 1926, "text_loss": 0.5641813278198242 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.051658350454945, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.095703125, "learning_rate": 0.0009520173891439684, "loss": 0.0216, "macro_f1": 0.6666666865348816, "num_tokens": 3109314.0, "repeat_count": 0.0, "routers_loss": 0.011512448079884052, "skip_count": 1.0, "step": 1928, "text_loss": 0.6351624727249146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009518849969522556, "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 3112956.0, "repeat_count": 0.0, "routers_loss": 0.003883908037096262, "skip_count": 0.0, "step": 1930, "text_loss": 0.35160085558891296 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.070443205165835, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0009517524315973595, "loss": 0.019, "macro_f1": 1.0, "num_tokens": 3115593.0, "repeat_count": 1.0, "routers_loss": 0.009479222819209099, "skip_count": 3.0, "step": 1932, "text_loss": 0.2900560200214386 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.079835632521279, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.0009516196931300794, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 3118516.0, "repeat_count": 0.0, "routers_loss": 0.017834696918725967, "skip_count": 2.0, "step": 1934, "text_loss": 0.20094378292560577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009514867816012809, "loss": 0.02, "macro_f1": 0.3333333432674408, "num_tokens": 3122242.0, "repeat_count": 0.0, "routers_loss": 0.0017964740982279181, "skip_count": 0.0, "step": 1936, "text_loss": 0.6498590707778931 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.0009513536970618961, "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3125645.0, "repeat_count": 0.0, "routers_loss": 0.007437168620526791, "skip_count": 2.0, "step": 1938, "text_loss": 0.25863033533096313 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.108012914587613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0625, "learning_rate": 0.0009512204395629232, "loss": 0.0184, "macro_f1": 0.6666666865348816, "num_tokens": 3128740.0, "repeat_count": 0.0, "routers_loss": 0.0008759932243265212, "skip_count": 1.0, "step": 1940, "text_loss": 0.5638351440429688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.117405341943059, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06884765625, "learning_rate": 0.0009510870091554264, "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3131742.0, "repeat_count": 1.0, "routers_loss": 0.019906625151634216, "skip_count": 0.0, "step": 1942, "text_loss": 0.8410717844963074 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.126797769298504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.0009509534058905369, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3134407.0, "repeat_count": 0.0, "routers_loss": 0.0009229081333614886, "skip_count": 0.0, "step": 1944, "text_loss": 0.47506049275398254 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0009508196298194517, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3137053.0, "repeat_count": 0.0, "routers_loss": 0.003630586201325059, "skip_count": 0.0, "step": 1946, "text_loss": 0.32225799560546875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009506856809934338, "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 3140943.0, "repeat_count": 0.0, "routers_loss": 0.007580445148050785, "skip_count": 0.0, "step": 1948, "text_loss": 0.3120577931404114 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0009505515594638127, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3144298.0, "repeat_count": 0.0, "routers_loss": 0.004471861757338047, "skip_count": 0.0, "step": 1950, "text_loss": 0.22052447497844696 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 9.164367478720282, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09130859375, "learning_rate": 0.0009504172652819843, "loss": 0.023, "macro_f1": 1.0, "num_tokens": 3147069.0, "repeat_count": 1.0, "routers_loss": 0.009606664068996906, "skip_count": 1.0, "step": 1952, "text_loss": 0.34773921966552734 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0625, "learning_rate": 0.0009502827984994099, "loss": 0.0148, "macro_f1": 0.6666666865348816, "num_tokens": 3149992.0, "repeat_count": 0.0, "routers_loss": 0.006443799939006567, "skip_count": 1.0, "step": 1954, "text_loss": 0.6442171335220337 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.0009501481591676177, "loss": 0.0188, "macro_f1": 0.3333333432674408, "num_tokens": 3153167.0, "repeat_count": 0.0, "routers_loss": 0.003219039412215352, "skip_count": 0.0, "step": 1956, "text_loss": 0.43369221687316895 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.192544760786616, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.000950013347338202, "loss": 0.0152, "macro_f1": 0.3272727429866791, "num_tokens": 3156590.0, "repeat_count": 0.0, "routers_loss": 0.025551019236445427, "skip_count": 1.0, "step": 1958, "text_loss": 0.294479101896286 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.201937188142061, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009498783630628225, "loss": 0.0158, "macro_f1": 1.0, "num_tokens": 3159451.0, "repeat_count": 1.0, "routers_loss": 0.013802438974380493, "skip_count": 2.0, "step": 1960, "text_loss": 0.20888492465019226 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.211329615497505, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009497432063932057, "loss": 0.0137, "macro_f1": 0.6601307392120361, "num_tokens": 3162889.0, "repeat_count": 1.0, "routers_loss": 0.02852988988161087, "skip_count": 2.0, "step": 1962, "text_loss": 0.5027125477790833 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045166015625, "learning_rate": 0.0009496078773811437, "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 3165979.0, "repeat_count": 0.0, "routers_loss": 0.01784522272646427, "skip_count": 2.0, "step": 1964, "text_loss": 0.1696339100599289 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.000949472376078495, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3168683.0, "repeat_count": 0.0, "routers_loss": 0.0017019887454807758, "skip_count": 0.0, "step": 1966, "text_loss": 0.48905447125434875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.051025390625, "learning_rate": 0.000949336702537184, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 3171968.0, "repeat_count": 0.0, "routers_loss": 0.004817947279661894, "skip_count": 2.0, "step": 1968, "text_loss": 0.20984773337841034 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.248899324919284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.0009492008568092007, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3175947.0, "repeat_count": 0.0, "routers_loss": 0.0012963006738573313, "skip_count": 0.0, "step": 1970, "text_loss": 0.5215106010437012 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 9.258291752274728, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.044921875, "learning_rate": 0.0009490648389466019, "loss": 0.0135, "macro_f1": 0.4871794879436493, "num_tokens": 3179348.0, "repeat_count": 0.0, "routers_loss": 0.03950481489300728, "skip_count": 2.0, "step": 1972, "text_loss": 0.24640929698944092 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09326171875, "learning_rate": 0.0009489286490015097, "loss": 0.0183, "macro_f1": 0.6666666865348816, "num_tokens": 3182640.0, "repeat_count": 0.0, "routers_loss": 0.0043345349840819836, "skip_count": 2.0, "step": 1974, "text_loss": 0.6362852454185486 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.277076606985618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0009487922870261122, "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3185657.0, "repeat_count": 0.0, "routers_loss": 0.0015687479171901941, "skip_count": 0.0, "step": 1976, "text_loss": 0.8977144360542297 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.286469034341062, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061279296875, "learning_rate": 0.0009486557530726638, "loss": 0.0139, "macro_f1": 0.3333333432674408, "num_tokens": 3188772.0, "repeat_count": 0.0, "routers_loss": 0.0010977238416671753, "skip_count": 0.0, "step": 1978, "text_loss": 0.38512736558914185 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 9.295861461696507, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.0009485190471934844, "loss": 0.0196, "macro_f1": 0.6666666865348816, "num_tokens": 3193131.0, "repeat_count": 2.0, "routers_loss": 0.002264744369313121, "skip_count": 0.0, "step": 1980, "text_loss": 0.4171289801597595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.305253889051952, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09033203125, "learning_rate": 0.00094838216944096, "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3196668.0, "repeat_count": 0.0, "routers_loss": 0.042320676147937775, "skip_count": 1.0, "step": 1982, "text_loss": 0.19008000195026398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 9.314646316407396, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0009482451198675424, "loss": 0.0151, "macro_f1": 0.32098767161369324, "num_tokens": 3200282.0, "repeat_count": 0.0, "routers_loss": 0.01796630397439003, "skip_count": 1.0, "step": 1984, "text_loss": 0.5009249448776245 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.324038743762841, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061767578125, "learning_rate": 0.0009481078985257494, "loss": 0.0147, "macro_f1": 0.6666666865348816, "num_tokens": 3204439.0, "repeat_count": 0.0, "routers_loss": 0.01052347756922245, "skip_count": 1.0, "step": 1986, "text_loss": 0.15319275856018066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.333431171118287, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0009479705054681644, "loss": 0.015, "macro_f1": 0.3076923191547394, "num_tokens": 3207590.0, "repeat_count": 1.0, "routers_loss": 0.09640293568372726, "skip_count": 3.0, "step": 1988, "text_loss": 0.3654652535915375 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.34282359847373, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06689453125, "learning_rate": 0.0009478329407474366, "loss": 0.0183, "macro_f1": 0.5492662787437439, "num_tokens": 3211172.0, "repeat_count": 0.0, "routers_loss": 0.012670112773776054, "skip_count": 1.0, "step": 1990, "text_loss": 0.5817596316337585 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.352216025829176, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05859375, "learning_rate": 0.000947695204416281, "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 3214050.0, "repeat_count": 1.0, "routers_loss": 0.005263707600533962, "skip_count": 0.0, "step": 1992, "text_loss": 0.5985888242721558 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.361608453184619, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009475572965274787, "loss": 0.0144, "macro_f1": 0.3272727429866791, "num_tokens": 3217318.0, "repeat_count": 1.0, "routers_loss": 0.0682850033044815, "skip_count": 0.0, "step": 1994, "text_loss": 0.316506564617157 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.371000880540064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0595703125, "learning_rate": 0.000947419217133876, "loss": 0.019, "macro_f1": 0.6666666865348816, "num_tokens": 3220012.0, "repeat_count": 0.0, "routers_loss": 0.008508823812007904, "skip_count": 2.0, "step": 1996, "text_loss": 0.09665893763303757 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.38039330789551, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.053466796875, "learning_rate": 0.0009472809662883852, "loss": 0.0155, "macro_f1": 1.0, "num_tokens": 3223019.0, "repeat_count": 1.0, "routers_loss": 0.01100847590714693, "skip_count": 2.0, "step": 1998, "text_loss": 0.4938808083534241 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.389785735250953, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0009471425440439844, "loss": 0.0135, "macro_f1": 0.8817967176437378, "num_tokens": 3226013.0, "repeat_count": 2.0, "routers_loss": 0.04953207075595856, "skip_count": 3.0, "step": 2000, "text_loss": 0.22258254885673523 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 9.399178162606399, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009470039504537173, "loss": 0.0186, "macro_f1": 0.31446540355682373, "num_tokens": 3230031.0, "repeat_count": 0.0, "routers_loss": 0.052884332835674286, "skip_count": 2.0, "step": 2002, "text_loss": 0.1741616576910019 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.408570589961844, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009468651855706931, "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 3232991.0, "repeat_count": 1.0, "routers_loss": 0.008056716993451118, "skip_count": 0.0, "step": 2004, "text_loss": 0.3173636198043823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0009467262494480868, "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3236390.0, "repeat_count": 0.0, "routers_loss": 0.0053409393876791, "skip_count": 0.0, "step": 2006, "text_loss": 0.5806330442428589 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.427355444672733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.000946587142139139, "loss": 0.0147, "macro_f1": 0.3333333432674408, "num_tokens": 3239267.0, "repeat_count": 0.0, "routers_loss": 0.0015652200672775507, "skip_count": 0.0, "step": 2008, "text_loss": 0.6214317679405212 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.436747872028178, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11376953125, "learning_rate": 0.000946447863697156, "loss": 0.0151, "macro_f1": 0.6601307392120361, "num_tokens": 3242569.0, "repeat_count": 1.0, "routers_loss": 0.011673987843096256, "skip_count": 2.0, "step": 2010, "text_loss": 0.532565712928772 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.446140299383622, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0009463084141755093, "loss": 0.0159, "macro_f1": 0.3272727429866791, "num_tokens": 3245669.0, "repeat_count": 0.0, "routers_loss": 0.028480790555477142, "skip_count": 1.0, "step": 2012, "text_loss": 0.25210800766944885 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.455532726739067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009461687936276364, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3248751.0, "repeat_count": 0.0, "routers_loss": 0.007234727032482624, "skip_count": 0.0, "step": 2014, "text_loss": 0.35922971367836 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.46492515409451, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.0009460290021070402, "loss": 0.0195, "macro_f1": 0.6666666865348816, "num_tokens": 3252614.0, "repeat_count": 1.0, "routers_loss": 0.014691276475787163, "skip_count": 0.0, "step": 2016, "text_loss": 0.2747853398323059 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051513671875, "learning_rate": 0.0009458890396672888, "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3256374.0, "repeat_count": 0.0, "routers_loss": 0.002385235857218504, "skip_count": 0.0, "step": 2018, "text_loss": 0.5268719792366028 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 9.483710008805401, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.04443359375, "learning_rate": 0.0009457489063620164, "loss": 0.0133, "macro_f1": 0.8823530077934265, "num_tokens": 3259792.0, "repeat_count": 1.0, "routers_loss": 0.047268565744161606, "skip_count": 2.0, "step": 2020, "text_loss": 0.7785539627075195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.493102436160845, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0009456086022449221, "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 3262833.0, "repeat_count": 0.0, "routers_loss": 0.015878718346357346, "skip_count": 1.0, "step": 2022, "text_loss": 0.42270028591156006 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.50249486351629, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08935546875, "learning_rate": 0.0009454681273697711, "loss": 0.0117, "macro_f1": 0.3272727429866791, "num_tokens": 3265718.0, "repeat_count": 1.0, "routers_loss": 0.030749641358852386, "skip_count": 0.0, "step": 2024, "text_loss": 0.18668225407600403 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.511887290871735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0009453274817903931, "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 3268158.0, "repeat_count": 0.0, "routers_loss": 0.011538166552782059, "skip_count": 1.0, "step": 2026, "text_loss": 0.34090787172317505 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.099609375, "learning_rate": 0.000945186665560684, "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 3271082.0, "repeat_count": 0.0, "routers_loss": 0.009527760557830334, "skip_count": 0.0, "step": 2028, "text_loss": 0.2110334187746048 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.530672145582624, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.119140625, "learning_rate": 0.000945045678734605, "loss": 0.0175, "macro_f1": 0.3144654333591461, "num_tokens": 3273488.0, "repeat_count": 0.0, "routers_loss": 0.03317151218652725, "skip_count": 3.0, "step": 2030, "text_loss": 0.2233227640390396 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.540064572938068, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12451171875, "learning_rate": 0.0009449045213661822, "loss": 0.0201, "macro_f1": 0.3272727429866791, "num_tokens": 3276646.0, "repeat_count": 0.0, "routers_loss": 0.018510591238737106, "skip_count": 1.0, "step": 2032, "text_loss": 0.16100332140922546 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 9.549457000293513, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.1318359375, "learning_rate": 0.0009447631935095077, "loss": 0.0185, "macro_f1": 0.9452888369560242, "num_tokens": 3279441.0, "repeat_count": 1.0, "routers_loss": 0.028113311156630516, "skip_count": 4.0, "step": 2034, "text_loss": 0.29208317399024963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.0009446216952187384, "loss": 0.0164, "macro_f1": 0.3333333432674408, "num_tokens": 3282697.0, "repeat_count": 0.0, "routers_loss": 0.008379172533750534, "skip_count": 0.0, "step": 2036, "text_loss": 0.16026398539543152 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06298828125, "learning_rate": 0.0009444800265480967, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3285574.0, "repeat_count": 0.0, "routers_loss": 0.00941354501992464, "skip_count": 0.0, "step": 2038, "text_loss": 0.29523080587387085 }, { "acc_repeat": 0.75, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 9.577634282359847, "f1_execute": 0.9230769276618958, "f1_repeat": 0.8571428656578064, "f1_skip": 0.800000011920929, "grad_norm": 0.076171875, "learning_rate": 0.0009443381875518703, "loss": 0.0197, "macro_f1": 0.8600732684135437, "num_tokens": 3289159.0, "repeat_count": 4.0, "routers_loss": 0.04974055662751198, "skip_count": 6.0, "step": 2040, "text_loss": 0.23033179342746735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.587026709715293, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0537109375, "learning_rate": 0.0009441961782844123, "loss": 0.0146, "macro_f1": 0.3272727429866791, "num_tokens": 3293598.0, "repeat_count": 0.0, "routers_loss": 0.022241825237870216, "skip_count": 1.0, "step": 2042, "text_loss": 0.8299165368080139 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.0009440539988001408, "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3296648.0, "repeat_count": 0.0, "routers_loss": 0.011019332334399223, "skip_count": 0.0, "step": 2044, "text_loss": 0.18207129836082458 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.605811564426181, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0009439116491535394, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3300058.0, "repeat_count": 0.0, "routers_loss": 0.002889640862122178, "skip_count": 0.0, "step": 2046, "text_loss": 0.7051978707313538 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 9.615203991781627, "f1_execute": 0.9333333373069763, "f1_repeat": 0.5, "f1_skip": 0.8571428656578064, "grad_norm": 0.078125, "learning_rate": 0.0009437691293991563, "loss": 0.0192, "macro_f1": 0.7634921073913574, "num_tokens": 3303296.0, "repeat_count": 3.0, "routers_loss": 0.07741832733154297, "skip_count": 4.0, "step": 2048, "text_loss": 0.15563532710075378 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09521484375, "learning_rate": 0.0009436264395916061, "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 3306204.0, "repeat_count": 0.0, "routers_loss": 0.014225383289158344, "skip_count": 2.0, "step": 2050, "text_loss": 0.18117287755012512 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.633988846492516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009434835797855672, "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 3309444.0, "repeat_count": 0.0, "routers_loss": 0.0023932650219649076, "skip_count": 0.0, "step": 2052, "text_loss": 0.4645874798297882 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.643381273847961, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0009433405500357839, "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3312488.0, "repeat_count": 0.0, "routers_loss": 0.03193361684679985, "skip_count": 1.0, "step": 2054, "text_loss": 0.5291082859039307 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0009431973503970655, "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3315765.0, "repeat_count": 0.0, "routers_loss": 0.0020529816392809153, "skip_count": 0.0, "step": 2056, "text_loss": 0.5877931118011475 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.66216612855885, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0009430539809242864, "loss": 0.0185, "macro_f1": 0.32098764181137085, "num_tokens": 3318877.0, "repeat_count": 2.0, "routers_loss": 0.07907948642969131, "skip_count": 0.0, "step": 2058, "text_loss": 0.3836737871170044 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 9.671558555914293, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009429104416723862, "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 3322576.0, "repeat_count": 2.0, "routers_loss": 0.003006070153787732, "skip_count": 0.0, "step": 2060, "text_loss": 0.3480920195579529 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045166015625, "learning_rate": 0.0009427667326963689, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3325974.0, "repeat_count": 0.0, "routers_loss": 0.005013179033994675, "skip_count": 0.0, "step": 2062, "text_loss": 0.931358814239502 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.690343410625184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0009426228540513047, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 3329398.0, "repeat_count": 0.0, "routers_loss": 0.0059848143719136715, "skip_count": 0.0, "step": 2064, "text_loss": 0.47568953037261963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.699735837980628, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009424788057923277, "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3332029.0, "repeat_count": 0.0, "routers_loss": 0.00783882662653923, "skip_count": 0.0, "step": 2066, "text_loss": 0.22887596487998962 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.709128265336073, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0712890625, "learning_rate": 0.0009423345879746376, "loss": 0.0128, "macro_f1": 0.5492662787437439, "num_tokens": 3334858.0, "repeat_count": 0.0, "routers_loss": 0.01866884157061577, "skip_count": 2.0, "step": 2068, "text_loss": 0.17724967002868652 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.718520692691518, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06591796875, "learning_rate": 0.000942190200653499, "loss": 0.0162, "macro_f1": 0.32098764181137085, "num_tokens": 3338094.0, "repeat_count": 0.0, "routers_loss": 0.028636593371629715, "skip_count": 2.0, "step": 2070, "text_loss": 0.34344956278800964 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.727913120046962, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.07568359375, "learning_rate": 0.0009420456438842413, "loss": 0.0165, "macro_f1": 0.5492662787437439, "num_tokens": 3340526.0, "repeat_count": 0.0, "routers_loss": 0.023245645686984062, "skip_count": 2.0, "step": 2072, "text_loss": 0.7276164293289185 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.737305547402407, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11328125, "learning_rate": 0.000941900917722259, "loss": 0.0143, "macro_f1": 0.3272727429866791, "num_tokens": 3343303.0, "repeat_count": 1.0, "routers_loss": 0.01565689593553543, "skip_count": 0.0, "step": 2074, "text_loss": 0.5665070414543152 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1201171875, "learning_rate": 0.0009417560222230115, "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 3346409.0, "repeat_count": 0.0, "routers_loss": 0.0035056080669164658, "skip_count": 0.0, "step": 2076, "text_loss": 0.5112795233726501 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06982421875, "learning_rate": 0.0009416109574420229, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3349220.0, "repeat_count": 0.0, "routers_loss": 0.0027565446216613054, "skip_count": 0.0, "step": 2078, "text_loss": 0.5240910053253174 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 9.765482829468741, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08203125, "learning_rate": 0.0009414657234348823, "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 3352627.0, "repeat_count": 3.0, "routers_loss": 0.01652451977133751, "skip_count": 2.0, "step": 2080, "text_loss": 1.0217112302780151 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.774875256824185, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009413203202572438, "loss": 0.0179, "macro_f1": 0.32098764181137085, "num_tokens": 3355392.0, "repeat_count": 0.0, "routers_loss": 0.1012420505285263, "skip_count": 2.0, "step": 2082, "text_loss": 0.4085482358932495 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08251953125, "learning_rate": 0.000941174747964826, "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3358425.0, "repeat_count": 0.0, "routers_loss": 0.004962718114256859, "skip_count": 0.0, "step": 2084, "text_loss": 0.5833504796028137 }, { "acc_repeat": 0.5, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 9.793660111535075, "f1_execute": 0.9583333134651184, "f1_repeat": 0.6666666865348816, "f1_skip": 0.800000011920929, "grad_norm": 0.11376953125, "learning_rate": 0.0009410290066134124, "loss": 0.0211, "macro_f1": 0.8083333373069763, "num_tokens": 3361925.0, "repeat_count": 2.0, "routers_loss": 0.07889176905155182, "skip_count": 3.0, "step": 2086, "text_loss": 0.38126569986343384 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.803052538890519, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.051513671875, "learning_rate": 0.0009408830962588517, "loss": 0.0195, "macro_f1": 0.6601307392120361, "num_tokens": 3365963.0, "repeat_count": 1.0, "routers_loss": 0.033715736120939255, "skip_count": 2.0, "step": 2088, "text_loss": 0.23213914036750793 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.812444966245964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0009407370169570567, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3369422.0, "repeat_count": 0.0, "routers_loss": 0.0014188943896442652, "skip_count": 0.0, "step": 2090, "text_loss": 0.4648318886756897 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.82183739360141, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0009405907687640054, "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 3372506.0, "repeat_count": 0.0, "routers_loss": 0.015339684672653675, "skip_count": 1.0, "step": 2092, "text_loss": 0.2563800811767578 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 9.831229820956853, "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.054443359375, "learning_rate": 0.0009404443517357404, "loss": 0.0146, "macro_f1": 0.542222261428833, "num_tokens": 3375653.0, "repeat_count": 4.0, "routers_loss": 0.06562861055135727, "skip_count": 0.0, "step": 2094, "text_loss": 0.797835111618042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.840622248312298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.000940297765928369, "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3379018.0, "repeat_count": 0.0, "routers_loss": 0.005745889153331518, "skip_count": 0.0, "step": 2096, "text_loss": 0.4238114655017853 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0009401510113980631, "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 3382855.0, "repeat_count": 0.0, "routers_loss": 0.0026634482201188803, "skip_count": 0.0, "step": 2098, "text_loss": 0.4967166483402252 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009400040882010592, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 3386386.0, "repeat_count": 0.0, "routers_loss": 0.0020642587915062904, "skip_count": 0.0, "step": 2100, "text_loss": 0.44390562176704407 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.868799530378633, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056640625, "learning_rate": 0.0009398569963936589, "loss": 0.017, "macro_f1": 0.3272727429866791, "num_tokens": 3389958.0, "repeat_count": 0.0, "routers_loss": 0.013722737319767475, "skip_count": 1.0, "step": 2102, "text_loss": 0.7207565903663635 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.878191957734076, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.0009397097360322276, "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3392892.0, "repeat_count": 0.0, "routers_loss": 0.002051608171314001, "skip_count": 0.0, "step": 2104, "text_loss": 0.3196398913860321 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.887584385089522, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.000939562307173196, "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 3396636.0, "repeat_count": 0.0, "routers_loss": 0.007085663266479969, "skip_count": 0.0, "step": 2106, "text_loss": 0.5663776397705078 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.896976812444967, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11328125, "learning_rate": 0.0009394147098730592, "loss": 0.02, "macro_f1": 0.5492662787437439, "num_tokens": 3399475.0, "repeat_count": 0.0, "routers_loss": 0.019473131746053696, "skip_count": 2.0, "step": 2108, "text_loss": 0.7708223462104797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0009392669441883767, "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3402350.0, "repeat_count": 0.0, "routers_loss": 0.0028328890912234783, "skip_count": 0.0, "step": 2110, "text_loss": 0.5888006091117859 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10693359375, "learning_rate": 0.0009391190101757724, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3405561.0, "repeat_count": 0.0, "routers_loss": 0.023098422214388847, "skip_count": 2.0, "step": 2112, "text_loss": 0.09865197539329529 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.925154094511301, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.000938970907891935, "loss": 0.0247, "macro_f1": 0.3333333432674408, "num_tokens": 3408513.0, "repeat_count": 0.0, "routers_loss": 0.002896632067859173, "skip_count": 0.0, "step": 2114, "text_loss": 0.6613234281539917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0947265625, "learning_rate": 0.0009388226373936179, "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 3411195.0, "repeat_count": 0.0, "routers_loss": 0.015814457088708878, "skip_count": 0.0, "step": 2116, "text_loss": 0.17363053560256958 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.94393894922219, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12451171875, "learning_rate": 0.0009386741987376381, "loss": 0.015, "macro_f1": 0.6603773832321167, "num_tokens": 3414875.0, "repeat_count": 1.0, "routers_loss": 0.02676783688366413, "skip_count": 0.0, "step": 2118, "text_loss": 0.674056887626648 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0009385255919808778, "loss": 0.0203, "macro_f1": 0.6666666865348816, "num_tokens": 3418410.0, "repeat_count": 0.0, "routers_loss": 0.01022857241332531, "skip_count": 1.0, "step": 2120, "text_loss": 0.235092431306839 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.962723803933079, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0888671875, "learning_rate": 0.0009383768171802836, "loss": 0.0244, "macro_f1": 0.5492662787437439, "num_tokens": 3421289.0, "repeat_count": 0.0, "routers_loss": 0.013572212308645248, "skip_count": 2.0, "step": 2122, "text_loss": 0.5992844104766846 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.064453125, "learning_rate": 0.0009382278743928659, "loss": 0.0201, "macro_f1": 0.6666666865348816, "num_tokens": 3424781.0, "repeat_count": 0.0, "routers_loss": 0.0051873656921088696, "skip_count": 2.0, "step": 2124, "text_loss": 0.29915499687194824 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 9.981508658643968, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.07421875, "learning_rate": 0.0009380787636757001, "loss": 0.0155, "macro_f1": 0.6122449040412903, "num_tokens": 3427942.0, "repeat_count": 0.0, "routers_loss": 0.030079292133450508, "skip_count": 4.0, "step": 2126, "text_loss": 0.24181491136550903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0009379294850859256, "loss": 0.0141, "macro_f1": 0.3333333432674408, "num_tokens": 3431314.0, "repeat_count": 0.0, "routers_loss": 0.002675612922757864, "skip_count": 0.0, "step": 2128, "text_loss": 0.4669873118400574 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009377800386807465, "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 3435020.0, "repeat_count": 0.0, "routers_loss": 0.009334275498986244, "skip_count": 0.0, "step": 2130, "text_loss": 0.6478219628334045 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.009392427355445, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.134765625, "learning_rate": 0.0009376304245174306, "loss": 0.0137, "macro_f1": 0.6000000238418579, "num_tokens": 3438276.0, "repeat_count": 1.0, "routers_loss": 0.038227908313274384, "skip_count": 2.0, "step": 2132, "text_loss": 0.4401201903820038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.018784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041748046875, "learning_rate": 0.0009374806426533104, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3440938.0, "repeat_count": 0.0, "routers_loss": 0.006901399698108435, "skip_count": 0.0, "step": 2134, "text_loss": 0.5948942303657532 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.0009373306931457827, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3444028.0, "repeat_count": 0.0, "routers_loss": 0.0037061909679323435, "skip_count": 0.0, "step": 2136, "text_loss": 0.5349751114845276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056884765625, "learning_rate": 0.0009371805760523086, "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 3448331.0, "repeat_count": 0.0, "routers_loss": 0.0025877030566334724, "skip_count": 0.0, "step": 2138, "text_loss": 0.4591051936149597 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 10.046962136777223, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.07373046875, "learning_rate": 0.0009370302914304129, "loss": 0.0144, "macro_f1": 0.5934640765190125, "num_tokens": 3451434.0, "repeat_count": 0.0, "routers_loss": 0.018742674961686134, "skip_count": 3.0, "step": 2140, "text_loss": 0.23470863699913025 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.056354564132668, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009368798393376851, "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 3454375.0, "repeat_count": 0.0, "routers_loss": 0.02382594160735607, "skip_count": 1.0, "step": 2142, "text_loss": 0.6077954769134521 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.065746991488112, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05517578125, "learning_rate": 0.0009367292198317787, "loss": 0.0164, "macro_f1": 0.5492662787437439, "num_tokens": 3457591.0, "repeat_count": 0.0, "routers_loss": 0.03331060707569122, "skip_count": 2.0, "step": 2144, "text_loss": 0.3691073954105377 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.075139418843557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0009365784329704115, "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3460895.0, "repeat_count": 0.0, "routers_loss": 0.0016955457394942641, "skip_count": 0.0, "step": 2146, "text_loss": 0.3947436511516571 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.084531846199003, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0009364274788113651, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 3464101.0, "repeat_count": 1.0, "routers_loss": 0.006169239990413189, "skip_count": 0.0, "step": 2148, "text_loss": 0.3348555266857147 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 10.093924273554446, "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009362763574124858, "loss": 0.019, "macro_f1": 0.9265305995941162, "num_tokens": 3467417.0, "repeat_count": 3.0, "routers_loss": 0.024033790454268456, "skip_count": 1.0, "step": 2150, "text_loss": 0.496633380651474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0009361250688316829, "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3470917.0, "repeat_count": 0.0, "routers_loss": 0.0024986129719763994, "skip_count": 0.0, "step": 2152, "text_loss": 0.6857671737670898 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0546875, "learning_rate": 0.0009359736131269312, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 3473624.0, "repeat_count": 0.0, "routers_loss": 0.008183322846889496, "skip_count": 1.0, "step": 2154, "text_loss": 0.13883116841316223 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06640625, "learning_rate": 0.0009358219903562684, "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 3476472.0, "repeat_count": 0.0, "routers_loss": 0.011198793537914753, "skip_count": 3.0, "step": 2156, "text_loss": 0.24243666231632233 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.131493982976226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0009356702005777969, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3479688.0, "repeat_count": 0.0, "routers_loss": 0.002520184963941574, "skip_count": 0.0, "step": 2158, "text_loss": 0.6407818794250488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.140886410331671, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009355182438496825, "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3482598.0, "repeat_count": 0.0, "routers_loss": 0.0011065017897635698, "skip_count": 0.0, "step": 2160, "text_loss": 0.7214245796203613 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0009353661202301557, "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 3486271.0, "repeat_count": 0.0, "routers_loss": 0.0017824085662141442, "skip_count": 0.0, "step": 2162, "text_loss": 0.5140969157218933 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.053466796875, "learning_rate": 0.0009352138297775101, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3489206.0, "repeat_count": 0.0, "routers_loss": 0.001542879967018962, "skip_count": 0.0, "step": 2164, "text_loss": 0.7956416606903076 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.169063692398003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.000935061372550104, "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 3492003.0, "repeat_count": 0.0, "routers_loss": 0.01420794241130352, "skip_count": 3.0, "step": 2166, "text_loss": 0.27489882707595825 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.178456119753449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0009349087486063594, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3494784.0, "repeat_count": 0.0, "routers_loss": 0.003614309709519148, "skip_count": 1.0, "step": 2168, "text_loss": 0.2962227761745453 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.187848547108894, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.1259765625, "learning_rate": 0.0009347559580047618, "loss": 0.0175, "macro_f1": 0.8814815282821655, "num_tokens": 3497886.0, "repeat_count": 2.0, "routers_loss": 0.02122853323817253, "skip_count": 4.0, "step": 2170, "text_loss": 0.5919580459594727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.197240974464338, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06396484375, "learning_rate": 0.000934603000803861, "loss": 0.0135, "macro_f1": 0.5492662787437439, "num_tokens": 3500939.0, "repeat_count": 0.0, "routers_loss": 0.02042219042778015, "skip_count": 1.0, "step": 2172, "text_loss": 0.28722381591796875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009344498770622704, "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3504852.0, "repeat_count": 0.0, "routers_loss": 0.004345106892287731, "skip_count": 0.0, "step": 2174, "text_loss": 0.603236734867096 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.216025829175228, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1064453125, "learning_rate": 0.0009342965868386673, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 3508320.0, "repeat_count": 0.0, "routers_loss": 0.00368050136603415, "skip_count": 0.0, "step": 2176, "text_loss": 0.6020491719245911 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.225418256530672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.000934143130191793, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 3511278.0, "repeat_count": 0.0, "routers_loss": 0.013425769284367561, "skip_count": 0.0, "step": 2178, "text_loss": 0.5954724550247192 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060546875, "learning_rate": 0.000933989507180452, "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 3514361.0, "repeat_count": 0.0, "routers_loss": 0.002896249992772937, "skip_count": 0.0, "step": 2180, "text_loss": 0.39175131916999817 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.244203111241562, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.052978515625, "learning_rate": 0.0009338357178635135, "loss": 0.0147, "macro_f1": 0.6603773832321167, "num_tokens": 3517962.0, "repeat_count": 1.0, "routers_loss": 0.011538350023329258, "skip_count": 1.0, "step": 2182, "text_loss": 0.4482830762863159 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.253595538597006, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009336817622999093, "loss": 0.011, "macro_f1": 0.3272727429866791, "num_tokens": 3521299.0, "repeat_count": 1.0, "routers_loss": 0.022787930443882942, "skip_count": 0.0, "step": 2184, "text_loss": 0.35177817940711975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.262987965952451, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009335276405486357, "loss": 0.0139, "macro_f1": 0.3272727429866791, "num_tokens": 3524611.0, "repeat_count": 0.0, "routers_loss": 0.011597735807299614, "skip_count": 1.0, "step": 2186, "text_loss": 0.24868851900100708 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.0009333733526687524, "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 3528012.0, "repeat_count": 0.0, "routers_loss": 0.014253967441618443, "skip_count": 0.0, "step": 2188, "text_loss": 0.3970910310745239 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.054931640625, "learning_rate": 0.000933218898719383, "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3530908.0, "repeat_count": 0.0, "routers_loss": 0.001659149187617004, "skip_count": 0.0, "step": 2190, "text_loss": 0.7618573307991028 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009330642787597141, "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3533993.0, "repeat_count": 0.0, "routers_loss": 0.005574346985667944, "skip_count": 0.0, "step": 2192, "text_loss": 0.16470147669315338 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.300557675374229, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009329094928489969, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3537310.0, "repeat_count": 0.0, "routers_loss": 0.0026400673668831587, "skip_count": 0.0, "step": 2194, "text_loss": 0.3400416374206543 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.309950102729674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009327545410465452, "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3540045.0, "repeat_count": 0.0, "routers_loss": 0.008448398672044277, "skip_count": 3.0, "step": 2196, "text_loss": 0.3110542297363281 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.31934253008512, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.0009325994234117372, "loss": 0.0122, "macro_f1": 0.32098764181137085, "num_tokens": 3544097.0, "repeat_count": 0.0, "routers_loss": 0.037553198635578156, "skip_count": 2.0, "step": 2198, "text_loss": 0.36126700043678284 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.328734957440563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09716796875, "learning_rate": 0.000932444140004014, "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3547054.0, "repeat_count": 1.0, "routers_loss": 0.006464479025453329, "skip_count": 0.0, "step": 2200, "text_loss": 0.4947047233581543 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.338127384796008, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0009322886908828805, "loss": 0.0138, "macro_f1": 0.6666666865348816, "num_tokens": 3549903.0, "repeat_count": 1.0, "routers_loss": 0.005384812597185373, "skip_count": 0.0, "step": 2202, "text_loss": 0.5923738479614258 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009321330761079052, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3553745.0, "repeat_count": 0.0, "routers_loss": 0.015346619300544262, "skip_count": 2.0, "step": 2204, "text_loss": 0.1904175877571106 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 10.356912239506897, "f1_execute": 0.9268292784690857, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, "grad_norm": 0.06494140625, "learning_rate": 0.00093197729573872, "loss": 0.0203, "macro_f1": 0.8422764539718628, "num_tokens": 3557235.0, "repeat_count": 3.0, "routers_loss": 0.1207597479224205, "skip_count": 6.0, "step": 2206, "text_loss": 0.3904837667942047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.366304666862343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0771484375, "learning_rate": 0.0009318213498350202, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3560795.0, "repeat_count": 0.0, "routers_loss": 0.003334777895361185, "skip_count": 0.0, "step": 2208, "text_loss": 0.4268290102481842 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.375697094217786, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0537109375, "learning_rate": 0.0009316652384565645, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3563754.0, "repeat_count": 0.0, "routers_loss": 0.004230072256177664, "skip_count": 0.0, "step": 2210, "text_loss": 0.40049710869789124 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.385089521573232, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046875, "learning_rate": 0.0009315089616631751, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 3567173.0, "repeat_count": 0.0, "routers_loss": 0.0006645230459980667, "skip_count": 0.0, "step": 2212, "text_loss": 0.42568323016166687 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.394481948928677, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009313525195147376, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3570831.0, "repeat_count": 0.0, "routers_loss": 0.0097877848893404, "skip_count": 0.0, "step": 2214, "text_loss": 0.45808279514312744 }, { "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 10.40387437628412, "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, "grad_norm": 0.076171875, "learning_rate": 0.000931195912071201, "loss": 0.0187, "macro_f1": 0.7018141150474548, "num_tokens": 3573745.0, "repeat_count": 2.0, "routers_loss": 0.07351134717464447, "skip_count": 3.0, "step": 2216, "text_loss": 0.285696804523468 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009310391393925775, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3576785.0, "repeat_count": 0.0, "routers_loss": 0.0033160944003611803, "skip_count": 0.0, "step": 2218, "text_loss": 0.17516443133354187 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.422659230995011, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.047119140625, "learning_rate": 0.0009308822015389424, "loss": 0.0241, "macro_f1": 0.5427350401878357, "num_tokens": 3580695.0, "repeat_count": 1.0, "routers_loss": 0.052930232137441635, "skip_count": 1.0, "step": 2220, "text_loss": 0.5918155908584595 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 10.432051658350455, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.072265625, "learning_rate": 0.0009307250985704352, "loss": 0.0128, "macro_f1": 0.6122449040412903, "num_tokens": 3583729.0, "repeat_count": 0.0, "routers_loss": 0.025454653427004814, "skip_count": 4.0, "step": 2222, "text_loss": 0.2652169466018677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0009305678305472575, "loss": 0.0158, "macro_f1": 0.3333333432674408, "num_tokens": 3586775.0, "repeat_count": 0.0, "routers_loss": 0.011279845610260963, "skip_count": 0.0, "step": 2224, "text_loss": 0.3511691987514496 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.000930410397529675, "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3589676.0, "repeat_count": 0.0, "routers_loss": 0.002700264798477292, "skip_count": 0.0, "step": 2226, "text_loss": 0.24045433104038239 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.460228940416789, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.048095703125, "learning_rate": 0.000930252799578016, "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 3593242.0, "repeat_count": 1.0, "routers_loss": 0.00826631672680378, "skip_count": 2.0, "step": 2228, "text_loss": 0.3777645528316498 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.469621367772234, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0009300950367526728, "loss": 0.0131, "macro_f1": 0.8820862174034119, "num_tokens": 3596807.0, "repeat_count": 2.0, "routers_loss": 0.036221496760845184, "skip_count": 2.0, "step": 2230, "text_loss": 0.502962589263916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0703125, "learning_rate": 0.0009299371091141001, "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3600150.0, "repeat_count": 0.0, "routers_loss": 0.006449893582612276, "skip_count": 0.0, "step": 2232, "text_loss": 0.20256924629211426 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04638671875, "learning_rate": 0.0009297790167228161, "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 3602988.0, "repeat_count": 0.0, "routers_loss": 0.007872486487030983, "skip_count": 2.0, "step": 2234, "text_loss": 0.42476826906204224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.497798649838568, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0009296207596394022, "loss": 0.0101, "macro_f1": 0.32098764181137085, "num_tokens": 3606071.0, "repeat_count": 0.0, "routers_loss": 0.027397040277719498, "skip_count": 2.0, "step": 2236, "text_loss": 0.23432791233062744 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0595703125, "learning_rate": 0.0009294623379245028, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3609389.0, "repeat_count": 0.0, "routers_loss": 0.01042645052075386, "skip_count": 0.0, "step": 2238, "text_loss": 0.16665785014629364 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.516583504549457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0009293037516388252, "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3612105.0, "repeat_count": 0.0, "routers_loss": 0.0012458425480872393, "skip_count": 0.0, "step": 2240, "text_loss": 0.59421306848526 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 10.525975931904902, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0009291450008431404, "loss": 0.0185, "macro_f1": 1.0, "num_tokens": 3615439.0, "repeat_count": 1.0, "routers_loss": 0.005781981628388166, "skip_count": 1.0, "step": 2242, "text_loss": 0.510798454284668 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 10.535368359260346, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.0966796875, "learning_rate": 0.0009289860855982814, "loss": 0.0166, "macro_f1": 0.4871794879436493, "num_tokens": 3618842.0, "repeat_count": 0.0, "routers_loss": 0.031195320188999176, "skip_count": 3.0, "step": 2244, "text_loss": 0.7574363350868225 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.544760786615791, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04931640625, "learning_rate": 0.0009288270059651454, "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 3621823.0, "repeat_count": 0.0, "routers_loss": 0.001746491645462811, "skip_count": 0.0, "step": 2246, "text_loss": 0.5125683546066284 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.554153213971237, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.220703125, "learning_rate": 0.0009286677620046918, "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3624502.0, "repeat_count": 0.0, "routers_loss": 0.03792348504066467, "skip_count": 2.0, "step": 2248, "text_loss": 0.7533677220344543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009285083537779429, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3627057.0, "repeat_count": 0.0, "routers_loss": 0.0009684451506473124, "skip_count": 0.0, "step": 2250, "text_loss": 0.2219279706478119 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.572938068682125, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11767578125, "learning_rate": 0.0009283487813459845, "loss": 0.0148, "macro_f1": 0.5492662787437439, "num_tokens": 3629720.0, "repeat_count": 0.0, "routers_loss": 0.022757573053240776, "skip_count": 2.0, "step": 2252, "text_loss": 0.6903313994407654 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.582330496037569, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009281890447699652, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 3633234.0, "repeat_count": 1.0, "routers_loss": 0.003613058477640152, "skip_count": 0.0, "step": 2254, "text_loss": 0.6278893351554871 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0009280291441110961, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3636289.0, "repeat_count": 0.0, "routers_loss": 0.006214062683284283, "skip_count": 0.0, "step": 2256, "text_loss": 0.3011114001274109 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.60111535074846, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.041015625, "learning_rate": 0.0009278690794306517, "loss": 0.014, "macro_f1": 0.5492662787437439, "num_tokens": 3640251.0, "repeat_count": 0.0, "routers_loss": 0.052556321024894714, "skip_count": 2.0, "step": 2258, "text_loss": 0.19894185662269592 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 10.610507778103903, "f1_execute": 0.978723406791687, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, "grad_norm": 0.08251953125, "learning_rate": 0.0009277088507899689, "loss": 0.0163, "macro_f1": 0.9452888369560242, "num_tokens": 3643527.0, "repeat_count": 4.0, "routers_loss": 0.0572301521897316, "skip_count": 1.0, "step": 2260, "text_loss": 0.5593410134315491 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.619900205459349, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0009275484582504475, "loss": 0.0104, "macro_f1": 0.3333333432674408, "num_tokens": 3646959.0, "repeat_count": 0.0, "routers_loss": 0.008010074496269226, "skip_count": 0.0, "step": 2262, "text_loss": 0.2128177285194397 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 10.629292632814794, "f1_execute": 0.95652174949646, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, "grad_norm": 0.05419921875, "learning_rate": 0.0009273879018735505, "loss": 0.0138, "macro_f1": 0.8521739840507507, "num_tokens": 3651298.0, "repeat_count": 3.0, "routers_loss": 0.035729870200157166, "skip_count": 3.0, "step": 2264, "text_loss": 0.2987811267375946 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.638685060170237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009272271817208031, "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 3655609.0, "repeat_count": 0.0, "routers_loss": 0.002379779238253832, "skip_count": 0.0, "step": 2266, "text_loss": 0.6024088263511658 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0009270662978537939, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 3658444.0, "repeat_count": 0.0, "routers_loss": 0.008943650871515274, "skip_count": 0.0, "step": 2268, "text_loss": 0.1741207242012024 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 10.657469914881126, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0009269052503341736, "loss": 0.0161, "macro_f1": 0.6595745086669922, "num_tokens": 3662282.0, "repeat_count": 1.0, "routers_loss": 0.030201267451047897, "skip_count": 4.0, "step": 2270, "text_loss": 0.7300035953521729 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.666862342236572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.0009267440392236562, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 3665531.0, "repeat_count": 0.0, "routers_loss": 0.0026635683607310057, "skip_count": 0.0, "step": 2272, "text_loss": 0.31535038352012634 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0009265826645840178, "loss": 0.0151, "macro_f1": 0.3333333432674408, "num_tokens": 3668407.0, "repeat_count": 0.0, "routers_loss": 0.004258926957845688, "skip_count": 0.0, "step": 2274, "text_loss": 0.7272579073905945 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 10.68564719694746, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.125, "learning_rate": 0.0009264211264770976, "loss": 0.0154, "macro_f1": 0.6122449040412903, "num_tokens": 3671503.0, "repeat_count": 0.0, "routers_loss": 0.038987524807453156, "skip_count": 4.0, "step": 2276, "text_loss": 0.7488982677459717 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.695039624302906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.099609375, "learning_rate": 0.0009262594249647975, "loss": 0.0164, "macro_f1": 0.6666666865348816, "num_tokens": 3674107.0, "repeat_count": 0.0, "routers_loss": 0.007211760152131319, "skip_count": 1.0, "step": 2278, "text_loss": 0.1992369294166565 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 10.704432051658351, "f1_execute": 0.9767441749572754, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, "grad_norm": 0.0546875, "learning_rate": 0.0009260975601090815, "loss": 0.0112, "macro_f1": 0.9446290731430054, "num_tokens": 3677184.0, "repeat_count": 4.0, "routers_loss": 0.02538592554628849, "skip_count": 3.0, "step": 2280, "text_loss": 0.46402135491371155 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0009259355319719768, "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3680683.0, "repeat_count": 0.0, "routers_loss": 0.0038464947137981653, "skip_count": 0.0, "step": 2282, "text_loss": 0.5804527401924133 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009257733406155726, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3683928.0, "repeat_count": 0.0, "routers_loss": 0.004841136280447245, "skip_count": 0.0, "step": 2284, "text_loss": 0.4834538400173187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0009256109861020212, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 3687101.0, "repeat_count": 0.0, "routers_loss": 0.002191900508478284, "skip_count": 0.0, "step": 2286, "text_loss": 0.8199604749679565 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.742001761080129, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0927734375, "learning_rate": 0.000925448468493537, "loss": 0.0162, "macro_f1": 0.5427350401878357, "num_tokens": 3690490.0, "repeat_count": 1.0, "routers_loss": 0.03488675877451897, "skip_count": 2.0, "step": 2288, "text_loss": 0.33263635635375977 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.751394188435574, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0009252857878523971, "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 3694109.0, "repeat_count": 1.0, "routers_loss": 0.002897309372201562, "skip_count": 0.0, "step": 2290, "text_loss": 0.47494807839393616 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.760786615791018, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05810546875, "learning_rate": 0.000925122944240941, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 3697233.0, "repeat_count": 0.0, "routers_loss": 0.01842675730586052, "skip_count": 2.0, "step": 2292, "text_loss": 0.14693495631217957 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 10.770179043146463, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.045654296875, "learning_rate": 0.0009249599377215707, "loss": 0.0146, "macro_f1": 0.5866667032241821, "num_tokens": 3700376.0, "repeat_count": 1.0, "routers_loss": 0.04169808700680733, "skip_count": 3.0, "step": 2294, "text_loss": 0.38051268458366394 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.779571470501908, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05908203125, "learning_rate": 0.0009247967683567507, "loss": 0.0112, "macro_f1": 0.3272727429866791, "num_tokens": 3703212.0, "repeat_count": 0.0, "routers_loss": 0.012183113023638725, "skip_count": 1.0, "step": 2296, "text_loss": 0.23789077997207642 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 10.788963897857352, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05712890625, "learning_rate": 0.0009246334362090077, "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3706490.0, "repeat_count": 1.0, "routers_loss": 0.01880069635808468, "skip_count": 2.0, "step": 2298, "text_loss": 0.29067978262901306 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.798356325212797, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.000924469941340931, "loss": 0.0173, "macro_f1": 0.3272727429866791, "num_tokens": 3709804.0, "repeat_count": 1.0, "routers_loss": 0.027359159663319588, "skip_count": 0.0, "step": 2300, "text_loss": 0.67828369140625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.807748752568243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.000924306283815172, "loss": 0.0153, "macro_f1": 0.3333333432674408, "num_tokens": 3712824.0, "repeat_count": 0.0, "routers_loss": 0.003152279881760478, "skip_count": 0.0, "step": 2302, "text_loss": 0.8333184719085693 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.817141179923686, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0703125, "learning_rate": 0.0009241424636944445, "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3715385.0, "repeat_count": 0.0, "routers_loss": 0.0442950464785099, "skip_count": 2.0, "step": 2304, "text_loss": 0.41893699765205383 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 10.826533607279131, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.058837890625, "learning_rate": 0.0009239784810415249, "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3719080.0, "repeat_count": 1.0, "routers_loss": 0.015729321166872978, "skip_count": 2.0, "step": 2306, "text_loss": 0.13360483944416046 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 10.835926034634575, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.06787109375, "learning_rate": 0.0009238143359192514, "loss": 0.0136, "macro_f1": 0.5934640765190125, "num_tokens": 3722439.0, "repeat_count": 0.0, "routers_loss": 0.028816604986786842, "skip_count": 3.0, "step": 2308, "text_loss": 0.39594101905822754 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05419921875, "learning_rate": 0.000923650028390525, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3725092.0, "repeat_count": 0.0, "routers_loss": 0.0036455015651881695, "skip_count": 2.0, "step": 2310, "text_loss": 0.6169708371162415 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009234855585183086, "loss": 0.014, "macro_f1": 0.6666666865348816, "num_tokens": 3728412.0, "repeat_count": 0.0, "routers_loss": 0.007565604057163, "skip_count": 1.0, "step": 2312, "text_loss": 0.21257059276103973 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 10.86410331670091, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0517578125, "learning_rate": 0.0009233209263656273, "loss": 0.0184, "macro_f1": 0.9262410998344421, "num_tokens": 3731467.0, "repeat_count": 2.0, "routers_loss": 0.02510629966855049, "skip_count": 3.0, "step": 2314, "text_loss": 0.21639840304851532 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057861328125, "learning_rate": 0.0009231561319955684, "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3734906.0, "repeat_count": 0.0, "routers_loss": 0.00872227642685175, "skip_count": 0.0, "step": 2316, "text_loss": 0.35639774799346924 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009229911754712815, "loss": 0.0176, "macro_f1": 0.3333333432674408, "num_tokens": 3737943.0, "repeat_count": 0.0, "routers_loss": 0.004695790819823742, "skip_count": 0.0, "step": 2318, "text_loss": 0.5269573330879211 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.892280598767243, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.0009228260568559781, "loss": 0.0115, "macro_f1": 0.3272727429866791, "num_tokens": 3741833.0, "repeat_count": 1.0, "routers_loss": 0.0217357836663723, "skip_count": 0.0, "step": 2320, "text_loss": 0.5110208988189697 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.901673026122689, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.0009226607762129322, "loss": 0.0201, "macro_f1": 0.32098764181137085, "num_tokens": 3744642.0, "repeat_count": 1.0, "routers_loss": 0.05595960095524788, "skip_count": 1.0, "step": 2322, "text_loss": 0.6291998624801636 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056884765625, "learning_rate": 0.0009224953336054796, "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3748127.0, "repeat_count": 0.0, "routers_loss": 0.0071634589694440365, "skip_count": 0.0, "step": 2324, "text_loss": 0.7404762506484985 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.000922329729097018, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3751373.0, "repeat_count": 0.0, "routers_loss": 0.0011676300782710314, "skip_count": 0.0, "step": 2326, "text_loss": 0.2915459871292114 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061279296875, "learning_rate": 0.0009221639627510075, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3754518.0, "repeat_count": 0.0, "routers_loss": 0.01039792038500309, "skip_count": 0.0, "step": 2328, "text_loss": 0.22066321969032288 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0751953125, "learning_rate": 0.0009219980346309702, "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3757621.0, "repeat_count": 0.0, "routers_loss": 0.0032070958986878395, "skip_count": 0.0, "step": 2330, "text_loss": 0.5558560490608215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.948635162899912, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.076171875, "learning_rate": 0.0009218319448004899, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3760885.0, "repeat_count": 0.0, "routers_loss": 0.007085457909852266, "skip_count": 0.0, "step": 2332, "text_loss": 0.4348253607749939 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.958027590255357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1103515625, "learning_rate": 0.0009216656933232129, "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 3764462.0, "repeat_count": 0.0, "routers_loss": 0.005504854489117861, "skip_count": 1.0, "step": 2334, "text_loss": 0.35828644037246704 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0009214992802628463, "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3767159.0, "repeat_count": 0.0, "routers_loss": 0.0013970810687169433, "skip_count": 0.0, "step": 2336, "text_loss": 0.2956557869911194 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.976812444966246, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0009213327056831607, "loss": 0.0181, "macro_f1": 0.3272727429866791, "num_tokens": 3770408.0, "repeat_count": 0.0, "routers_loss": 0.0427570566534996, "skip_count": 1.0, "step": 2338, "text_loss": 0.14883014559745789 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.986204872321691, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0009211659696479875, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 3773474.0, "repeat_count": 0.0, "routers_loss": 0.0011273405980318785, "skip_count": 0.0, "step": 2340, "text_loss": 0.26011669635772705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.995597299677135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.00092099907222122, "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3776909.0, "repeat_count": 0.0, "routers_loss": 0.0016178421210497618, "skip_count": 0.0, "step": 2342, "text_loss": 0.49078530073165894 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.004696213677722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.000920832013466814, "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 3780741.0, "repeat_count": 0.0, "routers_loss": 0.005510095041245222, "skip_count": 0.0, "step": 2344, "text_loss": 0.4870249927043915 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.014088641033167, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0009206647934487866, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3784673.0, "repeat_count": 1.0, "routers_loss": 0.0047357892617583275, "skip_count": 0.0, "step": 2346, "text_loss": 0.3251725733280182 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05615234375, "learning_rate": 0.0009204974122312167, "loss": 0.0142, "macro_f1": 0.6666666865348816, "num_tokens": 3787503.0, "repeat_count": 0.0, "routers_loss": 0.00795028731226921, "skip_count": 1.0, "step": 2348, "text_loss": 0.18282145261764526 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.032873495744056, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.060546875, "learning_rate": 0.0009203298698782452, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 3790528.0, "repeat_count": 1.0, "routers_loss": 0.0009506374481134117, "skip_count": 0.0, "step": 2350, "text_loss": 0.4093080461025238 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.042265923099501, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.047607421875, "learning_rate": 0.0009201621664540747, "loss": 0.0155, "macro_f1": 0.6666666865348816, "num_tokens": 3794134.0, "repeat_count": 1.0, "routers_loss": 0.005159572698175907, "skip_count": 0.0, "step": 2352, "text_loss": 0.5451981425285339 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.051658350454945, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009199943020229694, "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3797414.0, "repeat_count": 0.0, "routers_loss": 0.002356168581172824, "skip_count": 0.0, "step": 2354, "text_loss": 0.3070453405380249 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0810546875, "learning_rate": 0.0009198262766492554, "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 3800094.0, "repeat_count": 0.0, "routers_loss": 0.0051761893555521965, "skip_count": 1.0, "step": 2356, "text_loss": 0.5880904197692871 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.070443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.00091965809039732, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3803280.0, "repeat_count": 0.0, "routers_loss": 0.0025952060241252184, "skip_count": 0.0, "step": 2358, "text_loss": 0.5210731625556946 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.079835632521279, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06787109375, "learning_rate": 0.0009194897433316127, "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 3805866.0, "repeat_count": 0.0, "routers_loss": 0.0042560105212032795, "skip_count": 2.0, "step": 2360, "text_loss": 0.6472984552383423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009193212355166446, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3808952.0, "repeat_count": 0.0, "routers_loss": 0.0026232977397739887, "skip_count": 0.0, "step": 2362, "text_loss": 0.450063556432724 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009191525670169881, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3812080.0, "repeat_count": 0.0, "routers_loss": 0.0034355956595391035, "skip_count": 0.0, "step": 2364, "text_loss": 0.49727216362953186 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.108012914587613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.000918983737897277, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3815282.0, "repeat_count": 0.0, "routers_loss": 0.0055653867311775684, "skip_count": 1.0, "step": 2366, "text_loss": 0.6336377859115601 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.117405341943059, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0009188147482222071, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 3818106.0, "repeat_count": 2.0, "routers_loss": 0.011016021482646465, "skip_count": 2.0, "step": 2368, "text_loss": 0.22513329982757568 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.126797769298504, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0009186455980565358, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3821228.0, "repeat_count": 1.0, "routers_loss": 0.014039464294910431, "skip_count": 0.0, "step": 2370, "text_loss": 0.21331638097763062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009184762874650816, "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3825048.0, "repeat_count": 0.0, "routers_loss": 0.001088051125407219, "skip_count": 0.0, "step": 2372, "text_loss": 0.6031543612480164 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.095703125, "learning_rate": 0.0009183068165127245, "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3828781.0, "repeat_count": 0.0, "routers_loss": 0.006263940595090389, "skip_count": 1.0, "step": 2374, "text_loss": 0.6249601244926453 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.154975051364836, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06982421875, "learning_rate": 0.0009181371852644062, "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 3832507.0, "repeat_count": 1.0, "routers_loss": 0.001987969037145376, "skip_count": 0.0, "step": 2376, "text_loss": 0.37972065806388855 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.164367478720282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0908203125, "learning_rate": 0.0009179673937851299, "loss": 0.0158, "macro_f1": 0.6666666865348816, "num_tokens": 3835644.0, "repeat_count": 0.0, "routers_loss": 0.007635094691067934, "skip_count": 1.0, "step": 2378, "text_loss": 0.46319663524627686 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009177974421399598, "loss": 0.0137, "macro_f1": 0.6666666865348816, "num_tokens": 3838700.0, "repeat_count": 0.0, "routers_loss": 0.01617279462516308, "skip_count": 2.0, "step": 2380, "text_loss": 0.32141056656837463 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056396484375, "learning_rate": 0.0009176273303940217, "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 3841953.0, "repeat_count": 0.0, "routers_loss": 0.0022273799404501915, "skip_count": 2.0, "step": 2382, "text_loss": 0.5908139944076538 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.192544760786616, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0009174570586125026, "loss": 0.0122, "macro_f1": 0.32098767161369324, "num_tokens": 3845763.0, "repeat_count": 1.0, "routers_loss": 0.030915161594748497, "skip_count": 0.0, "step": 2384, "text_loss": 0.41400137543678284 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.201937188142061, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04248046875, "learning_rate": 0.0009172866268606513, "loss": 0.0122, "macro_f1": 0.6666666865348816, "num_tokens": 3848984.0, "repeat_count": 0.0, "routers_loss": 0.010480951517820358, "skip_count": 2.0, "step": 2386, "text_loss": 0.2560874819755554 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056396484375, "learning_rate": 0.0009171160352037775, "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3852118.0, "repeat_count": 0.0, "routers_loss": 0.00809961836785078, "skip_count": 1.0, "step": 2388, "text_loss": 0.28236693143844604 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.22072204285295, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0009169452837072521, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 3855314.0, "repeat_count": 1.0, "routers_loss": 0.005569872446358204, "skip_count": 1.0, "step": 2390, "text_loss": 0.4578137695789337 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1123046875, "learning_rate": 0.0009167743724365073, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3858301.0, "repeat_count": 0.0, "routers_loss": 0.0038610948249697685, "skip_count": 1.0, "step": 2392, "text_loss": 0.14082716405391693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009166033014570368, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3861296.0, "repeat_count": 0.0, "routers_loss": 0.0017607157351449132, "skip_count": 0.0, "step": 2394, "text_loss": 0.384442001581192 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 11.248899324919284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.0009164320708343954, "loss": 0.0131, "macro_f1": 0.6666666865348816, "num_tokens": 3863985.0, "repeat_count": 2.0, "routers_loss": 0.009627950377762318, "skip_count": 0.0, "step": 2396, "text_loss": 0.6969521045684814 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.258291752274728, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009162606806341989, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 3866636.0, "repeat_count": 0.0, "routers_loss": 0.006915586534887552, "skip_count": 0.0, "step": 2398, "text_loss": 0.48069697618484497 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.267684179630173, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.0009160891309221242, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3870867.0, "repeat_count": 1.0, "routers_loss": 0.0013031222624704242, "skip_count": 0.0, "step": 2400, "text_loss": 0.3882075846195221 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.277076606985618, "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0009159174217639096, "loss": 0.0112, "macro_f1": 0.5427350401878357, "num_tokens": 3873663.0, "repeat_count": 2.0, "routers_loss": 0.06621067970991135, "skip_count": 1.0, "step": 2402, "text_loss": 0.5740041136741638 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.286469034341062, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0009157455532253547, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3876788.0, "repeat_count": 1.0, "routers_loss": 0.005957918707281351, "skip_count": 0.0, "step": 2404, "text_loss": 0.26025933027267456 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 11.295861461696507, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.08642578125, "learning_rate": 0.0009155735253723191, "loss": 0.0126, "macro_f1": 0.9452888369560242, "num_tokens": 3879942.0, "repeat_count": 1.0, "routers_loss": 0.039429809898138046, "skip_count": 4.0, "step": 2406, "text_loss": 1.1349908113479614 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.305253889051952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047607421875, "learning_rate": 0.0009154013382707251, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3882682.0, "repeat_count": 0.0, "routers_loss": 0.0012570557883009315, "skip_count": 0.0, "step": 2408, "text_loss": 0.5611135363578796 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.314646316407396, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0009152289919865543, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3886425.0, "repeat_count": 0.0, "routers_loss": 0.0017455556662753224, "skip_count": 0.0, "step": 2410, "text_loss": 0.7523751854896545 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.324038743762841, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0009150564865858506, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3889273.0, "repeat_count": 0.0, "routers_loss": 0.011178011074662209, "skip_count": 1.0, "step": 2412, "text_loss": 0.26942551136016846 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 11.333431171118287, "f1_execute": 0.9803921580314636, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.0009148838221347182, "loss": 0.0107, "macro_f1": 0.5934640765190125, "num_tokens": 3892199.0, "repeat_count": 3.0, "routers_loss": 0.019628092646598816, "skip_count": 0.0, "step": 2414, "text_loss": 0.5492315888404846 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.34282359847373, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04541015625, "learning_rate": 0.0009147109986993225, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 3895362.0, "repeat_count": 1.0, "routers_loss": 0.012255983427166939, "skip_count": 0.0, "step": 2416, "text_loss": 0.23798216879367828 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009145380163458899, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3898476.0, "repeat_count": 0.0, "routers_loss": 0.007018954027444124, "skip_count": 0.0, "step": 2418, "text_loss": 0.1923145055770874 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.361608453184619, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0009143648751407074, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 3901817.0, "repeat_count": 0.0, "routers_loss": 0.0008574824314564466, "skip_count": 0.0, "step": 2420, "text_loss": 0.4001806974411011 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 11.371000880540064, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11328125, "learning_rate": 0.0009141915751501231, "loss": 0.0102, "macro_f1": 0.5492662787437439, "num_tokens": 3905461.0, "repeat_count": 0.0, "routers_loss": 0.01572350226342678, "skip_count": 2.0, "step": 2422, "text_loss": 0.19519129395484924 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0009140181164405458, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3908878.0, "repeat_count": 0.0, "routers_loss": 0.0005503420252352953, "skip_count": 0.0, "step": 2424, "text_loss": 0.6937088370323181 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.0009138444990784454, "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3912053.0, "repeat_count": 0.0, "routers_loss": 0.007556677330285311, "skip_count": 0.0, "step": 2426, "text_loss": 0.35431069135665894 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.399178162606399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.000913670723130352, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3915192.0, "repeat_count": 0.0, "routers_loss": 0.0013609991874545813, "skip_count": 0.0, "step": 2428, "text_loss": 0.5171207189559937 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.408570589961844, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.050048828125, "learning_rate": 0.0009134967886628573, "loss": 0.0115, "macro_f1": 1.0, "num_tokens": 3917927.0, "repeat_count": 2.0, "routers_loss": 0.010895746760070324, "skip_count": 2.0, "step": 2430, "text_loss": 0.2852934002876282 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.417963017317287, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009133226957426133, "loss": 0.0132, "macro_f1": 0.5492662787437439, "num_tokens": 3921460.0, "repeat_count": 2.0, "routers_loss": 0.04196908697485924, "skip_count": 0.0, "step": 2432, "text_loss": 0.4864770770072937 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.427355444672733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0009131484444363324, "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3924662.0, "repeat_count": 0.0, "routers_loss": 0.004484197124838829, "skip_count": 0.0, "step": 2434, "text_loss": 0.7568684220314026 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0009129740348107882, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3927337.0, "repeat_count": 0.0, "routers_loss": 0.004351360257714987, "skip_count": 2.0, "step": 2436, "text_loss": 0.5953161716461182 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 11.446140299383622, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.04736328125, "learning_rate": 0.0009127994669328151, "loss": 0.0085, "macro_f1": 0.6122449040412903, "num_tokens": 3930407.0, "repeat_count": 0.0, "routers_loss": 0.01664198748767376, "skip_count": 4.0, "step": 2438, "text_loss": 0.5320524573326111 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.455532726739067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0595703125, "learning_rate": 0.0009126247408693071, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3933184.0, "repeat_count": 0.0, "routers_loss": 0.0017819046042859554, "skip_count": 1.0, "step": 2440, "text_loss": 0.6051273345947266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0009124498566872204, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 3936620.0, "repeat_count": 0.0, "routers_loss": 0.005519696045666933, "skip_count": 0.0, "step": 2442, "text_loss": 0.12987950444221497 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.474317581449956, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0009122748144535704, "loss": 0.0111, "macro_f1": 0.32098764181137085, "num_tokens": 3940010.0, "repeat_count": 0.0, "routers_loss": 0.04543351009488106, "skip_count": 2.0, "step": 2444, "text_loss": 0.4642033576965332 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.483710008805401, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0009120996142354338, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3943135.0, "repeat_count": 0.0, "routers_loss": 0.00550565542653203, "skip_count": 0.0, "step": 2446, "text_loss": 0.5697627067565918 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.493102436160845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.0009119242560999477, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3946650.0, "repeat_count": 0.0, "routers_loss": 0.008842485956847668, "skip_count": 0.0, "step": 2448, "text_loss": 0.17046524584293365 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.50249486351629, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08154296875, "learning_rate": 0.0009117487401143095, "loss": 0.0154, "macro_f1": 0.6666666865348816, "num_tokens": 3949470.0, "repeat_count": 1.0, "routers_loss": 0.005900127813220024, "skip_count": 0.0, "step": 2450, "text_loss": 0.37260866165161133 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.511887290871735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0009115730663457773, "loss": 0.0137, "macro_f1": 1.0, "num_tokens": 3952546.0, "repeat_count": 1.0, "routers_loss": 0.003409258322790265, "skip_count": 1.0, "step": 2452, "text_loss": 0.5308008193969727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05224609375, "learning_rate": 0.0009113972348616698, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 3955817.0, "repeat_count": 0.0, "routers_loss": 0.010098597034811974, "skip_count": 1.0, "step": 2454, "text_loss": 0.39226648211479187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 11.530672145582624, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1640625, "learning_rate": 0.0009112212457293658, "loss": 0.0102, "macro_f1": 0.3272727429866791, "num_tokens": 3958911.0, "repeat_count": 0.0, "routers_loss": 0.08184818178415298, "skip_count": 0.0, "step": 2456, "text_loss": 0.45411455631256104 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0009110450990163047, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3962584.0, "repeat_count": 0.0, "routers_loss": 0.0009352223132736981, "skip_count": 0.0, "step": 2458, "text_loss": 0.47292324900627136 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.549457000293513, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.0009108687947899863, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 3965597.0, "repeat_count": 1.0, "routers_loss": 0.008150188252329826, "skip_count": 2.0, "step": 2460, "text_loss": 0.33208340406417847 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 11.558849427648958, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.043212890625, "learning_rate": 0.0009106923331179707, "loss": 0.0125, "macro_f1": 0.5492662787437439, "num_tokens": 3968664.0, "repeat_count": 0.0, "routers_loss": 0.050999004393815994, "skip_count": 2.0, "step": 2462, "text_loss": 0.2459995150566101 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009105157140678782, "loss": 0.0126, "macro_f1": 0.6666666865348816, "num_tokens": 3971772.0, "repeat_count": 0.0, "routers_loss": 0.006196586415171623, "skip_count": 1.0, "step": 2464, "text_loss": 0.23956991732120514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.577634282359847, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009103389377073896, "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 3976224.0, "repeat_count": 0.0, "routers_loss": 0.008181816898286343, "skip_count": 0.0, "step": 2466, "text_loss": 0.3235875070095062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057373046875, "learning_rate": 0.0009101620041042462, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3978876.0, "repeat_count": 0.0, "routers_loss": 0.0015451472718268633, "skip_count": 0.0, "step": 2468, "text_loss": 0.4038759469985962 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.596419137070736, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09130859375, "learning_rate": 0.000909984913326249, "loss": 0.0131, "macro_f1": 0.3272727429866791, "num_tokens": 3981992.0, "repeat_count": 0.0, "routers_loss": 0.021785033866763115, "skip_count": 1.0, "step": 2470, "text_loss": 0.6346460580825806 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.605811564426181, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0009098076654412595, "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 3984560.0, "repeat_count": 0.0, "routers_loss": 0.0011462471447885036, "skip_count": 0.0, "step": 2472, "text_loss": 0.3449646532535553 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.0009096302605171996, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3987548.0, "repeat_count": 0.0, "routers_loss": 0.0014367027906700969, "skip_count": 0.0, "step": 2474, "text_loss": 0.5918350219726562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 0.0009094526986220513, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 3990727.0, "repeat_count": 0.0, "routers_loss": 0.0008977655088528991, "skip_count": 0.0, "step": 2476, "text_loss": 0.463350385427475 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.633988846492516, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.0009092749798238563, "loss": 0.015, "macro_f1": 0.3272727429866791, "num_tokens": 3993757.0, "repeat_count": 1.0, "routers_loss": 0.016712551936507225, "skip_count": 0.0, "step": 2478, "text_loss": 0.5621229410171509 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.643381273847961, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.000909097104190717, "loss": 0.0172, "macro_f1": 0.32098764181137085, "num_tokens": 3997259.0, "repeat_count": 0.0, "routers_loss": 0.04134179651737213, "skip_count": 2.0, "step": 2480, "text_loss": 0.375476598739624 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0009089190717907956, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4000563.0, "repeat_count": 0.0, "routers_loss": 0.003462378401309252, "skip_count": 0.0, "step": 2482, "text_loss": 0.5553798675537109 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.66216612855885, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.0009087408826923146, "loss": 0.0182, "macro_f1": 0.6666666865348816, "num_tokens": 4004065.0, "repeat_count": 0.0, "routers_loss": 0.008057428523898125, "skip_count": 2.0, "step": 2484, "text_loss": 0.4329465329647064 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.671558555914293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0009085625369635564, "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4007119.0, "repeat_count": 0.0, "routers_loss": 0.005759050603955984, "skip_count": 0.0, "step": 2486, "text_loss": 0.501268744468689 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.680950983269739, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009083840346728631, "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 4010547.0, "repeat_count": 1.0, "routers_loss": 0.020763102918863297, "skip_count": 0.0, "step": 2488, "text_loss": 0.480196475982666 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.690343410625184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0009082053758886374, "loss": 0.0117, "macro_f1": 0.6666666865348816, "num_tokens": 4014600.0, "repeat_count": 0.0, "routers_loss": 0.005801836494356394, "skip_count": 1.0, "step": 2490, "text_loss": 0.18249782919883728 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.699735837980628, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009080265606793416, "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 4017964.0, "repeat_count": 1.0, "routers_loss": 0.004226063843816519, "skip_count": 1.0, "step": 2492, "text_loss": 0.6573076248168945 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.049072265625, "learning_rate": 0.000907847589113498, "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 4020694.0, "repeat_count": 0.0, "routers_loss": 0.004281101748347282, "skip_count": 2.0, "step": 2494, "text_loss": 0.3944586217403412 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.718520692691518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061279296875, "learning_rate": 0.000907668461259689, "loss": 0.0152, "macro_f1": 0.6666666865348816, "num_tokens": 4023757.0, "repeat_count": 0.0, "routers_loss": 0.008786370046436787, "skip_count": 1.0, "step": 2496, "text_loss": 0.6452898979187012 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.727913120046962, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009074891771865566, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4026601.0, "repeat_count": 0.0, "routers_loss": 0.005209595896303654, "skip_count": 0.0, "step": 2498, "text_loss": 0.9633619785308838 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 11.737305547402407, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0009073097369628028, "loss": 0.013, "macro_f1": 1.0, "num_tokens": 4030321.0, "repeat_count": 3.0, "routers_loss": 0.00860709697008133, "skip_count": 1.0, "step": 2500, "text_loss": 0.48566827178001404 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0009071301406571893, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4033234.0, "repeat_count": 0.0, "routers_loss": 0.0035277456045150757, "skip_count": 0.0, "step": 2502, "text_loss": 0.3771554231643677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.000906950388338538, "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 4036417.0, "repeat_count": 0.0, "routers_loss": 0.0013424850767478347, "skip_count": 0.0, "step": 2504, "text_loss": 0.8962806463241577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.765482829468741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0009067704800757301, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4039564.0, "repeat_count": 0.0, "routers_loss": 0.0010423909407109022, "skip_count": 0.0, "step": 2506, "text_loss": 0.43170279264450073 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.774875256824185, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.000906590415937707, "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 4043212.0, "repeat_count": 0.0, "routers_loss": 0.021780289709568024, "skip_count": 1.0, "step": 2508, "text_loss": 0.41495826840400696 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0009064101959934696, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4046687.0, "repeat_count": 0.0, "routers_loss": 0.007261929102241993, "skip_count": 1.0, "step": 2510, "text_loss": 0.21821187436580658 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.057861328125, "learning_rate": 0.0009062298203120783, "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4050735.0, "repeat_count": 0.0, "routers_loss": 0.007447180338203907, "skip_count": 2.0, "step": 2512, "text_loss": 0.1818767935037613 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.803052538890519, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0009060492889626535, "loss": 0.0142, "macro_f1": 0.3272727429866791, "num_tokens": 4054426.0, "repeat_count": 1.0, "routers_loss": 0.0718490406870842, "skip_count": 0.0, "step": 2514, "text_loss": 0.22798970341682434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.812444966245964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.099609375, "learning_rate": 0.0009058686020143753, "loss": 0.0183, "macro_f1": 0.3333333432674408, "num_tokens": 4057615.0, "repeat_count": 0.0, "routers_loss": 0.0052676633931696415, "skip_count": 0.0, "step": 2516, "text_loss": 0.1712338626384735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0009056877595364832, "loss": 0.0137, "macro_f1": 0.3333333432674408, "num_tokens": 4060338.0, "repeat_count": 0.0, "routers_loss": 0.0018052728846669197, "skip_count": 0.0, "step": 2518, "text_loss": 0.6811438798904419 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.083984375, "learning_rate": 0.0009055067615982761, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4062887.0, "repeat_count": 0.0, "routers_loss": 0.0009029926732182503, "skip_count": 0.0, "step": 2520, "text_loss": 0.5480356812477112 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.840622248312298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.0009053256082691133, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 4065357.0, "repeat_count": 0.0, "routers_loss": 0.0027515271212905645, "skip_count": 0.0, "step": 2522, "text_loss": 0.5234101414680481 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0009051442996184127, "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 4068111.0, "repeat_count": 0.0, "routers_loss": 0.002199822571128607, "skip_count": 0.0, "step": 2524, "text_loss": 0.2418575882911682 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0625, "learning_rate": 0.0009049628357156521, "loss": 0.0143, "macro_f1": 0.6666666865348816, "num_tokens": 4071284.0, "repeat_count": 0.0, "routers_loss": 0.006303096655756235, "skip_count": 2.0, "step": 2526, "text_loss": 0.7948065996170044 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.868799530378633, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.000904781216630369, "loss": 0.0068, "macro_f1": 0.6601307392120361, "num_tokens": 4074750.0, "repeat_count": 1.0, "routers_loss": 0.01791904680430889, "skip_count": 2.0, "step": 2528, "text_loss": 0.809726357460022 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.878191957734076, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0576171875, "learning_rate": 0.0009045994424321602, "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4078617.0, "repeat_count": 2.0, "routers_loss": 0.016553178429603577, "skip_count": 2.0, "step": 2530, "text_loss": 0.8755000829696655 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.887584385089522, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061767578125, "learning_rate": 0.0009044175131906817, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 4080936.0, "repeat_count": 0.0, "routers_loss": 0.00884837657213211, "skip_count": 0.0, "step": 2532, "text_loss": 0.795871913433075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.896976812444967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.0009042354289756491, "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4084459.0, "repeat_count": 0.0, "routers_loss": 0.0024387789890170097, "skip_count": 0.0, "step": 2534, "text_loss": 0.18875400722026825 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0625, "learning_rate": 0.0009040531898568379, "loss": 0.0171, "macro_f1": 0.3333333432674408, "num_tokens": 4088464.0, "repeat_count": 0.0, "routers_loss": 0.00491489190608263, "skip_count": 0.0, "step": 2536, "text_loss": 0.334369033575058 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.091796875, "learning_rate": 0.000903870795904082, "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 4091659.0, "repeat_count": 0.0, "routers_loss": 0.004592662677168846, "skip_count": 2.0, "step": 2538, "text_loss": 0.21298295259475708 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 11.925154094511301, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0458984375, "learning_rate": 0.000903688247187275, "loss": 0.0137, "macro_f1": 0.5492662787437439, "num_tokens": 4095496.0, "repeat_count": 0.0, "routers_loss": 0.011647242121398449, "skip_count": 2.0, "step": 2540, "text_loss": 0.2985081672668457 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0009035055437763704, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4098663.0, "repeat_count": 0.0, "routers_loss": 0.0021238960325717926, "skip_count": 0.0, "step": 2542, "text_loss": 0.35359489917755127 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.94393894922219, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05859375, "learning_rate": 0.0009033226857413803, "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 4101588.0, "repeat_count": 1.0, "routers_loss": 0.0024701557122170925, "skip_count": 0.0, "step": 2544, "text_loss": 1.1577601432800293 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.000903139673152376, "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4104643.0, "repeat_count": 0.0, "routers_loss": 0.002499542199075222, "skip_count": 0.0, "step": 2546, "text_loss": 1.0173401832580566 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.962723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.0009029565060794885, "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 4109247.0, "repeat_count": 0.0, "routers_loss": 0.0034200598020106554, "skip_count": 0.0, "step": 2548, "text_loss": 0.5690504312515259 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.972116231288524, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06884765625, "learning_rate": 0.0009027731845929079, "loss": 0.0155, "macro_f1": 0.8823530077934265, "num_tokens": 4112597.0, "repeat_count": 1.0, "routers_loss": 0.015981333330273628, "skip_count": 1.0, "step": 2550, "text_loss": 0.294549822807312 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 11.981508658643968, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06103515625, "learning_rate": 0.0009025897087628829, "loss": 0.0064, "macro_f1": 0.5492662787437439, "num_tokens": 4115844.0, "repeat_count": 0.0, "routers_loss": 0.02606951631605625, "skip_count": 2.0, "step": 2552, "text_loss": 0.22692419588565826 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.0009024060786597222, "loss": 0.0202, "macro_f1": 0.3333333432674408, "num_tokens": 4118634.0, "repeat_count": 0.0, "routers_loss": 0.001026194542646408, "skip_count": 0.0, "step": 2554, "text_loss": 0.6807059645652771 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.000902222294353793, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4122024.0, "repeat_count": 0.0, "routers_loss": 0.001974924933165312, "skip_count": 0.0, "step": 2556, "text_loss": 0.7373668551445007 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.009392427355445, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04833984375, "learning_rate": 0.0009020383559155219, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 4124803.0, "repeat_count": 1.0, "routers_loss": 0.004662613850086927, "skip_count": 2.0, "step": 2558, "text_loss": 0.21808166801929474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.018784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.0009018542634153943, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 4127680.0, "repeat_count": 0.0, "routers_loss": 0.006881687790155411, "skip_count": 0.0, "step": 2560, "text_loss": 0.25192978978157043 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 12.028177282066334, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 0.0009016700169239551, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 4130431.0, "repeat_count": 1.0, "routers_loss": 0.005977808032184839, "skip_count": 1.0, "step": 2562, "text_loss": 0.4700816869735718 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009014856165118075, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 4133535.0, "repeat_count": 0.0, "routers_loss": 0.007005698047578335, "skip_count": 1.0, "step": 2564, "text_loss": 0.6558199524879456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.046962136777223, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030517578125, "learning_rate": 0.0009013010622496144, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4136534.0, "repeat_count": 0.0, "routers_loss": 0.007262171246111393, "skip_count": 0.0, "step": 2566, "text_loss": 0.2565421462059021 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 12.056354564132668, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.043212890625, "learning_rate": 0.0009011163542080971, "loss": 0.0088, "macro_f1": 0.5934640765190125, "num_tokens": 4139762.0, "repeat_count": 0.0, "routers_loss": 0.05431923270225525, "skip_count": 3.0, "step": 2568, "text_loss": 0.19896510243415833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.0009009314924580363, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4143398.0, "repeat_count": 0.0, "routers_loss": 0.003667369019240141, "skip_count": 0.0, "step": 2570, "text_loss": 0.6581419110298157 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.075139418843557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052978515625, "learning_rate": 0.0009007464770702712, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4146248.0, "repeat_count": 0.0, "routers_loss": 0.00132099783513695, "skip_count": 0.0, "step": 2572, "text_loss": 0.5316711068153381 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0009005613081157002, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4149455.0, "repeat_count": 0.0, "routers_loss": 0.0020061524119228125, "skip_count": 0.0, "step": 2574, "text_loss": 0.5400773882865906 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05517578125, "learning_rate": 0.0009003759856652802, "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4152774.0, "repeat_count": 0.0, "routers_loss": 0.002621434163302183, "skip_count": 1.0, "step": 2576, "text_loss": 0.3672606945037842 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051513671875, "learning_rate": 0.0009001905097900273, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4155835.0, "repeat_count": 0.0, "routers_loss": 0.005290219560265541, "skip_count": 0.0, "step": 2578, "text_loss": 0.8159038424491882 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0009000048805610161, "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 4158874.0, "repeat_count": 0.0, "routers_loss": 0.0013576085912063718, "skip_count": 0.0, "step": 2580, "text_loss": 0.5518951416015625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.00089981909804938, "loss": 0.0143, "macro_f1": 0.3333333432674408, "num_tokens": 4162076.0, "repeat_count": 0.0, "routers_loss": 0.0021483441814780235, "skip_count": 0.0, "step": 2582, "text_loss": 0.43552228808403015 }, { "acc_repeat": 1.0, "acc_skip": 0.25, "avg_layers": 28.0, "epoch": 12.131493982976226, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.068359375, "learning_rate": 0.0008996331623263114, "loss": 0.0117, "macro_f1": 0.7795917987823486, "num_tokens": 4165041.0, "repeat_count": 1.0, "routers_loss": 0.0544300302863121, "skip_count": 4.0, "step": 2584, "text_loss": 0.24812501668930054 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.140886410331671, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047607421875, "learning_rate": 0.0008994470734630611, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4168290.0, "repeat_count": 0.0, "routers_loss": 0.0017150711501017213, "skip_count": 0.0, "step": 2586, "text_loss": 0.6392097473144531 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0615234375, "learning_rate": 0.0008992608315309388, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4171310.0, "repeat_count": 0.0, "routers_loss": 0.0046473173424601555, "skip_count": 2.0, "step": 2588, "text_loss": 0.6534156799316406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.15967126504256, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06591796875, "learning_rate": 0.0008990744366013125, "loss": 0.0105, "macro_f1": 0.3144654333591461, "num_tokens": 4174042.0, "repeat_count": 2.0, "routers_loss": 0.060913100838661194, "skip_count": 1.0, "step": 2590, "text_loss": 0.5365690588951111 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 12.169063692398003, "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.055419921875, "learning_rate": 0.0008988878887456093, "loss": 0.0118, "macro_f1": 0.6051587462425232, "num_tokens": 4177666.0, "repeat_count": 1.0, "routers_loss": 0.06268956512212753, "skip_count": 4.0, "step": 2592, "text_loss": 0.226226806640625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.178456119753449, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008987011880353149, "loss": 0.0089, "macro_f1": 0.32098764181137085, "num_tokens": 4180490.0, "repeat_count": 0.0, "routers_loss": 0.030141465365886688, "skip_count": 2.0, "step": 2594, "text_loss": 0.2581401765346527 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 12.187848547108894, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.044677734375, "learning_rate": 0.0008985143345419729, "loss": 0.0082, "macro_f1": 0.5492662787437439, "num_tokens": 4183300.0, "repeat_count": 0.0, "routers_loss": 0.018745863810181618, "skip_count": 2.0, "step": 2596, "text_loss": 0.7778542637825012 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 12.197240974464338, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.064453125, "learning_rate": 0.0008983273283371862, "loss": 0.0096, "macro_f1": 0.5492662787437439, "num_tokens": 4186535.0, "repeat_count": 0.0, "routers_loss": 0.026792079210281372, "skip_count": 2.0, "step": 2598, "text_loss": 0.34700271487236023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008981401694926159, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4189082.0, "repeat_count": 0.0, "routers_loss": 0.001914160675369203, "skip_count": 0.0, "step": 2600, "text_loss": 0.6879339218139648 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.216025829175228, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0008979528580799815, "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 4192330.0, "repeat_count": 0.0, "routers_loss": 0.007978348061442375, "skip_count": 2.0, "step": 2602, "text_loss": 0.3524550497531891 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 12.225418256530672, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.0008977653941710613, "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4196117.0, "repeat_count": 2.0, "routers_loss": 0.0035376469604671, "skip_count": 0.0, "step": 2604, "text_loss": 0.42356348037719727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05810546875, "learning_rate": 0.0008975777778376916, "loss": 0.0156, "macro_f1": 0.6666666865348816, "num_tokens": 4200423.0, "repeat_count": 0.0, "routers_loss": 0.008262477815151215, "skip_count": 1.0, "step": 2606, "text_loss": 0.5272893905639648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.244203111241562, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0008973900091517675, "loss": 0.0114, "macro_f1": 0.3272727429866791, "num_tokens": 4203257.0, "repeat_count": 0.0, "routers_loss": 0.022957922890782356, "skip_count": 1.0, "step": 2608, "text_loss": 0.2713734805583954 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.253595538597006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043701171875, "learning_rate": 0.000897202088185242, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 4206243.0, "repeat_count": 0.0, "routers_loss": 0.006623407825827599, "skip_count": 2.0, "step": 2610, "text_loss": 0.5920525789260864 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.262987965952451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0008970140150101274, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4209264.0, "repeat_count": 0.0, "routers_loss": 0.0008602747693657875, "skip_count": 0.0, "step": 2612, "text_loss": 0.33421996235847473 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0008968257896984932, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 4212058.0, "repeat_count": 0.0, "routers_loss": 0.0024653903674334288, "skip_count": 1.0, "step": 2614, "text_loss": 0.37923356890678406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06298828125, "learning_rate": 0.0008966374123224677, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4214929.0, "repeat_count": 0.0, "routers_loss": 0.010878405533730984, "skip_count": 0.0, "step": 2616, "text_loss": 0.4350503981113434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.291165248018785, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0008964488829542376, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4219170.0, "repeat_count": 0.0, "routers_loss": 0.02864212542772293, "skip_count": 1.0, "step": 2618, "text_loss": 0.26250728964805603 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.300557675374229, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.062255859375, "learning_rate": 0.0008962602016660478, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4222077.0, "repeat_count": 0.0, "routers_loss": 0.010444172658026218, "skip_count": 2.0, "step": 2620, "text_loss": 0.4718937575817108 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.309950102729674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0478515625, "learning_rate": 0.0008960713685302011, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4225383.0, "repeat_count": 0.0, "routers_loss": 0.006409442983567715, "skip_count": 1.0, "step": 2622, "text_loss": 0.30420538783073425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.31934253008512, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.0008958823836190588, "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 4228349.0, "repeat_count": 0.0, "routers_loss": 0.009996986016631126, "skip_count": 1.0, "step": 2624, "text_loss": 0.5392362475395203 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0008956932470050404, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 4232007.0, "repeat_count": 0.0, "routers_loss": 0.0014383369125425816, "skip_count": 0.0, "step": 2626, "text_loss": 0.7112401127815247 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 12.338127384796008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.058349609375, "learning_rate": 0.0008955039587606233, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4235122.0, "repeat_count": 0.0, "routers_loss": 0.00781513936817646, "skip_count": 3.0, "step": 2628, "text_loss": 0.17802883684635162 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 12.347519812151454, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0439453125, "learning_rate": 0.0008953145189583429, "loss": 0.0126, "macro_f1": 0.542222261428833, "num_tokens": 4238248.0, "repeat_count": 0.0, "routers_loss": 0.062252625823020935, "skip_count": 4.0, "step": 2630, "text_loss": 0.5551572442054749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0008951249276707933, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4241042.0, "repeat_count": 0.0, "routers_loss": 0.0011421777307987213, "skip_count": 0.0, "step": 2632, "text_loss": 0.7092233896255493 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.366304666862343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.0008949351849706261, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4243939.0, "repeat_count": 0.0, "routers_loss": 0.0032689040526747704, "skip_count": 0.0, "step": 2634, "text_loss": 0.19925718009471893 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.375697094217786, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 0.0008947452909305509, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4247535.0, "repeat_count": 1.0, "routers_loss": 0.002066014800220728, "skip_count": 0.0, "step": 2636, "text_loss": 0.5249715447425842 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 12.385089521573232, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.09326171875, "learning_rate": 0.0008945552456233356, "loss": 0.0169, "macro_f1": 0.8820862174034119, "num_tokens": 4251441.0, "repeat_count": 2.0, "routers_loss": 0.029332537204027176, "skip_count": 2.0, "step": 2638, "text_loss": 0.19229578971862793 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.394481948928677, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.078125, "learning_rate": 0.0008943650491218058, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4254314.0, "repeat_count": 0.0, "routers_loss": 0.0075911120511591434, "skip_count": 0.0, "step": 2640, "text_loss": 0.27059751749038696 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.0008941747014988453, "loss": 0.0156, "macro_f1": 0.3333333432674408, "num_tokens": 4257442.0, "repeat_count": 0.0, "routers_loss": 0.009030844084918499, "skip_count": 0.0, "step": 2642, "text_loss": 0.36747801303863525 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.123046875, "learning_rate": 0.0008939842028273956, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4260386.0, "repeat_count": 0.0, "routers_loss": 0.007844001986086369, "skip_count": 1.0, "step": 2644, "text_loss": 0.6397647857666016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.422659230995011, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0008937935531804562, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4263516.0, "repeat_count": 0.0, "routers_loss": 0.0018789108144119382, "skip_count": 0.0, "step": 2646, "text_loss": 0.4795534908771515 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.432051658350455, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0008936027526310844, "loss": 0.0098, "macro_f1": 0.3272727429866791, "num_tokens": 4266744.0, "repeat_count": 0.0, "routers_loss": 0.0348590686917305, "skip_count": 1.0, "step": 2648, "text_loss": 0.27691999077796936 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07275390625, "learning_rate": 0.000893411801252395, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4269766.0, "repeat_count": 0.0, "routers_loss": 0.004543309565633535, "skip_count": 1.0, "step": 2650, "text_loss": 0.18867231905460358 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.0008932206991175615, "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 4273513.0, "repeat_count": 0.0, "routers_loss": 0.0035277456045150757, "skip_count": 1.0, "step": 2652, "text_loss": 0.45613357424736023 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.460228940416789, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008930294462998143, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4276878.0, "repeat_count": 1.0, "routers_loss": 0.011337592266499996, "skip_count": 0.0, "step": 2654, "text_loss": 0.24733254313468933 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0869140625, "learning_rate": 0.0008928380428724419, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4279915.0, "repeat_count": 0.0, "routers_loss": 0.0010295971296727657, "skip_count": 1.0, "step": 2656, "text_loss": 0.41722849011421204 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0008926464889087903, "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 4282888.0, "repeat_count": 0.0, "routers_loss": 0.0017198545392602682, "skip_count": 2.0, "step": 2658, "text_loss": 0.738322377204895 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.0008924547844822634, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4285805.0, "repeat_count": 0.0, "routers_loss": 0.001339946174994111, "skip_count": 0.0, "step": 2660, "text_loss": 0.4802379906177521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.497798649838568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05322265625, "learning_rate": 0.000892262929666323, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4290282.0, "repeat_count": 0.0, "routers_loss": 0.0022340165451169014, "skip_count": 0.0, "step": 2662, "text_loss": 0.6503544449806213 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008920709245344878, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4294106.0, "repeat_count": 0.0, "routers_loss": 0.005288850050419569, "skip_count": 1.0, "step": 2664, "text_loss": 0.12312037497758865 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.516583504549457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 0.0008918787691603347, "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 4298013.0, "repeat_count": 0.0, "routers_loss": 0.004259659443050623, "skip_count": 1.0, "step": 2666, "text_loss": 0.3070000112056732 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.000891686463617498, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4300799.0, "repeat_count": 0.0, "routers_loss": 0.009489355608820915, "skip_count": 1.0, "step": 2668, "text_loss": 0.18535588681697845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008914940079796696, "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4304641.0, "repeat_count": 0.0, "routers_loss": 0.0025417013093829155, "skip_count": 0.0, "step": 2670, "text_loss": 0.482585072517395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.544760786615791, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008913014023205988, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4307462.0, "repeat_count": 0.0, "routers_loss": 0.006371749565005302, "skip_count": 0.0, "step": 2672, "text_loss": 0.7064456939697266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008911086467140925, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4310396.0, "repeat_count": 0.0, "routers_loss": 0.0027512952219694853, "skip_count": 0.0, "step": 2674, "text_loss": 0.23532851040363312 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05712890625, "learning_rate": 0.000890915741234015, "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 4314781.0, "repeat_count": 0.0, "routers_loss": 0.008253013715147972, "skip_count": 1.0, "step": 2676, "text_loss": 0.30950358510017395 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.572938068682125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0008907226859542879, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4317988.0, "repeat_count": 0.0, "routers_loss": 0.005409995559602976, "skip_count": 2.0, "step": 2678, "text_loss": 0.4930732846260071 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 12.582330496037569, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.060546875, "learning_rate": 0.0008905294809488907, "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 4321014.0, "repeat_count": 1.0, "routers_loss": 0.0029942214023321867, "skip_count": 1.0, "step": 2680, "text_loss": 0.6224040389060974 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.0008903361262918595, "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 4324268.0, "repeat_count": 0.0, "routers_loss": 0.008411120623350143, "skip_count": 1.0, "step": 2682, "text_loss": 0.16296671330928802 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.60111535074846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.05126953125, "learning_rate": 0.0008901426220572884, "loss": 0.0138, "macro_f1": 1.0, "num_tokens": 4327494.0, "repeat_count": 2.0, "routers_loss": 0.01039006095379591, "skip_count": 4.0, "step": 2684, "text_loss": 0.43866512179374695 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.610507778103903, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060791015625, "learning_rate": 0.0008899489683193286, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4330936.0, "repeat_count": 0.0, "routers_loss": 0.0009329111780971289, "skip_count": 0.0, "step": 2686, "text_loss": 0.44250962138175964 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.619900205459349, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07421875, "learning_rate": 0.0008897551651521885, "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 4334123.0, "repeat_count": 0.0, "routers_loss": 0.003197216661646962, "skip_count": 0.0, "step": 2688, "text_loss": 0.48313501477241516 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.629292632814794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09716796875, "learning_rate": 0.0008895612126301339, "loss": 0.0157, "macro_f1": 0.3333333432674408, "num_tokens": 4337610.0, "repeat_count": 0.0, "routers_loss": 0.0033548236824572086, "skip_count": 0.0, "step": 2690, "text_loss": 0.4715327322483063 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.638685060170237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051513671875, "learning_rate": 0.0008893671108274877, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4341026.0, "repeat_count": 0.0, "routers_loss": 0.0024757643695920706, "skip_count": 0.0, "step": 2692, "text_loss": 0.43402785062789917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0008891728598186302, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 4344422.0, "repeat_count": 0.0, "routers_loss": 0.003317243419587612, "skip_count": 0.0, "step": 2694, "text_loss": 0.8498559594154358 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 12.657469914881126, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0400390625, "learning_rate": 0.0008889784596779986, "loss": 0.009, "macro_f1": 0.5934640765190125, "num_tokens": 4347507.0, "repeat_count": 0.0, "routers_loss": 0.01577926240861416, "skip_count": 3.0, "step": 2696, "text_loss": 0.5646669864654541 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.666862342236572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11328125, "learning_rate": 0.0008887839104800876, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4350414.0, "repeat_count": 0.0, "routers_loss": 0.002953822258859873, "skip_count": 0.0, "step": 2698, "text_loss": 0.5145012140274048 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.0008885892122994486, "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4354110.0, "repeat_count": 0.0, "routers_loss": 0.005849295295774937, "skip_count": 0.0, "step": 2700, "text_loss": 0.580982506275177 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.68564719694746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008883943652106903, "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 4357323.0, "repeat_count": 1.0, "routers_loss": 0.012347398325800896, "skip_count": 2.0, "step": 2702, "text_loss": 0.2234988808631897 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.695039624302906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0673828125, "learning_rate": 0.0008881993692884787, "loss": 0.0128, "macro_f1": 0.6666666865348816, "num_tokens": 4360228.0, "repeat_count": 0.0, "routers_loss": 0.003574999049305916, "skip_count": 1.0, "step": 2704, "text_loss": 0.4261806607246399 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.704432051658351, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008880042246075365, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4363905.0, "repeat_count": 0.0, "routers_loss": 0.0031574300955981016, "skip_count": 0.0, "step": 2706, "text_loss": 0.691118061542511 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008878089312426433, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4366736.0, "repeat_count": 0.0, "routers_loss": 0.003195564029738307, "skip_count": 0.0, "step": 2708, "text_loss": 0.613926112651825 }, { "acc_repeat": 0.0, "acc_skip": 0.6000000238418579, "avg_layers": 25.0, "epoch": 12.72321690636924, "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.75, "grad_norm": 0.054443359375, "learning_rate": 0.0008876134892686363, "loss": 0.011, "macro_f1": 0.5694444179534912, "num_tokens": 4370146.0, "repeat_count": 0.0, "routers_loss": 0.038784291595220566, "skip_count": 5.0, "step": 2710, "text_loss": 0.2723451852798462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.000887417898760409, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 4373653.0, "repeat_count": 0.0, "routers_loss": 0.0006457131239585578, "skip_count": 0.0, "step": 2712, "text_loss": 0.31667640805244446 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.742001761080129, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10498046875, "learning_rate": 0.000887222159792912, "loss": 0.0155, "macro_f1": 0.6603773832321167, "num_tokens": 4376993.0, "repeat_count": 1.0, "routers_loss": 0.045078590512275696, "skip_count": 1.0, "step": 2714, "text_loss": 0.5872798562049866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.751394188435574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0008870262724411528, "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4380160.0, "repeat_count": 0.0, "routers_loss": 0.003628545207902789, "skip_count": 0.0, "step": 2716, "text_loss": 0.7468157410621643 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 12.760786615791018, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11181640625, "learning_rate": 0.0008868302367801962, "loss": 0.0118, "macro_f1": 0.6598639488220215, "num_tokens": 4383100.0, "repeat_count": 1.0, "routers_loss": 0.05404464527964592, "skip_count": 3.0, "step": 2718, "text_loss": 0.2970244884490967 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0008866340528851629, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4386700.0, "repeat_count": 0.0, "routers_loss": 0.007000274024903774, "skip_count": 0.0, "step": 2720, "text_loss": 0.34521186351776123 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 12.779571470501908, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.052978515625, "learning_rate": 0.0008864377208312313, "loss": 0.0082, "macro_f1": 0.8823530077934265, "num_tokens": 4390299.0, "repeat_count": 1.0, "routers_loss": 0.02025366574525833, "skip_count": 2.0, "step": 2722, "text_loss": 1.0536936521530151 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.788963897857352, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.000886241240693636, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 4393353.0, "repeat_count": 0.0, "routers_loss": 0.00251673418097198, "skip_count": 0.0, "step": 2724, "text_loss": 0.5678093433380127 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.798356325212797, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0008860446125476686, "loss": 0.0135, "macro_f1": 0.6666666865348816, "num_tokens": 4396446.0, "repeat_count": 1.0, "routers_loss": 0.009532532654702663, "skip_count": 0.0, "step": 2726, "text_loss": 0.23775041103363037 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.807748752568243, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.091796875, "learning_rate": 0.0008858478364686776, "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 4399977.0, "repeat_count": 1.0, "routers_loss": 0.008062181062996387, "skip_count": 0.0, "step": 2728, "text_loss": 0.18888695538043976 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.817141179923686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0008856509125320678, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4404406.0, "repeat_count": 0.0, "routers_loss": 0.0007731119985692203, "skip_count": 0.0, "step": 2730, "text_loss": 0.47331541776657104 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0498046875, "learning_rate": 0.0008854538408133006, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 4407165.0, "repeat_count": 0.0, "routers_loss": 0.003115242812782526, "skip_count": 1.0, "step": 2732, "text_loss": 0.491370290517807 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0008852566213878947, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4410101.0, "repeat_count": 0.0, "routers_loss": 0.0008958528051152825, "skip_count": 0.0, "step": 2734, "text_loss": 0.42188262939453125 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 12.84531846199002, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07763671875, "learning_rate": 0.0008850592543314246, "loss": 0.0118, "macro_f1": 1.0, "num_tokens": 4413015.0, "repeat_count": 1.0, "routers_loss": 0.01139112375676632, "skip_count": 1.0, "step": 2736, "text_loss": 0.4716498553752899 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.854710889345466, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0008848617397195218, "loss": 0.0084, "macro_f1": 0.6603773832321167, "num_tokens": 4416404.0, "repeat_count": 1.0, "routers_loss": 0.01609630137681961, "skip_count": 1.0, "step": 2738, "text_loss": 0.19490821659564972 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0008846640776278745, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 4419408.0, "repeat_count": 0.0, "routers_loss": 0.001489170710556209, "skip_count": 0.0, "step": 2740, "text_loss": 0.6443108320236206 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.873495744056354, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0008844662681322269, "loss": 0.0144, "macro_f1": 0.6666666865348816, "num_tokens": 4422067.0, "repeat_count": 1.0, "routers_loss": 0.0014755792217329144, "skip_count": 0.0, "step": 2742, "text_loss": 0.9150356650352478 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0008842683113083801, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 4425647.0, "repeat_count": 0.0, "routers_loss": 0.008962674997746944, "skip_count": 1.0, "step": 2744, "text_loss": 0.7103227972984314 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 12.892280598767243, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0008840702072321915, "loss": 0.0104, "macro_f1": 0.6598639488220215, "num_tokens": 4428855.0, "repeat_count": 1.0, "routers_loss": 0.02554207295179367, "skip_count": 3.0, "step": 2746, "text_loss": 0.27141591906547546 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.901673026122689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0008838719559795751, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4432838.0, "repeat_count": 0.0, "routers_loss": 0.0011747616808861494, "skip_count": 0.0, "step": 2748, "text_loss": 0.4007738530635834 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 12.911065453478134, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03466796875, "learning_rate": 0.0008836735576265009, "loss": 0.0073, "macro_f1": 0.5492662787437439, "num_tokens": 4435793.0, "repeat_count": 0.0, "routers_loss": 0.017564335837960243, "skip_count": 2.0, "step": 2750, "text_loss": 0.5972410440444946 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.920457880833577, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.044921875, "learning_rate": 0.0008834750122489956, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 4438871.0, "repeat_count": 1.0, "routers_loss": 0.007004009559750557, "skip_count": 0.0, "step": 2752, "text_loss": 0.2294853925704956 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0008832763199231423, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4441846.0, "repeat_count": 0.0, "routers_loss": 0.0014562139986082911, "skip_count": 0.0, "step": 2754, "text_loss": 0.722432017326355 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.939242735544468, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0751953125, "learning_rate": 0.0008830774807250802, "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 4444786.0, "repeat_count": 1.0, "routers_loss": 0.024773593991994858, "skip_count": 0.0, "step": 2756, "text_loss": 0.507905125617981 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 12.948635162899912, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.049072265625, "learning_rate": 0.0008828784947310049, "loss": 0.0129, "macro_f1": 0.8823530077934265, "num_tokens": 4448442.0, "repeat_count": 1.0, "routers_loss": 0.04959975928068161, "skip_count": 2.0, "step": 2758, "text_loss": 0.3617522418498993 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.958027590255357, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1025390625, "learning_rate": 0.000882679362017168, "loss": 0.0149, "macro_f1": 1.0, "num_tokens": 4451401.0, "repeat_count": 1.0, "routers_loss": 0.005783245898783207, "skip_count": 2.0, "step": 2760, "text_loss": 0.49187400937080383 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0008824800826598778, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 4454537.0, "repeat_count": 0.0, "routers_loss": 0.00656260596588254, "skip_count": 0.0, "step": 2762, "text_loss": 0.6823583245277405 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.976812444966246, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0546875, "learning_rate": 0.0008822806567354983, "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4457706.0, "repeat_count": 1.0, "routers_loss": 0.005298966076225042, "skip_count": 0.0, "step": 2764, "text_loss": 0.554322361946106 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.986204872321691, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046630859375, "learning_rate": 0.0008820810843204501, "loss": 0.0096, "macro_f1": 0.3272727429866791, "num_tokens": 4460710.0, "repeat_count": 0.0, "routers_loss": 0.03164982795715332, "skip_count": 1.0, "step": 2766, "text_loss": 0.1656961441040039 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.995597299677135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.072265625, "learning_rate": 0.0008818813654912095, "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4464001.0, "repeat_count": 0.0, "routers_loss": 0.000715116853825748, "skip_count": 0.0, "step": 2768, "text_loss": 0.5818144083023071 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.004696213677722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056396484375, "learning_rate": 0.0008816815003243093, "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 4467364.0, "repeat_count": 0.0, "routers_loss": 0.002851625671610236, "skip_count": 0.0, "step": 2770, "text_loss": 0.6068631410598755 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0008814814888963383, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4470681.0, "repeat_count": 0.0, "routers_loss": 0.004729873035103083, "skip_count": 1.0, "step": 2772, "text_loss": 0.5386646389961243 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.000881281331283941, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4473734.0, "repeat_count": 0.0, "routers_loss": 0.0031853127293288708, "skip_count": 1.0, "step": 2774, "text_loss": 0.5695263147354126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008810810275638182, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4478404.0, "repeat_count": 0.0, "routers_loss": 0.0008977465913631022, "skip_count": 0.0, "step": 2776, "text_loss": 0.4750773310661316 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.042265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0008808805778127269, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4481287.0, "repeat_count": 0.0, "routers_loss": 0.00469845999032259, "skip_count": 0.0, "step": 2778, "text_loss": 0.14078612625598907 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 13.051658350454945, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.049560546875, "learning_rate": 0.0008806799821074796, "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 4483929.0, "repeat_count": 0.0, "routers_loss": 0.01789761893451214, "skip_count": 2.0, "step": 2780, "text_loss": 0.2167191207408905 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056396484375, "learning_rate": 0.0008804792405249451, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 4487468.0, "repeat_count": 0.0, "routers_loss": 0.001018838956952095, "skip_count": 0.0, "step": 2782, "text_loss": 0.5424665212631226 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 13.070443205165835, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.07373046875, "learning_rate": 0.000880278353142048, "loss": 0.0077, "macro_f1": 0.8200000524520874, "num_tokens": 4490942.0, "repeat_count": 1.0, "routers_loss": 0.03260354697704315, "skip_count": 3.0, "step": 2784, "text_loss": 0.20994654297828674 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.079835632521279, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05322265625, "learning_rate": 0.0008800773200357683, "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4493986.0, "repeat_count": 0.0, "routers_loss": 0.003019835101440549, "skip_count": 0.0, "step": 2786, "text_loss": 0.5709528923034668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0008798761412831429, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4498232.0, "repeat_count": 0.0, "routers_loss": 0.00285192858427763, "skip_count": 0.0, "step": 2788, "text_loss": 0.5103896260261536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044921875, "learning_rate": 0.0008796748169612634, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4501231.0, "repeat_count": 0.0, "routers_loss": 0.0012469831854104996, "skip_count": 0.0, "step": 2790, "text_loss": 0.43669697642326355 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.108012914587613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.0008794733471472778, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4504208.0, "repeat_count": 0.0, "routers_loss": 0.011512776836752892, "skip_count": 1.0, "step": 2792, "text_loss": 0.2299770563840866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.117405341943059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0008792717319183899, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4507013.0, "repeat_count": 0.0, "routers_loss": 0.00834917277097702, "skip_count": 0.0, "step": 2794, "text_loss": 0.2130603939294815 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.126797769298504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0008790699713518587, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 4510286.0, "repeat_count": 0.0, "routers_loss": 0.008616939187049866, "skip_count": 2.0, "step": 2796, "text_loss": 0.4377101957798004 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.0008788680655249994, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4513762.0, "repeat_count": 0.0, "routers_loss": 0.003408568911254406, "skip_count": 0.0, "step": 2798, "text_loss": 0.435138463973999 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 13.145582624009393, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0008786660145151826, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4516696.0, "repeat_count": 1.0, "routers_loss": 0.0029398901388049126, "skip_count": 0.0, "step": 2800, "text_loss": 0.3195655047893524 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0008784638183998348, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4519760.0, "repeat_count": 0.0, "routers_loss": 0.0013777425047010183, "skip_count": 0.0, "step": 2802, "text_loss": 0.8129430413246155 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.164367478720282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0008782614772564379, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4522106.0, "repeat_count": 0.0, "routers_loss": 0.0031694830395281315, "skip_count": 0.0, "step": 2804, "text_loss": 0.18083660304546356 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0008780589911625293, "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4525743.0, "repeat_count": 0.0, "routers_loss": 0.002161208540201187, "skip_count": 0.0, "step": 2806, "text_loss": 0.8228182792663574 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07177734375, "learning_rate": 0.0008778563601957021, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 4529573.0, "repeat_count": 0.0, "routers_loss": 0.0028444856870919466, "skip_count": 1.0, "step": 2808, "text_loss": 0.3715563118457794 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.192544760786616, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0008776535844336049, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4532452.0, "repeat_count": 0.0, "routers_loss": 0.003807213855907321, "skip_count": 0.0, "step": 2810, "text_loss": 0.6012523174285889 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.201937188142061, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0008774506639539417, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4536077.0, "repeat_count": 0.0, "routers_loss": 0.006698979996144772, "skip_count": 0.0, "step": 2812, "text_loss": 0.27097949385643005 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.099609375, "learning_rate": 0.0008772475988344722, "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 4539057.0, "repeat_count": 0.0, "routers_loss": 0.004849409218877554, "skip_count": 1.0, "step": 2814, "text_loss": 1.026973843574524 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 13.22072204285295, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.041748046875, "learning_rate": 0.0008770443891530109, "loss": 0.0115, "macro_f1": 0.5934640765190125, "num_tokens": 4542253.0, "repeat_count": 0.0, "routers_loss": 0.019148651510477066, "skip_count": 3.0, "step": 2816, "text_loss": 0.2717585563659668 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.230114470208395, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.052490234375, "learning_rate": 0.0008768410349874286, "loss": 0.0098, "macro_f1": 0.6601307392120361, "num_tokens": 4545047.0, "repeat_count": 1.0, "routers_loss": 0.02231316640973091, "skip_count": 2.0, "step": 2818, "text_loss": 0.274346262216568 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043212890625, "learning_rate": 0.0008766375364156508, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4548371.0, "repeat_count": 0.0, "routers_loss": 0.008014129474759102, "skip_count": 2.0, "step": 2820, "text_loss": 0.22850871086120605 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.248899324919284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.0008764338935156586, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4551276.0, "repeat_count": 0.0, "routers_loss": 0.0014544493751600385, "skip_count": 0.0, "step": 2822, "text_loss": 0.6308462023735046 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 13.258291752274728, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.000876230106365488, "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 4554143.0, "repeat_count": 0.0, "routers_loss": 0.00818584579974413, "skip_count": 3.0, "step": 2824, "text_loss": 0.3484207093715668 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 13.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0008760261750432312, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 4557256.0, "repeat_count": 0.0, "routers_loss": 0.006275608204305172, "skip_count": 3.0, "step": 2826, "text_loss": 0.1927330046892166 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.277076606985618, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 0.0008758220996270348, "loss": 0.0103, "macro_f1": 1.0, "num_tokens": 4560202.0, "repeat_count": 2.0, "routers_loss": 0.0055974251590669155, "skip_count": 2.0, "step": 2828, "text_loss": 0.7796496748924255 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.286469034341062, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046142578125, "learning_rate": 0.0008756178801951007, "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 4563508.0, "repeat_count": 0.0, "routers_loss": 0.0019799957517534494, "skip_count": 0.0, "step": 2830, "text_loss": 0.49633297324180603 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0458984375, "learning_rate": 0.0008754135168256865, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4566776.0, "repeat_count": 0.0, "routers_loss": 0.004538947716355324, "skip_count": 0.0, "step": 2832, "text_loss": 0.5346745252609253 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.305253889051952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0008752090095971044, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4569787.0, "repeat_count": 0.0, "routers_loss": 0.001663343166001141, "skip_count": 0.0, "step": 2834, "text_loss": 0.5524004697799683 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.314646316407396, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.000875004358587722, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 4572813.0, "repeat_count": 0.0, "routers_loss": 0.0022988212294876575, "skip_count": 0.0, "step": 2836, "text_loss": 0.4232870042324066 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.324038743762841, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.000874799563875962, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4575563.0, "repeat_count": 0.0, "routers_loss": 0.007781553082168102, "skip_count": 1.0, "step": 2838, "text_loss": 0.19239822030067444 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 13.333431171118287, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03515625, "learning_rate": 0.0008745946255403021, "loss": 0.0072, "macro_f1": 0.5492662787437439, "num_tokens": 4578117.0, "repeat_count": 0.0, "routers_loss": 0.01872488670051098, "skip_count": 2.0, "step": 2840, "text_loss": 0.2148810178041458 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.34282359847373, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0008743895436592749, "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 4582330.0, "repeat_count": 1.0, "routers_loss": 0.005634195636957884, "skip_count": 1.0, "step": 2842, "text_loss": 0.4929640591144562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048583984375, "learning_rate": 0.0008741843183114685, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4585765.0, "repeat_count": 0.0, "routers_loss": 0.0008928569150157273, "skip_count": 0.0, "step": 2844, "text_loss": 0.32702967524528503 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 13.361608453184619, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 0.0008739789495755253, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4589000.0, "repeat_count": 0.0, "routers_loss": 0.014715569093823433, "skip_count": 4.0, "step": 2846, "text_loss": 0.25125816464424133 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.371000880540064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.0008737734375301433, "loss": 0.0135, "macro_f1": 0.3333333432674408, "num_tokens": 4592391.0, "repeat_count": 0.0, "routers_loss": 0.0017551190685480833, "skip_count": 0.0, "step": 2848, "text_loss": 0.6595172882080078 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0008735677822540749, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 4596662.0, "repeat_count": 0.0, "routers_loss": 0.0006456313421949744, "skip_count": 0.0, "step": 2850, "text_loss": 0.6290773153305054 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0008733619838261276, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 4599682.0, "repeat_count": 0.0, "routers_loss": 0.00765060493722558, "skip_count": 2.0, "step": 2852, "text_loss": 0.3268161416053772 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.399178162606399, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.0008731560423251637, "loss": 0.01, "macro_f1": 1.0, "num_tokens": 4603324.0, "repeat_count": 1.0, "routers_loss": 0.01161442045122385, "skip_count": 2.0, "step": 2854, "text_loss": 0.3029932975769043 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 25.0, "epoch": 13.408570589961844, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, "grad_norm": 0.0419921875, "learning_rate": 0.0008729499578301005, "loss": 0.0098, "macro_f1": 0.9555556178092957, "num_tokens": 4606975.0, "repeat_count": 1.0, "routers_loss": 0.02055389992892742, "skip_count": 5.0, "step": 2856, "text_loss": 0.6268532872200012 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05078125, "learning_rate": 0.00087274373041991, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4609629.0, "repeat_count": 0.0, "routers_loss": 0.0013911726418882608, "skip_count": 0.0, "step": 2858, "text_loss": 0.534355640411377 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 13.427355444672733, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.053955078125, "learning_rate": 0.0008725373601736188, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 4612913.0, "repeat_count": 2.0, "routers_loss": 0.01010701060295105, "skip_count": 0.0, "step": 2860, "text_loss": 0.3391380310058594 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0008723308471703085, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4616718.0, "repeat_count": 0.0, "routers_loss": 0.005969462916254997, "skip_count": 1.0, "step": 2862, "text_loss": 0.47250816226005554 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.446140299383622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046630859375, "learning_rate": 0.0008721241914891152, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4619680.0, "repeat_count": 0.0, "routers_loss": 0.0027780034579336643, "skip_count": 0.0, "step": 2864, "text_loss": 0.3249278664588928 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.455532726739067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0439453125, "learning_rate": 0.0008719173932092295, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 4622700.0, "repeat_count": 0.0, "routers_loss": 0.0015912104863673449, "skip_count": 0.0, "step": 2866, "text_loss": 0.7789985537528992 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05126953125, "learning_rate": 0.0008717104524098973, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4626637.0, "repeat_count": 0.0, "routers_loss": 0.0036539011634886265, "skip_count": 0.0, "step": 2868, "text_loss": 0.619088351726532 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10400390625, "learning_rate": 0.0008715033691704187, "loss": 0.0118, "macro_f1": 0.6666666865348816, "num_tokens": 4629863.0, "repeat_count": 0.0, "routers_loss": 0.008402476087212563, "skip_count": 1.0, "step": 2870, "text_loss": 0.5550018548965454 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.483710008805401, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0008712961435701479, "loss": 0.0161, "macro_f1": 0.6666666865348816, "num_tokens": 4632657.0, "repeat_count": 0.0, "routers_loss": 0.01400839351117611, "skip_count": 1.0, "step": 2872, "text_loss": 0.17368625104427338 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.493102436160845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008710887756884947, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4635885.0, "repeat_count": 0.0, "routers_loss": 0.0014573842054232955, "skip_count": 0.0, "step": 2874, "text_loss": 0.5138643383979797 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008708812656049225, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 4639341.0, "repeat_count": 0.0, "routers_loss": 0.002810224425047636, "skip_count": 1.0, "step": 2876, "text_loss": 0.70310378074646 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 13.511887290871735, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.03564453125, "learning_rate": 0.0008706736133989497, "loss": 0.0105, "macro_f1": 0.9449735879898071, "num_tokens": 4642163.0, "repeat_count": 2.0, "routers_loss": 0.029783209785819054, "skip_count": 4.0, "step": 2878, "text_loss": 0.26898008584976196 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008704658191501491, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4645858.0, "repeat_count": 0.0, "routers_loss": 0.0009193966398015618, "skip_count": 0.0, "step": 2880, "text_loss": 0.6047570705413818 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 13.530672145582624, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0008702578829381475, "loss": 0.0131, "macro_f1": 0.8814815282821655, "num_tokens": 4649237.0, "repeat_count": 2.0, "routers_loss": 0.05698608607053757, "skip_count": 4.0, "step": 2882, "text_loss": 0.10695219784975052 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0008700498048426269, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4652362.0, "repeat_count": 0.0, "routers_loss": 0.0011786938412114978, "skip_count": 0.0, "step": 2884, "text_loss": 0.4442957937717438 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 13.549457000293513, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.046142578125, "learning_rate": 0.0008698415849433229, "loss": 0.0092, "macro_f1": 0.5492662787437439, "num_tokens": 4655616.0, "repeat_count": 2.0, "routers_loss": 0.02142646163702011, "skip_count": 0.0, "step": 2886, "text_loss": 0.5820964574813843 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0008696332233200262, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4659294.0, "repeat_count": 0.0, "routers_loss": 0.004038636106997728, "skip_count": 0.0, "step": 2888, "text_loss": 0.11847645789384842 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 0.0008694247200525806, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4662512.0, "repeat_count": 0.0, "routers_loss": 0.0013256469974294305, "skip_count": 0.0, "step": 2890, "text_loss": 0.4873582720756531 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.577634282359847, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008692160752208856, "loss": 0.0129, "macro_f1": 0.3272727429866791, "num_tokens": 4666190.0, "repeat_count": 0.0, "routers_loss": 0.04477972164750099, "skip_count": 1.0, "step": 2892, "text_loss": 0.44243401288986206 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.587026709715293, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09521484375, "learning_rate": 0.0008690072889048941, "loss": 0.0127, "macro_f1": 1.0, "num_tokens": 4668884.0, "repeat_count": 1.0, "routers_loss": 0.004407547414302826, "skip_count": 2.0, "step": 2894, "text_loss": 0.6847127079963684 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008687983611846133, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4672093.0, "repeat_count": 0.0, "routers_loss": 0.005245382897555828, "skip_count": 1.0, "step": 2896, "text_loss": 0.25583332777023315 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.605811564426181, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0458984375, "learning_rate": 0.0008685892921401049, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4674917.0, "repeat_count": 0.0, "routers_loss": 0.0010470855049788952, "skip_count": 0.0, "step": 2898, "text_loss": 0.41998377442359924 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0008683800818514844, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4677739.0, "repeat_count": 0.0, "routers_loss": 0.009026622399687767, "skip_count": 2.0, "step": 2900, "text_loss": 0.303053081035614 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09619140625, "learning_rate": 0.0008681707303989215, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4680721.0, "repeat_count": 0.0, "routers_loss": 0.004500916693359613, "skip_count": 0.0, "step": 2902, "text_loss": 0.5573288798332214 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.633988846492516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.0008679612378626404, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 4683339.0, "repeat_count": 0.0, "routers_loss": 0.005047840531915426, "skip_count": 1.0, "step": 2904, "text_loss": 0.321353554725647 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.643381273847961, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0008677516043229187, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4686453.0, "repeat_count": 0.0, "routers_loss": 0.010256914421916008, "skip_count": 1.0, "step": 2906, "text_loss": 0.4300784468650818 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 13.652773701203404, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.0008675418298600883, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4689645.0, "repeat_count": 1.0, "routers_loss": 0.0022669637110084295, "skip_count": 0.0, "step": 2908, "text_loss": 0.5064885020256042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.66216612855885, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008673319145545358, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4692320.0, "repeat_count": 0.0, "routers_loss": 0.0011188550852239132, "skip_count": 0.0, "step": 2910, "text_loss": 0.7114819884300232 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.671558555914293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0008671218584867003, "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4695116.0, "repeat_count": 0.0, "routers_loss": 0.002966561820358038, "skip_count": 2.0, "step": 2912, "text_loss": 0.5662392973899841 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047607421875, "learning_rate": 0.0008669116617370762, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4698040.0, "repeat_count": 0.0, "routers_loss": 0.0012894890969619155, "skip_count": 0.0, "step": 2914, "text_loss": 0.718977689743042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.690343410625184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0008667013243862111, "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4700963.0, "repeat_count": 0.0, "routers_loss": 0.0007232456118799746, "skip_count": 0.0, "step": 2916, "text_loss": 0.3447718024253845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.699735837980628, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.000866490846514707, "loss": 0.0075, "macro_f1": 0.3272727429866791, "num_tokens": 4704471.0, "repeat_count": 1.0, "routers_loss": 0.015166680328547955, "skip_count": 0.0, "step": 2918, "text_loss": 0.454946368932724 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.709128265336073, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04736328125, "learning_rate": 0.000866280228203219, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 4707238.0, "repeat_count": 1.0, "routers_loss": 0.0061312485486269, "skip_count": 1.0, "step": 2920, "text_loss": 0.721788227558136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.718520692691518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008660694695324564, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4711323.0, "repeat_count": 0.0, "routers_loss": 0.00169933564029634, "skip_count": 0.0, "step": 2922, "text_loss": 0.7562121748924255 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.727913120046962, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0008658585705831829, "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 4714417.0, "repeat_count": 0.0, "routers_loss": 0.0022731393110007048, "skip_count": 0.0, "step": 2924, "text_loss": 0.5726147890090942 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.737305547402407, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0008656475314362148, "loss": 0.0131, "macro_f1": 0.8817967176437378, "num_tokens": 4717445.0, "repeat_count": 2.0, "routers_loss": 0.06477782875299454, "skip_count": 3.0, "step": 2926, "text_loss": 0.4505867660045624 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 13.74669797475785, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.06396484375, "learning_rate": 0.0008654363521724229, "loss": 0.0129, "macro_f1": 0.9449735879898071, "num_tokens": 4722253.0, "repeat_count": 2.0, "routers_loss": 0.027405790984630585, "skip_count": 4.0, "step": 2928, "text_loss": 0.24767601490020752 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0537109375, "learning_rate": 0.0008652250328727315, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4725465.0, "repeat_count": 0.0, "routers_loss": 0.006544729229062796, "skip_count": 2.0, "step": 2930, "text_loss": 0.4478724002838135 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 13.765482829468741, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0008650135736181184, "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4729213.0, "repeat_count": 1.0, "routers_loss": 0.0055119614116847515, "skip_count": 0.0, "step": 2932, "text_loss": 0.6749323010444641 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.774875256824185, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045166015625, "learning_rate": 0.0008648019744896154, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4732280.0, "repeat_count": 0.0, "routers_loss": 0.008374541997909546, "skip_count": 0.0, "step": 2934, "text_loss": 0.4647359251976013 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 13.78426768417963, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06201171875, "learning_rate": 0.0008645902355683077, "loss": 0.0091, "macro_f1": 0.6595745086669922, "num_tokens": 4736244.0, "repeat_count": 1.0, "routers_loss": 0.068686343729496, "skip_count": 4.0, "step": 2936, "text_loss": 0.5356017351150513 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 13.793660111535075, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0008643783569353339, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4739810.0, "repeat_count": 2.0, "routers_loss": 0.017954571172595024, "skip_count": 0.0, "step": 2938, "text_loss": 0.3145926296710968 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.803052538890519, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.054443359375, "learning_rate": 0.0008641663386718863, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4742720.0, "repeat_count": 0.0, "routers_loss": 0.006261351052671671, "skip_count": 1.0, "step": 2940, "text_loss": 0.3200613856315613 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.812444966245964, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008639541808592109, "loss": 0.0093, "macro_f1": 1.0, "num_tokens": 4745870.0, "repeat_count": 1.0, "routers_loss": 0.0025341357104480267, "skip_count": 1.0, "step": 2942, "text_loss": 0.5020416378974915 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0008637418835786067, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4748943.0, "repeat_count": 0.0, "routers_loss": 0.008970048278570175, "skip_count": 2.0, "step": 2944, "text_loss": 0.14517110586166382 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008635294469114265, "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4751360.0, "repeat_count": 0.0, "routers_loss": 0.002133632078766823, "skip_count": 0.0, "step": 2946, "text_loss": 0.5367856025695801 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.840622248312298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.0008633168709390766, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4754403.0, "repeat_count": 0.0, "routers_loss": 0.0011866620043292642, "skip_count": 0.0, "step": 2948, "text_loss": 0.38302522897720337 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 13.850014675667744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0008631041557430163, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4757867.0, "repeat_count": 2.0, "routers_loss": 0.0026854004245251417, "skip_count": 0.0, "step": 2950, "text_loss": 0.43433454632759094 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05859375, "learning_rate": 0.0008628913014047585, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 4761171.0, "repeat_count": 0.0, "routers_loss": 0.002433479530736804, "skip_count": 0.0, "step": 2952, "text_loss": 0.4725971519947052 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.868799530378633, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0008626783080058696, "loss": 0.0066, "macro_f1": 0.3272727429866791, "num_tokens": 4764752.0, "repeat_count": 1.0, "routers_loss": 0.017182493582367897, "skip_count": 0.0, "step": 2954, "text_loss": 0.460641473531723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.878191957734076, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12353515625, "learning_rate": 0.0008624651756279687, "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 4767453.0, "repeat_count": 0.0, "routers_loss": 0.0018134774873033166, "skip_count": 0.0, "step": 2956, "text_loss": 0.4091459810733795 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.887584385089522, "f1_execute": 0.9777777791023254, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.053466796875, "learning_rate": 0.000862251904352729, "loss": 0.0108, "macro_f1": 0.9259259104728699, "num_tokens": 4771110.0, "repeat_count": 3.0, "routers_loss": 0.0365753099322319, "skip_count": 3.0, "step": 2958, "text_loss": 0.22408585250377655 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.896976812444967, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.000862038494261876, "loss": 0.0109, "macro_f1": 0.3272727429866791, "num_tokens": 4774464.0, "repeat_count": 0.0, "routers_loss": 0.024343067780137062, "skip_count": 1.0, "step": 2960, "text_loss": 0.16483014822006226 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0008618249454371891, "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 4777894.0, "repeat_count": 0.0, "routers_loss": 0.0008310087723657489, "skip_count": 0.0, "step": 2962, "text_loss": 0.5573428869247437 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0008616112579605006, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4781116.0, "repeat_count": 0.0, "routers_loss": 0.0065494864247739315, "skip_count": 0.0, "step": 2964, "text_loss": 0.18816794455051422 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.925154094511301, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.0008613974319136957, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4784886.0, "repeat_count": 0.0, "routers_loss": 0.0019726944155991077, "skip_count": 0.0, "step": 2966, "text_loss": 0.5097305774688721 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0008611834673787134, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4787563.0, "repeat_count": 0.0, "routers_loss": 0.006327496841549873, "skip_count": 0.0, "step": 2968, "text_loss": 0.6953814029693604 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.94393894922219, "f1_execute": 0.9600000381469727, "f1_repeat": 0.5, "f1_skip": 1.0, "grad_norm": 0.056884765625, "learning_rate": 0.0008609693644375449, "loss": 0.0086, "macro_f1": 0.8200000524520874, "num_tokens": 4790421.0, "repeat_count": 3.0, "routers_loss": 0.042896661907434464, "skip_count": 1.0, "step": 2970, "text_loss": 0.2573051154613495 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 13.953331376577633, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.14453125, "learning_rate": 0.000860755123172235, "loss": 0.0096, "macro_f1": 1.0, "num_tokens": 4793786.0, "repeat_count": 2.0, "routers_loss": 0.013228793628513813, "skip_count": 1.0, "step": 2972, "text_loss": 0.46614497900009155 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.962723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0008605407436648815, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4796864.0, "repeat_count": 0.0, "routers_loss": 0.007294759154319763, "skip_count": 2.0, "step": 2974, "text_loss": 0.21555091440677643 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 13.972116231288524, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.057861328125, "learning_rate": 0.0008603262259976348, "loss": 0.0129, "macro_f1": 1.0, "num_tokens": 4800080.0, "repeat_count": 1.0, "routers_loss": 0.0024024227168411016, "skip_count": 5.0, "step": 2976, "text_loss": 0.7855485081672668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0008601115702526987, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4802899.0, "repeat_count": 0.0, "routers_loss": 0.001433031284250319, "skip_count": 0.0, "step": 2978, "text_loss": 0.6777765154838562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04931640625, "learning_rate": 0.0008598967765123293, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4805835.0, "repeat_count": 0.0, "routers_loss": 0.003073975909501314, "skip_count": 0.0, "step": 2980, "text_loss": 0.5926910638809204 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 14.0, "f1_execute": 0.9333333373069763, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05322265625, "learning_rate": 0.0008596818448588364, "loss": 0.0139, "macro_f1": 0.8666667342185974, "num_tokens": 4809028.0, "repeat_count": 1.0, "routers_loss": 0.06438573449850082, "skip_count": 6.0, "step": 2982, "text_loss": 0.23975612223148346 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.009392427355445, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0008594667753745821, "loss": 0.0054, "macro_f1": 0.3272727429866791, "num_tokens": 4812831.0, "repeat_count": 0.0, "routers_loss": 0.014817612245678902, "skip_count": 1.0, "step": 2984, "text_loss": 0.17292268574237823 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.018784854710889, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.07421875, "learning_rate": 0.0008592515681419813, "loss": 0.0078, "macro_f1": 0.5492662787437439, "num_tokens": 4816005.0, "repeat_count": 2.0, "routers_loss": 0.025407327339053154, "skip_count": 0.0, "step": 2986, "text_loss": 0.6403061151504517 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0008590362232435018, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4818901.0, "repeat_count": 0.0, "routers_loss": 0.006826757453382015, "skip_count": 0.0, "step": 2988, "text_loss": 0.2572069466114044 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008588207407616644, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4823120.0, "repeat_count": 0.0, "routers_loss": 0.0009054148104041815, "skip_count": 0.0, "step": 2990, "text_loss": 0.4827076196670532 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.046962136777223, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0008586051207790422, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 4825774.0, "repeat_count": 0.0, "routers_loss": 0.0012294676853343844, "skip_count": 0.0, "step": 2992, "text_loss": 0.40157821774482727 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 14.056354564132668, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.052734375, "learning_rate": 0.0008583893633782612, "loss": 0.0084, "macro_f1": 0.5492662787437439, "num_tokens": 4828841.0, "repeat_count": 0.0, "routers_loss": 0.011474622413516045, "skip_count": 2.0, "step": 2994, "text_loss": 0.14842072129249573 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.058837890625, "learning_rate": 0.0008581734686419999, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4831458.0, "repeat_count": 0.0, "routers_loss": 0.009154081344604492, "skip_count": 2.0, "step": 2996, "text_loss": 0.365400105714798 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.075139418843557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.00085795743665299, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4834609.0, "repeat_count": 0.0, "routers_loss": 0.002899336162954569, "skip_count": 0.0, "step": 2998, "text_loss": 0.5574684143066406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0008577412674940152, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4838324.0, "repeat_count": 0.0, "routers_loss": 0.0034664268605411053, "skip_count": 0.0, "step": 3000, "text_loss": 0.6752855777740479 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03466796875, "learning_rate": 0.0008575249612479117, "loss": 0.0127, "macro_f1": 0.6666666865348816, "num_tokens": 4841877.0, "repeat_count": 0.0, "routers_loss": 0.0036425739526748657, "skip_count": 2.0, "step": 3002, "text_loss": 0.6332980394363403 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048095703125, "learning_rate": 0.0008573085179975685, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4845840.0, "repeat_count": 0.0, "routers_loss": 0.0013783496106043458, "skip_count": 0.0, "step": 3004, "text_loss": 0.4219617545604706 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0008570919378259274, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4848766.0, "repeat_count": 0.0, "routers_loss": 0.004823608323931694, "skip_count": 1.0, "step": 3006, "text_loss": 0.7987180948257446 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.000856875220815982, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4852310.0, "repeat_count": 0.0, "routers_loss": 0.0014760984340682626, "skip_count": 0.0, "step": 3008, "text_loss": 0.35592713952064514 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.131493982976226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0008566583670507788, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4856146.0, "repeat_count": 0.0, "routers_loss": 0.0031717263627797365, "skip_count": 1.0, "step": 3010, "text_loss": 0.19379083812236786 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.140886410331671, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0008564413766134164, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 4859386.0, "repeat_count": 0.0, "routers_loss": 0.003361492184922099, "skip_count": 0.0, "step": 3012, "text_loss": 0.39129266142845154 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048583984375, "learning_rate": 0.0008562242495870463, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4862661.0, "repeat_count": 0.0, "routers_loss": 0.0010563990799710155, "skip_count": 0.0, "step": 3014, "text_loss": 0.5966938734054565 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0008560069860548716, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4865410.0, "repeat_count": 0.0, "routers_loss": 0.001233913702890277, "skip_count": 0.0, "step": 3016, "text_loss": 0.3386077880859375 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.169063692398003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055419921875, "learning_rate": 0.0008557895861001484, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4868931.0, "repeat_count": 0.0, "routers_loss": 0.0018066301709041, "skip_count": 0.0, "step": 3018, "text_loss": 0.5222050547599792 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.178456119753449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008555720498061845, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4873492.0, "repeat_count": 0.0, "routers_loss": 0.0050385501235723495, "skip_count": 1.0, "step": 3020, "text_loss": 0.4558849334716797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.187848547108894, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008553543772563403, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4877026.0, "repeat_count": 0.0, "routers_loss": 0.004828717093914747, "skip_count": 0.0, "step": 3022, "text_loss": 0.36598992347717285 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 25.0, "epoch": 14.197240974464338, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, "grad_norm": 0.06103515625, "learning_rate": 0.0008551365685340285, "loss": 0.0084, "macro_f1": 0.9555556178092957, "num_tokens": 4879655.0, "repeat_count": 1.0, "routers_loss": 0.02049369551241398, "skip_count": 5.0, "step": 3024, "text_loss": 0.5069093704223633 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 14.206633401819783, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.043212890625, "learning_rate": 0.0008549186237227138, "loss": 0.0088, "macro_f1": 0.8823530077934265, "num_tokens": 4882606.0, "repeat_count": 1.0, "routers_loss": 0.03947242721915245, "skip_count": 2.0, "step": 3026, "text_loss": 0.2600715458393097 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 14.216025829175228, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.0008547005429059128, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4885246.0, "repeat_count": 2.0, "routers_loss": 0.0026363315992057323, "skip_count": 0.0, "step": 3028, "text_loss": 0.37642326951026917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.225418256530672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008544823261671948, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 4888109.0, "repeat_count": 0.0, "routers_loss": 0.003858231008052826, "skip_count": 0.0, "step": 3030, "text_loss": 0.5875385999679565 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 14.234810683886117, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.061279296875, "learning_rate": 0.0008542639735901804, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 4891168.0, "repeat_count": 1.0, "routers_loss": 0.004789089784026146, "skip_count": 1.0, "step": 3032, "text_loss": 0.6417325139045715 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.244203111241562, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0008540454852585434, "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 4894355.0, "repeat_count": 0.0, "routers_loss": 0.007334680762141943, "skip_count": 2.0, "step": 3034, "text_loss": 0.23697198927402496 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 14.253595538597006, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.034423828125, "learning_rate": 0.0008538268612560084, "loss": 0.0058, "macro_f1": 0.4871794879436493, "num_tokens": 4897543.0, "repeat_count": 0.0, "routers_loss": 0.022096361964941025, "skip_count": 3.0, "step": 3036, "text_loss": 0.1989550143480301 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.262987965952451, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.047119140625, "learning_rate": 0.0008536081016663527, "loss": 0.0101, "macro_f1": 1.0, "num_tokens": 4900752.0, "repeat_count": 1.0, "routers_loss": 0.0037680594250559807, "skip_count": 2.0, "step": 3038, "text_loss": 0.5001366138458252 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0400390625, "learning_rate": 0.0008533892065734055, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4903581.0, "repeat_count": 0.0, "routers_loss": 0.0032373068388551474, "skip_count": 1.0, "step": 3040, "text_loss": 0.5019411444664001 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.28177282066334, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.042724609375, "learning_rate": 0.0008531701760610476, "loss": 0.0121, "macro_f1": 1.0, "num_tokens": 4907108.0, "repeat_count": 1.0, "routers_loss": 0.0078013185411691666, "skip_count": 2.0, "step": 3042, "text_loss": 0.3460627794265747 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 14.291165248018785, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.04833984375, "learning_rate": 0.000852951010213212, "loss": 0.0089, "macro_f1": 0.8200000524520874, "num_tokens": 4911269.0, "repeat_count": 1.0, "routers_loss": 0.03576689213514328, "skip_count": 3.0, "step": 3044, "text_loss": 0.268994003534317 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 14.300557675374229, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0008527317091138835, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 4914203.0, "repeat_count": 1.0, "routers_loss": 0.0032140621915459633, "skip_count": 1.0, "step": 3046, "text_loss": 0.9998719692230225 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.309950102729674, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.040771484375, "learning_rate": 0.0008525122728470987, "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4918562.0, "repeat_count": 1.0, "routers_loss": 0.008559177629649639, "skip_count": 3.0, "step": 3048, "text_loss": 0.3062439560890198 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0008522927014969459, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 4921940.0, "repeat_count": 0.0, "routers_loss": 0.008735597133636475, "skip_count": 2.0, "step": 3050, "text_loss": 0.3637430965900421 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05517578125, "learning_rate": 0.0008520729951475652, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 4925416.0, "repeat_count": 0.0, "routers_loss": 0.0012709591537714005, "skip_count": 0.0, "step": 3052, "text_loss": 0.542036235332489 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.338127384796008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06640625, "learning_rate": 0.0008518531538831488, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4928695.0, "repeat_count": 0.0, "routers_loss": 0.0010660928674042225, "skip_count": 1.0, "step": 3054, "text_loss": 0.43144503235816956 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.059326171875, "learning_rate": 0.00085163317778794, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4931504.0, "repeat_count": 0.0, "routers_loss": 0.004558971151709557, "skip_count": 2.0, "step": 3056, "text_loss": 0.5257010459899902 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04931640625, "learning_rate": 0.0008514130669462341, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4934935.0, "repeat_count": 0.0, "routers_loss": 0.010774781927466393, "skip_count": 2.0, "step": 3058, "text_loss": 0.26061776280403137 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.366304666862343, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0008511928214423782, "loss": 0.0103, "macro_f1": 0.6601307392120361, "num_tokens": 4938047.0, "repeat_count": 1.0, "routers_loss": 0.014763157814741135, "skip_count": 2.0, "step": 3060, "text_loss": 0.2856905460357666 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.375697094217786, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0008509724413607705, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 4941041.0, "repeat_count": 1.0, "routers_loss": 0.004613345488905907, "skip_count": 0.0, "step": 3062, "text_loss": 0.2870287001132965 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.385089521573232, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0008507519267858612, "loss": 0.015, "macro_f1": 1.0, "num_tokens": 4944708.0, "repeat_count": 1.0, "routers_loss": 0.008584189228713512, "skip_count": 2.0, "step": 3064, "text_loss": 0.15828095376491547 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.394481948928677, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.0008505312778021519, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 4948295.0, "repeat_count": 0.0, "routers_loss": 0.0014670816017314792, "skip_count": 0.0, "step": 3066, "text_loss": 0.36697930097579956 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0927734375, "learning_rate": 0.0008503104944941958, "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 4951983.0, "repeat_count": 0.0, "routers_loss": 0.005348859820514917, "skip_count": 2.0, "step": 3068, "text_loss": 0.21612997353076935 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0008500895769465972, "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 4955023.0, "repeat_count": 0.0, "routers_loss": 0.0013203793205320835, "skip_count": 0.0, "step": 3070, "text_loss": 0.9757798314094543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.422659230995011, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 0.0008498685252440124, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 4957600.0, "repeat_count": 0.0, "routers_loss": 0.006907356437295675, "skip_count": 0.0, "step": 3072, "text_loss": 0.356107234954834 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.432051658350455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061279296875, "learning_rate": 0.0008496473394711487, "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 4960746.0, "repeat_count": 0.0, "routers_loss": 0.0027704904787242413, "skip_count": 1.0, "step": 3074, "text_loss": 0.6812908053398132 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0576171875, "learning_rate": 0.0008494260197127649, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 4963845.0, "repeat_count": 0.0, "routers_loss": 0.0036796489730477333, "skip_count": 2.0, "step": 3076, "text_loss": 0.7215370535850525 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0556640625, "learning_rate": 0.0008492045660536712, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 4966887.0, "repeat_count": 0.0, "routers_loss": 0.0037137691397219896, "skip_count": 1.0, "step": 3078, "text_loss": 0.8700299859046936 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 14.460228940416789, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03857421875, "learning_rate": 0.0008489829785787291, "loss": 0.0078, "macro_f1": 0.8823530077934265, "num_tokens": 4969859.0, "repeat_count": 1.0, "routers_loss": 0.016492314636707306, "skip_count": 2.0, "step": 3080, "text_loss": 0.6520360112190247 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043701171875, "learning_rate": 0.0008487612573728513, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4972628.0, "repeat_count": 0.0, "routers_loss": 0.004022917244583368, "skip_count": 2.0, "step": 3082, "text_loss": 0.17498187720775604 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044677734375, "learning_rate": 0.0008485394025210016, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 4975475.0, "repeat_count": 0.0, "routers_loss": 0.009141159243881702, "skip_count": 1.0, "step": 3084, "text_loss": 0.5975366234779358 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045166015625, "learning_rate": 0.0008483174141081956, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4978858.0, "repeat_count": 0.0, "routers_loss": 0.0031561285723000765, "skip_count": 0.0, "step": 3086, "text_loss": 0.18748866021633148 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.497798649838568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008480952922194991, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4982142.0, "repeat_count": 0.0, "routers_loss": 0.0007894713780842721, "skip_count": 0.0, "step": 3088, "text_loss": 0.42083197832107544 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008478730369400302, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4984872.0, "repeat_count": 0.0, "routers_loss": 0.0005908289458602667, "skip_count": 0.0, "step": 3090, "text_loss": 0.45337188243865967 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.516583504549457, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02392578125, "learning_rate": 0.0008476506483549573, "loss": 0.0101, "macro_f1": 1.0, "num_tokens": 4988137.0, "repeat_count": 1.0, "routers_loss": 0.0016509373672306538, "skip_count": 2.0, "step": 3092, "text_loss": 0.6397262811660767 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0008474281265495002, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 4991164.0, "repeat_count": 0.0, "routers_loss": 0.004088304936885834, "skip_count": 1.0, "step": 3094, "text_loss": 0.18352322280406952 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0008472054716089295, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4993876.0, "repeat_count": 0.0, "routers_loss": 0.005200014915317297, "skip_count": 0.0, "step": 3096, "text_loss": 0.2776511013507843 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.544760786615791, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.0008469826836185673, "loss": 0.01, "macro_f1": 0.6601307392120361, "num_tokens": 4997068.0, "repeat_count": 1.0, "routers_loss": 0.012686059810221195, "skip_count": 2.0, "step": 3098, "text_loss": 0.23209233582019806 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.554153213971237, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.055419921875, "learning_rate": 0.0008467597626637858, "loss": 0.0074, "macro_f1": 1.0, "num_tokens": 5000038.0, "repeat_count": 1.0, "routers_loss": 0.006401528604328632, "skip_count": 2.0, "step": 3100, "text_loss": 0.45936745405197144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.56354564132668, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008465367088300093, "loss": 0.0075, "macro_f1": 0.3272727429866791, "num_tokens": 5002870.0, "repeat_count": 0.0, "routers_loss": 0.016640547662973404, "skip_count": 1.0, "step": 3102, "text_loss": 0.44502779841423035 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.572938068682125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0008463135222027124, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5006357.0, "repeat_count": 0.0, "routers_loss": 0.008411331102252007, "skip_count": 2.0, "step": 3104, "text_loss": 0.3414570391178131 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.582330496037569, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0008460902028674204, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5009059.0, "repeat_count": 0.0, "routers_loss": 0.0010406570509076118, "skip_count": 0.0, "step": 3106, "text_loss": 0.5931221842765808 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0008458667509097098, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 5012327.0, "repeat_count": 0.0, "routers_loss": 0.001959054498001933, "skip_count": 0.0, "step": 3108, "text_loss": 0.5191171169281006 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.60111535074846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0008456431664152078, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 5015472.0, "repeat_count": 0.0, "routers_loss": 0.000994380097836256, "skip_count": 0.0, "step": 3110, "text_loss": 0.4455361068248749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.610507778103903, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0008454194494695923, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 5018901.0, "repeat_count": 0.0, "routers_loss": 0.0037662344984710217, "skip_count": 0.0, "step": 3112, "text_loss": 0.5335362553596497 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 14.619900205459349, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02294921875, "learning_rate": 0.0008451956001585923, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5022520.0, "repeat_count": 0.0, "routers_loss": 0.008664715103805065, "skip_count": 3.0, "step": 3114, "text_loss": 0.16230148077011108 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.629292632814794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.000844971618567987, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 5025505.0, "repeat_count": 0.0, "routers_loss": 0.0015904927859082818, "skip_count": 0.0, "step": 3116, "text_loss": 0.6989432573318481 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.638685060170237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0008447475047836068, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 5028767.0, "repeat_count": 0.0, "routers_loss": 0.005853322334587574, "skip_count": 1.0, "step": 3118, "text_loss": 0.31420737504959106 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 14.648077487525683, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008445232588913325, "loss": 0.0115, "macro_f1": 0.3272727429866791, "num_tokens": 5032577.0, "repeat_count": 0.0, "routers_loss": 0.012760105542838573, "skip_count": 0.0, "step": 3120, "text_loss": 0.5534627437591553 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.0008442988809770953, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 5035381.0, "repeat_count": 0.0, "routers_loss": 0.0022257440723478794, "skip_count": 0.0, "step": 3122, "text_loss": 0.42492759227752686 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.666862342236572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.0008440743711268775, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 5038743.0, "repeat_count": 0.0, "routers_loss": 0.004648433532565832, "skip_count": 0.0, "step": 3124, "text_loss": 0.16404685378074646 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.0008438497294267117, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5041492.0, "repeat_count": 0.0, "routers_loss": 0.006313877180218697, "skip_count": 0.0, "step": 3126, "text_loss": 0.23191484808921814 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.68564719694746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0008436249559626807, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5043955.0, "repeat_count": 1.0, "routers_loss": 0.0036270488053560257, "skip_count": 0.0, "step": 3128, "text_loss": 0.5782018303871155 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.695039624302906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04345703125, "learning_rate": 0.0008434000508209187, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5047571.0, "repeat_count": 0.0, "routers_loss": 0.003809858812019229, "skip_count": 1.0, "step": 3130, "text_loss": 0.7129825949668884 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.704432051658351, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.0008431750140876092, "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 5051608.0, "repeat_count": 0.0, "routers_loss": 0.0022369057405740023, "skip_count": 0.0, "step": 3132, "text_loss": 0.4433445930480957 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.713824479013795, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.000842949845848987, "loss": 0.0135, "macro_f1": 0.32098764181137085, "num_tokens": 5054656.0, "repeat_count": 0.0, "routers_loss": 0.0425117202103138, "skip_count": 2.0, "step": 3134, "text_loss": 0.38721024990081787 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0008427245461913368, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 5059108.0, "repeat_count": 0.0, "routers_loss": 0.0018077283166348934, "skip_count": 0.0, "step": 3136, "text_loss": 0.7496368885040283 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.732609333724685, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.12109375, "learning_rate": 0.0008424991152009941, "loss": 0.0111, "macro_f1": 1.0, "num_tokens": 5062371.0, "repeat_count": 1.0, "routers_loss": 0.008801834657788277, "skip_count": 2.0, "step": 3138, "text_loss": 0.5337086319923401 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 14.742001761080129, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0008422735529643444, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5065593.0, "repeat_count": 0.0, "routers_loss": 0.00548676960170269, "skip_count": 3.0, "step": 3140, "text_loss": 0.2561623156070709 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.751394188435574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0008420478595678233, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5068271.0, "repeat_count": 0.0, "routers_loss": 0.006389956455677748, "skip_count": 0.0, "step": 3142, "text_loss": 0.15605193376541138 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.760786615791018, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07958984375, "learning_rate": 0.0008418220350979175, "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 5071358.0, "repeat_count": 1.0, "routers_loss": 0.012387622147798538, "skip_count": 2.0, "step": 3144, "text_loss": 0.3085838258266449 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008415960796411628, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5075584.0, "repeat_count": 0.0, "routers_loss": 0.00311864772811532, "skip_count": 1.0, "step": 3146, "text_loss": 0.4786977469921112 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.779571470501908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0008413699932841461, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5078388.0, "repeat_count": 0.0, "routers_loss": 0.0030679800547659397, "skip_count": 0.0, "step": 3148, "text_loss": 0.5222916603088379 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.788963897857352, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0008411437761135039, "loss": 0.011, "macro_f1": 1.0, "num_tokens": 5081584.0, "repeat_count": 1.0, "routers_loss": 0.012907958589494228, "skip_count": 2.0, "step": 3150, "text_loss": 0.5369884371757507 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0008409174282159232, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 5084450.0, "repeat_count": 0.0, "routers_loss": 0.012314042076468468, "skip_count": 2.0, "step": 3152, "text_loss": 0.25685277581214905 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.807748752568243, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.000840690949678141, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 5087865.0, "repeat_count": 1.0, "routers_loss": 0.00899206381291151, "skip_count": 0.0, "step": 3154, "text_loss": 0.1717093288898468 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.817141179923686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06103515625, "learning_rate": 0.0008404643405869441, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 5090857.0, "repeat_count": 0.0, "routers_loss": 0.0013312003575265408, "skip_count": 0.0, "step": 3156, "text_loss": 0.27446436882019043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0008402376010291695, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 5093917.0, "repeat_count": 0.0, "routers_loss": 0.002653320087119937, "skip_count": 0.0, "step": 3158, "text_loss": 0.4237489402294159 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045654296875, "learning_rate": 0.0008400107310917045, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5096656.0, "repeat_count": 0.0, "routers_loss": 0.012976993806660175, "skip_count": 2.0, "step": 3160, "text_loss": 0.42361980676651 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.000839783730861486, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5099582.0, "repeat_count": 0.0, "routers_loss": 0.006936746649444103, "skip_count": 2.0, "step": 3162, "text_loss": 0.26656073331832886 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0008395566004255008, "loss": 0.0127, "macro_f1": 0.6666666865348816, "num_tokens": 5102908.0, "repeat_count": 0.0, "routers_loss": 0.006619359832257032, "skip_count": 1.0, "step": 3164, "text_loss": 0.590774416923523 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06884765625, "learning_rate": 0.0008393293398707858, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 5105829.0, "repeat_count": 0.0, "routers_loss": 0.010120268911123276, "skip_count": 2.0, "step": 3166, "text_loss": 0.605930507183075 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008391019492844275, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 5109850.0, "repeat_count": 0.0, "routers_loss": 0.004940980114042759, "skip_count": 2.0, "step": 3168, "text_loss": 0.12973152101039886 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0008388744287535627, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 5113353.0, "repeat_count": 0.0, "routers_loss": 0.0031777634285390377, "skip_count": 1.0, "step": 3170, "text_loss": 0.18577200174331665 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0008386467783653775, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 5116421.0, "repeat_count": 0.0, "routers_loss": 0.005431659985333681, "skip_count": 0.0, "step": 3172, "text_loss": 0.2302747517824173 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 14.901673026122689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.000838418998207108, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5119457.0, "repeat_count": 0.0, "routers_loss": 0.0077286697924137115, "skip_count": 4.0, "step": 3174, "text_loss": 0.19606637954711914 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0008381910883660399, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5123201.0, "repeat_count": 0.0, "routers_loss": 0.003982985392212868, "skip_count": 0.0, "step": 3176, "text_loss": 0.716376006603241 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09423828125, "learning_rate": 0.0008379630489295089, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 5126035.0, "repeat_count": 0.0, "routers_loss": 0.005626026075333357, "skip_count": 1.0, "step": 3178, "text_loss": 0.5144625902175903 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008377348799849, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 5129179.0, "repeat_count": 0.0, "routers_loss": 0.015458245761692524, "skip_count": 2.0, "step": 3180, "text_loss": 0.29887503385543823 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 14.939242735544468, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.062255859375, "learning_rate": 0.0008375065816196479, "loss": 0.0086, "macro_f1": 0.5492662787437439, "num_tokens": 5132149.0, "repeat_count": 0.0, "routers_loss": 0.012210468761622906, "skip_count": 2.0, "step": 3182, "text_loss": 0.8981851935386658 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.948635162899912, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0008372781539212371, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5135287.0, "repeat_count": 0.0, "routers_loss": 0.0052537876181304455, "skip_count": 0.0, "step": 3184, "text_loss": 0.4245666563510895 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.958027590255357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0008370495969772014, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5138589.0, "repeat_count": 0.0, "routers_loss": 0.012873421423137188, "skip_count": 2.0, "step": 3186, "text_loss": 0.40581050515174866 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 14.9674200176108, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0008368209108751244, "loss": 0.0127, "macro_f1": 0.6521739363670349, "num_tokens": 5141635.0, "repeat_count": 2.0, "routers_loss": 0.07720445841550827, "skip_count": 4.0, "step": 3188, "text_loss": 0.3755173981189728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0008365920957026389, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5144728.0, "repeat_count": 0.0, "routers_loss": 0.001440995605662465, "skip_count": 0.0, "step": 3190, "text_loss": 0.5067034363746643 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.986204872321691, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.041748046875, "learning_rate": 0.0008363631515474275, "loss": 0.0089, "macro_f1": 0.6538461446762085, "num_tokens": 5147963.0, "repeat_count": 1.0, "routers_loss": 0.018752984702587128, "skip_count": 2.0, "step": 3192, "text_loss": 0.20224551856517792 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.995597299677135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0008361340784972217, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 5151184.0, "repeat_count": 0.0, "routers_loss": 0.0005360354552976787, "skip_count": 0.0, "step": 3194, "text_loss": 0.4588058292865753 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.004696213677722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0008359048766398031, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5153889.0, "repeat_count": 0.0, "routers_loss": 0.0009184491937048733, "skip_count": 1.0, "step": 3196, "text_loss": 0.2980220317840576 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.000835675546063002, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5156758.0, "repeat_count": 0.0, "routers_loss": 0.001252970308996737, "skip_count": 0.0, "step": 3198, "text_loss": 0.6775755882263184 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0008354460868546985, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5160247.0, "repeat_count": 0.0, "routers_loss": 0.0037315806839615107, "skip_count": 0.0, "step": 3200, "text_loss": 0.35867011547088623 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.032873495744056, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0008352164991028217, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 5163456.0, "repeat_count": 1.0, "routers_loss": 0.001497485558502376, "skip_count": 0.0, "step": 3202, "text_loss": 0.690290093421936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.042265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.0008349867828953501, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 5166139.0, "repeat_count": 0.0, "routers_loss": 0.001051135826855898, "skip_count": 0.0, "step": 3204, "text_loss": 0.3340415954589844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.051658350454945, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0008347569383203113, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 5169009.0, "repeat_count": 0.0, "routers_loss": 0.0010544003453105688, "skip_count": 0.0, "step": 3206, "text_loss": 0.8584878444671631 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.06105077781039, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008345269654657823, "loss": 0.0085, "macro_f1": 1.0, "num_tokens": 5172618.0, "repeat_count": 1.0, "routers_loss": 0.007312417030334473, "skip_count": 1.0, "step": 3208, "text_loss": 0.19500218331813812 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.070443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0008342968644198892, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 5175857.0, "repeat_count": 0.0, "routers_loss": 0.00276504410430789, "skip_count": 0.0, "step": 3210, "text_loss": 0.5446314215660095 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.079835632521279, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0008340666352708068, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 5178585.0, "repeat_count": 0.0, "routers_loss": 0.002669303445145488, "skip_count": 0.0, "step": 3212, "text_loss": 0.3687484860420227 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0008338362781067596, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5181777.0, "repeat_count": 0.0, "routers_loss": 0.0031585274264216423, "skip_count": 0.0, "step": 3214, "text_loss": 0.27325859665870667 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.000833605793016021, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 5184312.0, "repeat_count": 0.0, "routers_loss": 0.008807534351944923, "skip_count": 2.0, "step": 3216, "text_loss": 0.4466548562049866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.108012914587613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008333751800869133, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5187497.0, "repeat_count": 0.0, "routers_loss": 0.003171310294419527, "skip_count": 0.0, "step": 3218, "text_loss": 0.5423526763916016 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.117405341943059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0008331444394078076, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 5190982.0, "repeat_count": 0.0, "routers_loss": 0.0016481258207932115, "skip_count": 2.0, "step": 3220, "text_loss": 0.48984917998313904 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.126797769298504, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.000832913571067124, "loss": 0.0107, "macro_f1": 1.0, "num_tokens": 5194044.0, "repeat_count": 1.0, "routers_loss": 0.003957313951104879, "skip_count": 1.0, "step": 3222, "text_loss": 0.4533331096172333 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0008326825751533322, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5197092.0, "repeat_count": 0.0, "routers_loss": 0.0016904744552448392, "skip_count": 0.0, "step": 3224, "text_loss": 0.5538802742958069 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05224609375, "learning_rate": 0.0008324514517549501, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5199941.0, "repeat_count": 0.0, "routers_loss": 0.005608258303254843, "skip_count": 1.0, "step": 3226, "text_loss": 0.416242778301239 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 15.154975051364836, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.040771484375, "learning_rate": 0.0008322202009605444, "loss": 0.0072, "macro_f1": 0.8823530077934265, "num_tokens": 5202618.0, "repeat_count": 1.0, "routers_loss": 0.020965175703167915, "skip_count": 2.0, "step": 3228, "text_loss": 0.17496295273303986 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 15.164367478720282, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008319888228587311, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 5206414.0, "repeat_count": 1.0, "routers_loss": 0.021259209141135216, "skip_count": 5.0, "step": 3230, "text_loss": 0.22471418976783752 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 0.0008317573175381745, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 5209768.0, "repeat_count": 0.0, "routers_loss": 0.0018647604156285524, "skip_count": 0.0, "step": 3232, "text_loss": 0.4415269196033478 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0008315256850875881, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5213257.0, "repeat_count": 0.0, "routers_loss": 0.002345515415072441, "skip_count": 0.0, "step": 3234, "text_loss": 0.347247838973999 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 15.192544760786616, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0008312939255957336, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 5215800.0, "repeat_count": 0.0, "routers_loss": 0.007112892810255289, "skip_count": 3.0, "step": 3236, "text_loss": 0.31091734766960144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.201937188142061, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0008310620391514219, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5219205.0, "repeat_count": 0.0, "routers_loss": 0.00432228296995163, "skip_count": 0.0, "step": 3238, "text_loss": 0.3421775996685028 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.0008308300258435124, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 5222422.0, "repeat_count": 0.0, "routers_loss": 0.0076514314860105515, "skip_count": 2.0, "step": 3240, "text_loss": 0.22378318011760712 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0008305978857609128, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5225625.0, "repeat_count": 0.0, "routers_loss": 0.0007617069641128182, "skip_count": 0.0, "step": 3242, "text_loss": 0.5880323648452759 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0008303656189925799, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5229113.0, "repeat_count": 0.0, "routers_loss": 0.0017418119823560119, "skip_count": 0.0, "step": 3244, "text_loss": 0.3302813768386841 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0008301332256275183, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5232061.0, "repeat_count": 0.0, "routers_loss": 0.0026667986530810595, "skip_count": 0.0, "step": 3246, "text_loss": 0.5679706335067749 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.248899324919284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.058349609375, "learning_rate": 0.0008299007057547821, "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 5235279.0, "repeat_count": 1.0, "routers_loss": 0.011016624979674816, "skip_count": 2.0, "step": 3248, "text_loss": 0.5081504583358765 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.258291752274728, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0008296680594634731, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5239655.0, "repeat_count": 1.0, "routers_loss": 0.005492044147104025, "skip_count": 0.0, "step": 3250, "text_loss": 0.14675180613994598 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0008294352868427418, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5243579.0, "repeat_count": 0.0, "routers_loss": 0.00404445780441165, "skip_count": 1.0, "step": 3252, "text_loss": 0.4201085865497589 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.277076606985618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0008292023879817871, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 5247059.0, "repeat_count": 0.0, "routers_loss": 0.006886140909045935, "skip_count": 1.0, "step": 3254, "text_loss": 0.2289208322763443 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.286469034341062, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057861328125, "learning_rate": 0.0008289693629698564, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 5249940.0, "repeat_count": 0.0, "routers_loss": 0.0005736657767556608, "skip_count": 0.0, "step": 3256, "text_loss": 0.5670450925827026 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.295861461696507, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0008287362118962452, "loss": 0.006, "macro_f1": 0.3272727429866791, "num_tokens": 5253580.0, "repeat_count": 0.0, "routers_loss": 0.011349895037710667, "skip_count": 1.0, "step": 3258, "text_loss": 0.5042323470115662 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.305253889051952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0008285029348502973, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5257080.0, "repeat_count": 0.0, "routers_loss": 0.0013626761501654983, "skip_count": 0.0, "step": 3260, "text_loss": 0.3227672874927521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.314646316407396, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0008282695319214053, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5259951.0, "repeat_count": 0.0, "routers_loss": 0.00471635302528739, "skip_count": 0.0, "step": 3262, "text_loss": 0.20773714780807495 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.324038743762841, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008280360031990093, "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 5263314.0, "repeat_count": 0.0, "routers_loss": 0.010472415015101433, "skip_count": 2.0, "step": 3264, "text_loss": 0.34397366642951965 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.333431171118287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.000827802348772598, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 5267358.0, "repeat_count": 0.0, "routers_loss": 0.0007814752752892673, "skip_count": 0.0, "step": 3266, "text_loss": 0.747342586517334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.34282359847373, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0008275685687317084, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5270400.0, "repeat_count": 0.0, "routers_loss": 0.000902949133887887, "skip_count": 0.0, "step": 3268, "text_loss": 0.43782034516334534 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0008273346631659252, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5273147.0, "repeat_count": 0.0, "routers_loss": 0.00043462219764478505, "skip_count": 0.0, "step": 3270, "text_loss": 0.6358205080032349 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.361608453184619, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008271006321648816, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 5277638.0, "repeat_count": 0.0, "routers_loss": 0.002211218234151602, "skip_count": 0.0, "step": 3272, "text_loss": 0.20220105350017548 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.371000880540064, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.0008268664758182589, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5280638.0, "repeat_count": 1.0, "routers_loss": 0.010536720044910908, "skip_count": 0.0, "step": 3274, "text_loss": 0.7579061388969421 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0439453125, "learning_rate": 0.0008266321942157859, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5283847.0, "repeat_count": 0.0, "routers_loss": 0.0017158017726615071, "skip_count": 0.0, "step": 3276, "text_loss": 0.669302761554718 }, { "acc_repeat": 0.800000011920929, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.389785735250953, "f1_execute": 0.9743589162826538, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, "grad_norm": 0.06005859375, "learning_rate": 0.0008263977874472399, "loss": 0.0088, "macro_f1": 0.9544159770011902, "num_tokens": 5286627.0, "repeat_count": 5.0, "routers_loss": 0.011220700107514858, "skip_count": 4.0, "step": 3278, "text_loss": 0.8703984022140503 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.399178162606399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008261632556024461, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5289766.0, "repeat_count": 0.0, "routers_loss": 0.0020442772656679153, "skip_count": 0.0, "step": 3280, "text_loss": 0.5009346008300781 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.0008259285987712774, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5293010.0, "repeat_count": 0.0, "routers_loss": 0.005645765457302332, "skip_count": 0.0, "step": 3282, "text_loss": 0.2546011209487915 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.042236328125, "learning_rate": 0.0008256938170436549, "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 5296732.0, "repeat_count": 0.0, "routers_loss": 0.0027385836001485586, "skip_count": 2.0, "step": 3284, "text_loss": 0.5244000554084778 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.427355444672733, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0008254589105095473, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 5299926.0, "repeat_count": 1.0, "routers_loss": 0.007451715879142284, "skip_count": 1.0, "step": 3286, "text_loss": 0.28979742527008057 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0008252238792589711, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5303006.0, "repeat_count": 0.0, "routers_loss": 0.004805843345820904, "skip_count": 2.0, "step": 3288, "text_loss": 0.5131978392601013 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.446140299383622, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.000824988723381991, "loss": 0.0091, "macro_f1": 0.3272727429866791, "num_tokens": 5306953.0, "repeat_count": 0.0, "routers_loss": 0.010639613494277, "skip_count": 1.0, "step": 3290, "text_loss": 0.4901447296142578 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 15.455532726739067, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.044189453125, "learning_rate": 0.0008247534429687191, "loss": 0.007, "macro_f1": 0.5492662787437439, "num_tokens": 5310516.0, "repeat_count": 0.0, "routers_loss": 0.013625577092170715, "skip_count": 2.0, "step": 3292, "text_loss": 0.2124534696340561 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.0008245180381093152, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 5313959.0, "repeat_count": 0.0, "routers_loss": 0.004958513658493757, "skip_count": 1.0, "step": 3294, "text_loss": 0.46682238578796387 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0008242825088939867, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5316609.0, "repeat_count": 0.0, "routers_loss": 0.003962756600230932, "skip_count": 0.0, "step": 3296, "text_loss": 0.7010108232498169 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.483710008805401, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008240468554129892, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5319638.0, "repeat_count": 0.0, "routers_loss": 0.0006996620795689523, "skip_count": 0.0, "step": 3298, "text_loss": 0.4966355860233307 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.493102436160845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0008238110777566255, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 5323019.0, "repeat_count": 0.0, "routers_loss": 0.0016031896229833364, "skip_count": 0.0, "step": 3300, "text_loss": 0.38668957352638245 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.50249486351629, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0008235751760152459, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 5326099.0, "repeat_count": 2.0, "routers_loss": 0.00344281829893589, "skip_count": 2.0, "step": 3302, "text_loss": 0.5330720543861389 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.511887290871735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06005859375, "learning_rate": 0.0008233391502792484, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5328993.0, "repeat_count": 0.0, "routers_loss": 0.007886730134487152, "skip_count": 1.0, "step": 3304, "text_loss": 0.5470269322395325 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0008231030006390786, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 5331554.0, "repeat_count": 0.0, "routers_loss": 0.008180000819265842, "skip_count": 1.0, "step": 3306, "text_loss": 0.4023340344429016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0008228667271852294, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5335712.0, "repeat_count": 0.0, "routers_loss": 0.0002942821884062141, "skip_count": 0.0, "step": 3308, "text_loss": 0.5306711792945862 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05908203125, "learning_rate": 0.0008226303300082414, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5338701.0, "repeat_count": 0.0, "routers_loss": 0.0006134595023468137, "skip_count": 0.0, "step": 3310, "text_loss": 0.5906263589859009 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.549457000293513, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0008223938091987022, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5342274.0, "repeat_count": 0.0, "routers_loss": 0.0016656654188409448, "skip_count": 0.0, "step": 3312, "text_loss": 0.5201764106750488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0008221571648472472, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5345185.0, "repeat_count": 0.0, "routers_loss": 0.0038612703792750835, "skip_count": 0.0, "step": 3314, "text_loss": 0.36633720993995667 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.568241855004402, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0008219203970445589, "loss": 0.011, "macro_f1": 0.3272727429866791, "num_tokens": 5348804.0, "repeat_count": 0.0, "routers_loss": 0.009782899171113968, "skip_count": 1.0, "step": 3316, "text_loss": 0.3117460012435913 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.577634282359847, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008216835058813672, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 5351896.0, "repeat_count": 0.0, "routers_loss": 0.007713229861110449, "skip_count": 0.0, "step": 3318, "text_loss": 0.253496378660202 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0008214464914484492, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5355058.0, "repeat_count": 0.0, "routers_loss": 0.006227815989404917, "skip_count": 2.0, "step": 3320, "text_loss": 0.32693132758140564 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0008212093538366292, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 5358365.0, "repeat_count": 0.0, "routers_loss": 0.002601418411359191, "skip_count": 0.0, "step": 3322, "text_loss": 0.40394455194473267 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 15.605811564426181, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.000820972093136779, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5360981.0, "repeat_count": 0.0, "routers_loss": 0.005545300897210836, "skip_count": 3.0, "step": 3324, "text_loss": 0.6758295893669128 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.615203991781627, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05078125, "learning_rate": 0.0008207347094398172, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 5364018.0, "repeat_count": 1.0, "routers_loss": 0.001924700103700161, "skip_count": 0.0, "step": 3326, "text_loss": 0.5196860432624817 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0008204972028367097, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5366986.0, "repeat_count": 0.0, "routers_loss": 0.012254828587174416, "skip_count": 1.0, "step": 3328, "text_loss": 0.24661913514137268 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.633988846492516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0008202595734184694, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5371463.0, "repeat_count": 0.0, "routers_loss": 0.005094083491712809, "skip_count": 0.0, "step": 3330, "text_loss": 0.2525769770145416 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.643381273847961, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0008200218212761566, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 5374823.0, "repeat_count": 1.0, "routers_loss": 0.0025883198250085115, "skip_count": 0.0, "step": 3332, "text_loss": 0.21849912405014038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.000819783946500878, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5377640.0, "repeat_count": 0.0, "routers_loss": 0.008240507915616035, "skip_count": 0.0, "step": 3334, "text_loss": 0.2662734091281891 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 15.66216612855885, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.050537109375, "learning_rate": 0.000819545949183788, "loss": 0.01, "macro_f1": 0.5934640765190125, "num_tokens": 5380593.0, "repeat_count": 0.0, "routers_loss": 0.038378193974494934, "skip_count": 3.0, "step": 3336, "text_loss": 0.2431795746088028 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.671558555914293, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.040283203125, "learning_rate": 0.0008193078294160874, "loss": 0.0097, "macro_f1": 1.0, "num_tokens": 5384487.0, "repeat_count": 1.0, "routers_loss": 0.005926199723035097, "skip_count": 1.0, "step": 3338, "text_loss": 0.5663705468177795 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0008190695872890242, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5387511.0, "repeat_count": 0.0, "routers_loss": 0.010842559859156609, "skip_count": 2.0, "step": 3340, "text_loss": 0.11517292261123657 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.690343410625184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0008188312228938933, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 5390698.0, "repeat_count": 0.0, "routers_loss": 0.001304097007960081, "skip_count": 0.0, "step": 3342, "text_loss": 0.4827076196670532 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.699735837980628, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0008185927363220363, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5393778.0, "repeat_count": 1.0, "routers_loss": 0.005354117136448622, "skip_count": 0.0, "step": 3344, "text_loss": 0.44467049837112427 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.040771484375, "learning_rate": 0.0008183541276648418, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5396925.0, "repeat_count": 0.0, "routers_loss": 0.004800073802471161, "skip_count": 2.0, "step": 3346, "text_loss": 0.2032834142446518 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.718520692691518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0008181153970137449, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5400522.0, "repeat_count": 0.0, "routers_loss": 0.0021674633026123047, "skip_count": 0.0, "step": 3348, "text_loss": 0.4507528841495514 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.727913120046962, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.051513671875, "learning_rate": 0.0008178765444602278, "loss": 0.0117, "macro_f1": 0.8820862174034119, "num_tokens": 5403526.0, "repeat_count": 2.0, "routers_loss": 0.04263930395245552, "skip_count": 2.0, "step": 3350, "text_loss": 0.3606615960597992 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.737305547402407, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008176375700958194, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5407127.0, "repeat_count": 1.0, "routers_loss": 0.006953123956918716, "skip_count": 0.0, "step": 3352, "text_loss": 0.2290353775024414 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0008173984740120948, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5410829.0, "repeat_count": 0.0, "routers_loss": 0.0014363783411681652, "skip_count": 0.0, "step": 3354, "text_loss": 0.4220392405986786 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0008171592563006762, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5414152.0, "repeat_count": 0.0, "routers_loss": 0.00202389364130795, "skip_count": 1.0, "step": 3356, "text_loss": 0.37729766964912415 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.765482829468741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.0008169199170532323, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 5417312.0, "repeat_count": 0.0, "routers_loss": 0.006253739818930626, "skip_count": 2.0, "step": 3358, "text_loss": 0.1304289996623993 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.774875256824185, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0703125, "learning_rate": 0.0008166804563614785, "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 5421227.0, "repeat_count": 2.0, "routers_loss": 0.01622140221297741, "skip_count": 2.0, "step": 3360, "text_loss": 0.298664391040802 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.78426768417963, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.0008164408743171763, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 5424646.0, "repeat_count": 1.0, "routers_loss": 0.0037176944315433502, "skip_count": 2.0, "step": 3362, "text_loss": 0.12147632241249084 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046630859375, "learning_rate": 0.0008162011710121339, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 5427897.0, "repeat_count": 0.0, "routers_loss": 0.0020403533708304167, "skip_count": 1.0, "step": 3364, "text_loss": 0.2656533420085907 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.803052538890519, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041748046875, "learning_rate": 0.0008159613465382066, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5430474.0, "repeat_count": 0.0, "routers_loss": 0.0018634048756211996, "skip_count": 0.0, "step": 3366, "text_loss": 0.9133086204528809 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.812444966245964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.0008157214009872951, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5433113.0, "repeat_count": 0.0, "routers_loss": 0.012944488786160946, "skip_count": 2.0, "step": 3368, "text_loss": 0.24352453649044037 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05712890625, "learning_rate": 0.0008154813344513472, "loss": 0.0143, "macro_f1": 0.6666666865348816, "num_tokens": 5436259.0, "repeat_count": 0.0, "routers_loss": 0.002347963862121105, "skip_count": 2.0, "step": 3370, "text_loss": 0.7601244449615479 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0008152411470223568, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 5439126.0, "repeat_count": 0.0, "routers_loss": 0.0016609140438959002, "skip_count": 0.0, "step": 3372, "text_loss": 0.5551947355270386 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.840622248312298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0008150008387923643, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5442739.0, "repeat_count": 0.0, "routers_loss": 0.008321396075189114, "skip_count": 0.0, "step": 3374, "text_loss": 0.25028282403945923 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 15.850014675667744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08544921875, "learning_rate": 0.000814760409853456, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 5445247.0, "repeat_count": 2.0, "routers_loss": 0.009738070890307426, "skip_count": 1.0, "step": 3376, "text_loss": 0.37271201610565186 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0008145198602977651, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5449044.0, "repeat_count": 0.0, "routers_loss": 0.0028421466704458, "skip_count": 0.0, "step": 3378, "text_loss": 0.1458655595779419 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.868799530378633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.0008142791902174701, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 5453063.0, "repeat_count": 0.0, "routers_loss": 0.0015170135302469134, "skip_count": 0.0, "step": 3380, "text_loss": 0.5548722743988037 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.878191957734076, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0008140383997047966, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 5455814.0, "repeat_count": 0.0, "routers_loss": 0.0022444510832428932, "skip_count": 1.0, "step": 3382, "text_loss": 0.8034513592720032 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.887584385089522, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.000813797488852016, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5459392.0, "repeat_count": 0.0, "routers_loss": 0.00038578867679461837, "skip_count": 0.0, "step": 3384, "text_loss": 0.6940088868141174 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.896976812444967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0008135564577514458, "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 5462413.0, "repeat_count": 0.0, "routers_loss": 0.0019727381877601147, "skip_count": 0.0, "step": 3386, "text_loss": 0.5124650597572327 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.099609375, "learning_rate": 0.0008133153064954495, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 5465552.0, "repeat_count": 0.0, "routers_loss": 0.0019896167796105146, "skip_count": 0.0, "step": 3388, "text_loss": 0.4292517900466919 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.915761667155856, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0008130740351764367, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 5468573.0, "repeat_count": 1.0, "routers_loss": 0.0030118159484118223, "skip_count": 1.0, "step": 3390, "text_loss": 0.48903173208236694 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.925154094511301, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0216064453125, "learning_rate": 0.000812832643886863, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5471547.0, "repeat_count": 0.0, "routers_loss": 0.005084246397018433, "skip_count": 2.0, "step": 3392, "text_loss": 0.35789889097213745 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.0008125911327192299, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5474331.0, "repeat_count": 0.0, "routers_loss": 0.0008874498889781535, "skip_count": 0.0, "step": 3394, "text_loss": 0.6267408728599548 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0008123495017660851, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5477633.0, "repeat_count": 0.0, "routers_loss": 0.001794386887922883, "skip_count": 0.0, "step": 3396, "text_loss": 0.3701885938644409 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0008121077511200221, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5481277.0, "repeat_count": 0.0, "routers_loss": 0.002140481723472476, "skip_count": 0.0, "step": 3398, "text_loss": 0.6362857818603516 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.962723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0556640625, "learning_rate": 0.00081186588087368, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 5484237.0, "repeat_count": 0.0, "routers_loss": 0.000867189432028681, "skip_count": 0.0, "step": 3400, "text_loss": 1.0847382545471191 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0008116238911197442, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5487423.0, "repeat_count": 0.0, "routers_loss": 0.0029817656613886356, "skip_count": 0.0, "step": 3402, "text_loss": 0.3813740313053131 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.0008113817819509454, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5490155.0, "repeat_count": 0.0, "routers_loss": 0.0035141287371516228, "skip_count": 0.0, "step": 3404, "text_loss": 0.2113083451986313 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0008111395534600603, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5493415.0, "repeat_count": 0.0, "routers_loss": 0.003317659953609109, "skip_count": 0.0, "step": 3406, "text_loss": 0.5869330167770386 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.052001953125, "learning_rate": 0.0008108972057399114, "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 5496032.0, "repeat_count": 0.0, "routers_loss": 0.003833734430372715, "skip_count": 2.0, "step": 3408, "text_loss": 0.2938928008079529 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11328125, "learning_rate": 0.0008106547388833669, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5498890.0, "repeat_count": 0.0, "routers_loss": 0.002622978063300252, "skip_count": 1.0, "step": 3410, "text_loss": 0.3130980432033539 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.01878485471089, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0008104121529833402, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5502010.0, "repeat_count": 1.0, "routers_loss": 0.007447598036378622, "skip_count": 0.0, "step": 3412, "text_loss": 0.4413072466850281 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.000810169448132791, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5505212.0, "repeat_count": 0.0, "routers_loss": 0.0031087708193808794, "skip_count": 1.0, "step": 3414, "text_loss": 0.2910428047180176 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.037569709421778, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0008099266244247243, "loss": 0.0082, "macro_f1": 0.3272727429866791, "num_tokens": 5508755.0, "repeat_count": 0.0, "routers_loss": 0.02510393038392067, "skip_count": 1.0, "step": 3416, "text_loss": 0.33022749423980713 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008096836819521903, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5512034.0, "repeat_count": 0.0, "routers_loss": 0.0020537273958325386, "skip_count": 1.0, "step": 3418, "text_loss": 0.4731218218803406 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0008094406208082853, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5515707.0, "repeat_count": 0.0, "routers_loss": 0.004218162503093481, "skip_count": 2.0, "step": 3420, "text_loss": 0.23429590463638306 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 16.065746991488112, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0869140625, "learning_rate": 0.0008091974410861507, "loss": 0.0069, "macro_f1": 0.9265305995941162, "num_tokens": 5518436.0, "repeat_count": 1.0, "routers_loss": 0.013488355092704296, "skip_count": 3.0, "step": 3422, "text_loss": 0.45768749713897705 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.07513941884356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0008089541428789733, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5522368.0, "repeat_count": 0.0, "routers_loss": 0.0010335417464375496, "skip_count": 1.0, "step": 3424, "text_loss": 0.43423423171043396 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0008087107262799855, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 5526061.0, "repeat_count": 0.0, "routers_loss": 0.002134323585778475, "skip_count": 0.0, "step": 3426, "text_loss": 0.4031757414340973 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1318359375, "learning_rate": 0.0008084671913824651, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5529284.0, "repeat_count": 0.0, "routers_loss": 0.0097216060385108, "skip_count": 2.0, "step": 3428, "text_loss": 0.2836039960384369 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 0.000808223538279735, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5532159.0, "repeat_count": 0.0, "routers_loss": 0.001684269867837429, "skip_count": 0.0, "step": 3430, "text_loss": 0.5804527401924133 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.112709128265337, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0008079797670651637, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 5536050.0, "repeat_count": 1.0, "routers_loss": 0.013918434269726276, "skip_count": 1.0, "step": 3432, "text_loss": 0.31325826048851013 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0008077358778321647, "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 5538885.0, "repeat_count": 0.0, "routers_loss": 0.0007751787197776139, "skip_count": 0.0, "step": 3434, "text_loss": 0.783108115196228 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.131493982976224, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0008074918706741966, "loss": 0.0063, "macro_f1": 0.9262410998344421, "num_tokens": 5541909.0, "repeat_count": 3.0, "routers_loss": 0.021819550544023514, "skip_count": 2.0, "step": 3436, "text_loss": 0.6558083295822144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.14088641033167, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0008072477456847638, "loss": 0.0057, "macro_f1": 0.3272727429866791, "num_tokens": 5545101.0, "repeat_count": 1.0, "routers_loss": 0.03309348225593567, "skip_count": 0.0, "step": 3438, "text_loss": 0.9877075552940369 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.150278837687114, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04931640625, "learning_rate": 0.0008070035029574151, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 5548971.0, "repeat_count": 1.0, "routers_loss": 0.008696741424500942, "skip_count": 1.0, "step": 3440, "text_loss": 0.24766330420970917 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 16.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.000806759142585745, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 5552174.0, "repeat_count": 0.0, "routers_loss": 0.004240929149091244, "skip_count": 3.0, "step": 3442, "text_loss": 0.37255001068115234 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05322265625, "learning_rate": 0.0008065146646633927, "loss": 0.0088, "macro_f1": 0.6666666865348816, "num_tokens": 5555005.0, "repeat_count": 0.0, "routers_loss": 0.014345484785735607, "skip_count": 1.0, "step": 3444, "text_loss": 0.26157206296920776 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.17845611975345, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06005859375, "learning_rate": 0.0008062700692840428, "loss": 0.0083, "macro_f1": 1.0, "num_tokens": 5559127.0, "repeat_count": 1.0, "routers_loss": 0.008315163664519787, "skip_count": 2.0, "step": 3446, "text_loss": 0.21971040964126587 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 16.187848547108892, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.056396484375, "learning_rate": 0.0008060253565414246, "loss": 0.009, "macro_f1": 0.5934640765190125, "num_tokens": 5562254.0, "repeat_count": 0.0, "routers_loss": 0.009582413360476494, "skip_count": 3.0, "step": 3448, "text_loss": 0.6758295893669128 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.19724097446434, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0008057805265293124, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5565515.0, "repeat_count": 0.0, "routers_loss": 0.002429503947496414, "skip_count": 0.0, "step": 3450, "text_loss": 0.696592390537262 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0008055355793415257, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5568392.0, "repeat_count": 0.0, "routers_loss": 0.0007724192109890282, "skip_count": 0.0, "step": 3452, "text_loss": 0.7092870473861694 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008052905150719285, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 5571090.0, "repeat_count": 0.0, "routers_loss": 0.0010859938338398933, "skip_count": 0.0, "step": 3454, "text_loss": 0.6593860387802124 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.225418256530673, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008050453338144301, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 5574552.0, "repeat_count": 1.0, "routers_loss": 0.0030258705373853445, "skip_count": 1.0, "step": 3456, "text_loss": 0.3479384481906891 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 0.0008048000356629844, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 5577484.0, "repeat_count": 0.0, "routers_loss": 0.005052885971963406, "skip_count": 2.0, "step": 3458, "text_loss": 0.21858671307563782 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.24420311124156, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029541015625, "learning_rate": 0.0008045546207115901, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 5581605.0, "repeat_count": 1.0, "routers_loss": 0.009976249188184738, "skip_count": 3.0, "step": 3460, "text_loss": 0.16868001222610474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0008043090890542904, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5584994.0, "repeat_count": 0.0, "routers_loss": 0.00270817126147449, "skip_count": 0.0, "step": 3462, "text_loss": 0.785690426826477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0008040634407851739, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5588067.0, "repeat_count": 0.0, "routers_loss": 0.0018436965765431523, "skip_count": 0.0, "step": 3464, "text_loss": 0.5006644129753113 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.0008038176759983731, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5590789.0, "repeat_count": 0.0, "routers_loss": 0.008516279980540276, "skip_count": 2.0, "step": 3466, "text_loss": 0.20963478088378906 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.281772820663342, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0008035717947880659, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 5593472.0, "repeat_count": 0.0, "routers_loss": 0.0016293043736368418, "skip_count": 0.0, "step": 3468, "text_loss": 0.7376078963279724 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.0008033257972484742, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5596108.0, "repeat_count": 0.0, "routers_loss": 0.002364142332226038, "skip_count": 0.0, "step": 3470, "text_loss": 0.5156455039978027 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0008030796834738649, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5599103.0, "repeat_count": 0.0, "routers_loss": 0.008872323669493198, "skip_count": 0.0, "step": 3472, "text_loss": 0.2996419668197632 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 16.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043701171875, "learning_rate": 0.0008028334535585491, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5602410.0, "repeat_count": 0.0, "routers_loss": 0.011508257128298283, "skip_count": 3.0, "step": 3474, "text_loss": 0.25438693165779114 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.31934253008512, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.0008025871075968827, "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 5605424.0, "repeat_count": 2.0, "routers_loss": 0.017225435003638268, "skip_count": 2.0, "step": 3476, "text_loss": 0.2549574077129364 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.328734957440563, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0008023406456832657, "loss": 0.0111, "macro_f1": 0.9262410998344421, "num_tokens": 5608266.0, "repeat_count": 3.0, "routers_loss": 0.039165645837783813, "skip_count": 2.0, "step": 3478, "text_loss": 0.1797947734594345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0008020940679121429, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5611471.0, "repeat_count": 0.0, "routers_loss": 0.0009718866203911602, "skip_count": 0.0, "step": 3480, "text_loss": 0.8267702460289001 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0008018473743780036, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5615046.0, "repeat_count": 0.0, "routers_loss": 0.006087122485041618, "skip_count": 2.0, "step": 3482, "text_loss": 0.7267677187919617 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.000801600565175381, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5618350.0, "repeat_count": 0.0, "routers_loss": 0.0007539413054473698, "skip_count": 0.0, "step": 3484, "text_loss": 0.5910211801528931 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046142578125, "learning_rate": 0.0008013536403988529, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 5621381.0, "repeat_count": 0.0, "routers_loss": 0.0008076327503658831, "skip_count": 0.0, "step": 3486, "text_loss": 0.30616798996925354 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 16.375697094217788, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.049072265625, "learning_rate": 0.0008011066001430412, "loss": 0.0086, "macro_f1": 0.6122449040412903, "num_tokens": 5624617.0, "repeat_count": 0.0, "routers_loss": 0.023835813626646996, "skip_count": 4.0, "step": 3488, "text_loss": 0.3376443088054657 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.38508952157323, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0008008594445026122, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 5627989.0, "repeat_count": 0.0, "routers_loss": 0.004226419143378735, "skip_count": 2.0, "step": 3490, "text_loss": 0.8185343146324158 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.394481948928675, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0008006121735722767, "loss": 0.0084, "macro_f1": 0.32098764181137085, "num_tokens": 5632286.0, "repeat_count": 0.0, "routers_loss": 0.0366671048104763, "skip_count": 2.0, "step": 3492, "text_loss": 0.2209547609090805 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.403874376284122, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0008003647874467892, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 5635368.0, "repeat_count": 1.0, "routers_loss": 0.012956378981471062, "skip_count": 0.0, "step": 3494, "text_loss": 0.20468664169311523 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.413266803639566, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.0008001172862209485, "loss": 0.0103, "macro_f1": 0.6666666865348816, "num_tokens": 5638440.0, "repeat_count": 1.0, "routers_loss": 0.0017375422175973654, "skip_count": 0.0, "step": 3496, "text_loss": 0.6647221446037292 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 16.42265923099501, "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 0.0007998696699895976, "loss": 0.0091, "macro_f1": 0.6592592597007751, "num_tokens": 5641996.0, "repeat_count": 1.0, "routers_loss": 0.025240756571292877, "skip_count": 5.0, "step": 3498, "text_loss": 0.23892143368721008 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021728515625, "learning_rate": 0.0007996219388476236, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5645071.0, "repeat_count": 0.0, "routers_loss": 0.007436830550432205, "skip_count": 1.0, "step": 3500, "text_loss": 0.7580804228782654 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0007993740928899571, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 5648175.0, "repeat_count": 0.0, "routers_loss": 0.001126602990552783, "skip_count": 0.0, "step": 3502, "text_loss": 0.5281378626823425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0007991261322115737, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5650973.0, "repeat_count": 0.0, "routers_loss": 0.0007907263352535665, "skip_count": 0.0, "step": 3504, "text_loss": 0.25220927596092224 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.46022894041679, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.000798878056907492, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 5654252.0, "repeat_count": 2.0, "routers_loss": 0.006263538729399443, "skip_count": 2.0, "step": 3506, "text_loss": 0.46569153666496277 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 16.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0703125, "learning_rate": 0.0007986298670727752, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 5657229.0, "repeat_count": 0.0, "routers_loss": 0.004049144219607115, "skip_count": 3.0, "step": 3508, "text_loss": 0.15174436569213867 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 16.479013795127678, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0791015625, "learning_rate": 0.0007983815628025301, "loss": 0.0074, "macro_f1": 0.9262410998344421, "num_tokens": 5659974.0, "repeat_count": 2.0, "routers_loss": 0.0471976138651371, "skip_count": 3.0, "step": 3510, "text_loss": 0.39072203636169434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.488406222483125, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.000798133144191907, "loss": 0.0082, "macro_f1": 0.3272727429866791, "num_tokens": 5662893.0, "repeat_count": 0.0, "routers_loss": 0.04030488431453705, "skip_count": 1.0, "step": 3512, "text_loss": 0.3562147617340088 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0595703125, "learning_rate": 0.0007978846113361009, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5666476.0, "repeat_count": 0.0, "routers_loss": 0.007475079502910376, "skip_count": 1.0, "step": 3514, "text_loss": 0.26518192887306213 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044189453125, "learning_rate": 0.0007976359643303497, "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 5669647.0, "repeat_count": 0.0, "routers_loss": 0.00558585487306118, "skip_count": 2.0, "step": 3516, "text_loss": 0.29284560680389404 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.516583504549455, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0007973872032699354, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 5673491.0, "repeat_count": 1.0, "routers_loss": 0.0026981087867170572, "skip_count": 1.0, "step": 3518, "text_loss": 0.35089045763015747 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.525975931904902, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.000797138328250184, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 5676529.0, "repeat_count": 1.0, "routers_loss": 0.0027328627184033394, "skip_count": 0.0, "step": 3520, "text_loss": 0.41077399253845215 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 16.535368359260346, "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0007968893393664646, "loss": 0.01, "macro_f1": 0.6592592597007751, "num_tokens": 5679987.0, "repeat_count": 1.0, "routers_loss": 0.02695014327764511, "skip_count": 5.0, "step": 3522, "text_loss": 0.44942837953567505 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007966402367141903, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 5683185.0, "repeat_count": 0.0, "routers_loss": 0.00817026849836111, "skip_count": 2.0, "step": 3524, "text_loss": 0.14528048038482666 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0007963910203888176, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 5686544.0, "repeat_count": 0.0, "routers_loss": 0.0021973433904349804, "skip_count": 0.0, "step": 3526, "text_loss": 0.22358648478984833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.56354564132668, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0007961416904858469, "loss": 0.0078, "macro_f1": 0.3272727429866791, "num_tokens": 5689579.0, "repeat_count": 0.0, "routers_loss": 0.033712416887283325, "skip_count": 1.0, "step": 3528, "text_loss": 0.3083649277687073 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0007958922471008217, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5692869.0, "repeat_count": 0.0, "routers_loss": 0.011182719841599464, "skip_count": 2.0, "step": 3530, "text_loss": 0.21288011968135834 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0007956426903293292, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5696007.0, "repeat_count": 0.0, "routers_loss": 0.0015808293828740716, "skip_count": 0.0, "step": 3532, "text_loss": 0.6068631410598755 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.591722923393014, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0007953930202670001, "loss": 0.0062, "macro_f1": 0.5492662787437439, "num_tokens": 5699474.0, "repeat_count": 2.0, "routers_loss": 0.03205178305506706, "skip_count": 0.0, "step": 3534, "text_loss": 0.4317135512828827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.601115350748458, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0007951432370095084, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 5703483.0, "repeat_count": 0.0, "routers_loss": 0.003518853336572647, "skip_count": 0.0, "step": 3536, "text_loss": 0.5432273149490356 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.610507778103905, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11083984375, "learning_rate": 0.0007948933406525715, "loss": 0.01, "macro_f1": 1.0, "num_tokens": 5707301.0, "repeat_count": 1.0, "routers_loss": 0.004982157610356808, "skip_count": 1.0, "step": 3538, "text_loss": 0.40061065554618835 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.61990020545935, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0751953125, "learning_rate": 0.0007946433312919502, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5710847.0, "repeat_count": 0.0, "routers_loss": 0.003067734418436885, "skip_count": 0.0, "step": 3540, "text_loss": 0.5396234393119812 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 16.629292632814792, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05224609375, "learning_rate": 0.0007943932090234486, "loss": 0.0097, "macro_f1": 0.5492662787437439, "num_tokens": 5713683.0, "repeat_count": 0.0, "routers_loss": 0.03728383034467697, "skip_count": 2.0, "step": 3542, "text_loss": 0.18310914933681488 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 16.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007941429739429138, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 5716397.0, "repeat_count": 0.0, "routers_loss": 0.0025092530995607376, "skip_count": 3.0, "step": 3544, "text_loss": 0.5806207060813904 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007938926261462366, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5719984.0, "repeat_count": 0.0, "routers_loss": 0.002493767999112606, "skip_count": 0.0, "step": 3546, "text_loss": 0.38606807589530945 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 16.657469914881126, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05078125, "learning_rate": 0.0007936421657293507, "loss": 0.0094, "macro_f1": 0.8823530077934265, "num_tokens": 5723571.0, "repeat_count": 1.0, "routers_loss": 0.014810923486948013, "skip_count": 2.0, "step": 3548, "text_loss": 0.49558472633361816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.666862342236573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.0007933915927882327, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5726405.0, "repeat_count": 0.0, "routers_loss": 0.00152928801253438, "skip_count": 0.0, "step": 3550, "text_loss": 0.8674797415733337 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.000793140907418903, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5729955.0, "repeat_count": 0.0, "routers_loss": 0.005522782914340496, "skip_count": 2.0, "step": 3552, "text_loss": 0.3274473249912262 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.0007928901097174248, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5733030.0, "repeat_count": 0.0, "routers_loss": 0.009207013063132763, "skip_count": 2.0, "step": 3554, "text_loss": 0.18237128853797913 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0007926391997799039, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5735978.0, "repeat_count": 0.0, "routers_loss": 0.00695531303063035, "skip_count": 0.0, "step": 3556, "text_loss": 0.3266434967517853 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05419921875, "learning_rate": 0.0007923881777024898, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5738901.0, "repeat_count": 0.0, "routers_loss": 0.002743212040513754, "skip_count": 1.0, "step": 3558, "text_loss": 0.4971913695335388 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.713824479013795, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04931640625, "learning_rate": 0.0007921370435813741, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5741946.0, "repeat_count": 1.0, "routers_loss": 0.007037297356873751, "skip_count": 0.0, "step": 3560, "text_loss": 0.5645473599433899 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.0007918857975127924, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5744987.0, "repeat_count": 0.0, "routers_loss": 0.0030746585689485073, "skip_count": 0.0, "step": 3562, "text_loss": 0.17717665433883667 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0007916344395930224, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 5747837.0, "repeat_count": 0.0, "routers_loss": 0.004522138275206089, "skip_count": 0.0, "step": 3564, "text_loss": 0.7676118612289429 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.000791382969918385, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5750716.0, "repeat_count": 0.0, "routers_loss": 0.0026240211445838213, "skip_count": 0.0, "step": 3566, "text_loss": 0.4975173771381378 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.751394188435572, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.000791131388585244, "loss": 0.011, "macro_f1": 0.8820862174034119, "num_tokens": 5754368.0, "repeat_count": 2.0, "routers_loss": 0.021831991150975227, "skip_count": 2.0, "step": 3568, "text_loss": 0.9670342206954956 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.76078661579102, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0007908796956900055, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5757076.0, "repeat_count": 1.0, "routers_loss": 0.0017586691537871957, "skip_count": 0.0, "step": 3570, "text_loss": 0.3057977259159088 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.000790627891329119, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5760613.0, "repeat_count": 0.0, "routers_loss": 0.005515786819159985, "skip_count": 0.0, "step": 3572, "text_loss": 0.5860086679458618 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0007903759755990763, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 5763557.0, "repeat_count": 0.0, "routers_loss": 0.004096484277397394, "skip_count": 0.0, "step": 3574, "text_loss": 0.17175781726837158 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.788963897857354, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04541015625, "learning_rate": 0.000790123948596412, "loss": 0.0119, "macro_f1": 0.6666666865348816, "num_tokens": 5767430.0, "repeat_count": 1.0, "routers_loss": 0.005216122139245272, "skip_count": 0.0, "step": 3576, "text_loss": 0.7520374059677124 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.0007898718104177031, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 5770175.0, "repeat_count": 0.0, "routers_loss": 0.0037980107590556145, "skip_count": 0.0, "step": 3578, "text_loss": 0.18117885291576385 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007896195611595699, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5773032.0, "repeat_count": 0.0, "routers_loss": 0.003672175807878375, "skip_count": 2.0, "step": 3580, "text_loss": 0.7241058349609375 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.817141179923688, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0615234375, "learning_rate": 0.0007893672009186744, "loss": 0.0083, "macro_f1": 1.0, "num_tokens": 5776077.0, "repeat_count": 1.0, "routers_loss": 0.01229850109666586, "skip_count": 3.0, "step": 3582, "text_loss": 0.29140418767929077 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.82653360727913, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007891147297917216, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5779088.0, "repeat_count": 1.0, "routers_loss": 0.0035251814406365156, "skip_count": 0.0, "step": 3584, "text_loss": 0.1727485954761505 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.055908203125, "learning_rate": 0.000788862147875459, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 5782201.0, "repeat_count": 0.0, "routers_loss": 0.004725661128759384, "skip_count": 2.0, "step": 3586, "text_loss": 0.43512848019599915 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06396484375, "learning_rate": 0.0007886094552666765, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5785039.0, "repeat_count": 0.0, "routers_loss": 0.005632172804325819, "skip_count": 0.0, "step": 3588, "text_loss": 0.3534786105155945 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0556640625, "learning_rate": 0.0007883566520622062, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 5788017.0, "repeat_count": 0.0, "routers_loss": 0.006249965168535709, "skip_count": 1.0, "step": 3590, "text_loss": 0.2089710384607315 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.0007881037383589229, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 5791168.0, "repeat_count": 0.0, "routers_loss": 0.0013797614956274629, "skip_count": 0.0, "step": 3592, "text_loss": 0.4349329471588135 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.0007878507142537436, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 5793927.0, "repeat_count": 0.0, "routers_loss": 0.0019719740375876427, "skip_count": 1.0, "step": 3594, "text_loss": 0.6087368726730347 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.8828881714118, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.0007875975798436274, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 5797214.0, "repeat_count": 1.0, "routers_loss": 0.0037070370744913816, "skip_count": 0.0, "step": 3596, "text_loss": 0.4258122444152832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048583984375, "learning_rate": 0.0007873443352255764, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5800691.0, "repeat_count": 0.0, "routers_loss": 0.008431311696767807, "skip_count": 0.0, "step": 3598, "text_loss": 0.6006711721420288 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055419921875, "learning_rate": 0.0007870909804966337, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5804712.0, "repeat_count": 0.0, "routers_loss": 0.0017720256000757217, "skip_count": 0.0, "step": 3600, "text_loss": 0.6055042743682861 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.911065453478134, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0007868375157538861, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 5807670.0, "repeat_count": 1.0, "routers_loss": 0.010697763413190842, "skip_count": 0.0, "step": 3602, "text_loss": 0.8039056658744812 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.920457880833577, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0007865839410944611, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5810880.0, "repeat_count": 1.0, "routers_loss": 0.0030022128485143185, "skip_count": 0.0, "step": 3604, "text_loss": 0.596110463142395 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0007863302566155295, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5814171.0, "repeat_count": 0.0, "routers_loss": 0.006257854867726564, "skip_count": 2.0, "step": 3606, "text_loss": 0.5700319409370422 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.939242735544468, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 0.0007860764624143031, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5817607.0, "repeat_count": 1.0, "routers_loss": 0.004838473163545132, "skip_count": 0.0, "step": 3608, "text_loss": 0.8319530487060547 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 16.94863516289991, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.08154296875, "learning_rate": 0.0007858225585880369, "loss": 0.0067, "macro_f1": 0.8823530077934265, "num_tokens": 5821452.0, "repeat_count": 1.0, "routers_loss": 0.02173662930727005, "skip_count": 2.0, "step": 3610, "text_loss": 0.3738477826118469 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007855685452340269, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5824683.0, "repeat_count": 0.0, "routers_loss": 0.0032719180453568697, "skip_count": 0.0, "step": 3612, "text_loss": 0.4054839015007019 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.967420017610802, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0007853144224496118, "loss": 0.0093, "macro_f1": 0.3272727429866791, "num_tokens": 5827860.0, "repeat_count": 1.0, "routers_loss": 0.032171256840229034, "skip_count": 0.0, "step": 3614, "text_loss": 0.18112395703792572 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0458984375, "learning_rate": 0.0007850601903321716, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5831651.0, "repeat_count": 0.0, "routers_loss": 0.013230946846306324, "skip_count": 1.0, "step": 3616, "text_loss": 0.2698844075202942 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.000784805848979129, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5834369.0, "repeat_count": 0.0, "routers_loss": 0.00162619655020535, "skip_count": 0.0, "step": 3618, "text_loss": 0.2430931180715561 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.995597299677137, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0007845513984879477, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5838102.0, "repeat_count": 1.0, "routers_loss": 0.002781603019684553, "skip_count": 0.0, "step": 3620, "text_loss": 0.4968300759792328 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0007842968389561337, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5841029.0, "repeat_count": 0.0, "routers_loss": 0.0023873315658420324, "skip_count": 0.0, "step": 3622, "text_loss": 0.5842974781990051 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0007840421704812346, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 5845158.0, "repeat_count": 0.0, "routers_loss": 0.00400173757225275, "skip_count": 1.0, "step": 3624, "text_loss": 0.8312450647354126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.00078378739316084, "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 5849175.0, "repeat_count": 0.0, "routers_loss": 0.0004974664188921452, "skip_count": 0.0, "step": 3626, "text_loss": 0.48637253046035767 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 25.0, "epoch": 17.032873495744056, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, "grad_norm": 0.10693359375, "learning_rate": 0.000783532507092581, "loss": 0.0079, "macro_f1": 0.9555556178092957, "num_tokens": 5852020.0, "repeat_count": 1.0, "routers_loss": 0.02555239573121071, "skip_count": 5.0, "step": 3628, "text_loss": 0.5407033562660217 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0007832775123741306, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5854873.0, "repeat_count": 0.0, "routers_loss": 0.0025962977670133114, "skip_count": 0.0, "step": 3630, "text_loss": 0.618230938911438 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.000783022409103203, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5858086.0, "repeat_count": 0.0, "routers_loss": 0.0029271875973790884, "skip_count": 0.0, "step": 3632, "text_loss": 0.21259798109531403 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0007827671973775542, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 5860886.0, "repeat_count": 0.0, "routers_loss": 0.004102068953216076, "skip_count": 0.0, "step": 3634, "text_loss": 0.4991208016872406 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.070443205165834, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0007825118772949819, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5864291.0, "repeat_count": 0.0, "routers_loss": 0.0023497689981013536, "skip_count": 1.0, "step": 3636, "text_loss": 0.3878401517868042 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0007822564489533255, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 5867155.0, "repeat_count": 0.0, "routers_loss": 0.007680345326662064, "skip_count": 2.0, "step": 3638, "text_loss": 0.6132124066352844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.053466796875, "learning_rate": 0.0007820009124504653, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5870325.0, "repeat_count": 0.0, "routers_loss": 0.0008242831099778414, "skip_count": 0.0, "step": 3640, "text_loss": 0.3552473187446594 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.098620487232168, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0007817452678843236, "loss": 0.0073, "macro_f1": 0.6601307392120361, "num_tokens": 5873301.0, "repeat_count": 1.0, "routers_loss": 0.023831043392419815, "skip_count": 2.0, "step": 3642, "text_loss": 0.18363867700099945 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.0007814895153528635, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5876225.0, "repeat_count": 0.0, "routers_loss": 0.001999989850446582, "skip_count": 0.0, "step": 3644, "text_loss": 0.17581747472286224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0007812336549540903, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5879501.0, "repeat_count": 0.0, "routers_loss": 0.001098626758903265, "skip_count": 0.0, "step": 3646, "text_loss": 0.5040884613990784 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.126797769298502, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0007809776867860499, "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 5882608.0, "repeat_count": 0.0, "routers_loss": 0.012210183776915073, "skip_count": 1.0, "step": 3648, "text_loss": 0.27114811539649963 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00078072161094683, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 5886106.0, "repeat_count": 0.0, "routers_loss": 0.005191771313548088, "skip_count": 2.0, "step": 3650, "text_loss": 0.5167917609214783 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0007804654275345591, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5889122.0, "repeat_count": 0.0, "routers_loss": 0.0016411367105320096, "skip_count": 1.0, "step": 3652, "text_loss": 0.7691274285316467 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 17.154975051364836, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0007802091366474074, "loss": 0.005, "macro_f1": 0.8823530077934265, "num_tokens": 5892313.0, "repeat_count": 2.0, "routers_loss": 0.015627093613147736, "skip_count": 1.0, "step": 3654, "text_loss": 0.4646325409412384 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0007799527383835858, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5895577.0, "repeat_count": 0.0, "routers_loss": 0.0009879748104140162, "skip_count": 0.0, "step": 3656, "text_loss": 0.5587969422340393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0007796962328413469, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5898546.0, "repeat_count": 0.0, "routers_loss": 0.004864919930696487, "skip_count": 0.0, "step": 3658, "text_loss": 0.6981375813484192 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.18315233343117, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0007794396201189839, "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 5901618.0, "repeat_count": 1.0, "routers_loss": 0.006617432460188866, "skip_count": 2.0, "step": 3660, "text_loss": 0.22521957755088806 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.192544760786618, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0007791829003148312, "loss": 0.0098, "macro_f1": 0.6601307392120361, "num_tokens": 5904540.0, "repeat_count": 1.0, "routers_loss": 0.0782252699136734, "skip_count": 2.0, "step": 3662, "text_loss": 0.2649642825126648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0007789260735272647, "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 5907827.0, "repeat_count": 0.0, "routers_loss": 0.0012057392159476876, "skip_count": 0.0, "step": 3664, "text_loss": 0.6943771243095398 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.018310546875, "learning_rate": 0.0007786691398547005, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 5911163.0, "repeat_count": 0.0, "routers_loss": 0.007476957980543375, "skip_count": 2.0, "step": 3666, "text_loss": 0.1502683162689209 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.220722042852948, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0007784120993955962, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5913948.0, "repeat_count": 1.0, "routers_loss": 0.004082011990249157, "skip_count": 0.0, "step": 3668, "text_loss": 0.4127517640590668 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 17.230114470208395, "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 0.0007781549522484503, "loss": 0.0066, "macro_f1": 0.9265305995941162, "num_tokens": 5917360.0, "repeat_count": 3.0, "routers_loss": 0.027505695819854736, "skip_count": 1.0, "step": 3670, "text_loss": 0.23892618715763092 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0007778976985118018, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 5920524.0, "repeat_count": 0.0, "routers_loss": 0.0024977331049740314, "skip_count": 2.0, "step": 3672, "text_loss": 0.5076471567153931 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0007776403382842312, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5923632.0, "repeat_count": 0.0, "routers_loss": 0.0015700991498306394, "skip_count": 0.0, "step": 3674, "text_loss": 0.6287924647331238 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.25829175227473, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05810546875, "learning_rate": 0.0007773828716643591, "loss": 0.0085, "macro_f1": 0.3272727429866791, "num_tokens": 5926438.0, "repeat_count": 1.0, "routers_loss": 0.05108916014432907, "skip_count": 0.0, "step": 3676, "text_loss": 0.26517006754875183 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007771252987508474, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5930081.0, "repeat_count": 0.0, "routers_loss": 0.003439917229115963, "skip_count": 0.0, "step": 3678, "text_loss": 0.5189079642295837 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 17.277076606985617, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.056884765625, "learning_rate": 0.0007768676196423984, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 5933463.0, "repeat_count": 1.0, "routers_loss": 0.001935846172273159, "skip_count": 1.0, "step": 3680, "text_loss": 0.6703575849533081 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 17.286469034341064, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007766098344377553, "loss": 0.0082, "macro_f1": 0.31446540355682373, "num_tokens": 5937098.0, "repeat_count": 0.0, "routers_loss": 0.0384826585650444, "skip_count": 2.0, "step": 3682, "text_loss": 0.6424444913864136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0007763519432357018, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 5940436.0, "repeat_count": 0.0, "routers_loss": 0.0008654671837575734, "skip_count": 0.0, "step": 3684, "text_loss": 0.4189988672733307 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0007760939461350623, "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 5943731.0, "repeat_count": 0.0, "routers_loss": 0.007468715775758028, "skip_count": 2.0, "step": 3686, "text_loss": 0.2875453233718872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.314646316407398, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0007758358432347019, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5946707.0, "repeat_count": 0.0, "routers_loss": 0.001252831774763763, "skip_count": 0.0, "step": 3688, "text_loss": 0.5093055367469788 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007755776346335259, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5949833.0, "repeat_count": 0.0, "routers_loss": 0.001680848654359579, "skip_count": 0.0, "step": 3690, "text_loss": 0.4031114876270294 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0007753193204304807, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 5953095.0, "repeat_count": 0.0, "routers_loss": 0.0047258250415325165, "skip_count": 2.0, "step": 3692, "text_loss": 0.17632785439491272 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.342823598473732, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0007750609007245524, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 5955971.0, "repeat_count": 2.0, "routers_loss": 0.001980359200388193, "skip_count": 4.0, "step": 3694, "text_loss": 0.3423727750778198 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0007748023756147679, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 5958948.0, "repeat_count": 0.0, "routers_loss": 0.00511702848598361, "skip_count": 0.0, "step": 3696, "text_loss": 0.28279972076416016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007745437452001949, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5961819.0, "repeat_count": 0.0, "routers_loss": 0.0005220443126745522, "skip_count": 0.0, "step": 3698, "text_loss": 0.4793325662612915 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.371000880540066, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0007742850095799408, "loss": 0.0084, "macro_f1": 0.3272727429866791, "num_tokens": 5964625.0, "repeat_count": 1.0, "routers_loss": 0.06411020457744598, "skip_count": 0.0, "step": 3700, "text_loss": 0.2825184464454651 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0007740261688531536, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 5967134.0, "repeat_count": 0.0, "routers_loss": 0.004408109001815319, "skip_count": 3.0, "step": 3702, "text_loss": 0.690429151058197 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0007737672231190215, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 5969831.0, "repeat_count": 0.0, "routers_loss": 0.0006747521692886949, "skip_count": 0.0, "step": 3704, "text_loss": 0.32556024193763733 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.399178162606397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0007735081724767732, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5973015.0, "repeat_count": 0.0, "routers_loss": 0.0020414739847183228, "skip_count": 0.0, "step": 3706, "text_loss": 0.5876469612121582 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.408570589961844, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.072265625, "learning_rate": 0.0007732490170256769, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 5975778.0, "repeat_count": 1.0, "routers_loss": 0.005610425490885973, "skip_count": 0.0, "step": 3708, "text_loss": 0.2968577444553375 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.0007729897568650422, "loss": 0.0097, "macro_f1": 0.3333333432674408, "num_tokens": 5979115.0, "repeat_count": 0.0, "routers_loss": 0.001248046406544745, "skip_count": 0.0, "step": 3710, "text_loss": 0.626361608505249 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06787109375, "learning_rate": 0.0007727303920942176, "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 5982213.0, "repeat_count": 0.0, "routers_loss": 0.005791695322841406, "skip_count": 2.0, "step": 3712, "text_loss": 0.4133484661579132 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 17.436747872028178, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.08740234375, "learning_rate": 0.0007724709228125922, "loss": 0.0105, "macro_f1": 0.5492662787437439, "num_tokens": 5984930.0, "repeat_count": 0.0, "routers_loss": 0.02114664763212204, "skip_count": 2.0, "step": 3714, "text_loss": 0.4646461308002472 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.44614029938362, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.0007722113491195952, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 5988017.0, "repeat_count": 2.0, "routers_loss": 0.005913930479437113, "skip_count": 5.0, "step": 3716, "text_loss": 0.15474505722522736 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0007719516711146957, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5991562.0, "repeat_count": 0.0, "routers_loss": 0.0075925313867628574, "skip_count": 2.0, "step": 3718, "text_loss": 0.5293686985969543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.000771691888897403, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 5994675.0, "repeat_count": 0.0, "routers_loss": 0.0012335237115621567, "skip_count": 0.0, "step": 3720, "text_loss": 0.5210637450218201 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.0007714320025672657, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 5999070.0, "repeat_count": 0.0, "routers_loss": 0.010582062415778637, "skip_count": 2.0, "step": 3722, "text_loss": 0.2783571779727936 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.4837100088054, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.000771172012223873, "loss": 0.0078, "macro_f1": 0.6598639488220215, "num_tokens": 6002702.0, "repeat_count": 1.0, "routers_loss": 0.015008784830570221, "skip_count": 3.0, "step": 3724, "text_loss": 0.358705073595047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0007709119179668538, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6005517.0, "repeat_count": 0.0, "routers_loss": 0.00111615180503577, "skip_count": 0.0, "step": 3726, "text_loss": 0.45202162861824036 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 17.50249486351629, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0007706517198958764, "loss": 0.0096, "macro_f1": 0.6595745086669922, "num_tokens": 6009111.0, "repeat_count": 1.0, "routers_loss": 0.05215252563357353, "skip_count": 4.0, "step": 3728, "text_loss": 0.20360413193702698 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0007703914181106497, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6012989.0, "repeat_count": 0.0, "routers_loss": 0.010039499960839748, "skip_count": 3.0, "step": 3730, "text_loss": 0.20334361493587494 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.52127971822718, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0007701310127109211, "loss": 0.0062, "macro_f1": 0.3272727429866791, "num_tokens": 6016420.0, "repeat_count": 0.0, "routers_loss": 0.01090205181390047, "skip_count": 1.0, "step": 3732, "text_loss": 0.47959551215171814 }, { "acc_repeat": 0.0, "acc_skip": 0.800000011920929, "avg_layers": 24.0, "epoch": 17.530672145582624, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 0.0341796875, "learning_rate": 0.0007698705037964791, "loss": 0.0076, "macro_f1": 0.6225374937057495, "num_tokens": 6019551.0, "repeat_count": 0.0, "routers_loss": 0.02677762135863304, "skip_count": 5.0, "step": 3734, "text_loss": 0.2621438801288605 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.540064572938068, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.056640625, "learning_rate": 0.000769609891467151, "loss": 0.0119, "macro_f1": 0.6666666865348816, "num_tokens": 6022262.0, "repeat_count": 1.0, "routers_loss": 0.00460716662928462, "skip_count": 0.0, "step": 3736, "text_loss": 0.3433022201061249 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037109375, "learning_rate": 0.0007693491758228037, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6025723.0, "repeat_count": 0.0, "routers_loss": 0.0036111194640398026, "skip_count": 2.0, "step": 3738, "text_loss": 0.38703784346580505 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007690883569633442, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6028652.0, "repeat_count": 0.0, "routers_loss": 0.003299296135082841, "skip_count": 0.0, "step": 3740, "text_loss": 0.24203069508075714 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0007688274349887188, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 6032280.0, "repeat_count": 0.0, "routers_loss": 0.003173880511894822, "skip_count": 0.0, "step": 3742, "text_loss": 0.2827291488647461 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0007685664099989131, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6035111.0, "repeat_count": 0.0, "routers_loss": 0.0008576177642680705, "skip_count": 0.0, "step": 3744, "text_loss": 0.43613526225090027 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0007683052820939524, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6038428.0, "repeat_count": 0.0, "routers_loss": 0.004335585981607437, "skip_count": 2.0, "step": 3746, "text_loss": 1.0385624170303345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0007680440513739015, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6041185.0, "repeat_count": 0.0, "routers_loss": 0.0008210531086660922, "skip_count": 0.0, "step": 3748, "text_loss": 0.7070431709289551 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 17.60581156442618, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.056640625, "learning_rate": 0.0007677827179388646, "loss": 0.0089, "macro_f1": 1.0, "num_tokens": 6046333.0, "repeat_count": 1.0, "routers_loss": 0.003778942162171006, "skip_count": 1.0, "step": 3750, "text_loss": 0.3682238757610321 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 17.615203991781627, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.000767521281888985, "loss": 0.009, "macro_f1": 1.0, "num_tokens": 6049528.0, "repeat_count": 1.0, "routers_loss": 0.002767334459349513, "skip_count": 1.0, "step": 3752, "text_loss": 0.7619418501853943 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.0007672597433244455, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 6053202.0, "repeat_count": 0.0, "routers_loss": 0.004796457476913929, "skip_count": 2.0, "step": 3754, "text_loss": 0.4157083034515381 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0007669981023454682, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 6056609.0, "repeat_count": 0.0, "routers_loss": 0.0013067846884950995, "skip_count": 0.0, "step": 3756, "text_loss": 0.4529118537902832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0007667363590523142, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 6060504.0, "repeat_count": 0.0, "routers_loss": 0.0010285493917763233, "skip_count": 0.0, "step": 3758, "text_loss": 0.8363246321678162 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.055419921875, "learning_rate": 0.0007664745135452844, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 6063526.0, "repeat_count": 0.0, "routers_loss": 0.006289863493293524, "skip_count": 3.0, "step": 3760, "text_loss": 0.5313657522201538 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05517578125, "learning_rate": 0.0007662125659247183, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6067147.0, "repeat_count": 0.0, "routers_loss": 0.0028537956532090902, "skip_count": 0.0, "step": 3762, "text_loss": 0.5668109059333801 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0007659505162909949, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6070350.0, "repeat_count": 0.0, "routers_loss": 0.0026814753655344248, "skip_count": 0.0, "step": 3764, "text_loss": 0.4983512759208679 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056884765625, "learning_rate": 0.0007656883647445318, "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 6073091.0, "repeat_count": 0.0, "routers_loss": 0.005981382913887501, "skip_count": 1.0, "step": 3766, "text_loss": 0.30372318625450134 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0007654261113857863, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6076244.0, "repeat_count": 0.0, "routers_loss": 0.000803640519734472, "skip_count": 0.0, "step": 3768, "text_loss": 0.6100738048553467 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0007651637563152539, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 6078936.0, "repeat_count": 0.0, "routers_loss": 0.0013324898900464177, "skip_count": 0.0, "step": 3770, "text_loss": 0.4733821153640747 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.709128265336073, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 0.0007649012996334701, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6081951.0, "repeat_count": 1.0, "routers_loss": 0.0021543330512940884, "skip_count": 0.0, "step": 3772, "text_loss": 0.6794875860214233 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007646387414410085, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 6085165.0, "repeat_count": 0.0, "routers_loss": 0.0005426189745776355, "skip_count": 0.0, "step": 3774, "text_loss": 0.5886107683181763 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0007643760818384819, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6088370.0, "repeat_count": 0.0, "routers_loss": 0.002537576947361231, "skip_count": 0.0, "step": 3776, "text_loss": 0.23591920733451843 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0007641133209265423, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6092319.0, "repeat_count": 0.0, "routers_loss": 0.002613696036860347, "skip_count": 0.0, "step": 3778, "text_loss": 0.3217754662036896 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052978515625, "learning_rate": 0.0007638504588058796, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 6095799.0, "repeat_count": 0.0, "routers_loss": 0.0007219464750960469, "skip_count": 0.0, "step": 3780, "text_loss": 0.4276983141899109 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.0007635874955772234, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6098789.0, "repeat_count": 0.0, "routers_loss": 0.005965052172541618, "skip_count": 3.0, "step": 3782, "text_loss": 0.30936646461486816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.0007633244313413417, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6101631.0, "repeat_count": 0.0, "routers_loss": 0.0007469559786841273, "skip_count": 0.0, "step": 3784, "text_loss": 0.44460123777389526 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.774875256824185, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045654296875, "learning_rate": 0.0007630612661990412, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 6105097.0, "repeat_count": 0.0, "routers_loss": 0.004300760570913553, "skip_count": 1.0, "step": 3786, "text_loss": 0.41950157284736633 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007627980002511672, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6107847.0, "repeat_count": 0.0, "routers_loss": 0.0023050960153341293, "skip_count": 1.0, "step": 3788, "text_loss": 0.48561373353004456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0007625346335986039, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6110546.0, "repeat_count": 0.0, "routers_loss": 0.0018124044872820377, "skip_count": 0.0, "step": 3790, "text_loss": 0.20882295072078705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0007622711663422735, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6113600.0, "repeat_count": 0.0, "routers_loss": 0.0007613401976414025, "skip_count": 0.0, "step": 3792, "text_loss": 0.31751760840415955 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0400390625, "learning_rate": 0.0007620075985831375, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 6116916.0, "repeat_count": 0.0, "routers_loss": 0.005452962126582861, "skip_count": 2.0, "step": 3794, "text_loss": 0.3246645927429199 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 17.82183739360141, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0007617439304221956, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6120056.0, "repeat_count": 2.0, "routers_loss": 0.0043787881731987, "skip_count": 0.0, "step": 3796, "text_loss": 0.4859195947647095 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.0007614801619604856, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6122668.0, "repeat_count": 0.0, "routers_loss": 0.0033891722559928894, "skip_count": 0.0, "step": 3798, "text_loss": 0.48194369673728943 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.840622248312297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0007612162932990845, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6126792.0, "repeat_count": 0.0, "routers_loss": 0.001883238204754889, "skip_count": 0.0, "step": 3800, "text_loss": 0.3740062117576599 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0007609523245391068, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 6129801.0, "repeat_count": 0.0, "routers_loss": 0.00882677361369133, "skip_count": 2.0, "step": 3802, "text_loss": 0.5759486556053162 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007606882557817062, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6133613.0, "repeat_count": 0.0, "routers_loss": 0.009537030011415482, "skip_count": 2.0, "step": 3804, "text_loss": 0.3217554986476898 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 0.0007604240871280742, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6137784.0, "repeat_count": 0.0, "routers_loss": 0.0023913346230983734, "skip_count": 0.0, "step": 3806, "text_loss": 0.3718445599079132 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.878191957734078, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007601598186794407, "loss": 0.0081, "macro_f1": 0.6603773832321167, "num_tokens": 6141356.0, "repeat_count": 1.0, "routers_loss": 0.033796411007642746, "skip_count": 1.0, "step": 3808, "text_loss": 0.2717749774456024 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.000759895450537074, "loss": 0.01, "macro_f1": 0.6666666865348816, "num_tokens": 6144448.0, "repeat_count": 0.0, "routers_loss": 0.0037919918540865183, "skip_count": 2.0, "step": 3810, "text_loss": 0.5935076475143433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007596309828022803, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6147526.0, "repeat_count": 0.0, "routers_loss": 0.0008182782912626863, "skip_count": 0.0, "step": 3812, "text_loss": 0.449336439371109 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.906369239800412, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0007593664155764044, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6150620.0, "repeat_count": 1.0, "routers_loss": 0.001734903547912836, "skip_count": 0.0, "step": 3814, "text_loss": 0.6647221446037292 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.915761667155856, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0007591017489608286, "loss": 0.0088, "macro_f1": 0.3272727429866791, "num_tokens": 6153714.0, "repeat_count": 1.0, "routers_loss": 0.04721754416823387, "skip_count": 0.0, "step": 3816, "text_loss": 0.25481200218200684 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007588369830569738, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6156974.0, "repeat_count": 0.0, "routers_loss": 0.0002484306460246444, "skip_count": 0.0, "step": 3818, "text_loss": 0.7195295691490173 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.934546521866746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0007585721179662988, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6159660.0, "repeat_count": 0.0, "routers_loss": 0.0051363613456487656, "skip_count": 2.0, "step": 3820, "text_loss": 0.5073586702346802 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0007583071537903005, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6163146.0, "repeat_count": 0.0, "routers_loss": 0.006719176657497883, "skip_count": 0.0, "step": 3822, "text_loss": 0.6950558423995972 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.953331376577633, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0007580420906305136, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 6166257.0, "repeat_count": 1.0, "routers_loss": 0.00871267355978489, "skip_count": 3.0, "step": 3824, "text_loss": 0.2549148201942444 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.0007577769285885109, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 6169624.0, "repeat_count": 0.0, "routers_loss": 0.0015642556827515364, "skip_count": 0.0, "step": 3826, "text_loss": 0.3720305860042572 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0007575116677659029, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6172673.0, "repeat_count": 0.0, "routers_loss": 0.0011551049537956715, "skip_count": 0.0, "step": 3828, "text_loss": 0.6819429397583008 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0007572463082643377, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 6175414.0, "repeat_count": 0.0, "routers_loss": 0.0008922060951590538, "skip_count": 0.0, "step": 3830, "text_loss": 0.5424665212631226 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0007569808501855023, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6178701.0, "repeat_count": 0.0, "routers_loss": 0.004167596809566021, "skip_count": 1.0, "step": 3832, "text_loss": 0.4429764151573181 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04931640625, "learning_rate": 0.00075671529363112, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 6183036.0, "repeat_count": 0.0, "routers_loss": 0.0008732969872653484, "skip_count": 0.0, "step": 3834, "text_loss": 0.8015334010124207 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007564496387029531, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6186325.0, "repeat_count": 0.0, "routers_loss": 0.0021374202333390713, "skip_count": 1.0, "step": 3836, "text_loss": 0.4233771562576294 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.01878485471089, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.000756183885502801, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6189919.0, "repeat_count": 1.0, "routers_loss": 0.004017227329313755, "skip_count": 0.0, "step": 3838, "text_loss": 0.33691394329071045 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018310546875, "learning_rate": 0.0007559180341325005, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6193412.0, "repeat_count": 0.0, "routers_loss": 0.0013120946241542697, "skip_count": 0.0, "step": 3840, "text_loss": 0.14970099925994873 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 18.037569709421778, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.031982421875, "learning_rate": 0.0007556520846939265, "loss": 0.0061, "macro_f1": 0.5492662787437439, "num_tokens": 6196588.0, "repeat_count": 0.0, "routers_loss": 0.011793316341936588, "skip_count": 2.0, "step": 3842, "text_loss": 0.2714047133922577 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 18.046962136777225, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0007553860372889914, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 6200841.0, "repeat_count": 1.0, "routers_loss": 0.019968654960393906, "skip_count": 4.0, "step": 3844, "text_loss": 0.23680976033210754 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 18.05635456413267, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.052490234375, "learning_rate": 0.0007551198920196452, "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 6203797.0, "repeat_count": 0.0, "routers_loss": 0.013615630567073822, "skip_count": 2.0, "step": 3846, "text_loss": 0.25839608907699585 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0546875, "learning_rate": 0.000754853648987875, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6206790.0, "repeat_count": 0.0, "routers_loss": 0.002420815173536539, "skip_count": 1.0, "step": 3848, "text_loss": 0.5358025431632996 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 18.07513941884356, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.032470703125, "learning_rate": 0.0007545873082957057, "loss": 0.0072, "macro_f1": 0.9265305995941162, "num_tokens": 6209791.0, "repeat_count": 1.0, "routers_loss": 0.018236197531223297, "skip_count": 3.0, "step": 3850, "text_loss": 0.1463700383901596 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 18.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0007543208700451998, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6212792.0, "repeat_count": 0.0, "routers_loss": 0.006242573726922274, "skip_count": 3.0, "step": 3852, "text_loss": 0.9441591501235962 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.093924273554446, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0007540543343384565, "loss": 0.0062, "macro_f1": 0.3272727429866791, "num_tokens": 6215747.0, "repeat_count": 0.0, "routers_loss": 0.01451140083372593, "skip_count": 1.0, "step": 3854, "text_loss": 0.41610902547836304 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007537877012776132, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6218593.0, "repeat_count": 0.0, "routers_loss": 0.00037674361374229193, "skip_count": 0.0, "step": 3856, "text_loss": 0.6048852205276489 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.112709128265337, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0007535209709648439, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 6221315.0, "repeat_count": 1.0, "routers_loss": 0.005776284262537956, "skip_count": 3.0, "step": 3858, "text_loss": 0.35627537965774536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0007532541435023605, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6225012.0, "repeat_count": 0.0, "routers_loss": 0.0009280376834794879, "skip_count": 0.0, "step": 3860, "text_loss": 0.6440183520317078 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0007529872189924114, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6227650.0, "repeat_count": 0.0, "routers_loss": 0.0009876530384644866, "skip_count": 0.0, "step": 3862, "text_loss": 0.35507893562316895 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.14088641033167, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.0007527201975372827, "loss": 0.0045, "macro_f1": 0.6603773832321167, "num_tokens": 6230557.0, "repeat_count": 1.0, "routers_loss": 0.013780162669718266, "skip_count": 1.0, "step": 3864, "text_loss": 0.38958442211151123 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 18.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04638671875, "learning_rate": 0.0007524530792392977, "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 6233371.0, "repeat_count": 0.0, "routers_loss": 0.004849869292229414, "skip_count": 3.0, "step": 3866, "text_loss": 0.3826720714569092 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0191650390625, "learning_rate": 0.0007521858642008163, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6236770.0, "repeat_count": 0.0, "routers_loss": 0.008618295192718506, "skip_count": 1.0, "step": 3868, "text_loss": 0.3596078157424927 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.0007519185525242363, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6239661.0, "repeat_count": 0.0, "routers_loss": 0.0013421972980722785, "skip_count": 0.0, "step": 3870, "text_loss": 0.5585550665855408 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 0.0007516511443119916, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6242459.0, "repeat_count": 0.0, "routers_loss": 0.0038009448908269405, "skip_count": 1.0, "step": 3872, "text_loss": 0.4418395757675171 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.187848547108892, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0007513836396665534, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 6245489.0, "repeat_count": 1.0, "routers_loss": 0.002785376040264964, "skip_count": 2.0, "step": 3874, "text_loss": 0.551510751247406 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.19724097446434, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0234375, "learning_rate": 0.0007511160386904305, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6249014.0, "repeat_count": 0.0, "routers_loss": 0.0021424589212983847, "skip_count": 1.0, "step": 3876, "text_loss": 1.0502676963806152 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0007508483414861679, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6252357.0, "repeat_count": 0.0, "routers_loss": 0.0085759861394763, "skip_count": 1.0, "step": 3878, "text_loss": 0.49212515354156494 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0007505805481563477, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6254975.0, "repeat_count": 0.0, "routers_loss": 0.0010723904706537724, "skip_count": 0.0, "step": 3880, "text_loss": 0.7022985816001892 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.225418256530673, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0007503126588035887, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 6258001.0, "repeat_count": 1.0, "routers_loss": 0.012809890322387218, "skip_count": 2.0, "step": 3882, "text_loss": 0.1829151213169098 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 0.0007500446735305466, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 6261795.0, "repeat_count": 0.0, "routers_loss": 0.0026790346018970013, "skip_count": 1.0, "step": 3884, "text_loss": 0.20436066389083862 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.24420311124156, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.000749776592439914, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 6265585.0, "repeat_count": 1.0, "routers_loss": 0.005243788007646799, "skip_count": 2.0, "step": 3886, "text_loss": 0.4479229748249054 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.00074950841563442, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6269039.0, "repeat_count": 0.0, "routers_loss": 0.007998534478247166, "skip_count": 1.0, "step": 3888, "text_loss": 0.2154676914215088 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0007492401432168303, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6272315.0, "repeat_count": 0.0, "routers_loss": 0.004648822825402021, "skip_count": 1.0, "step": 3890, "text_loss": 0.3375042676925659 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.272380393307895, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0007489717752899477, "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 6275342.0, "repeat_count": 0.0, "routers_loss": 0.012154200114309788, "skip_count": 1.0, "step": 3892, "text_loss": 0.1964082419872284 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.000748703311956611, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6278700.0, "repeat_count": 1.0, "routers_loss": 0.004610476549714804, "skip_count": 2.0, "step": 3894, "text_loss": 0.26545581221580505 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06201171875, "learning_rate": 0.0007484347533196961, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 6281864.0, "repeat_count": 0.0, "routers_loss": 0.0075586591847240925, "skip_count": 2.0, "step": 3896, "text_loss": 0.3106999397277832 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02099609375, "learning_rate": 0.0007481660994821151, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6284676.0, "repeat_count": 0.0, "routers_loss": 0.007845268584787846, "skip_count": 1.0, "step": 3898, "text_loss": 0.4094304144382477 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.309950102729672, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0007478973505468165, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 6287470.0, "repeat_count": 1.0, "routers_loss": 0.011116391979157925, "skip_count": 2.0, "step": 3900, "text_loss": 0.1838909536600113 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.31934253008512, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0007476285066167857, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 6290432.0, "repeat_count": 1.0, "routers_loss": 0.004599364474415779, "skip_count": 0.0, "step": 3902, "text_loss": 0.25872838497161865 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.0007473595677950439, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 6293557.0, "repeat_count": 0.0, "routers_loss": 0.0016367282951250672, "skip_count": 1.0, "step": 3904, "text_loss": 0.5272360444068909 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0007470905341846492, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 6295979.0, "repeat_count": 0.0, "routers_loss": 0.0004760588926728815, "skip_count": 0.0, "step": 3906, "text_loss": 0.666959822177887 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007468214058886956, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6299215.0, "repeat_count": 0.0, "routers_loss": 0.000524883100297302, "skip_count": 0.0, "step": 3908, "text_loss": 0.5144801139831543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0007465521830103137, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6302320.0, "repeat_count": 0.0, "routers_loss": 0.0016085522947832942, "skip_count": 0.0, "step": 3910, "text_loss": 0.14342890679836273 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007462828656526702, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6305212.0, "repeat_count": 0.0, "routers_loss": 0.002720315707847476, "skip_count": 2.0, "step": 3912, "text_loss": 0.31109121441841125 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.375697094217788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06884765625, "learning_rate": 0.0007460134539189681, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 6308964.0, "repeat_count": 0.0, "routers_loss": 0.0010418406454846263, "skip_count": 1.0, "step": 3914, "text_loss": 0.5662030577659607 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.38508952157323, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0007457439479124459, "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 6313195.0, "repeat_count": 0.0, "routers_loss": 0.0020303844939917326, "skip_count": 0.0, "step": 3916, "text_loss": 0.6358339190483093 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.394481948928675, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0007454743477363797, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6315949.0, "repeat_count": 0.0, "routers_loss": 0.0006592223653569818, "skip_count": 0.0, "step": 3918, "text_loss": 0.35648423433303833 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.403874376284122, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0007452046534940803, "loss": 0.0075, "macro_f1": 0.6603773832321167, "num_tokens": 6319024.0, "repeat_count": 1.0, "routers_loss": 0.024555351585149765, "skip_count": 1.0, "step": 3920, "text_loss": 0.21955153346061707 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0007449348652888952, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6321633.0, "repeat_count": 0.0, "routers_loss": 0.003606822807341814, "skip_count": 1.0, "step": 3922, "text_loss": 0.6079489588737488 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007446649832242075, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 6325209.0, "repeat_count": 0.0, "routers_loss": 0.0035831446293741465, "skip_count": 1.0, "step": 3924, "text_loss": 0.2774808406829834 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0007443950074034368, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6327822.0, "repeat_count": 0.0, "routers_loss": 0.006809544749557972, "skip_count": 2.0, "step": 3926, "text_loss": 0.48236769437789917 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.4414440857059, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.049072265625, "learning_rate": 0.0007441249379300381, "loss": 0.007, "macro_f1": 0.6601307392120361, "num_tokens": 6331662.0, "repeat_count": 1.0, "routers_loss": 0.023832591250538826, "skip_count": 2.0, "step": 3928, "text_loss": 0.7287537455558777 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.450836513061343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0007438547749075028, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 6335801.0, "repeat_count": 1.0, "routers_loss": 0.011755098588764668, "skip_count": 3.0, "step": 3930, "text_loss": 0.17253030836582184 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.46022894041679, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0007435845184393577, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6338747.0, "repeat_count": 1.0, "routers_loss": 0.005972472485154867, "skip_count": 0.0, "step": 3932, "text_loss": 0.6400216817855835 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0007433141686291657, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 6342772.0, "repeat_count": 0.0, "routers_loss": 0.0030393085908144712, "skip_count": 1.0, "step": 3934, "text_loss": 0.6865074038505554 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 0.0007430437255805252, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6345957.0, "repeat_count": 0.0, "routers_loss": 0.0006984061910770833, "skip_count": 0.0, "step": 3936, "text_loss": 0.40398702025413513 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.488406222483125, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0007427731893970706, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 6349162.0, "repeat_count": 1.0, "routers_loss": 0.005219762213528156, "skip_count": 0.0, "step": 3938, "text_loss": 0.5951031446456909 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 18.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007425025601824717, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 6352655.0, "repeat_count": 0.0, "routers_loss": 0.015575960278511047, "skip_count": 3.0, "step": 3940, "text_loss": 0.26689088344573975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007422318380404346, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6355890.0, "repeat_count": 0.0, "routers_loss": 0.0012208883417770267, "skip_count": 0.0, "step": 3942, "text_loss": 0.570725679397583 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.516583504549455, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0007419610230746999, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6358891.0, "repeat_count": 1.0, "routers_loss": 0.0029412026051431894, "skip_count": 0.0, "step": 3944, "text_loss": 0.5521301031112671 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0007416901153890448, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6361586.0, "repeat_count": 0.0, "routers_loss": 0.0010283910669386387, "skip_count": 0.0, "step": 3946, "text_loss": 0.4046417772769928 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0007414191150872818, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6364954.0, "repeat_count": 0.0, "routers_loss": 0.008222512900829315, "skip_count": 2.0, "step": 3948, "text_loss": 0.2803446352481842 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0007411480222732583, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6367660.0, "repeat_count": 0.0, "routers_loss": 0.001304348581470549, "skip_count": 0.0, "step": 3950, "text_loss": 0.45553359389305115 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.0007408768370508576, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6371585.0, "repeat_count": 0.0, "routers_loss": 0.0016345062758773565, "skip_count": 0.0, "step": 3952, "text_loss": 0.25424402952194214 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.0007406055595239986, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6374365.0, "repeat_count": 0.0, "routers_loss": 0.0005097290268167853, "skip_count": 0.0, "step": 3954, "text_loss": 0.5856026411056519 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.060546875, "learning_rate": 0.0007403341897966356, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6377335.0, "repeat_count": 0.0, "routers_loss": 0.002482263371348381, "skip_count": 1.0, "step": 3956, "text_loss": 0.5145615339279175 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0007400627279727574, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 6380799.0, "repeat_count": 0.0, "routers_loss": 0.0011743451468646526, "skip_count": 0.0, "step": 3958, "text_loss": 0.31868961453437805 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.591722923393014, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0286865234375, "learning_rate": 0.0007397911741563892, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6383963.0, "repeat_count": 1.0, "routers_loss": 0.009861881844699383, "skip_count": 0.0, "step": 3960, "text_loss": 0.21192194521427155 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.601115350748458, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0007395195284515905, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 6387410.0, "repeat_count": 1.0, "routers_loss": 0.004189098719507456, "skip_count": 0.0, "step": 3962, "text_loss": 0.5809708833694458 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.610507778103905, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.0007392477909624567, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6390670.0, "repeat_count": 0.0, "routers_loss": 0.001853612600825727, "skip_count": 0.0, "step": 3964, "text_loss": 0.48985618352890015 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.61990020545935, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 0.0007389759617931182, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6393609.0, "repeat_count": 1.0, "routers_loss": 0.003303771372884512, "skip_count": 0.0, "step": 3966, "text_loss": 0.28729453682899475 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 18.629292632814792, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.10595703125, "learning_rate": 0.0007387040410477404, "loss": 0.0058, "macro_f1": 0.9452888369560242, "num_tokens": 6396608.0, "repeat_count": 1.0, "routers_loss": 0.01791577786207199, "skip_count": 4.0, "step": 3968, "text_loss": 0.30386820435523987 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 0.0007384320288305235, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6399793.0, "repeat_count": 0.0, "routers_loss": 0.0005771282012574375, "skip_count": 0.0, "step": 3970, "text_loss": 0.47285011410713196 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0007381599252457037, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6403365.0, "repeat_count": 0.0, "routers_loss": 0.003010645741596818, "skip_count": 0.0, "step": 3972, "text_loss": 0.5313063859939575 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.657469914881126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.000737887730397551, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6406205.0, "repeat_count": 1.0, "routers_loss": 0.006457438692450523, "skip_count": 0.0, "step": 3974, "text_loss": 0.2323843240737915 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.666862342236573, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.0007376154443903713, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6409552.0, "repeat_count": 1.0, "routers_loss": 0.010693981312215328, "skip_count": 0.0, "step": 3976, "text_loss": 0.6304101943969727 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.676254769592017, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007373430673285051, "loss": 0.008, "macro_f1": 0.3272727429866791, "num_tokens": 6412386.0, "repeat_count": 1.0, "routers_loss": 0.03116440214216709, "skip_count": 0.0, "step": 3978, "text_loss": 0.23448467254638672 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.68564719694746, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10009765625, "learning_rate": 0.0007370705993163278, "loss": 0.0111, "macro_f1": 0.3272727429866791, "num_tokens": 6416054.0, "repeat_count": 1.0, "routers_loss": 0.011973714455962181, "skip_count": 0.0, "step": 3980, "text_loss": 0.6371755599975586 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.695039624302908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.05224609375, "learning_rate": 0.0007367980404582497, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 6419238.0, "repeat_count": 1.0, "routers_loss": 0.005117347463965416, "skip_count": 2.0, "step": 3982, "text_loss": 0.19822923839092255 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0007365253908587158, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6422122.0, "repeat_count": 0.0, "routers_loss": 0.0010648667812347412, "skip_count": 0.0, "step": 3984, "text_loss": 0.566700279712677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.0007362526506222058, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6425313.0, "repeat_count": 0.0, "routers_loss": 0.005726494826376438, "skip_count": 0.0, "step": 3986, "text_loss": 0.6568437814712524 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.723216906369238, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0007359798198532343, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 6428422.0, "repeat_count": 1.0, "routers_loss": 0.004504100419580936, "skip_count": 0.0, "step": 3988, "text_loss": 0.598754346370697 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0007357068986563509, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 6431512.0, "repeat_count": 0.0, "routers_loss": 0.0019837068393826485, "skip_count": 1.0, "step": 3990, "text_loss": 0.7152895927429199 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0007354338871361393, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6434358.0, "repeat_count": 0.0, "routers_loss": 0.0026031541638076305, "skip_count": 1.0, "step": 3992, "text_loss": 0.4986513555049896 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.751394188435572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.000735160785397218, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6438175.0, "repeat_count": 0.0, "routers_loss": 0.0024831905029714108, "skip_count": 2.0, "step": 3994, "text_loss": 0.4406205713748932 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007348875935442401, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6441228.0, "repeat_count": 0.0, "routers_loss": 0.0008635876583866775, "skip_count": 0.0, "step": 3996, "text_loss": 0.48884135484695435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007346143116818932, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6444318.0, "repeat_count": 0.0, "routers_loss": 0.004007008858025074, "skip_count": 0.0, "step": 3998, "text_loss": 0.6669428944587708 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0007343409399148994, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6448317.0, "repeat_count": 0.0, "routers_loss": 0.0031380734872072935, "skip_count": 0.0, "step": 4000, "text_loss": 0.6468493938446045 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.788963897857354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.0007340674783480154, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 6451673.0, "repeat_count": 0.0, "routers_loss": 0.004996029660105705, "skip_count": 0.0, "step": 4002, "text_loss": 0.28135430812835693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.798356325212797, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007337939270860323, "loss": 0.009, "macro_f1": 0.3272727429866791, "num_tokens": 6456372.0, "repeat_count": 1.0, "routers_loss": 0.03784399852156639, "skip_count": 0.0, "step": 4004, "text_loss": 0.41668644547462463 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007335202862337753, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6459047.0, "repeat_count": 0.0, "routers_loss": 0.0011750755365937948, "skip_count": 0.0, "step": 4006, "text_loss": 0.6853910684585571 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 18.817141179923688, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.05908203125, "learning_rate": 0.000733246555896104, "loss": 0.0062, "macro_f1": 0.9452888369560242, "num_tokens": 6462390.0, "repeat_count": 1.0, "routers_loss": 0.01630394533276558, "skip_count": 4.0, "step": 4008, "text_loss": 0.7110592126846313 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 0.0007329727361779124, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6466057.0, "repeat_count": 0.0, "routers_loss": 0.0052404399029910564, "skip_count": 2.0, "step": 4010, "text_loss": 0.13856995105743408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.000732698827184129, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6468878.0, "repeat_count": 0.0, "routers_loss": 0.002138581359758973, "skip_count": 0.0, "step": 4012, "text_loss": 0.3999565839767456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.000732424829019716, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6472364.0, "repeat_count": 0.0, "routers_loss": 0.0037466560024768114, "skip_count": 0.0, "step": 4014, "text_loss": 0.28161346912384033 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0007321507417896699, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 6475379.0, "repeat_count": 0.0, "routers_loss": 0.0010469373082742095, "skip_count": 0.0, "step": 4016, "text_loss": 1.0490952730178833 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06591796875, "learning_rate": 0.0007318765655990218, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6478585.0, "repeat_count": 0.0, "routers_loss": 0.009968385100364685, "skip_count": 2.0, "step": 4018, "text_loss": 0.31696680188179016 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0007316023005528362, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 6484153.0, "repeat_count": 0.0, "routers_loss": 0.002349073765799403, "skip_count": 1.0, "step": 4020, "text_loss": 0.30981555581092834 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 18.8828881714118, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0299072265625, "learning_rate": 0.0007313279467562124, "loss": 0.0053, "macro_f1": 0.9452888369560242, "num_tokens": 6487029.0, "repeat_count": 1.0, "routers_loss": 0.011854278855025768, "skip_count": 4.0, "step": 4022, "text_loss": 0.9689550399780273 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.892280598767243, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007310535043142829, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 6490315.0, "repeat_count": 1.0, "routers_loss": 0.00908346101641655, "skip_count": 3.0, "step": 4024, "text_loss": 0.1705625057220459 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0007307789733322146, "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 6493921.0, "repeat_count": 0.0, "routers_loss": 0.0007360641611739993, "skip_count": 0.0, "step": 4026, "text_loss": 0.6252996325492859 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.0007305043539152083, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6496689.0, "repeat_count": 0.0, "routers_loss": 0.0017757206223905087, "skip_count": 0.0, "step": 4028, "text_loss": 0.40533265471458435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.000730229646168499, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6500090.0, "repeat_count": 0.0, "routers_loss": 0.0022657213266938925, "skip_count": 0.0, "step": 4030, "text_loss": 0.25954708456993103 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0007299548501973548, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6503023.0, "repeat_count": 0.0, "routers_loss": 0.0021747269202023745, "skip_count": 0.0, "step": 4032, "text_loss": 0.6223418712615967 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 18.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0007296799661070782, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6506382.0, "repeat_count": 0.0, "routers_loss": 0.006400502752512693, "skip_count": 4.0, "step": 4034, "text_loss": 0.6873653531074524 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.94863516289991, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0007294049940030055, "loss": 0.0065, "macro_f1": 0.3272727429866791, "num_tokens": 6509194.0, "repeat_count": 0.0, "routers_loss": 0.0197185929864645, "skip_count": 1.0, "step": 4036, "text_loss": 0.16156800091266632 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0007291299339905059, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6512271.0, "repeat_count": 0.0, "routers_loss": 0.0009541353792883456, "skip_count": 0.0, "step": 4038, "text_loss": 0.5038442015647888 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.967420017610802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0007288547861749838, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 6516403.0, "repeat_count": 0.0, "routers_loss": 0.008226391859352589, "skip_count": 2.0, "step": 4040, "text_loss": 0.3706657588481903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.976812444966246, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0007285795506618758, "loss": 0.0063, "macro_f1": 0.3272727429866791, "num_tokens": 6519310.0, "repeat_count": 0.0, "routers_loss": 0.017001887783408165, "skip_count": 1.0, "step": 4042, "text_loss": 0.24296723306179047 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0615234375, "learning_rate": 0.0007283042275566528, "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 6521979.0, "repeat_count": 0.0, "routers_loss": 0.01666323095560074, "skip_count": 2.0, "step": 4044, "text_loss": 0.36904850602149963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.995597299677137, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 0.0007280288169648192, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 6524976.0, "repeat_count": 0.0, "routers_loss": 0.0007593175978399813, "skip_count": 0.0, "step": 4046, "text_loss": 0.7312731146812439 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 19.00469621367772, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0007277533189919127, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 6528638.0, "repeat_count": 1.0, "routers_loss": 0.005652119871228933, "skip_count": 1.0, "step": 4048, "text_loss": 0.23326151072978973 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0286865234375, "learning_rate": 0.0007274777337435046, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6532193.0, "repeat_count": 0.0, "routers_loss": 0.010509157553315163, "skip_count": 2.0, "step": 4050, "text_loss": 0.23918013274669647 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007272020613251999, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 6534994.0, "repeat_count": 0.0, "routers_loss": 0.002153293928131461, "skip_count": 0.0, "step": 4052, "text_loss": 0.5890526175498962 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.032873495744056, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0007269263018426367, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 6537469.0, "repeat_count": 1.0, "routers_loss": 0.0018494052346795797, "skip_count": 2.0, "step": 4054, "text_loss": 0.36058738827705383 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0007266504554014866, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6541271.0, "repeat_count": 0.0, "routers_loss": 0.0007579320226795971, "skip_count": 0.0, "step": 4056, "text_loss": 0.4089007079601288 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.051658350454947, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0007263745221074545, "loss": 0.0086, "macro_f1": 0.6601307392120361, "num_tokens": 6544293.0, "repeat_count": 1.0, "routers_loss": 0.06202420964837074, "skip_count": 2.0, "step": 4058, "text_loss": 0.2226305454969406 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 19.06105077781039, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0286865234375, "learning_rate": 0.0007260985020662784, "loss": 0.0049, "macro_f1": 0.5934640765190125, "num_tokens": 6547640.0, "repeat_count": 0.0, "routers_loss": 0.044639844447374344, "skip_count": 3.0, "step": 4060, "text_loss": 0.23004353046417236 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 19.070443205165834, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0206298828125, "learning_rate": 0.0007258223953837298, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6550840.0, "repeat_count": 1.0, "routers_loss": 0.004215611144900322, "skip_count": 0.0, "step": 4062, "text_loss": 0.2891770601272583 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0007255462021656132, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6554122.0, "repeat_count": 0.0, "routers_loss": 0.0011056234361603856, "skip_count": 0.0, "step": 4064, "text_loss": 0.7485370635986328 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007252699225177666, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6557138.0, "repeat_count": 0.0, "routers_loss": 0.008258933201432228, "skip_count": 2.0, "step": 4066, "text_loss": 0.25219282507896423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.0007249935565460606, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6560654.0, "repeat_count": 0.0, "routers_loss": 0.005102175287902355, "skip_count": 0.0, "step": 4068, "text_loss": 0.5553314089775085 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0007247171043563994, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6563814.0, "repeat_count": 0.0, "routers_loss": 0.01283820066601038, "skip_count": 2.0, "step": 4070, "text_loss": 0.15729956328868866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0211181640625, "learning_rate": 0.0007244405660547199, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6567060.0, "repeat_count": 0.0, "routers_loss": 0.0009684927063062787, "skip_count": 0.0, "step": 4072, "text_loss": 0.3725031912326813 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01953125, "learning_rate": 0.000724163941746992, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 6571608.0, "repeat_count": 0.0, "routers_loss": 0.0007890827837400138, "skip_count": 0.0, "step": 4074, "text_loss": 0.8438301682472229 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 19.13619019665395, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0007238872315392189, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 6575214.0, "repeat_count": 1.0, "routers_loss": 0.0040600355714559555, "skip_count": 1.0, "step": 4076, "text_loss": 0.5923112034797668 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.0007236104355374363, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 6578383.0, "repeat_count": 0.0, "routers_loss": 0.0024899677373468876, "skip_count": 2.0, "step": 4078, "text_loss": 0.20302526652812958 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05517578125, "learning_rate": 0.000723333553847713, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6582175.0, "repeat_count": 0.0, "routers_loss": 0.006120906211435795, "skip_count": 2.0, "step": 4080, "text_loss": 0.5400223731994629 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.0007230565865761504, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6585516.0, "repeat_count": 0.0, "routers_loss": 0.0029941233806312084, "skip_count": 0.0, "step": 4082, "text_loss": 0.19460804760456085 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07373046875, "learning_rate": 0.0007227795338288831, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 6588266.0, "repeat_count": 0.0, "routers_loss": 0.009357884526252747, "skip_count": 2.0, "step": 4084, "text_loss": 0.35237613320350647 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0007225023957120782, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 6591009.0, "repeat_count": 0.0, "routers_loss": 0.0023083325941115618, "skip_count": 2.0, "step": 4086, "text_loss": 0.4336731433868408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0211181640625, "learning_rate": 0.0007222251723319356, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 6594472.0, "repeat_count": 0.0, "routers_loss": 0.0008416616474278271, "skip_count": 0.0, "step": 4088, "text_loss": 0.6390535831451416 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045166015625, "learning_rate": 0.0007219478637946877, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6597477.0, "repeat_count": 0.0, "routers_loss": 0.004390760324895382, "skip_count": 1.0, "step": 4090, "text_loss": 0.525839626789093 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0007216704702065997, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6600431.0, "repeat_count": 0.0, "routers_loss": 0.0010311100631952286, "skip_count": 0.0, "step": 4092, "text_loss": 0.5310423374176025 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0007213929916739695, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 6603899.0, "repeat_count": 0.0, "routers_loss": 0.0032497600186616182, "skip_count": 1.0, "step": 4094, "text_loss": 0.2775326073169708 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.230114470208395, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.044189453125, "learning_rate": 0.000721115428303127, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 6606544.0, "repeat_count": 1.0, "routers_loss": 0.004692315589636564, "skip_count": 3.0, "step": 4096, "text_loss": 0.6667124032974243 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0007208377802004353, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6610097.0, "repeat_count": 0.0, "routers_loss": 0.0007263485458679497, "skip_count": 0.0, "step": 4098, "text_loss": 0.6916406750679016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0007205600474722897, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 6613836.0, "repeat_count": 0.0, "routers_loss": 0.0017989488551393151, "skip_count": 0.0, "step": 4100, "text_loss": 0.5257929563522339 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 0.000720282230225118, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6616780.0, "repeat_count": 0.0, "routers_loss": 0.0011308686807751656, "skip_count": 1.0, "step": 4102, "text_loss": 0.4410906732082367 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0007200043285653799, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6620110.0, "repeat_count": 0.0, "routers_loss": 0.002058265497907996, "skip_count": 2.0, "step": 4104, "text_loss": 0.8581191897392273 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 19.277076606985617, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007197263425995681, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 6622585.0, "repeat_count": 1.0, "routers_loss": 0.0017528717871755362, "skip_count": 0.0, "step": 4106, "text_loss": 0.5000449419021606 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.286469034341064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0007194482724342075, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6626356.0, "repeat_count": 0.0, "routers_loss": 0.0021995846182107925, "skip_count": 0.0, "step": 4108, "text_loss": 0.401346892118454 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0007191701181758547, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6629738.0, "repeat_count": 0.0, "routers_loss": 0.0014869922306388617, "skip_count": 0.0, "step": 4110, "text_loss": 0.9598422050476074 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0007188918799310993, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 6632807.0, "repeat_count": 0.0, "routers_loss": 0.0012853415682911873, "skip_count": 0.0, "step": 4112, "text_loss": 0.3996548354625702 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.314646316407398, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029296875, "learning_rate": 0.0007186135578065627, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6636227.0, "repeat_count": 0.0, "routers_loss": 0.0009887361666187644, "skip_count": 0.0, "step": 4114, "text_loss": 0.4127283990383148 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007183351519088982, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6639443.0, "repeat_count": 0.0, "routers_loss": 0.006282114889472723, "skip_count": 1.0, "step": 4116, "text_loss": 0.20028606057167053 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.333431171118285, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.061767578125, "learning_rate": 0.0007180566623447917, "loss": 0.0114, "macro_f1": 0.6603773832321167, "num_tokens": 6642127.0, "repeat_count": 1.0, "routers_loss": 0.008101986721158028, "skip_count": 0.0, "step": 4118, "text_loss": 0.763931155204773 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0007177780892209607, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6645376.0, "repeat_count": 0.0, "routers_loss": 0.001953610684722662, "skip_count": 0.0, "step": 4120, "text_loss": 0.42317715287208557 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0007174994326441551, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6648150.0, "repeat_count": 0.0, "routers_loss": 0.003279355587437749, "skip_count": 0.0, "step": 4122, "text_loss": 0.19656142592430115 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0007172206927211567, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6650935.0, "repeat_count": 0.0, "routers_loss": 0.0032076311763375998, "skip_count": 0.0, "step": 4124, "text_loss": 0.13608409464359283 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0007169418695587791, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6654464.0, "repeat_count": 0.0, "routers_loss": 0.004065621178597212, "skip_count": 2.0, "step": 4126, "text_loss": 0.4882086217403412 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0007166629632638678, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6657749.0, "repeat_count": 0.0, "routers_loss": 0.0009243001695722342, "skip_count": 0.0, "step": 4128, "text_loss": 0.31632331013679504 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.0007163839739433003, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6660997.0, "repeat_count": 0.0, "routers_loss": 0.0018459554994478822, "skip_count": 0.0, "step": 4130, "text_loss": 0.6123947501182556 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.399178162606397, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0007161049017039857, "loss": 0.0073, "macro_f1": 0.8820862174034119, "num_tokens": 6663542.0, "repeat_count": 2.0, "routers_loss": 0.030032536014914513, "skip_count": 2.0, "step": 4132, "text_loss": 0.6985659003257751 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 0.0007158257466528652, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6666178.0, "repeat_count": 0.0, "routers_loss": 0.0013813833938911557, "skip_count": 0.0, "step": 4134, "text_loss": 0.38380664587020874 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 19.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021484375, "learning_rate": 0.0007155465088969114, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 6668852.0, "repeat_count": 0.0, "routers_loss": 0.00513424864038825, "skip_count": 3.0, "step": 4136, "text_loss": 0.49724283814430237 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0007152671885431288, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6671430.0, "repeat_count": 0.0, "routers_loss": 0.0005165594047866762, "skip_count": 0.0, "step": 4138, "text_loss": 0.666959822177887 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047119140625, "learning_rate": 0.0007149877856985535, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6675215.0, "repeat_count": 0.0, "routers_loss": 0.001685218419879675, "skip_count": 0.0, "step": 4140, "text_loss": 0.3127259612083435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.000714708300470253, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6678505.0, "repeat_count": 0.0, "routers_loss": 0.004025314934551716, "skip_count": 0.0, "step": 4142, "text_loss": 0.3179470896720886 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 19.455532726739065, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0007144287329653269, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 6681127.0, "repeat_count": 1.0, "routers_loss": 0.005965690594166517, "skip_count": 0.0, "step": 4144, "text_loss": 0.3862907886505127 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.464925154094512, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0007141490832909058, "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 6683968.0, "repeat_count": 0.0, "routers_loss": 0.012896374799311161, "skip_count": 1.0, "step": 4146, "text_loss": 0.48156118392944336 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0007138693515541519, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6687196.0, "repeat_count": 0.0, "routers_loss": 0.0006367767928168178, "skip_count": 1.0, "step": 4148, "text_loss": 0.676702082157135 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 19.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030029296875, "learning_rate": 0.0007135895378622592, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 6689972.0, "repeat_count": 0.0, "routers_loss": 0.004532640799880028, "skip_count": 3.0, "step": 4150, "text_loss": 0.5865558981895447 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.493102436160846, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007133096423224526, "loss": 0.0081, "macro_f1": 0.3272727429866791, "num_tokens": 6693568.0, "repeat_count": 1.0, "routers_loss": 0.0377078577876091, "skip_count": 0.0, "step": 4152, "text_loss": 0.2790502607822418 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056640625, "learning_rate": 0.0007130296650419885, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6696468.0, "repeat_count": 0.0, "routers_loss": 0.004455826710909605, "skip_count": 1.0, "step": 4154, "text_loss": 0.5869500041007996 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0007127496061281551, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6699307.0, "repeat_count": 0.0, "routers_loss": 0.001998464809730649, "skip_count": 0.0, "step": 4156, "text_loss": 0.6931945085525513 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 19.52127971822718, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007124694656882713, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 6702647.0, "repeat_count": 3.0, "routers_loss": 0.004117495380342007, "skip_count": 0.0, "step": 4158, "text_loss": 0.4325876832008362 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0205078125, "learning_rate": 0.0007121892438296874, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6705964.0, "repeat_count": 0.0, "routers_loss": 0.0014713290147483349, "skip_count": 0.0, "step": 4160, "text_loss": 0.3672060966491699 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04345703125, "learning_rate": 0.0007119089406597849, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6710182.0, "repeat_count": 0.0, "routers_loss": 0.0037311650812625885, "skip_count": 1.0, "step": 4162, "text_loss": 0.6643805503845215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007116285562859767, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 6713410.0, "repeat_count": 0.0, "routers_loss": 0.006017287727445364, "skip_count": 0.0, "step": 4164, "text_loss": 0.4606415927410126 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 19.55884942764896, "f1_execute": 0.9545454382896423, "f1_repeat": 0.5, "f1_skip": 1.0, "grad_norm": 0.05419921875, "learning_rate": 0.0007113480908157065, "loss": 0.0108, "macro_f1": 0.8181818723678589, "num_tokens": 6716056.0, "repeat_count": 3.0, "routers_loss": 0.08640352636575699, "skip_count": 4.0, "step": 4166, "text_loss": 0.3139408528804779 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0007110675443564491, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6719497.0, "repeat_count": 0.0, "routers_loss": 0.0012731150491163135, "skip_count": 0.0, "step": 4168, "text_loss": 0.7283861637115479 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0007107869170157108, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6722297.0, "repeat_count": 0.0, "routers_loss": 0.0021509863436222076, "skip_count": 2.0, "step": 4170, "text_loss": 0.5767703056335449 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 0.000710506208901028, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6725762.0, "repeat_count": 0.0, "routers_loss": 0.00257494836114347, "skip_count": 1.0, "step": 4172, "text_loss": 0.33571913838386536 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.596419137070736, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.000710225420119969, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 6728436.0, "repeat_count": 1.0, "routers_loss": 0.00943201594054699, "skip_count": 3.0, "step": 4174, "text_loss": 0.6849368810653687 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0007099445507801323, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6731427.0, "repeat_count": 0.0, "routers_loss": 0.01046718005090952, "skip_count": 2.0, "step": 4176, "text_loss": 0.3346157670021057 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0007096636009891477, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6734800.0, "repeat_count": 0.0, "routers_loss": 0.0007813365664333105, "skip_count": 0.0, "step": 4178, "text_loss": 0.49989959597587585 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.000709382570854676, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6738244.0, "repeat_count": 0.0, "routers_loss": 0.002825600327923894, "skip_count": 0.0, "step": 4180, "text_loss": 0.15744923055171967 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007091014604844078, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6741695.0, "repeat_count": 0.0, "routers_loss": 0.0017124463338404894, "skip_count": 0.0, "step": 4182, "text_loss": 0.3752405643463135 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.64338127384796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0007088202699860655, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 6744882.0, "repeat_count": 1.0, "routers_loss": 0.005134924780577421, "skip_count": 3.0, "step": 4184, "text_loss": 0.18534569442272186 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 0.000708538999467402, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6747811.0, "repeat_count": 0.0, "routers_loss": 0.002371585462242365, "skip_count": 1.0, "step": 4186, "text_loss": 0.6251029968261719 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0007082576490362004, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6750765.0, "repeat_count": 0.0, "routers_loss": 0.002088436856865883, "skip_count": 0.0, "step": 4188, "text_loss": 0.35471436381340027 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0255126953125, "learning_rate": 0.000707976218800275, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6754021.0, "repeat_count": 0.0, "routers_loss": 0.0012272283202037215, "skip_count": 0.0, "step": 4190, "text_loss": 0.5737302899360657 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.0007076947088674701, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6756793.0, "repeat_count": 0.0, "routers_loss": 0.0026050808373838663, "skip_count": 0.0, "step": 4192, "text_loss": 0.526336669921875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.054931640625, "learning_rate": 0.000707413119345661, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 6760221.0, "repeat_count": 0.0, "routers_loss": 0.0013151296880096197, "skip_count": 0.0, "step": 4194, "text_loss": 0.5678895711898804 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0007071314503427532, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6763721.0, "repeat_count": 0.0, "routers_loss": 0.001528652966953814, "skip_count": 0.0, "step": 4196, "text_loss": 0.7640175223350525 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0007068497019666829, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6768581.0, "repeat_count": 0.0, "routers_loss": 0.0019202446565032005, "skip_count": 0.0, "step": 4198, "text_loss": 0.41878414154052734 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.051513671875, "learning_rate": 0.0007065678743254167, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6772758.0, "repeat_count": 0.0, "routers_loss": 0.004667408298701048, "skip_count": 1.0, "step": 4200, "text_loss": 0.3550313413143158 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 19.727913120046964, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0007062859675269513, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6776671.0, "repeat_count": 3.0, "routers_loss": 0.00568761583417654, "skip_count": 0.0, "step": 4202, "text_loss": 0.1707649976015091 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007060039816793141, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6780284.0, "repeat_count": 0.0, "routers_loss": 0.0030401297844946384, "skip_count": 0.0, "step": 4204, "text_loss": 0.2686377167701721 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 19.74669797475785, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007057219168905625, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 6783525.0, "repeat_count": 1.0, "routers_loss": 0.003353122156113386, "skip_count": 5.0, "step": 4206, "text_loss": 0.5235374569892883 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.000705439773268784, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6787691.0, "repeat_count": 0.0, "routers_loss": 0.0016532237641513348, "skip_count": 1.0, "step": 4208, "text_loss": 0.5002681612968445 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007051575509220972, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 6790833.0, "repeat_count": 0.0, "routers_loss": 0.0011808308772742748, "skip_count": 0.0, "step": 4210, "text_loss": 0.7251001596450806 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.774875256824185, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04443359375, "learning_rate": 0.0007048752499586497, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6794260.0, "repeat_count": 0.0, "routers_loss": 0.006246297620236874, "skip_count": 2.0, "step": 4212, "text_loss": 0.2430499643087387 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 0.00070459287048662, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6797413.0, "repeat_count": 0.0, "routers_loss": 0.0012964420020580292, "skip_count": 0.0, "step": 4214, "text_loss": 0.48889362812042236 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0007043104126142163, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6800815.0, "repeat_count": 0.0, "routers_loss": 0.0018109704833477736, "skip_count": 0.0, "step": 4216, "text_loss": 0.5617026686668396 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 19.80305253889052, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 0.0007040278764496771, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 6803937.0, "repeat_count": 2.0, "routers_loss": 0.0028699536342173815, "skip_count": 1.0, "step": 4218, "text_loss": 0.548405647277832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007037452621012708, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6806946.0, "repeat_count": 0.0, "routers_loss": 0.0007951617590151727, "skip_count": 0.0, "step": 4220, "text_loss": 0.5702725648880005 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0007034625696772958, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6810083.0, "repeat_count": 0.0, "routers_loss": 0.003436052706092596, "skip_count": 2.0, "step": 4222, "text_loss": 0.3898725211620331 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.00070317979928608, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6812845.0, "repeat_count": 0.0, "routers_loss": 0.0005070401239208877, "skip_count": 0.0, "step": 4224, "text_loss": 0.5244157910346985 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.840622248312297, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.000702896951035982, "loss": 0.0101, "macro_f1": 0.3272727429866791, "num_tokens": 6815801.0, "repeat_count": 0.0, "routers_loss": 0.01560303382575512, "skip_count": 1.0, "step": 4226, "text_loss": 0.26503118872642517 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03564453125, "learning_rate": 0.0007026140250353896, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 6819464.0, "repeat_count": 0.0, "routers_loss": 0.009310240857303143, "skip_count": 2.0, "step": 4228, "text_loss": 0.15597499907016754 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0007023310213927208, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6822657.0, "repeat_count": 0.0, "routers_loss": 0.005309136584401131, "skip_count": 0.0, "step": 4230, "text_loss": 0.5271651148796082 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046875, "learning_rate": 0.0007020479402164226, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 6825661.0, "repeat_count": 0.0, "routers_loss": 0.005936166271567345, "skip_count": 2.0, "step": 4232, "text_loss": 0.6105108857154846 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.878191957734078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007017647816149727, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6828688.0, "repeat_count": 0.0, "routers_loss": 0.001653556595556438, "skip_count": 0.0, "step": 4234, "text_loss": 0.6966437101364136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.000701481545696878, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 6831850.0, "repeat_count": 0.0, "routers_loss": 0.0013501866487786174, "skip_count": 0.0, "step": 4236, "text_loss": 1.259678840637207 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.059814453125, "learning_rate": 0.0007011982325706747, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6834862.0, "repeat_count": 0.0, "routers_loss": 0.008970130234956741, "skip_count": 1.0, "step": 4238, "text_loss": 0.24906545877456665 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.906369239800412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0007009148423449292, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6838148.0, "repeat_count": 0.0, "routers_loss": 0.0026013399474322796, "skip_count": 0.0, "step": 4240, "text_loss": 0.291467547416687 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.915761667155856, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0007006313751282371, "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 6841142.0, "repeat_count": 0.0, "routers_loss": 0.021415632218122482, "skip_count": 1.0, "step": 4242, "text_loss": 0.507606029510498 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0007003478310292236, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6844042.0, "repeat_count": 0.0, "routers_loss": 0.0023636550176888704, "skip_count": 0.0, "step": 4244, "text_loss": 0.11626995354890823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.934546521866746, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0007000642101565433, "loss": 0.008, "macro_f1": 0.3272727429866791, "num_tokens": 6847359.0, "repeat_count": 1.0, "routers_loss": 0.025154776871204376, "skip_count": 0.0, "step": 4246, "text_loss": 0.42898693680763245 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038330078125, "learning_rate": 0.0006997805126188803, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6850443.0, "repeat_count": 0.0, "routers_loss": 0.00540317315608263, "skip_count": 0.0, "step": 4248, "text_loss": 0.18085283041000366 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.000699496738524948, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 6853495.0, "repeat_count": 0.0, "routers_loss": 0.0014433214673772454, "skip_count": 0.0, "step": 4250, "text_loss": 0.5524004697799683 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.96272380393308, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0006992128879834891, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 6856774.0, "repeat_count": 1.0, "routers_loss": 0.013381492346525192, "skip_count": 3.0, "step": 4252, "text_loss": 0.19605717062950134 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04248046875, "learning_rate": 0.0006989289611032758, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 6860313.0, "repeat_count": 0.0, "routers_loss": 0.007140172645449638, "skip_count": 1.0, "step": 4254, "text_loss": 0.3182447552680969 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006986449579931091, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6863683.0, "repeat_count": 0.0, "routers_loss": 0.006486213766038418, "skip_count": 1.0, "step": 4256, "text_loss": 0.19250160455703735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.0006983608787618201, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6867609.0, "repeat_count": 0.0, "routers_loss": 0.001465818495489657, "skip_count": 0.0, "step": 4258, "text_loss": 0.5912898182868958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.000698076723518268, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6870040.0, "repeat_count": 0.0, "routers_loss": 0.0031106441747397184, "skip_count": 0.0, "step": 4260, "text_loss": 0.13542121648788452 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.0006977924923713418, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6873441.0, "repeat_count": 0.0, "routers_loss": 0.0005377951893024147, "skip_count": 0.0, "step": 4262, "text_loss": 0.352464497089386 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0006975081854299594, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 6876637.0, "repeat_count": 0.0, "routers_loss": 0.007052485831081867, "skip_count": 0.0, "step": 4264, "text_loss": 0.5023844242095947 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.0006972238028030678, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6879928.0, "repeat_count": 0.0, "routers_loss": 0.0013608322478830814, "skip_count": 0.0, "step": 4266, "text_loss": 0.8664718270301819 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0006969393445996429, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6883425.0, "repeat_count": 0.0, "routers_loss": 0.0007607188890688121, "skip_count": 0.0, "step": 4268, "text_loss": 0.5131992101669312 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0006966548109286897, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6886790.0, "repeat_count": 0.0, "routers_loss": 0.00035804163780994713, "skip_count": 0.0, "step": 4270, "text_loss": 0.5352054834365845 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.000696370201899242, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6889747.0, "repeat_count": 0.0, "routers_loss": 0.004451376851648092, "skip_count": 1.0, "step": 4272, "text_loss": 0.47865036129951477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0006960855176203623, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6892604.0, "repeat_count": 0.0, "routers_loss": 0.0015342880506068468, "skip_count": 0.0, "step": 4274, "text_loss": 0.36278650164604187 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.07513941884356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.0006958007582011425, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6895563.0, "repeat_count": 0.0, "routers_loss": 0.0022974940948188305, "skip_count": 2.0, "step": 4276, "text_loss": 0.6695618629455566 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0006955159237507027, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6898591.0, "repeat_count": 0.0, "routers_loss": 0.00859096460044384, "skip_count": 1.0, "step": 4278, "text_loss": 0.44284722208976746 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.093924273554446, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0006952310143781921, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6903119.0, "repeat_count": 1.0, "routers_loss": 0.007919861935079098, "skip_count": 3.0, "step": 4280, "text_loss": 0.5006136298179626 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0006949460301927886, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6906394.0, "repeat_count": 0.0, "routers_loss": 0.0008476210059598088, "skip_count": 0.0, "step": 4282, "text_loss": 0.8153555989265442 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048095703125, "learning_rate": 0.0006946609713036985, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 6909136.0, "repeat_count": 0.0, "routers_loss": 0.006711610127240419, "skip_count": 2.0, "step": 4284, "text_loss": 0.43136683106422424 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0185546875, "learning_rate": 0.0006943758378201571, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 6912734.0, "repeat_count": 0.0, "routers_loss": 0.0038677838165313005, "skip_count": 0.0, "step": 4286, "text_loss": 0.2693749964237213 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.0006940906298514278, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6915838.0, "repeat_count": 0.0, "routers_loss": 0.0012188015971332788, "skip_count": 0.0, "step": 4288, "text_loss": 0.5809219479560852 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0006938053475068031, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6919225.0, "repeat_count": 0.0, "routers_loss": 0.001955829095095396, "skip_count": 0.0, "step": 4290, "text_loss": 0.5116089582443237 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.150278837687114, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.0006935199908956037, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 6922495.0, "repeat_count": 1.0, "routers_loss": 0.0035709093790501356, "skip_count": 0.0, "step": 4292, "text_loss": 0.2745901644229889 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0006932345601271786, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6925317.0, "repeat_count": 0.0, "routers_loss": 0.0005745319649577141, "skip_count": 0.0, "step": 4294, "text_loss": 0.6039219498634338 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 20.169063692398005, "f1_execute": 0.9743589162826538, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.0693359375, "learning_rate": 0.0006929490553109056, "loss": 0.0107, "macro_f1": 0.9247862696647644, "num_tokens": 6928054.0, "repeat_count": 3.0, "routers_loss": 0.061689916998147964, "skip_count": 6.0, "step": 4296, "text_loss": 0.3904837667942047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0006926634765561907, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 6931348.0, "repeat_count": 0.0, "routers_loss": 0.002007248578593135, "skip_count": 0.0, "step": 4298, "text_loss": 0.5170742273330688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.187848547108892, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.000692377823972468, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6934411.0, "repeat_count": 0.0, "routers_loss": 0.0005786226247437298, "skip_count": 0.0, "step": 4300, "text_loss": 0.8032443523406982 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.19724097446434, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006920920976692004, "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 6938153.0, "repeat_count": 1.0, "routers_loss": 0.024602646008133888, "skip_count": 0.0, "step": 4302, "text_loss": 0.446534663438797 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.0006918062977558784, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6940731.0, "repeat_count": 0.0, "routers_loss": 0.005759815219789743, "skip_count": 2.0, "step": 4304, "text_loss": 0.15479247272014618 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0006915204243420214, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6943246.0, "repeat_count": 0.0, "routers_loss": 0.005315347574651241, "skip_count": 1.0, "step": 4306, "text_loss": 0.22127842903137207 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.225418256530673, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0006912344775371765, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6947197.0, "repeat_count": 0.0, "routers_loss": 0.0012061651796102524, "skip_count": 0.0, "step": 4308, "text_loss": 0.7058854103088379 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0006909484574509191, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6951817.0, "repeat_count": 0.0, "routers_loss": 0.0029203309677541256, "skip_count": 0.0, "step": 4310, "text_loss": 0.6014000773429871 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0006906623641928525, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6955094.0, "repeat_count": 0.0, "routers_loss": 0.005703397560864687, "skip_count": 2.0, "step": 4312, "text_loss": 0.5923848152160645 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.253595538597008, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08154296875, "learning_rate": 0.0006903761978726084, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 6958127.0, "repeat_count": 1.0, "routers_loss": 0.004489895887672901, "skip_count": 2.0, "step": 4314, "text_loss": 0.36911651492118835 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.000690089958599846, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 6960871.0, "repeat_count": 0.0, "routers_loss": 0.003871412482112646, "skip_count": 2.0, "step": 4316, "text_loss": 0.442545086145401 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.272380393307895, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.000689803646484253, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6963980.0, "repeat_count": 1.0, "routers_loss": 0.008667866699397564, "skip_count": 2.0, "step": 4318, "text_loss": 0.1987489014863968 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.030517578125, "learning_rate": 0.0006895172616355446, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6967132.0, "repeat_count": 1.0, "routers_loss": 0.00843339879065752, "skip_count": 0.0, "step": 4320, "text_loss": 0.48267918825149536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0006892308041634639, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6969971.0, "repeat_count": 0.0, "routers_loss": 0.0004312851815484464, "skip_count": 0.0, "step": 4322, "text_loss": 0.3662732243537903 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 20.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0006889442741777822, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6973114.0, "repeat_count": 0.0, "routers_loss": 0.004588035400956869, "skip_count": 3.0, "step": 4324, "text_loss": 0.6707104444503784 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.309950102729672, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 0.0006886576717882982, "loss": 0.0057, "macro_f1": 0.8817967176437378, "num_tokens": 6976013.0, "repeat_count": 2.0, "routers_loss": 0.0687296912074089, "skip_count": 3.0, "step": 4326, "text_loss": 0.1662217676639557 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0006883709971048384, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6979200.0, "repeat_count": 0.0, "routers_loss": 0.002950174268335104, "skip_count": 0.0, "step": 4328, "text_loss": 0.21168152987957 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0006880842502372572, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6982640.0, "repeat_count": 0.0, "routers_loss": 0.0032158740796148777, "skip_count": 0.0, "step": 4330, "text_loss": 0.26790961623191833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.0006877974312954365, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6985917.0, "repeat_count": 0.0, "routers_loss": 0.0005083635332994163, "skip_count": 0.0, "step": 4332, "text_loss": 0.9736502170562744 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.347519812151454, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.000687510540389286, "loss": 0.0053, "macro_f1": 0.32098764181137085, "num_tokens": 6988388.0, "repeat_count": 0.0, "routers_loss": 0.03473830223083496, "skip_count": 2.0, "step": 4334, "text_loss": 0.21662230789661407 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0006872235776287425, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6991360.0, "repeat_count": 0.0, "routers_loss": 0.002206524135544896, "skip_count": 0.0, "step": 4336, "text_loss": 0.6026972532272339 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.36630466686234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.038330078125, "learning_rate": 0.0006869365431237711, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6995080.0, "repeat_count": 1.0, "routers_loss": 0.000969731598161161, "skip_count": 0.0, "step": 4338, "text_loss": 0.5833017230033875 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.375697094217788, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0006866494369843635, "loss": 0.0054, "macro_f1": 0.8820862174034119, "num_tokens": 6998526.0, "repeat_count": 2.0, "routers_loss": 0.013962293043732643, "skip_count": 2.0, "step": 4340, "text_loss": 0.41465985774993896 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 20.38508952157323, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0294189453125, "learning_rate": 0.0006863622593205397, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 7001494.0, "repeat_count": 0.0, "routers_loss": 0.0064964210614562035, "skip_count": 3.0, "step": 4342, "text_loss": 0.3774271011352539 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 20.394481948928675, "f1_execute": 0.9767441749572754, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0006860750102423464, "loss": 0.0062, "macro_f1": 0.6589147448539734, "num_tokens": 7005544.0, "repeat_count": 1.0, "routers_loss": 0.023250726982951164, "skip_count": 6.0, "step": 4344, "text_loss": 0.2732464373111725 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.403874376284122, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 0.0006857876898598582, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 7008847.0, "repeat_count": 0.0, "routers_loss": 0.0038170060142874718, "skip_count": 2.0, "step": 4346, "text_loss": 0.29610875248908997 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0006855002982831769, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 7012577.0, "repeat_count": 0.0, "routers_loss": 0.0012856025714427233, "skip_count": 0.0, "step": 4348, "text_loss": 0.6098502278327942 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061767578125, "learning_rate": 0.0006852128356224314, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7015650.0, "repeat_count": 0.0, "routers_loss": 0.008162742480635643, "skip_count": 1.0, "step": 4350, "text_loss": 0.20868146419525146 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.432051658350456, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.023193359375, "learning_rate": 0.0006849253019877778, "loss": 0.0074, "macro_f1": 0.8817967176437378, "num_tokens": 7019925.0, "repeat_count": 2.0, "routers_loss": 0.023544032126665115, "skip_count": 3.0, "step": 4352, "text_loss": 0.628226101398468 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0006846376974893996, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 7023130.0, "repeat_count": 0.0, "routers_loss": 0.004982319660484791, "skip_count": 2.0, "step": 4354, "text_loss": 0.7037544250488281 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.450836513061343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0006843500222375074, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7026422.0, "repeat_count": 1.0, "routers_loss": 0.004015266429632902, "skip_count": 0.0, "step": 4356, "text_loss": 0.22352729737758636 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 20.46022894041679, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.042724609375, "learning_rate": 0.0006840622763423391, "loss": 0.0071, "macro_f1": 0.9449735879898071, "num_tokens": 7029077.0, "repeat_count": 2.0, "routers_loss": 0.021162014454603195, "skip_count": 4.0, "step": 4358, "text_loss": 0.2431403249502182 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0006837744599141591, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7032582.0, "repeat_count": 0.0, "routers_loss": 0.0007044129306450486, "skip_count": 0.0, "step": 4360, "text_loss": 0.26667487621307373 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0006834865730632594, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7035642.0, "repeat_count": 0.0, "routers_loss": 0.0067853196524083614, "skip_count": 1.0, "step": 4362, "text_loss": 0.20965275168418884 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.488406222483125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0006831986158999588, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7038601.0, "repeat_count": 0.0, "routers_loss": 0.00899333506822586, "skip_count": 2.0, "step": 4364, "text_loss": 0.26860126852989197 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.000682910588534603, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7042274.0, "repeat_count": 0.0, "routers_loss": 0.0019194348715245724, "skip_count": 0.0, "step": 4366, "text_loss": 0.14046810567378998 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.507191077194012, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0006826224910775647, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 7045268.0, "repeat_count": 1.0, "routers_loss": 0.006915684789419174, "skip_count": 3.0, "step": 4368, "text_loss": 0.5900366306304932 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.516583504549455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0006823343236392432, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 7049407.0, "repeat_count": 0.0, "routers_loss": 0.001678116386756301, "skip_count": 0.0, "step": 4370, "text_loss": 0.7868026494979858 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.000682046086330065, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 7052783.0, "repeat_count": 0.0, "routers_loss": 0.0003459530707914382, "skip_count": 0.0, "step": 4372, "text_loss": 0.6349637508392334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0006817577792604831, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7055757.0, "repeat_count": 0.0, "routers_loss": 0.0011729507241398096, "skip_count": 0.0, "step": 4374, "text_loss": 0.43258991837501526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0006814694025409773, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 7058684.0, "repeat_count": 0.0, "routers_loss": 0.0006664610700681806, "skip_count": 0.0, "step": 4376, "text_loss": 0.5307940244674683 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.091796875, "learning_rate": 0.0006811809562820542, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 7061902.0, "repeat_count": 0.0, "routers_loss": 0.004595907870680094, "skip_count": 2.0, "step": 4378, "text_loss": 0.5830042362213135 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0006808924405942467, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7065100.0, "repeat_count": 0.0, "routers_loss": 0.0032026609405875206, "skip_count": 0.0, "step": 4380, "text_loss": 0.20797798037528992 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.572938068682124, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0184326171875, "learning_rate": 0.0006806038555881148, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 7068556.0, "repeat_count": 1.0, "routers_loss": 0.0024626904632896185, "skip_count": 0.0, "step": 4382, "text_loss": 0.5791074633598328 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.58233049603757, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.040283203125, "learning_rate": 0.0006803152013742448, "loss": 0.0075, "macro_f1": 1.0, "num_tokens": 7071284.0, "repeat_count": 1.0, "routers_loss": 0.010723610408604145, "skip_count": 2.0, "step": 4384, "text_loss": 0.13227243721485138 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.591722923393014, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.0006800264780632495, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7074428.0, "repeat_count": 1.0, "routers_loss": 0.0011231007520109415, "skip_count": 0.0, "step": 4386, "text_loss": 0.4360627233982086 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 20.601115350748458, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0006797376857657681, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 7078313.0, "repeat_count": 2.0, "routers_loss": 0.008419238030910492, "skip_count": 1.0, "step": 4388, "text_loss": 0.5183924436569214 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.610507778103905, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.0006794488245924664, "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 7081258.0, "repeat_count": 1.0, "routers_loss": 0.006582668516784906, "skip_count": 3.0, "step": 4390, "text_loss": 0.2797473669052124 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.61990020545935, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046630859375, "learning_rate": 0.0006791598946540368, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 7084527.0, "repeat_count": 0.0, "routers_loss": 0.00557357631623745, "skip_count": 2.0, "step": 4392, "text_loss": 0.39495575428009033 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.629292632814792, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06005859375, "learning_rate": 0.0006788708960611975, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 7087675.0, "repeat_count": 0.0, "routers_loss": 0.007155992556363344, "skip_count": 0.0, "step": 4394, "text_loss": 0.3785299062728882 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01806640625, "learning_rate": 0.0006785818289246934, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7090171.0, "repeat_count": 0.0, "routers_loss": 0.0009265039698220789, "skip_count": 0.0, "step": 4396, "text_loss": 0.42634522914886475 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 20.648077487525683, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.0006782926933552955, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 7092529.0, "repeat_count": 1.0, "routers_loss": 0.008679097518324852, "skip_count": 7.0, "step": 4398, "text_loss": 0.4283660054206848 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0006780034894638014, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 7095141.0, "repeat_count": 0.0, "routers_loss": 0.002363949315622449, "skip_count": 0.0, "step": 4400, "text_loss": 0.481539249420166 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 20.666862342236573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.040283203125, "learning_rate": 0.000677714217361034, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7098208.0, "repeat_count": 0.0, "routers_loss": 0.004005146212875843, "skip_count": 3.0, "step": 4402, "text_loss": 0.6443291902542114 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0006774248771578435, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7101681.0, "repeat_count": 0.0, "routers_loss": 0.0026864963583648205, "skip_count": 0.0, "step": 4404, "text_loss": 0.16315312683582306 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 20.68564719694746, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0322265625, "learning_rate": 0.0006771354689651054, "loss": 0.005, "macro_f1": 0.9449735879898071, "num_tokens": 7104719.0, "repeat_count": 2.0, "routers_loss": 0.02719845622777939, "skip_count": 4.0, "step": 4406, "text_loss": 0.37855592370033264 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.0006768459928937213, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7108697.0, "repeat_count": 0.0, "routers_loss": 0.010488593950867653, "skip_count": 0.0, "step": 4408, "text_loss": 0.23133711516857147 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.70443205165835, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.0006765564490546193, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7111426.0, "repeat_count": 1.0, "routers_loss": 0.0013637891970574856, "skip_count": 0.0, "step": 4410, "text_loss": 0.41399383544921875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0006762668375587528, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 7114241.0, "repeat_count": 0.0, "routers_loss": 0.000900395680218935, "skip_count": 0.0, "step": 4412, "text_loss": 0.6460412740707397 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0006759771585171016, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7117031.0, "repeat_count": 0.0, "routers_loss": 0.0024001260753721, "skip_count": 0.0, "step": 4414, "text_loss": 0.7645824551582336 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.732609333724685, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0006756874120406714, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 7120766.0, "repeat_count": 3.0, "routers_loss": 0.005034091416746378, "skip_count": 4.0, "step": 4416, "text_loss": 0.31753066182136536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0006753975982404934, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7125243.0, "repeat_count": 0.0, "routers_loss": 0.002483269665390253, "skip_count": 0.0, "step": 4418, "text_loss": 0.5304268002510071 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.751394188435572, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0006751077172276249, "loss": 0.0052, "macro_f1": 0.3272727429866791, "num_tokens": 7127795.0, "repeat_count": 0.0, "routers_loss": 0.02676006779074669, "skip_count": 1.0, "step": 4420, "text_loss": 0.22011354565620422 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06201171875, "learning_rate": 0.000674817769113149, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7130837.0, "repeat_count": 0.0, "routers_loss": 0.003267093561589718, "skip_count": 2.0, "step": 4422, "text_loss": 0.2906076908111572 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 20.770179043146463, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.027099609375, "learning_rate": 0.000674527754008174, "loss": 0.0045, "macro_f1": 0.5934640765190125, "num_tokens": 7135090.0, "repeat_count": 0.0, "routers_loss": 0.022510390728712082, "skip_count": 3.0, "step": 4424, "text_loss": 0.2544902563095093 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0006742376720238345, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 7138751.0, "repeat_count": 0.0, "routers_loss": 0.0011178571730852127, "skip_count": 0.0, "step": 4426, "text_loss": 0.6811438798904419 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 20.788963897857354, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0006739475232712904, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 7141762.0, "repeat_count": 2.0, "routers_loss": 0.005595206283032894, "skip_count": 1.0, "step": 4428, "text_loss": 0.38743990659713745 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0006736573078617272, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7145235.0, "repeat_count": 0.0, "routers_loss": 0.002793942578136921, "skip_count": 2.0, "step": 4430, "text_loss": 0.21894219517707825 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 20.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.0006733670259063561, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 7149042.0, "repeat_count": 0.0, "routers_loss": 0.006146818865090609, "skip_count": 3.0, "step": 4432, "text_loss": 0.17822015285491943 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 20.817141179923688, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.042236328125, "learning_rate": 0.0006730766775164136, "loss": 0.0061, "macro_f1": 0.5492662787437439, "num_tokens": 7152166.0, "repeat_count": 0.0, "routers_loss": 0.026045087724924088, "skip_count": 2.0, "step": 4434, "text_loss": 0.2910420000553131 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 20.82653360727913, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0006727862628031618, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 7155506.0, "repeat_count": 2.0, "routers_loss": 0.0022973387967795134, "skip_count": 0.0, "step": 4436, "text_loss": 0.3502544164657593 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022705078125, "learning_rate": 0.0006724957818778882, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7158739.0, "repeat_count": 0.0, "routers_loss": 0.002357073128223419, "skip_count": 1.0, "step": 4438, "text_loss": 0.26200664043426514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0006722052348519054, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 7161776.0, "repeat_count": 0.0, "routers_loss": 0.0005521026905626059, "skip_count": 0.0, "step": 4440, "text_loss": 0.3922915458679199 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044189453125, "learning_rate": 0.000671914621836552, "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 7164763.0, "repeat_count": 0.0, "routers_loss": 0.007691344246268272, "skip_count": 2.0, "step": 4442, "text_loss": 0.6021351218223572 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.000671623942943191, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7167924.0, "repeat_count": 0.0, "routers_loss": 0.0032181134447455406, "skip_count": 0.0, "step": 4444, "text_loss": 0.23639555275440216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.873495744056356, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.0006713331982832113, "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 7170743.0, "repeat_count": 1.0, "routers_loss": 0.024979131296277046, "skip_count": 0.0, "step": 4446, "text_loss": 0.4957772493362427 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0006710423879680271, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7174660.0, "repeat_count": 0.0, "routers_loss": 0.002571308286860585, "skip_count": 0.0, "step": 4448, "text_loss": 0.47968071699142456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.000670751512109077, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7177965.0, "repeat_count": 0.0, "routers_loss": 0.00212799571454525, "skip_count": 0.0, "step": 4450, "text_loss": 0.6550716161727905 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.0006704605708178252, "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 7181512.0, "repeat_count": 0.0, "routers_loss": 0.004176430404186249, "skip_count": 1.0, "step": 4452, "text_loss": 0.36959558725357056 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0006701695642057613, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7184555.0, "repeat_count": 0.0, "routers_loss": 0.0010968588758260012, "skip_count": 0.0, "step": 4454, "text_loss": 0.6686749458312988 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0006698784923843993, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7187474.0, "repeat_count": 0.0, "routers_loss": 0.0014241471653804183, "skip_count": 0.0, "step": 4456, "text_loss": 0.6147221922874451 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0006695873554652784, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7190649.0, "repeat_count": 0.0, "routers_loss": 0.008801907300949097, "skip_count": 0.0, "step": 4458, "text_loss": 0.26381927728652954 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04638671875, "learning_rate": 0.0006692961535599634, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 7193961.0, "repeat_count": 0.0, "routers_loss": 0.009027508087456226, "skip_count": 1.0, "step": 4460, "text_loss": 0.1926470547914505 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0006690048867800427, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7197456.0, "repeat_count": 0.0, "routers_loss": 0.0022697453387081623, "skip_count": 0.0, "step": 4462, "text_loss": 0.6736721992492676 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0006687135552371305, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7200290.0, "repeat_count": 0.0, "routers_loss": 0.006747903767973185, "skip_count": 1.0, "step": 4464, "text_loss": 0.2026437371969223 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.967420017610802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0006684221590428657, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 7203320.0, "repeat_count": 0.0, "routers_loss": 0.0011565096210688353, "skip_count": 0.0, "step": 4466, "text_loss": 0.7587730288505554 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.976812444966246, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.058349609375, "learning_rate": 0.0006681306983089121, "loss": 0.0083, "macro_f1": 0.8820862174034119, "num_tokens": 7206411.0, "repeat_count": 2.0, "routers_loss": 0.023645581677556038, "skip_count": 2.0, "step": 4468, "text_loss": 0.8981561660766602 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0006678391731469575, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 7209421.0, "repeat_count": 0.0, "routers_loss": 0.0035848666448146105, "skip_count": 0.0, "step": 4470, "text_loss": 0.1522839516401291 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 20.995597299677137, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0006675475836687152, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 7212267.0, "repeat_count": 1.0, "routers_loss": 0.005046425387263298, "skip_count": 1.0, "step": 4472, "text_loss": 0.46007999777793884 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0006672559299859228, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7215195.0, "repeat_count": 0.0, "routers_loss": 0.0019333874806761742, "skip_count": 0.0, "step": 4474, "text_loss": 1.0859547853469849 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0006669642122103423, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7217941.0, "repeat_count": 0.0, "routers_loss": 0.0005401032394729555, "skip_count": 0.0, "step": 4476, "text_loss": 0.9754356145858765 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.023481068388612, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.0006666724304537611, "loss": 0.0053, "macro_f1": 0.3272727429866791, "num_tokens": 7222494.0, "repeat_count": 1.0, "routers_loss": 0.015569722279906273, "skip_count": 0.0, "step": 4478, "text_loss": 0.2896423637866974 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0006663805848279898, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7225292.0, "repeat_count": 0.0, "routers_loss": 0.0020135147497057915, "skip_count": 0.0, "step": 4480, "text_loss": 0.8492724299430847 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 21.0422659230995, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0006660886754448648, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 7229184.0, "repeat_count": 1.0, "routers_loss": 0.002355351345613599, "skip_count": 0.0, "step": 4482, "text_loss": 0.189764603972435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.0006657967024162459, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7232906.0, "repeat_count": 0.0, "routers_loss": 0.003044391982257366, "skip_count": 0.0, "step": 4484, "text_loss": 0.4239847660064697 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0006655046658540179, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 7235996.0, "repeat_count": 0.0, "routers_loss": 0.00602696230635047, "skip_count": 2.0, "step": 4486, "text_loss": 0.217103973031044 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.070443205165834, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0169677734375, "learning_rate": 0.0006652125658700896, "loss": 0.0031, "macro_f1": 0.6666666865348816, "num_tokens": 7238882.0, "repeat_count": 0.0, "routers_loss": 0.001470155781134963, "skip_count": 1.0, "step": 4488, "text_loss": 0.6090770363807678 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.07983563252128, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0006649204025763945, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 7241815.0, "repeat_count": 1.0, "routers_loss": 0.008737480267882347, "skip_count": 2.0, "step": 4490, "text_loss": 0.48314425349235535 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0177001953125, "learning_rate": 0.0006646281760848902, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 7244848.0, "repeat_count": 0.0, "routers_loss": 0.0008257135050371289, "skip_count": 0.0, "step": 4492, "text_loss": 0.5884748101234436 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0006643358865075581, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7247930.0, "repeat_count": 0.0, "routers_loss": 0.0016262239078059793, "skip_count": 0.0, "step": 4494, "text_loss": 0.21444730460643768 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0006640435339564042, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7251776.0, "repeat_count": 0.0, "routers_loss": 0.001315156347118318, "skip_count": 0.0, "step": 4496, "text_loss": 0.6890370845794678 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.11740534194306, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0006637511185434588, "loss": 0.0091, "macro_f1": 1.0, "num_tokens": 7255070.0, "repeat_count": 1.0, "routers_loss": 0.007614497095346451, "skip_count": 3.0, "step": 4498, "text_loss": 0.516417920589447 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 21.126797769298502, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0006634586403807758, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 7258115.0, "repeat_count": 3.0, "routers_loss": 0.004906686954200268, "skip_count": 2.0, "step": 4500, "text_loss": 0.577463686466217 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.13619019665395, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0927734375, "learning_rate": 0.0006631660995804334, "loss": 0.0067, "macro_f1": 0.6601307392120361, "num_tokens": 7260769.0, "repeat_count": 1.0, "routers_loss": 0.013337121345102787, "skip_count": 2.0, "step": 4502, "text_loss": 0.37124839425086975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0006628734962545339, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 7263908.0, "repeat_count": 0.0, "routers_loss": 0.0023418180644512177, "skip_count": 0.0, "step": 4504, "text_loss": 0.17937727272510529 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0006625808305152033, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 7267391.0, "repeat_count": 0.0, "routers_loss": 0.0006556165171787143, "skip_count": 0.0, "step": 4506, "text_loss": 0.45344987511634827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0006622881024745919, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 7271402.0, "repeat_count": 0.0, "routers_loss": 0.0021988123189657927, "skip_count": 0.0, "step": 4508, "text_loss": 0.5842905640602112 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 0.0006619953122448734, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 7274354.0, "repeat_count": 0.0, "routers_loss": 0.00774174090474844, "skip_count": 2.0, "step": 4510, "text_loss": 0.27159228920936584 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0006617024599382456, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 7277378.0, "repeat_count": 0.0, "routers_loss": 0.0006942499312572181, "skip_count": 0.0, "step": 4512, "text_loss": 0.4464176297187805 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0006614095456669302, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7280526.0, "repeat_count": 0.0, "routers_loss": 0.003003394464030862, "skip_count": 0.0, "step": 4514, "text_loss": 0.31188079714775085 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0006611165695431725, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7283916.0, "repeat_count": 0.0, "routers_loss": 0.0006948060472495854, "skip_count": 0.0, "step": 4516, "text_loss": 0.5266574025154114 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0006608235316792413, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7286843.0, "repeat_count": 0.0, "routers_loss": 0.0014080886030569673, "skip_count": 0.0, "step": 4518, "text_loss": 0.5880120396614075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0006605304321874295, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 7289940.0, "repeat_count": 0.0, "routers_loss": 0.0016894340515136719, "skip_count": 0.0, "step": 4520, "text_loss": 0.6623797416687012 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0006602372711800531, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7292869.0, "repeat_count": 0.0, "routers_loss": 0.003522444050759077, "skip_count": 0.0, "step": 4522, "text_loss": 0.5488807559013367 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0006599440487694521, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 7296618.0, "repeat_count": 0.0, "routers_loss": 0.0011981099378317595, "skip_count": 0.0, "step": 4524, "text_loss": 0.4128517210483551 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.248899324919282, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.00065965076506799, "loss": 0.0047, "macro_f1": 0.9262410998344421, "num_tokens": 7300481.0, "repeat_count": 3.0, "routers_loss": 0.010548194870352745, "skip_count": 2.0, "step": 4526, "text_loss": 0.26450902223587036 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.0006593574201880536, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7303272.0, "repeat_count": 0.0, "routers_loss": 0.005642973352223635, "skip_count": 1.0, "step": 4528, "text_loss": 0.35269856452941895 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.000659064014242053, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 7306615.0, "repeat_count": 0.0, "routers_loss": 0.004171932581812143, "skip_count": 1.0, "step": 4530, "text_loss": 0.18814080953598022 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0006587705473424223, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 7310368.0, "repeat_count": 0.0, "routers_loss": 0.002289367141202092, "skip_count": 2.0, "step": 4532, "text_loss": 0.7363705635070801 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.286469034341064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.000658477019601618, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 7313788.0, "repeat_count": 0.0, "routers_loss": 0.004440625663846731, "skip_count": 1.0, "step": 4534, "text_loss": 0.8126176595687866 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006581834311321211, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 7317864.0, "repeat_count": 0.0, "routers_loss": 0.0013160990783944726, "skip_count": 2.0, "step": 4536, "text_loss": 0.7015916109085083 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04736328125, "learning_rate": 0.000657889782046435, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7320693.0, "repeat_count": 0.0, "routers_loss": 0.0032275544945150614, "skip_count": 2.0, "step": 4538, "text_loss": 0.6481677293777466 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.314646316407398, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0006575960724570865, "loss": 0.0054, "macro_f1": 0.3272727429866791, "num_tokens": 7324335.0, "repeat_count": 0.0, "routers_loss": 0.009769129566848278, "skip_count": 1.0, "step": 4540, "text_loss": 0.22194676101207733 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.32403874376284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.042724609375, "learning_rate": 0.0006573023024766258, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 7327431.0, "repeat_count": 2.0, "routers_loss": 0.0036973082460463047, "skip_count": 4.0, "step": 4542, "text_loss": 0.475127637386322 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.000657008472217626, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7330262.0, "repeat_count": 0.0, "routers_loss": 0.0007046440150588751, "skip_count": 0.0, "step": 4544, "text_loss": 0.2649917006492615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0006567145817926836, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7333110.0, "repeat_count": 0.0, "routers_loss": 0.0026714997366070747, "skip_count": 0.0, "step": 4546, "text_loss": 0.5490524768829346 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0006564206313144175, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 7336101.0, "repeat_count": 0.0, "routers_loss": 0.006552211008965969, "skip_count": 0.0, "step": 4548, "text_loss": 0.14098678529262543 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.0006561266208954707, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 7339435.0, "repeat_count": 0.0, "routers_loss": 0.0035560601390898228, "skip_count": 2.0, "step": 4550, "text_loss": 0.20412275195121765 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0006558325506485081, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7342609.0, "repeat_count": 0.0, "routers_loss": 0.0020106974989175797, "skip_count": 1.0, "step": 4552, "text_loss": 0.6184256076812744 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0006555384206862183, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 7345614.0, "repeat_count": 0.0, "routers_loss": 0.0014235252747312188, "skip_count": 0.0, "step": 4554, "text_loss": 1.0108838081359863 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.389785735250953, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0006552442311213121, "loss": 0.0041, "macro_f1": 0.3272727429866791, "num_tokens": 7348957.0, "repeat_count": 1.0, "routers_loss": 0.01703745685517788, "skip_count": 0.0, "step": 4556, "text_loss": 0.21315747499465942 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 21.399178162606397, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0269775390625, "learning_rate": 0.0006549499820665237, "loss": 0.0077, "macro_f1": 0.5934640765190125, "num_tokens": 7352724.0, "repeat_count": 0.0, "routers_loss": 0.013315381482243538, "skip_count": 3.0, "step": 4558, "text_loss": 0.34369465708732605 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 0.00065465567363461, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7356592.0, "repeat_count": 0.0, "routers_loss": 0.0017354936571791768, "skip_count": 0.0, "step": 4560, "text_loss": 0.6267461180686951 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0006543613059383503, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 7359774.0, "repeat_count": 0.0, "routers_loss": 0.011646085418760777, "skip_count": 2.0, "step": 4562, "text_loss": 0.4400193989276886 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006540668790905471, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7362765.0, "repeat_count": 0.0, "routers_loss": 0.0019345436012372375, "skip_count": 0.0, "step": 4564, "text_loss": 0.49204275012016296 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0006537723932040251, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7366337.0, "repeat_count": 0.0, "routers_loss": 0.00562885170802474, "skip_count": 1.0, "step": 4566, "text_loss": 0.22566382586956024 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.44614029938362, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0006534778483916319, "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 7369851.0, "repeat_count": 2.0, "routers_loss": 0.005508176051080227, "skip_count": 2.0, "step": 4568, "text_loss": 0.8057850003242493 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006531832447662377, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 7373918.0, "repeat_count": 0.0, "routers_loss": 0.006460923235863447, "skip_count": 2.0, "step": 4570, "text_loss": 0.5141497254371643 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0006528885824407351, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 7376674.0, "repeat_count": 0.0, "routers_loss": 0.0032120654359459877, "skip_count": 0.0, "step": 4572, "text_loss": 0.1281338930130005 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0006525938615280394, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 7379791.0, "repeat_count": 0.0, "routers_loss": 0.00443810923025012, "skip_count": 0.0, "step": 4574, "text_loss": 0.268352210521698 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.000652299082141088, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 7382886.0, "repeat_count": 0.0, "routers_loss": 0.008284369483590126, "skip_count": 2.0, "step": 4576, "text_loss": 0.30193832516670227 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.493102436160846, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006520042443928411, "loss": 0.0068, "macro_f1": 0.8823530077934265, "num_tokens": 7386036.0, "repeat_count": 2.0, "routers_loss": 0.03383317217230797, "skip_count": 1.0, "step": 4578, "text_loss": 0.23106542229652405 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0419921875, "learning_rate": 0.000651709348396281, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7388908.0, "repeat_count": 0.0, "routers_loss": 0.0017075951909646392, "skip_count": 1.0, "step": 4580, "text_loss": 0.386099249124527 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0006514143942644124, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7392004.0, "repeat_count": 0.0, "routers_loss": 0.009516917169094086, "skip_count": 1.0, "step": 4582, "text_loss": 0.3162059485912323 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051513671875, "learning_rate": 0.0006511193821102623, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 7395538.0, "repeat_count": 0.0, "routers_loss": 0.0031392278615385294, "skip_count": 0.0, "step": 4584, "text_loss": 0.5536221861839294 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0006508243120468799, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7398461.0, "repeat_count": 0.0, "routers_loss": 0.0014138511614874005, "skip_count": 0.0, "step": 4586, "text_loss": 0.7934318780899048 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0006505291841873367, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7401611.0, "repeat_count": 0.0, "routers_loss": 0.0005265916115604341, "skip_count": 0.0, "step": 4588, "text_loss": 0.4569905698299408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.000650233998644726, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7404641.0, "repeat_count": 0.0, "routers_loss": 0.0024988956283777952, "skip_count": 0.0, "step": 4590, "text_loss": 0.49998772144317627 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0006499387555321636, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7407574.0, "repeat_count": 0.0, "routers_loss": 0.004110113717615604, "skip_count": 1.0, "step": 4592, "text_loss": 0.5679413676261902 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006496434549627874, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7410806.0, "repeat_count": 0.0, "routers_loss": 0.0032845588866621256, "skip_count": 0.0, "step": 4594, "text_loss": 0.35515281558036804 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0006493480970497568, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7413402.0, "repeat_count": 0.0, "routers_loss": 0.010577172972261906, "skip_count": 1.0, "step": 4596, "text_loss": 0.26111698150634766 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.587026709715293, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0006490526819062537, "loss": 0.0091, "macro_f1": 1.0, "num_tokens": 7417236.0, "repeat_count": 1.0, "routers_loss": 0.002054794691503048, "skip_count": 2.0, "step": 4598, "text_loss": 0.6480993628501892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0006487572096454818, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7420278.0, "repeat_count": 0.0, "routers_loss": 0.0017989084590226412, "skip_count": 0.0, "step": 4600, "text_loss": 0.4935401678085327 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0006484616803806665, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7423866.0, "repeat_count": 0.0, "routers_loss": 0.006671485956758261, "skip_count": 1.0, "step": 4602, "text_loss": 0.15030258893966675 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 21.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0006481660942250552, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7426884.0, "repeat_count": 0.0, "routers_loss": 0.008334980346262455, "skip_count": 3.0, "step": 4604, "text_loss": 0.29933279752731323 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 21.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0006478704512919173, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 7431017.0, "repeat_count": 0.0, "routers_loss": 0.011923984624445438, "skip_count": 3.0, "step": 4606, "text_loss": 0.35141825675964355 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 21.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0006475747516945432, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7434406.0, "repeat_count": 0.0, "routers_loss": 0.0031092462595552206, "skip_count": 3.0, "step": 4608, "text_loss": 0.21021464467048645 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 21.64338127384796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.000647278995546246, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7437204.0, "repeat_count": 1.0, "routers_loss": 0.0006713552866131067, "skip_count": 0.0, "step": 4610, "text_loss": 0.4052635431289673 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0006469831829603598, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7439741.0, "repeat_count": 0.0, "routers_loss": 0.0022583482787013054, "skip_count": 2.0, "step": 4612, "text_loss": 0.5443860292434692 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044677734375, "learning_rate": 0.0006466873140502407, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 7443619.0, "repeat_count": 0.0, "routers_loss": 0.004187075886875391, "skip_count": 2.0, "step": 4614, "text_loss": 0.30709847807884216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0006463913889292661, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 7446696.0, "repeat_count": 0.0, "routers_loss": 0.008314833045005798, "skip_count": 0.0, "step": 4616, "text_loss": 0.22949637472629547 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0006460954077108353, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7450377.0, "repeat_count": 0.0, "routers_loss": 0.001277514616958797, "skip_count": 0.0, "step": 4618, "text_loss": 0.37715134024620056 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0006457993705083684, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 7453271.0, "repeat_count": 0.0, "routers_loss": 0.0022756033577024937, "skip_count": 2.0, "step": 4620, "text_loss": 0.7373883128166199 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02099609375, "learning_rate": 0.0006455032774353078, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 7456492.0, "repeat_count": 0.0, "routers_loss": 0.0039057908579707146, "skip_count": 2.0, "step": 4622, "text_loss": 0.5058769583702087 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0203857421875, "learning_rate": 0.0006452071286051169, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 7459619.0, "repeat_count": 0.0, "routers_loss": 0.0019458672031760216, "skip_count": 0.0, "step": 4624, "text_loss": 0.5110082030296326 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0006449109241312802, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 7462552.0, "repeat_count": 0.0, "routers_loss": 0.0002716891176532954, "skip_count": 1.0, "step": 4626, "text_loss": 0.6197522878646851 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045654296875, "learning_rate": 0.0006446146641273042, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7466769.0, "repeat_count": 0.0, "routers_loss": 0.0037578947376459837, "skip_count": 2.0, "step": 4628, "text_loss": 0.1653924286365509 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.000644318348706716, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7470216.0, "repeat_count": 0.0, "routers_loss": 0.0012791058979928493, "skip_count": 0.0, "step": 4630, "text_loss": 0.7114694118499756 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.0006440219779830643, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 7472975.0, "repeat_count": 0.0, "routers_loss": 0.00736592011526227, "skip_count": 2.0, "step": 4632, "text_loss": 0.26601463556289673 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.000643725552069919, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7475672.0, "repeat_count": 0.0, "routers_loss": 0.00045455715735442936, "skip_count": 0.0, "step": 4634, "text_loss": 0.5028402805328369 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022705078125, "learning_rate": 0.0006434290710808711, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7478850.0, "repeat_count": 0.0, "routers_loss": 0.004247233271598816, "skip_count": 2.0, "step": 4636, "text_loss": 0.12746070325374603 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 21.774875256824185, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.04052734375, "learning_rate": 0.0006431325351295324, "loss": 0.0083, "macro_f1": 0.5427350401878357, "num_tokens": 7481747.0, "repeat_count": 1.0, "routers_loss": 0.047564394772052765, "skip_count": 2.0, "step": 4638, "text_loss": 0.24056802690029144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0006428359443295362, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7484885.0, "repeat_count": 0.0, "routers_loss": 0.0011175100225955248, "skip_count": 0.0, "step": 4640, "text_loss": 0.6265338063240051 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 21.793660111535075, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.035400390625, "learning_rate": 0.0006425392987945369, "loss": 0.0086, "macro_f1": 0.5492662787437439, "num_tokens": 7487973.0, "repeat_count": 0.0, "routers_loss": 0.016879938542842865, "skip_count": 2.0, "step": 4642, "text_loss": 0.2523447275161743 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 21.80305253889052, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.032958984375, "learning_rate": 0.0006422425986382093, "loss": 0.0055, "macro_f1": 0.5934640765190125, "num_tokens": 7491024.0, "repeat_count": 0.0, "routers_loss": 0.018616504967212677, "skip_count": 3.0, "step": 4644, "text_loss": 0.38890624046325684 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.812444966245963, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0006419458439742496, "loss": 0.0056, "macro_f1": 0.3272727429866791, "num_tokens": 7494199.0, "repeat_count": 0.0, "routers_loss": 0.023129139095544815, "skip_count": 1.0, "step": 4646, "text_loss": 0.4060848355293274 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0006416490349163747, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 7497287.0, "repeat_count": 0.0, "routers_loss": 0.0018601802876219153, "skip_count": 0.0, "step": 4648, "text_loss": 0.3387545943260193 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0006413521715783225, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 7500598.0, "repeat_count": 0.0, "routers_loss": 0.0017482215771451592, "skip_count": 0.0, "step": 4650, "text_loss": 0.4290996193885803 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.840622248312297, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0006410552540738514, "loss": 0.007, "macro_f1": 0.3272727429866791, "num_tokens": 7503252.0, "repeat_count": 1.0, "routers_loss": 0.0420118011534214, "skip_count": 0.0, "step": 4652, "text_loss": 0.439496248960495 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.850014675667744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 0.000640758282516741, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 7506382.0, "repeat_count": 1.0, "routers_loss": 0.0017782216891646385, "skip_count": 1.0, "step": 4654, "text_loss": 0.8513308167457581 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 21.859407103023187, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0439453125, "learning_rate": 0.0006404612570207911, "loss": 0.0102, "macro_f1": 0.3272727429866791, "num_tokens": 7510423.0, "repeat_count": 0.0, "routers_loss": 0.010385853238403797, "skip_count": 0.0, "step": 4656, "text_loss": 0.7159742712974548 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006401641776998223, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 7513394.0, "repeat_count": 0.0, "routers_loss": 0.0011917101219296455, "skip_count": 0.0, "step": 4658, "text_loss": 0.6165401339530945 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.878191957734078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006398670446676766, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 7516828.0, "repeat_count": 3.0, "routers_loss": 0.008860073052346706, "skip_count": 4.0, "step": 4660, "text_loss": 0.923275887966156 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0006395698580382153, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7519764.0, "repeat_count": 0.0, "routers_loss": 0.000505418807733804, "skip_count": 0.0, "step": 4662, "text_loss": 0.6143050789833069 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 0.0006392726179253212, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7522390.0, "repeat_count": 0.0, "routers_loss": 0.004020806401968002, "skip_count": 1.0, "step": 4664, "text_loss": 0.6935067176818848 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.906369239800412, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.052001953125, "learning_rate": 0.0006389753244428972, "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 7525821.0, "repeat_count": 1.0, "routers_loss": 0.00957963801920414, "skip_count": 2.0, "step": 4666, "text_loss": 0.3350338637828827 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.915761667155856, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.0006386779777048666, "loss": 0.0063, "macro_f1": 0.6601307392120361, "num_tokens": 7529513.0, "repeat_count": 1.0, "routers_loss": 0.020673364400863647, "skip_count": 2.0, "step": 4668, "text_loss": 0.47800472378730774 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0257568359375, "learning_rate": 0.0006383805778251735, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7533450.0, "repeat_count": 0.0, "routers_loss": 0.007217096630483866, "skip_count": 1.0, "step": 4670, "text_loss": 0.4506106972694397 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 21.934546521866746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.0006380831249177817, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 7536287.0, "repeat_count": 1.0, "routers_loss": 0.007001714315265417, "skip_count": 0.0, "step": 4672, "text_loss": 0.4081715941429138 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0006377856190966762, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 7539442.0, "repeat_count": 0.0, "routers_loss": 0.0015112817054614425, "skip_count": 0.0, "step": 4674, "text_loss": 0.21451139450073242 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0006374880604758615, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 7542594.0, "repeat_count": 0.0, "routers_loss": 0.007311929017305374, "skip_count": 2.0, "step": 4676, "text_loss": 0.14785248041152954 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0006371904491693626, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7545780.0, "repeat_count": 0.0, "routers_loss": 0.007489737123250961, "skip_count": 1.0, "step": 4678, "text_loss": 0.2248108983039856 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.972116231288524, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0006368927852912247, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 7548287.0, "repeat_count": 1.0, "routers_loss": 0.009772555902600288, "skip_count": 1.0, "step": 4680, "text_loss": 0.1566995233297348 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0006365950689555133, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7551424.0, "repeat_count": 0.0, "routers_loss": 0.002134992741048336, "skip_count": 0.0, "step": 4682, "text_loss": 0.7322417497634888 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 21.99090108599941, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0006362973002763139, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 7554182.0, "repeat_count": 1.0, "routers_loss": 0.008511497639119625, "skip_count": 4.0, "step": 4684, "text_loss": 0.24387991428375244 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04931640625, "learning_rate": 0.0006359994793677319, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 7557044.0, "repeat_count": 0.0, "routers_loss": 0.004151526838541031, "skip_count": 2.0, "step": 4686, "text_loss": 0.6139411330223083 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0006357016063438928, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 7560231.0, "repeat_count": 0.0, "routers_loss": 0.0009724601986818016, "skip_count": 0.0, "step": 4688, "text_loss": 0.7875718474388123 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 0.0006354036813189421, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 7562953.0, "repeat_count": 0.0, "routers_loss": 0.0008926765876822174, "skip_count": 0.0, "step": 4690, "text_loss": 0.5195512771606445 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.0006351057044070455, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 7566137.0, "repeat_count": 0.0, "routers_loss": 0.0031294538639485836, "skip_count": 0.0, "step": 4692, "text_loss": 0.7288873195648193 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.0006348076757223877, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 7569073.0, "repeat_count": 0.0, "routers_loss": 0.0015065820189192891, "skip_count": 2.0, "step": 4694, "text_loss": 0.7242236137390137 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0006345095953791746, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7573025.0, "repeat_count": 0.0, "routers_loss": 0.0005603441968560219, "skip_count": 0.0, "step": 4696, "text_loss": 0.34443899989128113 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 0.0006342114634916307, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7576546.0, "repeat_count": 0.0, "routers_loss": 0.0011047758162021637, "skip_count": 0.0, "step": 4698, "text_loss": 0.4892682731151581 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02490234375, "learning_rate": 0.0006339132801740008, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 7580711.0, "repeat_count": 0.0, "routers_loss": 0.0019803126342594624, "skip_count": 2.0, "step": 4700, "text_loss": 0.4479489028453827 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.07513941884356, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0458984375, "learning_rate": 0.0006336150455405494, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 7583385.0, "repeat_count": 1.0, "routers_loss": 0.0005326359532773495, "skip_count": 0.0, "step": 4702, "text_loss": 0.627504825592041 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0006333167597055604, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 7586584.0, "repeat_count": 0.0, "routers_loss": 0.0005587987834587693, "skip_count": 0.0, "step": 4704, "text_loss": 0.43891432881355286 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.0006330184227833376, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 7590408.0, "repeat_count": 0.0, "routers_loss": 0.007053783163428307, "skip_count": 2.0, "step": 4706, "text_loss": 0.19946859776973724 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.10331670090989, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0006327200348882043, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7593857.0, "repeat_count": 1.0, "routers_loss": 0.0009479080326855183, "skip_count": 0.0, "step": 4708, "text_loss": 0.7973214387893677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0006324215961345032, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7596429.0, "repeat_count": 0.0, "routers_loss": 0.0012403312139213085, "skip_count": 0.0, "step": 4710, "text_loss": 0.48477989435195923 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.0006321231066365966, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7599618.0, "repeat_count": 0.0, "routers_loss": 0.0005520360427908599, "skip_count": 0.0, "step": 4712, "text_loss": 0.44222453236579895 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0006318245665088665, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 7603180.0, "repeat_count": 0.0, "routers_loss": 0.0015553623670712113, "skip_count": 0.0, "step": 4714, "text_loss": 0.5132410526275635 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.0006315259758657138, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 7606457.0, "repeat_count": 0.0, "routers_loss": 0.004210884217172861, "skip_count": 1.0, "step": 4716, "text_loss": 0.39850690960884094 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.150278837687114, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.0006312273348215589, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 7609317.0, "repeat_count": 1.0, "routers_loss": 0.001220117206685245, "skip_count": 0.0, "step": 4718, "text_loss": 0.3509018123149872 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0006309286434908419, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 7613076.0, "repeat_count": 0.0, "routers_loss": 0.007768960203975439, "skip_count": 2.0, "step": 4720, "text_loss": 0.33361560106277466 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0006306299019880217, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7616242.0, "repeat_count": 0.0, "routers_loss": 0.006226699333637953, "skip_count": 0.0, "step": 4722, "text_loss": 0.23661087453365326 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.17845611975345, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0006303311104275766, "loss": 0.0073, "macro_f1": 0.6603773832321167, "num_tokens": 7619069.0, "repeat_count": 1.0, "routers_loss": 0.015590761788189411, "skip_count": 1.0, "step": 4724, "text_loss": 0.23373056948184967 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.187848547108892, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006300322689240041, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 7622581.0, "repeat_count": 1.0, "routers_loss": 0.006862971931695938, "skip_count": 2.0, "step": 4726, "text_loss": 0.8301828503608704 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.19724097446434, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.038818359375, "learning_rate": 0.0006297333775918209, "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 7625566.0, "repeat_count": 1.0, "routers_loss": 0.006256614346057177, "skip_count": 1.0, "step": 4728, "text_loss": 0.3756707012653351 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.206633401819783, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0006294344365455626, "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 7629047.0, "repeat_count": 1.0, "routers_loss": 0.009151885285973549, "skip_count": 2.0, "step": 4730, "text_loss": 0.33362850546836853 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0006291354458997841, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 7631847.0, "repeat_count": 0.0, "routers_loss": 0.0009307434665970504, "skip_count": 0.0, "step": 4732, "text_loss": 0.4572524130344391 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.225418256530673, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0006288364057690591, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7635181.0, "repeat_count": 0.0, "routers_loss": 0.00041220212006010115, "skip_count": 0.0, "step": 4734, "text_loss": 0.40211325883865356 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0006285373162679804, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7637752.0, "repeat_count": 0.0, "routers_loss": 0.0006696670898236334, "skip_count": 2.0, "step": 4736, "text_loss": 0.7588053345680237 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 22.24420311124156, "f1_execute": 0.9777777791023254, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0006282381775111597, "loss": 0.0081, "macro_f1": 0.9449735879898071, "num_tokens": 7640719.0, "repeat_count": 4.0, "routers_loss": 0.016283133998513222, "skip_count": 2.0, "step": 4738, "text_loss": 0.5697863101959229 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 22.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0006279389896132274, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7643524.0, "repeat_count": 0.0, "routers_loss": 0.00763951288536191, "skip_count": 3.0, "step": 4740, "text_loss": 0.548592209815979 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 22.26298796595245, "f1_execute": 0.9756097793579102, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0006276397526888329, "loss": 0.0094, "macro_f1": 0.925203263759613, "num_tokens": 7646919.0, "repeat_count": 3.0, "routers_loss": 0.038590483367443085, "skip_count": 5.0, "step": 4742, "text_loss": 0.27226054668426514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0006273404668526443, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7650404.0, "repeat_count": 0.0, "routers_loss": 0.0012555639259517193, "skip_count": 0.0, "step": 4744, "text_loss": 0.47892290353775024 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0006270411322193488, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7652942.0, "repeat_count": 1.0, "routers_loss": 0.0015356402145698667, "skip_count": 0.0, "step": 4746, "text_loss": 0.5515767931938171 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.0006267417489036517, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7656269.0, "repeat_count": 0.0, "routers_loss": 0.005182140972465277, "skip_count": 0.0, "step": 4748, "text_loss": 0.3496028184890747 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0006264423170202773, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7658664.0, "repeat_count": 0.0, "routers_loss": 0.004144361708313227, "skip_count": 0.0, "step": 4750, "text_loss": 0.2786032557487488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0006261428366839685, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 7661471.0, "repeat_count": 0.0, "routers_loss": 0.00035335420398041606, "skip_count": 0.0, "step": 4752, "text_loss": 0.4838487505912781 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0006258433080094868, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7664593.0, "repeat_count": 0.0, "routers_loss": 0.0103341368958354, "skip_count": 2.0, "step": 4754, "text_loss": 0.24325360357761383 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0006255437311116119, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 7667573.0, "repeat_count": 0.0, "routers_loss": 0.014633853919804096, "skip_count": 2.0, "step": 4756, "text_loss": 0.21569855511188507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.0006252441061051426, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7671171.0, "repeat_count": 0.0, "routers_loss": 0.004900569561868906, "skip_count": 0.0, "step": 4758, "text_loss": 0.12832018733024597 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006249444331048955, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 7673932.0, "repeat_count": 0.0, "routers_loss": 0.0020371589343994856, "skip_count": 0.0, "step": 4760, "text_loss": 0.38652482628822327 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.000624644712225706, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7677396.0, "repeat_count": 0.0, "routers_loss": 0.0028059002943336964, "skip_count": 2.0, "step": 4762, "text_loss": 0.7937633395195007 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0006243449435824276, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7680392.0, "repeat_count": 0.0, "routers_loss": 0.0007225095760077238, "skip_count": 0.0, "step": 4764, "text_loss": 0.5690395832061768 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.375697094217788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0006240451272899321, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 7684121.0, "repeat_count": 0.0, "routers_loss": 0.002052050782367587, "skip_count": 1.0, "step": 4766, "text_loss": 0.5321336984634399 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.38508952157323, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.0006237452634631099, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 7687236.0, "repeat_count": 1.0, "routers_loss": 0.0039039517287164927, "skip_count": 0.0, "step": 4768, "text_loss": 0.30823320150375366 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 22.394481948928675, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0303955078125, "learning_rate": 0.0006234453522168694, "loss": 0.0084, "macro_f1": 0.5492662787437439, "num_tokens": 7690355.0, "repeat_count": 0.0, "routers_loss": 0.014570238068699837, "skip_count": 2.0, "step": 4770, "text_loss": 0.21501587331295013 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.8333333134651184, "avg_layers": 25.0, "epoch": 22.403874376284122, "f1_execute": 0.949999988079071, "f1_repeat": 0.800000011920929, "f1_skip": 0.9090909361839294, "grad_norm": 0.04541015625, "learning_rate": 0.000623145393666137, "loss": 0.0069, "macro_f1": 0.886363685131073, "num_tokens": 7693559.0, "repeat_count": 3.0, "routers_loss": 0.061707716435194016, "skip_count": 6.0, "step": 4772, "text_loss": 0.24371100962162018 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0006228453879258576, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 7696422.0, "repeat_count": 0.0, "routers_loss": 0.005053870379924774, "skip_count": 2.0, "step": 4774, "text_loss": 0.237778440117836 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.0006225453351109934, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 7700460.0, "repeat_count": 0.0, "routers_loss": 0.0017990898340940475, "skip_count": 0.0, "step": 4776, "text_loss": 0.612456738948822 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03466796875, "learning_rate": 0.000622245235336526, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7703330.0, "repeat_count": 0.0, "routers_loss": 0.004507021512836218, "skip_count": 2.0, "step": 4778, "text_loss": 0.36898812651634216 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006219450887174537, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7707243.0, "repeat_count": 0.0, "routers_loss": 0.006295828148722649, "skip_count": 1.0, "step": 4780, "text_loss": 0.14474599063396454 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.0006216448953687932, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7711121.0, "repeat_count": 0.0, "routers_loss": 0.005049831233918667, "skip_count": 0.0, "step": 4782, "text_loss": 0.4696790277957916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0006213446554055795, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7714889.0, "repeat_count": 0.0, "routers_loss": 0.0006010758224874735, "skip_count": 0.0, "step": 4784, "text_loss": 0.46253830194473267 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 22.469621367772234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006210443689428649, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 7718420.0, "repeat_count": 3.0, "routers_loss": 0.006691234186291695, "skip_count": 1.0, "step": 4786, "text_loss": 0.579987645149231 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.00062074403609572, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7721720.0, "repeat_count": 0.0, "routers_loss": 0.001864895923063159, "skip_count": 0.0, "step": 4788, "text_loss": 0.325242817401886 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.488406222483125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0006204436569792324, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 7724916.0, "repeat_count": 0.0, "routers_loss": 0.00202955212444067, "skip_count": 0.0, "step": 4790, "text_loss": 0.49637556076049805 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.49779864983857, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0006201432317085083, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 7728081.0, "repeat_count": 1.0, "routers_loss": 0.0037843603640794754, "skip_count": 0.0, "step": 4792, "text_loss": 0.38812628388404846 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 22.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0006198427603986711, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7731457.0, "repeat_count": 0.0, "routers_loss": 0.012036679312586784, "skip_count": 3.0, "step": 4794, "text_loss": 0.2996312379837036 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.516583504549455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0006195422431648623, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 7734595.0, "repeat_count": 0.0, "routers_loss": 0.0008874868508428335, "skip_count": 1.0, "step": 4796, "text_loss": 0.3203189969062805 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.525975931904902, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0283203125, "learning_rate": 0.0006192416801222403, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 7737565.0, "repeat_count": 1.0, "routers_loss": 0.0032894534524530172, "skip_count": 1.0, "step": 4798, "text_loss": 0.3283322751522064 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0006189410713859815, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 7740439.0, "repeat_count": 0.0, "routers_loss": 0.009667043574154377, "skip_count": 2.0, "step": 4800, "text_loss": 0.25219282507896423 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 22.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0006186404170712797, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 7743813.0, "repeat_count": 0.0, "routers_loss": 0.012643060646951199, "skip_count": 4.0, "step": 4802, "text_loss": 0.22567439079284668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0006183397172933462, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 7747182.0, "repeat_count": 0.0, "routers_loss": 0.002678517485037446, "skip_count": 0.0, "step": 4804, "text_loss": 0.19188879430294037 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0006180389721674101, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 7750735.0, "repeat_count": 0.0, "routers_loss": 0.0013385121710598469, "skip_count": 0.0, "step": 4806, "text_loss": 0.5860441327095032 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 0.000617738181808717, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7753843.0, "repeat_count": 0.0, "routers_loss": 0.0034869094379246235, "skip_count": 1.0, "step": 4808, "text_loss": 0.4366260766983032 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 0.0006174373463325306, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7757039.0, "repeat_count": 0.0, "routers_loss": 0.0013648992171511054, "skip_count": 0.0, "step": 4810, "text_loss": 0.5217258334159851 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.591722923393014, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0006171364658541314, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 7760016.0, "repeat_count": 1.0, "routers_loss": 0.0038017008919268847, "skip_count": 2.0, "step": 4812, "text_loss": 0.8130963444709778 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.601115350748458, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03466796875, "learning_rate": 0.0006168355404888177, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 7762961.0, "repeat_count": 0.0, "routers_loss": 0.006867518648505211, "skip_count": 2.0, "step": 4814, "text_loss": 0.17822521924972534 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.610507778103905, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006165345703519043, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7766399.0, "repeat_count": 0.0, "routers_loss": 0.0004653502255678177, "skip_count": 0.0, "step": 4816, "text_loss": 0.5316070914268494 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.61990020545935, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0006162335555587238, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 7769039.0, "repeat_count": 1.0, "routers_loss": 0.0016906452365219593, "skip_count": 1.0, "step": 4818, "text_loss": 0.5680997967720032 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.629292632814792, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0006159324962246257, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 7772768.0, "repeat_count": 0.0, "routers_loss": 0.002541248919442296, "skip_count": 0.0, "step": 4820, "text_loss": 0.6169226169586182 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0006156313924649762, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7775545.0, "repeat_count": 0.0, "routers_loss": 0.008644679561257362, "skip_count": 2.0, "step": 4822, "text_loss": 0.2211475968360901 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02880859375, "learning_rate": 0.0006153302443951589, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7778837.0, "repeat_count": 0.0, "routers_loss": 0.0041346061043441296, "skip_count": 2.0, "step": 4824, "text_loss": 0.5369775891304016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 0.0006150290521305746, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 7782309.0, "repeat_count": 0.0, "routers_loss": 0.0012756052892655134, "skip_count": 0.0, "step": 4826, "text_loss": 0.5294989943504333 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.666862342236573, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0006147278157866403, "loss": 0.0046, "macro_f1": 0.3272727429866791, "num_tokens": 7785565.0, "repeat_count": 0.0, "routers_loss": 0.029718991369009018, "skip_count": 1.0, "step": 4828, "text_loss": 0.6920449733734131 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0006144265354787906, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7788218.0, "repeat_count": 0.0, "routers_loss": 0.004829924553632736, "skip_count": 0.0, "step": 4830, "text_loss": 0.17072243988513947 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0006141252113224767, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7790788.0, "repeat_count": 0.0, "routers_loss": 0.00254037044942379, "skip_count": 0.0, "step": 4832, "text_loss": 0.20075996220111847 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01519775390625, "learning_rate": 0.0006138238434331666, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7793913.0, "repeat_count": 0.0, "routers_loss": 0.0004426188243087381, "skip_count": 0.0, "step": 4834, "text_loss": 0.695742130279541 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.70443205165835, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.022216796875, "learning_rate": 0.000613522431926345, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 7796932.0, "repeat_count": 1.0, "routers_loss": 0.005176798906177282, "skip_count": 3.0, "step": 4836, "text_loss": 0.4910822808742523 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0006132209769175132, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7800686.0, "repeat_count": 0.0, "routers_loss": 0.004120545461773872, "skip_count": 0.0, "step": 4838, "text_loss": 0.3701378405094147 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0006129194785221894, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7804765.0, "repeat_count": 0.0, "routers_loss": 0.0043835826218128204, "skip_count": 0.0, "step": 4840, "text_loss": 0.343635618686676 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0006126179368559086, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 7807498.0, "repeat_count": 0.0, "routers_loss": 0.001394893741235137, "skip_count": 1.0, "step": 4842, "text_loss": 0.47756674885749817 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.000612316352034222, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7810784.0, "repeat_count": 0.0, "routers_loss": 0.0031262130942195654, "skip_count": 2.0, "step": 4844, "text_loss": 0.13077901303768158 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.751394188435572, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.058349609375, "learning_rate": 0.0006120147241726972, "loss": 0.0081, "macro_f1": 0.8823530077934265, "num_tokens": 7814754.0, "repeat_count": 2.0, "routers_loss": 0.016139274463057518, "skip_count": 1.0, "step": 4846, "text_loss": 0.18850074708461761 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0006117130533869189, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7818245.0, "repeat_count": 0.0, "routers_loss": 0.0009124451316893101, "skip_count": 0.0, "step": 4848, "text_loss": 0.42503559589385986 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0006114113397924878, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7822214.0, "repeat_count": 0.0, "routers_loss": 0.0015132242115214467, "skip_count": 0.0, "step": 4850, "text_loss": 0.16767354309558868 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.779571470501907, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0006111095835050212, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 7825019.0, "repeat_count": 2.0, "routers_loss": 0.006253300234675407, "skip_count": 2.0, "step": 4852, "text_loss": 0.44826745986938477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.788963897857354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.0006108077846401524, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 7828113.0, "repeat_count": 0.0, "routers_loss": 0.0024391328915953636, "skip_count": 0.0, "step": 4854, "text_loss": 0.2009880244731903 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.798356325212797, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0006105059433135317, "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 7831177.0, "repeat_count": 1.0, "routers_loss": 0.0020866121631115675, "skip_count": 1.0, "step": 4856, "text_loss": 0.7082528471946716 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025390625, "learning_rate": 0.0006102040596408251, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 7834485.0, "repeat_count": 0.0, "routers_loss": 0.004373365081846714, "skip_count": 1.0, "step": 4858, "text_loss": 0.2541539669036865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.817141179923688, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0006099021337377148, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7837749.0, "repeat_count": 0.0, "routers_loss": 0.004309024661779404, "skip_count": 0.0, "step": 4860, "text_loss": 0.3163885176181793 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 22.82653360727913, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.049072265625, "learning_rate": 0.0006096001657198995, "loss": 0.0065, "macro_f1": 0.6122449040412903, "num_tokens": 7840979.0, "repeat_count": 0.0, "routers_loss": 0.023044804111123085, "skip_count": 4.0, "step": 4862, "text_loss": 0.49609798192977905 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.835926034634575, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 0.0006092981557030941, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 7844905.0, "repeat_count": 1.0, "routers_loss": 0.010683654807507992, "skip_count": 3.0, "step": 4864, "text_loss": 0.16866883635520935 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0006089961038030291, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 7847800.0, "repeat_count": 0.0, "routers_loss": 0.0011224723421037197, "skip_count": 0.0, "step": 4866, "text_loss": 0.5093055367469788 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0006086940101354515, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7850983.0, "repeat_count": 0.0, "routers_loss": 0.003944621421396732, "skip_count": 1.0, "step": 4868, "text_loss": 0.5753747224807739 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 22.86410331670091, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0289306640625, "learning_rate": 0.0006083918748161244, "loss": 0.0069, "macro_f1": 0.5492662787437439, "num_tokens": 7855041.0, "repeat_count": 0.0, "routers_loss": 0.02532145567238331, "skip_count": 2.0, "step": 4870, "text_loss": 0.8082366585731506 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.0006080896979608262, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7858058.0, "repeat_count": 0.0, "routers_loss": 0.0007558314246125519, "skip_count": 0.0, "step": 4872, "text_loss": 0.6476574540138245 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.000607787479685352, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7861223.0, "repeat_count": 0.0, "routers_loss": 0.0009224560926668346, "skip_count": 0.0, "step": 4874, "text_loss": 0.5012133717536926 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.0006074852201055121, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 7864180.0, "repeat_count": 0.0, "routers_loss": 0.0028308273758739233, "skip_count": 0.0, "step": 4876, "text_loss": 0.7447214722633362 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0006071829193371331, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7866726.0, "repeat_count": 0.0, "routers_loss": 0.0021505290642380714, "skip_count": 0.0, "step": 4878, "text_loss": 0.5444929599761963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11376953125, "learning_rate": 0.0006068805774960573, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7870166.0, "repeat_count": 0.0, "routers_loss": 0.0021109723020344973, "skip_count": 0.0, "step": 4880, "text_loss": 0.3577263355255127 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 0.0006065781946981425, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7873028.0, "repeat_count": 0.0, "routers_loss": 0.0027144821360707283, "skip_count": 0.0, "step": 4882, "text_loss": 0.28464797139167786 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0006062757710592624, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7876747.0, "repeat_count": 0.0, "routers_loss": 0.0004638207610696554, "skip_count": 0.0, "step": 4884, "text_loss": 0.381534606218338 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.939242735544468, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0006059733066953066, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 7879524.0, "repeat_count": 1.0, "routers_loss": 0.002225410658866167, "skip_count": 2.0, "step": 4886, "text_loss": 0.5167883634567261 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006056708017221796, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 7882809.0, "repeat_count": 0.0, "routers_loss": 0.00419368501752615, "skip_count": 1.0, "step": 4888, "text_loss": 0.22688335180282593 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.000605368256255802, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7886310.0, "repeat_count": 0.0, "routers_loss": 0.0017340193735435605, "skip_count": 1.0, "step": 4890, "text_loss": 1.0128135681152344 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.967420017610802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0006050656704121098, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 7889483.0, "repeat_count": 0.0, "routers_loss": 0.0016647159354761243, "skip_count": 0.0, "step": 4892, "text_loss": 0.2213262915611267 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 22.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0006047630443070547, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7892615.0, "repeat_count": 0.0, "routers_loss": 0.0038971947506070137, "skip_count": 3.0, "step": 4894, "text_loss": 0.45751357078552246 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.98620487232169, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0006044603780566032, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 7895747.0, "repeat_count": 1.0, "routers_loss": 0.0036852145567536354, "skip_count": 1.0, "step": 4896, "text_loss": 0.13489919900894165 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.995597299677137, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0006041576717767379, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7899155.0, "repeat_count": 0.0, "routers_loss": 0.007661987561732531, "skip_count": 1.0, "step": 4898, "text_loss": 0.281853586435318 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.00469621367772, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0006038549255834563, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 7901667.0, "repeat_count": 2.0, "routers_loss": 0.01836695335805416, "skip_count": 5.0, "step": 4900, "text_loss": 0.24879895150661469 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.000603552139592771, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7904506.0, "repeat_count": 0.0, "routers_loss": 0.0011829182039946318, "skip_count": 0.0, "step": 4902, "text_loss": 0.7550268769264221 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 23.023481068388612, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0006032493139207106, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7907316.0, "repeat_count": 1.0, "routers_loss": 0.0022891140542924404, "skip_count": 0.0, "step": 4904, "text_loss": 0.37596020102500916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0006029464486833186, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 7911283.0, "repeat_count": 0.0, "routers_loss": 0.001990227960050106, "skip_count": 0.0, "step": 4906, "text_loss": 0.5879577994346619 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0211181640625, "learning_rate": 0.0006026435439966531, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 7913907.0, "repeat_count": 0.0, "routers_loss": 0.0026039890944957733, "skip_count": 1.0, "step": 4908, "text_loss": 0.41484713554382324 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0006023405999767879, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7916772.0, "repeat_count": 0.0, "routers_loss": 0.009183229878544807, "skip_count": 1.0, "step": 4910, "text_loss": 0.20732562243938446 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0302734375, "learning_rate": 0.0006020376167398116, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7919346.0, "repeat_count": 0.0, "routers_loss": 0.005508727394044399, "skip_count": 1.0, "step": 4912, "text_loss": 0.41416165232658386 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 23.070443205165834, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0006017345944018284, "loss": 0.0051, "macro_f1": 0.3272727429866791, "num_tokens": 7922404.0, "repeat_count": 0.0, "routers_loss": 0.008651934564113617, "skip_count": 0.0, "step": 4914, "text_loss": 0.4290519952774048 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0006014315330789563, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 7925165.0, "repeat_count": 0.0, "routers_loss": 0.003601635340601206, "skip_count": 1.0, "step": 4916, "text_loss": 0.8447931408882141 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.089228059876724, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0006011284328873296, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 7928146.0, "repeat_count": 1.0, "routers_loss": 0.0049415635876357555, "skip_count": 2.0, "step": 4918, "text_loss": 0.32237401604652405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0006008252939430967, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7931163.0, "repeat_count": 0.0, "routers_loss": 0.0024150956887751818, "skip_count": 0.0, "step": 4920, "text_loss": 0.2251713126897812 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.108012914587615, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0006005221163624209, "loss": 0.0057, "macro_f1": 0.3272727429866791, "num_tokens": 7934084.0, "repeat_count": 1.0, "routers_loss": 0.03181030973792076, "skip_count": 0.0, "step": 4922, "text_loss": 0.4962928593158722 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.054931640625, "learning_rate": 0.0006002189002614806, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 7937021.0, "repeat_count": 0.0, "routers_loss": 0.00227518193423748, "skip_count": 2.0, "step": 4924, "text_loss": 0.34440335631370544 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0005999156457564685, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 7940205.0, "repeat_count": 0.0, "routers_loss": 0.004331593867391348, "skip_count": 1.0, "step": 4926, "text_loss": 0.14114083349704742 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0005996123529635925, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7945174.0, "repeat_count": 0.0, "routers_loss": 0.000612895586527884, "skip_count": 0.0, "step": 4928, "text_loss": 0.3895469009876251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.145582624009393, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.000599309021999075, "loss": 0.006, "macro_f1": 0.3272727429866791, "num_tokens": 7948716.0, "repeat_count": 0.0, "routers_loss": 0.02319233864545822, "skip_count": 1.0, "step": 4930, "text_loss": 0.38103172183036804 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0005990056529791528, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7952497.0, "repeat_count": 0.0, "routers_loss": 0.003423231653869152, "skip_count": 0.0, "step": 4932, "text_loss": 0.30447322130203247 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.017822265625, "learning_rate": 0.0005987022460200778, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7955578.0, "repeat_count": 0.0, "routers_loss": 0.0007005351362749934, "skip_count": 0.0, "step": 4934, "text_loss": 0.49621838331222534 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 23.173759906075727, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0234375, "learning_rate": 0.0005983988012381159, "loss": 0.0061, "macro_f1": 0.8823530077934265, "num_tokens": 7958741.0, "repeat_count": 2.0, "routers_loss": 0.03962617367506027, "skip_count": 1.0, "step": 4936, "text_loss": 0.1920493096113205 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022216796875, "learning_rate": 0.0005980953187495476, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 7962236.0, "repeat_count": 0.0, "routers_loss": 0.0026006060652434826, "skip_count": 3.0, "step": 4938, "text_loss": 0.5286803841590881 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0005977917986706681, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7965631.0, "repeat_count": 0.0, "routers_loss": 0.005010952707380056, "skip_count": 0.0, "step": 4940, "text_loss": 0.3507745563983917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0005974882411177871, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 7968516.0, "repeat_count": 0.0, "routers_loss": 0.0023964287247508764, "skip_count": 0.0, "step": 4942, "text_loss": 0.9110504388809204 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.000597184646207228, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7971310.0, "repeat_count": 0.0, "routers_loss": 0.0026230409275740385, "skip_count": 1.0, "step": 4944, "text_loss": 0.4131232798099518 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.0005968810140553292, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 7974809.0, "repeat_count": 0.0, "routers_loss": 0.0007397596491500735, "skip_count": 0.0, "step": 4946, "text_loss": 0.5130466222763062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0005965773447784431, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7977800.0, "repeat_count": 0.0, "routers_loss": 0.0009955473942682147, "skip_count": 0.0, "step": 4948, "text_loss": 0.5366153717041016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01373291015625, "learning_rate": 0.0005962736384929362, "loss": 0.0026, "macro_f1": 0.3333333432674408, "num_tokens": 7981027.0, "repeat_count": 0.0, "routers_loss": 0.0049227322451770306, "skip_count": 0.0, "step": 4950, "text_loss": 0.17266370356082916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0005959698953151895, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7983580.0, "repeat_count": 0.0, "routers_loss": 0.0009975163266062737, "skip_count": 0.0, "step": 4952, "text_loss": 0.2474549114704132 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0005956661153615979, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 7986711.0, "repeat_count": 0.0, "routers_loss": 0.0006475782720372081, "skip_count": 0.0, "step": 4954, "text_loss": 0.5748327970504761 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.0005953622987485703, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7990194.0, "repeat_count": 0.0, "routers_loss": 0.001449751085601747, "skip_count": 0.0, "step": 4956, "text_loss": 0.5163559317588806 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0005950584455925301, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7993050.0, "repeat_count": 0.0, "routers_loss": 0.0017087773885577917, "skip_count": 0.0, "step": 4958, "text_loss": 0.15892620384693146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.286469034341064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0005947545560099142, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 7996383.0, "repeat_count": 0.0, "routers_loss": 0.0044417232275009155, "skip_count": 0.0, "step": 4960, "text_loss": 0.48022928833961487 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 23.295861461696507, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.031982421875, "learning_rate": 0.0005944506301171734, "loss": 0.0066, "macro_f1": 0.5492662787437439, "num_tokens": 7999843.0, "repeat_count": 0.0, "routers_loss": 0.010093312710523605, "skip_count": 2.0, "step": 4962, "text_loss": 0.5050316452980042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0005941466680307732, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8003504.0, "repeat_count": 0.0, "routers_loss": 0.009699694812297821, "skip_count": 0.0, "step": 4964, "text_loss": 0.30474427342414856 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 23.314646316407398, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0005938426698671922, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 8007427.0, "repeat_count": 1.0, "routers_loss": 0.0016759657301008701, "skip_count": 0.0, "step": 4966, "text_loss": 0.25060293078422546 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.32403874376284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04443359375, "learning_rate": 0.0005935386357429232, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 8010265.0, "repeat_count": 2.0, "routers_loss": 0.006916914135217667, "skip_count": 3.0, "step": 4968, "text_loss": 0.49084481596946716 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 23.333431171118285, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.0005932345657744723, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 8013733.0, "repeat_count": 1.0, "routers_loss": 0.017182426527142525, "skip_count": 5.0, "step": 4970, "text_loss": 0.2705717980861664 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00059293046007836, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8017068.0, "repeat_count": 0.0, "routers_loss": 0.008485594764351845, "skip_count": 2.0, "step": 4972, "text_loss": 0.18570218980312347 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0005926263187711201, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 8020185.0, "repeat_count": 0.0, "routers_loss": 0.0021750847809016705, "skip_count": 2.0, "step": 4974, "text_loss": 0.4457069933414459 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0005923221419693001, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 8023038.0, "repeat_count": 0.0, "routers_loss": 0.0020193420350551605, "skip_count": 0.0, "step": 4976, "text_loss": 0.7394505143165588 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.054931640625, "learning_rate": 0.0005920179297894613, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 8026236.0, "repeat_count": 0.0, "routers_loss": 0.001450369250960648, "skip_count": 1.0, "step": 4978, "text_loss": 0.5914503335952759 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.000591713682348178, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8028765.0, "repeat_count": 0.0, "routers_loss": 0.0017808573320508003, "skip_count": 0.0, "step": 4980, "text_loss": 0.19231407344341278 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005914093997620388, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8032043.0, "repeat_count": 0.0, "routers_loss": 0.0018225493840873241, "skip_count": 0.0, "step": 4982, "text_loss": 0.3567875325679779 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.399178162606397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0005911050821476449, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8035086.0, "repeat_count": 0.0, "routers_loss": 0.0016285666497424245, "skip_count": 0.0, "step": 4984, "text_loss": 0.34609633684158325 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 0.0005908007296216119, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8038193.0, "repeat_count": 0.0, "routers_loss": 0.0014699801104143262, "skip_count": 0.0, "step": 4986, "text_loss": 0.4492359757423401 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.000590496342300568, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 8041099.0, "repeat_count": 0.0, "routers_loss": 0.002442725468426943, "skip_count": 0.0, "step": 4988, "text_loss": 0.5162975788116455 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0005901919203011548, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 8044350.0, "repeat_count": 0.0, "routers_loss": 0.008624207228422165, "skip_count": 2.0, "step": 4990, "text_loss": 0.2533033490180969 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.0005898874637400279, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 8047467.0, "repeat_count": 0.0, "routers_loss": 0.0015421364223584533, "skip_count": 0.0, "step": 4992, "text_loss": 0.4890289306640625 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.44614029938362, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0005895829727338552, "loss": 0.0065, "macro_f1": 1.0, "num_tokens": 8050626.0, "repeat_count": 1.0, "routers_loss": 0.0024516626726835966, "skip_count": 2.0, "step": 4994, "text_loss": 0.50797039270401 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0005892784473993184, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 8053386.0, "repeat_count": 0.0, "routers_loss": 0.0018553845584392548, "skip_count": 2.0, "step": 4996, "text_loss": 0.628828763961792 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0286865234375, "learning_rate": 0.000588973887853112, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8055941.0, "repeat_count": 0.0, "routers_loss": 0.004258487373590469, "skip_count": 0.0, "step": 4998, "text_loss": 0.2643229067325592 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.474317581449956, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.0005886692942119441, "loss": 0.0062, "macro_f1": 0.8820862174034119, "num_tokens": 8058638.0, "repeat_count": 2.0, "routers_loss": 0.019064312800765038, "skip_count": 2.0, "step": 5000, "text_loss": 0.4925006031990051 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0005883646665925353, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 8062097.0, "repeat_count": 0.0, "routers_loss": 0.0007969749276526272, "skip_count": 0.0, "step": 5002, "text_loss": 0.49412909150123596 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0005880600051116196, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 8065202.0, "repeat_count": 0.0, "routers_loss": 0.005813780706375837, "skip_count": 2.0, "step": 5004, "text_loss": 0.5681346654891968 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0005877553098859439, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 8068574.0, "repeat_count": 0.0, "routers_loss": 0.005012941546738148, "skip_count": 0.0, "step": 5006, "text_loss": 0.2682424485683441 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0005874505810322678, "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 8071834.0, "repeat_count": 0.0, "routers_loss": 0.005859757773578167, "skip_count": 3.0, "step": 5008, "text_loss": 0.6460036039352417 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.000587145818667364, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 8074687.0, "repeat_count": 0.0, "routers_loss": 0.002868571551516652, "skip_count": 2.0, "step": 5010, "text_loss": 0.2405751347541809 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0005868410229080181, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8077617.0, "repeat_count": 0.0, "routers_loss": 0.0021759893279522657, "skip_count": 1.0, "step": 5012, "text_loss": 0.7455595135688782 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.0005865361938710286, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8080734.0, "repeat_count": 0.0, "routers_loss": 0.0008311949786730111, "skip_count": 0.0, "step": 5014, "text_loss": 0.44876906275749207 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 25.0, "epoch": 23.549457000293515, "f1_execute": 0.9756097793579102, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.0390625, "learning_rate": 0.0005862313316732063, "loss": 0.0054, "macro_f1": 0.9615669250488281, "num_tokens": 8085092.0, "repeat_count": 2.0, "routers_loss": 0.012511664070189, "skip_count": 6.0, "step": 5016, "text_loss": 0.26010942459106445 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.000585926436431375, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 8088333.0, "repeat_count": 0.0, "routers_loss": 0.0035441694781184196, "skip_count": 0.0, "step": 5018, "text_loss": 0.28225192427635193 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 23.568241855004402, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.031494140625, "learning_rate": 0.0005856215082623711, "loss": 0.0093, "macro_f1": 0.8823530077934265, "num_tokens": 8091298.0, "repeat_count": 1.0, "routers_loss": 0.023543989285826683, "skip_count": 2.0, "step": 5020, "text_loss": 0.5757577419281006 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 0.0005853165472830439, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8094361.0, "repeat_count": 0.0, "routers_loss": 0.003124240320175886, "skip_count": 0.0, "step": 5022, "text_loss": 0.4021305739879608 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0005850115536102546, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8097514.0, "repeat_count": 0.0, "routers_loss": 0.008170558139681816, "skip_count": 1.0, "step": 5024, "text_loss": 0.18926584720611572 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 23.596419137070736, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0005847065273608777, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 8100525.0, "repeat_count": 1.0, "routers_loss": 0.02127663604915142, "skip_count": 5.0, "step": 5026, "text_loss": 0.18827557563781738 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 0.0005844014686517998, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 8104016.0, "repeat_count": 0.0, "routers_loss": 0.00272122910246253, "skip_count": 0.0, "step": 5028, "text_loss": 0.15534701943397522 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 23.615203991781627, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.0005840963775999199, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 8106697.0, "repeat_count": 5.0, "routers_loss": 0.008979840204119682, "skip_count": 4.0, "step": 5030, "text_loss": 0.8123718500137329 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0005837912543221493, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 8110986.0, "repeat_count": 0.0, "routers_loss": 0.005006929859519005, "skip_count": 0.0, "step": 5032, "text_loss": 0.26128846406936646 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0005834860989354121, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 8114010.0, "repeat_count": 0.0, "routers_loss": 0.0005531277856789529, "skip_count": 0.0, "step": 5034, "text_loss": 0.5100266933441162 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.64338127384796, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0556640625, "learning_rate": 0.0005831809115566442, "loss": 0.0073, "macro_f1": 0.6538461446762085, "num_tokens": 8117168.0, "repeat_count": 2.0, "routers_loss": 0.04978533461689949, "skip_count": 1.0, "step": 5036, "text_loss": 0.41049885749816895 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0005828756923027941, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8119900.0, "repeat_count": 0.0, "routers_loss": 0.0006322385743260384, "skip_count": 0.0, "step": 5038, "text_loss": 0.5584380626678467 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0005825704412908225, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 8123928.0, "repeat_count": 0.0, "routers_loss": 0.001000594231300056, "skip_count": 0.0, "step": 5040, "text_loss": 0.6460791230201721 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 0.0005822651586377019, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 8127926.0, "repeat_count": 0.0, "routers_loss": 0.011595834977924824, "skip_count": 2.0, "step": 5042, "text_loss": 0.3131820261478424 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 0.0005819598444604173, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 8131092.0, "repeat_count": 0.0, "routers_loss": 0.004449303261935711, "skip_count": 3.0, "step": 5044, "text_loss": 0.2774372696876526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0005816544988759658, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 8134051.0, "repeat_count": 0.0, "routers_loss": 0.0007877505850046873, "skip_count": 0.0, "step": 5046, "text_loss": 0.39496293663978577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 0.0005813491220013563, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 8138725.0, "repeat_count": 0.0, "routers_loss": 0.002868623472750187, "skip_count": 0.0, "step": 5048, "text_loss": 0.3779948651790619 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.709128265336073, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0005810437139536098, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 8141913.0, "repeat_count": 2.0, "routers_loss": 0.006244937423616648, "skip_count": 4.0, "step": 5050, "text_loss": 0.4512978494167328 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06396484375, "learning_rate": 0.0005807382748497592, "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 8146193.0, "repeat_count": 0.0, "routers_loss": 0.0011013929033651948, "skip_count": 0.0, "step": 5052, "text_loss": 0.6194499731063843 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.0005804328048068493, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8149701.0, "repeat_count": 0.0, "routers_loss": 0.005505079869180918, "skip_count": 1.0, "step": 5054, "text_loss": 0.2932305335998535 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 23.737305547402407, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005801273039419368, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 8152861.0, "repeat_count": 1.0, "routers_loss": 0.0057641929015517235, "skip_count": 1.0, "step": 5056, "text_loss": 0.2631317973136902 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 23.74669797475785, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0005798217723720904, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 8155843.0, "repeat_count": 1.0, "routers_loss": 0.0021671492140740156, "skip_count": 5.0, "step": 5058, "text_loss": 0.2889988422393799 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0005795162102143902, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 8158812.0, "repeat_count": 0.0, "routers_loss": 0.004476628266274929, "skip_count": 1.0, "step": 5060, "text_loss": 0.48028868436813354 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.76548282946874, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.0005792106175859283, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 8162719.0, "repeat_count": 1.0, "routers_loss": 0.0038497636560350657, "skip_count": 3.0, "step": 5062, "text_loss": 0.4559471607208252 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.774875256824185, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0250244140625, "learning_rate": 0.0005789049946038083, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 8165692.0, "repeat_count": 0.0, "routers_loss": 0.004451582673937082, "skip_count": 0.0, "step": 5064, "text_loss": 0.3782602548599243 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.0005785993413851456, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8168900.0, "repeat_count": 0.0, "routers_loss": 0.002951978938654065, "skip_count": 0.0, "step": 5066, "text_loss": 0.32392629981040955 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.000578293658047067, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 8171661.0, "repeat_count": 0.0, "routers_loss": 0.011171254329383373, "skip_count": 2.0, "step": 5068, "text_loss": 0.24492619931697845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0005779879447067109, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 8175075.0, "repeat_count": 0.0, "routers_loss": 0.0016067599644884467, "skip_count": 0.0, "step": 5070, "text_loss": 0.7738823294639587 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.000577682201481227, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8178515.0, "repeat_count": 0.0, "routers_loss": 0.009113503620028496, "skip_count": 1.0, "step": 5072, "text_loss": 0.2082248032093048 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 23.82183739360141, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.0005773764284877774, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 8181790.0, "repeat_count": 1.0, "routers_loss": 0.007332196459174156, "skip_count": 1.0, "step": 5074, "text_loss": 0.4557662904262543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0537109375, "learning_rate": 0.0005770706258435342, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8184854.0, "repeat_count": 0.0, "routers_loss": 0.0016252279747277498, "skip_count": 0.0, "step": 5076, "text_loss": 0.2888098657131195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.840622248312297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0005767647936656818, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 8187860.0, "repeat_count": 0.0, "routers_loss": 0.003406575648114085, "skip_count": 0.0, "step": 5078, "text_loss": 0.6533790230751038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0005764589320714158, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 8191683.0, "repeat_count": 0.0, "routers_loss": 0.0006520140450447798, "skip_count": 0.0, "step": 5080, "text_loss": 0.6903796195983887 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0005761530411779426, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 8195109.0, "repeat_count": 0.0, "routers_loss": 0.01188349537551403, "skip_count": 1.0, "step": 5082, "text_loss": 0.20460398495197296 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.083984375, "learning_rate": 0.0005758471211024804, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 8198340.0, "repeat_count": 0.0, "routers_loss": 0.004826809279620647, "skip_count": 3.0, "step": 5084, "text_loss": 0.2203969657421112 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.878191957734078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020751953125, "learning_rate": 0.0005755411719622584, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8200882.0, "repeat_count": 0.0, "routers_loss": 0.0019170823507010937, "skip_count": 0.0, "step": 5086, "text_loss": 0.6744595170021057 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.0005752351938745167, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 8203777.0, "repeat_count": 0.0, "routers_loss": 0.002110893838107586, "skip_count": 1.0, "step": 5088, "text_loss": 0.4137859046459198 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0194091796875, "learning_rate": 0.000574929186956507, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 8207627.0, "repeat_count": 0.0, "routers_loss": 0.0018580821342766285, "skip_count": 1.0, "step": 5090, "text_loss": 0.4830456078052521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.906369239800412, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0005746231513254912, "loss": 0.0066, "macro_f1": 0.3272727429866791, "num_tokens": 8210263.0, "repeat_count": 1.0, "routers_loss": 0.0194723978638649, "skip_count": 0.0, "step": 5092, "text_loss": 0.17383277416229248 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005743170870987433, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 8214166.0, "repeat_count": 0.0, "routers_loss": 0.006944256369024515, "skip_count": 2.0, "step": 5094, "text_loss": 0.20003484189510345 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0005740109943935472, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8217545.0, "repeat_count": 0.0, "routers_loss": 0.002044794149696827, "skip_count": 1.0, "step": 5096, "text_loss": 0.5117167830467224 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.934546521866746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06494140625, "learning_rate": 0.0005737048733271986, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 8220673.0, "repeat_count": 1.0, "routers_loss": 0.009966124780476093, "skip_count": 2.0, "step": 5098, "text_loss": 0.2705996036529541 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0005733987240170035, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 8223796.0, "repeat_count": 0.0, "routers_loss": 0.0009675708715803921, "skip_count": 0.0, "step": 5100, "text_loss": 0.7016357183456421 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0208740234375, "learning_rate": 0.0005730925465802788, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8227048.0, "repeat_count": 0.0, "routers_loss": 0.0009548200177960098, "skip_count": 0.0, "step": 5102, "text_loss": 0.30823078751564026 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0005727863411343526, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8229971.0, "repeat_count": 0.0, "routers_loss": 0.0005767418188042939, "skip_count": 0.0, "step": 5104, "text_loss": 0.6897505521774292 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0005724801077965629, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8232758.0, "repeat_count": 0.0, "routers_loss": 0.009297889657318592, "skip_count": 3.0, "step": 5106, "text_loss": 0.21293514966964722 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.981508658643968, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0005721738466842592, "loss": 0.0079, "macro_f1": 0.3272727429866791, "num_tokens": 8238154.0, "repeat_count": 1.0, "routers_loss": 0.013964693062007427, "skip_count": 0.0, "step": 5108, "text_loss": 0.7273620367050171 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 23.99090108599941, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0005718675579148014, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 8240818.0, "repeat_count": 3.0, "routers_loss": 0.007218098267912865, "skip_count": 1.0, "step": 5110, "text_loss": 0.5607150793075562 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0005715612416055598, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 8244048.0, "repeat_count": 0.0, "routers_loss": 0.007558444049209356, "skip_count": 2.0, "step": 5112, "text_loss": 0.23694385588169098 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.009392427355444, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0005712548978739154, "loss": 0.0072, "macro_f1": 0.6603773832321167, "num_tokens": 8247240.0, "repeat_count": 1.0, "routers_loss": 0.015726923942565918, "skip_count": 1.0, "step": 5114, "text_loss": 0.6032099723815918 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.01878485471089, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.019775390625, "learning_rate": 0.0005709485268372598, "loss": 0.0046, "macro_f1": 0.9262410998344421, "num_tokens": 8250585.0, "repeat_count": 3.0, "routers_loss": 0.011148860678076744, "skip_count": 2.0, "step": 5116, "text_loss": 0.6825997233390808 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0005706421286129948, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 8254240.0, "repeat_count": 0.0, "routers_loss": 0.006977916229516268, "skip_count": 0.0, "step": 5118, "text_loss": 0.2532844543457031 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0005703357033185328, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 8257133.0, "repeat_count": 0.0, "routers_loss": 0.006415650714188814, "skip_count": 2.0, "step": 5120, "text_loss": 0.6132124066352844 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.046962136777225, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.0005700292510712967, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 8261076.0, "repeat_count": 1.0, "routers_loss": 0.0044475216418504715, "skip_count": 1.0, "step": 5122, "text_loss": 0.4277699887752533 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0005697227719887194, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 8264607.0, "repeat_count": 0.0, "routers_loss": 0.005743155721575022, "skip_count": 2.0, "step": 5124, "text_loss": 0.2570968270301819 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0005694162661882444, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8267992.0, "repeat_count": 0.0, "routers_loss": 0.0007581565878354013, "skip_count": 0.0, "step": 5126, "text_loss": 0.5850184559822083 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.07513941884356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0005691097337873252, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 8271010.0, "repeat_count": 0.0, "routers_loss": 0.0036611228715628386, "skip_count": 0.0, "step": 5128, "text_loss": 0.660999059677124 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0005688031749034258, "loss": 0.0032, "macro_f1": 0.3333333432674408, "num_tokens": 8273638.0, "repeat_count": 0.0, "routers_loss": 0.0039906189776957035, "skip_count": 0.0, "step": 5130, "text_loss": 0.5839648246765137 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.093924273554446, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.037109375, "learning_rate": 0.0005684965896540198, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 8276504.0, "repeat_count": 1.0, "routers_loss": 0.007539632264524698, "skip_count": 3.0, "step": 5132, "text_loss": 0.27675092220306396 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 24.10331670090989, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0005681899781565915, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 8279977.0, "repeat_count": 2.0, "routers_loss": 0.0026953567285090685, "skip_count": 0.0, "step": 5134, "text_loss": 0.532974123954773 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.000567883340528635, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 8282781.0, "repeat_count": 0.0, "routers_loss": 0.005754240322858095, "skip_count": 1.0, "step": 5136, "text_loss": 0.31100207567214966 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005675766768876542, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 8286533.0, "repeat_count": 0.0, "routers_loss": 0.0051517849788069725, "skip_count": 0.0, "step": 5138, "text_loss": 0.5734741687774658 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005672699873511635, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 8289858.0, "repeat_count": 0.0, "routers_loss": 0.0025852699764072895, "skip_count": 2.0, "step": 5140, "text_loss": 0.37045374512672424 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005669632720366868, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 8293038.0, "repeat_count": 0.0, "routers_loss": 0.0038520018570125103, "skip_count": 0.0, "step": 5142, "text_loss": 0.25952374935150146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0005666565310617577, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8295717.0, "repeat_count": 0.0, "routers_loss": 0.00026914477348327637, "skip_count": 0.0, "step": 5144, "text_loss": 0.32531213760375977 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 0.0005663497645439203, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 8299750.0, "repeat_count": 0.0, "routers_loss": 0.0055860537104308605, "skip_count": 2.0, "step": 5146, "text_loss": 0.2520618438720703 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.0005660429726007279, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 8303075.0, "repeat_count": 0.0, "routers_loss": 0.004446739796549082, "skip_count": 1.0, "step": 5148, "text_loss": 0.43672287464141846 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.17845611975345, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.07080078125, "learning_rate": 0.000565736155349744, "loss": 0.0076, "macro_f1": 0.8814815282821655, "num_tokens": 8306268.0, "repeat_count": 2.0, "routers_loss": 0.046915046870708466, "skip_count": 4.0, "step": 5150, "text_loss": 0.35405927896499634 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 24.187848547108892, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0005654293129085412, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8310480.0, "repeat_count": 0.0, "routers_loss": 0.010549088008701801, "skip_count": 4.0, "step": 5152, "text_loss": 0.3523249626159668 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.19724097446434, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0005651224453947023, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8313367.0, "repeat_count": 1.0, "routers_loss": 0.002893900265917182, "skip_count": 0.0, "step": 5154, "text_loss": 0.4503810703754425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0005648155529258195, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 8318006.0, "repeat_count": 0.0, "routers_loss": 0.0018450213829055429, "skip_count": 0.0, "step": 5156, "text_loss": 0.5687127113342285 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047119140625, "learning_rate": 0.0005645086356194943, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8320646.0, "repeat_count": 0.0, "routers_loss": 0.0026727779768407345, "skip_count": 0.0, "step": 5158, "text_loss": 0.38920050859451294 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.225418256530673, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0005642016935933385, "loss": 0.0035, "macro_f1": 1.0, "num_tokens": 8323915.0, "repeat_count": 1.0, "routers_loss": 0.00611621281132102, "skip_count": 2.0, "step": 5160, "text_loss": 0.3003547787666321 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 24.0, "epoch": 24.234810683886117, "f1_execute": 0.9767441749572754, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.0257568359375, "learning_rate": 0.0005638947269649726, "loss": 0.0063, "macro_f1": 0.9619450569152832, "num_tokens": 8327073.0, "repeat_count": 1.0, "routers_loss": 0.028447439894080162, "skip_count": 6.0, "step": 5162, "text_loss": 0.24053414165973663 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 0.0005635877358520268, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8330388.0, "repeat_count": 0.0, "routers_loss": 0.0013072624569758773, "skip_count": 0.0, "step": 5164, "text_loss": 0.43772217631340027 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0005632807203721406, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 8333241.0, "repeat_count": 0.0, "routers_loss": 0.0009456822881475091, "skip_count": 0.0, "step": 5166, "text_loss": 0.5217573046684265 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020751953125, "learning_rate": 0.000562973680642963, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8337257.0, "repeat_count": 0.0, "routers_loss": 0.0023840824142098427, "skip_count": 0.0, "step": 5168, "text_loss": 0.31814974546432495 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0196533203125, "learning_rate": 0.0005626666167821521, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 8340143.0, "repeat_count": 0.0, "routers_loss": 0.0020231492817401886, "skip_count": 3.0, "step": 5170, "text_loss": 0.5478505492210388 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0162353515625, "learning_rate": 0.0005623595289073755, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 8343566.0, "repeat_count": 1.0, "routers_loss": 0.01070715207606554, "skip_count": 2.0, "step": 5172, "text_loss": 0.23213914036750793 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.0005620524171363099, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 8346836.0, "repeat_count": 0.0, "routers_loss": 0.003720001084730029, "skip_count": 3.0, "step": 5174, "text_loss": 0.5114789009094238 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.30055767537423, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0005617452815866409, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 8349726.0, "repeat_count": 1.0, "routers_loss": 0.003322509117424488, "skip_count": 1.0, "step": 5176, "text_loss": 0.4894506335258484 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0005614381223760635, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 8352478.0, "repeat_count": 0.0, "routers_loss": 0.00028752797516062856, "skip_count": 0.0, "step": 5178, "text_loss": 0.6418307423591614 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.0005611309396222817, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 8355766.0, "repeat_count": 0.0, "routers_loss": 0.0028724796138703823, "skip_count": 0.0, "step": 5180, "text_loss": 0.23635952174663544 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.328734957440563, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0005608237334430085, "loss": 0.0068, "macro_f1": 0.6601307392120361, "num_tokens": 8358888.0, "repeat_count": 1.0, "routers_loss": 0.058520980179309845, "skip_count": 2.0, "step": 5182, "text_loss": 0.23434793949127197 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1015625, "learning_rate": 0.000560516503955966, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 8361761.0, "repeat_count": 0.0, "routers_loss": 0.0021356395445764065, "skip_count": 1.0, "step": 5184, "text_loss": 0.40855672955513 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.000560209251278885, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 8364376.0, "repeat_count": 0.0, "routers_loss": 0.0016185789136216044, "skip_count": 0.0, "step": 5186, "text_loss": 0.6265131831169128 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0005599019755295053, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8367769.0, "repeat_count": 0.0, "routers_loss": 0.0031490204855799675, "skip_count": 2.0, "step": 5188, "text_loss": 0.4716353118419647 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.36630466686234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0005595946768255756, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 8370705.0, "repeat_count": 1.0, "routers_loss": 0.003500689286738634, "skip_count": 0.0, "step": 5190, "text_loss": 0.5467679500579834 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.375697094217788, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0005592873552848532, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 8374217.0, "repeat_count": 2.0, "routers_loss": 0.010764475911855698, "skip_count": 3.0, "step": 5192, "text_loss": 0.4345340132713318 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 24.38508952157323, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005589800110251045, "loss": 0.0087, "macro_f1": 1.0, "num_tokens": 8378182.0, "repeat_count": 2.0, "routers_loss": 0.0010365343187004328, "skip_count": 1.0, "step": 5194, "text_loss": 0.46722909808158875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.394481948928675, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.0005586726441641044, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8381227.0, "repeat_count": 0.0, "routers_loss": 0.006349093746393919, "skip_count": 2.0, "step": 5196, "text_loss": 0.35410359501838684 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.403874376284122, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0005583652548196362, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 8384886.0, "repeat_count": 0.0, "routers_loss": 0.00038166221929714084, "skip_count": 0.0, "step": 5198, "text_loss": 0.5950250625610352 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0005580578431094924, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 8388939.0, "repeat_count": 0.0, "routers_loss": 0.0023578559048473835, "skip_count": 2.0, "step": 5200, "text_loss": 0.6553771495819092 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.0005577504091514735, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 8391629.0, "repeat_count": 0.0, "routers_loss": 0.0010771085508167744, "skip_count": 0.0, "step": 5202, "text_loss": 0.4441985785961151 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.000557442953063389, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8394440.0, "repeat_count": 0.0, "routers_loss": 0.005844325292855501, "skip_count": 3.0, "step": 5204, "text_loss": 0.5807011723518372 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0005571354749630564, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8397731.0, "repeat_count": 0.0, "routers_loss": 0.006837233901023865, "skip_count": 1.0, "step": 5206, "text_loss": 0.27780941128730774 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.000556827974968302, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 8400859.0, "repeat_count": 0.0, "routers_loss": 0.007656649220734835, "skip_count": 3.0, "step": 5208, "text_loss": 0.4746324121952057 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0005565204531969606, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8404164.0, "repeat_count": 0.0, "routers_loss": 0.0028129038400948048, "skip_count": 1.0, "step": 5210, "text_loss": 0.8513513803482056 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0005562129097668746, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8407196.0, "repeat_count": 0.0, "routers_loss": 0.00492360582575202, "skip_count": 1.0, "step": 5212, "text_loss": 0.12255420535802841 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0005559053447958958, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8410633.0, "repeat_count": 0.0, "routers_loss": 0.0020713545382022858, "skip_count": 0.0, "step": 5214, "text_loss": 0.6878522634506226 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.488406222483125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0005555977584018833, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8413414.0, "repeat_count": 0.0, "routers_loss": 0.0007216963567771018, "skip_count": 0.0, "step": 5216, "text_loss": 0.845878541469574 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.057861328125, "learning_rate": 0.0005552901507027048, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 8416817.0, "repeat_count": 0.0, "routers_loss": 0.002400130731984973, "skip_count": 1.0, "step": 5218, "text_loss": 0.16753672063350677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 0.0005549825218162365, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 8419617.0, "repeat_count": 0.0, "routers_loss": 0.004563181661069393, "skip_count": 0.0, "step": 5220, "text_loss": 0.26107168197631836 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.516583504549455, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.000554674871860362, "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 8422686.0, "repeat_count": 1.0, "routers_loss": 0.006413881666958332, "skip_count": 1.0, "step": 5222, "text_loss": 0.6333847045898438 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005543672009529734, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 8425571.0, "repeat_count": 0.0, "routers_loss": 0.0057656955905258656, "skip_count": 3.0, "step": 5224, "text_loss": 0.4552212357521057 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.535368359260346, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 0.0005540595092119709, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 8429038.0, "repeat_count": 2.0, "routers_loss": 0.011755156330764294, "skip_count": 2.0, "step": 5226, "text_loss": 0.16597330570220947 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0005537517967552626, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8432117.0, "repeat_count": 0.0, "routers_loss": 0.0007519085193052888, "skip_count": 0.0, "step": 5228, "text_loss": 0.6283590197563171 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.000553444063700764, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 8435176.0, "repeat_count": 0.0, "routers_loss": 0.003066456411033869, "skip_count": 0.0, "step": 5230, "text_loss": 0.2360922247171402 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0198974609375, "learning_rate": 0.0005531363101663998, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 8438515.0, "repeat_count": 0.0, "routers_loss": 0.002865589689463377, "skip_count": 0.0, "step": 5232, "text_loss": 0.8075396418571472 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0005528285362701011, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 8441731.0, "repeat_count": 0.0, "routers_loss": 0.0012521179160103202, "skip_count": 0.0, "step": 5234, "text_loss": 0.584335446357727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0005525207421298077, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 8444535.0, "repeat_count": 0.0, "routers_loss": 0.005398475099354982, "skip_count": 3.0, "step": 5236, "text_loss": 0.22711622714996338 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0966796875, "learning_rate": 0.0005522129278634669, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 8448337.0, "repeat_count": 0.0, "routers_loss": 0.002957914723083377, "skip_count": 1.0, "step": 5238, "text_loss": 0.3157515823841095 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.601115350748458, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.019287109375, "learning_rate": 0.0005519050935890335, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 8451530.0, "repeat_count": 0.0, "routers_loss": 0.007757039275020361, "skip_count": 3.0, "step": 5240, "text_loss": 0.2815830111503601 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.610507778103905, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0005515972394244704, "loss": 0.0063, "macro_f1": 0.6603773832321167, "num_tokens": 8454171.0, "repeat_count": 1.0, "routers_loss": 0.021602008491754532, "skip_count": 1.0, "step": 5242, "text_loss": 0.6024490594863892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.61990020545935, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 0.0005512893654877478, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 8457544.0, "repeat_count": 0.0, "routers_loss": 0.006062488537281752, "skip_count": 0.0, "step": 5244, "text_loss": 0.550110936164856 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.629292632814792, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0005509814718968435, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 8460135.0, "repeat_count": 0.0, "routers_loss": 0.002793943975120783, "skip_count": 0.0, "step": 5246, "text_loss": 0.4361286163330078 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.0005506735587697433, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8463516.0, "repeat_count": 0.0, "routers_loss": 0.0016669550677761436, "skip_count": 0.0, "step": 5248, "text_loss": 0.4642958641052246 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0005503656262244395, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 8466406.0, "repeat_count": 0.0, "routers_loss": 0.0006051387754268944, "skip_count": 0.0, "step": 5250, "text_loss": 0.3445641100406647 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 24.657469914881126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.0005500576743789329, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 8468838.0, "repeat_count": 2.0, "routers_loss": 0.00654293829575181, "skip_count": 1.0, "step": 5252, "text_loss": 0.2842808663845062 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.666862342236573, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0005497497033512309, "loss": 0.0077, "macro_f1": 0.8817967176437378, "num_tokens": 8471815.0, "repeat_count": 2.0, "routers_loss": 0.03845973685383797, "skip_count": 3.0, "step": 5254, "text_loss": 0.2597215175628662 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 24.676254769592017, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0274658203125, "learning_rate": 0.0005494417132593487, "loss": 0.0047, "macro_f1": 0.9452888369560242, "num_tokens": 8475202.0, "repeat_count": 1.0, "routers_loss": 0.02252381667494774, "skip_count": 4.0, "step": 5256, "text_loss": 0.32269927859306335 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.055419921875, "learning_rate": 0.0005491337042213088, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 8478650.0, "repeat_count": 0.0, "routers_loss": 0.01232751365751028, "skip_count": 2.0, "step": 5258, "text_loss": 0.6523372530937195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0005488256763551408, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8481724.0, "repeat_count": 0.0, "routers_loss": 0.0028322834987193346, "skip_count": 0.0, "step": 5260, "text_loss": 0.4212580621242523 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0177001953125, "learning_rate": 0.0005485176297788814, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 8485833.0, "repeat_count": 0.0, "routers_loss": 0.002623105887323618, "skip_count": 2.0, "step": 5262, "text_loss": 0.16906329989433289 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.713824479013795, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0005482095646105748, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 8489089.0, "repeat_count": 1.0, "routers_loss": 0.0007179114618338645, "skip_count": 0.0, "step": 5264, "text_loss": 0.4523872137069702 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0005479014809682721, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 8492905.0, "repeat_count": 0.0, "routers_loss": 0.005234059412032366, "skip_count": 0.0, "step": 5266, "text_loss": 0.207139790058136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.0005475933789700314, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 8495480.0, "repeat_count": 0.0, "routers_loss": 0.0023258263245224953, "skip_count": 0.0, "step": 5268, "text_loss": 0.18060965836048126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005472852587339183, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8499070.0, "repeat_count": 0.0, "routers_loss": 0.0013497259933501482, "skip_count": 0.0, "step": 5270, "text_loss": 0.7460769414901733 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.751394188435572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056640625, "learning_rate": 0.0005469771203780048, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 8502886.0, "repeat_count": 0.0, "routers_loss": 0.0003589815751183778, "skip_count": 0.0, "step": 5272, "text_loss": 0.48119160532951355 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044677734375, "learning_rate": 0.0005466689640203701, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8506646.0, "repeat_count": 0.0, "routers_loss": 0.006619705818593502, "skip_count": 1.0, "step": 5274, "text_loss": 0.15656520426273346 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0005463607897791005, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 8509450.0, "repeat_count": 0.0, "routers_loss": 0.002992175053805113, "skip_count": 1.0, "step": 5276, "text_loss": 0.486930251121521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0005460525977722886, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 8512851.0, "repeat_count": 0.0, "routers_loss": 0.0027784097474068403, "skip_count": 0.0, "step": 5278, "text_loss": 0.19654682278633118 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.788963897857354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0005457443881180345, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8516858.0, "repeat_count": 0.0, "routers_loss": 0.0017648129723966122, "skip_count": 0.0, "step": 5280, "text_loss": 0.580982506275177 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.798356325212797, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0005454361609344444, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 8519912.0, "repeat_count": 2.0, "routers_loss": 0.010817649774253368, "skip_count": 3.0, "step": 5282, "text_loss": 0.2644204795360565 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.000545127916339632, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8522396.0, "repeat_count": 0.0, "routers_loss": 0.001453282660804689, "skip_count": 0.0, "step": 5284, "text_loss": 0.5014839172363281 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.817141179923688, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0005448196544517168, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8525326.0, "repeat_count": 0.0, "routers_loss": 0.006645771209150553, "skip_count": 2.0, "step": 5286, "text_loss": 0.2983154058456421 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0005445113753888254, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 8528611.0, "repeat_count": 0.0, "routers_loss": 0.0005447337171062827, "skip_count": 0.0, "step": 5288, "text_loss": 0.43598243594169617 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.000544203079269091, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 8531571.0, "repeat_count": 0.0, "routers_loss": 0.0026976624503731728, "skip_count": 0.0, "step": 5290, "text_loss": 0.6454944610595703 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.0005438947662106533, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 8534565.0, "repeat_count": 0.0, "routers_loss": 0.002217630622908473, "skip_count": 0.0, "step": 5292, "text_loss": 0.742935836315155 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 24.854710889345466, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.029052734375, "learning_rate": 0.0005435864363316584, "loss": 0.0073, "macro_f1": 0.8820862174034119, "num_tokens": 8537581.0, "repeat_count": 2.0, "routers_loss": 0.030740609392523766, "skip_count": 2.0, "step": 5294, "text_loss": 0.48913639783859253 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0005432780897502588, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 8541271.0, "repeat_count": 0.0, "routers_loss": 0.005306888837367296, "skip_count": 1.0, "step": 5296, "text_loss": 0.5820846557617188 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.873495744056356, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0005429697265846137, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 8545052.0, "repeat_count": 1.0, "routers_loss": 0.002255369909107685, "skip_count": 0.0, "step": 5298, "text_loss": 0.565483808517456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0005426613469528881, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 8548605.0, "repeat_count": 0.0, "routers_loss": 0.0010787079809233546, "skip_count": 0.0, "step": 5300, "text_loss": 0.40154510736465454 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.000542352950973254, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 8552581.0, "repeat_count": 0.0, "routers_loss": 0.0017972089117392898, "skip_count": 0.0, "step": 5302, "text_loss": 0.5430748462677002 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04736328125, "learning_rate": 0.0005420445387638891, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 8556360.0, "repeat_count": 0.0, "routers_loss": 0.0016180560924112797, "skip_count": 2.0, "step": 5304, "text_loss": 0.544040322303772 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.911065453478134, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0005417361104429777, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 8559264.0, "repeat_count": 1.0, "routers_loss": 0.012688961811363697, "skip_count": 2.0, "step": 5306, "text_loss": 0.2018517404794693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0005414276661287101, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 8562169.0, "repeat_count": 0.0, "routers_loss": 0.0012141643092036247, "skip_count": 0.0, "step": 5308, "text_loss": 0.5685747265815735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.059326171875, "learning_rate": 0.0005411192059392826, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 8565231.0, "repeat_count": 0.0, "routers_loss": 0.0015626107342541218, "skip_count": 0.0, "step": 5310, "text_loss": 0.8073471784591675 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0005408107299928979, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8568122.0, "repeat_count": 0.0, "routers_loss": 0.004773529712110758, "skip_count": 0.0, "step": 5312, "text_loss": 0.22583355009555817 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 0.0005405022384077644, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 8571056.0, "repeat_count": 0.0, "routers_loss": 0.0025621228851377964, "skip_count": 1.0, "step": 5314, "text_loss": 0.25274428725242615 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.0005401937313020967, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 8574300.0, "repeat_count": 0.0, "routers_loss": 0.009726752527058125, "skip_count": 2.0, "step": 5316, "text_loss": 0.3283393979072571 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 24.967420017610802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0005398852087941155, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 8577424.0, "repeat_count": 0.0, "routers_loss": 0.012483839876949787, "skip_count": 4.0, "step": 5318, "text_loss": 0.1876130849123001 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.000539576671002047, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 8580309.0, "repeat_count": 0.0, "routers_loss": 0.0009830677881836891, "skip_count": 0.0, "step": 5320, "text_loss": 0.6955490708351135 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046875, "learning_rate": 0.0005392681180441235, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 8583399.0, "repeat_count": 0.0, "routers_loss": 0.0010819481685757637, "skip_count": 0.0, "step": 5322, "text_loss": 0.4708341956138611 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.995597299677137, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.000538959550038583, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 8586259.0, "repeat_count": 0.0, "routers_loss": 0.005763369146734476, "skip_count": 0.0, "step": 5324, "text_loss": 0.20463642477989197 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0005386509671036695, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 8589067.0, "repeat_count": 0.0, "routers_loss": 0.0006229027640074492, "skip_count": 0.0, "step": 5326, "text_loss": 0.6819888353347778 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 24.0, "epoch": 25.014088641033165, "f1_execute": 0.9767441749572754, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.03466796875, "learning_rate": 0.0005383423693576325, "loss": 0.0087, "macro_f1": 0.9619450569152832, "num_tokens": 8592837.0, "repeat_count": 1.0, "routers_loss": 0.030066559091210365, "skip_count": 6.0, "step": 5328, "text_loss": 0.24606549739837646 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.023481068388612, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.038330078125, "learning_rate": 0.0005380337569187272, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 8596293.0, "repeat_count": 1.0, "routers_loss": 0.007445990107953548, "skip_count": 0.0, "step": 5330, "text_loss": 0.16730253398418427 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 25.032873495744056, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0005377251299052145, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 8599360.0, "repeat_count": 1.0, "routers_loss": 0.004563331138342619, "skip_count": 1.0, "step": 5332, "text_loss": 0.6856988668441772 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0005374164884353608, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8602376.0, "repeat_count": 0.0, "routers_loss": 0.0015491938684135675, "skip_count": 0.0, "step": 5334, "text_loss": 1.3248854875564575 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005371078326274382, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 8605400.0, "repeat_count": 0.0, "routers_loss": 0.0016098044579848647, "skip_count": 0.0, "step": 5336, "text_loss": 0.747150182723999 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 25.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0005367991625997243, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 8608100.0, "repeat_count": 0.0, "routers_loss": 0.0034471298567950726, "skip_count": 3.0, "step": 5338, "text_loss": 0.6443291902542114 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.070443205165834, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0005364904784705015, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 8611768.0, "repeat_count": 0.0, "routers_loss": 0.007947597652673721, "skip_count": 1.0, "step": 5340, "text_loss": 0.7768037915229797 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 25.07983563252128, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.049072265625, "learning_rate": 0.0005361817803580588, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 8614424.0, "repeat_count": 2.0, "routers_loss": 0.009964234195649624, "skip_count": 2.0, "step": 5342, "text_loss": 0.22826914489269257 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0005358730683806896, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 8617826.0, "repeat_count": 0.0, "routers_loss": 0.0014116480015218258, "skip_count": 0.0, "step": 5344, "text_loss": 0.49022090435028076 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 25.098620487232168, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03564453125, "learning_rate": 0.0005355643426566929, "loss": 0.0061, "macro_f1": 0.8823530077934265, "num_tokens": 8621220.0, "repeat_count": 1.0, "routers_loss": 0.013940622098743916, "skip_count": 2.0, "step": 5346, "text_loss": 0.26819515228271484 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.000535255603304373, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 8623957.0, "repeat_count": 0.0, "routers_loss": 0.0032230091746896505, "skip_count": 2.0, "step": 5348, "text_loss": 0.46905452013015747 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005349468504420395, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 8626760.0, "repeat_count": 0.0, "routers_loss": 0.002631337149068713, "skip_count": 1.0, "step": 5350, "text_loss": 0.5312309861183167 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.0005346380841880068, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 8630207.0, "repeat_count": 0.0, "routers_loss": 0.004526057746261358, "skip_count": 2.0, "step": 5352, "text_loss": 0.5810666084289551 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.0005343293046605949, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8633241.0, "repeat_count": 0.0, "routers_loss": 0.0023941127583384514, "skip_count": 0.0, "step": 5354, "text_loss": 0.18468725681304932 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.145582624009393, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0005340205119781288, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 8636215.0, "repeat_count": 1.0, "routers_loss": 0.0017020340310409665, "skip_count": 0.0, "step": 5356, "text_loss": 0.6665788888931274 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005337117062589383, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 8639326.0, "repeat_count": 0.0, "routers_loss": 0.004964717663824558, "skip_count": 2.0, "step": 5358, "text_loss": 0.19770404696464539 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023681640625, "learning_rate": 0.0005334028876213585, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8642157.0, "repeat_count": 0.0, "routers_loss": 0.006587155628949404, "skip_count": 0.0, "step": 5360, "text_loss": 0.2295130044221878 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0005330940561837291, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8645355.0, "repeat_count": 0.0, "routers_loss": 0.0006586945964954793, "skip_count": 0.0, "step": 5362, "text_loss": 0.2701159417629242 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.18315233343117, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0005327852120643947, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8648911.0, "repeat_count": 1.0, "routers_loss": 0.0014281768817454576, "skip_count": 0.0, "step": 5364, "text_loss": 0.8957229852676392 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0005324763553817053, "loss": 0.0027, "macro_f1": 0.3333333432674408, "num_tokens": 8652037.0, "repeat_count": 0.0, "routers_loss": 0.0005899337120354176, "skip_count": 0.0, "step": 5366, "text_loss": 0.38642236590385437 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 25.20193718814206, "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0005321674862540154, "loss": 0.0058, "macro_f1": 0.9265305995941162, "num_tokens": 8655381.0, "repeat_count": 3.0, "routers_loss": 0.024511313065886497, "skip_count": 1.0, "step": 5368, "text_loss": 0.6439879536628723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.000531858604799684, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 8658476.0, "repeat_count": 0.0, "routers_loss": 0.0012558114249259233, "skip_count": 0.0, "step": 5370, "text_loss": 0.3227672874927521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06005859375, "learning_rate": 0.0005315497111370752, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 8661982.0, "repeat_count": 0.0, "routers_loss": 0.0013541636290028691, "skip_count": 0.0, "step": 5372, "text_loss": 0.6375321745872498 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 25.230114470208395, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.051513671875, "learning_rate": 0.0005312408053845575, "loss": 0.0052, "macro_f1": 0.5492662787437439, "num_tokens": 8665071.0, "repeat_count": 0.0, "routers_loss": 0.010432626120746136, "skip_count": 2.0, "step": 5374, "text_loss": 0.536924421787262 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.0005309318876605042, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 8668411.0, "repeat_count": 0.0, "routers_loss": 0.004450209904462099, "skip_count": 1.0, "step": 5376, "text_loss": 0.2643466889858246 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.248899324919282, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005306229580832933, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 8672088.0, "repeat_count": 1.0, "routers_loss": 0.011189920827746391, "skip_count": 3.0, "step": 5378, "text_loss": 0.8259533047676086 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.000530314016771307, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8675206.0, "repeat_count": 0.0, "routers_loss": 0.0020095291547477245, "skip_count": 0.0, "step": 5380, "text_loss": 0.31364113092422485 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.267684179630173, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0005300050638429324, "loss": 0.0078, "macro_f1": 0.3272727429866791, "num_tokens": 8678289.0, "repeat_count": 0.0, "routers_loss": 0.010738557204604149, "skip_count": 1.0, "step": 5382, "text_loss": 0.19013966619968414 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0294189453125, "learning_rate": 0.0005296960994165607, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 8681555.0, "repeat_count": 0.0, "routers_loss": 0.0018534278497099876, "skip_count": 1.0, "step": 5384, "text_loss": 0.762248694896698 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.286469034341064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0005293871236105877, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 8684413.0, "repeat_count": 0.0, "routers_loss": 0.009143726900219917, "skip_count": 2.0, "step": 5386, "text_loss": 0.19994212687015533 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 25.295861461696507, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0005290781365434134, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 8687450.0, "repeat_count": 2.0, "routers_loss": 0.002034468576312065, "skip_count": 0.0, "step": 5388, "text_loss": 0.5519160628318787 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.30525388905195, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.0005287691383334425, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 8690651.0, "repeat_count": 1.0, "routers_loss": 0.006834167055785656, "skip_count": 0.0, "step": 5390, "text_loss": 0.5439304709434509 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.314646316407398, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.060791015625, "learning_rate": 0.0005284601290990832, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8693929.0, "repeat_count": 1.0, "routers_loss": 0.0022327799815684557, "skip_count": 0.0, "step": 5392, "text_loss": 0.24108269810676575 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0005281511089587491, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 8696727.0, "repeat_count": 0.0, "routers_loss": 0.002669565612450242, "skip_count": 0.0, "step": 5394, "text_loss": 0.8659077286720276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0005278420780308568, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8700934.0, "repeat_count": 0.0, "routers_loss": 0.007252473384141922, "skip_count": 0.0, "step": 5396, "text_loss": 0.5592793226242065 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045166015625, "learning_rate": 0.0005275330364338276, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 8704449.0, "repeat_count": 0.0, "routers_loss": 0.001793015981093049, "skip_count": 0.0, "step": 5398, "text_loss": 0.5211784243583679 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 25.352216025829176, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 0.0005272239842860868, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 8707384.0, "repeat_count": 5.0, "routers_loss": 0.00963665172457695, "skip_count": 4.0, "step": 5400, "text_loss": 0.6092788577079773 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 25.36160845318462, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03515625, "learning_rate": 0.0005269149217060642, "loss": 0.0059, "macro_f1": 0.5492662787437439, "num_tokens": 8710453.0, "repeat_count": 0.0, "routers_loss": 0.01758105307817459, "skip_count": 2.0, "step": 5402, "text_loss": 0.3423936069011688 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0005266058488121926, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 8713514.0, "repeat_count": 0.0, "routers_loss": 0.0025636721402406693, "skip_count": 1.0, "step": 5404, "text_loss": 0.484171986579895 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.38039330789551, "f1_execute": 0.9767441749572754, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0005262967657229095, "loss": 0.0064, "macro_f1": 0.9255813956260681, "num_tokens": 8717051.0, "repeat_count": 3.0, "routers_loss": 0.022406045347452164, "skip_count": 4.0, "step": 5406, "text_loss": 0.23368191719055176 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 0.0005259876725566563, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8719987.0, "repeat_count": 0.0, "routers_loss": 0.004114408977329731, "skip_count": 2.0, "step": 5408, "text_loss": 0.20237496495246887 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.399178162606397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.000525678569431878, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 8723258.0, "repeat_count": 0.0, "routers_loss": 0.006741158664226532, "skip_count": 2.0, "step": 5410, "text_loss": 0.7969435453414917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 0.0005253694564670233, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 8726294.0, "repeat_count": 0.0, "routers_loss": 0.0034468702506273985, "skip_count": 0.0, "step": 5412, "text_loss": 0.5533816814422607 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.000525060333780545, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 8729603.0, "repeat_count": 0.0, "routers_loss": 0.01086533535271883, "skip_count": 2.0, "step": 5414, "text_loss": 0.31856611371040344 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 25.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0005247512014908998, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 8733423.0, "repeat_count": 0.0, "routers_loss": 0.00512756546959281, "skip_count": 6.0, "step": 5416, "text_loss": 0.6710903644561768 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06103515625, "learning_rate": 0.0005244420597165472, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 8736457.0, "repeat_count": 0.0, "routers_loss": 0.0026201079599559307, "skip_count": 0.0, "step": 5418, "text_loss": 0.6469964981079102 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.0005241329085759514, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 8739617.0, "repeat_count": 0.0, "routers_loss": 0.004130818881094456, "skip_count": 0.0, "step": 5420, "text_loss": 0.4868837296962738 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0005238237481875795, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 8742653.0, "repeat_count": 0.0, "routers_loss": 0.003171122632920742, "skip_count": 0.0, "step": 5422, "text_loss": 0.12026242166757584 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0005235145786699021, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 8745835.0, "repeat_count": 0.0, "routers_loss": 0.0008553664083592594, "skip_count": 0.0, "step": 5424, "text_loss": 0.601640522480011 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0005232054001413941, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 8749006.0, "repeat_count": 0.0, "routers_loss": 0.0006958908052183688, "skip_count": 0.0, "step": 5426, "text_loss": 0.7083519101142883 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0673828125, "learning_rate": 0.0005228962127205329, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 8752493.0, "repeat_count": 0.0, "routers_loss": 0.0012221037177368999, "skip_count": 1.0, "step": 5428, "text_loss": 0.3949109613895416 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.493102436160846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.0005225870165257997, "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 8755294.0, "repeat_count": 1.0, "routers_loss": 0.003924673888832331, "skip_count": 2.0, "step": 5430, "text_loss": 0.7487186789512634 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005222778116756793, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8758043.0, "repeat_count": 0.0, "routers_loss": 0.002388258930295706, "skip_count": 0.0, "step": 5432, "text_loss": 0.4092858135700226 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.511887290871734, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0005219685982886594, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 8760618.0, "repeat_count": 1.0, "routers_loss": 0.0045886957086622715, "skip_count": 0.0, "step": 5434, "text_loss": 0.5889580249786377 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.52127971822718, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.052978515625, "learning_rate": 0.0005216593764832311, "loss": 0.0074, "macro_f1": 1.0, "num_tokens": 8764269.0, "repeat_count": 1.0, "routers_loss": 0.00704155582934618, "skip_count": 2.0, "step": 5436, "text_loss": 0.2634117007255554 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.040283203125, "learning_rate": 0.0005213501463778889, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 8767142.0, "repeat_count": 0.0, "routers_loss": 0.00368728069588542, "skip_count": 2.0, "step": 5438, "text_loss": 0.3512301445007324 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05322265625, "learning_rate": 0.0005210409080911304, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 8770239.0, "repeat_count": 0.0, "routers_loss": 0.0012925115879625082, "skip_count": 0.0, "step": 5440, "text_loss": 0.9330073595046997 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0005207316617414561, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8772927.0, "repeat_count": 0.0, "routers_loss": 0.005604506935924292, "skip_count": 0.0, "step": 5442, "text_loss": 0.23477613925933838 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.55884942764896, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0196533203125, "learning_rate": 0.0005204224074473701, "loss": 0.0049, "macro_f1": 0.6601307392120361, "num_tokens": 8776451.0, "repeat_count": 1.0, "routers_loss": 0.010945434682071209, "skip_count": 2.0, "step": 5444, "text_loss": 0.6184295415878296 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0005201131453273789, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 8779481.0, "repeat_count": 0.0, "routers_loss": 0.0024414353538304567, "skip_count": 0.0, "step": 5446, "text_loss": 0.16186967492103577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.57763428235985, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.0005198038754999926, "loss": 0.0052, "macro_f1": 0.3272727429866791, "num_tokens": 8782425.0, "repeat_count": 1.0, "routers_loss": 0.013872416689991951, "skip_count": 0.0, "step": 5448, "text_loss": 0.42294546961784363 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.0005194945980837237, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 8785466.0, "repeat_count": 0.0, "routers_loss": 0.0006147907115519047, "skip_count": 0.0, "step": 5450, "text_loss": 0.6285432577133179 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0005191853131970881, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 8788461.0, "repeat_count": 0.0, "routers_loss": 0.0010585964191704988, "skip_count": 0.0, "step": 5452, "text_loss": 0.6032317876815796 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.042724609375, "learning_rate": 0.0005188760209586044, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8791572.0, "repeat_count": 0.0, "routers_loss": 0.005267909727990627, "skip_count": 1.0, "step": 5454, "text_loss": 0.3015609681606293 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0005185667214867937, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 8794697.0, "repeat_count": 0.0, "routers_loss": 0.000532392121385783, "skip_count": 0.0, "step": 5456, "text_loss": 0.9596265554428101 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0005182574149001805, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 8797880.0, "repeat_count": 0.0, "routers_loss": 0.0007176774088293314, "skip_count": 0.0, "step": 5458, "text_loss": 0.5599364638328552 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0005179481013172912, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8801995.0, "repeat_count": 0.0, "routers_loss": 0.0022756673861294985, "skip_count": 0.0, "step": 5460, "text_loss": 0.47327280044555664 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005176387808566558, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 8805138.0, "repeat_count": 0.0, "routers_loss": 0.0025084633380174637, "skip_count": 0.0, "step": 5462, "text_loss": 0.26674970984458923 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05078125, "learning_rate": 0.0005173294536368061, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 8808102.0, "repeat_count": 0.0, "routers_loss": 0.0008814680040813982, "skip_count": 0.0, "step": 5464, "text_loss": 0.5981299877166748 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.0005170201197762773, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8811431.0, "repeat_count": 0.0, "routers_loss": 0.0005443177651613951, "skip_count": 0.0, "step": 5466, "text_loss": 1.037438988685608 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0005167107793936065, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 8814256.0, "repeat_count": 0.0, "routers_loss": 0.000494555220939219, "skip_count": 0.0, "step": 5468, "text_loss": 0.5005733966827393 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005164014326073333, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 8817024.0, "repeat_count": 0.0, "routers_loss": 0.004793747793883085, "skip_count": 2.0, "step": 5470, "text_loss": 0.6999614834785461 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005160920795360002, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 8819892.0, "repeat_count": 0.0, "routers_loss": 0.0020966180600225925, "skip_count": 0.0, "step": 5472, "text_loss": 0.5536707043647766 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0005157827202981521, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8822928.0, "repeat_count": 0.0, "routers_loss": 0.0020367507822811604, "skip_count": 0.0, "step": 5474, "text_loss": 0.43655988574028015 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0005154733550123356, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8825842.0, "repeat_count": 0.0, "routers_loss": 0.0020070383325219154, "skip_count": 0.0, "step": 5476, "text_loss": 0.48149657249450684 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0005151639837971004, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 8829534.0, "repeat_count": 0.0, "routers_loss": 0.0016327418852597475, "skip_count": 0.0, "step": 5478, "text_loss": 0.6693689227104187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.000514854606770998, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 8833177.0, "repeat_count": 0.0, "routers_loss": 0.0012691980227828026, "skip_count": 0.0, "step": 5480, "text_loss": 0.44926801323890686 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.737305547402407, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0005145452240525822, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8836933.0, "repeat_count": 1.0, "routers_loss": 0.0007724820752628148, "skip_count": 0.0, "step": 5482, "text_loss": 0.5759884119033813 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 25.74669797475785, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005142358357604092, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 8840093.0, "repeat_count": 1.0, "routers_loss": 0.008331702090799809, "skip_count": 7.0, "step": 5484, "text_loss": 0.47393685579299927 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0224609375, "learning_rate": 0.0005139264420130368, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 8843918.0, "repeat_count": 0.0, "routers_loss": 0.003124477108940482, "skip_count": 2.0, "step": 5486, "text_loss": 0.5298711061477661 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08447265625, "learning_rate": 0.0005136170429290259, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 8846558.0, "repeat_count": 0.0, "routers_loss": 0.0034127775579690933, "skip_count": 2.0, "step": 5488, "text_loss": 0.43582668900489807 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.774875256824185, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.0005133076386269383, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 8849724.0, "repeat_count": 1.0, "routers_loss": 0.0018056259723380208, "skip_count": 0.0, "step": 5490, "text_loss": 0.8116800785064697 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 25.784267684179632, "f1_execute": 0.9767441749572754, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0005129982292253384, "loss": 0.0063, "macro_f1": 0.6589147448539734, "num_tokens": 8852447.0, "repeat_count": 1.0, "routers_loss": 0.021452350541949272, "skip_count": 6.0, "step": 5492, "text_loss": 0.31878748536109924 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0005126888148427927, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 8855886.0, "repeat_count": 0.0, "routers_loss": 0.0026911941822618246, "skip_count": 0.0, "step": 5494, "text_loss": 0.4021807909011841 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 25.80305253889052, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.025634765625, "learning_rate": 0.0005123793955978693, "loss": 0.007, "macro_f1": 0.5492662787437439, "num_tokens": 8859378.0, "repeat_count": 0.0, "routers_loss": 0.019764510914683342, "skip_count": 2.0, "step": 5496, "text_loss": 0.21608132123947144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0005120699716091379, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 8862310.0, "repeat_count": 0.0, "routers_loss": 0.0008988190093077719, "skip_count": 0.0, "step": 5498, "text_loss": 0.34666743874549866 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0005117605429951707, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8865166.0, "repeat_count": 0.0, "routers_loss": 0.011137975379824638, "skip_count": 2.0, "step": 5500, "text_loss": 0.25385144352912903 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 25.831229820956853, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0005114511098745412, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 8869923.0, "repeat_count": 1.0, "routers_loss": 0.006476947572082281, "skip_count": 4.0, "step": 5502, "text_loss": 0.4503856301307678 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.840622248312297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.000511141672365825, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8872451.0, "repeat_count": 0.0, "routers_loss": 0.0022727579344063997, "skip_count": 0.0, "step": 5504, "text_loss": 0.7522464990615845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.0005108322305875987, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8875968.0, "repeat_count": 0.0, "routers_loss": 0.0020014268811792135, "skip_count": 0.0, "step": 5506, "text_loss": 0.30184176564216614 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04833984375, "learning_rate": 0.0005105227846584414, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8879705.0, "repeat_count": 0.0, "routers_loss": 0.001179999322630465, "skip_count": 0.0, "step": 5508, "text_loss": 0.6187804937362671 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.86879953037863, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0005102133346969329, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 8883535.0, "repeat_count": 1.0, "routers_loss": 0.002946492750197649, "skip_count": 0.0, "step": 5510, "text_loss": 0.5961501002311707 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.878191957734078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.0005099038808216555, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 8886683.0, "repeat_count": 1.0, "routers_loss": 0.004532935563474894, "skip_count": 3.0, "step": 5512, "text_loss": 0.38462957739830017 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0005095944231511922, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 8891049.0, "repeat_count": 0.0, "routers_loss": 0.00917842984199524, "skip_count": 2.0, "step": 5514, "text_loss": 0.27541956305503845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0005092849618041279, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 8893604.0, "repeat_count": 0.0, "routers_loss": 0.0008756510796956718, "skip_count": 0.0, "step": 5516, "text_loss": 0.681315541267395 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.906369239800412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.042236328125, "learning_rate": 0.0005089754968990487, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 8898072.0, "repeat_count": 0.0, "routers_loss": 0.0008704439387656748, "skip_count": 1.0, "step": 5518, "text_loss": 0.5060005187988281 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0005086660285545422, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 8901539.0, "repeat_count": 0.0, "routers_loss": 0.004750201944261789, "skip_count": 1.0, "step": 5520, "text_loss": 0.6008047461509705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.000508356556889197, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8904525.0, "repeat_count": 0.0, "routers_loss": 0.0026552649214863777, "skip_count": 0.0, "step": 5522, "text_loss": 0.4539012908935547 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.934546521866746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005080470820216037, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 8907624.0, "repeat_count": 0.0, "routers_loss": 0.002621029270812869, "skip_count": 1.0, "step": 5524, "text_loss": 0.20088370144367218 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 25.94393894922219, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0005077376040703533, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 8910515.0, "repeat_count": 3.0, "routers_loss": 0.0028921898920089006, "skip_count": 0.0, "step": 5526, "text_loss": 0.6575983166694641 }, { "acc_repeat": 1.0, "acc_skip": 0.8888888955116272, "avg_layers": 21.0, "epoch": 25.953331376577633, "f1_execute": 0.9729729890823364, "f1_repeat": 1.0, "f1_skip": 0.9411765336990356, "grad_norm": 0.02734375, "learning_rate": 0.0005074281231540384, "loss": 0.0076, "macro_f1": 0.9713832139968872, "num_tokens": 8914419.0, "repeat_count": 1.0, "routers_loss": 0.024232301861047745, "skip_count": 9.0, "step": 5528, "text_loss": 0.5435594916343689 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 0.0005071186393912527, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 8917543.0, "repeat_count": 0.0, "routers_loss": 0.003731841454282403, "skip_count": 2.0, "step": 5530, "text_loss": 0.5152071118354797 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.972116231288524, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0005068091529005909, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 8920728.0, "repeat_count": 1.0, "routers_loss": 0.005905418191105127, "skip_count": 0.0, "step": 5532, "text_loss": 0.29741042852401733 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.981508658643968, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.000506499663800649, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 8924112.0, "repeat_count": 1.0, "routers_loss": 0.0021933517418801785, "skip_count": 0.0, "step": 5534, "text_loss": 0.45704230666160583 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 25.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0005061901722100235, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 8927323.0, "repeat_count": 0.0, "routers_loss": 0.009227502159774303, "skip_count": 4.0, "step": 5536, "text_loss": 0.1968434453010559 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.0, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.0005058806782473125, "loss": 0.0053, "macro_f1": 0.6601307392120361, "num_tokens": 8931052.0, "repeat_count": 1.0, "routers_loss": 0.02054760232567787, "skip_count": 2.0, "step": 5538, "text_loss": 0.23851273953914642 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0005055711820311144, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8934215.0, "repeat_count": 0.0, "routers_loss": 0.0008434011251665652, "skip_count": 0.0, "step": 5540, "text_loss": 0.85942542552948 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 26.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.0005052616836800288, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 8937173.0, "repeat_count": 0.0, "routers_loss": 0.011105241253972054, "skip_count": 4.0, "step": 5542, "text_loss": 0.2614556849002838 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0005049521833126561, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8940553.0, "repeat_count": 0.0, "routers_loss": 0.0006273435428738594, "skip_count": 0.0, "step": 5544, "text_loss": 0.6430498957633972 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0005046426810475976, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 8943753.0, "repeat_count": 0.0, "routers_loss": 0.0023464353289455175, "skip_count": 1.0, "step": 5546, "text_loss": 0.7015808820724487 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06689453125, "learning_rate": 0.0005043331770034547, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 8947149.0, "repeat_count": 0.0, "routers_loss": 0.0016024730866774917, "skip_count": 1.0, "step": 5548, "text_loss": 0.5875257253646851 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0005040236712988304, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8950374.0, "repeat_count": 0.0, "routers_loss": 0.004096277989447117, "skip_count": 0.0, "step": 5550, "text_loss": 0.1712338626384735 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.065746991488112, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.0005037141640523275, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 8953256.0, "repeat_count": 1.0, "routers_loss": 0.00441550649702549, "skip_count": 0.0, "step": 5552, "text_loss": 0.16560404002666473 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.07513941884356, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0005034046553825501, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 8956845.0, "repeat_count": 4.0, "routers_loss": 0.011712636798620224, "skip_count": 6.0, "step": 5554, "text_loss": 0.24278216063976288 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 0.0005030951454081023, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8961165.0, "repeat_count": 0.0, "routers_loss": 0.00235542468726635, "skip_count": 1.0, "step": 5556, "text_loss": 0.17214511334896088 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.093924273554446, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0005027856342475888, "loss": 0.0037, "macro_f1": 0.3272727429866791, "num_tokens": 8965262.0, "repeat_count": 0.0, "routers_loss": 0.0160827673971653, "skip_count": 1.0, "step": 5558, "text_loss": 0.40229740738868713 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.10331670090989, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0005024761220196151, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 8968278.0, "repeat_count": 1.0, "routers_loss": 0.004786997567862272, "skip_count": 0.0, "step": 5560, "text_loss": 0.24828575551509857 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.112709128265337, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.0005021666088427868, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 8971443.0, "repeat_count": 1.0, "routers_loss": 0.0015378865646198392, "skip_count": 0.0, "step": 5562, "text_loss": 0.7269657254219055 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01904296875, "learning_rate": 0.0005018570948357099, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8975312.0, "repeat_count": 0.0, "routers_loss": 0.0015218508196994662, "skip_count": 0.0, "step": 5564, "text_loss": 0.5198811292648315 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.0005015475801169908, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 8977951.0, "repeat_count": 0.0, "routers_loss": 0.008865317329764366, "skip_count": 1.0, "step": 5566, "text_loss": 0.1541406810283661 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.14088641033167, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0005012380648052359, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 8981325.0, "repeat_count": 1.0, "routers_loss": 0.0055318837985396385, "skip_count": 0.0, "step": 5568, "text_loss": 0.510314404964447 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0005009285490190523, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8984661.0, "repeat_count": 0.0, "routers_loss": 0.0035060355439782143, "skip_count": 0.0, "step": 5570, "text_loss": 0.29421761631965637 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.000500619032877047, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 8987573.0, "repeat_count": 0.0, "routers_loss": 0.0050126477144658566, "skip_count": 2.0, "step": 5572, "text_loss": 0.1984361708164215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005003095164978271, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 8991136.0, "repeat_count": 0.0, "routers_loss": 0.0019407360814511776, "skip_count": 0.0, "step": 5574, "text_loss": 0.42751404643058777 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 0.0005, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 8994198.0, "repeat_count": 0.0, "routers_loss": 0.0029819176997989416, "skip_count": 2.0, "step": 5576, "text_loss": 0.20589640736579895 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.187848547108892, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0234375, "learning_rate": 0.0004996904835021729, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 8997907.0, "repeat_count": 0.0, "routers_loss": 0.000878945691511035, "skip_count": 1.0, "step": 5578, "text_loss": 0.2801406979560852 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.19724097446434, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.000499380967122953, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 9001141.0, "repeat_count": 0.0, "routers_loss": 0.005223734769970179, "skip_count": 1.0, "step": 5580, "text_loss": 0.20542480051517487 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0004990714509809478, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 9004794.0, "repeat_count": 0.0, "routers_loss": 0.0015868612099438906, "skip_count": 0.0, "step": 5582, "text_loss": 0.32094934582710266 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 26.216025829175226, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.041259765625, "learning_rate": 0.0004987619351947643, "loss": 0.0064, "macro_f1": 0.6122449040412903, "num_tokens": 9009250.0, "repeat_count": 0.0, "routers_loss": 0.031923454254865646, "skip_count": 4.0, "step": 5584, "text_loss": 0.609201967716217 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.225418256530673, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0004984524198830095, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 9013254.0, "repeat_count": 0.0, "routers_loss": 0.0033124545589089394, "skip_count": 0.0, "step": 5586, "text_loss": 0.3698650300502777 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0004981429051642903, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 9016598.0, "repeat_count": 0.0, "routers_loss": 0.0017190382350236177, "skip_count": 1.0, "step": 5588, "text_loss": 0.5306026935577393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.24420311124156, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0004978333911572132, "loss": 0.0059, "macro_f1": 0.3272727429866791, "num_tokens": 9019558.0, "repeat_count": 0.0, "routers_loss": 0.02051064372062683, "skip_count": 1.0, "step": 5590, "text_loss": 0.23494470119476318 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0004975238779803849, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 9023024.0, "repeat_count": 0.0, "routers_loss": 0.0010489600244909525, "skip_count": 0.0, "step": 5592, "text_loss": 0.579275906085968 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 0.0004972143657524112, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 9026161.0, "repeat_count": 0.0, "routers_loss": 0.0012039231369271874, "skip_count": 0.0, "step": 5594, "text_loss": 0.5776295065879822 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0004969048545918978, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 9028814.0, "repeat_count": 0.0, "routers_loss": 0.0010212450288236141, "skip_count": 1.0, "step": 5596, "text_loss": 0.6816855669021606 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 26.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00049659534461745, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 9032243.0, "repeat_count": 2.0, "routers_loss": 0.0024297661148011684, "skip_count": 0.0, "step": 5598, "text_loss": 0.743188202381134 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.0004962858359476726, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 9035493.0, "repeat_count": 0.0, "routers_loss": 0.002151754219084978, "skip_count": 0.0, "step": 5600, "text_loss": 0.5213983654975891 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.0004959763287011698, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 9038213.0, "repeat_count": 0.0, "routers_loss": 0.0028108188416808844, "skip_count": 2.0, "step": 5602, "text_loss": 0.5128397345542908 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0004956668229965454, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 9041152.0, "repeat_count": 0.0, "routers_loss": 0.004022551700472832, "skip_count": 2.0, "step": 5604, "text_loss": 0.15361636877059937 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0004953573189524026, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 9044503.0, "repeat_count": 0.0, "routers_loss": 0.0010689410846680403, "skip_count": 1.0, "step": 5606, "text_loss": 0.6454885005950928 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0004950478166873439, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 9047742.0, "repeat_count": 0.0, "routers_loss": 0.0025760293938219547, "skip_count": 0.0, "step": 5608, "text_loss": 0.7654000520706177 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0004947383163199713, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 9050349.0, "repeat_count": 0.0, "routers_loss": 0.0009846165776252747, "skip_count": 0.0, "step": 5610, "text_loss": 0.41533342003822327 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 0.0004944288179688858, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 9053667.0, "repeat_count": 0.0, "routers_loss": 0.0017193946987390518, "skip_count": 1.0, "step": 5612, "text_loss": 1.0172475576400757 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0004941193217526875, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 9056777.0, "repeat_count": 0.0, "routers_loss": 0.0026750199031084776, "skip_count": 0.0, "step": 5614, "text_loss": 0.17584927380084991 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 26.36630466686234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0004938098277899765, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 9060609.0, "repeat_count": 1.0, "routers_loss": 0.005259076599031687, "skip_count": 1.0, "step": 5616, "text_loss": 0.5522297024726868 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.375697094217788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0004935003361993511, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 9063633.0, "repeat_count": 0.0, "routers_loss": 0.0006837095716036856, "skip_count": 0.0, "step": 5618, "text_loss": 0.5212588310241699 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.38508952157323, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0004931908470994091, "loss": 0.0059, "macro_f1": 0.6603773832321167, "num_tokens": 9067777.0, "repeat_count": 1.0, "routers_loss": 0.01067375484853983, "skip_count": 1.0, "step": 5620, "text_loss": 0.5515062808990479 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 26.394481948928675, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.019775390625, "learning_rate": 0.0004928813606087474, "loss": 0.0043, "macro_f1": 0.5934640765190125, "num_tokens": 9070938.0, "repeat_count": 0.0, "routers_loss": 0.016635602340102196, "skip_count": 3.0, "step": 5622, "text_loss": 0.3225076198577881 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.403874376284122, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0004925718768459617, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9074050.0, "repeat_count": 0.0, "routers_loss": 0.002216119086369872, "skip_count": 0.0, "step": 5624, "text_loss": 0.32438889145851135 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 26.413266803639566, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.040771484375, "learning_rate": 0.0004922623959296469, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 9076785.0, "repeat_count": 1.0, "routers_loss": 0.012125075794756413, "skip_count": 5.0, "step": 5626, "text_loss": 0.39563658833503723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0004919529179783965, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9080239.0, "repeat_count": 0.0, "routers_loss": 0.0026486809365451336, "skip_count": 0.0, "step": 5628, "text_loss": 0.5401569604873657 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0004916434431108031, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9083935.0, "repeat_count": 0.0, "routers_loss": 0.0011849761940538883, "skip_count": 0.0, "step": 5630, "text_loss": 0.4798774719238281 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0191650390625, "learning_rate": 0.000491333971445458, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 9087174.0, "repeat_count": 0.0, "routers_loss": 0.002799210138618946, "skip_count": 0.0, "step": 5632, "text_loss": 0.22488386929035187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0004910245031009515, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 9089803.0, "repeat_count": 0.0, "routers_loss": 0.00139117450453341, "skip_count": 0.0, "step": 5634, "text_loss": 0.6237335205078125 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.0004907150381958723, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 9093075.0, "repeat_count": 0.0, "routers_loss": 0.006503603886812925, "skip_count": 1.0, "step": 5636, "text_loss": 0.18781614303588867 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 0.0004904055768488077, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9096355.0, "repeat_count": 0.0, "routers_loss": 0.0009764843271113932, "skip_count": 0.0, "step": 5638, "text_loss": 0.6821450591087341 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.479013795127678, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0004900961191783445, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 9098994.0, "repeat_count": 1.0, "routers_loss": 0.00693159457296133, "skip_count": 3.0, "step": 5640, "text_loss": 0.214790940284729 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.488406222483125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0004897866653030671, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 9102048.0, "repeat_count": 0.0, "routers_loss": 0.002469591563567519, "skip_count": 0.0, "step": 5642, "text_loss": 0.1556607335805893 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0004894772153415588, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9105379.0, "repeat_count": 0.0, "routers_loss": 0.0004824921488761902, "skip_count": 0.0, "step": 5644, "text_loss": 0.499972403049469 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0004891677694124013, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 9108240.0, "repeat_count": 0.0, "routers_loss": 0.0029356612358242273, "skip_count": 1.0, "step": 5646, "text_loss": 0.5169754028320312 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.516583504549455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0174560546875, "learning_rate": 0.0004888583276341751, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 9111381.0, "repeat_count": 0.0, "routers_loss": 0.009489183314144611, "skip_count": 1.0, "step": 5648, "text_loss": 0.23630797863006592 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.017822265625, "learning_rate": 0.0004885488901254588, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 9114015.0, "repeat_count": 0.0, "routers_loss": 0.004154495894908905, "skip_count": 1.0, "step": 5650, "text_loss": 0.3345947563648224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0546875, "learning_rate": 0.0004882394570048294, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 9117044.0, "repeat_count": 0.0, "routers_loss": 0.0018865863094106317, "skip_count": 0.0, "step": 5652, "text_loss": 0.32814112305641174 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 0.0004879300283908623, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 9120035.0, "repeat_count": 0.0, "routers_loss": 0.0035278978757560253, "skip_count": 1.0, "step": 5654, "text_loss": 0.4081386625766754 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.554153213971237, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.00048762060440213096, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 9122955.0, "repeat_count": 1.0, "routers_loss": 0.0053498269990086555, "skip_count": 0.0, "step": 5656, "text_loss": 0.31027838587760925 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0004873111851572075, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9125635.0, "repeat_count": 0.0, "routers_loss": 0.004556098487228155, "skip_count": 0.0, "step": 5658, "text_loss": 0.25703540444374084 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.0004870017707746617, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 9128906.0, "repeat_count": 0.0, "routers_loss": 0.0031165245454758406, "skip_count": 2.0, "step": 5660, "text_loss": 0.20663656294345856 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.58233049603757, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.0004866923613730617, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 9132030.0, "repeat_count": 1.0, "routers_loss": 0.004887583665549755, "skip_count": 2.0, "step": 5662, "text_loss": 0.6062649488449097 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0004863829570709741, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 9135274.0, "repeat_count": 0.0, "routers_loss": 0.0021857863757759333, "skip_count": 0.0, "step": 5664, "text_loss": 0.49644309282302856 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 26.601115350748458, "f1_execute": 0.9756097793579102, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0004860735579869631, "loss": 0.0088, "macro_f1": 0.925203263759613, "num_tokens": 9139735.0, "repeat_count": 3.0, "routers_loss": 0.05413912236690521, "skip_count": 5.0, "step": 5666, "text_loss": 0.25161290168762207 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.610507778103905, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00048576416423959097, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 9142419.0, "repeat_count": 0.0, "routers_loss": 0.002229376696050167, "skip_count": 0.0, "step": 5668, "text_loss": 0.5332949161529541 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 26.61990020545935, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0004854547759474179, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 9145443.0, "repeat_count": 1.0, "routers_loss": 0.005968933925032616, "skip_count": 4.0, "step": 5670, "text_loss": 0.5282154083251953 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.629292632814792, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.0004851453932290021, "loss": 0.0085, "macro_f1": 0.3272727429866791, "num_tokens": 9147754.0, "repeat_count": 0.0, "routers_loss": 0.04015754163265228, "skip_count": 1.0, "step": 5672, "text_loss": 0.8564629554748535 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.63868506017024, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00048483601620289974, "loss": 0.0058, "macro_f1": 0.8820862174034119, "num_tokens": 9151714.0, "repeat_count": 2.0, "routers_loss": 0.019172413274645805, "skip_count": 2.0, "step": 5674, "text_loss": 0.4149441123008728 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.648077487525683, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0004845266449876645, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9154524.0, "repeat_count": 1.0, "routers_loss": 0.005025535821914673, "skip_count": 0.0, "step": 5676, "text_loss": 0.26525792479515076 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.000484217279701848, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 9158546.0, "repeat_count": 0.0, "routers_loss": 0.0012200147612020373, "skip_count": 0.0, "step": 5678, "text_loss": 0.5532271862030029 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.666862342236573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0004839079204639998, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 9161003.0, "repeat_count": 0.0, "routers_loss": 0.0013485675444826484, "skip_count": 1.0, "step": 5680, "text_loss": 0.36826151609420776 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02001953125, "learning_rate": 0.0004835985673926668, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 9164741.0, "repeat_count": 0.0, "routers_loss": 0.00532014574855566, "skip_count": 2.0, "step": 5682, "text_loss": 0.16154609620571136 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.68564719694746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0004832892206063938, "loss": 0.0075, "macro_f1": 1.0, "num_tokens": 9168079.0, "repeat_count": 2.0, "routers_loss": 0.007782323285937309, "skip_count": 3.0, "step": 5684, "text_loss": 0.4323575496673584 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021240234375, "learning_rate": 0.0004829798802237228, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9171352.0, "repeat_count": 0.0, "routers_loss": 0.0024159469176083803, "skip_count": 2.0, "step": 5686, "text_loss": 0.3163119852542877 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 0.000482670546363194, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 9175197.0, "repeat_count": 0.0, "routers_loss": 0.002455134643241763, "skip_count": 0.0, "step": 5688, "text_loss": 0.59735506772995 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.713824479013795, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0004823612191433443, "loss": 0.0042, "macro_f1": 0.8820862174034119, "num_tokens": 9177648.0, "repeat_count": 2.0, "routers_loss": 0.015524548478424549, "skip_count": 2.0, "step": 5690, "text_loss": 0.759812593460083 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.00048205189868270887, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 9180694.0, "repeat_count": 0.0, "routers_loss": 0.002112736226990819, "skip_count": 2.0, "step": 5692, "text_loss": 0.3516882061958313 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 26.732609333724685, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.025146484375, "learning_rate": 0.00048174258509981973, "loss": 0.0063, "macro_f1": 0.9262410998344421, "num_tokens": 9183502.0, "repeat_count": 2.0, "routers_loss": 0.03100527822971344, "skip_count": 3.0, "step": 5694, "text_loss": 0.3722715973854065 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.0004814332785132064, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 9186417.0, "repeat_count": 0.0, "routers_loss": 0.009176591411232948, "skip_count": 2.0, "step": 5696, "text_loss": 0.33363673090934753 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.751394188435572, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0004811239790413958, "loss": 0.0076, "macro_f1": 0.3272727429866791, "num_tokens": 9189478.0, "repeat_count": 0.0, "routers_loss": 0.023586507886648178, "skip_count": 1.0, "step": 5698, "text_loss": 0.19698107242584229 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.00048081468680291194, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9192115.0, "repeat_count": 0.0, "routers_loss": 0.005083440337330103, "skip_count": 1.0, "step": 5700, "text_loss": 0.3476336896419525 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0004805054019162764, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 9195176.0, "repeat_count": 0.0, "routers_loss": 0.007766073569655418, "skip_count": 1.0, "step": 5702, "text_loss": 0.27114811539649963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.0004801961245000076, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9199091.0, "repeat_count": 0.0, "routers_loss": 0.0009058842551894486, "skip_count": 0.0, "step": 5704, "text_loss": 0.6249846816062927 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.788963897857354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0004798868546726212, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9202003.0, "repeat_count": 0.0, "routers_loss": 0.005479823332279921, "skip_count": 0.0, "step": 5706, "text_loss": 0.47223609685897827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0166015625, "learning_rate": 0.00047957759255263014, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9205277.0, "repeat_count": 0.0, "routers_loss": 0.001055705244652927, "skip_count": 0.0, "step": 5708, "text_loss": 0.677215576171875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.00047926833825854377, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 9208844.0, "repeat_count": 0.0, "routers_loss": 0.003291431115940213, "skip_count": 2.0, "step": 5710, "text_loss": 0.12439999729394913 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.817141179923688, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06591796875, "learning_rate": 0.0004789590919088696, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 9211619.0, "repeat_count": 0.0, "routers_loss": 0.005120242480188608, "skip_count": 2.0, "step": 5712, "text_loss": 0.5771954655647278 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.82653360727913, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0004786498536221111, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 9214914.0, "repeat_count": 1.0, "routers_loss": 0.004877795465290546, "skip_count": 2.0, "step": 5714, "text_loss": 0.6432198882102966 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.00047834062351676893, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 9218186.0, "repeat_count": 0.0, "routers_loss": 0.0026507999282330275, "skip_count": 0.0, "step": 5716, "text_loss": 0.23814935982227325 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.00047803140171134075, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 9221754.0, "repeat_count": 0.0, "routers_loss": 0.002605629386380315, "skip_count": 1.0, "step": 5718, "text_loss": 0.2910388708114624 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 26.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0004777221883243208, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 9224502.0, "repeat_count": 0.0, "routers_loss": 0.0048494706861674786, "skip_count": 3.0, "step": 5720, "text_loss": 0.6195104122161865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0004774129834742004, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 9227350.0, "repeat_count": 0.0, "routers_loss": 0.003092368133366108, "skip_count": 0.0, "step": 5722, "text_loss": 0.35447990894317627 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.00047710378727946725, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 9230166.0, "repeat_count": 0.0, "routers_loss": 0.012780336663126945, "skip_count": 2.0, "step": 5724, "text_loss": 0.27581867575645447 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00047679459985860604, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 9233029.0, "repeat_count": 0.0, "routers_loss": 0.005429140292108059, "skip_count": 1.0, "step": 5726, "text_loss": 0.2636827826499939 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.00047648542133009794, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9236317.0, "repeat_count": 0.0, "routers_loss": 0.0023909916635602713, "skip_count": 0.0, "step": 5728, "text_loss": 0.4801979064941406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 0.00047617625181242077, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 9239796.0, "repeat_count": 0.0, "routers_loss": 0.003603481687605381, "skip_count": 0.0, "step": 5730, "text_loss": 0.8374754786491394 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02294921875, "learning_rate": 0.0004758670914240488, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 9243489.0, "repeat_count": 0.0, "routers_loss": 0.004478964954614639, "skip_count": 2.0, "step": 5732, "text_loss": 0.3870154917240143 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0302734375, "learning_rate": 0.000475557940283453, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9246758.0, "repeat_count": 0.0, "routers_loss": 0.00312575395219028, "skip_count": 1.0, "step": 5734, "text_loss": 0.42341071367263794 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 26.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.00047524879850910026, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 9250053.0, "repeat_count": 0.0, "routers_loss": 0.010855631902813911, "skip_count": 4.0, "step": 5736, "text_loss": 0.25729796290397644 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.0004749396662194549, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 9253691.0, "repeat_count": 0.0, "routers_loss": 0.0009250419097952545, "skip_count": 0.0, "step": 5738, "text_loss": 0.6151770949363708 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.94863516289991, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0004746305435329767, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 9256866.0, "repeat_count": 1.0, "routers_loss": 0.007521102204918861, "skip_count": 3.0, "step": 5740, "text_loss": 0.3094986379146576 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0004743214305681221, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9259790.0, "repeat_count": 0.0, "routers_loss": 0.0022241887636482716, "skip_count": 1.0, "step": 5742, "text_loss": 0.5418204069137573 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.967420017610802, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.00047401232744334376, "loss": 0.0071, "macro_f1": 1.0, "num_tokens": 9263205.0, "repeat_count": 1.0, "routers_loss": 0.008611299097537994, "skip_count": 2.0, "step": 5744, "text_loss": 0.35824623703956604 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 26.976812444966246, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0380859375, "learning_rate": 0.0004737032342770906, "loss": 0.0062, "macro_f1": 0.5492662787437439, "num_tokens": 9266126.0, "repeat_count": 0.0, "routers_loss": 0.010788857005536556, "skip_count": 2.0, "step": 5746, "text_loss": 0.2172674983739853 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0004733941511878074, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 9269308.0, "repeat_count": 0.0, "routers_loss": 0.005309196189045906, "skip_count": 2.0, "step": 5748, "text_loss": 0.1696814000606537 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.995597299677137, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04248046875, "learning_rate": 0.00047308507829393594, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 9272801.0, "repeat_count": 0.0, "routers_loss": 0.009940510615706444, "skip_count": 2.0, "step": 5750, "text_loss": 0.24295592308044434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.00047277601571391314, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 9276197.0, "repeat_count": 0.0, "routers_loss": 0.000687236781232059, "skip_count": 0.0, "step": 5752, "text_loss": 0.8511804342269897 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.014088641033165, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.00047246696356617254, "loss": 0.0059, "macro_f1": 0.6603773832321167, "num_tokens": 9278965.0, "repeat_count": 1.0, "routers_loss": 0.009816894307732582, "skip_count": 1.0, "step": 5754, "text_loss": 0.45420053601264954 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019287109375, "learning_rate": 0.0004721579219691434, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9282076.0, "repeat_count": 0.0, "routers_loss": 0.0015747188590466976, "skip_count": 0.0, "step": 5756, "text_loss": 0.21671754121780396 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0004718488910412511, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9285465.0, "repeat_count": 0.0, "routers_loss": 0.008654040284454823, "skip_count": 2.0, "step": 5758, "text_loss": 0.25920194387435913 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0211181640625, "learning_rate": 0.00047153987090091674, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 9288156.0, "repeat_count": 0.0, "routers_loss": 0.0011430777376517653, "skip_count": 0.0, "step": 5760, "text_loss": 0.7655444741249084 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0004712308616665576, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 9291529.0, "repeat_count": 0.0, "routers_loss": 0.003674200503155589, "skip_count": 2.0, "step": 5762, "text_loss": 0.269486665725708 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0004709218634565866, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 9294699.0, "repeat_count": 0.0, "routers_loss": 0.003249827306717634, "skip_count": 1.0, "step": 5764, "text_loss": 0.5073734521865845 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.070443205165834, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.00047061287638941235, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 9297863.0, "repeat_count": 1.0, "routers_loss": 0.002763139782473445, "skip_count": 2.0, "step": 5766, "text_loss": 0.2572014033794403 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 27.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 0.00047030390058343935, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 9301124.0, "repeat_count": 0.0, "routers_loss": 0.007100266870111227, "skip_count": 3.0, "step": 5768, "text_loss": 0.4147387742996216 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 0.0004699949361570676, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 9304330.0, "repeat_count": 0.0, "routers_loss": 0.005467240232974291, "skip_count": 1.0, "step": 5770, "text_loss": 0.21510964632034302 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.000469685983228693, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9306882.0, "repeat_count": 0.0, "routers_loss": 0.003167890477925539, "skip_count": 0.0, "step": 5772, "text_loss": 0.45717427134513855 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.108012914587615, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.00046937704191670675, "loss": 0.0057, "macro_f1": 0.6601307392120361, "num_tokens": 9309767.0, "repeat_count": 1.0, "routers_loss": 0.014881107024848461, "skip_count": 2.0, "step": 5774, "text_loss": 0.3464985191822052 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.0004690681123394959, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 9313045.0, "repeat_count": 0.0, "routers_loss": 0.00379011663608253, "skip_count": 2.0, "step": 5776, "text_loss": 0.33194616436958313 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00046875919461544265, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 9315736.0, "repeat_count": 0.0, "routers_loss": 0.0016733441734686494, "skip_count": 0.0, "step": 5778, "text_loss": 0.5009998679161072 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.00046845028886292493, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9318456.0, "repeat_count": 0.0, "routers_loss": 0.005318894516676664, "skip_count": 1.0, "step": 5780, "text_loss": 0.17702752351760864 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.145582624009393, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.044921875, "learning_rate": 0.00046814139520031615, "loss": 0.006, "macro_f1": 0.8820862174034119, "num_tokens": 9323152.0, "repeat_count": 2.0, "routers_loss": 0.01133672520518303, "skip_count": 2.0, "step": 5782, "text_loss": 0.2886650860309601 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0004678325137459845, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 9326318.0, "repeat_count": 0.0, "routers_loss": 0.002458433620631695, "skip_count": 0.0, "step": 5784, "text_loss": 0.5832745432853699 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0004675236446182946, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 9329779.0, "repeat_count": 0.0, "routers_loss": 0.0005402310052886605, "skip_count": 0.0, "step": 5786, "text_loss": 0.5699237585067749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.00046721478793560525, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 9333360.0, "repeat_count": 0.0, "routers_loss": 0.0002638917067088187, "skip_count": 0.0, "step": 5788, "text_loss": 0.6555714011192322 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 0.00046690594381627106, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 9336498.0, "repeat_count": 0.0, "routers_loss": 0.003998351749032736, "skip_count": 2.0, "step": 5790, "text_loss": 0.2076750248670578 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.00046659711237864157, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 9339724.0, "repeat_count": 0.0, "routers_loss": 0.0045847659930586815, "skip_count": 1.0, "step": 5792, "text_loss": 0.22027169167995453 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 0.00046628829374106167, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 9342835.0, "repeat_count": 0.0, "routers_loss": 0.0014064523857086897, "skip_count": 1.0, "step": 5794, "text_loss": 0.5120179057121277 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0004659794880218712, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 9346757.0, "repeat_count": 0.0, "routers_loss": 0.0011155207175761461, "skip_count": 1.0, "step": 5796, "text_loss": 0.6415372490882874 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0004656706953394051, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 9349652.0, "repeat_count": 0.0, "routers_loss": 0.0020385095849633217, "skip_count": 0.0, "step": 5798, "text_loss": 0.5410398840904236 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.230114470208395, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.0004653619158119933, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 9354286.0, "repeat_count": 1.0, "routers_loss": 0.0012847178149968386, "skip_count": 0.0, "step": 5800, "text_loss": 0.4386860728263855 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0194091796875, "learning_rate": 0.00046505314955796074, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 9357682.0, "repeat_count": 0.0, "routers_loss": 0.0035008061677217484, "skip_count": 2.0, "step": 5802, "text_loss": 0.13655950129032135 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0283203125, "learning_rate": 0.00046474439669562715, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 9361058.0, "repeat_count": 0.0, "routers_loss": 0.0020033426117151976, "skip_count": 1.0, "step": 5804, "text_loss": 0.6293444037437439 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.00046443565734330714, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 9364173.0, "repeat_count": 0.0, "routers_loss": 0.0004935986362397671, "skip_count": 0.0, "step": 5806, "text_loss": 0.2923166751861572 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0004641269316193104, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9366980.0, "repeat_count": 0.0, "routers_loss": 0.001654456602409482, "skip_count": 0.0, "step": 5808, "text_loss": 0.7273373007774353 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0004638182196419411, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 9370581.0, "repeat_count": 0.0, "routers_loss": 0.0017011919990181923, "skip_count": 0.0, "step": 5810, "text_loss": 0.6029995083808899 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 27.286469034341064, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.038330078125, "learning_rate": 0.0004635095215294984, "loss": 0.0072, "macro_f1": 0.9265305995941162, "num_tokens": 9374233.0, "repeat_count": 1.0, "routers_loss": 0.01361197978258133, "skip_count": 3.0, "step": 5812, "text_loss": 0.14051523804664612 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.00046320083740027584, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 9377217.0, "repeat_count": 0.0, "routers_loss": 0.004597014281898737, "skip_count": 0.0, "step": 5814, "text_loss": 0.2766880691051483 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 27.30525388905195, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.021240234375, "learning_rate": 0.00046289216737256184, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 9380336.0, "repeat_count": 3.0, "routers_loss": 0.006628422066569328, "skip_count": 1.0, "step": 5816, "text_loss": 0.8092381954193115 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.314646316407398, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.0004625835115646393, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9382968.0, "repeat_count": 0.0, "routers_loss": 0.002737772185355425, "skip_count": 0.0, "step": 5818, "text_loss": 0.22090643644332886 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 27.32403874376284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.0004622748700947856, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 9386203.0, "repeat_count": 1.0, "routers_loss": 0.004552177153527737, "skip_count": 1.0, "step": 5820, "text_loss": 0.42869850993156433 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0004619662430812729, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 9388968.0, "repeat_count": 0.0, "routers_loss": 0.003149240743368864, "skip_count": 2.0, "step": 5822, "text_loss": 0.45137661695480347 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0004616576306423677, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 9392487.0, "repeat_count": 0.0, "routers_loss": 0.0008133690571412444, "skip_count": 0.0, "step": 5824, "text_loss": 0.638685941696167 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0004613490328963307, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 9395665.0, "repeat_count": 0.0, "routers_loss": 0.00042717234464362264, "skip_count": 0.0, "step": 5826, "text_loss": 0.8134317398071289 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.00046104044996141716, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 9398831.0, "repeat_count": 0.0, "routers_loss": 0.0084775285795331, "skip_count": 2.0, "step": 5828, "text_loss": 0.19263958930969238 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.371000880540066, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0004607318819558768, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 9403118.0, "repeat_count": 1.0, "routers_loss": 0.0030239911284297705, "skip_count": 0.0, "step": 5830, "text_loss": 0.45556432008743286 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 27.38039330789551, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0458984375, "learning_rate": 0.00046042332899795313, "loss": 0.0075, "macro_f1": 0.5492662787437439, "num_tokens": 9406206.0, "repeat_count": 0.0, "routers_loss": 0.026389889419078827, "skip_count": 2.0, "step": 5832, "text_loss": 0.26458361744880676 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0004601147912058845, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 9409806.0, "repeat_count": 0.0, "routers_loss": 0.0013476534513756633, "skip_count": 0.0, "step": 5834, "text_loss": 0.7443689107894897 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.399178162606397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0004598062686979033, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 9412737.0, "repeat_count": 0.0, "routers_loss": 0.004275512881577015, "skip_count": 1.0, "step": 5836, "text_loss": 0.2808683514595032 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0206298828125, "learning_rate": 0.00045949776159223563, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 9415818.0, "repeat_count": 0.0, "routers_loss": 0.0027225434314459562, "skip_count": 0.0, "step": 5838, "text_loss": 0.6283587217330933 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.417963017317287, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.055419921875, "learning_rate": 0.0004591892700071022, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 9419119.0, "repeat_count": 1.0, "routers_loss": 0.01574302278459072, "skip_count": 2.0, "step": 5840, "text_loss": 0.33239027857780457 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.00045888079406071746, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 9422257.0, "repeat_count": 0.0, "routers_loss": 0.0007227854221127927, "skip_count": 0.0, "step": 5842, "text_loss": 0.6658740043640137 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.00045857233387129, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 9425071.0, "repeat_count": 0.0, "routers_loss": 0.0020696306601166725, "skip_count": 2.0, "step": 5844, "text_loss": 0.5773820877075195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0205078125, "learning_rate": 0.0004582638895570224, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 9427980.0, "repeat_count": 0.0, "routers_loss": 0.0019764541648328304, "skip_count": 0.0, "step": 5846, "text_loss": 0.3388919532299042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.455532726739065, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.000457955461236111, "loss": 0.0058, "macro_f1": 0.3272727429866791, "num_tokens": 9430733.0, "repeat_count": 1.0, "routers_loss": 0.04235004261136055, "skip_count": 0.0, "step": 5848, "text_loss": 0.44346582889556885 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0004576470490267462, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 9433347.0, "repeat_count": 0.0, "routers_loss": 0.000801609072368592, "skip_count": 0.0, "step": 5850, "text_loss": 0.5825944542884827 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0004573386530471121, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 9436172.0, "repeat_count": 0.0, "routers_loss": 0.0018224078230559826, "skip_count": 2.0, "step": 5852, "text_loss": 0.8111652135848999 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0220947265625, "learning_rate": 0.0004570302734153866, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 9439040.0, "repeat_count": 0.0, "routers_loss": 0.006614950485527515, "skip_count": 2.0, "step": 5854, "text_loss": 0.31270334124565125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05859375, "learning_rate": 0.0004567219102497412, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 9442138.0, "repeat_count": 0.0, "routers_loss": 0.0012984242057427764, "skip_count": 0.0, "step": 5856, "text_loss": 0.6126856803894043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0004564135636683416, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 9445600.0, "repeat_count": 0.0, "routers_loss": 0.0008388847345486283, "skip_count": 0.0, "step": 5858, "text_loss": 0.8526380658149719 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046875, "learning_rate": 0.0004561052337893467, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 9449609.0, "repeat_count": 0.0, "routers_loss": 0.008125773631036282, "skip_count": 2.0, "step": 5860, "text_loss": 0.2843833863735199 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.000455796920730909, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 9452756.0, "repeat_count": 0.0, "routers_loss": 0.0019371749367564917, "skip_count": 0.0, "step": 5862, "text_loss": 0.5293750166893005 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.530672145582624, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0004554886246111746, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 9455467.0, "repeat_count": 1.0, "routers_loss": 0.005594742484390736, "skip_count": 2.0, "step": 5864, "text_loss": 0.572329044342041 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 27.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0004551803455482833, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 9458953.0, "repeat_count": 0.0, "routers_loss": 0.005960086826235056, "skip_count": 3.0, "step": 5866, "text_loss": 0.19459208846092224 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00045487208366036807, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 9462130.0, "repeat_count": 0.0, "routers_loss": 0.0034781871363520622, "skip_count": 1.0, "step": 5868, "text_loss": 0.20467053353786469 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.00045456383906555554, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 9465590.0, "repeat_count": 0.0, "routers_loss": 0.0012246103724464774, "skip_count": 0.0, "step": 5870, "text_loss": 0.6086251735687256 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.00045425561188196565, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9468092.0, "repeat_count": 0.0, "routers_loss": 0.002874316181987524, "skip_count": 1.0, "step": 5872, "text_loss": 0.3430633544921875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 0.0004539474022277115, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 9471433.0, "repeat_count": 0.0, "routers_loss": 0.004340244457125664, "skip_count": 2.0, "step": 5874, "text_loss": 0.28219133615493774 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0004536392102208997, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 9474363.0, "repeat_count": 0.0, "routers_loss": 0.0007322742021642625, "skip_count": 0.0, "step": 5876, "text_loss": 0.7305856943130493 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.0004533310359796299, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 9478469.0, "repeat_count": 0.0, "routers_loss": 0.0018631393322721124, "skip_count": 0.0, "step": 5878, "text_loss": 0.5821442604064941 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 27.60581156442618, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0299072265625, "learning_rate": 0.0004530228796219952, "loss": 0.0088, "macro_f1": 0.9262410998344421, "num_tokens": 9481200.0, "repeat_count": 2.0, "routers_loss": 0.026109615340828896, "skip_count": 3.0, "step": 5880, "text_loss": 0.3962891101837158 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.00045271474126608167, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 9484200.0, "repeat_count": 0.0, "routers_loss": 0.0004716445691883564, "skip_count": 0.0, "step": 5882, "text_loss": 0.31901776790618896 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0004524066210299685, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 9488939.0, "repeat_count": 0.0, "routers_loss": 0.0003797562967520207, "skip_count": 0.0, "step": 5884, "text_loss": 0.3992912471294403 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0220947265625, "learning_rate": 0.0004520985190317279, "loss": 0.0032, "macro_f1": 0.6666666865348816, "num_tokens": 9492010.0, "repeat_count": 0.0, "routers_loss": 0.005681614391505718, "skip_count": 1.0, "step": 5886, "text_loss": 0.5318995118141174 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0004517904353894253, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 9494770.0, "repeat_count": 0.0, "routers_loss": 0.0021422000136226416, "skip_count": 0.0, "step": 5888, "text_loss": 0.435088187456131 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.652773701203404, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.0004514823702211187, "loss": 0.0052, "macro_f1": 0.8820862174034119, "num_tokens": 9497327.0, "repeat_count": 2.0, "routers_loss": 0.01593884639441967, "skip_count": 2.0, "step": 5890, "text_loss": 0.5068450570106506 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.662166128558848, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0257568359375, "learning_rate": 0.00045117432364485927, "loss": 0.0075, "macro_f1": 0.6601307392120361, "num_tokens": 9500488.0, "repeat_count": 1.0, "routers_loss": 0.0729660913348198, "skip_count": 2.0, "step": 5892, "text_loss": 0.42718732357025146 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.00045086629577869127, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 9503593.0, "repeat_count": 0.0, "routers_loss": 0.007092897780239582, "skip_count": 2.0, "step": 5894, "text_loss": 0.4264345169067383 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043212890625, "learning_rate": 0.00045055828674065134, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 9507188.0, "repeat_count": 0.0, "routers_loss": 0.004088073968887329, "skip_count": 2.0, "step": 5896, "text_loss": 0.20932413637638092 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.690343410625182, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.00045025029664876926, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 9510126.0, "repeat_count": 1.0, "routers_loss": 0.0026970503386110067, "skip_count": 0.0, "step": 5898, "text_loss": 0.47661110758781433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0164794921875, "learning_rate": 0.0004499423256210673, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9513891.0, "repeat_count": 0.0, "routers_loss": 0.003428407246246934, "skip_count": 0.0, "step": 5900, "text_loss": 0.18232668936252594 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.00044963437377556066, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 9516718.0, "repeat_count": 0.0, "routers_loss": 0.0020270352251827717, "skip_count": 0.0, "step": 5902, "text_loss": 0.16833586990833282 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 0.000449326441230257, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9520248.0, "repeat_count": 0.0, "routers_loss": 0.0019144838443025947, "skip_count": 0.0, "step": 5904, "text_loss": 0.44434574246406555 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 0.00044901852810315634, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 9523651.0, "repeat_count": 0.0, "routers_loss": 0.0044578867964446545, "skip_count": 2.0, "step": 5906, "text_loss": 0.1248839721083641 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.0004487106345122522, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 9527235.0, "repeat_count": 0.0, "routers_loss": 0.000827222247608006, "skip_count": 0.0, "step": 5908, "text_loss": 0.6052893996238708 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.74669797475785, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0004484027605755296, "loss": 0.0065, "macro_f1": 0.5492662787437439, "num_tokens": 9530407.0, "repeat_count": 2.0, "routers_loss": 0.029739778488874435, "skip_count": 0.0, "step": 5910, "text_loss": 0.7625715732574463 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0213623046875, "learning_rate": 0.00044809490641096653, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 9533229.0, "repeat_count": 0.0, "routers_loss": 0.0025658784434199333, "skip_count": 0.0, "step": 5912, "text_loss": 0.27842655777931213 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 27.76548282946874, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.042724609375, "learning_rate": 0.00044778707213653324, "loss": 0.0069, "macro_f1": 0.9265305995941162, "num_tokens": 9537397.0, "repeat_count": 1.0, "routers_loss": 0.010157953947782516, "skip_count": 3.0, "step": 5914, "text_loss": 0.45196083188056946 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.774875256824185, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0004474792578701924, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 9540564.0, "repeat_count": 3.0, "routers_loss": 0.011994685977697372, "skip_count": 5.0, "step": 5916, "text_loss": 0.22617442905902863 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.000447171463729899, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 9543602.0, "repeat_count": 0.0, "routers_loss": 0.0022214490454643965, "skip_count": 0.0, "step": 5918, "text_loss": 0.5089073777198792 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.793660111535075, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0004468636898336003, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 9546829.0, "repeat_count": 1.0, "routers_loss": 0.009353389963507652, "skip_count": 2.0, "step": 5920, "text_loss": 0.7560386657714844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057373046875, "learning_rate": 0.00044655593629923596, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 9550259.0, "repeat_count": 0.0, "routers_loss": 0.005637963302433491, "skip_count": 0.0, "step": 5922, "text_loss": 0.17084793746471405 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.812444966245963, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0286865234375, "learning_rate": 0.00044624820324473766, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 9554376.0, "repeat_count": 1.0, "routers_loss": 0.008556432090699673, "skip_count": 2.0, "step": 5924, "text_loss": 0.5906872749328613 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.82183739360141, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0004459404907880292, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9558348.0, "repeat_count": 1.0, "routers_loss": 0.0016659445827826858, "skip_count": 0.0, "step": 5926, "text_loss": 0.8197194933891296 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 27.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.00044563279904702674, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9561139.0, "repeat_count": 0.0, "routers_loss": 0.01341368816792965, "skip_count": 3.0, "step": 5928, "text_loss": 0.3264874815940857 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.840622248312297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020751953125, "learning_rate": 0.000445325128139638, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9564387.0, "repeat_count": 0.0, "routers_loss": 0.005023977253586054, "skip_count": 2.0, "step": 5930, "text_loss": 0.9055862426757812 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0004450174781837635, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9567053.0, "repeat_count": 0.0, "routers_loss": 0.0006051476229913533, "skip_count": 0.0, "step": 5932, "text_loss": 0.6908539533615112 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0004447098492972951, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 9570036.0, "repeat_count": 0.0, "routers_loss": 0.003152312943711877, "skip_count": 0.0, "step": 5934, "text_loss": 0.6321061849594116 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.047119140625, "learning_rate": 0.0004444022415981167, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 9574146.0, "repeat_count": 0.0, "routers_loss": 0.004859412554651499, "skip_count": 1.0, "step": 5936, "text_loss": 0.5905604958534241 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 27.878191957734078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.00044409465520410426, "loss": 0.0071, "macro_f1": 1.0, "num_tokens": 9577071.0, "repeat_count": 1.0, "routers_loss": 0.004376287572085857, "skip_count": 1.0, "step": 5938, "text_loss": 0.6928377747535706 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.00044378709023312535, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 9580537.0, "repeat_count": 0.0, "routers_loss": 0.004038849379867315, "skip_count": 1.0, "step": 5940, "text_loss": 0.2686770558357239 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0004434795468030396, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 9583225.0, "repeat_count": 0.0, "routers_loss": 0.005459951236844063, "skip_count": 2.0, "step": 5942, "text_loss": 0.16855180263519287 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.906369239800412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 0.000443172025031698, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 9586018.0, "repeat_count": 0.0, "routers_loss": 0.0032985717989504337, "skip_count": 2.0, "step": 5944, "text_loss": 0.20335732400417328 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.915761667155856, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0004428645250369437, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 9589321.0, "repeat_count": 1.0, "routers_loss": 0.003573323367163539, "skip_count": 0.0, "step": 5946, "text_loss": 0.6318653225898743 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.00044255704693661117, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 9592518.0, "repeat_count": 0.0, "routers_loss": 0.002226749900728464, "skip_count": 0.0, "step": 5948, "text_loss": 0.5320658683776855 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.934546521866746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0004422495908485265, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 9595664.0, "repeat_count": 0.0, "routers_loss": 0.0007805621717125177, "skip_count": 0.0, "step": 5950, "text_loss": 0.6330106258392334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0004419421568905077, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 9598885.0, "repeat_count": 0.0, "routers_loss": 0.0017050127498805523, "skip_count": 0.0, "step": 5952, "text_loss": 0.6098045706748962 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.00044163474518036375, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 9603021.0, "repeat_count": 0.0, "routers_loss": 0.0025974081363528967, "skip_count": 0.0, "step": 5954, "text_loss": 0.2655932903289795 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.96272380393308, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04931640625, "learning_rate": 0.00044132735583589567, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 9605841.0, "repeat_count": 1.0, "routers_loss": 0.010364850051701069, "skip_count": 2.0, "step": 5956, "text_loss": 0.3028552532196045 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.972116231288524, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.015869140625, "learning_rate": 0.00044101998897489553, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 9608810.0, "repeat_count": 1.0, "routers_loss": 0.0015063622267916799, "skip_count": 0.0, "step": 5958, "text_loss": 0.5602094531059265 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 27.981508658643968, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.02880859375, "learning_rate": 0.00044071264471514683, "loss": 0.0051, "macro_f1": 0.5934640765190125, "num_tokens": 9611995.0, "repeat_count": 0.0, "routers_loss": 0.011538165621459484, "skip_count": 3.0, "step": 5960, "text_loss": 0.14332173764705658 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00044040532317442455, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 9615434.0, "repeat_count": 0.0, "routers_loss": 0.004693889059126377, "skip_count": 0.0, "step": 5962, "text_loss": 0.334369033575058 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.021728515625, "learning_rate": 0.00044009802447049474, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 9618056.0, "repeat_count": 1.0, "routers_loss": 0.0045085870660841465, "skip_count": 1.0, "step": 5964, "text_loss": 0.8163170218467712 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.00043979074872111507, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 9621428.0, "repeat_count": 0.0, "routers_loss": 0.0018220023484900594, "skip_count": 0.0, "step": 5966, "text_loss": 0.2513850927352905 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.01878485471089, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0004394834960440341, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 9625433.0, "repeat_count": 4.0, "routers_loss": 0.007051277905702591, "skip_count": 5.0, "step": 5968, "text_loss": 0.6263421177864075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.00043917626655699154, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 9629508.0, "repeat_count": 0.0, "routers_loss": 0.0006454752874560654, "skip_count": 0.0, "step": 5970, "text_loss": 0.645618736743927 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0004388690603777184, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 9632504.0, "repeat_count": 0.0, "routers_loss": 0.004847112577408552, "skip_count": 1.0, "step": 5972, "text_loss": 0.47306978702545166 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.00043856187762393665, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 9636685.0, "repeat_count": 0.0, "routers_loss": 0.0006580828921869397, "skip_count": 0.0, "step": 5974, "text_loss": 0.42226532101631165 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0004382547184133593, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 9639958.0, "repeat_count": 0.0, "routers_loss": 0.002188180573284626, "skip_count": 0.0, "step": 5976, "text_loss": 0.4456600248813629 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.065746991488112, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0004379475828636901, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 9643228.0, "repeat_count": 1.0, "routers_loss": 0.0017135308589786291, "skip_count": 2.0, "step": 5978, "text_loss": 0.6295822262763977 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.07513941884356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0004376404710926244, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 9646746.0, "repeat_count": 0.0, "routers_loss": 0.0008841048111207783, "skip_count": 0.0, "step": 5980, "text_loss": 0.5102712512016296 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.00043733338321784784, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 9649452.0, "repeat_count": 0.0, "routers_loss": 0.0006229099817574024, "skip_count": 0.0, "step": 5982, "text_loss": 0.6944046020507812 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.000437026319357037, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9652700.0, "repeat_count": 0.0, "routers_loss": 0.005293759983032942, "skip_count": 2.0, "step": 5984, "text_loss": 0.6748214960098267 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.00043671927962785946, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 9655825.0, "repeat_count": 0.0, "routers_loss": 0.0013537590857595205, "skip_count": 0.0, "step": 5986, "text_loss": 1.000306248664856 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0004364122641479733, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 9658713.0, "repeat_count": 0.0, "routers_loss": 0.004548195283859968, "skip_count": 0.0, "step": 5988, "text_loss": 0.24580086767673492 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 28.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0004361052730350275, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 9661535.0, "repeat_count": 0.0, "routers_loss": 0.011149964295327663, "skip_count": 4.0, "step": 5990, "text_loss": 0.5737863779067993 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.131493982976224, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.00043579830640666154, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 9664406.0, "repeat_count": 1.0, "routers_loss": 0.003783488878980279, "skip_count": 1.0, "step": 5992, "text_loss": 0.7836558222770691 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 0.00043549136438050573, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 9669050.0, "repeat_count": 0.0, "routers_loss": 0.0050374288111925125, "skip_count": 1.0, "step": 5994, "text_loss": 0.13072487711906433 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.00043518444707418076, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 9672698.0, "repeat_count": 0.0, "routers_loss": 0.004047670867294073, "skip_count": 2.0, "step": 5996, "text_loss": 0.4748993217945099 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0211181640625, "learning_rate": 0.00043487755460529796, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 9676159.0, "repeat_count": 0.0, "routers_loss": 0.008628991432487965, "skip_count": 2.0, "step": 5998, "text_loss": 0.1921990066766739 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.169063692398005, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.00043457068709145904, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 9679528.0, "repeat_count": 3.0, "routers_loss": 0.01094671618193388, "skip_count": 3.0, "step": 6000, "text_loss": 0.3651769459247589 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 28.17845611975345, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0184326171875, "learning_rate": 0.00043426384465025604, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 9682677.0, "repeat_count": 2.0, "routers_loss": 0.0011284075444564223, "skip_count": 0.0, "step": 6002, "text_loss": 0.28305181860923767 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.187848547108892, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.000433957027399272, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9685310.0, "repeat_count": 0.0, "routers_loss": 0.0030473743099719286, "skip_count": 1.0, "step": 6004, "text_loss": 0.3650054931640625 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.19724097446434, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.00043365023545607965, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 9687944.0, "repeat_count": 1.0, "routers_loss": 0.011621905490756035, "skip_count": 2.0, "step": 6006, "text_loss": 0.5409000515937805 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0004333434689382423, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 9690932.0, "repeat_count": 0.0, "routers_loss": 0.0005297541501931846, "skip_count": 0.0, "step": 6008, "text_loss": 0.4311029314994812 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.216025829175226, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.00043303672796331336, "loss": 0.0058, "macro_f1": 0.3272727429866791, "num_tokens": 9693972.0, "repeat_count": 1.0, "routers_loss": 0.06166421249508858, "skip_count": 0.0, "step": 6010, "text_loss": 0.2658997178077698 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.225418256530673, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0201416015625, "learning_rate": 0.00043273001264883655, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 9697712.0, "repeat_count": 0.0, "routers_loss": 0.0018419031985104084, "skip_count": 0.0, "step": 6012, "text_loss": 0.5813497304916382 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0004324233231123458, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 9700746.0, "repeat_count": 0.0, "routers_loss": 0.003635555040091276, "skip_count": 0.0, "step": 6014, "text_loss": 0.24211904406547546 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 28.24420311124156, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.038330078125, "learning_rate": 0.0004321166594713651, "loss": 0.0048, "macro_f1": 0.5492662787437439, "num_tokens": 9704087.0, "repeat_count": 0.0, "routers_loss": 0.021067705005407333, "skip_count": 2.0, "step": 6016, "text_loss": 0.5908042788505554 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00043181002184340857, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 9708695.0, "repeat_count": 0.0, "routers_loss": 0.0008712753187865019, "skip_count": 0.0, "step": 6018, "text_loss": 0.7788549661636353 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.26298796595245, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0004315034103459803, "loss": 0.0054, "macro_f1": 0.3272727429866791, "num_tokens": 9711631.0, "repeat_count": 1.0, "routers_loss": 0.03231092542409897, "skip_count": 0.0, "step": 6020, "text_loss": 0.6127741932868958 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022705078125, "learning_rate": 0.0004311968250965743, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9715526.0, "repeat_count": 0.0, "routers_loss": 0.0020149527117609978, "skip_count": 2.0, "step": 6022, "text_loss": 0.49970078468322754 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.281772820663342, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0004308902662126748, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 9718475.0, "repeat_count": 0.0, "routers_loss": 0.0031795913819223642, "skip_count": 0.0, "step": 6024, "text_loss": 0.3254713714122772 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.291165248018785, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.00043058373381175567, "loss": 0.004, "macro_f1": 0.3272727429866791, "num_tokens": 9722194.0, "repeat_count": 0.0, "routers_loss": 0.0148378387093544, "skip_count": 1.0, "step": 6026, "text_loss": 0.17670343816280365 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.30055767537423, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.064453125, "learning_rate": 0.0004302772280112806, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 9725489.0, "repeat_count": 1.0, "routers_loss": 0.005742347799241543, "skip_count": 2.0, "step": 6028, "text_loss": 0.26184776425361633 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.00042997074892870335, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 9729416.0, "repeat_count": 0.0, "routers_loss": 0.0023561837151646614, "skip_count": 0.0, "step": 6030, "text_loss": 0.3026008605957031 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.0004296642966814673, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 9732559.0, "repeat_count": 0.0, "routers_loss": 0.0010108393616974354, "skip_count": 1.0, "step": 6032, "text_loss": 0.43198078870773315 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.328734957440563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.00042935787138700525, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 9736324.0, "repeat_count": 2.0, "routers_loss": 0.005443581845611334, "skip_count": 2.0, "step": 6034, "text_loss": 0.24883155524730682 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.338127384796007, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 0.0004290514731627403, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 9739630.0, "repeat_count": 1.0, "routers_loss": 0.010645060800015926, "skip_count": 2.0, "step": 6036, "text_loss": 0.24207182228565216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018798828125, "learning_rate": 0.0004287451021260846, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 9742221.0, "repeat_count": 0.0, "routers_loss": 0.0008162845042534173, "skip_count": 0.0, "step": 6038, "text_loss": 0.33018553256988525 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0004284387583944403, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 9744925.0, "repeat_count": 0.0, "routers_loss": 0.003782407147809863, "skip_count": 1.0, "step": 6040, "text_loss": 0.6600399613380432 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0004281324420851987, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 9748103.0, "repeat_count": 0.0, "routers_loss": 0.0009834285592660308, "skip_count": 0.0, "step": 6042, "text_loss": 0.6402350664138794 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.375697094217788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037109375, "learning_rate": 0.0004278261533157409, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 9751128.0, "repeat_count": 0.0, "routers_loss": 0.004100334830582142, "skip_count": 2.0, "step": 6044, "text_loss": 0.1545136719942093 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.38508952157323, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0004275198922034372, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 9754140.0, "repeat_count": 0.0, "routers_loss": 0.0017166603356599808, "skip_count": 1.0, "step": 6046, "text_loss": 0.5875935554504395 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.394481948928675, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.00042721365886564766, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 9756945.0, "repeat_count": 1.0, "routers_loss": 0.00915827602148056, "skip_count": 2.0, "step": 6048, "text_loss": 0.3885214328765869 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.403874376284122, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.00042690745341972134, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9759738.0, "repeat_count": 0.0, "routers_loss": 0.0057020667009055614, "skip_count": 2.0, "step": 6050, "text_loss": 0.3107164204120636 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.00042660127598299647, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 9762987.0, "repeat_count": 0.0, "routers_loss": 0.004196313209831715, "skip_count": 2.0, "step": 6052, "text_loss": 0.3073577582836151 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.00042629512667280135, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 9765828.0, "repeat_count": 0.0, "routers_loss": 0.0023119752295315266, "skip_count": 1.0, "step": 6054, "text_loss": 0.8228643536567688 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0004259890056064527, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 9769129.0, "repeat_count": 0.0, "routers_loss": 0.0021007524337619543, "skip_count": 1.0, "step": 6056, "text_loss": 0.8334706425666809 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.4414440857059, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0004256829129012568, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 9771821.0, "repeat_count": 1.0, "routers_loss": 0.00671970471739769, "skip_count": 2.0, "step": 6058, "text_loss": 0.17845536768436432 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00042537684867450875, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 9774566.0, "repeat_count": 0.0, "routers_loss": 0.0014770646812394261, "skip_count": 0.0, "step": 6060, "text_loss": 0.4445459246635437 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 28.46022894041679, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.00042507081304349315, "loss": 0.0067, "macro_f1": 0.5492662787437439, "num_tokens": 9777909.0, "repeat_count": 2.0, "routers_loss": 0.014822427183389664, "skip_count": 0.0, "step": 6062, "text_loss": 0.45526158809661865 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0004247648061254833, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 9781159.0, "repeat_count": 0.0, "routers_loss": 0.00568385748192668, "skip_count": 1.0, "step": 6064, "text_loss": 0.18535588681697845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.479013795127678, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.00042445882803774173, "loss": 0.0046, "macro_f1": 0.3272727429866791, "num_tokens": 9784960.0, "repeat_count": 1.0, "routers_loss": 0.0179694052785635, "skip_count": 0.0, "step": 6066, "text_loss": 0.23591181635856628 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.488406222483125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 0.00042415287889751966, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 9787941.0, "repeat_count": 0.0, "routers_loss": 0.0019039154285565019, "skip_count": 0.0, "step": 6068, "text_loss": 0.9447930455207825 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0004238469588220575, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 9791096.0, "repeat_count": 0.0, "routers_loss": 0.004039563238620758, "skip_count": 0.0, "step": 6070, "text_loss": 0.3134256601333618 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0286865234375, "learning_rate": 0.00042354106792858446, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 9794082.0, "repeat_count": 0.0, "routers_loss": 0.0018352365586906672, "skip_count": 0.0, "step": 6072, "text_loss": 0.5681536197662354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.516583504549455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0250244140625, "learning_rate": 0.00042323520633431833, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 9797303.0, "repeat_count": 0.0, "routers_loss": 0.0019325513858348131, "skip_count": 0.0, "step": 6074, "text_loss": 0.2835809290409088 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0206298828125, "learning_rate": 0.00042292937415646574, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 9800435.0, "repeat_count": 0.0, "routers_loss": 0.002513401210308075, "skip_count": 0.0, "step": 6076, "text_loss": 0.1931663602590561 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.00042262357151222265, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 9803873.0, "repeat_count": 0.0, "routers_loss": 0.004864581860601902, "skip_count": 0.0, "step": 6078, "text_loss": 0.25809767842292786 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 28.54476078661579, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0004223177985187728, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9806438.0, "repeat_count": 1.0, "routers_loss": 0.004932792857289314, "skip_count": 0.0, "step": 6080, "text_loss": 0.6409249305725098 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00042201205529328925, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 9809400.0, "repeat_count": 0.0, "routers_loss": 0.00590938376262784, "skip_count": 1.0, "step": 6082, "text_loss": 0.31158050894737244 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.00042170634195293314, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9813246.0, "repeat_count": 0.0, "routers_loss": 0.006805860437452793, "skip_count": 0.0, "step": 6084, "text_loss": 0.32945963740348816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0004214006586148545, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 9816513.0, "repeat_count": 0.0, "routers_loss": 0.0010186503641307354, "skip_count": 0.0, "step": 6086, "text_loss": 0.48659923672676086 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0308837890625, "learning_rate": 0.0004210950053961917, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 9819908.0, "repeat_count": 0.0, "routers_loss": 0.00402973173186183, "skip_count": 1.0, "step": 6088, "text_loss": 0.6249601244926453 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02392578125, "learning_rate": 0.00042078938241407174, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 9822950.0, "repeat_count": 0.0, "routers_loss": 0.00236532068811357, "skip_count": 1.0, "step": 6090, "text_loss": 0.26589256525039673 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.601115350748458, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.038818359375, "learning_rate": 0.0004204837897856098, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 9826493.0, "repeat_count": 1.0, "routers_loss": 0.003072192659601569, "skip_count": 2.0, "step": 6092, "text_loss": 0.5216912627220154 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.610507778103905, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0220947265625, "learning_rate": 0.0004201782276279096, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 9829698.0, "repeat_count": 0.0, "routers_loss": 0.0027553171385079622, "skip_count": 1.0, "step": 6094, "text_loss": 0.40127676725387573 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.61990020545935, "f1_execute": 0.9756097793579102, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, "grad_norm": 0.0206298828125, "learning_rate": 0.00041987269605806325, "loss": 0.0045, "macro_f1": 0.9442509412765503, "num_tokens": 9833719.0, "repeat_count": 4.0, "routers_loss": 0.013845407404005527, "skip_count": 4.0, "step": 6096, "text_loss": 0.23114071786403656 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.629292632814792, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0004195671951931509, "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 9838235.0, "repeat_count": 0.0, "routers_loss": 0.0019887303933501244, "skip_count": 2.0, "step": 6098, "text_loss": 0.7467341423034668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0004192617251502409, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 9840867.0, "repeat_count": 0.0, "routers_loss": 0.0007213905337266624, "skip_count": 0.0, "step": 6100, "text_loss": 0.6283472180366516 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.00041895628604639036, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 9843827.0, "repeat_count": 0.0, "routers_loss": 0.003863139310851693, "skip_count": 1.0, "step": 6102, "text_loss": 0.3602744936943054 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0211181640625, "learning_rate": 0.00041865087799864374, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 9846939.0, "repeat_count": 0.0, "routers_loss": 0.0013336286647245288, "skip_count": 0.0, "step": 6104, "text_loss": 0.4182434678077698 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.666862342236573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0004183455011240341, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 9849827.0, "repeat_count": 0.0, "routers_loss": 0.00038455065805464983, "skip_count": 0.0, "step": 6106, "text_loss": 0.7122722864151001 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 28.676254769592017, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0004180401555395826, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 9853487.0, "repeat_count": 3.0, "routers_loss": 0.0038226440083235502, "skip_count": 1.0, "step": 6108, "text_loss": 0.2521185576915741 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0004177348413622981, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 9856321.0, "repeat_count": 0.0, "routers_loss": 0.0015809801407158375, "skip_count": 0.0, "step": 6110, "text_loss": 0.423979252576828 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0004174295587091776, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 9859238.0, "repeat_count": 0.0, "routers_loss": 0.0007586454739794135, "skip_count": 0.0, "step": 6112, "text_loss": 0.4720100462436676 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.70443205165835, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.00041712430769720593, "loss": 0.0091, "macro_f1": 1.0, "num_tokens": 9862282.0, "repeat_count": 1.0, "routers_loss": 0.0045816488564014435, "skip_count": 1.0, "step": 6114, "text_loss": 0.279577374458313 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.713824479013795, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0004168190884433559, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 9865394.0, "repeat_count": 1.0, "routers_loss": 0.004728195257484913, "skip_count": 1.0, "step": 6116, "text_loss": 0.3826395571231842 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 28.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0179443359375, "learning_rate": 0.0004165139010645881, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 9869165.0, "repeat_count": 0.0, "routers_loss": 0.006160226184874773, "skip_count": 3.0, "step": 6118, "text_loss": 0.4668935537338257 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 24.0, "epoch": 28.732609333724685, "f1_execute": 0.9767441749572754, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.04736328125, "learning_rate": 0.0004162087456778509, "loss": 0.0074, "macro_f1": 0.9619450569152832, "num_tokens": 9872381.0, "repeat_count": 1.0, "routers_loss": 0.027831824496388435, "skip_count": 6.0, "step": 6120, "text_loss": 0.28708913922309875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0004159036224000804, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9875668.0, "repeat_count": 0.0, "routers_loss": 0.0030764432158321142, "skip_count": 1.0, "step": 6122, "text_loss": 0.37078607082366943 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.751394188435572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0004155985313482002, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 9878533.0, "repeat_count": 0.0, "routers_loss": 0.00043521137558855116, "skip_count": 0.0, "step": 6124, "text_loss": 0.34975379705429077 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.00041529347263912224, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 9881478.0, "repeat_count": 0.0, "routers_loss": 0.0016251741908490658, "skip_count": 0.0, "step": 6126, "text_loss": 0.39166271686553955 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.770179043146463, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00041498844638974535, "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 9884252.0, "repeat_count": 1.0, "routers_loss": 0.019553523510694504, "skip_count": 0.0, "step": 6128, "text_loss": 0.2309480905532837 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 28.779571470501907, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0004146834527169562, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 9887485.0, "repeat_count": 1.0, "routers_loss": 0.0036251386627554893, "skip_count": 0.0, "step": 6130, "text_loss": 0.4464457631111145 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.788963897857354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.00041437849173762894, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9890711.0, "repeat_count": 0.0, "routers_loss": 0.0008515548543073237, "skip_count": 0.0, "step": 6132, "text_loss": 0.5012133717536926 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 28.798356325212797, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0004140735635686251, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 9894458.0, "repeat_count": 1.0, "routers_loss": 0.001084602321498096, "skip_count": 0.0, "step": 6134, "text_loss": 0.32015663385391235 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0004137686683267938, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 9897634.0, "repeat_count": 0.0, "routers_loss": 0.0025203595869243145, "skip_count": 0.0, "step": 6136, "text_loss": 0.15804508328437805 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.817141179923688, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0004134638061289715, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 9901157.0, "repeat_count": 0.0, "routers_loss": 0.0029381231870502234, "skip_count": 0.0, "step": 6138, "text_loss": 0.14375236630439758 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0004131589770919819, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 9903958.0, "repeat_count": 0.0, "routers_loss": 0.002789110178127885, "skip_count": 0.0, "step": 6140, "text_loss": 0.2474033683538437 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.835926034634575, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0004128541813326361, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 9906799.0, "repeat_count": 2.0, "routers_loss": 0.010770512744784355, "skip_count": 3.0, "step": 6142, "text_loss": 0.2304249256849289 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 28.845318461990022, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0004125494189677325, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 9909286.0, "repeat_count": 1.0, "routers_loss": 0.003122122259810567, "skip_count": 0.0, "step": 6144, "text_loss": 0.3781827688217163 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.854710889345466, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.049072265625, "learning_rate": 0.00041224469011405643, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 9912416.0, "repeat_count": 1.0, "routers_loss": 0.008443298749625683, "skip_count": 1.0, "step": 6146, "text_loss": 0.3004767596721649 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0004119399948883806, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 9915290.0, "repeat_count": 0.0, "routers_loss": 0.0033219947945326567, "skip_count": 1.0, "step": 6148, "text_loss": 0.748744547367096 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 28.873495744056356, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0250244140625, "learning_rate": 0.0004116353334074647, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 9918493.0, "repeat_count": 1.0, "routers_loss": 0.005501769948750734, "skip_count": 0.0, "step": 6150, "text_loss": 0.330759733915329 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.000411330705788056, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 9921027.0, "repeat_count": 0.0, "routers_loss": 0.0013694261433556676, "skip_count": 0.0, "step": 6152, "text_loss": 0.43070924282073975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0164794921875, "learning_rate": 0.000411026112146888, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 9924303.0, "repeat_count": 0.0, "routers_loss": 0.00046192589798010886, "skip_count": 0.0, "step": 6154, "text_loss": 0.5674887895584106 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 28.901673026122687, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0004107215526006817, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9927065.0, "repeat_count": 1.0, "routers_loss": 0.004311304073780775, "skip_count": 0.0, "step": 6156, "text_loss": 0.16138267517089844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0004104170272661449, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 9930713.0, "repeat_count": 0.0, "routers_loss": 0.0035845425445586443, "skip_count": 0.0, "step": 6158, "text_loss": 0.18728356063365936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.00041011253625997227, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 9934393.0, "repeat_count": 0.0, "routers_loss": 0.00247366214171052, "skip_count": 0.0, "step": 6160, "text_loss": 0.3624019920825958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0004098080796988452, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 9937457.0, "repeat_count": 0.0, "routers_loss": 0.003240241203457117, "skip_count": 0.0, "step": 6162, "text_loss": 0.12348521500825882 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 0.0004095036576994321, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 9940523.0, "repeat_count": 0.0, "routers_loss": 0.001985874492675066, "skip_count": 1.0, "step": 6164, "text_loss": 0.2688066363334656 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 28.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.00040919927037838815, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9943802.0, "repeat_count": 0.0, "routers_loss": 0.004264154937118292, "skip_count": 3.0, "step": 6166, "text_loss": 0.49316367506980896 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0556640625, "learning_rate": 0.00040889491785235513, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 9946649.0, "repeat_count": 0.0, "routers_loss": 0.002545441733673215, "skip_count": 0.0, "step": 6168, "text_loss": 0.4079313576221466 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.967420017610802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0004085906002379614, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 9949800.0, "repeat_count": 0.0, "routers_loss": 0.0009590961271896958, "skip_count": 0.0, "step": 6170, "text_loss": 0.6166561245918274 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0004082863176518221, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9954008.0, "repeat_count": 0.0, "routers_loss": 0.003795337164774537, "skip_count": 2.0, "step": 6172, "text_loss": 0.4791361689567566 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044921875, "learning_rate": 0.0004079820702105388, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 9957153.0, "repeat_count": 0.0, "routers_loss": 0.0015634822193533182, "skip_count": 0.0, "step": 6174, "text_loss": 0.7208777666091919 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.995597299677137, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.0004076778580306999, "loss": 0.0056, "macro_f1": 0.8820862174034119, "num_tokens": 9960060.0, "repeat_count": 2.0, "routers_loss": 0.03223998099565506, "skip_count": 2.0, "step": 6176, "text_loss": 0.6617992520332336 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.00040737368122887983, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 9963396.0, "repeat_count": 0.0, "routers_loss": 0.0033978577703237534, "skip_count": 0.0, "step": 6178, "text_loss": 0.7339215278625488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00040706953992164, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 9966364.0, "repeat_count": 0.0, "routers_loss": 0.0005358994239941239, "skip_count": 0.0, "step": 6180, "text_loss": 0.44187214970588684 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00040676543422552767, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 9969813.0, "repeat_count": 0.0, "routers_loss": 0.0018544091144576669, "skip_count": 1.0, "step": 6182, "text_loss": 0.6244927048683167 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0004064613642570769, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 9973015.0, "repeat_count": 0.0, "routers_loss": 0.005692692007869482, "skip_count": 0.0, "step": 6184, "text_loss": 0.18860043585300446 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 0.00040615733013280784, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 9976201.0, "repeat_count": 0.0, "routers_loss": 0.0018737476784735918, "skip_count": 0.0, "step": 6186, "text_loss": 0.21189232170581818 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.00040585333196922687, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 9979711.0, "repeat_count": 0.0, "routers_loss": 0.011945146135985851, "skip_count": 2.0, "step": 6188, "text_loss": 0.2628154456615448 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.00040554936988282663, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9983003.0, "repeat_count": 0.0, "routers_loss": 0.0036045778542757034, "skip_count": 1.0, "step": 6190, "text_loss": 0.5926038026809692 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.070443205165834, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.0004052454439900861, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 9986841.0, "repeat_count": 0.0, "routers_loss": 0.004170368425548077, "skip_count": 0.0, "step": 6192, "text_loss": 0.3088737726211548 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.00040494155440747015, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 9989596.0, "repeat_count": 0.0, "routers_loss": 0.002254750579595566, "skip_count": 2.0, "step": 6194, "text_loss": 0.6309700012207031 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 29.089228059876724, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.00040463770125142987, "loss": 0.0087, "macro_f1": 0.8814815282821655, "num_tokens": 9992789.0, "repeat_count": 2.0, "routers_loss": 0.04092822223901749, "skip_count": 4.0, "step": 6196, "text_loss": 0.09625697880983353 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.00040433388463840213, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 9995782.0, "repeat_count": 0.0, "routers_loss": 0.00029065192211419344, "skip_count": 0.0, "step": 6198, "text_loss": 0.5600258111953735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.0004040301046848105, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 9998712.0, "repeat_count": 0.0, "routers_loss": 0.0005865268758498132, "skip_count": 0.0, "step": 6200, "text_loss": 0.6426429748535156 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 29.11740534194306, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0283203125, "learning_rate": 0.0004037263615070638, "loss": 0.0078, "macro_f1": 0.9265305995941162, "num_tokens": 10002020.0, "repeat_count": 1.0, "routers_loss": 0.025357060134410858, "skip_count": 3.0, "step": 6202, "text_loss": 0.25125735998153687 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.000403422655221557, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 10005381.0, "repeat_count": 0.0, "routers_loss": 0.003139561740681529, "skip_count": 1.0, "step": 6204, "text_loss": 0.3639419376850128 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.00040311898594467085, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 10008348.0, "repeat_count": 0.0, "routers_loss": 0.004091196693480015, "skip_count": 2.0, "step": 6206, "text_loss": 0.1602363884449005 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00040281535379277204, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 10011171.0, "repeat_count": 0.0, "routers_loss": 0.005771483760327101, "skip_count": 0.0, "step": 6208, "text_loss": 0.5593504905700684 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 0.000402511758882213, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 10014374.0, "repeat_count": 0.0, "routers_loss": 0.005212264601141214, "skip_count": 1.0, "step": 6210, "text_loss": 0.15668229758739471 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0004022082013293319, "loss": 0.0032, "macro_f1": 0.6666666865348816, "num_tokens": 10017327.0, "repeat_count": 0.0, "routers_loss": 0.0027585842180997133, "skip_count": 1.0, "step": 6212, "text_loss": 0.21188466250896454 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.173759906075727, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.00040190468125045255, "loss": 0.0061, "macro_f1": 0.3272727429866791, "num_tokens": 10020518.0, "repeat_count": 0.0, "routers_loss": 0.013210589066147804, "skip_count": 1.0, "step": 6214, "text_loss": 0.2551073729991913 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 29.18315233343117, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.01708984375, "learning_rate": 0.00040160119876188436, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 10023799.0, "repeat_count": 1.0, "routers_loss": 0.001590219559147954, "skip_count": 0.0, "step": 6216, "text_loss": 0.5634782314300537 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0004012977539799224, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 10027107.0, "repeat_count": 0.0, "routers_loss": 0.003917343448847532, "skip_count": 0.0, "step": 6218, "text_loss": 0.6412819027900696 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0205078125, "learning_rate": 0.0004009943470208473, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 10030460.0, "repeat_count": 0.0, "routers_loss": 0.00874288845807314, "skip_count": 2.0, "step": 6220, "text_loss": 0.13269923627376556 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.211329615497505, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.000400690978000925, "loss": 0.0075, "macro_f1": 0.8817967176437378, "num_tokens": 10034086.0, "repeat_count": 2.0, "routers_loss": 0.03736349940299988, "skip_count": 3.0, "step": 6222, "text_loss": 0.4956454336643219 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.0004003876470364075, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 10037312.0, "repeat_count": 0.0, "routers_loss": 0.008481289260089397, "skip_count": 2.0, "step": 6224, "text_loss": 0.2148810178041458 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0152587890625, "learning_rate": 0.0004000843542435315, "loss": 0.0028, "macro_f1": 0.3333333432674408, "num_tokens": 10040393.0, "repeat_count": 0.0, "routers_loss": 0.002235144842416048, "skip_count": 0.0, "step": 6226, "text_loss": 0.17645306885242462 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 29.23950689756384, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0003997810997385195, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 10044386.0, "repeat_count": 1.0, "routers_loss": 0.004541373811662197, "skip_count": 0.0, "step": 6228, "text_loss": 0.5098661184310913 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.00039947788363757915, "loss": 0.0088, "macro_f1": 0.6666666865348816, "num_tokens": 10049046.0, "repeat_count": 0.0, "routers_loss": 0.0019183673430234194, "skip_count": 1.0, "step": 6230, "text_loss": 0.6953724026679993 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.25829175227473, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0291748046875, "learning_rate": 0.00039917470605690334, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 10051787.0, "repeat_count": 2.0, "routers_loss": 0.0032311067916452885, "skip_count": 4.0, "step": 6232, "text_loss": 0.475127637386322 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 29.267684179630173, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.00039887156711267043, "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 10055396.0, "repeat_count": 2.0, "routers_loss": 0.03247373178601265, "skip_count": 0.0, "step": 6234, "text_loss": 0.4239100515842438 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 29.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0230712890625, "learning_rate": 0.00039856846692104363, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 10058395.0, "repeat_count": 0.0, "routers_loss": 0.006287421099841595, "skip_count": 3.0, "step": 6236, "text_loss": 0.24084535241127014 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 29.286469034341064, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.016357421875, "learning_rate": 0.0003982654055981718, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 10061302.0, "repeat_count": 1.0, "routers_loss": 0.0008686117362231016, "skip_count": 1.0, "step": 6238, "text_loss": 0.4740419089794159 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0003979623832601884, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 10065318.0, "repeat_count": 0.0, "routers_loss": 0.0037686119321733713, "skip_count": 2.0, "step": 6240, "text_loss": 0.43965795636177063 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.0003976594000232123, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 10068291.0, "repeat_count": 0.0, "routers_loss": 0.005804901942610741, "skip_count": 0.0, "step": 6242, "text_loss": 0.24424348771572113 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.314646316407398, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.00039735645600334714, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 10071645.0, "repeat_count": 0.0, "routers_loss": 0.002001055981963873, "skip_count": 1.0, "step": 6244, "text_loss": 0.6524377465248108 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0003970535513166815, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 10075136.0, "repeat_count": 0.0, "routers_loss": 0.001252001617103815, "skip_count": 0.0, "step": 6246, "text_loss": 0.22803714871406555 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0003967506860792893, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 10078230.0, "repeat_count": 0.0, "routers_loss": 0.004913780372589827, "skip_count": 1.0, "step": 6248, "text_loss": 0.9835516214370728 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 0.000396447860407229, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 10080852.0, "repeat_count": 0.0, "routers_loss": 0.0037437966093420982, "skip_count": 2.0, "step": 6250, "text_loss": 0.4021640121936798 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.00039614507441654393, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 10084139.0, "repeat_count": 0.0, "routers_loss": 0.005433002021163702, "skip_count": 2.0, "step": 6252, "text_loss": 0.23060470819473267 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.00039584232822326224, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 10088501.0, "repeat_count": 0.0, "routers_loss": 0.0007705377647653222, "skip_count": 0.0, "step": 6254, "text_loss": 0.5994830131530762 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0003955396219433969, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 10091506.0, "repeat_count": 0.0, "routers_loss": 0.0012310115853324533, "skip_count": 0.0, "step": 6256, "text_loss": 0.4639038145542145 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0224609375, "learning_rate": 0.0003952369556929455, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 10096236.0, "repeat_count": 0.0, "routers_loss": 0.008964627049863338, "skip_count": 2.0, "step": 6258, "text_loss": 0.24845287203788757 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0003949343295878903, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 10099213.0, "repeat_count": 0.0, "routers_loss": 0.0033088945783674717, "skip_count": 0.0, "step": 6260, "text_loss": 0.6527073979377747 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 29.399178162606397, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 0.00039463174374419817, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 10103160.0, "repeat_count": 2.0, "routers_loss": 0.003462672932073474, "skip_count": 1.0, "step": 6262, "text_loss": 0.4209299683570862 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 29.408570589961844, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00039432919827782066, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 10105881.0, "repeat_count": 2.0, "routers_loss": 0.0027124532498419285, "skip_count": 2.0, "step": 6264, "text_loss": 0.4442266821861267 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0172119140625, "learning_rate": 0.00039402669330469367, "loss": 0.0032, "macro_f1": 0.6666666865348816, "num_tokens": 10108596.0, "repeat_count": 0.0, "routers_loss": 0.005055282264947891, "skip_count": 2.0, "step": 6266, "text_loss": 0.3331456780433655 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 0.00039372422894073765, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 10111673.0, "repeat_count": 0.0, "routers_loss": 0.0009340311517007649, "skip_count": 0.0, "step": 6268, "text_loss": 0.7664456367492676 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.00039342180530185745, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 10116141.0, "repeat_count": 0.0, "routers_loss": 0.00032052272581495345, "skip_count": 0.0, "step": 6270, "text_loss": 0.47610244154930115 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00039311942250394274, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 10119151.0, "repeat_count": 0.0, "routers_loss": 0.0015820999396964908, "skip_count": 0.0, "step": 6272, "text_loss": 0.3815282881259918 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0201416015625, "learning_rate": 0.0003928170806628669, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 10122684.0, "repeat_count": 0.0, "routers_loss": 0.0007423736387863755, "skip_count": 0.0, "step": 6274, "text_loss": 0.4630914628505707 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.00039251477989448797, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 10126751.0, "repeat_count": 0.0, "routers_loss": 0.0006216703332029283, "skip_count": 0.0, "step": 6276, "text_loss": 0.4342454671859741 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 29.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045654296875, "learning_rate": 0.00039221252031464816, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 10129784.0, "repeat_count": 0.0, "routers_loss": 0.004239698871970177, "skip_count": 3.0, "step": 6278, "text_loss": 0.24661089479923248 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 29.4837100088054, "f1_execute": 0.9743589162826538, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.0003919103020391738, "loss": 0.006, "macro_f1": 0.8803418874740601, "num_tokens": 10133066.0, "repeat_count": 2.0, "routers_loss": 0.027879100292921066, "skip_count": 7.0, "step": 6280, "text_loss": 0.4705188274383545 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.00039160812518387574, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 10136860.0, "repeat_count": 0.0, "routers_loss": 0.002533538034185767, "skip_count": 0.0, "step": 6282, "text_loss": 0.1953880786895752 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.50249486351629, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00039130598986454845, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 10140066.0, "repeat_count": 1.0, "routers_loss": 0.002462630858644843, "skip_count": 2.0, "step": 6284, "text_loss": 0.378487765789032 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 29.511887290871734, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 0.000391003896196971, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 10143646.0, "repeat_count": 1.0, "routers_loss": 0.011922914534807205, "skip_count": 1.0, "step": 6286, "text_loss": 0.2467316836118698 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 29.52127971822718, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0308837890625, "learning_rate": 0.00039070184429690607, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 10146507.0, "repeat_count": 1.0, "routers_loss": 0.0059767309576272964, "skip_count": 1.0, "step": 6288, "text_loss": 0.9603674411773682 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.530672145582624, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.0003903998342801006, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 10149301.0, "repeat_count": 1.0, "routers_loss": 0.0030056277755647898, "skip_count": 2.0, "step": 6290, "text_loss": 0.36631715297698975 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 29.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.00039009786626228543, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 10152158.0, "repeat_count": 0.0, "routers_loss": 0.005298118572682142, "skip_count": 3.0, "step": 6292, "text_loss": 0.2876455783843994 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0003897959403591751, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 10155852.0, "repeat_count": 0.0, "routers_loss": 0.004937763791531324, "skip_count": 2.0, "step": 6294, "text_loss": 0.14649681746959686 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0003894940566864683, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 10159164.0, "repeat_count": 0.0, "routers_loss": 0.0021474575623869896, "skip_count": 0.0, "step": 6296, "text_loss": 0.5694304704666138 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 29.568241855004402, "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.08251953125, "learning_rate": 0.00038919221535984753, "loss": 0.0073, "macro_f1": 0.875, "num_tokens": 10161806.0, "repeat_count": 1.0, "routers_loss": 0.040340203791856766, "skip_count": 3.0, "step": 6298, "text_loss": 0.1574537754058838 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.00038889041649497894, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 10165669.0, "repeat_count": 0.0, "routers_loss": 0.0028486931696534157, "skip_count": 0.0, "step": 6300, "text_loss": 0.9158071279525757 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0003885886602075123, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 10168945.0, "repeat_count": 0.0, "routers_loss": 0.006565484683960676, "skip_count": 2.0, "step": 6302, "text_loss": 0.3530846834182739 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.00038828694661308116, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 10171914.0, "repeat_count": 0.0, "routers_loss": 0.0009084723424166441, "skip_count": 0.0, "step": 6304, "text_loss": 0.4603337347507477 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.60581156442618, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0003879852758273029, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 10175737.0, "repeat_count": 1.0, "routers_loss": 0.004121702630072832, "skip_count": 2.0, "step": 6306, "text_loss": 0.5294032096862793 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00038768364796577814, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 10178543.0, "repeat_count": 0.0, "routers_loss": 0.0013208909658715129, "skip_count": 0.0, "step": 6308, "text_loss": 0.41084006428718567 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 29.62459641913707, "f1_execute": 0.9743589162826538, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00038738206314409144, "loss": 0.0079, "macro_f1": 0.9247862696647644, "num_tokens": 10181880.0, "repeat_count": 3.0, "routers_loss": 0.03674180060625076, "skip_count": 6.0, "step": 6310, "text_loss": 0.6920746564865112 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0003870805214778106, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 10185173.0, "repeat_count": 0.0, "routers_loss": 0.00221974472515285, "skip_count": 2.0, "step": 6312, "text_loss": 0.1376657634973526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.0003867790230824869, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 10188642.0, "repeat_count": 0.0, "routers_loss": 0.001809283159673214, "skip_count": 0.0, "step": 6314, "text_loss": 0.5220870971679688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0003864775680736552, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 10191750.0, "repeat_count": 0.0, "routers_loss": 0.0013956360053271055, "skip_count": 0.0, "step": 6316, "text_loss": 0.4109838902950287 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0257568359375, "learning_rate": 0.00038617615656683356, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 10194578.0, "repeat_count": 0.0, "routers_loss": 0.002947692759335041, "skip_count": 2.0, "step": 6318, "text_loss": 0.4818590581417084 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0003858747886775232, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 10197131.0, "repeat_count": 0.0, "routers_loss": 0.0008140999125316739, "skip_count": 2.0, "step": 6320, "text_loss": 0.4004709720611572 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.68095098326974, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.04638671875, "learning_rate": 0.0003855734645212093, "loss": 0.0089, "macro_f1": 0.8820862174034119, "num_tokens": 10199965.0, "repeat_count": 2.0, "routers_loss": 0.013056626543402672, "skip_count": 2.0, "step": 6322, "text_loss": 0.3367139995098114 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.690343410625182, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.00038527218421335977, "loss": 0.0087, "macro_f1": 1.0, "num_tokens": 10203184.0, "repeat_count": 1.0, "routers_loss": 0.0038112467154860497, "skip_count": 2.0, "step": 6324, "text_loss": 0.5747989416122437 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0003849709478694255, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 10206436.0, "repeat_count": 0.0, "routers_loss": 0.001232540002092719, "skip_count": 0.0, "step": 6326, "text_loss": 0.4981732964515686 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.00038466975560484115, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 10209889.0, "repeat_count": 0.0, "routers_loss": 0.004343799781054258, "skip_count": 0.0, "step": 6328, "text_loss": 0.2160186469554901 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0201416015625, "learning_rate": 0.000384368607535024, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 10212520.0, "repeat_count": 0.0, "routers_loss": 0.0014161963481456041, "skip_count": 1.0, "step": 6330, "text_loss": 0.3556232154369354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0185546875, "learning_rate": 0.0003840675037753745, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 10215456.0, "repeat_count": 0.0, "routers_loss": 0.0014989010524004698, "skip_count": 0.0, "step": 6332, "text_loss": 0.8510926961898804 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 0.0003837664444412762, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 10218558.0, "repeat_count": 0.0, "routers_loss": 0.006702739745378494, "skip_count": 0.0, "step": 6334, "text_loss": 0.3995226323604584 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0003834654296480958, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 10221862.0, "repeat_count": 0.0, "routers_loss": 0.00826781615614891, "skip_count": 2.0, "step": 6336, "text_loss": 0.3534671664237976 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0003831644595111825, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 10224820.0, "repeat_count": 0.0, "routers_loss": 0.002143894787877798, "skip_count": 0.0, "step": 6338, "text_loss": 0.20216144621372223 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 29.76548282946874, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.04736328125, "learning_rate": 0.0003828635341458687, "loss": 0.0064, "macro_f1": 0.5492662787437439, "num_tokens": 10227479.0, "repeat_count": 0.0, "routers_loss": 0.012319118715822697, "skip_count": 2.0, "step": 6340, "text_loss": 0.26248639822006226 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.774875256824185, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 0.0003825626536674697, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 10231347.0, "repeat_count": 0.0, "routers_loss": 0.00334449321962893, "skip_count": 0.0, "step": 6342, "text_loss": 0.6357201337814331 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.000382261818191283, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 10234347.0, "repeat_count": 0.0, "routers_loss": 0.0027788348961621523, "skip_count": 0.0, "step": 6344, "text_loss": 0.2813846468925476 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.00038196102783258996, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 10237105.0, "repeat_count": 0.0, "routers_loss": 0.001545077539049089, "skip_count": 0.0, "step": 6346, "text_loss": 0.47612661123275757 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.060791015625, "learning_rate": 0.0003816602827066537, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 10240249.0, "repeat_count": 0.0, "routers_loss": 0.005602670833468437, "skip_count": 2.0, "step": 6348, "text_loss": 0.18197228014469147 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0003813595829287204, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 10243417.0, "repeat_count": 0.0, "routers_loss": 0.0004317959537729621, "skip_count": 0.0, "step": 6350, "text_loss": 0.3818575143814087 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0213623046875, "learning_rate": 0.0003810589286140186, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 10246824.0, "repeat_count": 0.0, "routers_loss": 0.002225276781246066, "skip_count": 0.0, "step": 6352, "text_loss": 0.14129821956157684 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 29.831229820956853, "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.0003807583198777599, "loss": 0.0062, "macro_f1": 0.9265305995941162, "num_tokens": 10249836.0, "repeat_count": 3.0, "routers_loss": 0.02445496805012226, "skip_count": 1.0, "step": 6354, "text_loss": 0.3237064480781555 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.840622248312297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.00038045775683513786, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 10252900.0, "repeat_count": 0.0, "routers_loss": 0.0009264222462661564, "skip_count": 0.0, "step": 6356, "text_loss": 0.6777551174163818 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 29.850014675667744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0003801572396013289, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 10255526.0, "repeat_count": 1.0, "routers_loss": 0.007189550437033176, "skip_count": 5.0, "step": 6358, "text_loss": 0.25438982248306274 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.00037985676829149187, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 10258865.0, "repeat_count": 0.0, "routers_loss": 0.0014201018493622541, "skip_count": 0.0, "step": 6360, "text_loss": 0.5063154101371765 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 29.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0003795563430207678, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 10261677.0, "repeat_count": 0.0, "routers_loss": 0.0035477925557643175, "skip_count": 3.0, "step": 6362, "text_loss": 0.4815357029438019 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.878191957734078, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0205078125, "learning_rate": 0.0003792559639042803, "loss": 0.0049, "macro_f1": 0.3272727429866791, "num_tokens": 10264805.0, "repeat_count": 0.0, "routers_loss": 0.013723359443247318, "skip_count": 1.0, "step": 6364, "text_loss": 0.5563676357269287 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0003789556310571351, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 10267885.0, "repeat_count": 0.0, "routers_loss": 0.0028159532230347395, "skip_count": 0.0, "step": 6366, "text_loss": 0.7284183502197266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0003786553445944204, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 10270934.0, "repeat_count": 0.0, "routers_loss": 0.0005918835522606969, "skip_count": 0.0, "step": 6368, "text_loss": 0.7387746572494507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.906369239800412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0003783551046312067, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 10273818.0, "repeat_count": 0.0, "routers_loss": 0.0011416864581406116, "skip_count": 0.0, "step": 6370, "text_loss": 0.5360285043716431 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 29.915761667155856, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.00037805491128254645, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 10276494.0, "repeat_count": 2.0, "routers_loss": 0.002382483799010515, "skip_count": 1.0, "step": 6372, "text_loss": 0.7536854147911072 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03564453125, "learning_rate": 0.00037775476466347414, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 10279719.0, "repeat_count": 0.0, "routers_loss": 0.0021104486659169197, "skip_count": 1.0, "step": 6374, "text_loss": 0.6807253956794739 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.934546521866746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.0003774546648890066, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 10283000.0, "repeat_count": 0.0, "routers_loss": 0.003148776013404131, "skip_count": 2.0, "step": 6376, "text_loss": 0.30774110555648804 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 29.94393894922219, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.0003771546120741426, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 10285666.0, "repeat_count": 1.0, "routers_loss": 0.007700880523771048, "skip_count": 1.0, "step": 6378, "text_loss": 0.4476076364517212 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 29.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.0003768546063338631, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 10289127.0, "repeat_count": 0.0, "routers_loss": 0.0023625255562365055, "skip_count": 1.0, "step": 6380, "text_loss": 0.4350969195365906 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0179443359375, "learning_rate": 0.0003765546477831307, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 10292485.0, "repeat_count": 0.0, "routers_loss": 0.001428726245649159, "skip_count": 0.0, "step": 6382, "text_loss": 0.49078530073165894 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 29.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0003762547365368902, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 10295361.0, "repeat_count": 0.0, "routers_loss": 0.0027160397730767727, "skip_count": 2.0, "step": 6384, "text_loss": 0.3476370573043823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 29.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.00037595487271006807, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 10298717.0, "repeat_count": 0.0, "routers_loss": 0.002456068294122815, "skip_count": 0.0, "step": 6386, "text_loss": 0.3634916841983795 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 29.99090108599941, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.021240234375, "learning_rate": 0.0003756550564175727, "loss": 0.0049, "macro_f1": 0.9265305995941162, "num_tokens": 10302102.0, "repeat_count": 1.0, "routers_loss": 0.02546076290309429, "skip_count": 3.0, "step": 6388, "text_loss": 0.2422582060098648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.00037535528777429426, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 10305060.0, "repeat_count": 0.0, "routers_loss": 0.001045907847583294, "skip_count": 0.0, "step": 6390, "text_loss": 0.5563194155693054 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.009392427355444, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0003750555668951045, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 10307903.0, "repeat_count": 1.0, "routers_loss": 0.007391332648694515, "skip_count": 2.0, "step": 6392, "text_loss": 0.3423991799354553 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 30.01878485471089, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.00037475589389485744, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 10311396.0, "repeat_count": 1.0, "routers_loss": 0.0029360291082412004, "skip_count": 1.0, "step": 6394, "text_loss": 0.9877024292945862 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.00037445626888838807, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 10314250.0, "repeat_count": 0.0, "routers_loss": 0.0014932662015780807, "skip_count": 0.0, "step": 6396, "text_loss": 0.3978523313999176 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 30.037569709421778, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0003741566919905133, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 10316894.0, "repeat_count": 1.0, "routers_loss": 0.007003722712397575, "skip_count": 5.0, "step": 6398, "text_loss": 0.2945566475391388 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 30.046962136777225, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.00037385716331603155, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 10319603.0, "repeat_count": 1.0, "routers_loss": 0.006710570305585861, "skip_count": 1.0, "step": 6400, "text_loss": 0.2984389662742615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0179443359375, "learning_rate": 0.00037355768297972275, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 10322670.0, "repeat_count": 0.0, "routers_loss": 0.00048738415353000164, "skip_count": 0.0, "step": 6402, "text_loss": 0.483262300491333 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 30.065746991488112, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0198974609375, "learning_rate": 0.00037325825109634837, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 10326280.0, "repeat_count": 1.0, "routers_loss": 0.001625525183044374, "skip_count": 1.0, "step": 6404, "text_loss": 0.42678722739219666 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.07513941884356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0003729588677806513, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 10329008.0, "repeat_count": 0.0, "routers_loss": 0.004408636130392551, "skip_count": 0.0, "step": 6406, "text_loss": 0.2264070063829422 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 0.0003726595331473557, "loss": 0.0032, "macro_f1": 0.6666666865348816, "num_tokens": 10332533.0, "repeat_count": 0.0, "routers_loss": 0.0038099216762930155, "skip_count": 2.0, "step": 6408, "text_loss": 0.6670092940330505 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 30.093924273554446, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.0003723602473111672, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 10335643.0, "repeat_count": 1.0, "routers_loss": 0.003097689710557461, "skip_count": 0.0, "step": 6410, "text_loss": 0.45228812098503113 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.00037206101038677274, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 10338522.0, "repeat_count": 0.0, "routers_loss": 0.005268602631986141, "skip_count": 1.0, "step": 6412, "text_loss": 0.7288079857826233 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0003717618224888405, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 10341516.0, "repeat_count": 0.0, "routers_loss": 0.004640138708055019, "skip_count": 2.0, "step": 6414, "text_loss": 0.22850871086120605 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.00037146268373201954, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 10344831.0, "repeat_count": 0.0, "routers_loss": 0.0006379318656399846, "skip_count": 0.0, "step": 6416, "text_loss": 0.7864460945129395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0003711635942309408, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 10348499.0, "repeat_count": 0.0, "routers_loss": 0.0004005273221991956, "skip_count": 0.0, "step": 6418, "text_loss": 0.605839192867279 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0157470703125, "learning_rate": 0.0003708645541002159, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 10351722.0, "repeat_count": 0.0, "routers_loss": 0.001061634044162929, "skip_count": 0.0, "step": 6420, "text_loss": 0.8226510286331177 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 30.150278837687114, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0284423828125, "learning_rate": 0.0003705655634544374, "loss": 0.0052, "macro_f1": 0.5492662787437439, "num_tokens": 10355275.0, "repeat_count": 0.0, "routers_loss": 0.013980664312839508, "skip_count": 2.0, "step": 6422, "text_loss": 0.2709597647190094 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 30.159671265042558, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 0.0003702666224081792, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 10359702.0, "repeat_count": 1.0, "routers_loss": 0.0013196271611377597, "skip_count": 0.0, "step": 6424, "text_loss": 0.6451483368873596 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00036996773107599604, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 10363364.0, "repeat_count": 0.0, "routers_loss": 0.0028023163322359324, "skip_count": 1.0, "step": 6426, "text_loss": 0.2770799398422241 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01373291015625, "learning_rate": 0.0003696688895724235, "loss": 0.0029, "macro_f1": 0.3333333432674408, "num_tokens": 10366554.0, "repeat_count": 0.0, "routers_loss": 0.0011023655533790588, "skip_count": 0.0, "step": 6428, "text_loss": 0.5466503500938416 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.187848547108892, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02001953125, "learning_rate": 0.0003693700980119784, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 10369733.0, "repeat_count": 0.0, "routers_loss": 0.00230707717128098, "skip_count": 0.0, "step": 6430, "text_loss": 0.45667049288749695 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.19724097446434, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.00036907135650915824, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 10373382.0, "repeat_count": 0.0, "routers_loss": 0.0036784098483622074, "skip_count": 2.0, "step": 6432, "text_loss": 0.13856995105743408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.00036877266517844115, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 10376202.0, "repeat_count": 0.0, "routers_loss": 0.0008461157558485866, "skip_count": 0.0, "step": 6434, "text_loss": 0.27238601446151733 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 0.0003684740241342863, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 10380748.0, "repeat_count": 0.0, "routers_loss": 0.0052765593864023685, "skip_count": 0.0, "step": 6436, "text_loss": 0.6182295083999634 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.225418256530673, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 0.00036817543349113355, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 10386148.0, "repeat_count": 1.0, "routers_loss": 0.005562922917306423, "skip_count": 2.0, "step": 6438, "text_loss": 0.5591027140617371 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.0003678768933634033, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 10389385.0, "repeat_count": 0.0, "routers_loss": 0.0008686366491019726, "skip_count": 0.0, "step": 6440, "text_loss": 0.5158660411834717 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 0.0003675784038654968, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 10391893.0, "repeat_count": 0.0, "routers_loss": 0.0022222092375159264, "skip_count": 1.0, "step": 6442, "text_loss": 0.2865697741508484 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 0.0003672799651117958, "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 10395082.0, "repeat_count": 0.0, "routers_loss": 0.0030799773521721363, "skip_count": 2.0, "step": 6444, "text_loss": 0.21298295259475708 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 30.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0003669815772166625, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 10398015.0, "repeat_count": 0.0, "routers_loss": 0.0035721305757761, "skip_count": 3.0, "step": 6446, "text_loss": 0.5286803841590881 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 30.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0203857421875, "learning_rate": 0.00036668324029443975, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 10400749.0, "repeat_count": 0.0, "routers_loss": 0.00741040613502264, "skip_count": 4.0, "step": 6448, "text_loss": 0.3922366201877594 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.281772820663342, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.064453125, "learning_rate": 0.0003663849544594507, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 10404439.0, "repeat_count": 0.0, "routers_loss": 0.002974750241264701, "skip_count": 2.0, "step": 6450, "text_loss": 0.21894219517707825 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0458984375, "learning_rate": 0.00036608671982599927, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 10408476.0, "repeat_count": 0.0, "routers_loss": 0.004810616374015808, "skip_count": 0.0, "step": 6452, "text_loss": 0.3928622305393219 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 30.30055767537423, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.0003657885365083694, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 10411533.0, "repeat_count": 1.0, "routers_loss": 0.005527745466679335, "skip_count": 0.0, "step": 6454, "text_loss": 0.22816279530525208 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.00036549040462082556, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 10414501.0, "repeat_count": 0.0, "routers_loss": 0.0021297158673405647, "skip_count": 0.0, "step": 6456, "text_loss": 0.20487719774246216 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 30.31934253008512, "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0003651923242776124, "loss": 0.0082, "macro_f1": 0.6592592597007751, "num_tokens": 10418296.0, "repeat_count": 1.0, "routers_loss": 0.046412210911512375, "skip_count": 5.0, "step": 6458, "text_loss": 0.2890419065952301 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0223388671875, "learning_rate": 0.00036489429559295484, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 10421211.0, "repeat_count": 0.0, "routers_loss": 0.004002603702247143, "skip_count": 0.0, "step": 6460, "text_loss": 0.23165544867515564 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0003645963186810581, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 10424231.0, "repeat_count": 0.0, "routers_loss": 0.003480088198557496, "skip_count": 1.0, "step": 6462, "text_loss": 0.6286683082580566 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0003642983936561075, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 10427387.0, "repeat_count": 0.0, "routers_loss": 0.009358933195471764, "skip_count": 2.0, "step": 6464, "text_loss": 0.3258316218852997 }, { "acc_repeat": 0.800000011920929, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.356912239506897, "f1_execute": 0.9729729890823364, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.00036400052063226816, "loss": 0.0048, "macro_f1": 0.9539539813995361, "num_tokens": 10430813.0, "repeat_count": 5.0, "routers_loss": 0.03567950055003166, "skip_count": 5.0, "step": 6466, "text_loss": 0.7278715968132019 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.36630466686234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.00036370269972368615, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 10434175.0, "repeat_count": 1.0, "routers_loss": 0.00226925453171134, "skip_count": 2.0, "step": 6468, "text_loss": 0.5652450919151306 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.375697094217788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0174560546875, "learning_rate": 0.0003634049310444867, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 10437393.0, "repeat_count": 0.0, "routers_loss": 0.0013644809368997812, "skip_count": 0.0, "step": 6470, "text_loss": 0.5985191464424133 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.38508952157323, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.0003631072147087753, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 10440412.0, "repeat_count": 0.0, "routers_loss": 0.0003114990540780127, "skip_count": 0.0, "step": 6472, "text_loss": 0.5588209629058838 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.394481948928675, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 0.00036280955083063747, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 10443471.0, "repeat_count": 0.0, "routers_loss": 0.0005486322334036231, "skip_count": 0.0, "step": 6474, "text_loss": 0.6969016194343567 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.403874376284122, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 0.00036251193952413865, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 10446548.0, "repeat_count": 1.0, "routers_loss": 0.008256378583610058, "skip_count": 2.0, "step": 6476, "text_loss": 0.27083566784858704 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0003622143809033239, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 10449478.0, "repeat_count": 0.0, "routers_loss": 0.001008771825581789, "skip_count": 0.0, "step": 6478, "text_loss": 0.1689433604478836 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 30.42265923099501, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.00036191687508221827, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 10453017.0, "repeat_count": 1.0, "routers_loss": 0.0014678959269076586, "skip_count": 0.0, "step": 6480, "text_loss": 0.9571998715400696 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0203857421875, "learning_rate": 0.0003616194221748267, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 10456061.0, "repeat_count": 0.0, "routers_loss": 0.001516164978966117, "skip_count": 0.0, "step": 6482, "text_loss": 0.5750429034233093 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0003613220222951335, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 10459130.0, "repeat_count": 0.0, "routers_loss": 0.0031315975356847048, "skip_count": 0.0, "step": 6484, "text_loss": 0.47120073437690735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0003610246755571029, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 10462190.0, "repeat_count": 0.0, "routers_loss": 0.0006079549202695489, "skip_count": 0.0, "step": 6486, "text_loss": 0.8426173329353333 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.000360727382074679, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 10465233.0, "repeat_count": 0.0, "routers_loss": 0.00596054969355464, "skip_count": 0.0, "step": 6488, "text_loss": 0.18435880541801453 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.469621367772234, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00036043014196178463, "loss": 0.0046, "macro_f1": 0.3272727429866791, "num_tokens": 10468135.0, "repeat_count": 0.0, "routers_loss": 0.008584967814385891, "skip_count": 1.0, "step": 6490, "text_loss": 0.3827758729457855 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 30.479013795127678, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.020751953125, "learning_rate": 0.00036013295533232344, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 10471032.0, "repeat_count": 2.0, "routers_loss": 0.005076571833342314, "skip_count": 5.0, "step": 6492, "text_loss": 0.1215854063630104 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 30.488406222483125, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 0.0003598358223001776, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 10474779.0, "repeat_count": 3.0, "routers_loss": 0.005972118582576513, "skip_count": 0.0, "step": 6494, "text_loss": 0.22768665850162506 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0003595387429792091, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 10478015.0, "repeat_count": 0.0, "routers_loss": 0.004733685404062271, "skip_count": 1.0, "step": 6496, "text_loss": 0.5013535618782043 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022705078125, "learning_rate": 0.00035924171748325916, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 10481113.0, "repeat_count": 0.0, "routers_loss": 0.01148980576545, "skip_count": 2.0, "step": 6498, "text_loss": 0.3281762897968292 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.516583504549455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0003589447459261487, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 10484049.0, "repeat_count": 0.0, "routers_loss": 0.007726775947958231, "skip_count": 2.0, "step": 6500, "text_loss": 0.46294569969177246 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 30.525975931904902, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00035864782842167763, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 10487443.0, "repeat_count": 1.0, "routers_loss": 0.0013331319205462933, "skip_count": 0.0, "step": 6502, "text_loss": 0.5122153759002686 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.00035835096508362544, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 10490535.0, "repeat_count": 0.0, "routers_loss": 0.0011629529763013124, "skip_count": 0.0, "step": 6504, "text_loss": 0.40683525800704956 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.00035805415602575054, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 10493575.0, "repeat_count": 0.0, "routers_loss": 0.004780632443726063, "skip_count": 0.0, "step": 6506, "text_loss": 0.37263134121894836 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.00035775740136179075, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 10496193.0, "repeat_count": 0.0, "routers_loss": 0.0018355643842369318, "skip_count": 0.0, "step": 6508, "text_loss": 0.2074306458234787 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.00035746070120546314, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 10500135.0, "repeat_count": 0.0, "routers_loss": 0.004067617934197187, "skip_count": 1.0, "step": 6510, "text_loss": 0.26313406229019165 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.00035716405567046383, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 10503533.0, "repeat_count": 0.0, "routers_loss": 0.005438363179564476, "skip_count": 0.0, "step": 6512, "text_loss": 0.3448122441768646 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.00035686746487046767, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 10506207.0, "repeat_count": 0.0, "routers_loss": 0.0012895528925582767, "skip_count": 0.0, "step": 6514, "text_loss": 0.43096476793289185 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0003565709289191291, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 10509257.0, "repeat_count": 0.0, "routers_loss": 0.003141741268336773, "skip_count": 0.0, "step": 6516, "text_loss": 0.22349724173545837 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.601115350748458, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0196533203125, "learning_rate": 0.0003562744479300811, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 10512554.0, "repeat_count": 0.0, "routers_loss": 0.0005669888923875988, "skip_count": 0.0, "step": 6518, "text_loss": 0.5319190621376038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.610507778103905, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.00035597802201693587, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 10515720.0, "repeat_count": 0.0, "routers_loss": 0.0020814717281609774, "skip_count": 0.0, "step": 6520, "text_loss": 0.20216144621372223 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.61990020545935, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0003556816512932841, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 10518517.0, "repeat_count": 2.0, "routers_loss": 0.010716461576521397, "skip_count": 3.0, "step": 6522, "text_loss": 0.15843836963176727 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.629292632814792, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01806640625, "learning_rate": 0.0003553853358726959, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 10521414.0, "repeat_count": 0.0, "routers_loss": 0.0014748790999874473, "skip_count": 0.0, "step": 6524, "text_loss": 0.393892377614975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029296875, "learning_rate": 0.00035508907586871984, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 10524210.0, "repeat_count": 0.0, "routers_loss": 0.0004757299611810595, "skip_count": 0.0, "step": 6526, "text_loss": 0.2557907700538635 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 30.648077487525683, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.00035479287139488327, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 10527327.0, "repeat_count": 1.0, "routers_loss": 0.002445317106321454, "skip_count": 0.0, "step": 6528, "text_loss": 0.48338422179222107 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0205078125, "learning_rate": 0.0003544967225646922, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 10530363.0, "repeat_count": 0.0, "routers_loss": 0.0015845977468416095, "skip_count": 0.0, "step": 6530, "text_loss": 0.6474354267120361 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.666862342236573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.00035420062949163166, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 10533444.0, "repeat_count": 0.0, "routers_loss": 0.002190655330196023, "skip_count": 0.0, "step": 6532, "text_loss": 0.3789777457714081 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0003539045922891649, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 10536711.0, "repeat_count": 0.0, "routers_loss": 0.00317079434171319, "skip_count": 0.0, "step": 6534, "text_loss": 0.25758084654808044 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.00035360861107073394, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 10539849.0, "repeat_count": 0.0, "routers_loss": 0.0010938458144664764, "skip_count": 0.0, "step": 6536, "text_loss": 0.9821014404296875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0003533126859497592, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 10543004.0, "repeat_count": 0.0, "routers_loss": 0.003071998478844762, "skip_count": 2.0, "step": 6538, "text_loss": 0.6314182281494141 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0003530168170396401, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 10545965.0, "repeat_count": 0.0, "routers_loss": 0.006067665759474039, "skip_count": 2.0, "step": 6540, "text_loss": 0.5021927356719971 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0167236328125, "learning_rate": 0.000352721004453754, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 10549188.0, "repeat_count": 0.0, "routers_loss": 0.0019109295681118965, "skip_count": 0.0, "step": 6542, "text_loss": 0.3008780777454376 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 30.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.00035242524830545683, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 10552298.0, "repeat_count": 0.0, "routers_loss": 0.007457790896296501, "skip_count": 3.0, "step": 6544, "text_loss": 0.5675695538520813 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0003521295487080829, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 10555123.0, "repeat_count": 0.0, "routers_loss": 0.007243642583489418, "skip_count": 1.0, "step": 6546, "text_loss": 0.17955881357192993 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 0.00035183390577494476, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 10559653.0, "repeat_count": 0.0, "routers_loss": 0.004024330526590347, "skip_count": 0.0, "step": 6548, "text_loss": 0.2634682357311249 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 30.751394188435572, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.017578125, "learning_rate": 0.0003515383196193336, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 10563770.0, "repeat_count": 1.0, "routers_loss": 0.010837121866643429, "skip_count": 0.0, "step": 6550, "text_loss": 0.1608252227306366 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0003512427903545183, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 10567117.0, "repeat_count": 0.0, "routers_loss": 0.003473864868283272, "skip_count": 0.0, "step": 6552, "text_loss": 0.231611430644989 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0003509473180937464, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 10570622.0, "repeat_count": 0.0, "routers_loss": 0.004441239405423403, "skip_count": 1.0, "step": 6554, "text_loss": 0.3193909227848053 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0003506519029502433, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 10573411.0, "repeat_count": 0.0, "routers_loss": 0.0008821079391054809, "skip_count": 0.0, "step": 6556, "text_loss": 0.4478783905506134 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 30.788963897857354, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0003503565450372128, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 10576422.0, "repeat_count": 1.0, "routers_loss": 0.0014448441797867417, "skip_count": 0.0, "step": 6558, "text_loss": 0.46065983176231384 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0003500612444678365, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 10579879.0, "repeat_count": 0.0, "routers_loss": 0.007939066737890244, "skip_count": 1.0, "step": 6560, "text_loss": 0.3299395740032196 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.000349766001355274, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 10583067.0, "repeat_count": 0.0, "routers_loss": 0.010073966346681118, "skip_count": 2.0, "step": 6562, "text_loss": 0.278255820274353 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.817141179923688, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.00034947081581266335, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 10586276.0, "repeat_count": 0.0, "routers_loss": 0.0062315030954778194, "skip_count": 1.0, "step": 6564, "text_loss": 0.22706018388271332 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.82653360727913, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0003491756879531201, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 10589257.0, "repeat_count": 3.0, "routers_loss": 0.0023778853937983513, "skip_count": 4.0, "step": 6566, "text_loss": 0.5567800998687744 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0196533203125, "learning_rate": 0.0003488806178897377, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 10592163.0, "repeat_count": 0.0, "routers_loss": 0.0004184350254945457, "skip_count": 0.0, "step": 6568, "text_loss": 0.4027897119522095 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0003485856057355876, "loss": 0.0027, "macro_f1": 0.6666666865348816, "num_tokens": 10595326.0, "repeat_count": 0.0, "routers_loss": 0.0035254736430943012, "skip_count": 1.0, "step": 6570, "text_loss": 0.3044572174549103 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.000348290651603719, "loss": 0.0029, "macro_f1": 0.3333333432674408, "num_tokens": 10598236.0, "repeat_count": 0.0, "routers_loss": 0.0030894684605300426, "skip_count": 0.0, "step": 6572, "text_loss": 0.23021161556243896 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 30.86410331670091, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.00034799575560715896, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 10601653.0, "repeat_count": 1.0, "routers_loss": 0.0036557347048074007, "skip_count": 0.0, "step": 6574, "text_loss": 0.5437754392623901 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.873495744056356, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03564453125, "learning_rate": 0.0003477009178589121, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 10604581.0, "repeat_count": 2.0, "routers_loss": 0.021344119682908058, "skip_count": 4.0, "step": 6576, "text_loss": 0.29078927636146545 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 30.8828881714118, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.0003474061384719608, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 10607676.0, "repeat_count": 1.0, "routers_loss": 0.0037169242277741432, "skip_count": 1.0, "step": 6578, "text_loss": 1.1790896654129028 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.892280598767243, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0003471114175592649, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 10611269.0, "repeat_count": 2.0, "routers_loss": 0.005873420741409063, "skip_count": 4.0, "step": 6580, "text_loss": 0.36204129457473755 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.901673026122687, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0003468167552337624, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 10614335.0, "repeat_count": 1.0, "routers_loss": 0.01030842587351799, "skip_count": 2.0, "step": 6582, "text_loss": 0.20400437712669373 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061767578125, "learning_rate": 0.00034652215160836826, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 10617565.0, "repeat_count": 0.0, "routers_loss": 0.0025721401907503605, "skip_count": 0.0, "step": 6584, "text_loss": 0.44676345586776733 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 30.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.00034622760679597507, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 10620706.0, "repeat_count": 0.0, "routers_loss": 0.005751762073487043, "skip_count": 1.0, "step": 6586, "text_loss": 0.4733653664588928 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 30.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.00034593312090945306, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 10623916.0, "repeat_count": 0.0, "routers_loss": 0.0029759553726762533, "skip_count": 3.0, "step": 6588, "text_loss": 0.49876922369003296 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 0.0003456386940616498, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 10628093.0, "repeat_count": 0.0, "routers_loss": 0.0010031822603195906, "skip_count": 0.0, "step": 6590, "text_loss": 0.42708611488342285 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.00034534432636539004, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 10631739.0, "repeat_count": 0.0, "routers_loss": 0.0014793311711400747, "skip_count": 0.0, "step": 6592, "text_loss": 0.18193726241588593 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.0003450500179334762, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 10634862.0, "repeat_count": 0.0, "routers_loss": 0.0059733521193265915, "skip_count": 2.0, "step": 6594, "text_loss": 0.28596529364585876 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.967420017610802, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0003447557688786879, "loss": 0.0043, "macro_f1": 0.3272727429866791, "num_tokens": 10637758.0, "repeat_count": 0.0, "routers_loss": 0.0076768649742007256, "skip_count": 1.0, "step": 6596, "text_loss": 0.39428210258483887 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.00034446157931378185, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 10640440.0, "repeat_count": 0.0, "routers_loss": 0.0015128811355680227, "skip_count": 0.0, "step": 6598, "text_loss": 0.45584383606910706 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 30.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043701171875, "learning_rate": 0.00034416744935149193, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 10643600.0, "repeat_count": 0.0, "routers_loss": 0.000757391273509711, "skip_count": 0.0, "step": 6600, "text_loss": 0.503209114074707 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 30.995597299677137, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.060302734375, "learning_rate": 0.0003438733791045294, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 10646907.0, "repeat_count": 0.0, "routers_loss": 0.0025944956578314304, "skip_count": 2.0, "step": 6602, "text_loss": 0.4370735287666321 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.00034357936868558255, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 10649995.0, "repeat_count": 0.0, "routers_loss": 0.0006543452036567032, "skip_count": 0.0, "step": 6604, "text_loss": 0.4125586748123169 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00034328541820731663, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 10653251.0, "repeat_count": 0.0, "routers_loss": 0.00027016724925488234, "skip_count": 1.0, "step": 6606, "text_loss": 0.7309898734092712 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 31.023481068388612, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.020751953125, "learning_rate": 0.00034299152778237413, "loss": 0.0062, "macro_f1": 0.8823530077934265, "num_tokens": 10657229.0, "repeat_count": 1.0, "routers_loss": 0.01905548945069313, "skip_count": 2.0, "step": 6608, "text_loss": 0.42367079854011536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019287109375, "learning_rate": 0.0003426976975233744, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 10660524.0, "repeat_count": 0.0, "routers_loss": 0.0004718089767266065, "skip_count": 0.0, "step": 6610, "text_loss": 0.6613664627075195 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 31.0422659230995, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.00034240392754291343, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 10663908.0, "repeat_count": 1.0, "routers_loss": 0.0027069442439824343, "skip_count": 0.0, "step": 6612, "text_loss": 0.859471321105957 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 0.000342110217953565, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 10667814.0, "repeat_count": 0.0, "routers_loss": 0.0015497280983254313, "skip_count": 0.0, "step": 6614, "text_loss": 0.18337638676166534 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.0003418165688678788, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 10671630.0, "repeat_count": 0.0, "routers_loss": 0.0013396464055404067, "skip_count": 0.0, "step": 6616, "text_loss": 0.860016405582428 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 31.070443205165834, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0205078125, "learning_rate": 0.0003415229803983819, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 10675308.0, "repeat_count": 0.0, "routers_loss": 0.007542039267718792, "skip_count": 3.0, "step": 6618, "text_loss": 0.15481022000312805 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038818359375, "learning_rate": 0.0003412294526575779, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 10678092.0, "repeat_count": 0.0, "routers_loss": 0.002029839437454939, "skip_count": 2.0, "step": 6620, "text_loss": 0.5121933221817017 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.00034093598575794706, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 10681382.0, "repeat_count": 0.0, "routers_loss": 0.0013001341139897704, "skip_count": 0.0, "step": 6622, "text_loss": 0.4555061161518097 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 0.00034064257981194655, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 10684255.0, "repeat_count": 0.0, "routers_loss": 0.0007926415419206023, "skip_count": 0.0, "step": 6624, "text_loss": 0.7298227548599243 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0003403492349320101, "loss": 0.0031, "macro_f1": 0.6666666865348816, "num_tokens": 10686904.0, "repeat_count": 0.0, "routers_loss": 0.0021080176811665297, "skip_count": 1.0, "step": 6626, "text_loss": 0.45434215664863586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.000340055951230548, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 10690311.0, "repeat_count": 0.0, "routers_loss": 0.004011874087154865, "skip_count": 0.0, "step": 6628, "text_loss": 0.15496443212032318 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.00033976272881994707, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 10693395.0, "repeat_count": 0.0, "routers_loss": 0.0031893099658191204, "skip_count": 2.0, "step": 6630, "text_loss": 0.5291517972946167 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0003394695678125708, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 10697046.0, "repeat_count": 0.0, "routers_loss": 0.0033124347683042288, "skip_count": 1.0, "step": 6632, "text_loss": 0.2893230617046356 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.00033917646832075886, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 10700111.0, "repeat_count": 0.0, "routers_loss": 0.002547801472246647, "skip_count": 0.0, "step": 6634, "text_loss": 0.10363512486219406 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 31.154975051364836, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0208740234375, "learning_rate": 0.0003388834304568275, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 10703939.0, "repeat_count": 2.0, "routers_loss": 0.0019040531478822231, "skip_count": 0.0, "step": 6636, "text_loss": 0.5185034275054932 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00033859045433306975, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 10707187.0, "repeat_count": 0.0, "routers_loss": 0.0074104927480220795, "skip_count": 2.0, "step": 6638, "text_loss": 0.1618153154850006 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048583984375, "learning_rate": 0.0003382975400617543, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 10710029.0, "repeat_count": 0.0, "routers_loss": 0.0013861875049769878, "skip_count": 1.0, "step": 6640, "text_loss": 0.6674485206604004 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 0.0003380046877551266, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 10713318.0, "repeat_count": 0.0, "routers_loss": 0.0034452753607183695, "skip_count": 0.0, "step": 6642, "text_loss": 0.39299124479293823 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029541015625, "learning_rate": 0.0003377118975254082, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 10716130.0, "repeat_count": 0.0, "routers_loss": 0.006802885327488184, "skip_count": 2.0, "step": 6644, "text_loss": 0.12942606210708618 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.20193718814206, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0003374191694847968, "loss": 0.0052, "macro_f1": 0.6601307392120361, "num_tokens": 10719400.0, "repeat_count": 1.0, "routers_loss": 0.03718209266662598, "skip_count": 2.0, "step": 6646, "text_loss": 0.34327754378318787 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.042236328125, "learning_rate": 0.0003371265037454663, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 10722108.0, "repeat_count": 0.0, "routers_loss": 0.006016947794705629, "skip_count": 2.0, "step": 6648, "text_loss": 0.15644726157188416 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.220722042852948, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00033683390041956663, "loss": 0.0075, "macro_f1": 0.6601307392120361, "num_tokens": 10725709.0, "repeat_count": 1.0, "routers_loss": 0.04308273270726204, "skip_count": 2.0, "step": 6650, "text_loss": 0.1875772923231125 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 31.230114470208395, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0003365413596192243, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 10728717.0, "repeat_count": 2.0, "routers_loss": 0.006372809875756502, "skip_count": 1.0, "step": 6652, "text_loss": 0.4948291778564453 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.00033624888145654137, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 10732082.0, "repeat_count": 0.0, "routers_loss": 0.0014530479675158858, "skip_count": 0.0, "step": 6654, "text_loss": 0.44932305812835693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00033595646604359585, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 10734663.0, "repeat_count": 0.0, "routers_loss": 0.001924810465425253, "skip_count": 0.0, "step": 6656, "text_loss": 0.45626893639564514 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 31.25829175227473, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.00033566411349244206, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 10737470.0, "repeat_count": 1.0, "routers_loss": 0.0040014320984482765, "skip_count": 0.0, "step": 6658, "text_loss": 0.2700682580471039 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.00033537182391510996, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 10740228.0, "repeat_count": 0.0, "routers_loss": 0.0008573737577535212, "skip_count": 0.0, "step": 6660, "text_loss": 0.5626822113990784 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0003350795974236055, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 10742883.0, "repeat_count": 0.0, "routers_loss": 0.011166860349476337, "skip_count": 1.0, "step": 6662, "text_loss": 0.23357805609703064 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 31.286469034341064, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.00033478743412991037, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 10746459.0, "repeat_count": 1.0, "routers_loss": 0.01719980500638485, "skip_count": 6.0, "step": 6664, "text_loss": 0.150017648935318 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04443359375, "learning_rate": 0.00033449533414598223, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 10749984.0, "repeat_count": 0.0, "routers_loss": 0.0038280142471194267, "skip_count": 2.0, "step": 6666, "text_loss": 0.6312657594680786 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.00033420329758375423, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 10752792.0, "repeat_count": 0.0, "routers_loss": 0.0007688060286454856, "skip_count": 1.0, "step": 6668, "text_loss": 0.6794863939285278 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.314646316407398, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.00033391132455513537, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 10756125.0, "repeat_count": 0.0, "routers_loss": 0.003196930279955268, "skip_count": 2.0, "step": 6670, "text_loss": 0.22897565364837646 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0003336194151720102, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 10759296.0, "repeat_count": 0.0, "routers_loss": 0.0026212623342871666, "skip_count": 0.0, "step": 6672, "text_loss": 0.5236268639564514 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0201416015625, "learning_rate": 0.0003333275695462391, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 10762574.0, "repeat_count": 0.0, "routers_loss": 0.007855101488530636, "skip_count": 2.0, "step": 6674, "text_loss": 0.2971038818359375 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0003330357877896577, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 10765758.0, "repeat_count": 0.0, "routers_loss": 0.004191791173070669, "skip_count": 2.0, "step": 6676, "text_loss": 0.17358586192131042 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0286865234375, "learning_rate": 0.0003327440700140774, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 10769396.0, "repeat_count": 0.0, "routers_loss": 0.004101858474314213, "skip_count": 1.0, "step": 6678, "text_loss": 0.28932204842567444 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 0.000332452416331285, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 10772605.0, "repeat_count": 0.0, "routers_loss": 0.0008305918308906257, "skip_count": 0.0, "step": 6680, "text_loss": 0.47090092301368713 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0184326171875, "learning_rate": 0.0003321608268530427, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 10776576.0, "repeat_count": 0.0, "routers_loss": 0.003022305201739073, "skip_count": 1.0, "step": 6682, "text_loss": 0.4467788338661194 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 31.38039330789551, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.00033186930169108795, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 10779648.0, "repeat_count": 1.0, "routers_loss": 0.0021474999375641346, "skip_count": 0.0, "step": 6684, "text_loss": 0.6249470710754395 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.054931640625, "learning_rate": 0.00033157784095713417, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 10782665.0, "repeat_count": 0.0, "routers_loss": 0.0025120675563812256, "skip_count": 1.0, "step": 6686, "text_loss": 0.6763803958892822 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.399178162606397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0003312864447628695, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 10785789.0, "repeat_count": 0.0, "routers_loss": 0.0013111691223457456, "skip_count": 1.0, "step": 6688, "text_loss": 0.6609058380126953 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.00033099511321995744, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 10788846.0, "repeat_count": 0.0, "routers_loss": 0.0012354454956948757, "skip_count": 0.0, "step": 6690, "text_loss": 0.4421829283237457 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.0003307038464400368, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 10791611.0, "repeat_count": 0.0, "routers_loss": 0.0035219944547861814, "skip_count": 2.0, "step": 6692, "text_loss": 0.16222824156284332 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 31.42735544467273, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.00033041264453472153, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 10794868.0, "repeat_count": 1.0, "routers_loss": 0.0007216202793642879, "skip_count": 0.0, "step": 6694, "text_loss": 0.37388721108436584 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 31.436747872028178, "f1_execute": 0.9743589162826538, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.0003301215076156008, "loss": 0.0063, "macro_f1": 0.8803418874740601, "num_tokens": 10797737.0, "repeat_count": 2.0, "routers_loss": 0.025403080508112907, "skip_count": 7.0, "step": 6696, "text_loss": 0.5086690187454224 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0003298304357942389, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 10800972.0, "repeat_count": 0.0, "routers_loss": 0.010532539337873459, "skip_count": 2.0, "step": 6698, "text_loss": 0.22500646114349365 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 0.00032953942918217494, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 10803654.0, "repeat_count": 0.0, "routers_loss": 0.0009591903653927147, "skip_count": 0.0, "step": 6700, "text_loss": 0.6256277561187744 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02392578125, "learning_rate": 0.0003292484878909232, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 10807506.0, "repeat_count": 0.0, "routers_loss": 0.003801517654210329, "skip_count": 2.0, "step": 6702, "text_loss": 0.522081196308136 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0189208984375, "learning_rate": 0.00032895761203197317, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 10810163.0, "repeat_count": 0.0, "routers_loss": 0.002608039416372776, "skip_count": 2.0, "step": 6704, "text_loss": 0.3600201904773712 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00032866680171678874, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 10813202.0, "repeat_count": 0.0, "routers_loss": 0.0026464913971722126, "skip_count": 0.0, "step": 6706, "text_loss": 0.2513798773288727 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023681640625, "learning_rate": 0.00032837605705680895, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 10816484.0, "repeat_count": 0.0, "routers_loss": 0.0027157769072800875, "skip_count": 0.0, "step": 6708, "text_loss": 0.34391456842422485 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 31.50249486351629, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 0.0003280853781634481, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 10819794.0, "repeat_count": 1.0, "routers_loss": 0.0016086180694401264, "skip_count": 1.0, "step": 6710, "text_loss": 0.6535179615020752 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0003277947651480946, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 10823033.0, "repeat_count": 0.0, "routers_loss": 0.002368347719311714, "skip_count": 0.0, "step": 6712, "text_loss": 0.5596423745155334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.0003275042181221119, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 10826276.0, "repeat_count": 0.0, "routers_loss": 0.003124286886304617, "skip_count": 0.0, "step": 6714, "text_loss": 0.6584402322769165 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0003272137371968382, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 10828846.0, "repeat_count": 0.0, "routers_loss": 0.0006088328082114458, "skip_count": 0.0, "step": 6716, "text_loss": 0.4602710008621216 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0205078125, "learning_rate": 0.00032692332248358645, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 10832025.0, "repeat_count": 0.0, "routers_loss": 0.002511275466531515, "skip_count": 2.0, "step": 6718, "text_loss": 0.42790886759757996 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 31.549457000293515, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.000326632974093644, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 10835110.0, "repeat_count": 1.0, "routers_loss": 0.01076667383313179, "skip_count": 0.0, "step": 6720, "text_loss": 0.5659847855567932 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 31.55884942764896, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.021484375, "learning_rate": 0.0003263426921382728, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 10838279.0, "repeat_count": 2.0, "routers_loss": 0.004973042290657759, "skip_count": 2.0, "step": 6722, "text_loss": 0.675341010093689 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.00032605247672870964, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 10841381.0, "repeat_count": 0.0, "routers_loss": 0.0013990222942084074, "skip_count": 0.0, "step": 6724, "text_loss": 0.5389315485954285 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 0.00032576232797616554, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 10844583.0, "repeat_count": 0.0, "routers_loss": 0.003186358604580164, "skip_count": 1.0, "step": 6726, "text_loss": 0.5603348016738892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0003254722459918261, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 10847670.0, "repeat_count": 0.0, "routers_loss": 0.001443870598450303, "skip_count": 0.0, "step": 6728, "text_loss": 0.6922405362129211 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0003251822308868512, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 10851479.0, "repeat_count": 0.0, "routers_loss": 0.004294445738196373, "skip_count": 0.0, "step": 6730, "text_loss": 0.7145437002182007 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 0.00032489228277237514, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 10854489.0, "repeat_count": 0.0, "routers_loss": 0.0032078945077955723, "skip_count": 0.0, "step": 6732, "text_loss": 0.4077773094177246 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.615203991781627, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.00032460240175950664, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 10856954.0, "repeat_count": 1.0, "routers_loss": 0.0038214854430407286, "skip_count": 2.0, "step": 6734, "text_loss": 0.32071781158447266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.0003243125879593286, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 10860016.0, "repeat_count": 0.0, "routers_loss": 0.0013407845981419086, "skip_count": 0.0, "step": 6736, "text_loss": 0.45335495471954346 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0003240228414828984, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 10863021.0, "repeat_count": 0.0, "routers_loss": 0.0010989385191351175, "skip_count": 0.0, "step": 6738, "text_loss": 0.562619149684906 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046630859375, "learning_rate": 0.0003237331624412473, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 10866548.0, "repeat_count": 0.0, "routers_loss": 0.006139552686363459, "skip_count": 0.0, "step": 6740, "text_loss": 0.14510060846805573 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.00032344355094538087, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 10869402.0, "repeat_count": 0.0, "routers_loss": 0.004785746335983276, "skip_count": 0.0, "step": 6742, "text_loss": 0.5655979514122009 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.00032315400710627876, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 10874165.0, "repeat_count": 0.0, "routers_loss": 0.0052397786639630795, "skip_count": 0.0, "step": 6744, "text_loss": 0.4785873591899872 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 31.671558555914295, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0003228645310348948, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 10876919.0, "repeat_count": 3.0, "routers_loss": 0.00460197776556015, "skip_count": 1.0, "step": 6746, "text_loss": 0.5683879256248474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.0003225751228421566, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 10880179.0, "repeat_count": 0.0, "routers_loss": 0.0032690472435206175, "skip_count": 0.0, "step": 6748, "text_loss": 0.5268497467041016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052978515625, "learning_rate": 0.00032228578263896607, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 10883711.0, "repeat_count": 0.0, "routers_loss": 0.0036305058747529984, "skip_count": 0.0, "step": 6750, "text_loss": 0.16675594449043274 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0003219965105361989, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 10887041.0, "repeat_count": 0.0, "routers_loss": 0.002453352091833949, "skip_count": 1.0, "step": 6752, "text_loss": 0.7010246515274048 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 0.00032170730664470465, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 10890053.0, "repeat_count": 0.0, "routers_loss": 0.0020381701178848743, "skip_count": 0.0, "step": 6754, "text_loss": 0.46637895703315735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0003214181710753069, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 10893501.0, "repeat_count": 0.0, "routers_loss": 0.004525696858763695, "skip_count": 0.0, "step": 6756, "text_loss": 0.1768684983253479 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 31.727913120046964, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0003211291039388026, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 10896480.0, "repeat_count": 1.0, "routers_loss": 0.0038154330104589462, "skip_count": 0.0, "step": 6758, "text_loss": 0.7908347845077515 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0400390625, "learning_rate": 0.00032084010534596326, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 10899158.0, "repeat_count": 0.0, "routers_loss": 0.004711449146270752, "skip_count": 2.0, "step": 6760, "text_loss": 0.37209007143974304 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 31.74669797475785, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0003205511754075335, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 10901791.0, "repeat_count": 1.0, "routers_loss": 0.0025003373157233, "skip_count": 1.0, "step": 6762, "text_loss": 0.8081201314926147 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 31.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04443359375, "learning_rate": 0.00032026231423423204, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 10904817.0, "repeat_count": 0.0, "routers_loss": 0.007387075573205948, "skip_count": 3.0, "step": 6764, "text_loss": 0.30355480313301086 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 31.76548282946874, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0003199735219367507, "loss": 0.0061, "macro_f1": 0.5492662787437439, "num_tokens": 10908018.0, "repeat_count": 2.0, "routers_loss": 0.04275592789053917, "skip_count": 0.0, "step": 6766, "text_loss": 0.26562029123306274 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.774875256824185, "f1_execute": 0.9767441749572754, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.029541015625, "learning_rate": 0.0003196847986257553, "loss": 0.008, "macro_f1": 0.9255813956260681, "num_tokens": 10911264.0, "repeat_count": 3.0, "routers_loss": 0.034824032336473465, "skip_count": 4.0, "step": 6768, "text_loss": 0.2761698067188263 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.00031939614441188523, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 10915964.0, "repeat_count": 0.0, "routers_loss": 0.0011179742868989706, "skip_count": 0.0, "step": 6770, "text_loss": 0.4107927083969116 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.00031910755940575344, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 10918678.0, "repeat_count": 0.0, "routers_loss": 0.0011521469568833709, "skip_count": 0.0, "step": 6772, "text_loss": 0.43064895272254944 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 31.80305253889052, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.01708984375, "learning_rate": 0.000318819043717946, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 10921757.0, "repeat_count": 1.0, "routers_loss": 0.002861087443307042, "skip_count": 1.0, "step": 6774, "text_loss": 0.5945150852203369 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0208740234375, "learning_rate": 0.0003185305974590229, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 10924767.0, "repeat_count": 0.0, "routers_loss": 0.0011365334503352642, "skip_count": 0.0, "step": 6776, "text_loss": 0.36615172028541565 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 31.82183739360141, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0003182422207395171, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 10927750.0, "repeat_count": 1.0, "routers_loss": 0.0034391419030725956, "skip_count": 0.0, "step": 6778, "text_loss": 0.17081251740455627 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.0003179539136699351, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 10930817.0, "repeat_count": 0.0, "routers_loss": 0.004941808991134167, "skip_count": 2.0, "step": 6780, "text_loss": 0.7683762311935425 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 31.840622248312297, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.038330078125, "learning_rate": 0.00031766567636075675, "loss": 0.0061, "macro_f1": 0.8823530077934265, "num_tokens": 10933882.0, "repeat_count": 1.0, "routers_loss": 0.017502857372164726, "skip_count": 2.0, "step": 6782, "text_loss": 0.38010457158088684 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.850014675667744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0003173775089224353, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 10936909.0, "repeat_count": 1.0, "routers_loss": 0.0035372809506952763, "skip_count": 2.0, "step": 6784, "text_loss": 0.5760656595230103 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.859407103023187, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 0.00031708941146539707, "loss": 0.0061, "macro_f1": 0.3272727429866791, "num_tokens": 10940032.0, "repeat_count": 1.0, "routers_loss": 0.02229934185743332, "skip_count": 0.0, "step": 6786, "text_loss": 0.5767728090286255 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021240234375, "learning_rate": 0.00031680138410004123, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 10943217.0, "repeat_count": 0.0, "routers_loss": 0.0028649091254919767, "skip_count": 1.0, "step": 6788, "text_loss": 0.9756367802619934 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.878191957734078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025390625, "learning_rate": 0.00031651342693674066, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 10947847.0, "repeat_count": 0.0, "routers_loss": 0.0039158593863248825, "skip_count": 2.0, "step": 6790, "text_loss": 0.2504335045814514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.000316225540085841, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 10950879.0, "repeat_count": 0.0, "routers_loss": 0.0022091215942054987, "skip_count": 0.0, "step": 6792, "text_loss": 0.525842547416687 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0223388671875, "learning_rate": 0.00031593772365766105, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 10954960.0, "repeat_count": 0.0, "routers_loss": 0.0006841494468972087, "skip_count": 0.0, "step": 6794, "text_loss": 0.6383582353591919 }, { "acc_repeat": 0.800000011920929, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 31.906369239800412, "f1_execute": 0.9729729890823364, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.0003156499777624926, "loss": 0.006, "macro_f1": 0.9539539813995361, "num_tokens": 10958278.0, "repeat_count": 5.0, "routers_loss": 0.03810702636837959, "skip_count": 5.0, "step": 6796, "text_loss": 0.5901661515235901 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01708984375, "learning_rate": 0.0003153623025106005, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 10962412.0, "repeat_count": 0.0, "routers_loss": 0.00046833412488922477, "skip_count": 0.0, "step": 6798, "text_loss": 0.42693984508514404 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00031507469801222233, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 10966037.0, "repeat_count": 0.0, "routers_loss": 0.006818041671067476, "skip_count": 2.0, "step": 6800, "text_loss": 0.5326262712478638 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.934546521866746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.00031478716437756876, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 10969369.0, "repeat_count": 0.0, "routers_loss": 0.0029889161232858896, "skip_count": 0.0, "step": 6802, "text_loss": 0.49028220772743225 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0003144997017168232, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 10972016.0, "repeat_count": 0.0, "routers_loss": 0.0038266500923782587, "skip_count": 2.0, "step": 6804, "text_loss": 0.43391722440719604 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0189208984375, "learning_rate": 0.0003142123101401417, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 10975153.0, "repeat_count": 0.0, "routers_loss": 0.0005866789724677801, "skip_count": 0.0, "step": 6806, "text_loss": 0.5888382196426392 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.00031392498975765353, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 10977881.0, "repeat_count": 0.0, "routers_loss": 0.002122384263202548, "skip_count": 0.0, "step": 6808, "text_loss": 0.30313390493392944 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0003136377406794604, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 10982025.0, "repeat_count": 0.0, "routers_loss": 0.0005535652744583786, "skip_count": 0.0, "step": 6810, "text_loss": 0.5788959264755249 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 31.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0003133505630156365, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 10985419.0, "repeat_count": 0.0, "routers_loss": 0.010623604990541935, "skip_count": 2.0, "step": 6812, "text_loss": 0.18577243387699127 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 31.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.00031306345687622905, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 10989116.0, "repeat_count": 0.0, "routers_loss": 0.0004721239674836397, "skip_count": 0.0, "step": 6814, "text_loss": 0.4818301200866699 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0167236328125, "learning_rate": 0.0003127764223712575, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 10992064.0, "repeat_count": 0.0, "routers_loss": 0.0004238430701661855, "skip_count": 0.0, "step": 6816, "text_loss": 0.7482771277427673 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.00939242735544, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0003124894596107141, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 10994903.0, "repeat_count": 1.0, "routers_loss": 0.005224394146353006, "skip_count": 2.0, "step": 6818, "text_loss": 0.186603844165802 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.01878485471089, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.00031220256870456356, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 10998692.0, "repeat_count": 1.0, "routers_loss": 0.0021751862950623035, "skip_count": 2.0, "step": 6820, "text_loss": 0.45633986592292786 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 32.02817728206633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.00031191574976274284, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 11001284.0, "repeat_count": 0.0, "routers_loss": 0.004747046157717705, "skip_count": 4.0, "step": 6822, "text_loss": 0.5651670694351196 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0003116290028951617, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 11004293.0, "repeat_count": 0.0, "routers_loss": 0.0008316585444845259, "skip_count": 0.0, "step": 6824, "text_loss": 0.3167279362678528 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055419921875, "learning_rate": 0.000311342328211702, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 11007080.0, "repeat_count": 0.0, "routers_loss": 0.0004732926026917994, "skip_count": 0.0, "step": 6826, "text_loss": 0.49171411991119385 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.05635456413267, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.000311055725822218, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 11010078.0, "repeat_count": 1.0, "routers_loss": 0.004238729365170002, "skip_count": 0.0, "step": 6828, "text_loss": 0.21484950184822083 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.06574699148811, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.0003107691958365361, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 11013368.0, "repeat_count": 0.0, "routers_loss": 0.0029175232630223036, "skip_count": 2.0, "step": 6830, "text_loss": 0.3718266189098358 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.075139418843555, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0003104827383644555, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 11016704.0, "repeat_count": 0.0, "routers_loss": 0.00191891985014081, "skip_count": 0.0, "step": 6832, "text_loss": 0.28772637248039246 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.084531846199, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0234375, "learning_rate": 0.00031019635351574705, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 11019651.0, "repeat_count": 0.0, "routers_loss": 0.004300855100154877, "skip_count": 2.0, "step": 6834, "text_loss": 0.6583508849143982 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.09392427355445, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.000309910041400154, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 11023847.0, "repeat_count": 0.0, "routers_loss": 0.00037701442488469183, "skip_count": 0.0, "step": 6836, "text_loss": 0.36090534925460815 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 32.10331670090989, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0269775390625, "learning_rate": 0.0003096238021273917, "loss": 0.0077, "macro_f1": 0.9265305995941162, "num_tokens": 11027804.0, "repeat_count": 1.0, "routers_loss": 0.03601725772023201, "skip_count": 3.0, "step": 6838, "text_loss": 0.24180401861667633 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.11270912826534, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.00030933763580714757, "loss": 0.0052, "macro_f1": 0.6601307392120361, "num_tokens": 11030778.0, "repeat_count": 1.0, "routers_loss": 0.023780640214681625, "skip_count": 2.0, "step": 6840, "text_loss": 0.4978102743625641 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.12210155562078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00030905154254908104, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 11034863.0, "repeat_count": 1.0, "routers_loss": 0.00565778324380517, "skip_count": 0.0, "step": 6842, "text_loss": 0.558772623538971 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.00030876552246282356, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 11038488.0, "repeat_count": 0.0, "routers_loss": 0.010575232096016407, "skip_count": 0.0, "step": 6844, "text_loss": 0.2955974340438843 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 0.0003084795756579787, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 11041796.0, "repeat_count": 0.0, "routers_loss": 0.0015910190995782614, "skip_count": 0.0, "step": 6846, "text_loss": 0.5009704828262329 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.15027883768712, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0003081937022441217, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 11045141.0, "repeat_count": 0.0, "routers_loss": 0.0008034126949496567, "skip_count": 0.0, "step": 6848, "text_loss": 0.3965311646461487 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 32.15967126504256, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0003079079023307999, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 11047814.0, "repeat_count": 2.0, "routers_loss": 0.00810160581022501, "skip_count": 0.0, "step": 6850, "text_loss": 0.24341927468776703 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.169063692398005, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0003076221760275321, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 11051330.0, "repeat_count": 1.0, "routers_loss": 0.006590691395103931, "skip_count": 0.0, "step": 6852, "text_loss": 0.5887606739997864 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00030733652344380936, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 11055006.0, "repeat_count": 0.0, "routers_loss": 0.0005845054984092712, "skip_count": 0.0, "step": 6854, "text_loss": 0.6621366739273071 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.18784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0003070509446890944, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 11058470.0, "repeat_count": 0.0, "routers_loss": 0.0041051446460187435, "skip_count": 1.0, "step": 6856, "text_loss": 0.31603100895881653 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.197240974464336, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0003067654398728214, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 11061620.0, "repeat_count": 1.0, "routers_loss": 0.001603201380930841, "skip_count": 0.0, "step": 6858, "text_loss": 0.5167516469955444 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.20663340181978, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018310546875, "learning_rate": 0.00030648000910439636, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 11064727.0, "repeat_count": 0.0, "routers_loss": 0.0024816282093524933, "skip_count": 0.0, "step": 6860, "text_loss": 0.5869330167770386 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.21602582917523, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00030619465249319693, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 11068208.0, "repeat_count": 1.0, "routers_loss": 0.003121294779703021, "skip_count": 0.0, "step": 6862, "text_loss": 0.3920222818851471 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.22541825653067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 0.0003059093701485722, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 11071315.0, "repeat_count": 0.0, "routers_loss": 0.0033239589538425207, "skip_count": 1.0, "step": 6864, "text_loss": 0.4201887845993042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.23481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0184326171875, "learning_rate": 0.00030562416217984296, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 11074144.0, "repeat_count": 0.0, "routers_loss": 0.0016117560444399714, "skip_count": 0.0, "step": 6866, "text_loss": 0.5283045172691345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0003053390286963015, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 11077152.0, "repeat_count": 0.0, "routers_loss": 0.003879208816215396, "skip_count": 0.0, "step": 6868, "text_loss": 0.16188788414001465 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.253595538597004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020263671875, "learning_rate": 0.00030505396980721143, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 11080200.0, "repeat_count": 0.0, "routers_loss": 0.007632353343069553, "skip_count": 1.0, "step": 6870, "text_loss": 0.25986847281455994 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.00030476898562180793, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 11083356.0, "repeat_count": 0.0, "routers_loss": 0.004322016146034002, "skip_count": 2.0, "step": 6872, "text_loss": 0.49556297063827515 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.2723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0208740234375, "learning_rate": 0.0003044840762492974, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 11086354.0, "repeat_count": 0.0, "routers_loss": 0.0031272871419787407, "skip_count": 2.0, "step": 6874, "text_loss": 0.1658666580915451 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0003041992417988577, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 11088850.0, "repeat_count": 0.0, "routers_loss": 0.005371398758143187, "skip_count": 2.0, "step": 6876, "text_loss": 0.22437214851379395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0003039144823796378, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 11091784.0, "repeat_count": 0.0, "routers_loss": 0.0025086402893066406, "skip_count": 0.0, "step": 6878, "text_loss": 0.7293354868888855 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0003036297981007581, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 11095204.0, "repeat_count": 0.0, "routers_loss": 0.015590827912092209, "skip_count": 1.0, "step": 6880, "text_loss": 0.6406328678131104 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.30995010272967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0003033451890713103, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 11098367.0, "repeat_count": 0.0, "routers_loss": 0.0013142531970515847, "skip_count": 0.0, "step": 6882, "text_loss": 0.5209086537361145 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 32.319342530085116, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0003030606554003571, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 11101047.0, "repeat_count": 2.0, "routers_loss": 0.0018484699539840221, "skip_count": 0.0, "step": 6884, "text_loss": 0.743188202381134 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.32873495744057, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 0.00030277619719693217, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 11104269.0, "repeat_count": 0.0, "routers_loss": 0.0016667681047692895, "skip_count": 0.0, "step": 6886, "text_loss": 0.7918420433998108 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.33812738479601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0191650390625, "learning_rate": 0.0003024918145700406, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 11107248.0, "repeat_count": 0.0, "routers_loss": 0.0008098077378235757, "skip_count": 0.0, "step": 6888, "text_loss": 0.3871288299560547 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0223388671875, "learning_rate": 0.0003022075076286582, "loss": 0.0031, "macro_f1": 0.3333333432674408, "num_tokens": 11111204.0, "repeat_count": 0.0, "routers_loss": 0.002324736909940839, "skip_count": 0.0, "step": 6890, "text_loss": 0.3722921907901764 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.3569122395069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0003019232764817321, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 11114363.0, "repeat_count": 0.0, "routers_loss": 0.00254769716411829, "skip_count": 0.0, "step": 6892, "text_loss": 0.418519526720047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.00030163912123818006, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 11117718.0, "repeat_count": 0.0, "routers_loss": 0.000547234492842108, "skip_count": 0.0, "step": 6894, "text_loss": 0.6087009310722351 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.375697094217784, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0003013550420068909, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 11120437.0, "repeat_count": 0.0, "routers_loss": 0.00015221568173728883, "skip_count": 0.0, "step": 6896, "text_loss": 0.6013991832733154 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 32.385089521573235, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.046142578125, "learning_rate": 0.00030107103889672436, "loss": 0.0085, "macro_f1": 0.5492662787437439, "num_tokens": 11123708.0, "repeat_count": 0.0, "routers_loss": 0.024048971012234688, "skip_count": 2.0, "step": 6898, "text_loss": 0.3612423837184906 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.39448194892868, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0003007871120165111, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 11127294.0, "repeat_count": 0.0, "routers_loss": 0.0013236473314464092, "skip_count": 0.0, "step": 6900, "text_loss": 0.5277031064033508 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0208740234375, "learning_rate": 0.00030050326147505226, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 11130270.0, "repeat_count": 0.0, "routers_loss": 0.0028277861420065165, "skip_count": 0.0, "step": 6902, "text_loss": 0.5726971626281738 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0003002194873811197, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 11132955.0, "repeat_count": 0.0, "routers_loss": 0.0022369837388396263, "skip_count": 0.0, "step": 6904, "text_loss": 0.18510448932647705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.00029993578984345673, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 11136387.0, "repeat_count": 0.0, "routers_loss": 0.0038351211696863174, "skip_count": 0.0, "step": 6906, "text_loss": 0.28313153982162476 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.43205165835045, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0002996521689707764, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 11139740.0, "repeat_count": 0.0, "routers_loss": 0.00032925375853665173, "skip_count": 0.0, "step": 6908, "text_loss": 0.7315025329589844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.441444085705896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.0002993686248717629, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 11142587.0, "repeat_count": 0.0, "routers_loss": 0.002886304398998618, "skip_count": 0.0, "step": 6910, "text_loss": 0.677378237247467 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.45083651306135, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.00029908515765507084, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 11145415.0, "repeat_count": 1.0, "routers_loss": 0.0038471966981887817, "skip_count": 0.0, "step": 6912, "text_loss": 0.5207083225250244 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0002988017674293254, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 11148524.0, "repeat_count": 0.0, "routers_loss": 0.0023522782139480114, "skip_count": 0.0, "step": 6914, "text_loss": 0.42507871985435486 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0189208984375, "learning_rate": 0.0002985184543031222, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 11152069.0, "repeat_count": 0.0, "routers_loss": 0.0012464249739423394, "skip_count": 0.0, "step": 6916, "text_loss": 0.5694169998168945 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.47901379512768, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020751953125, "learning_rate": 0.0002982352183850274, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 11155675.0, "repeat_count": 0.0, "routers_loss": 0.00828156154602766, "skip_count": 2.0, "step": 6918, "text_loss": 0.22304373979568481 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.48840622248312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.00029795205978357754, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 11158555.0, "repeat_count": 0.0, "routers_loss": 0.0019234733190387487, "skip_count": 0.0, "step": 6920, "text_loss": 0.5519064664840698 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.497798649838565, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.0002976689786072795, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 11161407.0, "repeat_count": 0.0, "routers_loss": 0.0003542431222740561, "skip_count": 0.0, "step": 6922, "text_loss": 0.6748810410499573 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.507191077194015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0002973859749646104, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 11166007.0, "repeat_count": 0.0, "routers_loss": 0.0004024899681098759, "skip_count": 0.0, "step": 6924, "text_loss": 0.6613664627075195 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 32.51658350454946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.000297103048964018, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 11169007.0, "repeat_count": 0.0, "routers_loss": 0.005519595462828875, "skip_count": 3.0, "step": 6926, "text_loss": 0.3815552592277527 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.5259759319049, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.00029682020071392, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 11172939.0, "repeat_count": 0.0, "routers_loss": 0.0016999440267682076, "skip_count": 0.0, "step": 6928, "text_loss": 0.6727893352508545 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.535368359260346, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0002965374303227044, "loss": 0.0055, "macro_f1": 0.5492662787437439, "num_tokens": 11176232.0, "repeat_count": 2.0, "routers_loss": 0.030950307846069336, "skip_count": 0.0, "step": 6930, "text_loss": 0.5577763915061951 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0308837890625, "learning_rate": 0.00029625473789872923, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 11179775.0, "repeat_count": 0.0, "routers_loss": 0.00525702815502882, "skip_count": 1.0, "step": 6932, "text_loss": 0.5860039591789246 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.55415321397123, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0206298828125, "learning_rate": 0.000295972123550323, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 11183262.0, "repeat_count": 1.0, "routers_loss": 0.0048187971115112305, "skip_count": 2.0, "step": 6934, "text_loss": 0.7328732013702393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.563545641326684, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.016357421875, "learning_rate": 0.00029568958738578364, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 11186591.0, "repeat_count": 0.0, "routers_loss": 0.0015159632312133908, "skip_count": 0.0, "step": 6936, "text_loss": 0.40563541650772095 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 32.57293806868213, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.017333984375, "learning_rate": 0.0002954071295133801, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 11190056.0, "repeat_count": 1.0, "routers_loss": 0.011282073333859444, "skip_count": 1.0, "step": 6938, "text_loss": 0.15986496210098267 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.58233049603757, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.062255859375, "learning_rate": 0.0002951247500413504, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 11193504.0, "repeat_count": 3.0, "routers_loss": 0.010220487602055073, "skip_count": 5.0, "step": 6940, "text_loss": 0.2604432702064514 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.0002948424490779029, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 11196725.0, "repeat_count": 0.0, "routers_loss": 0.002620660001412034, "skip_count": 1.0, "step": 6942, "text_loss": 0.48028868436813354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.60111535074846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 0.00029456022673121597, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 11199303.0, "repeat_count": 0.0, "routers_loss": 0.00042651945841498673, "skip_count": 0.0, "step": 6944, "text_loss": 0.5135554671287537 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.6105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0002942780831094377, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 11202319.0, "repeat_count": 0.0, "routers_loss": 0.005366047378629446, "skip_count": 2.0, "step": 6946, "text_loss": 0.2809196710586548 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.619900205459345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0002939960183206861, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 11205622.0, "repeat_count": 0.0, "routers_loss": 0.0033479216508567333, "skip_count": 0.0, "step": 6948, "text_loss": 0.2013140618801117 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.629292632814796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.00029371403247304887, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 11208637.0, "repeat_count": 1.0, "routers_loss": 0.0013508419506251812, "skip_count": 0.0, "step": 6950, "text_loss": 0.4427332580089569 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0002934321256745833, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 11211618.0, "repeat_count": 0.0, "routers_loss": 0.0020944071002304554, "skip_count": 0.0, "step": 6952, "text_loss": 0.5406652688980103 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.64807748752568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.00029315029803331704, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 11214432.0, "repeat_count": 0.0, "routers_loss": 0.0012655078899115324, "skip_count": 0.0, "step": 6954, "text_loss": 0.7720552086830139 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.00029286854965724686, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 11218127.0, "repeat_count": 0.0, "routers_loss": 0.009041395038366318, "skip_count": 0.0, "step": 6956, "text_loss": 0.258109986782074 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 32.66686234223657, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0002925868806543391, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 11221440.0, "repeat_count": 1.0, "routers_loss": 0.0034558263141661882, "skip_count": 1.0, "step": 6958, "text_loss": 0.5378029942512512 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.67625476959201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02490234375, "learning_rate": 0.00029230529113253, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 11225391.0, "repeat_count": 0.0, "routers_loss": 0.005263930186629295, "skip_count": 2.0, "step": 6960, "text_loss": 0.3616539537906647 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.685647196947464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0002920237811997251, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 11228648.0, "repeat_count": 0.0, "routers_loss": 0.003730480559170246, "skip_count": 1.0, "step": 6962, "text_loss": 0.46682238578796387 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.69503962430291, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043701171875, "learning_rate": 0.00029174235096379963, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 11231828.0, "repeat_count": 0.0, "routers_loss": 0.004831735976040363, "skip_count": 1.0, "step": 6964, "text_loss": 0.5718355178833008 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 32.70443205165835, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.046875, "learning_rate": 0.0002914610005325981, "loss": 0.0102, "macro_f1": 0.5492662787437439, "num_tokens": 11234984.0, "repeat_count": 0.0, "routers_loss": 0.03880132734775543, "skip_count": 2.0, "step": 6966, "text_loss": 0.3139013946056366 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0002911797300139345, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 11239153.0, "repeat_count": 0.0, "routers_loss": 0.0006673726020380855, "skip_count": 0.0, "step": 6968, "text_loss": 0.6040399074554443 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.72321690636924, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.00029089853951559235, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 11242178.0, "repeat_count": 1.0, "routers_loss": 0.0028971200808882713, "skip_count": 0.0, "step": 6970, "text_loss": 0.304967999458313 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.73260933372468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.00029061742914532427, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 11245865.0, "repeat_count": 0.0, "routers_loss": 0.0010410466929897666, "skip_count": 0.0, "step": 6972, "text_loss": 0.47892290353775024 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038330078125, "learning_rate": 0.0002903363990108524, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 11248806.0, "repeat_count": 0.0, "routers_loss": 0.002133697969838977, "skip_count": 0.0, "step": 6974, "text_loss": 0.2561415433883667 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 32.751394188435576, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.0002900554492198677, "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 11251807.0, "repeat_count": 2.0, "routers_loss": 0.002402493730187416, "skip_count": 0.0, "step": 6976, "text_loss": 0.652428388595581 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.76078661579102, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0002897745798800311, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 11254615.0, "repeat_count": 1.0, "routers_loss": 0.006423915736377239, "skip_count": 0.0, "step": 6978, "text_loss": 0.22414511442184448 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.77017904314646, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.000289493791098972, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 11257721.0, "repeat_count": 0.0, "routers_loss": 0.002536606043577194, "skip_count": 0.0, "step": 6980, "text_loss": 0.1328018754720688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.77957147050191, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.00028921308298428933, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 11260840.0, "repeat_count": 0.0, "routers_loss": 0.000745086173992604, "skip_count": 0.0, "step": 6982, "text_loss": 0.61724853515625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.78896389785735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.0002889324556435509, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 11264279.0, "repeat_count": 0.0, "routers_loss": 0.005258981604129076, "skip_count": 0.0, "step": 6984, "text_loss": 0.1664455235004425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.798356325212794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 0.00028865190918429356, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 11268096.0, "repeat_count": 0.0, "routers_loss": 0.0008756023598834872, "skip_count": 0.0, "step": 6986, "text_loss": 0.45111921429634094 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.807748752568244, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.00028837144371402336, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 11270611.0, "repeat_count": 0.0, "routers_loss": 0.0008175788098014891, "skip_count": 0.0, "step": 6988, "text_loss": 0.5332239270210266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.81714117992369, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.00028809105934021517, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 11273826.0, "repeat_count": 0.0, "routers_loss": 0.003494064789265394, "skip_count": 0.0, "step": 6990, "text_loss": 0.20264241099357605 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.82653360727913, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.0002878107561703127, "loss": 0.0056, "macro_f1": 0.8817967176437378, "num_tokens": 11276917.0, "repeat_count": 2.0, "routers_loss": 0.025257345288991928, "skip_count": 3.0, "step": 6992, "text_loss": 0.18000070750713348 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.835926034634575, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.0002875305343117289, "loss": 0.0044, "macro_f1": 0.6603773832321167, "num_tokens": 11279637.0, "repeat_count": 1.0, "routers_loss": 0.019206687808036804, "skip_count": 1.0, "step": 6994, "text_loss": 0.5872798562049866 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.00028725039387184504, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 11282717.0, "repeat_count": 0.0, "routers_loss": 0.009358765557408333, "skip_count": 1.0, "step": 6996, "text_loss": 0.3412095904350281 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 32.85471088934546, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.00028697033495801163, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 11285433.0, "repeat_count": 1.0, "routers_loss": 0.0038775671273469925, "skip_count": 1.0, "step": 6998, "text_loss": 0.4316727817058563 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 32.86410331670091, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0002866903576775475, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 11288414.0, "repeat_count": 1.0, "routers_loss": 0.004292591474950314, "skip_count": 0.0, "step": 7000, "text_loss": 0.45106515288352966 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.873495744056356, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046875, "learning_rate": 0.0002864104621377409, "loss": 0.007, "macro_f1": 0.6601307392120361, "num_tokens": 11291811.0, "repeat_count": 1.0, "routers_loss": 0.02195967361330986, "skip_count": 2.0, "step": 7002, "text_loss": 0.29841285943984985 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0002861306484458481, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 11295179.0, "repeat_count": 0.0, "routers_loss": 0.0010119527578353882, "skip_count": 0.0, "step": 7004, "text_loss": 0.5218569040298462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.89228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 0.00028585091670909436, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 11298182.0, "repeat_count": 0.0, "routers_loss": 0.002615996403619647, "skip_count": 0.0, "step": 7006, "text_loss": 0.20382621884346008 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.90167302612269, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.00028557126703467316, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 11301262.0, "repeat_count": 0.0, "routers_loss": 0.002726050792261958, "skip_count": 0.0, "step": 7008, "text_loss": 0.26718559861183167 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.91106545347813, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0002852916995297471, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 11304590.0, "repeat_count": 0.0, "routers_loss": 0.0005590448854491115, "skip_count": 0.0, "step": 7010, "text_loss": 0.5392091274261475 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.92045788083358, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02099609375, "learning_rate": 0.00028501221430144667, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 11307690.0, "repeat_count": 0.0, "routers_loss": 0.004541353322565556, "skip_count": 2.0, "step": 7012, "text_loss": 0.16159705817699432 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 32.929850308189025, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.00028473281145687137, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 11310866.0, "repeat_count": 0.0, "routers_loss": 0.0029630991630256176, "skip_count": 1.0, "step": 7014, "text_loss": 0.9148072600364685 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 32.93924273554447, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0302734375, "learning_rate": 0.0002844534911030888, "loss": 0.0067, "macro_f1": 0.9262410998344421, "num_tokens": 11314517.0, "repeat_count": 2.0, "routers_loss": 0.023258809000253677, "skip_count": 3.0, "step": 7016, "text_loss": 0.3853590488433838 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.94863516289991, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060546875, "learning_rate": 0.000284174253347135, "loss": 0.0064, "macro_f1": 0.3272727429866791, "num_tokens": 11317526.0, "repeat_count": 0.0, "routers_loss": 0.010060093365609646, "skip_count": 1.0, "step": 7018, "text_loss": 0.3412325382232666 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.00028389509829601444, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 11321684.0, "repeat_count": 0.0, "routers_loss": 0.0016713893273845315, "skip_count": 0.0, "step": 7020, "text_loss": 0.9049796462059021 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00028361602605670003, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 11324709.0, "repeat_count": 0.0, "routers_loss": 0.004167001228779554, "skip_count": 2.0, "step": 7022, "text_loss": 0.24364058673381805 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 32.97681244496625, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 0.00028333703673613224, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 11327449.0, "repeat_count": 0.0, "routers_loss": 0.0027954576071351767, "skip_count": 4.0, "step": 7024, "text_loss": 0.2872125506401062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 32.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.00028305813044122096, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 11330846.0, "repeat_count": 0.0, "routers_loss": 0.004644687287509441, "skip_count": 0.0, "step": 7026, "text_loss": 0.1717570424079895 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 32.99559729967714, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06884765625, "learning_rate": 0.00028277930727884336, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 11333575.0, "repeat_count": 0.0, "routers_loss": 0.00557848671451211, "skip_count": 2.0, "step": 7028, "text_loss": 0.3501792550086975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.004696213677725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.00028250056735584496, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 11336899.0, "repeat_count": 0.0, "routers_loss": 0.0005694970604963601, "skip_count": 0.0, "step": 7030, "text_loss": 0.5541794300079346 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.01408864103317, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.00028222191077903946, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 11340163.0, "repeat_count": 0.0, "routers_loss": 0.0032896639313548803, "skip_count": 0.0, "step": 7032, "text_loss": 0.5618721842765808 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 33.02348106838861, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.00028194333765520853, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 11343494.0, "repeat_count": 1.0, "routers_loss": 0.005377276800572872, "skip_count": 0.0, "step": 7034, "text_loss": 0.325153648853302 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030517578125, "learning_rate": 0.00028166484809110206, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 11346126.0, "repeat_count": 0.0, "routers_loss": 0.001204605447128415, "skip_count": 0.0, "step": 7036, "text_loss": 0.5016651749610901 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.00028138644219343736, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 11348879.0, "repeat_count": 0.0, "routers_loss": 0.005026837810873985, "skip_count": 2.0, "step": 7038, "text_loss": 0.2430499643087387 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.05165835045494, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.00028110812006890064, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 11352457.0, "repeat_count": 0.0, "routers_loss": 0.0019850607495754957, "skip_count": 0.0, "step": 7040, "text_loss": 0.42376917600631714 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.061050777810394, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0179443359375, "learning_rate": 0.00028082988182414524, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 11356602.0, "repeat_count": 1.0, "routers_loss": 0.003362950636073947, "skip_count": 2.0, "step": 7042, "text_loss": 0.4165397882461548 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.07044320516584, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0002805517275657926, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 11359451.0, "repeat_count": 0.0, "routers_loss": 0.0019725612364709377, "skip_count": 1.0, "step": 7044, "text_loss": 0.5597621202468872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.0002802736574004319, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 11363614.0, "repeat_count": 0.0, "routers_loss": 0.0013963640667498112, "skip_count": 0.0, "step": 7046, "text_loss": 0.6112356185913086 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 0.00027999567143462015, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 11367015.0, "repeat_count": 0.0, "routers_loss": 0.0005658161826431751, "skip_count": 0.0, "step": 7048, "text_loss": 0.4920886754989624 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 33.09862048723217, "f1_execute": 0.9756097793579102, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00027971776977488193, "loss": 0.0064, "macro_f1": 0.925203263759613, "num_tokens": 11370489.0, "repeat_count": 3.0, "routers_loss": 0.03657131269574165, "skip_count": 5.0, "step": 7050, "text_loss": 0.28003939986228943 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.10801291458761, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01708984375, "learning_rate": 0.00027943995252771017, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 11373614.0, "repeat_count": 0.0, "routers_loss": 0.004096088465303183, "skip_count": 2.0, "step": 7052, "text_loss": 0.3145081400871277 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.117405341943055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.00027916221979956457, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 11377631.0, "repeat_count": 0.0, "routers_loss": 0.0009888096246868372, "skip_count": 0.0, "step": 7054, "text_loss": 0.4898056983947754 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.126797769298506, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.00027888457169687297, "loss": 0.0065, "macro_f1": 0.6603773832321167, "num_tokens": 11380620.0, "repeat_count": 1.0, "routers_loss": 0.013347696512937546, "skip_count": 1.0, "step": 7056, "text_loss": 0.7011964917182922 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 0.00027860700832603056, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 11383297.0, "repeat_count": 0.0, "routers_loss": 0.000849733711220324, "skip_count": 1.0, "step": 7058, "text_loss": 0.4007014334201813 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.14558262400939, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.0002783295297934003, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 11386460.0, "repeat_count": 0.0, "routers_loss": 0.001546313869766891, "skip_count": 1.0, "step": 7060, "text_loss": 0.3992713689804077 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0002780521362053123, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 11389605.0, "repeat_count": 0.0, "routers_loss": 0.001045585609972477, "skip_count": 0.0, "step": 7062, "text_loss": 0.4440680146217346 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 33.16436747872028, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.00027777482766806446, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 11392105.0, "repeat_count": 1.0, "routers_loss": 0.00752411549910903, "skip_count": 0.0, "step": 7064, "text_loss": 0.20152349770069122 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 33.17375990607572, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.031982421875, "learning_rate": 0.0002774976042879218, "loss": 0.0088, "macro_f1": 0.5934640765190125, "num_tokens": 11396142.0, "repeat_count": 0.0, "routers_loss": 0.019917849451303482, "skip_count": 3.0, "step": 7066, "text_loss": 0.24365149438381195 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 33.183152333431174, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.00027722046617111696, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 11398827.0, "repeat_count": 1.0, "routers_loss": 0.0015933843096718192, "skip_count": 0.0, "step": 7068, "text_loss": 0.31948477029800415 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.19254476078662, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.00027694341342384977, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 11402623.0, "repeat_count": 0.0, "routers_loss": 0.0018986845389008522, "skip_count": 2.0, "step": 7070, "text_loss": 0.47721394896507263 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0216064453125, "learning_rate": 0.00027666644615228727, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 11405628.0, "repeat_count": 0.0, "routers_loss": 0.002975719515234232, "skip_count": 1.0, "step": 7072, "text_loss": 0.3972358703613281 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.0002763895644625637, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 11409468.0, "repeat_count": 0.0, "routers_loss": 0.005657708737999201, "skip_count": 1.0, "step": 7074, "text_loss": 0.6004229187965393 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.0002761127684607811, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 11412572.0, "repeat_count": 0.0, "routers_loss": 0.0038351903203874826, "skip_count": 2.0, "step": 7076, "text_loss": 1.0837591886520386 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 33.23011447020839, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.00027583605825300795, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 11416831.0, "repeat_count": 2.0, "routers_loss": 0.005529445596039295, "skip_count": 2.0, "step": 7078, "text_loss": 0.575986921787262 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.00027555943394528014, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 11420557.0, "repeat_count": 0.0, "routers_loss": 0.006243749521672726, "skip_count": 0.0, "step": 7080, "text_loss": 0.606263279914856 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.248899324919286, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.00027528289564360064, "loss": 0.0058, "macro_f1": 0.6603773832321167, "num_tokens": 11423471.0, "repeat_count": 1.0, "routers_loss": 0.031515009701251984, "skip_count": 1.0, "step": 7082, "text_loss": 0.19393208622932434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029296875, "learning_rate": 0.0002750064434539394, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 11426732.0, "repeat_count": 0.0, "routers_loss": 0.0005052287015132606, "skip_count": 0.0, "step": 7084, "text_loss": 0.7202399969100952 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.26768417963017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 0.00027473007748223357, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 11429391.0, "repeat_count": 0.0, "routers_loss": 0.005099403206259012, "skip_count": 1.0, "step": 7086, "text_loss": 0.20651355385780334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.27707660698562, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029296875, "learning_rate": 0.00027445379783438685, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 11432161.0, "repeat_count": 0.0, "routers_loss": 0.001447655027732253, "skip_count": 0.0, "step": 7088, "text_loss": 0.34758952260017395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.28646903434106, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.00027417760461627037, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 11435417.0, "repeat_count": 0.0, "routers_loss": 0.000808655982837081, "skip_count": 0.0, "step": 7090, "text_loss": 0.7414838671684265 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.295861461696504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.00027390149793372177, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 11438313.0, "repeat_count": 0.0, "routers_loss": 0.005151710007339716, "skip_count": 0.0, "step": 7092, "text_loss": 0.17792417109012604 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.305253889051954, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.00027362547789254574, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 11441681.0, "repeat_count": 1.0, "routers_loss": 0.0037353152874857187, "skip_count": 3.0, "step": 7094, "text_loss": 0.5577781796455383 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.3146463164074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 0.0002733495445985135, "loss": 0.0026, "macro_f1": 0.3333333432674408, "num_tokens": 11444521.0, "repeat_count": 0.0, "routers_loss": 0.00038075417978689075, "skip_count": 0.0, "step": 7096, "text_loss": 0.5052862167358398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.32403874376284, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0002730736981573632, "loss": 0.0033, "macro_f1": 0.3272727429866791, "num_tokens": 11448481.0, "repeat_count": 0.0, "routers_loss": 0.007313522044569254, "skip_count": 1.0, "step": 7098, "text_loss": 0.5869139432907104 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.0002727979386748001, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 11452164.0, "repeat_count": 0.0, "routers_loss": 0.0020673887338489294, "skip_count": 0.0, "step": 7100, "text_loss": 0.4354212284088135 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.34282359847373, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0002725222662564954, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 11455995.0, "repeat_count": 0.0, "routers_loss": 0.0008315460290759802, "skip_count": 0.0, "step": 7102, "text_loss": 0.8714128732681274 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 33.35221602582917, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0002722466810080874, "loss": 0.0053, "macro_f1": 0.6603773832321167, "num_tokens": 11458828.0, "repeat_count": 1.0, "routers_loss": 0.010913078673183918, "skip_count": 1.0, "step": 7104, "text_loss": 0.6226683855056763 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.36160845318462, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0002719711830351809, "loss": 0.0076, "macro_f1": 0.6603773832321167, "num_tokens": 11462448.0, "repeat_count": 1.0, "routers_loss": 0.040428292006254196, "skip_count": 1.0, "step": 7106, "text_loss": 0.2543688118457794 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.00027169577244334726, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 11465796.0, "repeat_count": 0.0, "routers_loss": 0.004473939072340727, "skip_count": 1.0, "step": 7108, "text_loss": 0.12356872111558914 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.00027142044933812424, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 11469176.0, "repeat_count": 0.0, "routers_loss": 0.0017961655976250768, "skip_count": 0.0, "step": 7110, "text_loss": 0.6800211668014526 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 33.38978573525095, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.0002711452138250162, "loss": 0.0065, "macro_f1": 1.0, "num_tokens": 11471983.0, "repeat_count": 2.0, "routers_loss": 0.003279087832197547, "skip_count": 2.0, "step": 7112, "text_loss": 0.340279757976532 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.3991781626064, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.00027087006600949403, "loss": 0.0065, "macro_f1": 0.6603773832321167, "num_tokens": 11475656.0, "repeat_count": 1.0, "routers_loss": 0.017024178057909012, "skip_count": 1.0, "step": 7114, "text_loss": 0.3556337058544159 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.40857058996184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0002705950059969948, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 11479410.0, "repeat_count": 0.0, "routers_loss": 0.015487123280763626, "skip_count": 1.0, "step": 7116, "text_loss": 0.4404350817203522 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.41796301731729, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019287109375, "learning_rate": 0.00027032003389292194, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 11483302.0, "repeat_count": 0.0, "routers_loss": 0.0011217560386285186, "skip_count": 0.0, "step": 7118, "text_loss": 0.46771445870399475 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.427355444672735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.0002700451498026454, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 11486212.0, "repeat_count": 0.0, "routers_loss": 0.0010832607513293624, "skip_count": 0.0, "step": 7120, "text_loss": 0.6795281767845154 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.43674787202818, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00026977035383150106, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 11489320.0, "repeat_count": 0.0, "routers_loss": 0.002290027216076851, "skip_count": 1.0, "step": 7122, "text_loss": 0.5304523706436157 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 33.44614029938362, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02490234375, "learning_rate": 0.00026949564608479164, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 11492056.0, "repeat_count": 2.0, "routers_loss": 0.009950211271643639, "skip_count": 6.0, "step": 7124, "text_loss": 0.21328973770141602 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 33.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0185546875, "learning_rate": 0.0002692210266677855, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 11495165.0, "repeat_count": 0.0, "routers_loss": 0.0079165268689394, "skip_count": 3.0, "step": 7126, "text_loss": 0.19840657711029053 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.00026894649568571724, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 11497636.0, "repeat_count": 0.0, "routers_loss": 0.0013852717820554972, "skip_count": 0.0, "step": 7128, "text_loss": 0.3360055088996887 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.47431758144996, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.00026867205324378776, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 11500806.0, "repeat_count": 0.0, "routers_loss": 0.0010151927126571536, "skip_count": 0.0, "step": 7130, "text_loss": 0.6827390193939209 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02001953125, "learning_rate": 0.00026839769944716373, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 11504187.0, "repeat_count": 0.0, "routers_loss": 0.001110393786802888, "skip_count": 0.0, "step": 7132, "text_loss": 0.5081584453582764 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.49310243616085, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0002681234344009783, "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 11507900.0, "repeat_count": 0.0, "routers_loss": 0.010587670840322971, "skip_count": 1.0, "step": 7134, "text_loss": 0.28684356808662415 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.00026784925821033014, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 11510627.0, "repeat_count": 0.0, "routers_loss": 0.006658690981566906, "skip_count": 0.0, "step": 7136, "text_loss": 0.24232104420661926 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.00026757517098028417, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 11513304.0, "repeat_count": 0.0, "routers_loss": 0.0014556109672412276, "skip_count": 0.0, "step": 7138, "text_loss": 0.4718358516693115 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 33.52127971822718, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0281982421875, "learning_rate": 0.00026730117281587116, "loss": 0.0062, "macro_f1": 0.9265305995941162, "num_tokens": 11516593.0, "repeat_count": 1.0, "routers_loss": 0.01590067707002163, "skip_count": 3.0, "step": 7140, "text_loss": 0.2810344696044922 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.53067214558262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00026702726382208774, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 11519776.0, "repeat_count": 0.0, "routers_loss": 0.0014479428064078093, "skip_count": 0.0, "step": 7142, "text_loss": 0.48876339197158813 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.54006457293807, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0283203125, "learning_rate": 0.00026675344410389623, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 11522499.0, "repeat_count": 0.0, "routers_loss": 0.003729258431121707, "skip_count": 2.0, "step": 7144, "text_loss": 0.5350890755653381 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 33.549457000293515, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0002664797137662248, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 11525220.0, "repeat_count": 1.0, "routers_loss": 0.0015156447188928723, "skip_count": 1.0, "step": 7146, "text_loss": 0.5742373466491699 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 33.55884942764896, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.00026620607291396773, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 11527926.0, "repeat_count": 2.0, "routers_loss": 0.004842780064791441, "skip_count": 2.0, "step": 7148, "text_loss": 0.4994547665119171 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.5682418550044, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.00026593252165198455, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 11531622.0, "repeat_count": 0.0, "routers_loss": 0.0026556351222097874, "skip_count": 0.0, "step": 7150, "text_loss": 0.1567893922328949 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.577634282359845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.00026565906008510064, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 11535191.0, "repeat_count": 0.0, "routers_loss": 0.008135059848427773, "skip_count": 1.0, "step": 7152, "text_loss": 0.289173424243927 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 33.58702670971529, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.000265385688318107, "loss": 0.0083, "macro_f1": 1.0, "num_tokens": 11539060.0, "repeat_count": 1.0, "routers_loss": 0.0020754633005708456, "skip_count": 1.0, "step": 7154, "text_loss": 0.35089045763015747 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 33.59641913707074, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.0002651124064557602, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 11541662.0, "repeat_count": 1.0, "routers_loss": 0.0023738413583487272, "skip_count": 0.0, "step": 7156, "text_loss": 0.5026801228523254 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.00026483921460278227, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 11544763.0, "repeat_count": 0.0, "routers_loss": 0.003311366541311145, "skip_count": 1.0, "step": 7158, "text_loss": 0.22975654900074005 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.61520399178163, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.0002645661128638609, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 11547649.0, "repeat_count": 0.0, "routers_loss": 0.0008209354127757251, "skip_count": 0.0, "step": 7160, "text_loss": 0.32840636372566223 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 0.00026429310134364926, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 11550648.0, "repeat_count": 0.0, "routers_loss": 0.0028574815951287746, "skip_count": 0.0, "step": 7162, "text_loss": 0.23239612579345703 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0177001953125, "learning_rate": 0.00026402018014676584, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 11553790.0, "repeat_count": 0.0, "routers_loss": 0.005469404626637697, "skip_count": 1.0, "step": 7164, "text_loss": 0.22877025604248047 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.64338127384796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.0002637473493777943, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 11556802.0, "repeat_count": 1.0, "routers_loss": 0.0032242932356894016, "skip_count": 2.0, "step": 7166, "text_loss": 0.6376226544380188 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.65277370120341, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 0.00026347460914128443, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 11559607.0, "repeat_count": 1.0, "routers_loss": 0.0040627880953252316, "skip_count": 2.0, "step": 7168, "text_loss": 0.6879657506942749 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.66216612855885, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.00026320195954175043, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 11562677.0, "repeat_count": 2.0, "routers_loss": 0.020494163036346436, "skip_count": 4.0, "step": 7170, "text_loss": 0.3710069954395294 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.00026292940068367224, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 11565948.0, "repeat_count": 0.0, "routers_loss": 0.002662271959707141, "skip_count": 0.0, "step": 7172, "text_loss": 0.15041157603263855 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00026265693267149494, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 11568836.0, "repeat_count": 0.0, "routers_loss": 0.0039914860390126705, "skip_count": 1.0, "step": 7174, "text_loss": 0.5372130870819092 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.69034341062518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.00026238455560962884, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 11572542.0, "repeat_count": 0.0, "routers_loss": 0.0034708199091255665, "skip_count": 0.0, "step": 7176, "text_loss": 0.2956286072731018 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.699735837980626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022216796875, "learning_rate": 0.00026211226960244914, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 11575352.0, "repeat_count": 0.0, "routers_loss": 0.007794995326548815, "skip_count": 2.0, "step": 7178, "text_loss": 0.3691073954105377 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.70912826533607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 0.0002618400747542964, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 11579110.0, "repeat_count": 0.0, "routers_loss": 0.0009694626205600798, "skip_count": 0.0, "step": 7180, "text_loss": 0.6523211598396301 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.71852069269152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0002615679711694764, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 11582476.0, "repeat_count": 0.0, "routers_loss": 0.004227840341627598, "skip_count": 1.0, "step": 7182, "text_loss": 0.1997286081314087 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.72791312004696, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022216796875, "learning_rate": 0.00026129595895225965, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 11585685.0, "repeat_count": 0.0, "routers_loss": 0.00126146269030869, "skip_count": 0.0, "step": 7184, "text_loss": 0.486299604177475 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 33.73730554740241, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0002610240382068818, "loss": 0.006, "macro_f1": 0.8814815282821655, "num_tokens": 11588804.0, "repeat_count": 2.0, "routers_loss": 0.04553814232349396, "skip_count": 4.0, "step": 7186, "text_loss": 0.1622236669063568 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0196533203125, "learning_rate": 0.00026075220903754324, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 11591822.0, "repeat_count": 0.0, "routers_loss": 0.002460496500134468, "skip_count": 2.0, "step": 7188, "text_loss": 0.5573232173919678 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.756090402113294, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0002604804715484095, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 11594899.0, "repeat_count": 0.0, "routers_loss": 0.006854622159153223, "skip_count": 1.0, "step": 7190, "text_loss": 0.4753095507621765 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.00026020882584361094, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 11598333.0, "repeat_count": 0.0, "routers_loss": 0.001945660449564457, "skip_count": 1.0, "step": 7192, "text_loss": 0.8912903666496277 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 33.77487525682419, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.061767578125, "learning_rate": 0.0002599372720272426, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 11601814.0, "repeat_count": 4.0, "routers_loss": 0.005749753676354885, "skip_count": 1.0, "step": 7194, "text_loss": 0.6041871905326843 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0002596658102033643, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 11604661.0, "repeat_count": 0.0, "routers_loss": 0.0025942171923816204, "skip_count": 1.0, "step": 7196, "text_loss": 0.4760607182979584 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 33.793660111535075, "f1_execute": 0.9756097793579102, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.00025939444047600114, "loss": 0.0075, "macro_f1": 0.8807588815689087, "num_tokens": 11608459.0, "repeat_count": 2.0, "routers_loss": 0.020141327753663063, "skip_count": 6.0, "step": 7198, "text_loss": 0.6670252084732056 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0002591231629491423, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 11611489.0, "repeat_count": 0.0, "routers_loss": 0.005721202120184898, "skip_count": 1.0, "step": 7200, "text_loss": 0.31318753957748413 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.81244496624596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0234375, "learning_rate": 0.00025885197772674174, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 11615234.0, "repeat_count": 0.0, "routers_loss": 0.0027279339265078306, "skip_count": 1.0, "step": 7202, "text_loss": 0.25728851556777954 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.821837393601406, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00025858088491271825, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 11618892.0, "repeat_count": 0.0, "routers_loss": 0.0006987092201597989, "skip_count": 0.0, "step": 7204, "text_loss": 0.5504243969917297 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.83122982095686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00025830988461095504, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 11622237.0, "repeat_count": 0.0, "routers_loss": 0.0029056845232844353, "skip_count": 0.0, "step": 7206, "text_loss": 0.5319080948829651 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.8406222483123, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0213623046875, "learning_rate": 0.0002580389769253001, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 11624713.0, "repeat_count": 4.0, "routers_loss": 0.007346974220126867, "skip_count": 5.0, "step": 7208, "text_loss": 0.8925374746322632 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0203857421875, "learning_rate": 0.0002577681619595655, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 11628689.0, "repeat_count": 0.0, "routers_loss": 0.0004166684520896524, "skip_count": 0.0, "step": 7210, "text_loss": 0.37282413244247437 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.85940710302319, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.00025749743981752824, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 11631581.0, "repeat_count": 0.0, "routers_loss": 0.013194780796766281, "skip_count": 2.0, "step": 7212, "text_loss": 0.220115065574646 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0002572268106029295, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 11634503.0, "repeat_count": 0.0, "routers_loss": 0.0009112557163462043, "skip_count": 0.0, "step": 7214, "text_loss": 0.5631879568099976 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.878191957734074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.00025695627441947496, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 11637790.0, "repeat_count": 0.0, "routers_loss": 0.011178883723914623, "skip_count": 2.0, "step": 7216, "text_loss": 0.24482154846191406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.887584385089525, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.00025668583137083447, "loss": 0.0047, "macro_f1": 0.32098764181137085, "num_tokens": 11640806.0, "repeat_count": 0.0, "routers_loss": 0.01877705194056034, "skip_count": 2.0, "step": 7218, "text_loss": 0.2229214459657669 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.89697681244497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.0002564154815606422, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 11644479.0, "repeat_count": 0.0, "routers_loss": 0.0030277224723249674, "skip_count": 0.0, "step": 7220, "text_loss": 0.6025711894035339 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0294189453125, "learning_rate": 0.00025614522509249715, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 11647340.0, "repeat_count": 0.0, "routers_loss": 0.002354414900764823, "skip_count": 1.0, "step": 7222, "text_loss": 0.6497155427932739 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.915761667155856, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0002558750620699618, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 11650433.0, "repeat_count": 1.0, "routers_loss": 0.009801039472222328, "skip_count": 2.0, "step": 7224, "text_loss": 0.32049307227134705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0002556049925965632, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 11654451.0, "repeat_count": 0.0, "routers_loss": 0.002949854824692011, "skip_count": 0.0, "step": 7226, "text_loss": 0.17923395335674286 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 33.93454652186674, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.00025533501677579254, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 11657440.0, "repeat_count": 1.0, "routers_loss": 0.0032915703486651182, "skip_count": 1.0, "step": 7228, "text_loss": 0.60064297914505 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 33.943938949222186, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02001953125, "learning_rate": 0.0002550651347111049, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 11660599.0, "repeat_count": 1.0, "routers_loss": 0.00594533933326602, "skip_count": 1.0, "step": 7230, "text_loss": 0.32829397916793823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 33.95333137657764, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.00025479534650591976, "loss": 0.0032, "macro_f1": 0.3333333432674408, "num_tokens": 11663387.0, "repeat_count": 0.0, "routers_loss": 0.0014214308466762304, "skip_count": 0.0, "step": 7232, "text_loss": 0.7317177653312683 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 33.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0198974609375, "learning_rate": 0.00025452565226362036, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 11666729.0, "repeat_count": 0.0, "routers_loss": 0.0056374757550656796, "skip_count": 2.0, "step": 7234, "text_loss": 0.3394623398780823 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 33.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0152587890625, "learning_rate": 0.00025425605208755406, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 11669871.0, "repeat_count": 0.0, "routers_loss": 0.006422565318644047, "skip_count": 3.0, "step": 7236, "text_loss": 0.1725512444972992 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 33.98150865864397, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0002539865460810322, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 11673008.0, "repeat_count": 1.0, "routers_loss": 0.0023537934757769108, "skip_count": 0.0, "step": 7238, "text_loss": 0.8873519897460938 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 33.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.00025371713434733, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 11675988.0, "repeat_count": 0.0, "routers_loss": 0.0026300614699721336, "skip_count": 1.0, "step": 7240, "text_loss": 0.4877084195613861 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 34.0, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.028076171875, "learning_rate": 0.0002534478169896864, "loss": 0.0052, "macro_f1": 0.9265305995941162, "num_tokens": 11679068.0, "repeat_count": 1.0, "routers_loss": 0.019549336284399033, "skip_count": 3.0, "step": 7242, "text_loss": 0.15101417899131775 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.00939242735544, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.0002531785941113044, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 11682205.0, "repeat_count": 0.0, "routers_loss": 0.007769173942506313, "skip_count": 1.0, "step": 7244, "text_loss": 0.4035153090953827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0002529094658153508, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 11685162.0, "repeat_count": 0.0, "routers_loss": 0.003636054927483201, "skip_count": 0.0, "step": 7246, "text_loss": 0.21048080921173096 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.02817728206633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048583984375, "learning_rate": 0.00025264043220495606, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 11688512.0, "repeat_count": 0.0, "routers_loss": 0.0013363865436986089, "skip_count": 0.0, "step": 7248, "text_loss": 0.6582038402557373 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.00025237149338321437, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 11691753.0, "repeat_count": 0.0, "routers_loss": 0.0005587349878624082, "skip_count": 0.0, "step": 7250, "text_loss": 0.6899203658103943 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.046962136777225, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0002521026494531835, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 11694689.0, "repeat_count": 1.0, "routers_loss": 0.006221035961061716, "skip_count": 0.0, "step": 7252, "text_loss": 0.17377600073814392 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0205078125, "learning_rate": 0.000251833900517885, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 11697950.0, "repeat_count": 0.0, "routers_loss": 0.004368607886135578, "skip_count": 1.0, "step": 7254, "text_loss": 0.4147649109363556 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.06574699148811, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.000251565246680304, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 11701214.0, "repeat_count": 0.0, "routers_loss": 0.0038269520737230778, "skip_count": 2.0, "step": 7256, "text_loss": 0.42076823115348816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.075139418843555, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.00025129668804338906, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 11703935.0, "repeat_count": 0.0, "routers_loss": 0.0011755652958527207, "skip_count": 0.0, "step": 7258, "text_loss": 0.5484340190887451 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.084531846199, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.00025102822471005247, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 11706818.0, "repeat_count": 1.0, "routers_loss": 0.00735129788517952, "skip_count": 2.0, "step": 7260, "text_loss": 0.29214802384376526 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.09392427355445, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00025075985678316983, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 11709979.0, "repeat_count": 1.0, "routers_loss": 0.0011552777141332626, "skip_count": 0.0, "step": 7262, "text_loss": 0.6514551639556885 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 34.10331670090989, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0002504915843655802, "loss": 0.0067, "macro_f1": 0.8814815282821655, "num_tokens": 11714075.0, "repeat_count": 2.0, "routers_loss": 0.01438678614795208, "skip_count": 4.0, "step": 7264, "text_loss": 0.5144859552383423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.11270912826534, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0002502234075600862, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 11717610.0, "repeat_count": 0.0, "routers_loss": 0.0027831171173602343, "skip_count": 0.0, "step": 7266, "text_loss": 0.6494308114051819 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.00024995532646945336, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 11721415.0, "repeat_count": 0.0, "routers_loss": 0.0012327058939263225, "skip_count": 0.0, "step": 7268, "text_loss": 0.5111991763114929 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 34.131493982976224, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.042236328125, "learning_rate": 0.0002496873411964113, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 11724488.0, "repeat_count": 2.0, "routers_loss": 0.003060065908357501, "skip_count": 1.0, "step": 7270, "text_loss": 0.5780492424964905 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.0002494194518436523, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 11727708.0, "repeat_count": 0.0, "routers_loss": 0.001369593315757811, "skip_count": 0.0, "step": 7272, "text_loss": 0.3151950240135193 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.15027883768712, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0291748046875, "learning_rate": 0.00024915165851383203, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 11730897.0, "repeat_count": 0.0, "routers_loss": 0.005724756047129631, "skip_count": 0.0, "step": 7274, "text_loss": 0.5267965197563171 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.15967126504256, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.00024888396130956947, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 11733870.0, "repeat_count": 1.0, "routers_loss": 0.010036137886345387, "skip_count": 0.0, "step": 7276, "text_loss": 0.5330777168273926 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00024861636033344657, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 11737413.0, "repeat_count": 0.0, "routers_loss": 0.008341848850250244, "skip_count": 2.0, "step": 7278, "text_loss": 0.25949522852897644 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.17845611975345, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.025390625, "learning_rate": 0.0002483488556880087, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 11740691.0, "repeat_count": 1.0, "routers_loss": 0.008208763785660267, "skip_count": 2.0, "step": 7280, "text_loss": 0.1867891401052475 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.18784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.000248081447475764, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 11743715.0, "repeat_count": 0.0, "routers_loss": 0.0038434381131082773, "skip_count": 0.0, "step": 7282, "text_loss": 0.4835410416126251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.197240974464336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0002478141357991838, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 11746818.0, "repeat_count": 0.0, "routers_loss": 0.0019067893736064434, "skip_count": 0.0, "step": 7284, "text_loss": 0.5959038734436035 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.20663340181978, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.00024754692076070256, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 11750160.0, "repeat_count": 0.0, "routers_loss": 0.007199060171842575, "skip_count": 0.0, "step": 7286, "text_loss": 0.5068115592002869 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.21602582917523, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0002472798024627175, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 11752836.0, "repeat_count": 0.0, "routers_loss": 0.0014214382972568274, "skip_count": 0.0, "step": 7288, "text_loss": 0.5742631554603577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.22541825653067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.0002470127810075889, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 11756276.0, "repeat_count": 0.0, "routers_loss": 0.0018025166355073452, "skip_count": 0.0, "step": 7290, "text_loss": 0.6616888642311096 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.23481068388612, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 0.00024674585649763983, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 11760235.0, "repeat_count": 1.0, "routers_loss": 0.0024077212437987328, "skip_count": 0.0, "step": 7292, "text_loss": 0.7984768748283386 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06494140625, "learning_rate": 0.00024647902903515614, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 11763430.0, "repeat_count": 0.0, "routers_loss": 0.007843999192118645, "skip_count": 1.0, "step": 7294, "text_loss": 0.1943647861480713 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.253595538597004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0002462122987223869, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 11766583.0, "repeat_count": 0.0, "routers_loss": 0.0019727738108485937, "skip_count": 0.0, "step": 7296, "text_loss": 0.43924200534820557 }, { "acc_repeat": 1.0, "acc_skip": 0.6000000238418579, "avg_layers": 27.0, "epoch": 34.26298796595245, "f1_execute": 0.9545454382896423, "f1_repeat": 1.0, "f1_skip": 0.75, "grad_norm": 0.041015625, "learning_rate": 0.0002459456656615436, "loss": 0.0069, "macro_f1": 0.9015151858329773, "num_tokens": 11770360.0, "repeat_count": 2.0, "routers_loss": 0.04594529792666435, "skip_count": 5.0, "step": 7298, "text_loss": 0.32582250237464905 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.2723803933079, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0002456791299548004, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 11773239.0, "repeat_count": 1.0, "routers_loss": 0.0011880286037921906, "skip_count": 0.0, "step": 7300, "text_loss": 0.7723727226257324 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.00024541269170429435, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 11776945.0, "repeat_count": 0.0, "routers_loss": 0.0010577787179499865, "skip_count": 0.0, "step": 7302, "text_loss": 0.8173839449882507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0002451463510121252, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 11780121.0, "repeat_count": 0.0, "routers_loss": 0.0019757342524826527, "skip_count": 0.0, "step": 7304, "text_loss": 0.4015064239501953 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.000244880107980355, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 11783172.0, "repeat_count": 0.0, "routers_loss": 0.002577328821644187, "skip_count": 0.0, "step": 7306, "text_loss": 0.5465171933174133 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.30995010272967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020751953125, "learning_rate": 0.00024461396271100876, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 11788608.0, "repeat_count": 0.0, "routers_loss": 0.004162502940744162, "skip_count": 0.0, "step": 7308, "text_loss": 0.2419646978378296 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.319342530085116, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.0002443479153060735, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 11791912.0, "repeat_count": 0.0, "routers_loss": 0.003301614662632346, "skip_count": 0.0, "step": 7310, "text_loss": 0.2568489909172058 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.32873495744057, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.00024408196586749964, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 11794849.0, "repeat_count": 0.0, "routers_loss": 0.0019893983844667673, "skip_count": 0.0, "step": 7312, "text_loss": 0.7044196128845215 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.33812738479601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0002438161144971992, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 11797587.0, "repeat_count": 0.0, "routers_loss": 0.006637922488152981, "skip_count": 1.0, "step": 7314, "text_loss": 0.6863232254981995 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 0.000243550361297047, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 11800173.0, "repeat_count": 0.0, "routers_loss": 0.003078785724937916, "skip_count": 2.0, "step": 7316, "text_loss": 0.2868897616863251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.3569122395069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 0.00024328470636888005, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 11802889.0, "repeat_count": 0.0, "routers_loss": 0.0011882453691214323, "skip_count": 0.0, "step": 7318, "text_loss": 0.5522798299789429 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0002430191498144979, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 11805607.0, "repeat_count": 0.0, "routers_loss": 0.0008720619371160865, "skip_count": 0.0, "step": 7320, "text_loss": 0.5531370639801025 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.375697094217784, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.00024275369173566236, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 11808838.0, "repeat_count": 1.0, "routers_loss": 0.003213440766558051, "skip_count": 0.0, "step": 7322, "text_loss": 0.5252627730369568 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.385089521573235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04345703125, "learning_rate": 0.00024248833223409715, "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 11811965.0, "repeat_count": 0.0, "routers_loss": 0.004736232105642557, "skip_count": 1.0, "step": 7324, "text_loss": 0.6033701300621033 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.39448194892868, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038330078125, "learning_rate": 0.00024222307141148907, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 11814832.0, "repeat_count": 0.0, "routers_loss": 0.0007559265359304845, "skip_count": 0.0, "step": 7326, "text_loss": 0.5607737302780151 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0230712890625, "learning_rate": 0.00024195790936948626, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 11818802.0, "repeat_count": 0.0, "routers_loss": 0.005338212475180626, "skip_count": 2.0, "step": 7328, "text_loss": 0.20618735253810883 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 34.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0002416928462096994, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 11821998.0, "repeat_count": 0.0, "routers_loss": 0.001919696107506752, "skip_count": 3.0, "step": 7330, "text_loss": 0.42486369609832764 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.00024142788203370107, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 11824505.0, "repeat_count": 0.0, "routers_loss": 0.0013797834981232882, "skip_count": 0.0, "step": 7332, "text_loss": 0.48403388261795044 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.43205165835045, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.00024116301694302621, "loss": 0.0053, "macro_f1": 0.3272727429866791, "num_tokens": 11828504.0, "repeat_count": 0.0, "routers_loss": 0.008978237397968769, "skip_count": 1.0, "step": 7334, "text_loss": 0.43872755765914917 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.441444085705896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 0.00024089825103917152, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 11831171.0, "repeat_count": 0.0, "routers_loss": 0.004589964635670185, "skip_count": 1.0, "step": 7336, "text_loss": 0.5126842260360718 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.45083651306135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.00024063358442359572, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 11834387.0, "repeat_count": 0.0, "routers_loss": 0.002857893006876111, "skip_count": 0.0, "step": 7338, "text_loss": 0.7521272301673889 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0002403690171977197, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 11838693.0, "repeat_count": 0.0, "routers_loss": 0.0009023012826219201, "skip_count": 0.0, "step": 7340, "text_loss": 0.6335242390632629 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.469621367772234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.00024010454946292586, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 11841882.0, "repeat_count": 1.0, "routers_loss": 0.010992717929184437, "skip_count": 0.0, "step": 7342, "text_loss": 0.64045649766922 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.47901379512768, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0002398401813205592, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 11845181.0, "repeat_count": 0.0, "routers_loss": 0.002247930970042944, "skip_count": 2.0, "step": 7344, "text_loss": 0.31022098660469055 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.48840622248312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.00023957591287192577, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 11848537.0, "repeat_count": 0.0, "routers_loss": 0.003184020286425948, "skip_count": 2.0, "step": 7346, "text_loss": 0.5709269642829895 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.497798649838565, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.00023931174421829376, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 11851437.0, "repeat_count": 2.0, "routers_loss": 0.006582654081285, "skip_count": 4.0, "step": 7348, "text_loss": 0.3547070026397705 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.507191077194015, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 0.00023904767546089318, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 11854161.0, "repeat_count": 1.0, "routers_loss": 0.0022124287206679583, "skip_count": 0.0, "step": 7350, "text_loss": 0.6984702348709106 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.51658350454946, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.00023878370670091565, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 11856811.0, "repeat_count": 1.0, "routers_loss": 0.0029868825804442167, "skip_count": 0.0, "step": 7352, "text_loss": 0.25389090180397034 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.5259759319049, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01708984375, "learning_rate": 0.00023851983803951444, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 11860110.0, "repeat_count": 0.0, "routers_loss": 0.0028468978125602007, "skip_count": 1.0, "step": 7354, "text_loss": 0.5729252099990845 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.535368359260346, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.00023825606957780454, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 11863058.0, "repeat_count": 1.0, "routers_loss": 0.003115740604698658, "skip_count": 2.0, "step": 7356, "text_loss": 0.60753333568573 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.00023799240141686258, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 11865865.0, "repeat_count": 0.0, "routers_loss": 0.0022254586219787598, "skip_count": 0.0, "step": 7358, "text_loss": 0.2568866014480591 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.55415321397123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01904296875, "learning_rate": 0.00023772883365772658, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 11869133.0, "repeat_count": 0.0, "routers_loss": 0.0017388637643307447, "skip_count": 0.0, "step": 7360, "text_loss": 0.7657097578048706 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.563545641326684, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.00023746536640139633, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 11872988.0, "repeat_count": 0.0, "routers_loss": 0.002158832037821412, "skip_count": 0.0, "step": 7362, "text_loss": 0.19717472791671753 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.57293806868213, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.00023720199974883294, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 11875810.0, "repeat_count": 0.0, "routers_loss": 0.001037398586049676, "skip_count": 0.0, "step": 7364, "text_loss": 0.47334593534469604 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 34.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 0.00023693873380095876, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 11878558.0, "repeat_count": 0.0, "routers_loss": 0.011853457428514957, "skip_count": 5.0, "step": 7366, "text_loss": 0.2567826211452484 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.591722923393014, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.01806640625, "learning_rate": 0.00023667556865865824, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 11881473.0, "repeat_count": 1.0, "routers_loss": 0.0015339091187343001, "skip_count": 0.0, "step": 7368, "text_loss": 0.40981143712997437 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.60111535074846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 0.00023641250442277655, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 11885033.0, "repeat_count": 1.0, "routers_loss": 0.010062574408948421, "skip_count": 0.0, "step": 7370, "text_loss": 0.3183043301105499 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.6105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022216796875, "learning_rate": 0.00023614954119412042, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 11889136.0, "repeat_count": 0.0, "routers_loss": 0.0010769609361886978, "skip_count": 0.0, "step": 7372, "text_loss": 0.5279555916786194 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 34.619900205459345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0196533203125, "learning_rate": 0.00023588667907345785, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 11893102.0, "repeat_count": 0.0, "routers_loss": 0.0032862431835383177, "skip_count": 3.0, "step": 7374, "text_loss": 0.5425930023193359 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 34.629292632814796, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0341796875, "learning_rate": 0.00023562391816151808, "loss": 0.0057, "macro_f1": 0.5934640765190125, "num_tokens": 11895841.0, "repeat_count": 0.0, "routers_loss": 0.02405562624335289, "skip_count": 3.0, "step": 7376, "text_loss": 0.26054954528808594 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.63868506017024, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00023536125855899153, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 11899594.0, "repeat_count": 1.0, "routers_loss": 0.008315852843225002, "skip_count": 3.0, "step": 7378, "text_loss": 0.19068174064159393 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 34.64807748752568, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.00023509870036652998, "loss": 0.0065, "macro_f1": 1.0, "num_tokens": 11902843.0, "repeat_count": 1.0, "routers_loss": 0.006180883850902319, "skip_count": 4.0, "step": 7380, "text_loss": 0.18461982905864716 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.00023483624368474614, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 11905786.0, "repeat_count": 0.0, "routers_loss": 0.0008856299100443721, "skip_count": 0.0, "step": 7382, "text_loss": 0.5216618180274963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.66686234223657, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.00023457388861421397, "loss": 0.0059, "macro_f1": 0.32098764181137085, "num_tokens": 11908706.0, "repeat_count": 1.0, "routers_loss": 0.04762765392661095, "skip_count": 1.0, "step": 7384, "text_loss": 0.25329193472862244 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 34.67625476959201, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03466796875, "learning_rate": 0.00023431163525546833, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 11911862.0, "repeat_count": 1.0, "routers_loss": 0.000989250373095274, "skip_count": 1.0, "step": 7386, "text_loss": 0.2657507658004761 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.685647196947464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01708984375, "learning_rate": 0.0002340494837090053, "loss": 0.0032, "macro_f1": 0.3333333432674408, "num_tokens": 11915483.0, "repeat_count": 0.0, "routers_loss": 0.0008857969660311937, "skip_count": 0.0, "step": 7388, "text_loss": 0.5136669874191284 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.69503962430291, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 0.00023378743407528164, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 11918778.0, "repeat_count": 0.0, "routers_loss": 0.0041572838090360165, "skip_count": 1.0, "step": 7390, "text_loss": 0.5212553143501282 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 0.00023352548645471556, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 11921916.0, "repeat_count": 0.0, "routers_loss": 0.0010537431808188558, "skip_count": 0.0, "step": 7392, "text_loss": 0.48122525215148926 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.713824479013795, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 0.00023326364094768576, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 11924273.0, "repeat_count": 1.0, "routers_loss": 0.004077036865055561, "skip_count": 0.0, "step": 7394, "text_loss": 0.2128690630197525 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021240234375, "learning_rate": 0.00023300189765453194, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 11927424.0, "repeat_count": 0.0, "routers_loss": 0.005371362902224064, "skip_count": 2.0, "step": 7396, "text_loss": 0.19448284804821014 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.73260933372468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.00023274025667555464, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 11930919.0, "repeat_count": 0.0, "routers_loss": 0.002137752715498209, "skip_count": 0.0, "step": 7398, "text_loss": 0.7537064552307129 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.00023247871811101512, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 11933680.0, "repeat_count": 0.0, "routers_loss": 0.0002398790093138814, "skip_count": 0.0, "step": 7400, "text_loss": 0.5589297413825989 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.751394188435576, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.00023221728206113546, "loss": 0.008, "macro_f1": 0.3272727429866791, "num_tokens": 11937090.0, "repeat_count": 0.0, "routers_loss": 0.019718777388334274, "skip_count": 1.0, "step": 7402, "text_loss": 0.8014751672744751 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0191650390625, "learning_rate": 0.0002319559486260985, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 11940581.0, "repeat_count": 0.0, "routers_loss": 0.001230534864589572, "skip_count": 0.0, "step": 7404, "text_loss": 0.5218383073806763 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.77017904314646, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0002316947179060477, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 11943832.0, "repeat_count": 0.0, "routers_loss": 0.0016393321566283703, "skip_count": 0.0, "step": 7406, "text_loss": 0.17122556269168854 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.77957147050191, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.00023143359000108704, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 11947025.0, "repeat_count": 0.0, "routers_loss": 0.005269679240882397, "skip_count": 2.0, "step": 7408, "text_loss": 0.2015499323606491 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 34.78896389785735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0208740234375, "learning_rate": 0.00023117256501128136, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 11950077.0, "repeat_count": 1.0, "routers_loss": 0.005140089895576239, "skip_count": 2.0, "step": 7410, "text_loss": 0.39068636298179626 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.798356325212794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.00023091164303665592, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 11953800.0, "repeat_count": 0.0, "routers_loss": 0.005578748416155577, "skip_count": 0.0, "step": 7412, "text_loss": 0.18851874768733978 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.807748752568244, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.00023065082417719624, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 11956383.0, "repeat_count": 0.0, "routers_loss": 0.0006410991190932691, "skip_count": 0.0, "step": 7414, "text_loss": 0.5663703083992004 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 34.81714117992369, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0002303901085328491, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 11959554.0, "repeat_count": 0.0, "routers_loss": 0.0005902954144403338, "skip_count": 5.0, "step": 7416, "text_loss": 0.5225661993026733 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0002301294962035209, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 11962582.0, "repeat_count": 0.0, "routers_loss": 0.00045644037891179323, "skip_count": 0.0, "step": 7418, "text_loss": 0.40572360157966614 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0419921875, "learning_rate": 0.0002298689872890789, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 11965649.0, "repeat_count": 0.0, "routers_loss": 0.01017778366804123, "skip_count": 2.0, "step": 7420, "text_loss": 0.12190715968608856 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.00022960858188935052, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 11968850.0, "repeat_count": 0.0, "routers_loss": 0.0008010792662389576, "skip_count": 0.0, "step": 7422, "text_loss": 0.5606820583343506 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.85471088934546, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0002293482801041236, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 11972064.0, "repeat_count": 0.0, "routers_loss": 0.001889281440526247, "skip_count": 0.0, "step": 7424, "text_loss": 0.44142210483551025 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.00022908808203314635, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 11975466.0, "repeat_count": 0.0, "routers_loss": 0.00647713290527463, "skip_count": 2.0, "step": 7426, "text_loss": 0.23273423314094543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0002288279877761271, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 11979875.0, "repeat_count": 0.0, "routers_loss": 0.004027119372040033, "skip_count": 0.0, "step": 7428, "text_loss": 0.5608086585998535 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 0.0002285679974327345, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 11982808.0, "repeat_count": 0.0, "routers_loss": 0.0009015435934998095, "skip_count": 0.0, "step": 7430, "text_loss": 0.3976539373397827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.89228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0002283081111025973, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 11985978.0, "repeat_count": 0.0, "routers_loss": 0.00047143330448307097, "skip_count": 0.0, "step": 7432, "text_loss": 0.4280148446559906 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.90167302612269, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.00022804832888530447, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 11988925.0, "repeat_count": 0.0, "routers_loss": 0.0004895820748060942, "skip_count": 0.0, "step": 7434, "text_loss": 0.5137463808059692 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.91106545347813, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0198974609375, "learning_rate": 0.000227788650880405, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 11991631.0, "repeat_count": 0.0, "routers_loss": 0.0008349024574272335, "skip_count": 0.0, "step": 7436, "text_loss": 0.4306720197200775 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.92045788083358, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 0.00022752907718740807, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 11995476.0, "repeat_count": 0.0, "routers_loss": 0.0038723985198885202, "skip_count": 0.0, "step": 7438, "text_loss": 0.6413722038269043 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 34.929850308189025, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.043701171875, "learning_rate": 0.00022726960790578248, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 11998846.0, "repeat_count": 1.0, "routers_loss": 0.004433541093021631, "skip_count": 0.0, "step": 7440, "text_loss": 0.6424159407615662 }, { "acc_repeat": 0.0, "acc_skip": 0.8333333134651184, "avg_layers": 23.0, "epoch": 34.93924273554447, "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.036376953125, "learning_rate": 0.0002270102431349579, "loss": 0.0062, "macro_f1": 0.6289562582969666, "num_tokens": 12002228.0, "repeat_count": 0.0, "routers_loss": 0.023979803547263145, "skip_count": 6.0, "step": 7442, "text_loss": 0.16657918691635132 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 34.94863516289991, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0308837890625, "learning_rate": 0.00022675098297432307, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 12005003.0, "repeat_count": 3.0, "routers_loss": 0.005645833443850279, "skip_count": 1.0, "step": 7444, "text_loss": 0.6388722658157349 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0308837890625, "learning_rate": 0.00022649182752322705, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 12007657.0, "repeat_count": 0.0, "routers_loss": 0.001629356062039733, "skip_count": 2.0, "step": 7446, "text_loss": 0.35670006275177 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 34.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00022623277688097864, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 12010652.0, "repeat_count": 0.0, "routers_loss": 0.006375396624207497, "skip_count": 2.0, "step": 7448, "text_loss": 0.24273613095283508 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.97681244496625, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0002259738311468466, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 12014042.0, "repeat_count": 0.0, "routers_loss": 0.003734540194272995, "skip_count": 0.0, "step": 7450, "text_loss": 0.4262580871582031 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 34.98620487232169, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0002257149904200592, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 12016987.0, "repeat_count": 1.0, "routers_loss": 0.0027926203329116106, "skip_count": 1.0, "step": 7452, "text_loss": 0.366216778755188 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 34.99559729967714, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.00022545625479980508, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 12021584.0, "repeat_count": 0.0, "routers_loss": 0.0008985420572571456, "skip_count": 0.0, "step": 7454, "text_loss": 0.533937394618988 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.004696213677725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.00022519762438523205, "loss": 0.0029, "macro_f1": 0.6666666865348816, "num_tokens": 12024142.0, "repeat_count": 0.0, "routers_loss": 0.005394646432250738, "skip_count": 1.0, "step": 7456, "text_loss": 0.2401239275932312 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.01408864103317, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030517578125, "learning_rate": 0.0002249390992754477, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 12027262.0, "repeat_count": 0.0, "routers_loss": 0.00275063537992537, "skip_count": 0.0, "step": 7458, "text_loss": 0.21824975311756134 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.02348106838861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.00022468067956951944, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 12030528.0, "repeat_count": 0.0, "routers_loss": 0.0008951274212449789, "skip_count": 1.0, "step": 7460, "text_loss": 0.610903263092041 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0206298828125, "learning_rate": 0.00022442236536647408, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 12033699.0, "repeat_count": 0.0, "routers_loss": 0.004062872380018234, "skip_count": 2.0, "step": 7462, "text_loss": 0.26921433210372925 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.00022416415676529823, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 12037402.0, "repeat_count": 0.0, "routers_loss": 0.0023089025635272264, "skip_count": 1.0, "step": 7464, "text_loss": 0.4746153950691223 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.05165835045494, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021728515625, "learning_rate": 0.00022390605386493756, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 12041129.0, "repeat_count": 0.0, "routers_loss": 0.0021355501376092434, "skip_count": 2.0, "step": 7466, "text_loss": 0.4265538454055786 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.061050777810394, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 0.00022364805676429816, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 12044356.0, "repeat_count": 0.0, "routers_loss": 0.0061582159250974655, "skip_count": 1.0, "step": 7468, "text_loss": 0.12020833045244217 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.07044320516584, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.00022339016556224467, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 12047158.0, "repeat_count": 0.0, "routers_loss": 0.003753372235223651, "skip_count": 1.0, "step": 7470, "text_loss": 0.6406939625740051 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 35.07983563252128, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0224609375, "learning_rate": 0.00022313238035760158, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 12050149.0, "repeat_count": 1.0, "routers_loss": 0.005371729377657175, "skip_count": 5.0, "step": 7472, "text_loss": 0.5184400677680969 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.0002228747012491526, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 12053560.0, "repeat_count": 0.0, "routers_loss": 0.000824139395263046, "skip_count": 0.0, "step": 7474, "text_loss": 0.32644152641296387 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0002226171283356409, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 12056309.0, "repeat_count": 0.0, "routers_loss": 0.0044801668263971806, "skip_count": 1.0, "step": 7476, "text_loss": 0.7027081847190857 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.10801291458761, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02880859375, "learning_rate": 0.00022235966171576887, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 12059191.0, "repeat_count": 0.0, "routers_loss": 0.007496353704482317, "skip_count": 2.0, "step": 7478, "text_loss": 0.28705671429634094 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.117405341943055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0211181640625, "learning_rate": 0.0002221023014881982, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 12062365.0, "repeat_count": 0.0, "routers_loss": 0.0018641395727172494, "skip_count": 1.0, "step": 7480, "text_loss": 0.715477466583252 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.126797769298506, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.00022184504775154984, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 12065508.0, "repeat_count": 0.0, "routers_loss": 0.0005825075786560774, "skip_count": 0.0, "step": 7482, "text_loss": 0.7481293678283691 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00022158790060440394, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 12068043.0, "repeat_count": 0.0, "routers_loss": 0.0028906071092933416, "skip_count": 0.0, "step": 7484, "text_loss": 0.6151962876319885 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.14558262400939, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0286865234375, "learning_rate": 0.00022133086014529968, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 12070897.0, "repeat_count": 0.0, "routers_loss": 0.0030862605199217796, "skip_count": 1.0, "step": 7486, "text_loss": 0.4923575222492218 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.00022107392647273527, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 12074644.0, "repeat_count": 0.0, "routers_loss": 0.0011101154377683997, "skip_count": 0.0, "step": 7488, "text_loss": 0.5217859148979187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.16436747872028, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.00022081709968516867, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 12077718.0, "repeat_count": 0.0, "routers_loss": 0.004303969442844391, "skip_count": 0.0, "step": 7490, "text_loss": 0.18933317065238953 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.17375990607572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02392578125, "learning_rate": 0.00022056037988101612, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 12080509.0, "repeat_count": 0.0, "routers_loss": 0.0019941304344683886, "skip_count": 1.0, "step": 7492, "text_loss": 0.6760565042495728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.183152333431174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.00022030376715865313, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 12083580.0, "repeat_count": 0.0, "routers_loss": 0.0017090907786041498, "skip_count": 0.0, "step": 7494, "text_loss": 0.4140956401824951 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.19254476078662, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0002200472616164142, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 12086923.0, "repeat_count": 0.0, "routers_loss": 0.005131757352501154, "skip_count": 1.0, "step": 7496, "text_loss": 0.43287888169288635 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00021979086335259269, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 12090003.0, "repeat_count": 0.0, "routers_loss": 0.0007472267607226968, "skip_count": 0.0, "step": 7498, "text_loss": 0.6692602038383484 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.00021953457246544095, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 12092936.0, "repeat_count": 0.0, "routers_loss": 0.0012374494690448046, "skip_count": 0.0, "step": 7500, "text_loss": 0.5170100331306458 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.00021927838905317016, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 12096395.0, "repeat_count": 0.0, "routers_loss": 0.006784295197576284, "skip_count": 2.0, "step": 7502, "text_loss": 0.340880811214447 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.23011447020839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.00021902231321395017, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 12099743.0, "repeat_count": 0.0, "routers_loss": 0.0058755455538630486, "skip_count": 1.0, "step": 7504, "text_loss": 0.5299809575080872 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 0.00021876634504590985, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 12103121.0, "repeat_count": 0.0, "routers_loss": 0.010622406378388405, "skip_count": 2.0, "step": 7506, "text_loss": 0.1817338913679123 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 35.248899324919286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02392578125, "learning_rate": 0.00021851048464713662, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 12105883.0, "repeat_count": 0.0, "routers_loss": 0.004382388666272163, "skip_count": 3.0, "step": 7508, "text_loss": 0.5718557834625244 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 0.00021825473211567665, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 12108936.0, "repeat_count": 0.0, "routers_loss": 0.001638208981603384, "skip_count": 0.0, "step": 7510, "text_loss": 0.4684678316116333 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.26768417963017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.040771484375, "learning_rate": 0.00021799908754953468, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 12112060.0, "repeat_count": 0.0, "routers_loss": 0.0007894381997175515, "skip_count": 2.0, "step": 7512, "text_loss": 0.5146099328994751 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.27707660698562, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.00021774355104667455, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 12115636.0, "repeat_count": 0.0, "routers_loss": 0.01400370616465807, "skip_count": 2.0, "step": 7514, "text_loss": 0.19512294232845306 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 35.28646903434106, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 0.00021748812270501805, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 12119116.0, "repeat_count": 0.0, "routers_loss": 0.005261222366243601, "skip_count": 3.0, "step": 7516, "text_loss": 0.17316904664039612 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.295861461696504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022216796875, "learning_rate": 0.0002172328026224459, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 12122070.0, "repeat_count": 0.0, "routers_loss": 0.01021486520767212, "skip_count": 2.0, "step": 7518, "text_loss": 0.2777172029018402 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.305253889051954, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00021697759089679713, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 12125386.0, "repeat_count": 2.0, "routers_loss": 0.005217147525399923, "skip_count": 2.0, "step": 7520, "text_loss": 0.49744322896003723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.3146463164074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.00021672248762586948, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 12128753.0, "repeat_count": 0.0, "routers_loss": 0.003868246916681528, "skip_count": 0.0, "step": 7522, "text_loss": 0.4209211468696594 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 35.32403874376284, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00021646749290741895, "loss": 0.009, "macro_f1": 0.6598639488220215, "num_tokens": 12132425.0, "repeat_count": 1.0, "routers_loss": 0.044205982238054276, "skip_count": 3.0, "step": 7524, "text_loss": 0.4180344343185425 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.00021621260683916005, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 12135740.0, "repeat_count": 0.0, "routers_loss": 0.0032584366854280233, "skip_count": 2.0, "step": 7526, "text_loss": 0.21219655871391296 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.34282359847373, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02294921875, "learning_rate": 0.00021595782951876552, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 12139239.0, "repeat_count": 0.0, "routers_loss": 0.002418758114799857, "skip_count": 2.0, "step": 7528, "text_loss": 0.40800613164901733 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.35221602582917, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 0.0002157031610438665, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 12142572.0, "repeat_count": 1.0, "routers_loss": 0.005265383515506983, "skip_count": 1.0, "step": 7530, "text_loss": 0.7539705634117126 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.36160845318462, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0002154486015120525, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 12145737.0, "repeat_count": 1.0, "routers_loss": 0.006648020353168249, "skip_count": 2.0, "step": 7532, "text_loss": 0.7824432253837585 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.371000880540066, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0002151941510208712, "loss": 0.0049, "macro_f1": 0.3272727429866791, "num_tokens": 12149376.0, "repeat_count": 1.0, "routers_loss": 0.01692759431898594, "skip_count": 0.0, "step": 7534, "text_loss": 0.4476291239261627 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 35.38039330789551, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0002149398096678283, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 12152191.0, "repeat_count": 1.0, "routers_loss": 0.013883143663406372, "skip_count": 0.0, "step": 7536, "text_loss": 0.14996720850467682 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.38978573525095, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.058837890625, "learning_rate": 0.00021468557755038826, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 12155084.0, "repeat_count": 2.0, "routers_loss": 0.009390740655362606, "skip_count": 2.0, "step": 7538, "text_loss": 0.23685340583324432 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.3991781626064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0002144314547659731, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 12159366.0, "repeat_count": 0.0, "routers_loss": 0.0025363171007484198, "skip_count": 0.0, "step": 7540, "text_loss": 0.6687407493591309 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.40857058996184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0205078125, "learning_rate": 0.00021417744141196315, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 12162545.0, "repeat_count": 0.0, "routers_loss": 0.004230613354593515, "skip_count": 1.0, "step": 7542, "text_loss": 0.24885894358158112 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 35.41796301731729, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.01953125, "learning_rate": 0.00021392353758569694, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 12165381.0, "repeat_count": 1.0, "routers_loss": 0.008058524690568447, "skip_count": 0.0, "step": 7544, "text_loss": 0.15833988785743713 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.427355444672735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0002136697433844707, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 12168304.0, "repeat_count": 0.0, "routers_loss": 0.0018041770672425628, "skip_count": 0.0, "step": 7546, "text_loss": 0.6046217083930969 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.43674787202818, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029541015625, "learning_rate": 0.00021341605890553894, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 12171040.0, "repeat_count": 1.0, "routers_loss": 0.008584463968873024, "skip_count": 2.0, "step": 7548, "text_loss": 0.3001522719860077 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0205078125, "learning_rate": 0.00021316248424611408, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 12174702.0, "repeat_count": 0.0, "routers_loss": 0.0010506469989195466, "skip_count": 0.0, "step": 7550, "text_loss": 0.2998376488685608 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0162353515625, "learning_rate": 0.00021290901950336627, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 12178388.0, "repeat_count": 0.0, "routers_loss": 0.0012753128539770842, "skip_count": 0.0, "step": 7552, "text_loss": 0.8125656843185425 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.00021265566477442384, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 12181863.0, "repeat_count": 0.0, "routers_loss": 0.004343052394688129, "skip_count": 2.0, "step": 7554, "text_loss": 0.14004671573638916 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 35.47431758144996, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.00021240242015637268, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 12185485.0, "repeat_count": 1.0, "routers_loss": 0.0005794052849523723, "skip_count": 0.0, "step": 7556, "text_loss": 0.7116519808769226 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.4837100088054, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.00021214928574625664, "loss": 0.0063, "macro_f1": 0.3272727429866791, "num_tokens": 12188914.0, "repeat_count": 1.0, "routers_loss": 0.01066325418651104, "skip_count": 0.0, "step": 7558, "text_loss": 0.4664429724216461 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.49310243616085, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.00021189626164107718, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 12193042.0, "repeat_count": 0.0, "routers_loss": 0.0011769415577873588, "skip_count": 0.0, "step": 7560, "text_loss": 0.672637403011322 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.50249486351629, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 0.00021164334793779388, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 12195675.0, "repeat_count": 1.0, "routers_loss": 0.008653911761939526, "skip_count": 1.0, "step": 7562, "text_loss": 0.5301182866096497 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.00021139054473332357, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 12198638.0, "repeat_count": 0.0, "routers_loss": 0.0058176578022539616, "skip_count": 0.0, "step": 7564, "text_loss": 0.1889677792787552 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.000211137852124541, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 12202312.0, "repeat_count": 0.0, "routers_loss": 0.0004154018242843449, "skip_count": 0.0, "step": 7566, "text_loss": 0.3610386848449707 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.53067214558262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.00021088527020827848, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 12205112.0, "repeat_count": 0.0, "routers_loss": 0.0014722816413268447, "skip_count": 0.0, "step": 7568, "text_loss": 0.15214823186397552 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.54006457293807, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 0.0002106327990813257, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 12208103.0, "repeat_count": 0.0, "routers_loss": 0.0015596678713336587, "skip_count": 0.0, "step": 7570, "text_loss": 0.5034125447273254 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 35.549457000293515, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.00021038043884043022, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 12211208.0, "repeat_count": 1.0, "routers_loss": 0.007482443004846573, "skip_count": 0.0, "step": 7572, "text_loss": 0.6760116219520569 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02001953125, "learning_rate": 0.00021012818958229696, "loss": 0.0031, "macro_f1": 0.6666666865348816, "num_tokens": 12214463.0, "repeat_count": 0.0, "routers_loss": 0.003875598544254899, "skip_count": 2.0, "step": 7574, "text_loss": 0.3278147876262665 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.5682418550044, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.00020987605140358824, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 12218199.0, "repeat_count": 0.0, "routers_loss": 0.007918627932667732, "skip_count": 2.0, "step": 7576, "text_loss": 0.23850615322589874 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.577634282359845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029541015625, "learning_rate": 0.00020962402440092388, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 12221151.0, "repeat_count": 0.0, "routers_loss": 0.005424308590590954, "skip_count": 1.0, "step": 7578, "text_loss": 0.5670642256736755 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.58702670971529, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0002093721086708812, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 12224789.0, "repeat_count": 1.0, "routers_loss": 0.0066504343412816525, "skip_count": 1.0, "step": 7580, "text_loss": 0.30404478311538696 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 35.59641913707074, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.00020912030430999452, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 12228134.0, "repeat_count": 1.0, "routers_loss": 0.008815597742795944, "skip_count": 0.0, "step": 7582, "text_loss": 0.32522889971733093 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 35.60581156442618, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05126953125, "learning_rate": 0.0002088686114147561, "loss": 0.0098, "macro_f1": 0.5492662787437439, "num_tokens": 12231335.0, "repeat_count": 0.0, "routers_loss": 0.03785836696624756, "skip_count": 2.0, "step": 7584, "text_loss": 0.6277920603752136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.61520399178163, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.00020861703008161504, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 12234619.0, "repeat_count": 0.0, "routers_loss": 0.0016183801926672459, "skip_count": 0.0, "step": 7586, "text_loss": 0.38319316506385803 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.62459641913707, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.042236328125, "learning_rate": 0.00020836556040697767, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 12237296.0, "repeat_count": 1.0, "routers_loss": 0.013077575713396072, "skip_count": 1.0, "step": 7588, "text_loss": 0.297571063041687 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.00020811420248720769, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 12240633.0, "repeat_count": 0.0, "routers_loss": 0.002858756808564067, "skip_count": 0.0, "step": 7590, "text_loss": 0.2506035268306732 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.000207862956418626, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 12244118.0, "repeat_count": 0.0, "routers_loss": 0.0032624071463942528, "skip_count": 1.0, "step": 7592, "text_loss": 0.19843827188014984 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.65277370120341, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.056640625, "learning_rate": 0.00020761182229751045, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 12247367.0, "repeat_count": 1.0, "routers_loss": 0.005885142367333174, "skip_count": 3.0, "step": 7594, "text_loss": 0.3347153067588806 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 35.66216612855885, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0235595703125, "learning_rate": 0.00020736080022009602, "loss": 0.0088, "macro_f1": 0.9452888369560242, "num_tokens": 12250487.0, "repeat_count": 1.0, "routers_loss": 0.021491389721632004, "skip_count": 4.0, "step": 7596, "text_loss": 0.6777212619781494 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 35.671558555914295, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.00020710989028257514, "loss": 0.0061, "macro_f1": 0.6595745086669922, "num_tokens": 12253834.0, "repeat_count": 1.0, "routers_loss": 0.014164486899971962, "skip_count": 4.0, "step": 7598, "text_loss": 0.741127610206604 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0002068590925810968, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 12257289.0, "repeat_count": 0.0, "routers_loss": 0.0012773120542988181, "skip_count": 0.0, "step": 7600, "text_loss": 0.5336982607841492 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.69034341062518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031005859375, "learning_rate": 0.0002066084072117672, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 12260825.0, "repeat_count": 0.0, "routers_loss": 0.013102042488753796, "skip_count": 2.0, "step": 7602, "text_loss": 0.30410775542259216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.699735837980626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.00020635783427064942, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 12264609.0, "repeat_count": 0.0, "routers_loss": 0.002602101070806384, "skip_count": 0.0, "step": 7604, "text_loss": 0.29835572838783264 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.70912826533607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00020610737385376348, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 12267537.0, "repeat_count": 0.0, "routers_loss": 0.0053265830501914024, "skip_count": 0.0, "step": 7606, "text_loss": 0.2095658779144287 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.71852069269152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.00020585702605708628, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 12271175.0, "repeat_count": 0.0, "routers_loss": 0.000614096992649138, "skip_count": 0.0, "step": 7608, "text_loss": 0.8146751523017883 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.72791312004696, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.00020560679097655137, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 12274067.0, "repeat_count": 0.0, "routers_loss": 0.0013201923575252295, "skip_count": 0.0, "step": 7610, "text_loss": 0.40818271040916443 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.73730554740241, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 0.0002053566687080497, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 12276946.0, "repeat_count": 0.0, "routers_loss": 0.004304401110857725, "skip_count": 1.0, "step": 7612, "text_loss": 0.7063660025596619 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0294189453125, "learning_rate": 0.0002051066593474284, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 12279760.0, "repeat_count": 0.0, "routers_loss": 0.0032060579396784306, "skip_count": 1.0, "step": 7614, "text_loss": 0.23671887814998627 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.756090402113294, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00020485676299049154, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 12282737.0, "repeat_count": 0.0, "routers_loss": 0.005103024188429117, "skip_count": 2.0, "step": 7616, "text_loss": 0.17571020126342773 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.76548282946874, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0242919921875, "learning_rate": 0.00020460697973299986, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 12286290.0, "repeat_count": 1.0, "routers_loss": 0.007189507596194744, "skip_count": 1.0, "step": 7618, "text_loss": 0.30872994661331177 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.77487525682419, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0002043573096706708, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 12289458.0, "repeat_count": 0.0, "routers_loss": 0.0010217712260782719, "skip_count": 0.0, "step": 7620, "text_loss": 0.5155487060546875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.0002041077528991784, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 12292846.0, "repeat_count": 0.0, "routers_loss": 0.0022399788722395897, "skip_count": 1.0, "step": 7622, "text_loss": 0.717949390411377 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0002038583095141532, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 12295673.0, "repeat_count": 0.0, "routers_loss": 0.0018168877577409148, "skip_count": 0.0, "step": 7624, "text_loss": 0.560361385345459 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0279541015625, "learning_rate": 0.00020360897961118246, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 12298624.0, "repeat_count": 0.0, "routers_loss": 0.0008487844606861472, "skip_count": 0.0, "step": 7626, "text_loss": 0.6391524076461792 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.81244496624596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.00020335976328580984, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 12302136.0, "repeat_count": 0.0, "routers_loss": 0.0006127831293269992, "skip_count": 0.0, "step": 7628, "text_loss": 0.5932226777076721 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.821837393601406, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.00020311066063353556, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 12305152.0, "repeat_count": 0.0, "routers_loss": 0.0018765819258987904, "skip_count": 0.0, "step": 7630, "text_loss": 0.37831631302833557 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.83122982095686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00020286167174981618, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 12307771.0, "repeat_count": 0.0, "routers_loss": 0.0025384656619280577, "skip_count": 0.0, "step": 7632, "text_loss": 0.34806445240974426 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.8406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0002026127967300645, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 12310921.0, "repeat_count": 0.0, "routers_loss": 0.008239032700657845, "skip_count": 2.0, "step": 7634, "text_loss": 0.34859901666641235 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00020236403566965027, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 12314200.0, "repeat_count": 0.0, "routers_loss": 0.0029505928978323936, "skip_count": 2.0, "step": 7636, "text_loss": 0.2647531032562256 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 35.85940710302319, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0002021153886638991, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 12319221.0, "repeat_count": 1.0, "routers_loss": 0.0014016951899975538, "skip_count": 0.0, "step": 7638, "text_loss": 0.42428603768348694 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 35.86879953037863, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.04248046875, "learning_rate": 0.00020186685580809288, "loss": 0.0059, "macro_f1": 0.5492662787437439, "num_tokens": 12322204.0, "repeat_count": 0.0, "routers_loss": 0.01761031709611416, "skip_count": 2.0, "step": 7640, "text_loss": 0.25929757952690125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.878191957734074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.00020161843719746997, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 12324750.0, "repeat_count": 0.0, "routers_loss": 0.0023674629628658295, "skip_count": 0.0, "step": 7642, "text_loss": 0.567159116268158 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.887584385089525, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0002013701329272248, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 12327933.0, "repeat_count": 0.0, "routers_loss": 0.004534341394901276, "skip_count": 0.0, "step": 7644, "text_loss": 0.4765215516090393 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.89697681244497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.00020112194309250797, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 12330847.0, "repeat_count": 0.0, "routers_loss": 0.003144246758893132, "skip_count": 2.0, "step": 7646, "text_loss": 0.39837369322776794 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.90636923980041, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.019287109375, "learning_rate": 0.00020087386778842642, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 12333782.0, "repeat_count": 1.0, "routers_loss": 0.008137194439768791, "skip_count": 1.0, "step": 7648, "text_loss": 0.42175763845443726 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.915761667155856, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.021484375, "learning_rate": 0.00020062590711004296, "loss": 0.0034, "macro_f1": 1.0, "num_tokens": 12336837.0, "repeat_count": 1.0, "routers_loss": 0.006499455776065588, "skip_count": 1.0, "step": 7650, "text_loss": 0.18695278465747833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.00020037806115237667, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 12340414.0, "repeat_count": 0.0, "routers_loss": 0.001548365456983447, "skip_count": 0.0, "step": 7652, "text_loss": 0.1981094628572464 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 35.93454652186674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.00020013033001040255, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 12343209.0, "repeat_count": 0.0, "routers_loss": 0.008136926218867302, "skip_count": 2.0, "step": 7654, "text_loss": 0.2231602668762207 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.943938949222186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 0.00019988271377905165, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 12346158.0, "repeat_count": 0.0, "routers_loss": 0.00370375020429492, "skip_count": 1.0, "step": 7656, "text_loss": 0.4809921383857727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 35.95333137657764, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.00019963521255321077, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 12349279.0, "repeat_count": 0.0, "routers_loss": 0.00690054427832365, "skip_count": 3.0, "step": 7658, "text_loss": 0.40473970770835876 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 35.96272380393308, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 0.0001993878264277233, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 12352848.0, "repeat_count": 1.0, "routers_loss": 0.004367961548268795, "skip_count": 1.0, "step": 7660, "text_loss": 0.3646799921989441 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.00019914055549738775, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 12356737.0, "repeat_count": 0.0, "routers_loss": 0.000662159756757319, "skip_count": 0.0, "step": 7662, "text_loss": 0.3703214228153229 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 35.98150865864397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0001988933998569589, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 12360085.0, "repeat_count": 0.0, "routers_loss": 0.0023262565955519676, "skip_count": 0.0, "step": 7664, "text_loss": 0.12910836935043335 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 35.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.0001986463596011473, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 12363296.0, "repeat_count": 0.0, "routers_loss": 0.002686078194528818, "skip_count": 1.0, "step": 7666, "text_loss": 0.39628392457962036 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0208740234375, "learning_rate": 0.00019839943482461914, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 12366072.0, "repeat_count": 0.0, "routers_loss": 0.007100159768015146, "skip_count": 1.0, "step": 7668, "text_loss": 0.6588287949562073 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.00939242735544, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.00019815262562199648, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 12368940.0, "repeat_count": 0.0, "routers_loss": 0.004194926470518112, "skip_count": 0.0, "step": 7670, "text_loss": 0.36411619186401367 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0189208984375, "learning_rate": 0.00019790593208785713, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 12372031.0, "repeat_count": 0.0, "routers_loss": 0.0041313013061881065, "skip_count": 0.0, "step": 7672, "text_loss": 0.23270413279533386 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 36.02817728206633, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 0.00019765935431673444, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 12375115.0, "repeat_count": 1.0, "routers_loss": 0.003343774238601327, "skip_count": 0.0, "step": 7674, "text_loss": 0.1686355322599411 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 36.03756970942178, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.038330078125, "learning_rate": 0.00019741289240311755, "loss": 0.0058, "macro_f1": 0.6122449040412903, "num_tokens": 12379089.0, "repeat_count": 0.0, "routers_loss": 0.021328814327716827, "skip_count": 4.0, "step": 7676, "text_loss": 0.9312577247619629 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.00019716654644145104, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 12383115.0, "repeat_count": 0.0, "routers_loss": 0.0004511173174250871, "skip_count": 0.0, "step": 7678, "text_loss": 0.3305695056915283 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.00019692031652613522, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 12386064.0, "repeat_count": 0.0, "routers_loss": 0.006190002430230379, "skip_count": 0.0, "step": 7680, "text_loss": 0.4829687178134918 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 36.06574699148811, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.00019667420275152575, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 12389743.0, "repeat_count": 2.0, "routers_loss": 0.004575030412524939, "skip_count": 1.0, "step": 7682, "text_loss": 0.5751548409461975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.075139418843555, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019287109375, "learning_rate": 0.0001964282052119341, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 12392481.0, "repeat_count": 0.0, "routers_loss": 0.002718796720728278, "skip_count": 0.0, "step": 7684, "text_loss": 0.5349925756454468 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.084531846199, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0001961823240016269, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 12395207.0, "repeat_count": 0.0, "routers_loss": 0.0027528523933142424, "skip_count": 0.0, "step": 7686, "text_loss": 0.5322592258453369 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 36.09392427355445, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.00019593655921482624, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 12398232.0, "repeat_count": 1.0, "routers_loss": 0.008105970919132233, "skip_count": 0.0, "step": 7688, "text_loss": 0.3192061185836792 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 36.10331670090989, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.00019569091094570967, "loss": 0.0069, "macro_f1": 0.6603773832321167, "num_tokens": 12400862.0, "repeat_count": 1.0, "routers_loss": 0.024075545370578766, "skip_count": 1.0, "step": 7690, "text_loss": 0.3189752697944641 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 36.11270912826534, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.0001954453792884101, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 12404039.0, "repeat_count": 0.0, "routers_loss": 0.007513802964240313, "skip_count": 3.0, "step": 7692, "text_loss": 0.5985093712806702 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.12210155562078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0001951999643370157, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 12407085.0, "repeat_count": 1.0, "routers_loss": 0.009606506675481796, "skip_count": 2.0, "step": 7694, "text_loss": 0.2050790935754776 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.00019495466618556996, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 12411377.0, "repeat_count": 0.0, "routers_loss": 0.0007978329667821527, "skip_count": 0.0, "step": 7696, "text_loss": 0.4705570638179779 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.00019470948492807154, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 12414427.0, "repeat_count": 0.0, "routers_loss": 0.0010737364646047354, "skip_count": 0.0, "step": 7698, "text_loss": 0.6105324029922485 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.15027883768712, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.00019446442065847448, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 12417442.0, "repeat_count": 0.0, "routers_loss": 0.001762967323884368, "skip_count": 0.0, "step": 7700, "text_loss": 0.5638618469238281 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 0.00019421947347068774, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 12420862.0, "repeat_count": 0.0, "routers_loss": 0.0015798417152836919, "skip_count": 0.0, "step": 7702, "text_loss": 0.1939864307641983 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.00019397464345857562, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 12423876.0, "repeat_count": 0.0, "routers_loss": 0.005659835878759623, "skip_count": 1.0, "step": 7704, "text_loss": 0.20829300582408905 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 36.17845611975345, "f1_execute": 0.9777777791023254, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, "grad_norm": 0.052001953125, "learning_rate": 0.00019372993071595723, "loss": 0.0072, "macro_f1": 0.9449735879898071, "num_tokens": 12427639.0, "repeat_count": 4.0, "routers_loss": 0.018665846437215805, "skip_count": 2.0, "step": 7706, "text_loss": 0.47913849353790283 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.18784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.00019348533533660727, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 12431520.0, "repeat_count": 0.0, "routers_loss": 0.0006690093432553113, "skip_count": 0.0, "step": 7708, "text_loss": 0.494870662689209 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.197240974464336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.00019324085741425511, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 12434213.0, "repeat_count": 0.0, "routers_loss": 0.004067352041602135, "skip_count": 1.0, "step": 7710, "text_loss": 0.7631711959838867 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 36.20663340181978, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0255126953125, "learning_rate": 0.00019299649704258504, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 12437437.0, "repeat_count": 2.0, "routers_loss": 0.01157623715698719, "skip_count": 0.0, "step": 7712, "text_loss": 0.3145926296710968 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.21602582917523, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0001927522543152364, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 12440507.0, "repeat_count": 0.0, "routers_loss": 0.001888492377474904, "skip_count": 0.0, "step": 7714, "text_loss": 0.576301097869873 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.22541825653067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 0.00019250812932580352, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 12443484.0, "repeat_count": 0.0, "routers_loss": 0.00042988534551113844, "skip_count": 0.0, "step": 7716, "text_loss": 0.5716445446014404 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.23481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0211181640625, "learning_rate": 0.00019226412216783557, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 12446460.0, "repeat_count": 0.0, "routers_loss": 0.005063199903815985, "skip_count": 1.0, "step": 7718, "text_loss": 0.2700924873352051 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0001920202329348365, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 12449346.0, "repeat_count": 0.0, "routers_loss": 0.0010775640839710832, "skip_count": 0.0, "step": 7720, "text_loss": 0.5162558555603027 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.253595538597004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.00019177646172026513, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 12452680.0, "repeat_count": 0.0, "routers_loss": 0.0014514096546918154, "skip_count": 0.0, "step": 7722, "text_loss": 0.5753642916679382 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0177001953125, "learning_rate": 0.00019153280861753497, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 12455348.0, "repeat_count": 0.0, "routers_loss": 0.002202774863690138, "skip_count": 1.0, "step": 7724, "text_loss": 0.5751997232437134 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.2723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.00019128927372001454, "loss": 0.0032, "macro_f1": 0.3333333432674408, "num_tokens": 12458098.0, "repeat_count": 0.0, "routers_loss": 0.005171069409698248, "skip_count": 0.0, "step": 7726, "text_loss": 0.22252975404262543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.00019104585712102678, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 12460958.0, "repeat_count": 0.0, "routers_loss": 0.0041033923625946045, "skip_count": 0.0, "step": 7728, "text_loss": 0.18611937761306763 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 36.291165248018785, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 0.00019080255891384945, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 12463596.0, "repeat_count": 1.0, "routers_loss": 0.0012201941572129726, "skip_count": 0.0, "step": 7730, "text_loss": 0.47347909212112427 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 36.30055767537423, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0001905593791917148, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 12467021.0, "repeat_count": 2.0, "routers_loss": 0.005837214644998312, "skip_count": 2.0, "step": 7732, "text_loss": 0.2055564969778061 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.30995010272967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.00019031631804780974, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 12469743.0, "repeat_count": 0.0, "routers_loss": 0.0010269953636452556, "skip_count": 0.0, "step": 7734, "text_loss": 0.45995602011680603 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.319342530085116, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00019007337557527582, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 12473082.0, "repeat_count": 0.0, "routers_loss": 0.00436213007196784, "skip_count": 1.0, "step": 7736, "text_loss": 0.4515823721885681 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.32873495744057, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0294189453125, "learning_rate": 0.00018983055186720888, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 12476100.0, "repeat_count": 0.0, "routers_loss": 0.003051829058676958, "skip_count": 2.0, "step": 7738, "text_loss": 0.12298467755317688 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.33812738479601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0001895878470166597, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 12480231.0, "repeat_count": 0.0, "routers_loss": 0.008164191618561745, "skip_count": 2.0, "step": 7740, "text_loss": 0.17456457018852234 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.347519812151454, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046630859375, "learning_rate": 0.00018934526111663314, "loss": 0.0069, "macro_f1": 0.3272727429866791, "num_tokens": 12483894.0, "repeat_count": 0.0, "routers_loss": 0.008653721772134304, "skip_count": 1.0, "step": 7742, "text_loss": 0.7125775814056396 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 36.3569122395069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02294921875, "learning_rate": 0.00018910279426008857, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 12488077.0, "repeat_count": 0.0, "routers_loss": 0.005024447571486235, "skip_count": 6.0, "step": 7744, "text_loss": 0.833778977394104 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.00018886044653993966, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 12490999.0, "repeat_count": 0.0, "routers_loss": 0.002690888475626707, "skip_count": 0.0, "step": 7746, "text_loss": 0.15594039857387543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.375697094217784, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00018861821804905466, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 12494765.0, "repeat_count": 0.0, "routers_loss": 0.006087568122893572, "skip_count": 0.0, "step": 7748, "text_loss": 0.2696777880191803 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.385089521573235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.00018837610888025586, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 12497741.0, "repeat_count": 0.0, "routers_loss": 0.0014629303477704525, "skip_count": 0.0, "step": 7750, "text_loss": 0.6801294684410095 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.39448194892868, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11865234375, "learning_rate": 0.00018813411912631996, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 12500585.0, "repeat_count": 0.0, "routers_loss": 0.001163579523563385, "skip_count": 0.0, "step": 7752, "text_loss": 0.41069695353507996 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 36.40387437628412, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 0.00018789224887997796, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 12503579.0, "repeat_count": 2.0, "routers_loss": 0.009436148218810558, "skip_count": 0.0, "step": 7754, "text_loss": 0.6993107795715332 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.413266803639566, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.00018765049823391472, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 12506698.0, "repeat_count": 1.0, "routers_loss": 0.002098206663504243, "skip_count": 2.0, "step": 7756, "text_loss": 0.5704247951507568 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.00018740886728077, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 12509869.0, "repeat_count": 0.0, "routers_loss": 0.002066673245280981, "skip_count": 1.0, "step": 7758, "text_loss": 0.7605635523796082 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.43205165835045, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0289306640625, "learning_rate": 0.00018716735611313707, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 12513433.0, "repeat_count": 0.0, "routers_loss": 0.0023439819924533367, "skip_count": 1.0, "step": 7760, "text_loss": 0.4746153950691223 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.441444085705896, "f1_execute": 0.9767441749572754, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.00018692596482356333, "loss": 0.0057, "macro_f1": 0.9255813956260681, "num_tokens": 12516817.0, "repeat_count": 3.0, "routers_loss": 0.039019811898469925, "skip_count": 4.0, "step": 7762, "text_loss": 0.3105330467224121 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.45083651306135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0286865234375, "learning_rate": 0.00018668469350455048, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 12519357.0, "repeat_count": 0.0, "routers_loss": 0.002269966993480921, "skip_count": 0.0, "step": 7764, "text_loss": 0.3700210452079773 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.00018644354224855414, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 12522072.0, "repeat_count": 0.0, "routers_loss": 0.001265842467546463, "skip_count": 0.0, "step": 7766, "text_loss": 0.6737633943557739 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 0.00018620251114798386, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 12524999.0, "repeat_count": 0.0, "routers_loss": 0.006547329016029835, "skip_count": 1.0, "step": 7768, "text_loss": 0.24906545877456665 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.47901379512768, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0001859616002952033, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 12527785.0, "repeat_count": 2.0, "routers_loss": 0.010791841894388199, "skip_count": 3.0, "step": 7770, "text_loss": 0.3069820702075958 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.48840622248312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025390625, "learning_rate": 0.0001857208097825299, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 12530801.0, "repeat_count": 0.0, "routers_loss": 0.00492103723809123, "skip_count": 2.0, "step": 7772, "text_loss": 0.2524295151233673 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.497798649838565, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.0001854801397022351, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 12533919.0, "repeat_count": 0.0, "routers_loss": 0.001942967064678669, "skip_count": 0.0, "step": 7774, "text_loss": 0.7855241894721985 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 36.507191077194015, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.00018523959014654407, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 12537265.0, "repeat_count": 2.0, "routers_loss": 0.00987488217651844, "skip_count": 2.0, "step": 7776, "text_loss": 0.2767317593097687 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.51658350454946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.00018499916120763582, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 12539695.0, "repeat_count": 0.0, "routers_loss": 0.0054283770732581615, "skip_count": 1.0, "step": 7778, "text_loss": 0.43287888169288635 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 36.5259759319049, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 0.00018475885297764306, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 12542881.0, "repeat_count": 2.0, "routers_loss": 0.00797359924763441, "skip_count": 0.0, "step": 7780, "text_loss": 0.3738224506378174 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0001845186655486527, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 12546530.0, "repeat_count": 0.0, "routers_loss": 0.0045951665379107, "skip_count": 0.0, "step": 7782, "text_loss": 0.2511517107486725 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 36.54476078661579, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0308837890625, "learning_rate": 0.00018427859901270482, "loss": 0.0055, "macro_f1": 0.9452888369560242, "num_tokens": 12549439.0, "repeat_count": 1.0, "routers_loss": 0.02312052994966507, "skip_count": 4.0, "step": 7784, "text_loss": 0.3837030827999115 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 36.55415321397123, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.059814453125, "learning_rate": 0.00018403865346179344, "loss": 0.0066, "macro_f1": 0.9265305995941162, "num_tokens": 12553211.0, "repeat_count": 1.0, "routers_loss": 0.014698561280965805, "skip_count": 3.0, "step": 7786, "text_loss": 0.510159432888031 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 36.563545641326684, "f1_execute": 0.9743589162826538, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.00018379882898786603, "loss": 0.0075, "macro_f1": 0.8803418874740601, "num_tokens": 12556497.0, "repeat_count": 2.0, "routers_loss": 0.023926246911287308, "skip_count": 7.0, "step": 7788, "text_loss": 0.44811317324638367 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.57293806868213, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 0.00018355912568282384, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 12559778.0, "repeat_count": 0.0, "routers_loss": 0.0011187797645106912, "skip_count": 0.0, "step": 7790, "text_loss": 0.32099616527557373 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.00018331954363852166, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 12562610.0, "repeat_count": 0.0, "routers_loss": 0.0005356677575036883, "skip_count": 0.0, "step": 7792, "text_loss": 0.9754356145858765 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 36.591722923393014, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0196533203125, "learning_rate": 0.0001830800829467677, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 12565886.0, "repeat_count": 2.0, "routers_loss": 0.0017101728590205312, "skip_count": 0.0, "step": 7794, "text_loss": 0.4234761595726013 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.60111535074846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.00018284074369932386, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 12568728.0, "repeat_count": 0.0, "routers_loss": 0.0012841494753956795, "skip_count": 0.0, "step": 7796, "text_loss": 0.41109147667884827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.6105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 0.0001826015259879053, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 12572231.0, "repeat_count": 0.0, "routers_loss": 0.0022388407960534096, "skip_count": 0.0, "step": 7798, "text_loss": 0.5459926128387451 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.619900205459345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.00018236242990418074, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 12574968.0, "repeat_count": 0.0, "routers_loss": 0.0019992550369352102, "skip_count": 0.0, "step": 7800, "text_loss": 0.5028481483459473 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.629292632814796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0205078125, "learning_rate": 0.0001821234555397722, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 12579074.0, "repeat_count": 0.0, "routers_loss": 0.002936388598755002, "skip_count": 2.0, "step": 7802, "text_loss": 0.2377086579799652 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 36.63868506017024, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 0.00018188460298625503, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 12581912.0, "repeat_count": 1.0, "routers_loss": 0.0026762608904391527, "skip_count": 0.0, "step": 7804, "text_loss": 0.13887254893779755 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 36.64807748752568, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.00018164587233515824, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 12585020.0, "repeat_count": 3.0, "routers_loss": 0.003901638789102435, "skip_count": 1.0, "step": 7806, "text_loss": 0.35454171895980835 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0201416015625, "learning_rate": 0.00018140726367796373, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 12588310.0, "repeat_count": 0.0, "routers_loss": 0.0031358697451651096, "skip_count": 2.0, "step": 7808, "text_loss": 0.3567306697368622 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.66686234223657, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020263671875, "learning_rate": 0.00018116877710610673, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 12591735.0, "repeat_count": 0.0, "routers_loss": 0.002310588024556637, "skip_count": 1.0, "step": 7810, "text_loss": 0.45357072353363037 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.67625476959201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.00018093041271097582, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 12595232.0, "repeat_count": 0.0, "routers_loss": 0.005600228440016508, "skip_count": 2.0, "step": 7812, "text_loss": 0.4179847836494446 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.685647196947464, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.00018069217058391267, "loss": 0.006, "macro_f1": 0.6603773832321167, "num_tokens": 12598367.0, "repeat_count": 1.0, "routers_loss": 0.04015933722257614, "skip_count": 1.0, "step": 7814, "text_loss": 0.17874565720558167 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.69503962430291, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.00018045405081621214, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 12601864.0, "repeat_count": 0.0, "routers_loss": 0.005119446665048599, "skip_count": 1.0, "step": 7816, "text_loss": 0.6867854595184326 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.00018021605349912207, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 12605268.0, "repeat_count": 0.0, "routers_loss": 0.0005990012432448566, "skip_count": 0.0, "step": 7818, "text_loss": 0.9084970355033875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0302734375, "learning_rate": 0.00017997817872384358, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 12608093.0, "repeat_count": 0.0, "routers_loss": 0.008712377399206161, "skip_count": 1.0, "step": 7820, "text_loss": 0.19413328170776367 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.00017974042658153066, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 12611001.0, "repeat_count": 0.0, "routers_loss": 0.007535711396485567, "skip_count": 1.0, "step": 7822, "text_loss": 0.2672932744026184 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.73260933372468, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0001795027971632905, "loss": 0.0042, "macro_f1": 1.0, "num_tokens": 12614584.0, "repeat_count": 1.0, "routers_loss": 0.006770546548068523, "skip_count": 3.0, "step": 7824, "text_loss": 0.22805163264274597 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0189208984375, "learning_rate": 0.00017926529056018297, "loss": 0.0031, "macro_f1": 0.3333333432674408, "num_tokens": 12617519.0, "repeat_count": 0.0, "routers_loss": 0.0010458873584866524, "skip_count": 0.0, "step": 7826, "text_loss": 0.385499507188797 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 36.751394188435576, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 0.00017902790686322102, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 12621566.0, "repeat_count": 1.0, "routers_loss": 0.00634258147329092, "skip_count": 0.0, "step": 7828, "text_loss": 0.8044118285179138 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 36.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 0.00017879064616337076, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 12624751.0, "repeat_count": 0.0, "routers_loss": 0.0053052278235554695, "skip_count": 3.0, "step": 7830, "text_loss": 0.264322966337204 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.77017904314646, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047119140625, "learning_rate": 0.00017855350855155088, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 12628478.0, "repeat_count": 0.0, "routers_loss": 0.0028291696216911077, "skip_count": 0.0, "step": 7832, "text_loss": 0.20611460506916046 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 36.77957147050191, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.00017831649411863287, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 12632027.0, "repeat_count": 0.0, "routers_loss": 0.0009586421074345708, "skip_count": 1.0, "step": 7834, "text_loss": 0.4119716286659241 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.78896389785735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00017807960295544118, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 12635144.0, "repeat_count": 0.0, "routers_loss": 0.012304541654884815, "skip_count": 2.0, "step": 7836, "text_loss": 0.28647977113723755 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.798356325212794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0001778428351527529, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 12638719.0, "repeat_count": 0.0, "routers_loss": 0.005212076939642429, "skip_count": 2.0, "step": 7838, "text_loss": 0.630459189414978 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.807748752568244, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0001776061908012979, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 12642119.0, "repeat_count": 0.0, "routers_loss": 0.00183707510586828, "skip_count": 0.0, "step": 7840, "text_loss": 0.5905961990356445 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 36.81714117992369, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0001773696699917588, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 12645077.0, "repeat_count": 1.0, "routers_loss": 0.0058263009414076805, "skip_count": 0.0, "step": 7842, "text_loss": 0.41949576139450073 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 0.00017713327281477077, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 12648964.0, "repeat_count": 0.0, "routers_loss": 0.001586507773026824, "skip_count": 0.0, "step": 7844, "text_loss": 0.5048848390579224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0223388671875, "learning_rate": 0.00017689699936092163, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 12651934.0, "repeat_count": 0.0, "routers_loss": 0.002397194504737854, "skip_count": 0.0, "step": 7846, "text_loss": 0.23879878222942352 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 36.84531846199002, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0284423828125, "learning_rate": 0.0001766608497207518, "loss": 0.0054, "macro_f1": 0.5492662787437439, "num_tokens": 12654907.0, "repeat_count": 0.0, "routers_loss": 0.016742069274187088, "skip_count": 2.0, "step": 7848, "text_loss": 0.23400072753429413 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.85471088934546, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0001764248239847544, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 12658765.0, "repeat_count": 0.0, "routers_loss": 0.007037387229502201, "skip_count": 2.0, "step": 7850, "text_loss": 0.26165497303009033 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 36.86410331670091, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.017822265625, "learning_rate": 0.00017618892224337463, "loss": 0.0044, "macro_f1": 0.5492662787437439, "num_tokens": 12662024.0, "repeat_count": 0.0, "routers_loss": 0.017352160066366196, "skip_count": 2.0, "step": 7852, "text_loss": 0.23813043534755707 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 36.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0208740234375, "learning_rate": 0.00017595314458701084, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 12665751.0, "repeat_count": 0.0, "routers_loss": 0.005349365528672934, "skip_count": 3.0, "step": 7854, "text_loss": 0.14920757710933685 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.00017571749110601337, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 12668823.0, "repeat_count": 0.0, "routers_loss": 0.0037689812015742064, "skip_count": 2.0, "step": 7856, "text_loss": 0.2198697030544281 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.89228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.00017548196189068506, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 12672367.0, "repeat_count": 0.0, "routers_loss": 0.0006363615393638611, "skip_count": 0.0, "step": 7858, "text_loss": 0.5338839888572693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.90167302612269, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.00017524655703128112, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 12675217.0, "repeat_count": 0.0, "routers_loss": 0.002691479865461588, "skip_count": 0.0, "step": 7860, "text_loss": 0.17463763058185577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.91106545347813, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.00017501127661800908, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 12678796.0, "repeat_count": 0.0, "routers_loss": 0.002262329449877143, "skip_count": 0.0, "step": 7862, "text_loss": 0.4637797474861145 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.92045788083358, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.00017477612074102899, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 12681631.0, "repeat_count": 0.0, "routers_loss": 0.00115531450137496, "skip_count": 0.0, "step": 7864, "text_loss": 0.6089238524436951 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.929850308189025, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.00017454108949045295, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 12685647.0, "repeat_count": 0.0, "routers_loss": 0.00260268640704453, "skip_count": 0.0, "step": 7866, "text_loss": 0.5876018404960632 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.93924273554447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.00017430618295634514, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 12688995.0, "repeat_count": 0.0, "routers_loss": 0.002731681102886796, "skip_count": 0.0, "step": 7868, "text_loss": 0.35076001286506653 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 36.94863516289991, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.00017407140122872262, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 12692100.0, "repeat_count": 1.0, "routers_loss": 0.003314645728096366, "skip_count": 1.0, "step": 7870, "text_loss": 0.5313478112220764 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.958027590255355, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 0.00017383674439755393, "loss": 0.0069, "macro_f1": 0.3272727429866791, "num_tokens": 12695117.0, "repeat_count": 0.0, "routers_loss": 0.010385016910731792, "skip_count": 1.0, "step": 7872, "text_loss": 0.5092368125915527 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.00017360221255276016, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 12697678.0, "repeat_count": 0.0, "routers_loss": 0.001273582922294736, "skip_count": 0.0, "step": 7874, "text_loss": 0.5282881855964661 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 36.97681244496625, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.00017336780578421418, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 12702132.0, "repeat_count": 0.0, "routers_loss": 0.0007510313298553228, "skip_count": 0.0, "step": 7876, "text_loss": 0.49093571305274963 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 36.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 0.0001731335241817412, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 12705413.0, "repeat_count": 0.0, "routers_loss": 0.005138787440955639, "skip_count": 2.0, "step": 7878, "text_loss": 0.7503541111946106 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 36.99559729967714, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0001728993678351184, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 12708310.0, "repeat_count": 2.0, "routers_loss": 0.004379773512482643, "skip_count": 0.0, "step": 7880, "text_loss": 0.5942456126213074 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.004696213677725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0400390625, "learning_rate": 0.0001726653368340747, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 12711043.0, "repeat_count": 0.0, "routers_loss": 0.005271450616419315, "skip_count": 2.0, "step": 7882, "text_loss": 0.348360538482666 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 37.01408864103317, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.00017243143126829163, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 12714473.0, "repeat_count": 1.0, "routers_loss": 0.0015764752170071006, "skip_count": 1.0, "step": 7884, "text_loss": 0.45971861481666565 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.02348106838861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 0.000172197651227402, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 12717832.0, "repeat_count": 0.0, "routers_loss": 0.00040649910806678236, "skip_count": 0.0, "step": 7886, "text_loss": 0.5996841788291931 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.00017196399680099078, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 12720479.0, "repeat_count": 0.0, "routers_loss": 0.00473182974383235, "skip_count": 2.0, "step": 7888, "text_loss": 0.40346208214759827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.00017173046807859483, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 12723104.0, "repeat_count": 0.0, "routers_loss": 0.0020138369873166084, "skip_count": 0.0, "step": 7890, "text_loss": 0.6878634095191956 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.05165835045494, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0001714970651497027, "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 12725967.0, "repeat_count": 0.0, "routers_loss": 0.008381367661058903, "skip_count": 1.0, "step": 7892, "text_loss": 0.9161711931228638 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 37.061050777810394, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0184326171875, "learning_rate": 0.00017126378810375498, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 12728819.0, "repeat_count": 1.0, "routers_loss": 0.0037658829241991043, "skip_count": 0.0, "step": 7894, "text_loss": 0.4447716772556305 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.07044320516584, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00017103063703014372, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 12731806.0, "repeat_count": 0.0, "routers_loss": 0.0022742559667676687, "skip_count": 0.0, "step": 7896, "text_loss": 0.9140825867652893 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 0.00017079761201821298, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 12734649.0, "repeat_count": 0.0, "routers_loss": 0.002157264854758978, "skip_count": 0.0, "step": 7898, "text_loss": 0.268303781747818 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 37.089228059876724, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.021240234375, "learning_rate": 0.0001705647131572583, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 12737889.0, "repeat_count": 1.0, "routers_loss": 0.01064873393625021, "skip_count": 1.0, "step": 7900, "text_loss": 0.36009490489959717 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 37.09862048723217, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.00017033194053652685, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 12740821.0, "repeat_count": 1.0, "routers_loss": 0.0062920586206018925, "skip_count": 0.0, "step": 7902, "text_loss": 0.5301805138587952 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 37.10801291458761, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0615234375, "learning_rate": 0.00017009929424521782, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 12743876.0, "repeat_count": 1.0, "routers_loss": 0.0033694824669510126, "skip_count": 1.0, "step": 7904, "text_loss": 1.026949167251587 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.117405341943055, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 0.00016986677437248155, "loss": 0.0071, "macro_f1": 0.8817967176437378, "num_tokens": 12747623.0, "repeat_count": 2.0, "routers_loss": 0.05076088383793831, "skip_count": 3.0, "step": 7906, "text_loss": 0.33465588092803955 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.126797769298506, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.00016963438100742014, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 12751255.0, "repeat_count": 0.0, "routers_loss": 0.0005921403644606471, "skip_count": 0.0, "step": 7908, "text_loss": 0.3498881757259369 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.00016940211423908713, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 12754297.0, "repeat_count": 0.0, "routers_loss": 0.004132566973567009, "skip_count": 0.0, "step": 7910, "text_loss": 0.2874198853969574 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.14558262400939, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0001691699741564876, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 12756969.0, "repeat_count": 0.0, "routers_loss": 0.0024724705144762993, "skip_count": 1.0, "step": 7912, "text_loss": 0.10593545436859131 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.00016893796084857806, "loss": 0.0031, "macro_f1": 0.3333333432674408, "num_tokens": 12760261.0, "repeat_count": 0.0, "routers_loss": 0.002991671208292246, "skip_count": 0.0, "step": 7914, "text_loss": 0.1331545114517212 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.16436747872028, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0198974609375, "learning_rate": 0.00016870607440426643, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 12762971.0, "repeat_count": 0.0, "routers_loss": 0.0018167285015806556, "skip_count": 0.0, "step": 7916, "text_loss": 0.496826171875 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 37.17375990607572, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.00016847431491241207, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 12765949.0, "repeat_count": 1.0, "routers_loss": 0.0033364067785441875, "skip_count": 0.0, "step": 7918, "text_loss": 0.43522849678993225 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.183152333431174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0001682426824618256, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 12769201.0, "repeat_count": 0.0, "routers_loss": 0.001313596498221159, "skip_count": 0.0, "step": 7920, "text_loss": 0.8691539168357849 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.19254476078662, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 0.00016801117714126908, "loss": 0.0108, "macro_f1": 0.6603773832321167, "num_tokens": 12773308.0, "repeat_count": 1.0, "routers_loss": 0.02579287625849247, "skip_count": 1.0, "step": 7922, "text_loss": 0.275301069021225 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025390625, "learning_rate": 0.00016777979903945568, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 12776166.0, "repeat_count": 0.0, "routers_loss": 0.010501758195459843, "skip_count": 1.0, "step": 7924, "text_loss": 0.32124993205070496 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 0.0001675485482450499, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 12779965.0, "repeat_count": 0.0, "routers_loss": 0.0063389060087502, "skip_count": 2.0, "step": 7926, "text_loss": 0.2527695894241333 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00016731742484666774, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 12783019.0, "repeat_count": 0.0, "routers_loss": 0.002796935848891735, "skip_count": 0.0, "step": 7928, "text_loss": 0.18767669796943665 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.23011447020839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0001670864289328759, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 12786291.0, "repeat_count": 0.0, "routers_loss": 0.007973561994731426, "skip_count": 2.0, "step": 7930, "text_loss": 0.29628485441207886 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.23950689756384, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.00016685556059219253, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 12789566.0, "repeat_count": 4.0, "routers_loss": 0.011405733413994312, "skip_count": 6.0, "step": 7932, "text_loss": 0.16635073721408844 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.248899324919286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00016662481991308682, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 12792533.0, "repeat_count": 0.0, "routers_loss": 0.0012368770549073815, "skip_count": 1.0, "step": 7934, "text_loss": 0.4196353852748871 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.000166394206983979, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 12795619.0, "repeat_count": 0.0, "routers_loss": 0.0036002211272716522, "skip_count": 1.0, "step": 7936, "text_loss": 0.17559808492660522 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 37.26768417963017, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.00016616372189324035, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 12799702.0, "repeat_count": 1.0, "routers_loss": 0.0039332108572125435, "skip_count": 0.0, "step": 7938, "text_loss": 0.603410542011261 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.27707660698562, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029296875, "learning_rate": 0.00016593336472919324, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 12802704.0, "repeat_count": 0.0, "routers_loss": 0.0008303318754769862, "skip_count": 0.0, "step": 7940, "text_loss": 0.5331749320030212 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.28646903434106, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.00016570313558011098, "loss": 0.0058, "macro_f1": 0.6601307392120361, "num_tokens": 12805630.0, "repeat_count": 1.0, "routers_loss": 0.05092398822307587, "skip_count": 2.0, "step": 7942, "text_loss": 0.17398510873317719 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.295861461696504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.00016547303453421774, "loss": 0.0031, "macro_f1": 0.3333333432674408, "num_tokens": 12809065.0, "repeat_count": 0.0, "routers_loss": 0.0006886976188980043, "skip_count": 0.0, "step": 7944, "text_loss": 0.3419797718524933 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.305253889051954, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.044677734375, "learning_rate": 0.00016524306167968878, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 12812641.0, "repeat_count": 1.0, "routers_loss": 0.005634502973407507, "skip_count": 3.0, "step": 7946, "text_loss": 0.5877651572227478 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.3146463164074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.00016501321710465005, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 12815527.0, "repeat_count": 0.0, "routers_loss": 0.0020598487462848425, "skip_count": 0.0, "step": 7948, "text_loss": 0.3558528423309326 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0203857421875, "learning_rate": 0.0001647835008971783, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 12819103.0, "repeat_count": 0.0, "routers_loss": 0.005946476943790913, "skip_count": 2.0, "step": 7950, "text_loss": 0.5800213813781738 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02001953125, "learning_rate": 0.00016455391314530154, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 12822423.0, "repeat_count": 0.0, "routers_loss": 0.010360358282923698, "skip_count": 2.0, "step": 7952, "text_loss": 0.278255820274353 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.34282359847373, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.00016432445393699802, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 12826180.0, "repeat_count": 0.0, "routers_loss": 0.003017681185156107, "skip_count": 0.0, "step": 7954, "text_loss": 0.1571389138698578 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.35221602582917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.00016409512336019698, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 12829196.0, "repeat_count": 0.0, "routers_loss": 0.0008854938205331564, "skip_count": 0.0, "step": 7956, "text_loss": 0.2776578366756439 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04541015625, "learning_rate": 0.00016386592150277834, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 12831983.0, "repeat_count": 0.0, "routers_loss": 0.0023990103509277105, "skip_count": 0.0, "step": 7958, "text_loss": 0.46686989068984985 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 37.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0001636368484525727, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 12834889.0, "repeat_count": 0.0, "routers_loss": 0.009835032746195793, "skip_count": 5.0, "step": 7960, "text_loss": 0.22224856913089752 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.00016340790429736118, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 12837950.0, "repeat_count": 0.0, "routers_loss": 0.0018618656322360039, "skip_count": 0.0, "step": 7962, "text_loss": 0.5101882815361023 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 37.38978573525095, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.019287109375, "learning_rate": 0.00016317908912487578, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 12840981.0, "repeat_count": 1.0, "routers_loss": 0.001275144051760435, "skip_count": 1.0, "step": 7964, "text_loss": 0.40567103028297424 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.3991781626064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.00016295040302279873, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 12844044.0, "repeat_count": 0.0, "routers_loss": 0.003117429558187723, "skip_count": 2.0, "step": 7966, "text_loss": 0.6888198852539062 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.40857058996184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.00016272184607876312, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 12847350.0, "repeat_count": 2.0, "routers_loss": 0.006585797294974327, "skip_count": 4.0, "step": 7968, "text_loss": 0.19813506305217743 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 37.41796301731729, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.0001624934183803523, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 12850285.0, "repeat_count": 1.0, "routers_loss": 0.0043576788157224655, "skip_count": 1.0, "step": 7970, "text_loss": 0.6108269691467285 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 37.427355444672735, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0269775390625, "learning_rate": 0.00016226512001510024, "loss": 0.0039, "macro_f1": 0.5492662787437439, "num_tokens": 12853993.0, "repeat_count": 0.0, "routers_loss": 0.011879517696797848, "skip_count": 2.0, "step": 7972, "text_loss": 0.42478689551353455 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.43674787202818, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.00016203695107049117, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 12857022.0, "repeat_count": 0.0, "routers_loss": 0.0016375730047002435, "skip_count": 0.0, "step": 7974, "text_loss": 0.5130020976066589 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0001618089116339601, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 12860764.0, "repeat_count": 0.0, "routers_loss": 0.0006649247952736914, "skip_count": 0.0, "step": 7976, "text_loss": 1.0629136562347412 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.455532726739065, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.00016158100179289208, "loss": 0.0062, "macro_f1": 0.6603773832321167, "num_tokens": 12864066.0, "repeat_count": 1.0, "routers_loss": 0.03140667825937271, "skip_count": 1.0, "step": 7978, "text_loss": 0.4241345226764679 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 37.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.0001613532216346226, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 12867555.0, "repeat_count": 0.0, "routers_loss": 0.010257012210786343, "skip_count": 4.0, "step": 7980, "text_loss": 0.6085613369941711 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.47431758144996, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038818359375, "learning_rate": 0.0001611255712464374, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 12871415.0, "repeat_count": 0.0, "routers_loss": 0.00783725269138813, "skip_count": 1.0, "step": 7982, "text_loss": 0.15661844611167908 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.4837100088054, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.017578125, "learning_rate": 0.00016089805071557256, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 12874195.0, "repeat_count": 1.0, "routers_loss": 0.0027650597039610147, "skip_count": 2.0, "step": 7984, "text_loss": 0.4938865005970001 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.49310243616085, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.00016067066012921439, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 12878084.0, "repeat_count": 1.0, "routers_loss": 0.04647083953022957, "skip_count": 0.0, "step": 7986, "text_loss": 0.2973119020462036 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047119140625, "learning_rate": 0.00016044339957449938, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 12881182.0, "repeat_count": 0.0, "routers_loss": 0.002192265819758177, "skip_count": 0.0, "step": 7988, "text_loss": 0.2623208165168762 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0201416015625, "learning_rate": 0.00016021626913851418, "loss": 0.0031, "macro_f1": 0.3333333432674408, "num_tokens": 12884028.0, "repeat_count": 0.0, "routers_loss": 0.0023096329532563686, "skip_count": 0.0, "step": 7990, "text_loss": 0.3752247989177704 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.52127971822718, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0201416015625, "learning_rate": 0.00015998926890829562, "loss": 0.0046, "macro_f1": 0.3272727429866791, "num_tokens": 12887759.0, "repeat_count": 0.0, "routers_loss": 0.03038526326417923, "skip_count": 1.0, "step": 7992, "text_loss": 0.2609226405620575 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.53067214558262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0001597623989708306, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 12890976.0, "repeat_count": 0.0, "routers_loss": 0.0015199477784335613, "skip_count": 0.0, "step": 7994, "text_loss": 0.6512867212295532 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.54006457293807, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00015953565941305615, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 12894112.0, "repeat_count": 0.0, "routers_loss": 0.0024166766088455915, "skip_count": 0.0, "step": 7996, "text_loss": 0.5539866089820862 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.549457000293515, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0001593090503218591, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 12896857.0, "repeat_count": 1.0, "routers_loss": 0.005081235896795988, "skip_count": 2.0, "step": 7998, "text_loss": 0.6631022691726685 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 37.55884942764896, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.00015908257178407682, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 12900075.0, "repeat_count": 1.0, "routers_loss": 0.0024711282458156347, "skip_count": 0.0, "step": 8000, "text_loss": 0.3309785723686218 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.5682418550044, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.00015885622388649617, "loss": 0.0059, "macro_f1": 0.6601307392120361, "num_tokens": 12903845.0, "repeat_count": 1.0, "routers_loss": 0.04024988412857056, "skip_count": 2.0, "step": 8002, "text_loss": 0.2384071946144104 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.577634282359845, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.045166015625, "learning_rate": 0.00015863000671585405, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 12907694.0, "repeat_count": 1.0, "routers_loss": 0.001953886589035392, "skip_count": 2.0, "step": 8004, "text_loss": 0.5001366138458252 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.58702670971529, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0224609375, "learning_rate": 0.00015840392035883726, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 12910871.0, "repeat_count": 0.0, "routers_loss": 0.002982128644362092, "skip_count": 2.0, "step": 8006, "text_loss": 0.2589346170425415 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.59641913707074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0001581779649020827, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 12914484.0, "repeat_count": 0.0, "routers_loss": 0.0009384988807141781, "skip_count": 0.0, "step": 8008, "text_loss": 0.5727795362472534 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 0.00015795214043217654, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 12917480.0, "repeat_count": 0.0, "routers_loss": 0.008854437619447708, "skip_count": 2.0, "step": 8010, "text_loss": 0.24354904890060425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.61520399178163, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.00015772644703565563, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 12920383.0, "repeat_count": 0.0, "routers_loss": 0.001689503900706768, "skip_count": 0.0, "step": 8012, "text_loss": 0.5372336506843567 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.00015750088479900588, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 12923886.0, "repeat_count": 0.0, "routers_loss": 0.002284591319039464, "skip_count": 0.0, "step": 8014, "text_loss": 0.1708722710609436 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 37.633988846492514, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0201416015625, "learning_rate": 0.00015727545380866316, "loss": 0.0042, "macro_f1": 1.0, "num_tokens": 12926998.0, "repeat_count": 1.0, "routers_loss": 0.004594483878463507, "skip_count": 4.0, "step": 8016, "text_loss": 0.26784324645996094 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 37.64338127384796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0001570501541510131, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 12929726.0, "repeat_count": 1.0, "routers_loss": 0.0021998141892254353, "skip_count": 0.0, "step": 8018, "text_loss": 0.8051869869232178 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.65277370120341, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.00015682498591239086, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 12932182.0, "repeat_count": 0.0, "routers_loss": 0.0032623414881527424, "skip_count": 1.0, "step": 8020, "text_loss": 0.8431181907653809 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.66216612855885, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0283203125, "learning_rate": 0.00015659994917908144, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 12935338.0, "repeat_count": 0.0, "routers_loss": 0.0014909361489117146, "skip_count": 1.0, "step": 8022, "text_loss": 0.6168642640113831 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0001563750440373191, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 12938484.0, "repeat_count": 0.0, "routers_loss": 0.0010295510292053223, "skip_count": 0.0, "step": 8024, "text_loss": 0.2694014608860016 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 37.68095098326974, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.029296875, "learning_rate": 0.00015615027057328828, "loss": 0.0066, "macro_f1": 0.5492662787437439, "num_tokens": 12942045.0, "repeat_count": 0.0, "routers_loss": 0.018341995775699615, "skip_count": 2.0, "step": 8026, "text_loss": 0.8151478171348572 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 37.69034341062518, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0001559256288731224, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 12945547.0, "repeat_count": 2.0, "routers_loss": 0.0023289949167519808, "skip_count": 1.0, "step": 8028, "text_loss": 0.613464891910553 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.699735837980626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0191650390625, "learning_rate": 0.00015570111902290463, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 12949544.0, "repeat_count": 0.0, "routers_loss": 0.006635872647166252, "skip_count": 2.0, "step": 8030, "text_loss": 0.17417465150356293 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 37.70912826533607, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04931640625, "learning_rate": 0.00015547674110866756, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 12952838.0, "repeat_count": 1.0, "routers_loss": 0.006023989990353584, "skip_count": 1.0, "step": 8032, "text_loss": 0.4801837205886841 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.71852069269152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030517578125, "learning_rate": 0.00015525249521639319, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 12956329.0, "repeat_count": 0.0, "routers_loss": 0.005706884432584047, "skip_count": 0.0, "step": 8034, "text_loss": 0.2028084248304367 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.72791312004696, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0242919921875, "learning_rate": 0.000155028381432013, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 12959122.0, "repeat_count": 0.0, "routers_loss": 0.003527123713865876, "skip_count": 2.0, "step": 8036, "text_loss": 0.39474430680274963 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.73730554740241, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0179443359375, "learning_rate": 0.00015480439984140776, "loss": 0.0029, "macro_f1": 1.0, "num_tokens": 12962546.0, "repeat_count": 1.0, "routers_loss": 0.010415437631309032, "skip_count": 2.0, "step": 8038, "text_loss": 0.20412345230579376 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0001545805505304077, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 12965861.0, "repeat_count": 0.0, "routers_loss": 0.001566931139677763, "skip_count": 0.0, "step": 8040, "text_loss": 0.5129821300506592 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 37.756090402113294, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.0001543568335847923, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 12968677.0, "repeat_count": 3.0, "routers_loss": 0.0037196793127804995, "skip_count": 0.0, "step": 8042, "text_loss": 0.755020260810852 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.00015413324909029031, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 12972001.0, "repeat_count": 0.0, "routers_loss": 0.0010940275387838483, "skip_count": 0.0, "step": 8044, "text_loss": 0.48672133684158325 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.77487525682419, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 0.00015390979713257968, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 12974765.0, "repeat_count": 0.0, "routers_loss": 0.011106903664767742, "skip_count": 1.0, "step": 8046, "text_loss": 0.1727766990661621 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.8333333134651184, "avg_layers": 25.0, "epoch": 37.78426768417963, "f1_execute": 0.949999988079071, "f1_repeat": 0.800000011920929, "f1_skip": 0.9090909361839294, "grad_norm": 0.048828125, "learning_rate": 0.00015368647779728757, "loss": 0.006, "macro_f1": 0.886363685131073, "num_tokens": 12979127.0, "repeat_count": 3.0, "routers_loss": 0.05134248360991478, "skip_count": 6.0, "step": 8048, "text_loss": 0.33233317732810974 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 0.00015346329116999057, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 12982812.0, "repeat_count": 0.0, "routers_loss": 0.0027500339783728123, "skip_count": 0.0, "step": 8050, "text_loss": 0.8176849484443665 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.80305253889052, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.00015324023733621412, "loss": 0.005, "macro_f1": 0.32098764181137085, "num_tokens": 12985740.0, "repeat_count": 0.0, "routers_loss": 0.030734945088624954, "skip_count": 2.0, "step": 8052, "text_loss": 0.38721024990081787 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 37.81244496624596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 0.00015301731638143285, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 12988646.0, "repeat_count": 0.0, "routers_loss": 0.002358534839004278, "skip_count": 2.0, "step": 8054, "text_loss": 0.5656245946884155 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.821837393601406, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.060791015625, "learning_rate": 0.0001527945283910705, "loss": 0.0074, "macro_f1": 1.0, "num_tokens": 12991518.0, "repeat_count": 2.0, "routers_loss": 0.007991814985871315, "skip_count": 3.0, "step": 8056, "text_loss": 0.26438817381858826 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 37.83122982095686, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 0.00015257187345049983, "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 12994847.0, "repeat_count": 1.0, "routers_loss": 0.011761264875531197, "skip_count": 1.0, "step": 8058, "text_loss": 0.1801673173904419 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 37.8406222483123, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0184326171875, "learning_rate": 0.0001523493516450427, "loss": 0.004, "macro_f1": 0.8823530077934265, "num_tokens": 12997874.0, "repeat_count": 1.0, "routers_loss": 0.021669765934348106, "skip_count": 2.0, "step": 8060, "text_loss": 0.3278379738330841 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.0001521269630599698, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 13000504.0, "repeat_count": 0.0, "routers_loss": 0.002388916676864028, "skip_count": 0.0, "step": 8062, "text_loss": 0.5396623611450195 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.85940710302319, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00015190470778050086, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 13003620.0, "repeat_count": 0.0, "routers_loss": 0.007719808723777533, "skip_count": 1.0, "step": 8064, "text_loss": 0.1989232450723648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 0.00015168258589180462, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 13007410.0, "repeat_count": 0.0, "routers_loss": 0.0007461659261025488, "skip_count": 0.0, "step": 8066, "text_loss": 0.5293997526168823 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 37.878191957734074, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.00015146059747899848, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 13010240.0, "repeat_count": 1.0, "routers_loss": 0.005515575874596834, "skip_count": 0.0, "step": 8068, "text_loss": 0.2776186466217041 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.887584385089525, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.00015123874262714892, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 13012728.0, "repeat_count": 0.0, "routers_loss": 0.0026730166282504797, "skip_count": 0.0, "step": 8070, "text_loss": 0.5902766585350037 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.89697681244497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04833984375, "learning_rate": 0.00015101702142127088, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 13015616.0, "repeat_count": 0.0, "routers_loss": 0.002244985429570079, "skip_count": 0.0, "step": 8072, "text_loss": 0.21447396278381348 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0191650390625, "learning_rate": 0.00015079543394632878, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 13019846.0, "repeat_count": 0.0, "routers_loss": 0.001963787479326129, "skip_count": 0.0, "step": 8074, "text_loss": 0.22974267601966858 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 37.915761667155856, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.053955078125, "learning_rate": 0.00015057398028723513, "loss": 0.0064, "macro_f1": 0.5492662787437439, "num_tokens": 13023036.0, "repeat_count": 0.0, "routers_loss": 0.02271878905594349, "skip_count": 2.0, "step": 8076, "text_loss": 0.26458361744880676 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0191650390625, "learning_rate": 0.00015035266052885137, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 13025840.0, "repeat_count": 0.0, "routers_loss": 0.0011732397833839059, "skip_count": 0.0, "step": 8078, "text_loss": 0.44129177927970886 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.93454652186674, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0001501314747559877, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 13030031.0, "repeat_count": 1.0, "routers_loss": 0.015655985102057457, "skip_count": 2.0, "step": 8080, "text_loss": 0.28889161348342896 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.943938949222186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.00014991042305340286, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 13033603.0, "repeat_count": 0.0, "routers_loss": 0.0012988687958568335, "skip_count": 0.0, "step": 8082, "text_loss": 0.16362667083740234 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.95333137657764, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00014968950550580434, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 13036931.0, "repeat_count": 0.0, "routers_loss": 0.002425852930173278, "skip_count": 0.0, "step": 8084, "text_loss": 0.35900676250457764 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 37.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0001494687221978482, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 13040637.0, "repeat_count": 0.0, "routers_loss": 0.004092676565051079, "skip_count": 1.0, "step": 8086, "text_loss": 0.20662656426429749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00014924807321413893, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 13043855.0, "repeat_count": 0.0, "routers_loss": 0.0009040542645379901, "skip_count": 0.0, "step": 8088, "text_loss": 0.30341213941574097 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.98150865864397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0001490275586392296, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 13046903.0, "repeat_count": 0.0, "routers_loss": 0.0019248841563239694, "skip_count": 0.0, "step": 8090, "text_loss": 0.4299648702144623 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 37.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.000148807178557622, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 13050219.0, "repeat_count": 0.0, "routers_loss": 0.0008314658771269023, "skip_count": 0.0, "step": 8092, "text_loss": 0.4521652162075043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00014858693305376598, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 13053076.0, "repeat_count": 0.0, "routers_loss": 0.0007470731507055461, "skip_count": 0.0, "step": 8094, "text_loss": 0.46265852451324463 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 38.00939242735544, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.00014836682221206, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 13056170.0, "repeat_count": 1.0, "routers_loss": 0.003292408073320985, "skip_count": 0.0, "step": 8096, "text_loss": 0.6483868956565857 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.00014814684611685124, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 13059181.0, "repeat_count": 0.0, "routers_loss": 0.001357200788334012, "skip_count": 0.0, "step": 8098, "text_loss": 0.43141183257102966 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.02817728206633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0177001953125, "learning_rate": 0.00014792700485243476, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 13062124.0, "repeat_count": 0.0, "routers_loss": 0.0030062920413911343, "skip_count": 0.0, "step": 8100, "text_loss": 0.26022693514823914 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0001477072985030542, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 13065273.0, "repeat_count": 0.0, "routers_loss": 0.0006919128354638815, "skip_count": 0.0, "step": 8102, "text_loss": 0.5927232503890991 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 0.00014748772715290144, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 13068346.0, "repeat_count": 0.0, "routers_loss": 0.005062389187514782, "skip_count": 0.0, "step": 8104, "text_loss": 0.1255214959383011 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0213623046875, "learning_rate": 0.00014726829088611664, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 13071384.0, "repeat_count": 0.0, "routers_loss": 0.0005492564523592591, "skip_count": 0.0, "step": 8106, "text_loss": 0.6445038914680481 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.06574699148811, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0205078125, "learning_rate": 0.00014704898978678817, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 13074667.0, "repeat_count": 0.0, "routers_loss": 0.002470226027071476, "skip_count": 0.0, "step": 8108, "text_loss": 0.5019628405570984 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.075139418843555, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.00014682982393895256, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 13077566.0, "repeat_count": 0.0, "routers_loss": 0.0008262090268544853, "skip_count": 0.0, "step": 8110, "text_loss": 0.6075460314750671 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.084531846199, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0184326171875, "learning_rate": 0.00014661079342659467, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 13081042.0, "repeat_count": 0.0, "routers_loss": 0.00034181721275672317, "skip_count": 0.0, "step": 8112, "text_loss": 0.7349393963813782 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.09392427355445, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02392578125, "learning_rate": 0.0001463918983336474, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 13084151.0, "repeat_count": 1.0, "routers_loss": 0.01406828872859478, "skip_count": 2.0, "step": 8114, "text_loss": 0.3122454285621643 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.017822265625, "learning_rate": 0.00014617313874399173, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 13086998.0, "repeat_count": 0.0, "routers_loss": 0.002714085392653942, "skip_count": 0.0, "step": 8116, "text_loss": 0.6545852422714233 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.11270912826534, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 0.00014595451474145677, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 13090017.0, "repeat_count": 0.0, "routers_loss": 0.0073202489875257015, "skip_count": 0.0, "step": 8118, "text_loss": 0.5487201809883118 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 0.00014573602640981947, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 13093651.0, "repeat_count": 0.0, "routers_loss": 0.000667977670673281, "skip_count": 0.0, "step": 8120, "text_loss": 0.672166109085083 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0206298828125, "learning_rate": 0.00014551767383280535, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 13097139.0, "repeat_count": 0.0, "routers_loss": 0.0020584615413099527, "skip_count": 0.0, "step": 8122, "text_loss": 0.1996239423751831 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 38.14088641033167, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0517578125, "learning_rate": 0.00014529945709408726, "loss": 0.0069, "macro_f1": 0.6598639488220215, "num_tokens": 13100493.0, "repeat_count": 1.0, "routers_loss": 0.013855135068297386, "skip_count": 3.0, "step": 8124, "text_loss": 0.4099486768245697 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.15027883768712, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0001450813762772863, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 13103488.0, "repeat_count": 0.0, "routers_loss": 0.0014984552981331944, "skip_count": 0.0, "step": 8126, "text_loss": 0.6307108402252197 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 38.15967126504256, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.00014486343146597152, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 13106445.0, "repeat_count": 1.0, "routers_loss": 0.00430954247713089, "skip_count": 0.0, "step": 8128, "text_loss": 0.6226127743721008 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07177734375, "learning_rate": 0.00014464562274365972, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 13109258.0, "repeat_count": 0.0, "routers_loss": 0.003711461555212736, "skip_count": 1.0, "step": 8130, "text_loss": 0.17819052934646606 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.17845611975345, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.00014442795019381567, "loss": 0.0064, "macro_f1": 0.6603773832321167, "num_tokens": 13114206.0, "repeat_count": 1.0, "routers_loss": 0.015719098970294, "skip_count": 1.0, "step": 8132, "text_loss": 0.28450697660446167 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.18784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.00014421041389985184, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 13117351.0, "repeat_count": 0.0, "routers_loss": 0.0013113922905176878, "skip_count": 0.0, "step": 8134, "text_loss": 0.310830682516098 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 38.197240974464336, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0289306640625, "learning_rate": 0.00014399301394512858, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 13120228.0, "repeat_count": 1.0, "routers_loss": 0.001965439412742853, "skip_count": 1.0, "step": 8136, "text_loss": 0.8635116815567017 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.20663340181978, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.00014377575041295393, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 13123380.0, "repeat_count": 1.0, "routers_loss": 0.004898902028799057, "skip_count": 2.0, "step": 8138, "text_loss": 0.5302467346191406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.21602582917523, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0179443359375, "learning_rate": 0.0001435586233865836, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 13126875.0, "repeat_count": 0.0, "routers_loss": 0.00031845085322856903, "skip_count": 0.0, "step": 8140, "text_loss": 0.5913560390472412 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 38.22541825653067, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0001433416329492213, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 13129563.0, "repeat_count": 1.0, "routers_loss": 0.00298812473192811, "skip_count": 1.0, "step": 8142, "text_loss": 0.5153398513793945 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.23481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00014312477918401807, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 13132608.0, "repeat_count": 0.0, "routers_loss": 0.0026608197949826717, "skip_count": 1.0, "step": 8144, "text_loss": 0.4554155766963959 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 38.24420311124156, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.00014290806217407272, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 13136204.0, "repeat_count": 1.0, "routers_loss": 0.0027651884593069553, "skip_count": 1.0, "step": 8146, "text_loss": 0.6349515318870544 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.253595538597004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 0.00014269148200243148, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 13138895.0, "repeat_count": 0.0, "routers_loss": 0.0006579195614904165, "skip_count": 0.0, "step": 8148, "text_loss": 0.4629364013671875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.26298796595245, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.00014247503875208846, "loss": 0.0059, "macro_f1": 0.3272727429866791, "num_tokens": 13142500.0, "repeat_count": 1.0, "routers_loss": 0.023065708577632904, "skip_count": 0.0, "step": 8150, "text_loss": 0.4962928593158722 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.2723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.00014225873250598496, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 13146203.0, "repeat_count": 0.0, "routers_loss": 0.007397830951958895, "skip_count": 1.0, "step": 8152, "text_loss": 0.3225953280925751 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00014204256334700988, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 13149517.0, "repeat_count": 0.0, "routers_loss": 0.004839105997234583, "skip_count": 1.0, "step": 8154, "text_loss": 0.18435558676719666 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 38.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.00014182653135799995, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 13152643.0, "repeat_count": 0.0, "routers_loss": 0.0028303388971835375, "skip_count": 4.0, "step": 8156, "text_loss": 0.5836900472640991 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.0001416106366217389, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 13155213.0, "repeat_count": 0.0, "routers_loss": 0.0004012314020656049, "skip_count": 0.0, "step": 8158, "text_loss": 0.3723861575126648 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 20.0, "epoch": 38.30995010272967, "f1_execute": 0.9714285731315613, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0001413948792209579, "loss": 0.0065, "macro_f1": 0.8793651461601257, "num_tokens": 13158440.0, "repeat_count": 2.0, "routers_loss": 0.04377155378460884, "skip_count": 9.0, "step": 8160, "text_loss": 0.32476910948753357 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.319342530085116, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0001411792592383357, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 13162651.0, "repeat_count": 0.0, "routers_loss": 0.0011163362069055438, "skip_count": 0.0, "step": 8162, "text_loss": 0.4890389144420624 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.32873495744057, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02392578125, "learning_rate": 0.00014096377675649823, "loss": 0.0055, "macro_f1": 0.6603773832321167, "num_tokens": 13165406.0, "repeat_count": 1.0, "routers_loss": 0.012117774225771427, "skip_count": 1.0, "step": 8164, "text_loss": 0.7763246893882751 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 38.33812738479601, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.00014074843185801883, "loss": 0.004, "macro_f1": 0.9262410998344421, "num_tokens": 13168402.0, "repeat_count": 3.0, "routers_loss": 0.009951545856893063, "skip_count": 2.0, "step": 8166, "text_loss": 0.5038266777992249 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 38.347519812151454, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.022705078125, "learning_rate": 0.00014053322462541802, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 13171423.0, "repeat_count": 1.0, "routers_loss": 0.0021372761111706495, "skip_count": 1.0, "step": 8168, "text_loss": 0.5634724497795105 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.3569122395069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.00014031815514116354, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 13174713.0, "repeat_count": 0.0, "routers_loss": 0.0007417177548632026, "skip_count": 0.0, "step": 8170, "text_loss": 0.4009707272052765 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 38.36630466686234, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.035888671875, "learning_rate": 0.00014010322348767057, "loss": 0.0077, "macro_f1": 0.5934640765190125, "num_tokens": 13178012.0, "repeat_count": 0.0, "routers_loss": 0.01619168184697628, "skip_count": 3.0, "step": 8172, "text_loss": 0.29182371497154236 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.375697094217784, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0291748046875, "learning_rate": 0.00013988842974730137, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 13181096.0, "repeat_count": 0.0, "routers_loss": 0.0037969043478369713, "skip_count": 0.0, "step": 8174, "text_loss": 0.275851845741272 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.385089521573235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.00013967377400236515, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 13184116.0, "repeat_count": 0.0, "routers_loss": 0.0007759644067846239, "skip_count": 0.0, "step": 8176, "text_loss": 0.7569663524627686 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.39448194892868, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02001953125, "learning_rate": 0.00013945925633511848, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 13187319.0, "repeat_count": 0.0, "routers_loss": 0.002708743792027235, "skip_count": 0.0, "step": 8178, "text_loss": 0.4733831286430359 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.00013924487682776492, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 13190796.0, "repeat_count": 0.0, "routers_loss": 0.0005060714902356267, "skip_count": 0.0, "step": 8180, "text_loss": 0.5663171410560608 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.413266803639566, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0001390306355624551, "loss": 0.0049, "macro_f1": 0.3272727429866791, "num_tokens": 13193705.0, "repeat_count": 0.0, "routers_loss": 0.02932601235806942, "skip_count": 1.0, "step": 8182, "text_loss": 0.30700045824050903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0001388165326212867, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 13196393.0, "repeat_count": 0.0, "routers_loss": 0.0011637522839009762, "skip_count": 0.0, "step": 8184, "text_loss": 0.6897354125976562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.43205165835045, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.00013860256808630427, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 13199526.0, "repeat_count": 0.0, "routers_loss": 0.0017184355529025197, "skip_count": 0.0, "step": 8186, "text_loss": 0.6246579885482788 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.441444085705896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.00013838874203949954, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 13202963.0, "repeat_count": 0.0, "routers_loss": 0.0026622721925377846, "skip_count": 0.0, "step": 8188, "text_loss": 0.506066083908081 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.45083651306135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 0.00013817505456281099, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 13207408.0, "repeat_count": 0.0, "routers_loss": 0.000543750764336437, "skip_count": 0.0, "step": 8190, "text_loss": 0.5192428231239319 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.0001379615057381241, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 13211073.0, "repeat_count": 0.0, "routers_loss": 0.0010060713393613696, "skip_count": 0.0, "step": 8192, "text_loss": 0.5640166401863098 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 38.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.00013774809564727104, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 13214203.0, "repeat_count": 0.0, "routers_loss": 0.005152868572622538, "skip_count": 2.0, "step": 8194, "text_loss": 0.8643819689750671 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 38.47901379512768, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022216796875, "learning_rate": 0.0001375348243720312, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 13217748.0, "repeat_count": 0.0, "routers_loss": 0.0017722113989293575, "skip_count": 2.0, "step": 8196, "text_loss": 0.40500834584236145 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 38.48840622248312, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0001373216919941304, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 13221341.0, "repeat_count": 1.0, "routers_loss": 0.00999271310865879, "skip_count": 3.0, "step": 8198, "text_loss": 0.2317391037940979 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.497798649838565, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 0.00013710869859524143, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 13224288.0, "repeat_count": 0.0, "routers_loss": 0.0016836341237649322, "skip_count": 0.0, "step": 8200, "text_loss": 0.31873467564582825 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.507191077194015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.00013689584425698376, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 13227342.0, "repeat_count": 0.0, "routers_loss": 0.002255793660879135, "skip_count": 0.0, "step": 8202, "text_loss": 0.13513202965259552 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 38.51658350454946, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0001366831290609235, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 13230912.0, "repeat_count": 1.0, "routers_loss": 0.0062925987876951694, "skip_count": 4.0, "step": 8204, "text_loss": 0.3692396581172943 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 38.5259759319049, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.029296875, "learning_rate": 0.00013647055308857353, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 13233961.0, "repeat_count": 1.0, "routers_loss": 0.0020471401512622833, "skip_count": 0.0, "step": 8206, "text_loss": 0.5655510425567627 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 0.0001362581164213934, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 13237170.0, "repeat_count": 0.0, "routers_loss": 0.0009666495025157928, "skip_count": 0.0, "step": 8208, "text_loss": 0.720582902431488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.00013604581914078922, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 13241020.0, "repeat_count": 0.0, "routers_loss": 0.0006306356517598033, "skip_count": 0.0, "step": 8210, "text_loss": 0.5686481595039368 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 38.55415321397123, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.00013583366132811374, "loss": 0.0058, "macro_f1": 0.5492662787437439, "num_tokens": 13244491.0, "repeat_count": 2.0, "routers_loss": 0.016230134293437004, "skip_count": 0.0, "step": 8212, "text_loss": 0.55678790807724 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 38.563545641326684, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 0.00013562164306466624, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 13247551.0, "repeat_count": 0.0, "routers_loss": 0.003904943587258458, "skip_count": 2.0, "step": 8214, "text_loss": 0.6521575450897217 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.57293806868213, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0220947265625, "learning_rate": 0.00013540976443169244, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 13250863.0, "repeat_count": 0.0, "routers_loss": 0.002239734400063753, "skip_count": 1.0, "step": 8216, "text_loss": 0.29757481813430786 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 38.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.00013519802551038452, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 13254215.0, "repeat_count": 0.0, "routers_loss": 0.004978829529136419, "skip_count": 2.0, "step": 8218, "text_loss": 0.30598193407058716 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.00013498642638188157, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 13257269.0, "repeat_count": 0.0, "routers_loss": 0.0040260558016598225, "skip_count": 0.0, "step": 8220, "text_loss": 0.39327144622802734 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.60111535074846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 0.00013477496712726862, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 13260573.0, "repeat_count": 0.0, "routers_loss": 0.002124674618244171, "skip_count": 0.0, "step": 8222, "text_loss": 0.38342708349227905 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.6105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00013456364782757718, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 13263684.0, "repeat_count": 0.0, "routers_loss": 0.00087209593039006, "skip_count": 0.0, "step": 8224, "text_loss": 0.6338301301002502 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 38.619900205459345, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.00013435246856378526, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 13266879.0, "repeat_count": 1.0, "routers_loss": 0.003183641703799367, "skip_count": 0.0, "step": 8226, "text_loss": 0.6073583364486694 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.629292632814796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0169677734375, "learning_rate": 0.00013414142941681718, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 13270679.0, "repeat_count": 0.0, "routers_loss": 0.001859338372014463, "skip_count": 0.0, "step": 8228, "text_loss": 0.5427029132843018 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0001339305304675435, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 13273275.0, "repeat_count": 0.0, "routers_loss": 0.000655558833386749, "skip_count": 0.0, "step": 8230, "text_loss": 0.29442915320396423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.64807748752568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0250244140625, "learning_rate": 0.00013371977179678113, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 13276205.0, "repeat_count": 0.0, "routers_loss": 0.0011499621905386448, "skip_count": 0.0, "step": 8232, "text_loss": 0.5601125359535217 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.00013350915348529313, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 13279242.0, "repeat_count": 0.0, "routers_loss": 0.0019823790062218904, "skip_count": 0.0, "step": 8234, "text_loss": 0.43674135208129883 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 38.66686234223657, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04248046875, "learning_rate": 0.00013329867561378888, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 13282531.0, "repeat_count": 0.0, "routers_loss": 0.005772443953901529, "skip_count": 3.0, "step": 8236, "text_loss": 0.4838809072971344 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 38.67625476959201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.00013308833826292395, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 13286219.0, "repeat_count": 0.0, "routers_loss": 0.0038314659614115953, "skip_count": 2.0, "step": 8238, "text_loss": 0.5002569556236267 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 38.685647196947464, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.031005859375, "learning_rate": 0.00013287814151329987, "loss": 0.0075, "macro_f1": 0.9452888369560242, "num_tokens": 13290348.0, "repeat_count": 1.0, "routers_loss": 0.04819172993302345, "skip_count": 4.0, "step": 8240, "text_loss": 0.3099883198738098 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 38.69503962430291, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020263671875, "learning_rate": 0.00013266808544546438, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 13293644.0, "repeat_count": 0.0, "routers_loss": 0.010334883816540241, "skip_count": 2.0, "step": 8242, "text_loss": 0.17672912776470184 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.00013245817013991164, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 13296721.0, "repeat_count": 0.0, "routers_loss": 0.00162201386410743, "skip_count": 0.0, "step": 8244, "text_loss": 0.7664286494255066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.00013224839567708142, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 13299704.0, "repeat_count": 0.0, "routers_loss": 0.0039452011696994305, "skip_count": 0.0, "step": 8246, "text_loss": 0.1827820986509323 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 38.72321690636924, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.00013203876213735972, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 13302553.0, "repeat_count": 1.0, "routers_loss": 0.006701917387545109, "skip_count": 7.0, "step": 8248, "text_loss": 0.6020278930664062 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 38.73260933372468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.0001318292696010785, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 13305875.0, "repeat_count": 0.0, "routers_loss": 0.00968079548329115, "skip_count": 2.0, "step": 8250, "text_loss": 0.2693248987197876 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 38.74200176108013, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.025390625, "learning_rate": 0.00013161991814851571, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 13309115.0, "repeat_count": 2.0, "routers_loss": 0.008890608325600624, "skip_count": 2.0, "step": 8252, "text_loss": 0.6325297355651855 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 38.751394188435576, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 0.00013141070785989517, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 13312219.0, "repeat_count": 1.0, "routers_loss": 0.00825794693082571, "skip_count": 4.0, "step": 8254, "text_loss": 0.284396767616272 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.00013120163881538677, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 13315214.0, "repeat_count": 0.0, "routers_loss": 0.003378969384357333, "skip_count": 1.0, "step": 8256, "text_loss": 0.20296992361545563 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.77017904314646, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.00013099271109510603, "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 13319117.0, "repeat_count": 1.0, "routers_loss": 0.0164186954498291, "skip_count": 0.0, "step": 8258, "text_loss": 0.21940068900585175 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 38.77957147050191, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0001307839247791145, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 13321631.0, "repeat_count": 0.0, "routers_loss": 0.0053979759104549885, "skip_count": 3.0, "step": 8260, "text_loss": 0.19442199170589447 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.78896389785735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0223388671875, "learning_rate": 0.00013057527994741946, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 13324759.0, "repeat_count": 0.0, "routers_loss": 0.0024567479267716408, "skip_count": 0.0, "step": 8262, "text_loss": 0.5528824925422668 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.798356325212794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0001303667766799741, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 13327554.0, "repeat_count": 0.0, "routers_loss": 0.002819873159751296, "skip_count": 1.0, "step": 8264, "text_loss": 0.4418395757675171 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.807748752568244, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 0.00013015841505667703, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 13331838.0, "repeat_count": 0.0, "routers_loss": 0.0030280952341854572, "skip_count": 1.0, "step": 8266, "text_loss": 0.5263079404830933 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 38.81714117992369, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0185546875, "learning_rate": 0.0001299501951573731, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 13334968.0, "repeat_count": 0.0, "routers_loss": 0.001774887670762837, "skip_count": 4.0, "step": 8268, "text_loss": 0.47985130548477173 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00012974211706185247, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 13338052.0, "repeat_count": 0.0, "routers_loss": 0.007027842104434967, "skip_count": 1.0, "step": 8270, "text_loss": 0.6588287949562073 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0206298828125, "learning_rate": 0.00012953418084985107, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 13341653.0, "repeat_count": 0.0, "routers_loss": 0.0026854060124605894, "skip_count": 1.0, "step": 8272, "text_loss": 0.43156498670578003 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.00012932638660105038, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 13345173.0, "repeat_count": 0.0, "routers_loss": 0.0033325920812785625, "skip_count": 0.0, "step": 8274, "text_loss": 0.1679086685180664 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.85471088934546, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0291748046875, "learning_rate": 0.00012911873439507766, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 13348635.0, "repeat_count": 0.0, "routers_loss": 0.0016183287370949984, "skip_count": 0.0, "step": 8276, "text_loss": 0.5907418131828308 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.00012891122431150549, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 13351120.0, "repeat_count": 0.0, "routers_loss": 0.0049970983527600765, "skip_count": 1.0, "step": 8278, "text_loss": 0.5437678694725037 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048095703125, "learning_rate": 0.00012870385642985222, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 13353774.0, "repeat_count": 0.0, "routers_loss": 0.0027123154141008854, "skip_count": 0.0, "step": 8280, "text_loss": 0.5742796659469604 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.00012849663082958158, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 13358236.0, "repeat_count": 0.0, "routers_loss": 0.0062842960469424725, "skip_count": 0.0, "step": 8282, "text_loss": 0.2340863049030304 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.89228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.00012828954759010265, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 13360994.0, "repeat_count": 0.0, "routers_loss": 0.0006564505747519433, "skip_count": 0.0, "step": 8284, "text_loss": 0.45432794094085693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.90167302612269, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0001280826067907705, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 13363665.0, "repeat_count": 0.0, "routers_loss": 0.001298630959354341, "skip_count": 0.0, "step": 8286, "text_loss": 0.7439755201339722 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.91106545347813, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.00012787580851088493, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 13367412.0, "repeat_count": 0.0, "routers_loss": 0.00464112963527441, "skip_count": 0.0, "step": 8288, "text_loss": 0.2854461669921875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.92045788083358, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0001276691528296916, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 13370745.0, "repeat_count": 0.0, "routers_loss": 0.0006090773968026042, "skip_count": 0.0, "step": 8290, "text_loss": 0.6663011312484741 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.929850308189025, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.00012746263982638123, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 13373396.0, "repeat_count": 0.0, "routers_loss": 0.0038922233507037163, "skip_count": 0.0, "step": 8292, "text_loss": 0.3858443796634674 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.93924273554447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.00012725626958009007, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 13376172.0, "repeat_count": 0.0, "routers_loss": 0.0016941255889832973, "skip_count": 0.0, "step": 8294, "text_loss": 0.4758119285106659 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 38.94863516289991, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02001953125, "learning_rate": 0.0001270500421698994, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 13379002.0, "repeat_count": 1.0, "routers_loss": 0.001703770598396659, "skip_count": 0.0, "step": 8296, "text_loss": 0.7464606165885925 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 38.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.00012684395767483626, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 13382221.0, "repeat_count": 0.0, "routers_loss": 0.001474690856412053, "skip_count": 1.0, "step": 8298, "text_loss": 0.37309199571609497 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 38.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.00012663801617387245, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 13385276.0, "repeat_count": 0.0, "routers_loss": 0.004561704583466053, "skip_count": 3.0, "step": 8300, "text_loss": 0.43284836411476135 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 38.97681244496625, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02001953125, "learning_rate": 0.00012643221774592518, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 13388321.0, "repeat_count": 2.0, "routers_loss": 0.005136100109666586, "skip_count": 1.0, "step": 8302, "text_loss": 0.669730007648468 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 38.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.00012622656246985675, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 13391222.0, "repeat_count": 0.0, "routers_loss": 0.0028521555941551924, "skip_count": 0.0, "step": 8304, "text_loss": 0.16773155331611633 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 38.99559729967714, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.00012602105042447471, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 13395297.0, "repeat_count": 0.0, "routers_loss": 0.0033424890134483576, "skip_count": 2.0, "step": 8306, "text_loss": 0.1650846153497696 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.004696213677725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0001258156816885316, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 13398482.0, "repeat_count": 0.0, "routers_loss": 0.0012481207959353924, "skip_count": 0.0, "step": 8308, "text_loss": 0.37225499749183655 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 39.01408864103317, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0302734375, "learning_rate": 0.00012561045634072515, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 13402199.0, "repeat_count": 0.0, "routers_loss": 0.006243644282221794, "skip_count": 3.0, "step": 8310, "text_loss": 0.16000206768512726 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.02348106838861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00012540537445969807, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 13404950.0, "repeat_count": 0.0, "routers_loss": 0.004267443902790546, "skip_count": 2.0, "step": 8312, "text_loss": 0.400174081325531 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.00012520043612403815, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 13407883.0, "repeat_count": 0.0, "routers_loss": 0.005013707559555769, "skip_count": 2.0, "step": 8314, "text_loss": 0.1331731230020523 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 39.0422659230995, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.00012499564141227798, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 13410563.0, "repeat_count": 1.0, "routers_loss": 0.00463570561259985, "skip_count": 0.0, "step": 8316, "text_loss": 0.5098661184310913 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 39.05165835045494, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.052978515625, "learning_rate": 0.0001247909904028956, "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 13413730.0, "repeat_count": 1.0, "routers_loss": 0.007066591177135706, "skip_count": 1.0, "step": 8318, "text_loss": 0.8059925436973572 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 39.061050777810394, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00012458648317431348, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 13416425.0, "repeat_count": 0.0, "routers_loss": 0.004210594110190868, "skip_count": 3.0, "step": 8320, "text_loss": 0.6559522151947021 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.07044320516584, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0001243821198048992, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 13419851.0, "repeat_count": 1.0, "routers_loss": 0.005613257177174091, "skip_count": 2.0, "step": 8322, "text_loss": 0.2783811688423157 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.00012417790037296523, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 13422588.0, "repeat_count": 0.0, "routers_loss": 0.00233642989769578, "skip_count": 1.0, "step": 8324, "text_loss": 0.7659147381782532 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.00012397382495676874, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 13425275.0, "repeat_count": 0.0, "routers_loss": 0.0013295465614646673, "skip_count": 0.0, "step": 8326, "text_loss": 0.5693745017051697 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 39.09862048723217, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0308837890625, "learning_rate": 0.0001237698936345119, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 13428314.0, "repeat_count": 1.0, "routers_loss": 0.005712272133678198, "skip_count": 1.0, "step": 8328, "text_loss": 0.8581340909004211 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.10801291458761, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.00012356610648434153, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 13431453.0, "repeat_count": 0.0, "routers_loss": 0.0015835616504773498, "skip_count": 0.0, "step": 8330, "text_loss": 0.1395341008901596 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.117405341943055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.00012336246358434928, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 13434566.0, "repeat_count": 0.0, "routers_loss": 0.0012973316479474306, "skip_count": 0.0, "step": 8332, "text_loss": 0.7125005125999451 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.126797769298506, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.00012315896501257145, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 13438056.0, "repeat_count": 0.0, "routers_loss": 0.0005822008824907243, "skip_count": 0.0, "step": 8334, "text_loss": 0.7730510234832764 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0216064453125, "learning_rate": 0.00012295561084698915, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 13441390.0, "repeat_count": 0.0, "routers_loss": 0.00547185679897666, "skip_count": 1.0, "step": 8336, "text_loss": 0.3927873373031616 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.14558262400939, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.000122752401165528, "loss": 0.0022, "macro_f1": 0.3333333432674408, "num_tokens": 13443864.0, "repeat_count": 0.0, "routers_loss": 0.0011191967641934752, "skip_count": 0.0, "step": 8338, "text_loss": 0.3996548354625702 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.00012254933604605828, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 13447070.0, "repeat_count": 0.0, "routers_loss": 0.0005196621641516685, "skip_count": 0.0, "step": 8340, "text_loss": 0.5597847104072571 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.16436747872028, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0220947265625, "learning_rate": 0.00012234641556639508, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 13450522.0, "repeat_count": 0.0, "routers_loss": 0.003857341594994068, "skip_count": 2.0, "step": 8342, "text_loss": 0.14400488138198853 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.17375990607572, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.00012214363980429793, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 13453578.0, "repeat_count": 1.0, "routers_loss": 0.006664265412837267, "skip_count": 3.0, "step": 8344, "text_loss": 0.27675092220306396 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.183152333431174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0595703125, "learning_rate": 0.00012194100883747078, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 13456480.0, "repeat_count": 0.0, "routers_loss": 0.003549816319718957, "skip_count": 0.0, "step": 8346, "text_loss": 0.21776801347732544 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.19254476078662, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.00012173852274356217, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 13459859.0, "repeat_count": 1.0, "routers_loss": 0.00446992926299572, "skip_count": 3.0, "step": 8348, "text_loss": 0.1828736811876297 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021728515625, "learning_rate": 0.00012153618160016527, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 13463104.0, "repeat_count": 0.0, "routers_loss": 0.0024826989974826574, "skip_count": 1.0, "step": 8350, "text_loss": 0.15649555623531342 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 0.0001213339854848175, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 13467051.0, "repeat_count": 0.0, "routers_loss": 0.0021385846193879843, "skip_count": 1.0, "step": 8352, "text_loss": 0.49281737208366394 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0230712890625, "learning_rate": 0.00012113193447500081, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 13470411.0, "repeat_count": 0.0, "routers_loss": 0.0014382716035470366, "skip_count": 1.0, "step": 8354, "text_loss": 0.5984349846839905 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.23011447020839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021240234375, "learning_rate": 0.00012093002864814151, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 13474666.0, "repeat_count": 0.0, "routers_loss": 0.008536498062312603, "skip_count": 1.0, "step": 8356, "text_loss": 0.2851131856441498 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.00012072826808161036, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 13477754.0, "repeat_count": 0.0, "routers_loss": 0.0027286717668175697, "skip_count": 0.0, "step": 8358, "text_loss": 0.5987376570701599 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.248899324919286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.0001205266528527223, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 13481151.0, "repeat_count": 0.0, "routers_loss": 0.002780565759167075, "skip_count": 1.0, "step": 8360, "text_loss": 0.1847199648618698 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00012032518303873674, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 13484050.0, "repeat_count": 0.0, "routers_loss": 0.0006186611135490239, "skip_count": 0.0, "step": 8362, "text_loss": 0.6229772567749023 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 39.26768417963017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.019287109375, "learning_rate": 0.00012012385871685716, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 13488551.0, "repeat_count": 0.0, "routers_loss": 0.00956071075052023, "skip_count": 5.0, "step": 8364, "text_loss": 0.2810790538787842 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.27707660698562, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.00011992267996423162, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 13491420.0, "repeat_count": 0.0, "routers_loss": 0.008410792797803879, "skip_count": 2.0, "step": 8366, "text_loss": 0.20509617030620575 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.28646903434106, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0283203125, "learning_rate": 0.00011972164685795212, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 13494736.0, "repeat_count": 0.0, "routers_loss": 0.00762166129425168, "skip_count": 1.0, "step": 8368, "text_loss": 0.24739402532577515 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.295861461696504, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.00011952075947505486, "loss": 0.0051, "macro_f1": 0.3272727429866791, "num_tokens": 13498363.0, "repeat_count": 0.0, "routers_loss": 0.010674391873180866, "skip_count": 1.0, "step": 8370, "text_loss": 0.31931644678115845 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 39.305253889051954, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0001193200178925204, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 13501029.0, "repeat_count": 2.0, "routers_loss": 0.0041843741200864315, "skip_count": 1.0, "step": 8372, "text_loss": 0.5103049278259277 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.3146463164074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00011911942218727312, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 13503854.0, "repeat_count": 0.0, "routers_loss": 0.0006344785797409713, "skip_count": 0.0, "step": 8374, "text_loss": 0.4914432764053345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.00011891897243618183, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 13508316.0, "repeat_count": 0.0, "routers_loss": 0.0003527739318087697, "skip_count": 0.0, "step": 8376, "text_loss": 0.5317551493644714 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.00011871866871605913, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 13512603.0, "repeat_count": 0.0, "routers_loss": 0.001071247854270041, "skip_count": 0.0, "step": 8378, "text_loss": 0.6693558096885681 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.34282359847373, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0294189453125, "learning_rate": 0.00011851851110366185, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 13515928.0, "repeat_count": 0.0, "routers_loss": 0.000924977008253336, "skip_count": 1.0, "step": 8380, "text_loss": 0.8004939556121826 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.35221602582917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0191650390625, "learning_rate": 0.0001183184996756908, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 13518548.0, "repeat_count": 0.0, "routers_loss": 0.0017637151759117842, "skip_count": 0.0, "step": 8382, "text_loss": 0.5012105107307434 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 39.36160845318462, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0201416015625, "learning_rate": 0.00011811863450879063, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 13522155.0, "repeat_count": 2.0, "routers_loss": 0.0011129514314234257, "skip_count": 0.0, "step": 8384, "text_loss": 0.3866073489189148 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 39.371000880540066, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.00011791891567955009, "loss": 0.0046, "macro_f1": 0.8814815282821655, "num_tokens": 13525352.0, "repeat_count": 2.0, "routers_loss": 0.042801812291145325, "skip_count": 4.0, "step": 8386, "text_loss": 0.18817944824695587 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018798828125, "learning_rate": 0.00011771934326450173, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 13528537.0, "repeat_count": 0.0, "routers_loss": 0.0006869474309496582, "skip_count": 0.0, "step": 8388, "text_loss": 0.6407818794250488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.38978573525095, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.00011751991734012229, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 13531650.0, "repeat_count": 0.0, "routers_loss": 0.0008001072565093637, "skip_count": 0.0, "step": 8390, "text_loss": 0.5149344205856323 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.3991781626064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.00011732063798283204, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 13535071.0, "repeat_count": 0.0, "routers_loss": 0.0006921148742549121, "skip_count": 0.0, "step": 8392, "text_loss": 0.5906356573104858 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.40857058996184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.00011712150526899523, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 13537741.0, "repeat_count": 0.0, "routers_loss": 0.005221226718276739, "skip_count": 2.0, "step": 8394, "text_loss": 0.3381146192550659 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 39.41796301731729, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.00011692251927491987, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 13541189.0, "repeat_count": 1.0, "routers_loss": 0.0023983579594641924, "skip_count": 1.0, "step": 8396, "text_loss": 0.7345486283302307 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.427355444672735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.00011672368007685774, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 13545210.0, "repeat_count": 1.0, "routers_loss": 0.005362956319004297, "skip_count": 2.0, "step": 8398, "text_loss": 0.6522865295410156 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.43674787202818, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.00011652498775100445, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 13548260.0, "repeat_count": 0.0, "routers_loss": 0.002955642296001315, "skip_count": 0.0, "step": 8400, "text_loss": 0.3200102150440216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.00011632644237349927, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 13551519.0, "repeat_count": 0.0, "routers_loss": 0.001079231034964323, "skip_count": 0.0, "step": 8402, "text_loss": 0.7251807451248169 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 39.455532726739065, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 0.00011612804402042509, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 13555241.0, "repeat_count": 1.0, "routers_loss": 0.013860360719263554, "skip_count": 0.0, "step": 8404, "text_loss": 0.159539595246315 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 25.0, "epoch": 39.46492515409451, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, "grad_norm": 0.054931640625, "learning_rate": 0.00011592979276780857, "loss": 0.0055, "macro_f1": 0.9555556178092957, "num_tokens": 13558389.0, "repeat_count": 1.0, "routers_loss": 0.017025530338287354, "skip_count": 5.0, "step": 8406, "text_loss": 0.5154430270195007 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.47431758144996, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.00011573168869162004, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 13561237.0, "repeat_count": 1.0, "routers_loss": 0.007349071092903614, "skip_count": 2.0, "step": 8408, "text_loss": 0.20888492465019226 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.4837100088054, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00011553373186777327, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 13564080.0, "repeat_count": 1.0, "routers_loss": 0.003303215140476823, "skip_count": 2.0, "step": 8410, "text_loss": 0.21808166801929474 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.49310243616085, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0208740234375, "learning_rate": 0.00011533592237212558, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 13566649.0, "repeat_count": 0.0, "routers_loss": 0.005856195464730263, "skip_count": 1.0, "step": 8412, "text_loss": 0.28037169575691223 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0211181640625, "learning_rate": 0.0001151382602804782, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 13570015.0, "repeat_count": 0.0, "routers_loss": 0.0007515792385675013, "skip_count": 0.0, "step": 8414, "text_loss": 0.8517835736274719 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00011494074566857549, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 13573262.0, "repeat_count": 0.0, "routers_loss": 0.0043421462178230286, "skip_count": 0.0, "step": 8416, "text_loss": 0.27418580651283264 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.52127971822718, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.00011474337861210544, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 13576104.0, "repeat_count": 1.0, "routers_loss": 0.0108594736084342, "skip_count": 2.0, "step": 8418, "text_loss": 0.4724268317222595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.53067214558262, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.00011454615918669948, "loss": 0.008, "macro_f1": 0.3272727429866791, "num_tokens": 13579138.0, "repeat_count": 1.0, "routers_loss": 0.04178442806005478, "skip_count": 0.0, "step": 8420, "text_loss": 0.4065103530883789 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.54006457293807, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0302734375, "learning_rate": 0.00011434908746793238, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 13582818.0, "repeat_count": 0.0, "routers_loss": 0.004756448790431023, "skip_count": 2.0, "step": 8422, "text_loss": 0.2932167947292328 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00011415216353132252, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 13586261.0, "repeat_count": 0.0, "routers_loss": 0.0033427432645112276, "skip_count": 1.0, "step": 8424, "text_loss": 0.47670233249664307 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0201416015625, "learning_rate": 0.0001139553874523313, "loss": 0.003, "macro_f1": 0.6666666865348816, "num_tokens": 13589765.0, "repeat_count": 0.0, "routers_loss": 0.006597383879125118, "skip_count": 1.0, "step": 8426, "text_loss": 0.31448885798454285 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.5682418550044, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.00011375875930636403, "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 13592741.0, "repeat_count": 0.0, "routers_loss": 0.011398134753108025, "skip_count": 1.0, "step": 8428, "text_loss": 0.17429469525814056 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 39.577634282359845, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.00011356227916876877, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 13595763.0, "repeat_count": 1.0, "routers_loss": 0.0038021153304725885, "skip_count": 0.0, "step": 8430, "text_loss": 0.6043882966041565 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.58702670971529, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 0.00011336594711483712, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 13598274.0, "repeat_count": 0.0, "routers_loss": 0.00044314167462289333, "skip_count": 0.0, "step": 8432, "text_loss": 0.3818575143814087 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.59641913707074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.00011316976321980388, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 13601510.0, "repeat_count": 0.0, "routers_loss": 0.001956664025783539, "skip_count": 0.0, "step": 8434, "text_loss": 0.48483794927597046 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0001129737275588471, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 13604410.0, "repeat_count": 0.0, "routers_loss": 0.005170237272977829, "skip_count": 0.0, "step": 8436, "text_loss": 0.21759741008281708 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.61520399178163, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.00011277784020708803, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 13607207.0, "repeat_count": 1.0, "routers_loss": 0.002223948948085308, "skip_count": 2.0, "step": 8438, "text_loss": 0.6877034306526184 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0203857421875, "learning_rate": 0.00011258210123959089, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 13610981.0, "repeat_count": 0.0, "routers_loss": 0.0017733481945469975, "skip_count": 1.0, "step": 8440, "text_loss": 0.7250658273696899 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 39.633988846492514, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.00011238651073136358, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 13614194.0, "repeat_count": 1.0, "routers_loss": 0.00155889883171767, "skip_count": 1.0, "step": 8442, "text_loss": 0.6742649078369141 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.00011219106875735652, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 13618011.0, "repeat_count": 0.0, "routers_loss": 0.0011234934208914638, "skip_count": 0.0, "step": 8444, "text_loss": 0.8105526566505432 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 39.65277370120341, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0198974609375, "learning_rate": 0.00011199577539246347, "loss": 0.0055, "macro_f1": 0.6603773832321167, "num_tokens": 13621852.0, "repeat_count": 1.0, "routers_loss": 0.02346695400774479, "skip_count": 1.0, "step": 8446, "text_loss": 0.22664032876491547 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.66216612855885, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02294921875, "learning_rate": 0.0001118006307115213, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 13624711.0, "repeat_count": 0.0, "routers_loss": 0.012819754891097546, "skip_count": 2.0, "step": 8448, "text_loss": 0.31696105003356934 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 39.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.00011160563478930969, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 13627561.0, "repeat_count": 0.0, "routers_loss": 0.0060531035996973515, "skip_count": 2.0, "step": 8450, "text_loss": 0.2935826778411865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00011141078770055152, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 13630445.0, "repeat_count": 0.0, "routers_loss": 0.004288572818040848, "skip_count": 0.0, "step": 8452, "text_loss": 0.5720692873001099 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.69034341062518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.00011121608951991252, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 13633496.0, "repeat_count": 0.0, "routers_loss": 0.005682424642145634, "skip_count": 1.0, "step": 8454, "text_loss": 0.28466710448265076 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.699735837980626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00011102154032200146, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 13635938.0, "repeat_count": 0.0, "routers_loss": 0.0009555552969686687, "skip_count": 0.0, "step": 8456, "text_loss": 0.47744694352149963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.70912826533607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.00011082714018136985, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 13638863.0, "repeat_count": 0.0, "routers_loss": 0.0023627313785254955, "skip_count": 0.0, "step": 8458, "text_loss": 0.5212090611457825 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.71852069269152, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0189208984375, "learning_rate": 0.00011063288917251235, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 13641874.0, "repeat_count": 1.0, "routers_loss": 0.00791920255869627, "skip_count": 2.0, "step": 8460, "text_loss": 0.31359919905662537 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 39.72791312004696, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.00011043878736986607, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 13644970.0, "repeat_count": 1.0, "routers_loss": 0.0033252311404794455, "skip_count": 1.0, "step": 8462, "text_loss": 0.33621230721473694 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.73730554740241, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.00011024483484781144, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 13648103.0, "repeat_count": 1.0, "routers_loss": 0.005567418877035379, "skip_count": 2.0, "step": 8464, "text_loss": 0.48708856105804443 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.00011005103168067143, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 13651085.0, "repeat_count": 0.0, "routers_loss": 0.00047958645154722035, "skip_count": 0.0, "step": 8466, "text_loss": 0.4151248633861542 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.756090402113294, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00010985737794271161, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 13654175.0, "repeat_count": 0.0, "routers_loss": 0.0009806647431105375, "skip_count": 0.0, "step": 8468, "text_loss": 0.7322396039962769 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.00010966387370814057, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 13657058.0, "repeat_count": 0.0, "routers_loss": 0.0009820344857871532, "skip_count": 0.0, "step": 8470, "text_loss": 0.6350769400596619 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 39.77487525682419, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0206298828125, "learning_rate": 0.00010947051905110945, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 13660203.0, "repeat_count": 2.0, "routers_loss": 0.002065197564661503, "skip_count": 0.0, "step": 8472, "text_loss": 0.6025850176811218 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.00010927731404571211, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 13664021.0, "repeat_count": 0.0, "routers_loss": 0.0009939799783751369, "skip_count": 0.0, "step": 8474, "text_loss": 0.3040087819099426 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0001090842587659851, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 13667055.0, "repeat_count": 0.0, "routers_loss": 0.0008282510680146515, "skip_count": 0.0, "step": 8476, "text_loss": 0.7306531667709351 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0001088913532859076, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 13669940.0, "repeat_count": 0.0, "routers_loss": 0.0008349589770659804, "skip_count": 0.0, "step": 8478, "text_loss": 0.32041916251182556 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.81244496624596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 0.00010869859767940133, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 13672955.0, "repeat_count": 0.0, "routers_loss": 0.0007435405277647078, "skip_count": 0.0, "step": 8480, "text_loss": 0.5343614816665649 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.821837393601406, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.00010850599202033051, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 13676173.0, "repeat_count": 0.0, "routers_loss": 0.002763360273092985, "skip_count": 0.0, "step": 8482, "text_loss": 0.6071668267250061 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.83122982095686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.00010831353638250213, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 13680121.0, "repeat_count": 0.0, "routers_loss": 0.00202178000472486, "skip_count": 0.0, "step": 8484, "text_loss": 0.42487844824790955 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.8406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.00010812123083966535, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 13683504.0, "repeat_count": 0.0, "routers_loss": 0.0056348275393247604, "skip_count": 1.0, "step": 8486, "text_loss": 0.17678795754909515 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.00010792907546551229, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 13686870.0, "repeat_count": 0.0, "routers_loss": 0.003331703832373023, "skip_count": 0.0, "step": 8488, "text_loss": 0.32238465547561646 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.85940710302319, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.00010773707033367708, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 13690429.0, "repeat_count": 0.0, "routers_loss": 0.0011620528530329466, "skip_count": 0.0, "step": 8490, "text_loss": 0.4141998291015625 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 39.86879953037863, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.00010754521551773655, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 13693747.0, "repeat_count": 1.0, "routers_loss": 0.005236583761870861, "skip_count": 0.0, "step": 8492, "text_loss": 0.557283878326416 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 39.878191957734074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03466796875, "learning_rate": 0.00010735351109120972, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 13696837.0, "repeat_count": 0.0, "routers_loss": 0.005507425405085087, "skip_count": 6.0, "step": 8494, "text_loss": 0.7394861578941345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.887584385089525, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0198974609375, "learning_rate": 0.00010716195712755821, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 13700080.0, "repeat_count": 0.0, "routers_loss": 0.0008621517335996032, "skip_count": 0.0, "step": 8496, "text_loss": 0.7079368233680725 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.89697681244497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 0.00010697055370018572, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 13704088.0, "repeat_count": 0.0, "routers_loss": 0.0004489862476475537, "skip_count": 0.0, "step": 8498, "text_loss": 0.5672308206558228 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.90636923980041, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.00010677930088243847, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 13707391.0, "repeat_count": 1.0, "routers_loss": 0.009171495214104652, "skip_count": 2.0, "step": 8500, "text_loss": 0.6851600408554077 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029541015625, "learning_rate": 0.00010658819874760495, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 13711238.0, "repeat_count": 0.0, "routers_loss": 0.0016714727971702814, "skip_count": 1.0, "step": 8502, "text_loss": 0.7102733850479126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.00010639724736891576, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 13714553.0, "repeat_count": 0.0, "routers_loss": 0.0012916292762383819, "skip_count": 0.0, "step": 8504, "text_loss": 0.4234752953052521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.93454652186674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0001062064468195439, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 13718046.0, "repeat_count": 0.0, "routers_loss": 0.0005265420186333358, "skip_count": 0.0, "step": 8506, "text_loss": 0.5576326251029968 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.943938949222186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0001060157971726045, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 13720687.0, "repeat_count": 0.0, "routers_loss": 0.0023503501433879137, "skip_count": 1.0, "step": 8508, "text_loss": 0.5259605646133423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.95333137657764, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01904296875, "learning_rate": 0.00010582529850115469, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 13723946.0, "repeat_count": 0.0, "routers_loss": 0.0007593657355755568, "skip_count": 0.0, "step": 8510, "text_loss": 0.3795129954814911 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.00010563495087819419, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 13727589.0, "repeat_count": 0.0, "routers_loss": 0.0005672222469002008, "skip_count": 0.0, "step": 8512, "text_loss": 0.685897946357727 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 39.972116231288524, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.00010544475437666445, "loss": 0.0049, "macro_f1": 0.9262410998344421, "num_tokens": 13730579.0, "repeat_count": 3.0, "routers_loss": 0.01708158478140831, "skip_count": 2.0, "step": 8514, "text_loss": 0.8044925332069397 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 39.98150865864397, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0517578125, "learning_rate": 0.00010525470906944917, "loss": 0.0113, "macro_f1": 1.0, "num_tokens": 13733563.0, "repeat_count": 1.0, "routers_loss": 0.010253295302391052, "skip_count": 2.0, "step": 8516, "text_loss": 0.3999447524547577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 39.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 0.00010506481502937398, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 13736645.0, "repeat_count": 0.0, "routers_loss": 0.004293019883334637, "skip_count": 0.0, "step": 8518, "text_loss": 0.3128681778907776 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 40.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.00010487507232920674, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 13740080.0, "repeat_count": 1.0, "routers_loss": 0.0030790462624281645, "skip_count": 1.0, "step": 8520, "text_loss": 0.39142900705337524 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.00939242735544, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.00010468548104165709, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 13743085.0, "repeat_count": 0.0, "routers_loss": 0.0007342757890000939, "skip_count": 0.0, "step": 8522, "text_loss": 0.7652465105056763 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 0.00010449604123937689, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 13746513.0, "repeat_count": 0.0, "routers_loss": 0.0030496022664010525, "skip_count": 0.0, "step": 8524, "text_loss": 0.6259746551513672 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 40.02817728206633, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0206298828125, "learning_rate": 0.00010430675299495973, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 13749391.0, "repeat_count": 1.0, "routers_loss": 0.010060965083539486, "skip_count": 1.0, "step": 8526, "text_loss": 0.2266668826341629 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.03756970942178, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 0.0001041176163809413, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 13752449.0, "repeat_count": 1.0, "routers_loss": 0.002234962536022067, "skip_count": 2.0, "step": 8528, "text_loss": 0.9742465019226074 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.00010392863146979903, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 13755572.0, "repeat_count": 0.0, "routers_loss": 0.0003572004789020866, "skip_count": 0.0, "step": 8530, "text_loss": 0.5757357478141785 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 0.00010373979833395242, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 13759198.0, "repeat_count": 0.0, "routers_loss": 0.011161680333316326, "skip_count": 0.0, "step": 8532, "text_loss": 0.6268131136894226 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.06574699148811, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.00010355111704576236, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 13761914.0, "repeat_count": 0.0, "routers_loss": 0.002053353004157543, "skip_count": 0.0, "step": 8534, "text_loss": 0.22388778626918793 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.075139418843555, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.00010336258767753232, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 13765371.0, "repeat_count": 0.0, "routers_loss": 0.003634720342233777, "skip_count": 2.0, "step": 8536, "text_loss": 0.5802993178367615 }, { "acc_repeat": 0.800000011920929, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.084531846199, "f1_execute": 0.9729729890823364, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.00010317421030150692, "loss": 0.0072, "macro_f1": 0.9539539813995361, "num_tokens": 13768276.0, "repeat_count": 5.0, "routers_loss": 0.053806692361831665, "skip_count": 5.0, "step": 8538, "text_loss": 0.10888377577066422 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.09392427355445, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07275390625, "learning_rate": 0.00010298598498987266, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 13772369.0, "repeat_count": 0.0, "routers_loss": 0.00501362606883049, "skip_count": 1.0, "step": 8540, "text_loss": 0.5794995427131653 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.10331670090989, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.00010279791181475795, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 13776595.0, "repeat_count": 1.0, "routers_loss": 0.002230882178992033, "skip_count": 2.0, "step": 8542, "text_loss": 0.5503702163696289 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.11270912826534, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.00010260999084823264, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 13779993.0, "repeat_count": 0.0, "routers_loss": 0.0012205395614728332, "skip_count": 0.0, "step": 8544, "text_loss": 0.7248672842979431 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.00010242222216230856, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 13782683.0, "repeat_count": 0.0, "routers_loss": 0.0003966465883422643, "skip_count": 0.0, "step": 8546, "text_loss": 0.7446619272232056 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 0.00010223460582893889, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 13785534.0, "repeat_count": 0.0, "routers_loss": 0.004968565888702869, "skip_count": 1.0, "step": 8548, "text_loss": 0.22457796335220337 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020263671875, "learning_rate": 0.00010204714192001863, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 13788608.0, "repeat_count": 0.0, "routers_loss": 0.0033054195810109377, "skip_count": 2.0, "step": 8550, "text_loss": 0.418837308883667 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.15027883768712, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018310546875, "learning_rate": 0.00010185983050738434, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 13791553.0, "repeat_count": 0.0, "routers_loss": 0.001166256028227508, "skip_count": 0.0, "step": 8552, "text_loss": 0.4060337543487549 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.00010167267166281402, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 13795304.0, "repeat_count": 0.0, "routers_loss": 0.003844029037281871, "skip_count": 2.0, "step": 8554, "text_loss": 0.17412975430488586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023681640625, "learning_rate": 0.00010148566545802718, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 13798445.0, "repeat_count": 0.0, "routers_loss": 0.0033507589250802994, "skip_count": 0.0, "step": 8556, "text_loss": 0.24744336307048798 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.00010129881196468527, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 13801338.0, "repeat_count": 0.0, "routers_loss": 0.004076482728123665, "skip_count": 0.0, "step": 8558, "text_loss": 0.6542767882347107 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.18784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01806640625, "learning_rate": 0.00010111211125439069, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 13804157.0, "repeat_count": 0.0, "routers_loss": 0.0005654391716234386, "skip_count": 0.0, "step": 8560, "text_loss": 0.527079701423645 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.197240974464336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.00010092556339868758, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 13807411.0, "repeat_count": 0.0, "routers_loss": 0.004915264435112476, "skip_count": 1.0, "step": 8562, "text_loss": 0.721017599105835 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.20663340181978, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.00010073916846906139, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 13810489.0, "repeat_count": 0.0, "routers_loss": 0.005571382585912943, "skip_count": 1.0, "step": 8564, "text_loss": 0.5802517533302307 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.21602582917523, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.00010055292653693903, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 13813526.0, "repeat_count": 0.0, "routers_loss": 0.001321605988778174, "skip_count": 0.0, "step": 8566, "text_loss": 0.5485247373580933 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.22541825653067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.00010036683767368859, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 13817225.0, "repeat_count": 0.0, "routers_loss": 0.001876185997389257, "skip_count": 0.0, "step": 8568, "text_loss": 0.08957820385694504 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.23481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.00010018090195061997, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 13820667.0, "repeat_count": 0.0, "routers_loss": 0.004593426361680031, "skip_count": 0.0, "step": 8570, "text_loss": 0.24580086767673492 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 9.999511943898398e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 13824505.0, "repeat_count": 0.0, "routers_loss": 0.0022372701205313206, "skip_count": 0.0, "step": 8572, "text_loss": 0.20976831018924713 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.253595538597004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 9.980949020997276e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 13827623.0, "repeat_count": 0.0, "routers_loss": 0.0030519715510308743, "skip_count": 0.0, "step": 8574, "text_loss": 0.7638732194900513 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 9.962401433471985e-05, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 13831013.0, "repeat_count": 0.0, "routers_loss": 0.005036211106926203, "skip_count": 1.0, "step": 8576, "text_loss": 0.3791790306568146 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.2723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 9.943869188429989e-05, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 13833611.0, "repeat_count": 0.0, "routers_loss": 0.002071794355288148, "skip_count": 2.0, "step": 8578, "text_loss": 0.5480846166610718 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 40.28177282066334, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 9.925352292972884e-05, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 13836678.0, "repeat_count": 1.0, "routers_loss": 0.008119060657918453, "skip_count": 0.0, "step": 8580, "text_loss": 0.21605457365512848 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020751953125, "learning_rate": 9.906850754196379e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 13839255.0, "repeat_count": 0.0, "routers_loss": 0.004017427563667297, "skip_count": 2.0, "step": 8582, "text_loss": 0.4473285973072052 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045654296875, "learning_rate": 9.888364579190285e-05, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 13842034.0, "repeat_count": 0.0, "routers_loss": 0.005163116846233606, "skip_count": 1.0, "step": 8584, "text_loss": 0.21627424657344818 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.30995010272967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 9.869893775038557e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 13844648.0, "repeat_count": 0.0, "routers_loss": 0.0044358340092003345, "skip_count": 1.0, "step": 8586, "text_loss": 0.5660704970359802 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.319342530085116, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021484375, "learning_rate": 9.851438348819247e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 13847629.0, "repeat_count": 0.0, "routers_loss": 0.00038135924842208624, "skip_count": 1.0, "step": 8588, "text_loss": 0.6401235461235046 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.32873495744057, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 9.832998307604495e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 13851409.0, "repeat_count": 0.0, "routers_loss": 0.004005341790616512, "skip_count": 1.0, "step": 8590, "text_loss": 0.43975043296813965 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.33812738479601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 9.814573658460562e-05, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 13854031.0, "repeat_count": 0.0, "routers_loss": 0.006872966885566711, "skip_count": 2.0, "step": 8592, "text_loss": 0.6000451445579529 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0184326171875, "learning_rate": 9.796164408447811e-05, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 13856813.0, "repeat_count": 0.0, "routers_loss": 0.0019872859120368958, "skip_count": 0.0, "step": 8594, "text_loss": 0.6026073098182678 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.3569122395069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 9.777770564620698e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 13859805.0, "repeat_count": 0.0, "routers_loss": 0.013098123483359814, "skip_count": 2.0, "step": 8596, "text_loss": 0.3294500708580017 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 40.36630466686234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0203857421875, "learning_rate": 9.759392134027783e-05, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 13863119.0, "repeat_count": 1.0, "routers_loss": 0.001011171261779964, "skip_count": 1.0, "step": 8598, "text_loss": 0.4078965187072754 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.375697094217784, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 9.741029123711708e-05, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 13866239.0, "repeat_count": 0.0, "routers_loss": 0.003267963184043765, "skip_count": 0.0, "step": 8600, "text_loss": 0.5064641833305359 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.385089521573235, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0308837890625, "learning_rate": 9.722681540709228e-05, "loss": 0.0045, "macro_f1": 0.6601307392120361, "num_tokens": 13869647.0, "repeat_count": 1.0, "routers_loss": 0.02431299351155758, "skip_count": 2.0, "step": 8602, "text_loss": 0.2512950301170349 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.39448194892868, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0294189453125, "learning_rate": 9.704349392051155e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 13873128.0, "repeat_count": 0.0, "routers_loss": 0.0019577480852603912, "skip_count": 1.0, "step": 8604, "text_loss": 0.425156831741333 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0201416015625, "learning_rate": 9.686032684762408e-05, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 13876603.0, "repeat_count": 0.0, "routers_loss": 0.001554530463181436, "skip_count": 1.0, "step": 8606, "text_loss": 0.3596082329750061 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01519775390625, "learning_rate": 9.667731425861975e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 13879602.0, "repeat_count": 0.0, "routers_loss": 0.0027400986291468143, "skip_count": 0.0, "step": 8608, "text_loss": 0.12101534754037857 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 9.649445622362957e-05, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 13882204.0, "repeat_count": 0.0, "routers_loss": 0.001957559958100319, "skip_count": 2.0, "step": 8610, "text_loss": 0.382834255695343 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.43205165835045, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 9.631175281272491e-05, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 13886397.0, "repeat_count": 1.0, "routers_loss": 0.009613300673663616, "skip_count": 3.0, "step": 8612, "text_loss": 0.24718235433101654 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.441444085705896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 9.612920409591813e-05, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 13889625.0, "repeat_count": 0.0, "routers_loss": 0.0015159029280766845, "skip_count": 0.0, "step": 8614, "text_loss": 0.406452476978302 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 40.45083651306135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 9.59468101431622e-05, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 13892518.0, "repeat_count": 0.0, "routers_loss": 0.008069832809269428, "skip_count": 3.0, "step": 8616, "text_loss": 0.19740329682826996 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0157470703125, "learning_rate": 9.576457102435082e-05, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 13895822.0, "repeat_count": 0.0, "routers_loss": 0.0024340536911040545, "skip_count": 0.0, "step": 8618, "text_loss": 0.44761306047439575 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 40.469621367772234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02880859375, "learning_rate": 9.558248680931841e-05, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 13898829.0, "repeat_count": 2.0, "routers_loss": 0.0053517078049480915, "skip_count": 1.0, "step": 8620, "text_loss": 0.37335118651390076 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.47901379512768, "f1_execute": 0.9767441749572754, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.021484375, "learning_rate": 9.540055756783994e-05, "loss": 0.0061, "macro_f1": 0.9255813956260681, "num_tokens": 13902122.0, "repeat_count": 3.0, "routers_loss": 0.03885587304830551, "skip_count": 4.0, "step": 8622, "text_loss": 0.21311092376708984 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.48840622248312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.051025390625, "learning_rate": 9.521878336963108e-05, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 13904874.0, "repeat_count": 0.0, "routers_loss": 0.007965708151459694, "skip_count": 1.0, "step": 8624, "text_loss": 0.27229398488998413 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.497798649838565, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 9.5037164284348e-05, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 13907755.0, "repeat_count": 0.0, "routers_loss": 0.0019825168419629335, "skip_count": 0.0, "step": 8626, "text_loss": 0.6535577178001404 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.507191077194015, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 9.485570038158747e-05, "loss": 0.0085, "macro_f1": 0.3272727429866791, "num_tokens": 13910619.0, "repeat_count": 1.0, "routers_loss": 0.017803344875574112, "skip_count": 0.0, "step": 8628, "text_loss": 0.26617178320884705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.51658350454946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 9.467439173088687e-05, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 13914098.0, "repeat_count": 0.0, "routers_loss": 0.0025836096610873938, "skip_count": 0.0, "step": 8630, "text_loss": 0.44465285539627075 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.5259759319049, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030029296875, "learning_rate": 9.44932384017238e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 13917192.0, "repeat_count": 0.0, "routers_loss": 0.004438584204763174, "skip_count": 2.0, "step": 8632, "text_loss": 0.33622798323631287 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 9.431224046351688e-05, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 13920067.0, "repeat_count": 0.0, "routers_loss": 0.017312567681074142, "skip_count": 2.0, "step": 8634, "text_loss": 0.31870952248573303 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0255126953125, "learning_rate": 9.413139798562476e-05, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 13922887.0, "repeat_count": 0.0, "routers_loss": 0.0019389945082366467, "skip_count": 0.0, "step": 8636, "text_loss": 0.18223261833190918 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.55415321397123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 9.395071103734648e-05, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 13926545.0, "repeat_count": 0.0, "routers_loss": 0.0011485094437375665, "skip_count": 0.0, "step": 8638, "text_loss": 0.48031774163246155 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 40.563545641326684, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0184326171875, "learning_rate": 9.377017968792179e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 13931171.0, "repeat_count": 1.0, "routers_loss": 0.003448521951213479, "skip_count": 0.0, "step": 8640, "text_loss": 0.7585139870643616 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 40.57293806868213, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0213623046875, "learning_rate": 9.35898040065305e-05, "loss": 0.0048, "macro_f1": 0.5492662787437439, "num_tokens": 13934369.0, "repeat_count": 0.0, "routers_loss": 0.017959754914045334, "skip_count": 2.0, "step": 8642, "text_loss": 0.49708613753318787 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.018310546875, "learning_rate": 9.3409584062293e-05, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 13938166.0, "repeat_count": 0.0, "routers_loss": 0.004092653747648001, "skip_count": 1.0, "step": 8644, "text_loss": 0.20662656426429749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 9.322951992426992e-05, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 13941922.0, "repeat_count": 0.0, "routers_loss": 0.0026206092443317175, "skip_count": 0.0, "step": 8646, "text_loss": 0.4735889434814453 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 40.60111535074846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 9.304961166146209e-05, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 13945569.0, "repeat_count": 3.0, "routers_loss": 0.005156307481229305, "skip_count": 2.0, "step": 8648, "text_loss": 0.5630270838737488 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.6105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 9.286985934281079e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 13948357.0, "repeat_count": 0.0, "routers_loss": 0.004913610871881247, "skip_count": 1.0, "step": 8650, "text_loss": 0.4053497016429901 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.619900205459345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 9.26902630371974e-05, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 13952543.0, "repeat_count": 0.0, "routers_loss": 0.003946282435208559, "skip_count": 2.0, "step": 8652, "text_loss": 0.40166863799095154 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.629292632814796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 9.251082281344358e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 13955917.0, "repeat_count": 0.0, "routers_loss": 0.0009605551022104919, "skip_count": 0.0, "step": 8654, "text_loss": 0.20477983355522156 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 40.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 9.233153874031102e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 13960071.0, "repeat_count": 0.0, "routers_loss": 0.004408199340105057, "skip_count": 3.0, "step": 8656, "text_loss": 0.3349814713001251 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.64807748752568, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0208740234375, "learning_rate": 9.215241088650194e-05, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 13963125.0, "repeat_count": 1.0, "routers_loss": 0.005541396792978048, "skip_count": 2.0, "step": 8658, "text_loss": 0.6602919697761536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 9.197343932065843e-05, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 13966130.0, "repeat_count": 0.0, "routers_loss": 0.001636760076507926, "skip_count": 0.0, "step": 8660, "text_loss": 0.7704628109931946 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.66686234223657, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 9.179462411136263e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 13969791.0, "repeat_count": 0.0, "routers_loss": 0.0006453761598095298, "skip_count": 0.0, "step": 8662, "text_loss": 0.3898075520992279 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 40.67625476959201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 9.161596532713695e-05, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 13972987.0, "repeat_count": 0.0, "routers_loss": 0.005081792362034321, "skip_count": 4.0, "step": 8664, "text_loss": 0.8477506041526794 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.685647196947464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 9.143746303644374e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 13976505.0, "repeat_count": 0.0, "routers_loss": 0.0032063762191683054, "skip_count": 0.0, "step": 8666, "text_loss": 0.23729658126831055 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.69503962430291, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 9.125911730768543e-05, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 13980061.0, "repeat_count": 0.0, "routers_loss": 0.00043821477447636425, "skip_count": 0.0, "step": 8668, "text_loss": 0.4233637750148773 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 9.108092820920438e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 13983407.0, "repeat_count": 0.0, "routers_loss": 0.007779054809361696, "skip_count": 2.0, "step": 8670, "text_loss": 0.5050316452980042 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 9.090289580928307e-05, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 13986725.0, "repeat_count": 0.0, "routers_loss": 0.0018697676714509726, "skip_count": 1.0, "step": 8672, "text_loss": 1.0568488836288452 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 9.072502017614382e-05, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 13990765.0, "repeat_count": 0.0, "routers_loss": 0.002077789744362235, "skip_count": 0.0, "step": 8674, "text_loss": 0.48911142349243164 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 40.73260933372468, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0419921875, "learning_rate": 9.054730137794887e-05, "loss": 0.0081, "macro_f1": 0.6598639488220215, "num_tokens": 13994083.0, "repeat_count": 1.0, "routers_loss": 0.044373031705617905, "skip_count": 3.0, "step": 8676, "text_loss": 0.3420281708240509 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0250244140625, "learning_rate": 9.036973948280048e-05, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 13997500.0, "repeat_count": 0.0, "routers_loss": 0.0015431724023073912, "skip_count": 0.0, "step": 8678, "text_loss": 0.21514096856117249 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.751394188435576, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030029296875, "learning_rate": 9.019233455874049e-05, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 14000460.0, "repeat_count": 0.0, "routers_loss": 0.006088062655180693, "skip_count": 1.0, "step": 8680, "text_loss": 0.43932875990867615 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.76078661579102, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 9.001508667375107e-05, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 14003537.0, "repeat_count": 2.0, "routers_loss": 0.01006145216524601, "skip_count": 3.0, "step": 8682, "text_loss": 0.2192728966474533 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.77017904314646, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 8.983799589575393e-05, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 14005943.0, "repeat_count": 0.0, "routers_loss": 0.001044525415636599, "skip_count": 0.0, "step": 8684, "text_loss": 0.8686383962631226 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.77957147050191, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 8.96610622926104e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 14008954.0, "repeat_count": 0.0, "routers_loss": 0.004876079503446817, "skip_count": 2.0, "step": 8686, "text_loss": 0.2513524889945984 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.78896389785735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 8.948428593212193e-05, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 14012268.0, "repeat_count": 1.0, "routers_loss": 0.007909095846116543, "skip_count": 2.0, "step": 8688, "text_loss": 0.17117907106876373 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.798356325212794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 8.930766688202946e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 14015192.0, "repeat_count": 0.0, "routers_loss": 0.0022194553166627884, "skip_count": 0.0, "step": 8690, "text_loss": 0.637697160243988 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 40.807748752568244, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0162353515625, "learning_rate": 8.913120521001383e-05, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 14018055.0, "repeat_count": 1.0, "routers_loss": 0.0023777696769684553, "skip_count": 0.0, "step": 8692, "text_loss": 0.39099860191345215 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.81714117992369, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 8.895490098369535e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 14021035.0, "repeat_count": 0.0, "routers_loss": 0.002676652278751135, "skip_count": 1.0, "step": 8694, "text_loss": 0.6112156510353088 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 8.877875427063431e-05, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 14023759.0, "repeat_count": 0.0, "routers_loss": 0.001040685223415494, "skip_count": 0.0, "step": 8696, "text_loss": 0.3562681972980499 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 40.835926034634575, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 8.86027651383302e-05, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 14026090.0, "repeat_count": 1.0, "routers_loss": 0.0011444527190178633, "skip_count": 0.0, "step": 8698, "text_loss": 0.6152632236480713 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 40.84531846199002, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.04345703125, "learning_rate": 8.842693365422266e-05, "loss": 0.008, "macro_f1": 0.8817967176437378, "num_tokens": 14029570.0, "repeat_count": 2.0, "routers_loss": 0.024327632039785385, "skip_count": 3.0, "step": 8700, "text_loss": 0.2170596867799759 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.85471088934546, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 8.825125988569061e-05, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 14032418.0, "repeat_count": 0.0, "routers_loss": 0.00048010432510636747, "skip_count": 0.0, "step": 8702, "text_loss": 0.4421340525150299 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 8.807574390005241e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 14035610.0, "repeat_count": 0.0, "routers_loss": 0.0010498231276869774, "skip_count": 0.0, "step": 8704, "text_loss": 0.3656717538833618 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.873495744056356, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 8.790038576456627e-05, "loss": 0.0045, "macro_f1": 0.3272727429866791, "num_tokens": 14039354.0, "repeat_count": 0.0, "routers_loss": 0.019302964210510254, "skip_count": 1.0, "step": 8706, "text_loss": 0.6150856018066406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 8.772518554642972e-05, "loss": 0.0029, "macro_f1": 0.3333333432674408, "num_tokens": 14042353.0, "repeat_count": 0.0, "routers_loss": 0.004211598541587591, "skip_count": 0.0, "step": 8708, "text_loss": 0.17178772389888763 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.89228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 8.755014331277972e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 14045704.0, "repeat_count": 0.0, "routers_loss": 0.0007902922225184739, "skip_count": 0.0, "step": 8710, "text_loss": 0.6289885640144348 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.90167302612269, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 8.737525913069277e-05, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 14048743.0, "repeat_count": 1.0, "routers_loss": 0.007915202528238297, "skip_count": 2.0, "step": 8712, "text_loss": 0.2778690457344055 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 40.91106545347813, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 8.720053306718506e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 14052762.0, "repeat_count": 0.0, "routers_loss": 0.0027877227403223515, "skip_count": 3.0, "step": 8714, "text_loss": 0.3615926504135132 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.92045788083358, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0478515625, "learning_rate": 8.702596518921175e-05, "loss": 0.0086, "macro_f1": 0.6603773832321167, "num_tokens": 14056645.0, "repeat_count": 1.0, "routers_loss": 0.03460995852947235, "skip_count": 1.0, "step": 8716, "text_loss": 0.19412031769752502 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.929850308189025, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02001953125, "learning_rate": 8.685155556366763e-05, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 14059604.0, "repeat_count": 1.0, "routers_loss": 0.0026834046002477407, "skip_count": 2.0, "step": 8718, "text_loss": 0.4414670169353485 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 40.93924273554447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020751953125, "learning_rate": 8.667730425738679e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 14062170.0, "repeat_count": 0.0, "routers_loss": 0.01547359861433506, "skip_count": 4.0, "step": 8720, "text_loss": 0.2850716710090637 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 8.650321133714267e-05, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 14065526.0, "repeat_count": 0.0, "routers_loss": 0.0020194994285702705, "skip_count": 0.0, "step": 8722, "text_loss": 0.1776508241891861 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 8.632927686964798e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 14068525.0, "repeat_count": 0.0, "routers_loss": 0.0037195945624262094, "skip_count": 0.0, "step": 8724, "text_loss": 0.2786005735397339 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 40.9674200176108, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0220947265625, "learning_rate": 8.615550092155477e-05, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 14071830.0, "repeat_count": 1.0, "routers_loss": 0.008169961161911488, "skip_count": 4.0, "step": 8726, "text_loss": 0.43228310346603394 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.97681244496625, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 8.598188355945424e-05, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 14074977.0, "repeat_count": 0.0, "routers_loss": 0.006407112814486027, "skip_count": 1.0, "step": 8728, "text_loss": 0.24443474411964417 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 40.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0498046875, "learning_rate": 8.580842484987689e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 14078104.0, "repeat_count": 0.0, "routers_loss": 0.001878641895018518, "skip_count": 1.0, "step": 8730, "text_loss": 0.4559098184108734 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 40.99559729967714, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 8.563512485929253e-05, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 14081934.0, "repeat_count": 0.0, "routers_loss": 0.0056114462204277515, "skip_count": 0.0, "step": 8732, "text_loss": 0.3063429594039917 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 41.004696213677725, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 8.546198365411007e-05, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 14085097.0, "repeat_count": 1.0, "routers_loss": 0.001542840269394219, "skip_count": 0.0, "step": 8734, "text_loss": 0.7624274492263794 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.01408864103317, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 8.528900130067741e-05, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 14088630.0, "repeat_count": 0.0, "routers_loss": 0.002677374053746462, "skip_count": 0.0, "step": 8736, "text_loss": 0.18395234644412994 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 41.02348106838861, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 8.511617786528175e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 14091513.0, "repeat_count": 1.0, "routers_loss": 0.004059800878167152, "skip_count": 0.0, "step": 8738, "text_loss": 0.4567817449569702 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 41.032873495744056, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.040771484375, "learning_rate": 8.494351341414947e-05, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 14094500.0, "repeat_count": 1.0, "routers_loss": 0.0023724427446722984, "skip_count": 1.0, "step": 8740, "text_loss": 0.6925744414329529 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0155029296875, "learning_rate": 8.477100801344573e-05, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 14097518.0, "repeat_count": 0.0, "routers_loss": 0.0013842503540217876, "skip_count": 2.0, "step": 8742, "text_loss": 0.6574832201004028 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.05165835045494, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 8.459866172927505e-05, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 14101219.0, "repeat_count": 0.0, "routers_loss": 0.003597316099330783, "skip_count": 2.0, "step": 8744, "text_loss": 0.785912036895752 }, { "acc_repeat": 0.0, "acc_skip": 0.800000011920929, "avg_layers": 24.0, "epoch": 41.061050777810394, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 0.027099609375, "learning_rate": 8.442647462768082e-05, "loss": 0.0066, "macro_f1": 0.6225374937057495, "num_tokens": 14104460.0, "repeat_count": 0.0, "routers_loss": 0.01929798349738121, "skip_count": 5.0, "step": 8746, "text_loss": 0.2111714482307434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.07044320516584, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 8.425444677464545e-05, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 14107404.0, "repeat_count": 0.0, "routers_loss": 0.00048497592797502875, "skip_count": 0.0, "step": 8748, "text_loss": 0.4764930307865143 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.07983563252128, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0224609375, "learning_rate": 8.408257823609033e-05, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 14109917.0, "repeat_count": 1.0, "routers_loss": 0.007886217907071114, "skip_count": 2.0, "step": 8750, "text_loss": 0.2771969735622406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0201416015625, "learning_rate": 8.391086907787587e-05, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 14112649.0, "repeat_count": 0.0, "routers_loss": 0.006535434629768133, "skip_count": 0.0, "step": 8752, "text_loss": 0.1550854742527008 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 8.373931936580114e-05, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 14116044.0, "repeat_count": 0.0, "routers_loss": 0.002130605047568679, "skip_count": 0.0, "step": 8754, "text_loss": 0.4055478870868683 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.10801291458761, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 8.356792916560457e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 14119097.0, "repeat_count": 0.0, "routers_loss": 0.0005611231899820268, "skip_count": 0.0, "step": 8756, "text_loss": 0.47804903984069824 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 41.117405341943055, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 8.339669854296316e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 14122079.0, "repeat_count": 2.0, "routers_loss": 0.005650801584124565, "skip_count": 0.0, "step": 8758, "text_loss": 0.1968296617269516 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.126797769298506, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 8.322562756349273e-05, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 14124910.0, "repeat_count": 0.0, "routers_loss": 0.0035948604345321655, "skip_count": 1.0, "step": 8760, "text_loss": 0.4988253712654114 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 8.305471629274802e-05, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 14127767.0, "repeat_count": 0.0, "routers_loss": 0.0012090947711840272, "skip_count": 0.0, "step": 8762, "text_loss": 0.6330704689025879 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.14558262400939, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.019287109375, "learning_rate": 8.288396479622262e-05, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 14130766.0, "repeat_count": 0.0, "routers_loss": 0.0010853242129087448, "skip_count": 1.0, "step": 8764, "text_loss": 0.43057000637054443 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 8.271337313934868e-05, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 14133804.0, "repeat_count": 0.0, "routers_loss": 0.0037055034190416336, "skip_count": 2.0, "step": 8766, "text_loss": 0.31973564624786377 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.16436747872028, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 8.254294138749741e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 14137164.0, "repeat_count": 0.0, "routers_loss": 0.005338407587260008, "skip_count": 0.0, "step": 8768, "text_loss": 0.5066531896591187 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.17375990607572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020751953125, "learning_rate": 8.237266960597844e-05, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 14140119.0, "repeat_count": 0.0, "routers_loss": 0.0014707009540870786, "skip_count": 1.0, "step": 8770, "text_loss": 0.553493857383728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.183152333431174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0223388671875, "learning_rate": 8.220255786004033e-05, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 14143223.0, "repeat_count": 0.0, "routers_loss": 0.002113121096044779, "skip_count": 0.0, "step": 8772, "text_loss": 0.40016281604766846 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.19254476078662, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0179443359375, "learning_rate": 8.203260621487019e-05, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 14146366.0, "repeat_count": 0.0, "routers_loss": 0.002210963051766157, "skip_count": 1.0, "step": 8774, "text_loss": 0.44022905826568604 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 8.186281473559382e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 14150009.0, "repeat_count": 0.0, "routers_loss": 0.0011857844656333327, "skip_count": 0.0, "step": 8776, "text_loss": 0.572823703289032 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 8.169318348727544e-05, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 14153343.0, "repeat_count": 0.0, "routers_loss": 0.0020397785119712353, "skip_count": 1.0, "step": 8778, "text_loss": 0.5724276900291443 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 8.152371253491841e-05, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 14156392.0, "repeat_count": 0.0, "routers_loss": 0.001745635992847383, "skip_count": 0.0, "step": 8780, "text_loss": 0.14162923395633698 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.23011447020839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 8.135440194346416e-05, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 14159616.0, "repeat_count": 0.0, "routers_loss": 0.002799858106300235, "skip_count": 0.0, "step": 8782, "text_loss": 0.18205340206623077 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 41.23950689756384, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0250244140625, "learning_rate": 8.118525177779284e-05, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 14163531.0, "repeat_count": 1.0, "routers_loss": 0.0029223538003861904, "skip_count": 0.0, "step": 8784, "text_loss": 0.4107058644294739 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.248899324919286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01904296875, "learning_rate": 8.101626210272311e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 14166776.0, "repeat_count": 0.0, "routers_loss": 0.001209643087349832, "skip_count": 0.0, "step": 8786, "text_loss": 0.6441596746444702 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 8.084743298301211e-05, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 14169586.0, "repeat_count": 0.0, "routers_loss": 0.0015196573222056031, "skip_count": 0.0, "step": 8788, "text_loss": 0.35585930943489075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.26768417963017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 8.067876448335549e-05, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 14174180.0, "repeat_count": 0.0, "routers_loss": 0.0004388966190163046, "skip_count": 0.0, "step": 8790, "text_loss": 0.31594613194465637 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 41.27707660698562, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 8.05102566683873e-05, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 14177950.0, "repeat_count": 1.0, "routers_loss": 0.0031201441306620836, "skip_count": 0.0, "step": 8792, "text_loss": 0.3161006569862366 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.28646903434106, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 8.034190960268012e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 14180642.0, "repeat_count": 0.0, "routers_loss": 0.001848527928814292, "skip_count": 0.0, "step": 8794, "text_loss": 0.47571417689323425 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.295861461696504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 8.017372335074486e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 14183743.0, "repeat_count": 0.0, "routers_loss": 0.0043064444325864315, "skip_count": 1.0, "step": 8796, "text_loss": 0.5976942777633667 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.305253889051954, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030029296875, "learning_rate": 8.000569797703072e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 14187742.0, "repeat_count": 0.0, "routers_loss": 0.005383181851357222, "skip_count": 2.0, "step": 8798, "text_loss": 0.2692606449127197 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.3146463164074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0206298828125, "learning_rate": 7.983783354592544e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 14191211.0, "repeat_count": 0.0, "routers_loss": 0.001401974936015904, "skip_count": 0.0, "step": 8800, "text_loss": 0.38108205795288086 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 7.967013012175478e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 14194992.0, "repeat_count": 0.0, "routers_loss": 0.001168998540379107, "skip_count": 0.0, "step": 8802, "text_loss": 0.5201764106750488 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05322265625, "learning_rate": 7.950258776878332e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 14198059.0, "repeat_count": 0.0, "routers_loss": 0.0032015808392316103, "skip_count": 2.0, "step": 8804, "text_loss": 0.6014752984046936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.34282359847373, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 7.933520655121351e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 14202313.0, "repeat_count": 0.0, "routers_loss": 0.0009403078584000468, "skip_count": 0.0, "step": 8806, "text_loss": 0.54194176197052 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.35221602582917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 7.916798653318607e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 14205534.0, "repeat_count": 0.0, "routers_loss": 0.0027781077660620213, "skip_count": 1.0, "step": 8808, "text_loss": 0.7181227803230286 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0203857421875, "learning_rate": 7.900092777878004e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 14209357.0, "repeat_count": 0.0, "routers_loss": 0.0034586815163493156, "skip_count": 1.0, "step": 8810, "text_loss": 0.21651209890842438 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 41.371000880540066, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 7.883403035201265e-05, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 14212328.0, "repeat_count": 1.0, "routers_loss": 0.01194343063980341, "skip_count": 4.0, "step": 8812, "text_loss": 0.20523512363433838 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 41.38039330789551, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0157470703125, "learning_rate": 7.866729431683938e-05, "loss": 0.0038, "macro_f1": 1.0, "num_tokens": 14214979.0, "repeat_count": 1.0, "routers_loss": 0.0045132869854569435, "skip_count": 1.0, "step": 8814, "text_loss": 0.4066837728023529 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.38978573525095, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0181884765625, "learning_rate": 7.850071973715368e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 14219030.0, "repeat_count": 0.0, "routers_loss": 0.005109346006065607, "skip_count": 2.0, "step": 8816, "text_loss": 0.12459450960159302 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.3991781626064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 7.833430667678737e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 14222117.0, "repeat_count": 0.0, "routers_loss": 0.0036401136312633753, "skip_count": 0.0, "step": 8818, "text_loss": 0.3759046494960785 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 41.40857058996184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 7.816805519951008e-05, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 14225546.0, "repeat_count": 2.0, "routers_loss": 0.006177824921905994, "skip_count": 1.0, "step": 8820, "text_loss": 0.4031941592693329 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 41.41796301731729, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 7.800196536902987e-05, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 14228731.0, "repeat_count": 0.0, "routers_loss": 0.009549650363624096, "skip_count": 5.0, "step": 8822, "text_loss": 0.2895966172218323 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.427355444672735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 7.783603724899258e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 14231796.0, "repeat_count": 0.0, "routers_loss": 0.005532847251743078, "skip_count": 2.0, "step": 8824, "text_loss": 0.32433390617370605 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.43674787202818, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 7.767027090298206e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 14235869.0, "repeat_count": 0.0, "routers_loss": 0.0011165215400978923, "skip_count": 0.0, "step": 8826, "text_loss": 0.41239091753959656 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 7.750466639452059e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 14238830.0, "repeat_count": 0.0, "routers_loss": 0.0007845646468922496, "skip_count": 0.0, "step": 8828, "text_loss": 0.5113243460655212 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 7.733922378706787e-05, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 14241672.0, "repeat_count": 0.0, "routers_loss": 0.0029602700378745794, "skip_count": 1.0, "step": 8830, "text_loss": 0.22004501521587372 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 41.46492515409451, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 7.717394314402199e-05, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 14244522.0, "repeat_count": 2.0, "routers_loss": 0.005297200754284859, "skip_count": 1.0, "step": 8832, "text_loss": 0.6039504408836365 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.47431758144996, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 7.700882452871872e-05, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 14246964.0, "repeat_count": 0.0, "routers_loss": 0.0018059068825095892, "skip_count": 2.0, "step": 8834, "text_loss": 0.46563026309013367 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 7.684386800443177e-05, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 14249387.0, "repeat_count": 0.0, "routers_loss": 0.005659483838826418, "skip_count": 2.0, "step": 8836, "text_loss": 0.31516948342323303 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.49310243616085, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 7.667907363437288e-05, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 14252438.0, "repeat_count": 0.0, "routers_loss": 0.011170750483870506, "skip_count": 1.0, "step": 8838, "text_loss": 0.22867503762245178 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 7.651444148169157e-05, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 14255490.0, "repeat_count": 0.0, "routers_loss": 0.004106760956346989, "skip_count": 2.0, "step": 8840, "text_loss": 0.5757828950881958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 7.634997160947499e-05, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 14258430.0, "repeat_count": 0.0, "routers_loss": 0.0008562540751881897, "skip_count": 0.0, "step": 8842, "text_loss": 0.5166661143302917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0198974609375, "learning_rate": 7.618566408074862e-05, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 14261275.0, "repeat_count": 0.0, "routers_loss": 0.0012901517329737544, "skip_count": 0.0, "step": 8844, "text_loss": 0.7376981973648071 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.53067214558262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 7.602151895847526e-05, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 14264698.0, "repeat_count": 0.0, "routers_loss": 0.00267209205776453, "skip_count": 0.0, "step": 8846, "text_loss": 0.5249470472335815 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 41.54006457293807, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 7.585753630555565e-05, "loss": 0.009, "macro_f1": 1.0, "num_tokens": 14267887.0, "repeat_count": 1.0, "routers_loss": 0.015334542840719223, "skip_count": 7.0, "step": 8848, "text_loss": 1.1539889574050903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.017578125, "learning_rate": 7.569371618482818e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 14271392.0, "repeat_count": 0.0, "routers_loss": 0.0010222389828413725, "skip_count": 0.0, "step": 8850, "text_loss": 0.33968010544776917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 7.553005865906914e-05, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 14274658.0, "repeat_count": 0.0, "routers_loss": 0.0006116362637840211, "skip_count": 0.0, "step": 8852, "text_loss": 0.7514221668243408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.5682418550044, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 7.536656379099221e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 14277763.0, "repeat_count": 0.0, "routers_loss": 0.0036474792286753654, "skip_count": 0.0, "step": 8854, "text_loss": 0.3964846134185791 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.577634282359845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 7.520323164324921e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 14281165.0, "repeat_count": 0.0, "routers_loss": 0.005498840939253569, "skip_count": 1.0, "step": 8856, "text_loss": 0.2235594391822815 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 41.58702670971529, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0211181640625, "learning_rate": 7.504006227842919e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 14284761.0, "repeat_count": 2.0, "routers_loss": 0.006513409782201052, "skip_count": 0.0, "step": 8858, "text_loss": 0.45196816325187683 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.59641913707074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 7.48770557590589e-05, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 14287844.0, "repeat_count": 0.0, "routers_loss": 0.0013065916718915105, "skip_count": 0.0, "step": 8860, "text_loss": 0.2188033014535904 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 41.60581156442618, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 7.471421214760287e-05, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 14291280.0, "repeat_count": 1.0, "routers_loss": 0.0016644994029775262, "skip_count": 0.0, "step": 8862, "text_loss": 0.7049906253814697 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 41.61520399178163, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 7.455153150646299e-05, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 14294330.0, "repeat_count": 1.0, "routers_loss": 0.002664943691343069, "skip_count": 0.0, "step": 8864, "text_loss": 0.2160239815711975 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 41.62459641913707, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02001953125, "learning_rate": 7.43890138979788e-05, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 14298355.0, "repeat_count": 1.0, "routers_loss": 0.0035776710137724876, "skip_count": 0.0, "step": 8866, "text_loss": 0.4922088384628296 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0257568359375, "learning_rate": 7.422665938442741e-05, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 14301452.0, "repeat_count": 0.0, "routers_loss": 0.0029914912302047014, "skip_count": 2.0, "step": 8868, "text_loss": 0.5828475952148438 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.64338127384796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 7.406446802802331e-05, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 14304667.0, "repeat_count": 1.0, "routers_loss": 0.0010031569981947541, "skip_count": 2.0, "step": 8870, "text_loss": 0.657244861125946 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.65277370120341, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 7.390243989091849e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 14307397.0, "repeat_count": 0.0, "routers_loss": 0.007960405200719833, "skip_count": 1.0, "step": 8872, "text_loss": 0.3147352635860443 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.66216612855885, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 7.37405750352026e-05, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 14310687.0, "repeat_count": 1.0, "routers_loss": 0.007953251712024212, "skip_count": 3.0, "step": 8874, "text_loss": 0.30315887928009033 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018310546875, "learning_rate": 7.357887352290227e-05, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 14314007.0, "repeat_count": 0.0, "routers_loss": 0.0012103051412850618, "skip_count": 0.0, "step": 8876, "text_loss": 0.6356115341186523 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037109375, "learning_rate": 7.341733541598217e-05, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 14316696.0, "repeat_count": 0.0, "routers_loss": 0.0017898730002343655, "skip_count": 1.0, "step": 8878, "text_loss": 0.35877764225006104 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.69034341062518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047119140625, "learning_rate": 7.325596077634383e-05, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 14320172.0, "repeat_count": 0.0, "routers_loss": 0.0007144945557229221, "skip_count": 0.0, "step": 8880, "text_loss": 0.7939266562461853 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.699735837980626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 7.309474966582635e-05, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 14323262.0, "repeat_count": 0.0, "routers_loss": 0.001255290349945426, "skip_count": 0.0, "step": 8882, "text_loss": 0.7115976810455322 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.70912826533607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 7.293370214620616e-05, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 14326826.0, "repeat_count": 0.0, "routers_loss": 0.0028131126891821623, "skip_count": 2.0, "step": 8884, "text_loss": 0.24073036015033722 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.71852069269152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 7.277281827919691e-05, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 14329658.0, "repeat_count": 0.0, "routers_loss": 0.0024797592777758837, "skip_count": 1.0, "step": 8886, "text_loss": 0.47276070713996887 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 41.72791312004696, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037109375, "learning_rate": 7.26120981264496e-05, "loss": 0.0081, "macro_f1": 0.6598639488220215, "num_tokens": 14333584.0, "repeat_count": 1.0, "routers_loss": 0.023670634254813194, "skip_count": 3.0, "step": 8888, "text_loss": 0.47537583112716675 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.73730554740241, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04541015625, "learning_rate": 7.245154174955254e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 14336850.0, "repeat_count": 0.0, "routers_loss": 0.0009583478095009923, "skip_count": 0.0, "step": 8890, "text_loss": 0.5258943438529968 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 41.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 7.229114921003116e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 14339940.0, "repeat_count": 0.0, "routers_loss": 0.006664840504527092, "skip_count": 3.0, "step": 8892, "text_loss": 0.20986922085285187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.756090402113294, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 7.213092056934833e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 14342737.0, "repeat_count": 0.0, "routers_loss": 0.0005362578085623682, "skip_count": 0.0, "step": 8894, "text_loss": 0.5174402594566345 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 7.197085588890383e-05, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 14345769.0, "repeat_count": 0.0, "routers_loss": 0.006428950000554323, "skip_count": 1.0, "step": 8896, "text_loss": 0.657136857509613 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 41.77487525682419, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 7.181095523003478e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 14348563.0, "repeat_count": 1.0, "routers_loss": 0.0015549053205177188, "skip_count": 0.0, "step": 8898, "text_loss": 0.49799686670303345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.78426768417963, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 7.165121865401535e-05, "loss": 0.0068, "macro_f1": 0.32098764181137085, "num_tokens": 14353134.0, "repeat_count": 0.0, "routers_loss": 0.030110027641057968, "skip_count": 2.0, "step": 8900, "text_loss": 0.3644331693649292 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 41.793660111535075, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03466796875, "learning_rate": 7.149164622205712e-05, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 14356031.0, "repeat_count": 1.0, "routers_loss": 0.0014812488807365298, "skip_count": 1.0, "step": 8902, "text_loss": 0.46983054280281067 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 7.133223799530836e-05, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 14358941.0, "repeat_count": 0.0, "routers_loss": 0.001170543720945716, "skip_count": 0.0, "step": 8904, "text_loss": 0.7030026316642761 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 41.81244496624596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 7.117299403485466e-05, "loss": 0.0085, "macro_f1": 1.0, "num_tokens": 14361807.0, "repeat_count": 1.0, "routers_loss": 0.0011649372754618526, "skip_count": 1.0, "step": 8906, "text_loss": 0.44989535212516785 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.821837393601406, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0213623046875, "learning_rate": 7.101391440171856e-05, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 14365464.0, "repeat_count": 0.0, "routers_loss": 0.0028165180701762438, "skip_count": 0.0, "step": 8908, "text_loss": 0.487165629863739 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.83122982095686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 7.085499915685978e-05, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 14368149.0, "repeat_count": 0.0, "routers_loss": 0.001956705003976822, "skip_count": 2.0, "step": 8910, "text_loss": 0.3717629909515381 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.8406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.040283203125, "learning_rate": 7.069624836117484e-05, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 14371440.0, "repeat_count": 0.0, "routers_loss": 0.0027164234779775143, "skip_count": 1.0, "step": 8912, "text_loss": 0.3683965802192688 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 7.053766207549734e-05, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 14374965.0, "repeat_count": 0.0, "routers_loss": 0.005999395158141851, "skip_count": 2.0, "step": 8914, "text_loss": 0.6271854639053345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.85940710302319, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 7.037924036059789e-05, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 14378445.0, "repeat_count": 0.0, "routers_loss": 0.000978486379608512, "skip_count": 0.0, "step": 8916, "text_loss": 0.5927628874778748 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 7.022098327718401e-05, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 14382851.0, "repeat_count": 0.0, "routers_loss": 0.012569266371428967, "skip_count": 1.0, "step": 8918, "text_loss": 0.4092319905757904 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 41.878191957734074, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03564453125, "learning_rate": 7.006289088590007e-05, "loss": 0.0065, "macro_f1": 0.5492662787437439, "num_tokens": 14386959.0, "repeat_count": 0.0, "routers_loss": 0.011032132431864738, "skip_count": 2.0, "step": 8920, "text_loss": 0.6553854942321777 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.887584385089525, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048095703125, "learning_rate": 6.990496324732737e-05, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 14390031.0, "repeat_count": 0.0, "routers_loss": 0.001376329455524683, "skip_count": 0.0, "step": 8922, "text_loss": 0.7792862057685852 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.89697681244497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 6.974720042198396e-05, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 14392966.0, "repeat_count": 0.0, "routers_loss": 0.005924372002482414, "skip_count": 2.0, "step": 8924, "text_loss": 0.4466548562049866 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 6.958960247032515e-05, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 14395619.0, "repeat_count": 0.0, "routers_loss": 0.010054769925773144, "skip_count": 2.0, "step": 8926, "text_loss": 0.24784758687019348 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0208740234375, "learning_rate": 6.943216945274255e-05, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 14398891.0, "repeat_count": 0.0, "routers_loss": 0.0006864808965474367, "skip_count": 0.0, "step": 8928, "text_loss": 0.5154114961624146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 6.927490142956489e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 14402991.0, "repeat_count": 0.0, "routers_loss": 0.000996887218207121, "skip_count": 0.0, "step": 8930, "text_loss": 0.5888006091117859 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 41.93454652186674, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 6.911779846105753e-05, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 14406276.0, "repeat_count": 1.0, "routers_loss": 0.0007863475475460291, "skip_count": 0.0, "step": 8932, "text_loss": 0.6862632632255554 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.943938949222186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 6.896086060742262e-05, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 14409005.0, "repeat_count": 0.0, "routers_loss": 0.0020060581155121326, "skip_count": 1.0, "step": 8934, "text_loss": 0.8998132348060608 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 41.95333137657764, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 6.880408792879905e-05, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 14411902.0, "repeat_count": 2.0, "routers_loss": 0.008094016462564468, "skip_count": 3.0, "step": 8936, "text_loss": 0.3411460518836975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 6.864748048526237e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 14414683.0, "repeat_count": 0.0, "routers_loss": 0.004374993033707142, "skip_count": 0.0, "step": 8938, "text_loss": 0.24222217500209808 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043212890625, "learning_rate": 6.84910383368249e-05, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 14417740.0, "repeat_count": 0.0, "routers_loss": 0.003004335332661867, "skip_count": 2.0, "step": 8940, "text_loss": 0.5524137020111084 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 41.98150865864397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 6.83347615434356e-05, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 14420678.0, "repeat_count": 0.0, "routers_loss": 0.007001105695962906, "skip_count": 2.0, "step": 8942, "text_loss": 0.3124033212661743 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 41.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 6.817865016497993e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 14424259.0, "repeat_count": 0.0, "routers_loss": 0.0038414683658629656, "skip_count": 0.0, "step": 8944, "text_loss": 0.509667694568634 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 42.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.060791015625, "learning_rate": 6.80227042612801e-05, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 14427084.0, "repeat_count": 1.0, "routers_loss": 0.008573584258556366, "skip_count": 0.0, "step": 8946, "text_loss": 0.2533438205718994 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.00939242735544, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 6.786692389209482e-05, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 14429690.0, "repeat_count": 1.0, "routers_loss": 0.003758789971470833, "skip_count": 2.0, "step": 8948, "text_loss": 0.14571085572242737 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06640625, "learning_rate": 6.771130911711953e-05, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 14432983.0, "repeat_count": 0.0, "routers_loss": 0.005996126215904951, "skip_count": 2.0, "step": 8950, "text_loss": 0.24994049966335297 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.02817728206633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020751953125, "learning_rate": 6.755585999598613e-05, "loss": 0.0032, "macro_f1": 0.3333333432674408, "num_tokens": 14435772.0, "repeat_count": 0.0, "routers_loss": 0.0012271527666598558, "skip_count": 0.0, "step": 8952, "text_loss": 0.3705698549747467 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 42.03756970942178, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0673828125, "learning_rate": 6.740057658826293e-05, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 14438912.0, "repeat_count": 1.0, "routers_loss": 0.0017618577694520354, "skip_count": 1.0, "step": 8954, "text_loss": 0.6691124439239502 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 6.72454589534548e-05, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 14441959.0, "repeat_count": 0.0, "routers_loss": 0.0016956349136307836, "skip_count": 1.0, "step": 8956, "text_loss": 0.45412346720695496 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 6.709050715100324e-05, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 14444804.0, "repeat_count": 0.0, "routers_loss": 0.017321301624178886, "skip_count": 2.0, "step": 8958, "text_loss": 0.2668265998363495 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.06574699148811, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0216064453125, "learning_rate": 6.69357212402859e-05, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 14447390.0, "repeat_count": 0.0, "routers_loss": 0.005267233122140169, "skip_count": 2.0, "step": 8960, "text_loss": 0.35546016693115234 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 42.075139418843555, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.017578125, "learning_rate": 6.67811012806172e-05, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 14451286.0, "repeat_count": 0.0, "routers_loss": 0.0045175012201070786, "skip_count": 3.0, "step": 8962, "text_loss": 0.14669834077358246 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.084531846199, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0211181640625, "learning_rate": 6.662664733124768e-05, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 14454335.0, "repeat_count": 1.0, "routers_loss": 0.004905698820948601, "skip_count": 3.0, "step": 8964, "text_loss": 0.28777357935905457 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 42.09392427355445, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0224609375, "learning_rate": 6.647235945136442e-05, "loss": 0.0074, "macro_f1": 0.8823530077934265, "num_tokens": 14457708.0, "repeat_count": 2.0, "routers_loss": 0.032136883586645126, "skip_count": 1.0, "step": 8966, "text_loss": 0.2317836582660675 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 42.10331670090989, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.022705078125, "learning_rate": 6.631823770009088e-05, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 14460721.0, "repeat_count": 1.0, "routers_loss": 0.0038611628115177155, "skip_count": 1.0, "step": 8968, "text_loss": 0.28979742527008057 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.11270912826534, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 6.616428213648656e-05, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 14463467.0, "repeat_count": 0.0, "routers_loss": 0.0006560821202583611, "skip_count": 0.0, "step": 8970, "text_loss": 0.3474387526512146 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 42.12210155562078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.020751953125, "learning_rate": 6.60104928195479e-05, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 14466586.0, "repeat_count": 1.0, "routers_loss": 0.0016879125032573938, "skip_count": 0.0, "step": 8972, "text_loss": 0.5454491972923279 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 6.58568698082071e-05, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 14470125.0, "repeat_count": 0.0, "routers_loss": 0.0004945555119775236, "skip_count": 0.0, "step": 8974, "text_loss": 0.4728975296020508 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.14088641033167, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.021240234375, "learning_rate": 6.570341316133272e-05, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 14473887.0, "repeat_count": 2.0, "routers_loss": 0.010141569189727306, "skip_count": 3.0, "step": 8976, "text_loss": 0.24756617844104767 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.15027883768712, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 6.555012293772967e-05, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 14477046.0, "repeat_count": 1.0, "routers_loss": 0.011950359679758549, "skip_count": 2.0, "step": 8978, "text_loss": 0.25375646352767944 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 6.539699919613911e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 14480638.0, "repeat_count": 0.0, "routers_loss": 0.0007824545609764755, "skip_count": 0.0, "step": 8980, "text_loss": 0.6888379454612732 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 6.524404199523826e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 14483723.0, "repeat_count": 0.0, "routers_loss": 0.004318726249039173, "skip_count": 1.0, "step": 8982, "text_loss": 0.3603152334690094 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.17845611975345, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 6.509125139364058e-05, "loss": 0.0064, "macro_f1": 0.3272727429866791, "num_tokens": 14486876.0, "repeat_count": 0.0, "routers_loss": 0.010652635246515274, "skip_count": 1.0, "step": 8984, "text_loss": 0.43394285440444946 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.18784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 6.493862744989587e-05, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 14489944.0, "repeat_count": 0.0, "routers_loss": 0.0010475299786776304, "skip_count": 0.0, "step": 8986, "text_loss": 0.5952020287513733 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.197240974464336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 6.478617022248984e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 14493094.0, "repeat_count": 0.0, "routers_loss": 0.004329503979533911, "skip_count": 1.0, "step": 8988, "text_loss": 0.7284399271011353 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.20663340181978, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 6.463387976984437e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 14496944.0, "repeat_count": 0.0, "routers_loss": 0.0019588395953178406, "skip_count": 1.0, "step": 8990, "text_loss": 0.8103306889533997 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.21602582917523, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0308837890625, "learning_rate": 6.448175615031749e-05, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 14499997.0, "repeat_count": 0.0, "routers_loss": 0.008046228438615799, "skip_count": 1.0, "step": 8992, "text_loss": 0.14758773148059845 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 42.22541825653067, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 6.432979942220319e-05, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 14503247.0, "repeat_count": 1.0, "routers_loss": 0.0028899910394102335, "skip_count": 0.0, "step": 8994, "text_loss": 0.2568151652812958 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.23481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 6.417800964373161e-05, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 14506244.0, "repeat_count": 0.0, "routers_loss": 0.0042211092077195644, "skip_count": 2.0, "step": 8996, "text_loss": 0.3506850600242615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 6.402638687306872e-05, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 14510502.0, "repeat_count": 0.0, "routers_loss": 0.003309462917968631, "skip_count": 0.0, "step": 8998, "text_loss": 0.5852319598197937 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 42.253595538597004, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 6.387493116831699e-05, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 14513679.0, "repeat_count": 1.0, "routers_loss": 0.015246274881064892, "skip_count": 5.0, "step": 9000, "text_loss": 0.4266709089279175 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 42.26298796595245, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 6.372364258751434e-05, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 14516862.0, "repeat_count": 2.0, "routers_loss": 0.005648075137287378, "skip_count": 2.0, "step": 9002, "text_loss": 0.34153711795806885 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 42.2723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0201416015625, "learning_rate": 6.357252118863482e-05, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 14519660.0, "repeat_count": 0.0, "routers_loss": 0.005153972655534744, "skip_count": 3.0, "step": 9004, "text_loss": 0.3911980092525482 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 6.342156702958851e-05, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 14522261.0, "repeat_count": 0.0, "routers_loss": 0.001209715730510652, "skip_count": 0.0, "step": 9006, "text_loss": 0.45400822162628174 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023193359375, "learning_rate": 6.327078016822124e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 14525368.0, "repeat_count": 0.0, "routers_loss": 0.00367624219506979, "skip_count": 1.0, "step": 9008, "text_loss": 0.5327706336975098 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0198974609375, "learning_rate": 6.31201606623149e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 14528253.0, "repeat_count": 0.0, "routers_loss": 0.0018971028039231896, "skip_count": 0.0, "step": 9010, "text_loss": 0.19216643273830414 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 42.30995010272967, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 6.296970856958712e-05, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 14531214.0, "repeat_count": 1.0, "routers_loss": 0.003927265293896198, "skip_count": 0.0, "step": 9012, "text_loss": 0.3931650221347809 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.319342530085116, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 6.281942394769142e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 14535063.0, "repeat_count": 0.0, "routers_loss": 0.00801338441669941, "skip_count": 0.0, "step": 9014, "text_loss": 0.1605554074048996 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.32873495744057, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 6.266930685421717e-05, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 14538690.0, "repeat_count": 0.0, "routers_loss": 0.0013267790200188756, "skip_count": 0.0, "step": 9016, "text_loss": 0.4797641932964325 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.33812738479601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 6.251935734668957e-05, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 14542591.0, "repeat_count": 0.0, "routers_loss": 0.0013866537483409047, "skip_count": 1.0, "step": 9018, "text_loss": 0.4539037346839905 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 6.236957548256945e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 14545259.0, "repeat_count": 0.0, "routers_loss": 0.001481749233789742, "skip_count": 0.0, "step": 9020, "text_loss": 0.6693689227104187 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.3569122395069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0201416015625, "learning_rate": 6.22199613192535e-05, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 14548362.0, "repeat_count": 0.0, "routers_loss": 0.005995423533022404, "skip_count": 1.0, "step": 9022, "text_loss": 0.6533607244491577 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 42.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 6.207051491407428e-05, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 14551694.0, "repeat_count": 0.0, "routers_loss": 0.015427720732986927, "skip_count": 4.0, "step": 9024, "text_loss": 0.33537840843200684 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 42.375697094217784, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 6.192123632429986e-05, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 14554614.0, "repeat_count": 1.0, "routers_loss": 0.0017432396998628974, "skip_count": 0.0, "step": 9026, "text_loss": 0.9725127220153809 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.385089521573235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021728515625, "learning_rate": 6.177212560713413e-05, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 14559474.0, "repeat_count": 0.0, "routers_loss": 0.002909898292273283, "skip_count": 2.0, "step": 9028, "text_loss": 0.16944198310375214 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.39448194892868, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 6.162318281971652e-05, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 14563046.0, "repeat_count": 0.0, "routers_loss": 0.00274385092779994, "skip_count": 0.0, "step": 9030, "text_loss": 0.43176764249801636 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 42.40387437628412, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.022216796875, "learning_rate": 6.147440801912218e-05, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 14565829.0, "repeat_count": 1.0, "routers_loss": 0.0024230771232396364, "skip_count": 0.0, "step": 9032, "text_loss": 0.5683854818344116 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 6.132580126236197e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 14569016.0, "repeat_count": 0.0, "routers_loss": 0.004686394706368446, "skip_count": 1.0, "step": 9034, "text_loss": 0.5422781705856323 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 42.42265923099501, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 6.117736260638223e-05, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 14572558.0, "repeat_count": 2.0, "routers_loss": 0.0010892068967223167, "skip_count": 1.0, "step": 9036, "text_loss": 0.5740243196487427 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.43205165835045, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 6.102909210806495e-05, "loss": 0.006, "macro_f1": 0.3272727429866791, "num_tokens": 14575969.0, "repeat_count": 1.0, "routers_loss": 0.0163960512727499, "skip_count": 0.0, "step": 9038, "text_loss": 0.4803958535194397 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.441444085705896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 6.088098982422768e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 14578746.0, "repeat_count": 0.0, "routers_loss": 0.0020733694545924664, "skip_count": 0.0, "step": 9040, "text_loss": 0.30313390493392944 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.45083651306135, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 6.073305581162342e-05, "loss": 0.0066, "macro_f1": 0.6601307392120361, "num_tokens": 14581856.0, "repeat_count": 1.0, "routers_loss": 0.022739989683032036, "skip_count": 2.0, "step": 9042, "text_loss": 0.5871608257293701 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.46022894041679, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 6.058529012694086e-05, "loss": 0.0034, "macro_f1": 1.0, "num_tokens": 14584754.0, "repeat_count": 1.0, "routers_loss": 0.012138293124735355, "skip_count": 2.0, "step": 9044, "text_loss": 0.18492890894412994 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.053466796875, "learning_rate": 6.0437692826803893e-05, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 14587867.0, "repeat_count": 0.0, "routers_loss": 0.0009839123813435435, "skip_count": 0.0, "step": 9046, "text_loss": 0.5532476902008057 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 42.47901379512768, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11376953125, "learning_rate": 6.029026396777237e-05, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 14591521.0, "repeat_count": 2.0, "routers_loss": 0.01392262615263462, "skip_count": 5.0, "step": 9048, "text_loss": 0.20356278121471405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.48840622248312, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 6.0143003606341174e-05, "loss": 0.0054, "macro_f1": 0.3272727429866791, "num_tokens": 14595358.0, "repeat_count": 0.0, "routers_loss": 0.018218200653791428, "skip_count": 1.0, "step": 9050, "text_loss": 0.3070164620876312 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.497798649838565, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.019775390625, "learning_rate": 5.9995911798940764e-05, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 14598696.0, "repeat_count": 0.0, "routers_loss": 0.0002688709646463394, "skip_count": 1.0, "step": 9052, "text_loss": 0.5637917518615723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.507191077194015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 5.984898860193694e-05, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 14602301.0, "repeat_count": 0.0, "routers_loss": 0.003135781968012452, "skip_count": 0.0, "step": 9054, "text_loss": 0.345111608505249 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.51658350454946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 5.9702234071631e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 14606625.0, "repeat_count": 0.0, "routers_loss": 0.002299862913787365, "skip_count": 0.0, "step": 9056, "text_loss": 0.30707255005836487 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 42.5259759319049, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 5.9555648264259576e-05, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 14610303.0, "repeat_count": 1.0, "routers_loss": 0.0007164468406699598, "skip_count": 0.0, "step": 9058, "text_loss": 0.56083083152771 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 5.940923123599462e-05, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 14613211.0, "repeat_count": 0.0, "routers_loss": 0.00136603566352278, "skip_count": 0.0, "step": 9060, "text_loss": 0.4455239474773407 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 5.926298304294336e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 14615844.0, "repeat_count": 0.0, "routers_loss": 0.001727075781673193, "skip_count": 0.0, "step": 9062, "text_loss": 0.5928102731704712 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.55415321397123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 5.911690374114842e-05, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 14619190.0, "repeat_count": 0.0, "routers_loss": 0.0022300337441265583, "skip_count": 0.0, "step": 9064, "text_loss": 0.9456163048744202 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.563545641326684, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 5.8970993386587676e-05, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 14622304.0, "repeat_count": 0.0, "routers_loss": 0.006507525686174631, "skip_count": 2.0, "step": 9066, "text_loss": 0.1809750199317932 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.57293806868213, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 5.882525203517419e-05, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 14625386.0, "repeat_count": 0.0, "routers_loss": 0.0022866397630423307, "skip_count": 0.0, "step": 9068, "text_loss": 0.1849939227104187 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.58233049603757, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.048095703125, "learning_rate": 5.867967974275629e-05, "loss": 0.0097, "macro_f1": 1.0, "num_tokens": 14628472.0, "repeat_count": 1.0, "routers_loss": 0.0058460538275539875, "skip_count": 2.0, "step": 9070, "text_loss": 0.2627561688423157 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.591722923393014, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 5.853427656511773e-05, "loss": 0.0071, "macro_f1": 1.0, "num_tokens": 14631187.0, "repeat_count": 1.0, "routers_loss": 0.0085217310115695, "skip_count": 2.0, "step": 9072, "text_loss": 0.18039973080158234 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 42.60111535074846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 5.838904255797717e-05, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 14633919.0, "repeat_count": 1.0, "routers_loss": 0.007423012051731348, "skip_count": 4.0, "step": 9074, "text_loss": 0.23746201395988464 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.6105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 5.8243977776988585e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 14636674.0, "repeat_count": 0.0, "routers_loss": 0.0011181328445672989, "skip_count": 0.0, "step": 9076, "text_loss": 0.38140806555747986 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 42.619900205459345, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 5.8099082277741024e-05, "loss": 0.0052, "macro_f1": 0.9262410998344421, "num_tokens": 14639506.0, "repeat_count": 3.0, "routers_loss": 0.03306882083415985, "skip_count": 2.0, "step": 9078, "text_loss": 0.2627770006656647 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.629292632814796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0255126953125, "learning_rate": 5.795435611575872e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 14642955.0, "repeat_count": 0.0, "routers_loss": 0.0014759303303435445, "skip_count": 0.0, "step": 9080, "text_loss": 0.47112786769866943 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 5.78097993465011e-05, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 14646018.0, "repeat_count": 0.0, "routers_loss": 0.003744201036170125, "skip_count": 0.0, "step": 9082, "text_loss": 0.36873605847358704 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.64807748752568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 5.7665412025362516e-05, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 14649402.0, "repeat_count": 0.0, "routers_loss": 0.002992798574268818, "skip_count": 2.0, "step": 9084, "text_loss": 0.6350628137588501 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 5.752119420767243e-05, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 14652248.0, "repeat_count": 0.0, "routers_loss": 0.005798593629151583, "skip_count": 2.0, "step": 9086, "text_loss": 0.2512637972831726 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.66686234223657, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 5.7377145948695474e-05, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 14655060.0, "repeat_count": 0.0, "routers_loss": 0.0024162146728485823, "skip_count": 0.0, "step": 9088, "text_loss": 0.4233066439628601 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 42.67625476959201, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 5.723326730363115e-05, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 14658873.0, "repeat_count": 1.0, "routers_loss": 0.004826475866138935, "skip_count": 4.0, "step": 9090, "text_loss": 0.45946353673934937 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.685647196947464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 5.7089558327614036e-05, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 14661865.0, "repeat_count": 0.0, "routers_loss": 0.0020765739027410746, "skip_count": 2.0, "step": 9092, "text_loss": 0.9425542950630188 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.69503962430291, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0286865234375, "learning_rate": 5.694601907571356e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 14666085.0, "repeat_count": 0.0, "routers_loss": 0.0012533976696431637, "skip_count": 0.0, "step": 9094, "text_loss": 0.6307007670402527 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 42.70443205165835, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 5.680264960293446e-05, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 14668992.0, "repeat_count": 1.0, "routers_loss": 0.013796845450997353, "skip_count": 5.0, "step": 9096, "text_loss": 0.21720129251480103 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 5.665944996421612e-05, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 14672365.0, "repeat_count": 0.0, "routers_loss": 0.004391494672745466, "skip_count": 0.0, "step": 9098, "text_loss": 0.28794240951538086 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 5.651642021443287e-05, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 14676232.0, "repeat_count": 0.0, "routers_loss": 0.0006779583054594696, "skip_count": 0.0, "step": 9100, "text_loss": 0.45190441608428955 }, { "acc_repeat": 0.0, "acc_skip": 0.8333333134651184, "avg_layers": 23.0, "epoch": 42.73260933372468, "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.0213623046875, "learning_rate": 5.637356040839398e-05, "loss": 0.0049, "macro_f1": 0.6289562582969666, "num_tokens": 14679582.0, "repeat_count": 0.0, "routers_loss": 0.02379363216459751, "skip_count": 6.0, "step": 9102, "text_loss": 0.3395652770996094 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 42.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04345703125, "learning_rate": 5.623087060084364e-05, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 14683438.0, "repeat_count": 0.0, "routers_loss": 0.00344930961728096, "skip_count": 4.0, "step": 9104, "text_loss": 0.4345538914203644 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 42.751394188435576, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 5.60883508464608e-05, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 14686333.0, "repeat_count": 0.0, "routers_loss": 0.005554547533392906, "skip_count": 3.0, "step": 9106, "text_loss": 0.5202528238296509 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 5.594600119985932e-05, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 14690754.0, "repeat_count": 0.0, "routers_loss": 0.004589532967656851, "skip_count": 1.0, "step": 9108, "text_loss": 0.3040390610694885 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.77017904314646, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 5.580382171558784e-05, "loss": 0.0055, "macro_f1": 0.32098764181137085, "num_tokens": 14693793.0, "repeat_count": 0.0, "routers_loss": 0.029969461262226105, "skip_count": 2.0, "step": 9110, "text_loss": 0.3644331693649292 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.77957147050191, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 5.566181244812979e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 14697290.0, "repeat_count": 0.0, "routers_loss": 0.003387648146599531, "skip_count": 0.0, "step": 9112, "text_loss": 0.5177932977676392 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.78896389785735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0206298828125, "learning_rate": 5.5519973451903404e-05, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 14700597.0, "repeat_count": 0.0, "routers_loss": 0.004790942650288343, "skip_count": 1.0, "step": 9114, "text_loss": 0.2132686972618103 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.798356325212794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 5.5378304781261715e-05, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 14703852.0, "repeat_count": 0.0, "routers_loss": 0.0007685191812925041, "skip_count": 0.0, "step": 9116, "text_loss": 0.6690551042556763 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 42.807748752568244, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 5.523680649049234e-05, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 14707218.0, "repeat_count": 1.0, "routers_loss": 0.0033531817607581615, "skip_count": 0.0, "step": 9118, "text_loss": 0.26232191920280457 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.81714117992369, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 5.509547863381781e-05, "loss": 0.0084, "macro_f1": 0.3272727429866791, "num_tokens": 14710244.0, "repeat_count": 1.0, "routers_loss": 0.025616342201828957, "skip_count": 0.0, "step": 9120, "text_loss": 0.2897983193397522 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 5.495432126539507e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 14713495.0, "repeat_count": 0.0, "routers_loss": 0.0014400121290236712, "skip_count": 0.0, "step": 9122, "text_loss": 0.4580271244049072 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 5.481333443931602e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 14716703.0, "repeat_count": 0.0, "routers_loss": 0.0008548611658625305, "skip_count": 0.0, "step": 9124, "text_loss": 0.5140601992607117 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.84531846199002, "f1_execute": 0.9767441749572754, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 5.4672518209607e-05, "loss": 0.0075, "macro_f1": 0.9255813956260681, "num_tokens": 14719443.0, "repeat_count": 3.0, "routers_loss": 0.02092800848186016, "skip_count": 4.0, "step": 9126, "text_loss": 0.2842077314853668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.85471088934546, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 5.4531872630228965e-05, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 14722711.0, "repeat_count": 0.0, "routers_loss": 0.0037711653858423233, "skip_count": 0.0, "step": 9128, "text_loss": 0.3268158733844757 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 5.4391397755077784e-05, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 14725635.0, "repeat_count": 0.0, "routers_loss": 0.005959369707852602, "skip_count": 0.0, "step": 9130, "text_loss": 0.44725099205970764 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0181884765625, "learning_rate": 5.425109363798358e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 14728945.0, "repeat_count": 0.0, "routers_loss": 0.0011272960109636188, "skip_count": 0.0, "step": 9132, "text_loss": 0.45580998063087463 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0167236328125, "learning_rate": 5.411096033271118e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 14732271.0, "repeat_count": 0.0, "routers_loss": 0.0015554855344817042, "skip_count": 0.0, "step": 9134, "text_loss": 0.16767354309558868 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.89228059876724, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 5.3970997892959894e-05, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 14735462.0, "repeat_count": 4.0, "routers_loss": 0.007287262007594109, "skip_count": 5.0, "step": 9136, "text_loss": 0.8925374746322632 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.90167302612269, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 5.383120637236366e-05, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 14739288.0, "repeat_count": 0.0, "routers_loss": 0.004336730111390352, "skip_count": 0.0, "step": 9138, "text_loss": 0.29503148794174194 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.91106545347813, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 5.369158582449074e-05, "loss": 0.0032, "macro_f1": 0.3333333432674408, "num_tokens": 14742058.0, "repeat_count": 0.0, "routers_loss": 0.004528806544840336, "skip_count": 0.0, "step": 9140, "text_loss": 0.16937516629695892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.92045788083358, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 5.3552136302844e-05, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 14745628.0, "repeat_count": 0.0, "routers_loss": 0.0005676734144799411, "skip_count": 0.0, "step": 9142, "text_loss": 0.48764488101005554 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.929850308189025, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 5.3412857860860917e-05, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 14748482.0, "repeat_count": 0.0, "routers_loss": 0.0017468055011704564, "skip_count": 0.0, "step": 9144, "text_loss": 0.46164339780807495 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.93924273554447, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 5.327375055191314e-05, "loss": 0.0051, "macro_f1": 0.3272727429866791, "num_tokens": 14751091.0, "repeat_count": 0.0, "routers_loss": 0.007167307659983635, "skip_count": 1.0, "step": 9146, "text_loss": 0.37566086649894714 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 42.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020751953125, "learning_rate": 5.3134814429306896e-05, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 14753850.0, "repeat_count": 0.0, "routers_loss": 0.003801940008997917, "skip_count": 2.0, "step": 9148, "text_loss": 0.17589576542377472 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 5.299604954628268e-05, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 14756779.0, "repeat_count": 0.0, "routers_loss": 0.00396628538146615, "skip_count": 1.0, "step": 9150, "text_loss": 0.4118746817111969 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 42.9674200176108, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 5.2857455956015544e-05, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 14759574.0, "repeat_count": 2.0, "routers_loss": 0.003950111567974091, "skip_count": 0.0, "step": 9152, "text_loss": 0.5839328169822693 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.97681244496625, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 5.271903371161479e-05, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 14762802.0, "repeat_count": 0.0, "routers_loss": 0.0006622051005251706, "skip_count": 1.0, "step": 9154, "text_loss": 0.40162989497184753 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 42.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 5.2580782866124054e-05, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 14766136.0, "repeat_count": 0.0, "routers_loss": 0.003140404587611556, "skip_count": 0.0, "step": 9156, "text_loss": 0.2028028815984726 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 42.99559729967714, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 5.244270347252139e-05, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 14769306.0, "repeat_count": 0.0, "routers_loss": 0.0035792726557701826, "skip_count": 1.0, "step": 9158, "text_loss": 0.5611430406570435 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.004696213677725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 5.2304795583719034e-05, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 14771928.0, "repeat_count": 0.0, "routers_loss": 0.007276696152985096, "skip_count": 2.0, "step": 9160, "text_loss": 0.1382172554731369 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.01408864103317, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 5.2167059252563485e-05, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 14775047.0, "repeat_count": 0.0, "routers_loss": 0.003121814923360944, "skip_count": 0.0, "step": 9162, "text_loss": 0.6130381226539612 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 43.02348106838861, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 5.2029494531835695e-05, "loss": 0.0071, "macro_f1": 1.0, "num_tokens": 14777746.0, "repeat_count": 4.0, "routers_loss": 0.006029475014656782, "skip_count": 1.0, "step": 9164, "text_loss": 0.5901363492012024 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 43.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 5.189210147425061e-05, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 14780813.0, "repeat_count": 0.0, "routers_loss": 0.0034428017679601908, "skip_count": 5.0, "step": 9166, "text_loss": 0.5909968018531799 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 5.1754880132457494e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 14785178.0, "repeat_count": 0.0, "routers_loss": 0.0025068193208426237, "skip_count": 2.0, "step": 9168, "text_loss": 0.20257101953029633 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.05165835045494, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 5.161783055904001e-05, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 14788307.0, "repeat_count": 0.0, "routers_loss": 0.003352245781570673, "skip_count": 0.0, "step": 9170, "text_loss": 0.20024186372756958 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 43.061050777810394, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 5.1480952806515654e-05, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 14791053.0, "repeat_count": 1.0, "routers_loss": 0.0009423785959370434, "skip_count": 0.0, "step": 9172, "text_loss": 0.6944412589073181 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.07044320516584, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 5.13442469273363e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 14794259.0, "repeat_count": 0.0, "routers_loss": 0.0016676477389410138, "skip_count": 0.0, "step": 9174, "text_loss": 0.10889370739459991 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02099609375, "learning_rate": 5.1207712973887875e-05, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 14797345.0, "repeat_count": 0.0, "routers_loss": 0.005842766724526882, "skip_count": 2.0, "step": 9176, "text_loss": 0.17763052880764008 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 5.107135099849042e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 14800819.0, "repeat_count": 0.0, "routers_loss": 0.0004951528972014785, "skip_count": 0.0, "step": 9178, "text_loss": 0.43891432881355286 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 5.093516105339818e-05, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 14803924.0, "repeat_count": 0.0, "routers_loss": 0.0031010014936327934, "skip_count": 1.0, "step": 9180, "text_loss": 0.39177098870277405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.10801291458761, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 5.079914319079931e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 14807083.0, "repeat_count": 0.0, "routers_loss": 0.00047361713950522244, "skip_count": 0.0, "step": 9182, "text_loss": 0.39144888520240784 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.117405341943055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 5.066329746281617e-05, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 14810263.0, "repeat_count": 0.0, "routers_loss": 0.0018734827172011137, "skip_count": 0.0, "step": 9184, "text_loss": 0.531446099281311 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.126797769298506, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 5.052762392150506e-05, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 14813761.0, "repeat_count": 0.0, "routers_loss": 0.00503428652882576, "skip_count": 0.0, "step": 9186, "text_loss": 0.19398775696754456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 5.039212261885634e-05, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 14817708.0, "repeat_count": 0.0, "routers_loss": 0.0010842647170647979, "skip_count": 0.0, "step": 9188, "text_loss": 0.5365647077560425 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 43.14558262400939, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0172119140625, "learning_rate": 5.025679360679442e-05, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 14820912.0, "repeat_count": 2.0, "routers_loss": 0.004775309935212135, "skip_count": 2.0, "step": 9190, "text_loss": 0.6473321318626404 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 5.012163693717747e-05, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 14824115.0, "repeat_count": 0.0, "routers_loss": 0.004022061824798584, "skip_count": 0.0, "step": 9192, "text_loss": 0.24432586133480072 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.16436747872028, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 4.9986652661798025e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 14827404.0, "repeat_count": 0.0, "routers_loss": 0.00231996551156044, "skip_count": 1.0, "step": 9194, "text_loss": 0.7459486722946167 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.17375990607572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 4.98518408323822e-05, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 14830077.0, "repeat_count": 0.0, "routers_loss": 0.000999651150777936, "skip_count": 0.0, "step": 9196, "text_loss": 0.5136345624923706 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.183152333431174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 4.971720150059012e-05, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 14833231.0, "repeat_count": 0.0, "routers_loss": 0.0033226648811250925, "skip_count": 2.0, "step": 9198, "text_loss": 0.1597593128681183 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.19254476078662, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020751953125, "learning_rate": 4.958273471801583e-05, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 14836534.0, "repeat_count": 0.0, "routers_loss": 0.00400200579315424, "skip_count": 0.0, "step": 9200, "text_loss": 0.16248664259910583 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018310546875, "learning_rate": 4.94484405361873e-05, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 14840301.0, "repeat_count": 0.0, "routers_loss": 0.0038636941462755203, "skip_count": 0.0, "step": 9202, "text_loss": 0.20964740216732025 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022705078125, "learning_rate": 4.9314319006566296e-05, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 14844094.0, "repeat_count": 0.0, "routers_loss": 0.00593461561948061, "skip_count": 2.0, "step": 9204, "text_loss": 0.43311986327171326 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0166015625, "learning_rate": 4.918037018054844e-05, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 14847148.0, "repeat_count": 0.0, "routers_loss": 0.0007939442875795066, "skip_count": 0.0, "step": 9206, "text_loss": 0.8805840015411377 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.23011447020839, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 4.904659410946311e-05, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 14851556.0, "repeat_count": 2.0, "routers_loss": 0.0058822291903197765, "skip_count": 4.0, "step": 9208, "text_loss": 0.2123873233795166 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 4.891299084457362e-05, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 14855208.0, "repeat_count": 0.0, "routers_loss": 0.0024413811042904854, "skip_count": 0.0, "step": 9210, "text_loss": 0.4408712685108185 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.248899324919286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0400390625, "learning_rate": 4.8779560437076983e-05, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 14858433.0, "repeat_count": 0.0, "routers_loss": 0.007487752009183168, "skip_count": 1.0, "step": 9212, "text_loss": 0.7417129874229431 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 4.864630293810401e-05, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 14861739.0, "repeat_count": 0.0, "routers_loss": 0.007972145453095436, "skip_count": 2.0, "step": 9214, "text_loss": 0.3347324728965759 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.26768417963017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 4.851321839871908e-05, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 14865220.0, "repeat_count": 0.0, "routers_loss": 0.006238576490432024, "skip_count": 1.0, "step": 9216, "text_loss": 0.49660998582839966 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.27707660698562, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 4.838030686992062e-05, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 14868179.0, "repeat_count": 0.0, "routers_loss": 0.003592922119423747, "skip_count": 0.0, "step": 9218, "text_loss": 0.316535621881485 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 43.28646903434106, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 4.824756840264055e-05, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 14870950.0, "repeat_count": 0.0, "routers_loss": 0.012321153655648232, "skip_count": 3.0, "step": 9220, "text_loss": 0.270915150642395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.295861461696504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 4.8115003047744466e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 14873749.0, "repeat_count": 0.0, "routers_loss": 0.0008396002231165767, "skip_count": 0.0, "step": 9222, "text_loss": 0.4190096855163574 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.305253889051954, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0169677734375, "learning_rate": 4.798261085603162e-05, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 14877349.0, "repeat_count": 0.0, "routers_loss": 0.002983161248266697, "skip_count": 1.0, "step": 9224, "text_loss": 0.8203139901161194 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.3146463164074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 4.785039187823503e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 14881192.0, "repeat_count": 0.0, "routers_loss": 0.003951616585254669, "skip_count": 2.0, "step": 9226, "text_loss": 0.36447709798812866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 4.771834616502119e-05, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 14884608.0, "repeat_count": 0.0, "routers_loss": 0.001604852732270956, "skip_count": 0.0, "step": 9228, "text_loss": 0.733951985836029 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.333431171118285, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0224609375, "learning_rate": 4.758647376699032e-05, "loss": 0.0053, "macro_f1": 0.8820862174034119, "num_tokens": 14887963.0, "repeat_count": 2.0, "routers_loss": 0.041028670966625214, "skip_count": 2.0, "step": 9230, "text_loss": 0.1800784021615982 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.34282359847373, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 4.7454774734676074e-05, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 14890769.0, "repeat_count": 0.0, "routers_loss": 0.0027380166575312614, "skip_count": 0.0, "step": 9232, "text_loss": 0.6017972230911255 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.35221602582917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03564453125, "learning_rate": 4.732324911854591e-05, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 14894162.0, "repeat_count": 0.0, "routers_loss": 0.0018064725445583463, "skip_count": 2.0, "step": 9234, "text_loss": 0.5853637456893921 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 43.36160845318462, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 4.7191896969000617e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 14897248.0, "repeat_count": 1.0, "routers_loss": 0.005479716695845127, "skip_count": 0.0, "step": 9236, "text_loss": 0.6206526756286621 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 43.371000880540066, "f1_execute": 0.9767441749572754, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, "grad_norm": 0.0257568359375, "learning_rate": 4.706071833637454e-05, "loss": 0.0059, "macro_f1": 0.9446290731430054, "num_tokens": 14900186.0, "repeat_count": 4.0, "routers_loss": 0.013435420580208302, "skip_count": 3.0, "step": 9238, "text_loss": 0.46402135491371155 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 43.38039330789551, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 4.692971327093559e-05, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 14903080.0, "repeat_count": 1.0, "routers_loss": 0.007366253528743982, "skip_count": 4.0, "step": 9240, "text_loss": 0.6870771646499634 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.38978573525095, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 4.6798881822885276e-05, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 14906837.0, "repeat_count": 1.0, "routers_loss": 0.004979560151696205, "skip_count": 2.0, "step": 9242, "text_loss": 0.46396589279174805 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.3991781626064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 4.666822404235838e-05, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 14909541.0, "repeat_count": 0.0, "routers_loss": 0.00023516178771387786, "skip_count": 0.0, "step": 9244, "text_loss": 0.5960518717765808 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 43.40857058996184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 4.6537739979423174e-05, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 14912820.0, "repeat_count": 1.0, "routers_loss": 0.0014796241885051131, "skip_count": 1.0, "step": 9246, "text_loss": 0.48075684905052185 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.41796301731729, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 4.640742968408146e-05, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 14916283.0, "repeat_count": 0.0, "routers_loss": 0.001386807532981038, "skip_count": 0.0, "step": 9248, "text_loss": 0.3950015902519226 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 43.427355444672735, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.037109375, "learning_rate": 4.627729320626833e-05, "loss": 0.0061, "macro_f1": 0.9452888369560242, "num_tokens": 14918958.0, "repeat_count": 1.0, "routers_loss": 0.020335515961050987, "skip_count": 4.0, "step": 9250, "text_loss": 0.6995832324028015 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.43674787202818, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 4.6147330595852354e-05, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 14921888.0, "repeat_count": 0.0, "routers_loss": 0.005387732293456793, "skip_count": 2.0, "step": 9252, "text_loss": 0.2771800756454468 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 4.601754190263552e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 14925135.0, "repeat_count": 0.0, "routers_loss": 0.001703745685517788, "skip_count": 1.0, "step": 9254, "text_loss": 0.7100088596343994 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 4.5887927176352875e-05, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 14929198.0, "repeat_count": 0.0, "routers_loss": 0.0058114733546972275, "skip_count": 2.0, "step": 9256, "text_loss": 0.21729083359241486 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038330078125, "learning_rate": 4.5758486466673244e-05, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 14932685.0, "repeat_count": 0.0, "routers_loss": 0.0026105218566954136, "skip_count": 0.0, "step": 9258, "text_loss": 0.20695121586322784 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.47431758144996, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 4.5629219823198564e-05, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 14937901.0, "repeat_count": 0.0, "routers_loss": 0.006947176996618509, "skip_count": 2.0, "step": 9260, "text_loss": 0.15886647999286652 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 4.550012729546393e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 14941406.0, "repeat_count": 0.0, "routers_loss": 0.0011366386897861958, "skip_count": 0.0, "step": 9262, "text_loss": 0.49892309308052063 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 43.49310243616085, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 4.537120893293789e-05, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 14944200.0, "repeat_count": 1.0, "routers_loss": 0.002686526160687208, "skip_count": 1.0, "step": 9264, "text_loss": 0.6201852560043335 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 4.5242464785022256e-05, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 14947592.0, "repeat_count": 0.0, "routers_loss": 0.0007816873257979751, "skip_count": 0.0, "step": 9266, "text_loss": 0.49434536695480347 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 4.5113894901051944e-05, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 14950382.0, "repeat_count": 0.0, "routers_loss": 0.0013167982688173652, "skip_count": 0.0, "step": 9268, "text_loss": 0.696306586265564 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 43.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03564453125, "learning_rate": 4.498549933029511e-05, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 14953424.0, "repeat_count": 0.0, "routers_loss": 0.006240467075258493, "skip_count": 3.0, "step": 9270, "text_loss": 0.14193731546401978 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.53067214558262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0242919921875, "learning_rate": 4.485727812195339e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 14956937.0, "repeat_count": 0.0, "routers_loss": 0.006212725769728422, "skip_count": 2.0, "step": 9272, "text_loss": 0.40858668088912964 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.54006457293807, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 4.472923132516132e-05, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 14960398.0, "repeat_count": 0.0, "routers_loss": 0.003120801877230406, "skip_count": 2.0, "step": 9274, "text_loss": 0.4740981459617615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 4.46013589889866e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 14963037.0, "repeat_count": 0.0, "routers_loss": 0.0027343074325472116, "skip_count": 0.0, "step": 9276, "text_loss": 0.1420614868402481 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 4.4473661162430176e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 14965604.0, "repeat_count": 0.0, "routers_loss": 0.0006372901843860745, "skip_count": 0.0, "step": 9278, "text_loss": 0.4628531336784363 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.5682418550044, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 4.4346137894426155e-05, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 14968803.0, "repeat_count": 0.0, "routers_loss": 0.0062922025099396706, "skip_count": 2.0, "step": 9280, "text_loss": 0.29813849925994873 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.577634282359845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 4.421878923384159e-05, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 14972557.0, "repeat_count": 0.0, "routers_loss": 0.006071912590414286, "skip_count": 2.0, "step": 9282, "text_loss": 0.19581027328968048 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 43.58702670971529, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 4.40916152294768e-05, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 14975358.0, "repeat_count": 1.0, "routers_loss": 0.001606325968168676, "skip_count": 0.0, "step": 9284, "text_loss": 0.6929896473884583 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.59641913707074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 4.3964615930065124e-05, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 14978045.0, "repeat_count": 0.0, "routers_loss": 0.002845643786713481, "skip_count": 1.0, "step": 9286, "text_loss": 0.49997636675834656 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 4.3837791384272744e-05, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 14981606.0, "repeat_count": 0.0, "routers_loss": 0.005257320590317249, "skip_count": 1.0, "step": 9288, "text_loss": 0.3391074538230896 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.61520399178163, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 4.3711141640699395e-05, "loss": 0.0045, "macro_f1": 0.8820862174034119, "num_tokens": 14984404.0, "repeat_count": 2.0, "routers_loss": 0.02914038859307766, "skip_count": 2.0, "step": 9290, "text_loss": 0.29165980219841003 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 4.3584666747877254e-05, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 14987280.0, "repeat_count": 0.0, "routers_loss": 0.005831835325807333, "skip_count": 1.0, "step": 9292, "text_loss": 0.5312305688858032 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 4.345836675427184e-05, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 14990071.0, "repeat_count": 0.0, "routers_loss": 0.0035566375590860844, "skip_count": 0.0, "step": 9294, "text_loss": 0.25595441460609436 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 4.333224170828149e-05, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 14993809.0, "repeat_count": 0.0, "routers_loss": 0.0026552488561719656, "skip_count": 0.0, "step": 9296, "text_loss": 0.18538808822631836 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 43.65277370120341, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 4.3206291658237586e-05, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 14996794.0, "repeat_count": 0.0, "routers_loss": 0.010047328658401966, "skip_count": 4.0, "step": 9298, "text_loss": 0.37891554832458496 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.66216612855885, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 4.308051665240442e-05, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 15000911.0, "repeat_count": 0.0, "routers_loss": 0.0030308531131595373, "skip_count": 0.0, "step": 9300, "text_loss": 0.20204831659793854 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 4.295491673897922e-05, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 15004106.0, "repeat_count": 0.0, "routers_loss": 0.003695673542097211, "skip_count": 1.0, "step": 9302, "text_loss": 0.84013831615448 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 4.282949196609215e-05, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 15007482.0, "repeat_count": 0.0, "routers_loss": 0.000820459274109453, "skip_count": 0.0, "step": 9304, "text_loss": 0.4521652162075043 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.69034341062518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025390625, "learning_rate": 4.2704242381806144e-05, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 15010579.0, "repeat_count": 0.0, "routers_loss": 0.006170184817165136, "skip_count": 1.0, "step": 9306, "text_loss": 0.22438007593154907 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 43.699735837980626, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.051025390625, "learning_rate": 4.25791680341171e-05, "loss": 0.0065, "macro_f1": 0.6122449040412903, "num_tokens": 15013835.0, "repeat_count": 0.0, "routers_loss": 0.021745599806308746, "skip_count": 4.0, "step": 9308, "text_loss": 0.5847432613372803 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.70912826533607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0302734375, "learning_rate": 4.245426897095372e-05, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 15017268.0, "repeat_count": 0.0, "routers_loss": 0.0022570823784917593, "skip_count": 1.0, "step": 9310, "text_loss": 0.345931738615036 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.71852069269152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 4.232954524017763e-05, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 15020095.0, "repeat_count": 0.0, "routers_loss": 0.0009895693510770798, "skip_count": 0.0, "step": 9312, "text_loss": 0.5374923944473267 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.72791312004696, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 4.220499688958307e-05, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 15022763.0, "repeat_count": 0.0, "routers_loss": 0.005146807990968227, "skip_count": 0.0, "step": 9314, "text_loss": 0.7208939790725708 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.73730554740241, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022216796875, "learning_rate": 4.208062396689738e-05, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 15025926.0, "repeat_count": 0.0, "routers_loss": 0.00369556387886405, "skip_count": 1.0, "step": 9316, "text_loss": 0.36686572432518005 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 4.1956426519780435e-05, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 15029120.0, "repeat_count": 0.0, "routers_loss": 0.00971714872866869, "skip_count": 2.0, "step": 9318, "text_loss": 0.20697914063930511 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 43.756090402113294, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0286865234375, "learning_rate": 4.183240459582488e-05, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 15032000.0, "repeat_count": 1.0, "routers_loss": 0.002361048012971878, "skip_count": 1.0, "step": 9320, "text_loss": 0.6737313866615295 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047119140625, "learning_rate": 4.1708558242556207e-05, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 15034831.0, "repeat_count": 0.0, "routers_loss": 0.001238204538822174, "skip_count": 0.0, "step": 9322, "text_loss": 0.823642373085022 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.77487525682419, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 4.1584887507432556e-05, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 15037487.0, "repeat_count": 0.0, "routers_loss": 0.005211949814110994, "skip_count": 1.0, "step": 9324, "text_loss": 0.3821350634098053 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 4.146139243784475e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 15040167.0, "repeat_count": 0.0, "routers_loss": 0.007513152435421944, "skip_count": 0.0, "step": 9326, "text_loss": 0.18124167621135712 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 4.133807308111637e-05, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 15043777.0, "repeat_count": 0.0, "routers_loss": 0.0029832208529114723, "skip_count": 0.0, "step": 9328, "text_loss": 0.47313618659973145 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0615234375, "learning_rate": 4.1214929484503615e-05, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 15046622.0, "repeat_count": 0.0, "routers_loss": 0.009155526757240295, "skip_count": 1.0, "step": 9330, "text_loss": 0.20556017756462097 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.81244496624596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 4.1091961695195304e-05, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 15049543.0, "repeat_count": 0.0, "routers_loss": 0.003529169363901019, "skip_count": 0.0, "step": 9332, "text_loss": 0.18752245604991913 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.821837393601406, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04638671875, "learning_rate": 4.0969169760313005e-05, "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 15052924.0, "repeat_count": 1.0, "routers_loss": 0.002136822324246168, "skip_count": 2.0, "step": 9334, "text_loss": 0.85563725233078 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.83122982095686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053466796875, "learning_rate": 4.084655372691076e-05, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 15056579.0, "repeat_count": 0.0, "routers_loss": 0.003167972667142749, "skip_count": 2.0, "step": 9336, "text_loss": 0.45709627866744995 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 43.8406222483123, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0240478515625, "learning_rate": 4.07241136419752e-05, "loss": 0.0048, "macro_f1": 0.5492662787437439, "num_tokens": 15059739.0, "repeat_count": 0.0, "routers_loss": 0.03742539510130882, "skip_count": 2.0, "step": 9338, "text_loss": 0.19531641900539398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 4.06018495524258e-05, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 15062795.0, "repeat_count": 0.0, "routers_loss": 0.002699678996577859, "skip_count": 0.0, "step": 9340, "text_loss": 0.31032654643058777 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.85940710302319, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 4.047976150511423e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 15066591.0, "repeat_count": 0.0, "routers_loss": 0.0026099481619894505, "skip_count": 0.0, "step": 9342, "text_loss": 0.4676157832145691 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.052490234375, "learning_rate": 4.035784954682486e-05, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 15069509.0, "repeat_count": 0.0, "routers_loss": 0.006772278342396021, "skip_count": 1.0, "step": 9344, "text_loss": 0.23385995626449585 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 43.878191957734074, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 4.0236113724274713e-05, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 15072898.0, "repeat_count": 1.0, "routers_loss": 0.0005968905170448124, "skip_count": 0.0, "step": 9346, "text_loss": 0.6250094175338745 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.887584385089525, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030029296875, "learning_rate": 4.011455408411302e-05, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 15075547.0, "repeat_count": 0.0, "routers_loss": 0.012884319759905338, "skip_count": 2.0, "step": 9348, "text_loss": 0.23720405995845795 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.89697681244497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 3.9993170672921794e-05, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 15078902.0, "repeat_count": 0.0, "routers_loss": 0.0018171088304370642, "skip_count": 0.0, "step": 9350, "text_loss": 0.23975110054016113 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 43.90636923980041, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0458984375, "learning_rate": 3.9871963537215284e-05, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 15082292.0, "repeat_count": 1.0, "routers_loss": 0.001974726328626275, "skip_count": 1.0, "step": 9352, "text_loss": 0.354034423828125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 3.975093272344038e-05, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 15085288.0, "repeat_count": 0.0, "routers_loss": 0.0014760299818590283, "skip_count": 0.0, "step": 9354, "text_loss": 0.6398947834968567 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 43.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020751953125, "learning_rate": 3.963007827797627e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 15089089.0, "repeat_count": 0.0, "routers_loss": 0.004467889666557312, "skip_count": 3.0, "step": 9356, "text_loss": 0.26422595977783203 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.93454652186674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0234375, "learning_rate": 3.950940024713462e-05, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 15092178.0, "repeat_count": 0.0, "routers_loss": 0.0048953029327094555, "skip_count": 1.0, "step": 9358, "text_loss": 0.7519236207008362 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 43.943938949222186, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 3.9388898677159446e-05, "loss": 0.0065, "macro_f1": 1.0, "num_tokens": 15094825.0, "repeat_count": 1.0, "routers_loss": 0.004229324869811535, "skip_count": 1.0, "step": 9360, "text_loss": 0.522379457950592 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 43.95333137657764, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 3.9268573614227146e-05, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 15098119.0, "repeat_count": 0.0, "routers_loss": 0.0028480603359639645, "skip_count": 3.0, "step": 9362, "text_loss": 0.47443902492523193 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 3.914842510444666e-05, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 15101362.0, "repeat_count": 0.0, "routers_loss": 0.0024998984299600124, "skip_count": 1.0, "step": 9364, "text_loss": 0.6255060434341431 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 43.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 3.9028453193859006e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 15104544.0, "repeat_count": 0.0, "routers_loss": 0.008692052215337753, "skip_count": 1.0, "step": 9366, "text_loss": 0.26974618434906006 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 43.98150865864397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0291748046875, "learning_rate": 3.890865792843768e-05, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 15107619.0, "repeat_count": 0.0, "routers_loss": 0.002779777627438307, "skip_count": 2.0, "step": 9368, "text_loss": 0.4157184064388275 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 43.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 3.878903935408845e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 15111352.0, "repeat_count": 0.0, "routers_loss": 0.0010220289696007967, "skip_count": 0.0, "step": 9370, "text_loss": 0.5674155950546265 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 3.866959751664939e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 15114088.0, "repeat_count": 0.0, "routers_loss": 0.004387985449284315, "skip_count": 1.0, "step": 9372, "text_loss": 0.3638002276420593 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.00939242735544, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 3.8550332461890824e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 15117271.0, "repeat_count": 0.0, "routers_loss": 0.0005855522467754781, "skip_count": 0.0, "step": 9374, "text_loss": 0.6257871389389038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023681640625, "learning_rate": 3.843124423551536e-05, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 15119936.0, "repeat_count": 0.0, "routers_loss": 0.0026496360078454018, "skip_count": 0.0, "step": 9376, "text_loss": 0.7019506096839905 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.02817728206633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 3.8312332883157774e-05, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 15123407.0, "repeat_count": 0.0, "routers_loss": 0.0024072150699794292, "skip_count": 0.0, "step": 9378, "text_loss": 0.45380696654319763 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 3.819359845038517e-05, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 15126742.0, "repeat_count": 0.0, "routers_loss": 0.00031929166289046407, "skip_count": 0.0, "step": 9380, "text_loss": 0.5322204828262329 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 3.807504098269682e-05, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 15130854.0, "repeat_count": 0.0, "routers_loss": 0.00177620945032686, "skip_count": 0.0, "step": 9382, "text_loss": 0.5220870971679688 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 44.05635456413267, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.02783203125, "learning_rate": 3.7956660525524156e-05, "loss": 0.0071, "macro_f1": 0.8823530077934265, "num_tokens": 15135054.0, "repeat_count": 1.0, "routers_loss": 0.013358182273805141, "skip_count": 2.0, "step": 9384, "text_loss": 0.39796701073646545 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.06574699148811, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 3.783845712423067e-05, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 15139179.0, "repeat_count": 0.0, "routers_loss": 0.0030253338627517223, "skip_count": 0.0, "step": 9386, "text_loss": 0.13592341542243958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.075139418843555, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 3.772043082411236e-05, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 15142436.0, "repeat_count": 0.0, "routers_loss": 0.0008311813580803573, "skip_count": 0.0, "step": 9388, "text_loss": 0.7804215550422668 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.084531846199, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02490234375, "learning_rate": 3.760258167039704e-05, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 15146071.0, "repeat_count": 0.0, "routers_loss": 0.012432600371539593, "skip_count": 1.0, "step": 9390, "text_loss": 0.37692421674728394 }, { "acc_repeat": 1.0, "acc_skip": 0.8571428656578064, "avg_layers": 23.0, "epoch": 44.09392427355445, "f1_execute": 0.9756097793579102, "f1_repeat": 1.0, "f1_skip": 0.9230769276618958, "grad_norm": 0.053955078125, "learning_rate": 3.748490970824464e-05, "loss": 0.0074, "macro_f1": 0.9662289023399353, "num_tokens": 15149020.0, "repeat_count": 1.0, "routers_loss": 0.03158312290906906, "skip_count": 7.0, "step": 9392, "text_loss": 0.6111845374107361 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0166015625, "learning_rate": 3.7367414982747374e-05, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 15151887.0, "repeat_count": 0.0, "routers_loss": 0.000898235070053488, "skip_count": 0.0, "step": 9394, "text_loss": 0.42988476157188416 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.11270912826534, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 3.7250097538929384e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 15155395.0, "repeat_count": 0.0, "routers_loss": 0.0024584042839705944, "skip_count": 1.0, "step": 9396, "text_loss": 0.4083070456981659 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 3.713295742174694e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 15158275.0, "repeat_count": 0.0, "routers_loss": 0.0012269694125279784, "skip_count": 0.0, "step": 9398, "text_loss": 0.529385507106781 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 3.701599467608835e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 15161533.0, "repeat_count": 0.0, "routers_loss": 0.002610012423247099, "skip_count": 1.0, "step": 9400, "text_loss": 0.1785552203655243 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 44.14088641033167, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 3.6899209346773986e-05, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 15164799.0, "repeat_count": 1.0, "routers_loss": 0.0012146600056439638, "skip_count": 0.0, "step": 9402, "text_loss": 0.9209059476852417 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.15027883768712, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 3.678260147855628e-05, "loss": 0.0028, "macro_f1": 0.6666666865348816, "num_tokens": 15168111.0, "repeat_count": 0.0, "routers_loss": 0.001716976286843419, "skip_count": 1.0, "step": 9404, "text_loss": 0.5762659907341003 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.15967126504256, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0213623046875, "learning_rate": 3.6666171116119474e-05, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 15171285.0, "repeat_count": 1.0, "routers_loss": 0.005656248424202204, "skip_count": 2.0, "step": 9406, "text_loss": 0.3065127432346344 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 3.6549918304079946e-05, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 15174838.0, "repeat_count": 0.0, "routers_loss": 0.002362997969612479, "skip_count": 2.0, "step": 9408, "text_loss": 0.5256759524345398 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 3.643384308698594e-05, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 15177713.0, "repeat_count": 0.0, "routers_loss": 0.002327109221369028, "skip_count": 1.0, "step": 9410, "text_loss": 0.27613985538482666 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 44.18784854710889, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 3.6317945509317716e-05, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 15180863.0, "repeat_count": 1.0, "routers_loss": 0.008501979522407055, "skip_count": 0.0, "step": 9412, "text_loss": 0.3379829525947571 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.197240974464336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 3.6202225615487525e-05, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 15184531.0, "repeat_count": 0.0, "routers_loss": 0.004115676507353783, "skip_count": 0.0, "step": 9414, "text_loss": 0.24313601851463318 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.20663340181978, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 3.6086683449839454e-05, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 15187699.0, "repeat_count": 0.0, "routers_loss": 0.0017425924306735396, "skip_count": 0.0, "step": 9416, "text_loss": 0.47485142946243286 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 44.21602582917523, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.030029296875, "learning_rate": 3.597131905664935e-05, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 15190528.0, "repeat_count": 1.0, "routers_loss": 0.0031498887110501528, "skip_count": 1.0, "step": 9418, "text_loss": 0.5356660485267639 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.22541825653067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 3.585613248012515e-05, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 15194165.0, "repeat_count": 0.0, "routers_loss": 0.006833057850599289, "skip_count": 1.0, "step": 9420, "text_loss": 0.21593274176120758 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.23481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 3.574112376440658e-05, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 15197612.0, "repeat_count": 0.0, "routers_loss": 0.0013788710348308086, "skip_count": 1.0, "step": 9422, "text_loss": 0.5275097489356995 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022216796875, "learning_rate": 3.5626292953565175e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 15201103.0, "repeat_count": 0.0, "routers_loss": 0.0021296890918165445, "skip_count": 0.0, "step": 9424, "text_loss": 0.3420610725879669 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.253595538597004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 3.551164009160429e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 15204007.0, "repeat_count": 0.0, "routers_loss": 0.0025281559210270643, "skip_count": 0.0, "step": 9426, "text_loss": 0.4756413996219635 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 3.539716522245917e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 15208066.0, "repeat_count": 0.0, "routers_loss": 0.0008577071712352335, "skip_count": 0.0, "step": 9428, "text_loss": 0.7672523260116577 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 44.2723803933079, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 3.528286838999672e-05, "loss": 0.0032, "macro_f1": 0.6666666865348816, "num_tokens": 15211118.0, "repeat_count": 1.0, "routers_loss": 0.002977409167215228, "skip_count": 0.0, "step": 9430, "text_loss": 0.5010796785354614 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 44.28177282066334, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 3.5168749638015806e-05, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 15214245.0, "repeat_count": 1.0, "routers_loss": 0.0009552660631015897, "skip_count": 0.0, "step": 9432, "text_loss": 0.6633321642875671 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0289306640625, "learning_rate": 3.505480901024677e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 15217449.0, "repeat_count": 0.0, "routers_loss": 0.005598205607384443, "skip_count": 2.0, "step": 9434, "text_loss": 0.545702338218689 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 44.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0283203125, "learning_rate": 3.494104655035213e-05, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 15220391.0, "repeat_count": 0.0, "routers_loss": 0.0154950562864542, "skip_count": 4.0, "step": 9436, "text_loss": 0.211164191365242 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.30995010272967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 3.4827462301925735e-05, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 15224061.0, "repeat_count": 0.0, "routers_loss": 0.001531782210804522, "skip_count": 0.0, "step": 9438, "text_loss": 0.49369096755981445 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.319342530085116, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 3.471405630849328e-05, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 15227586.0, "repeat_count": 0.0, "routers_loss": 0.004152537789195776, "skip_count": 1.0, "step": 9440, "text_loss": 0.1624782234430313 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.32873495744057, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046875, "learning_rate": 3.4600828613512156e-05, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 15230713.0, "repeat_count": 0.0, "routers_loss": 0.0026113570202142, "skip_count": 0.0, "step": 9442, "text_loss": 0.1921689808368683 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 44.33812738479601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 3.44877792603715e-05, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 15233925.0, "repeat_count": 0.0, "routers_loss": 0.008077848702669144, "skip_count": 3.0, "step": 9444, "text_loss": 0.32417818903923035 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 3.437490829239193e-05, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 15236684.0, "repeat_count": 0.0, "routers_loss": 0.0005273211863823235, "skip_count": 0.0, "step": 9446, "text_loss": 0.3497772812843323 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.3569122395069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0205078125, "learning_rate": 3.4262215752825895e-05, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 15239866.0, "repeat_count": 0.0, "routers_loss": 0.0015295564662665129, "skip_count": 0.0, "step": 9448, "text_loss": 0.7613807320594788 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 44.36630466686234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 3.414970168485737e-05, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 15243615.0, "repeat_count": 1.0, "routers_loss": 0.0039047773461788893, "skip_count": 0.0, "step": 9450, "text_loss": 0.3325706720352173 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.375697094217784, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 3.403736613160191e-05, "loss": 0.0049, "macro_f1": 0.32098764181137085, "num_tokens": 15246714.0, "repeat_count": 0.0, "routers_loss": 0.0300968699157238, "skip_count": 2.0, "step": 9452, "text_loss": 0.3441869020462036 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 44.385089521573235, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 3.392520913610681e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 15249520.0, "repeat_count": 1.0, "routers_loss": 0.0037529836408793926, "skip_count": 0.0, "step": 9454, "text_loss": 0.5083104968070984 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.39448194892868, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 3.381323074135073e-05, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 15252527.0, "repeat_count": 0.0, "routers_loss": 0.0019368440844118595, "skip_count": 2.0, "step": 9456, "text_loss": 0.49744489789009094 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 3.3701430990244085e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 15255330.0, "repeat_count": 0.0, "routers_loss": 0.0033424650318920612, "skip_count": 1.0, "step": 9458, "text_loss": 0.5603348016738892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 3.35898099256286e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 15257961.0, "repeat_count": 0.0, "routers_loss": 0.0006928095244802535, "skip_count": 0.0, "step": 9460, "text_loss": 0.5270714163780212 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 3.347836759027789e-05, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 15261137.0, "repeat_count": 0.0, "routers_loss": 0.0030718250200152397, "skip_count": 2.0, "step": 9462, "text_loss": 0.11651179939508438 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.43205165835045, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 3.33671040268968e-05, "loss": 0.0064, "macro_f1": 0.6601307392120361, "num_tokens": 15264234.0, "repeat_count": 1.0, "routers_loss": 0.03508305177092552, "skip_count": 2.0, "step": 9464, "text_loss": 0.14562347531318665 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.441444085705896, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0439453125, "learning_rate": 3.3256019278121717e-05, "loss": 0.0066, "macro_f1": 0.3272727429866791, "num_tokens": 15267047.0, "repeat_count": 0.0, "routers_loss": 0.008365205489099026, "skip_count": 1.0, "step": 9466, "text_loss": 0.8550931215286255 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.45083651306135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 3.3145113386520485e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 15270442.0, "repeat_count": 0.0, "routers_loss": 0.0036910634953528643, "skip_count": 0.0, "step": 9468, "text_loss": 0.24741731584072113 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 3.30343863945925e-05, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 15273845.0, "repeat_count": 0.0, "routers_loss": 0.0014966290909796953, "skip_count": 0.0, "step": 9470, "text_loss": 0.5137372612953186 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 3.2923838344768534e-05, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 15277940.0, "repeat_count": 0.0, "routers_loss": 0.0028104602824896574, "skip_count": 0.0, "step": 9472, "text_loss": 0.5737728476524353 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.47901379512768, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056396484375, "learning_rate": 3.281346927941087e-05, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 15281640.0, "repeat_count": 0.0, "routers_loss": 0.007870957255363464, "skip_count": 2.0, "step": 9474, "text_loss": 0.27684518694877625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.48840622248312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 3.270327924081301e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 15284877.0, "repeat_count": 0.0, "routers_loss": 0.006224945653229952, "skip_count": 0.0, "step": 9476, "text_loss": 0.35599255561828613 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.497798649838565, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 3.259326827120013e-05, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 15287945.0, "repeat_count": 0.0, "routers_loss": 0.001179040758870542, "skip_count": 0.0, "step": 9478, "text_loss": 0.26802319288253784 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.507191077194015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 3.2483436412728553e-05, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 15290754.0, "repeat_count": 0.0, "routers_loss": 0.001992281526327133, "skip_count": 0.0, "step": 9480, "text_loss": 0.40124714374542236 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.51658350454946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 3.2373783707486057e-05, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 15294841.0, "repeat_count": 0.0, "routers_loss": 0.0012830843916162848, "skip_count": 0.0, "step": 9482, "text_loss": 0.6739225387573242 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.5259759319049, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 3.226431019749171e-05, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 15298397.0, "repeat_count": 0.0, "routers_loss": 0.003624147269874811, "skip_count": 2.0, "step": 9484, "text_loss": 0.5250326991081238 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.016357421875, "learning_rate": 3.2155015924696105e-05, "loss": 0.0031, "macro_f1": 0.3333333432674408, "num_tokens": 15301499.0, "repeat_count": 0.0, "routers_loss": 0.0019682408310472965, "skip_count": 0.0, "step": 9486, "text_loss": 0.5574567317962646 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0439453125, "learning_rate": 3.204590093098098e-05, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 15304531.0, "repeat_count": 0.0, "routers_loss": 0.002245094161480665, "skip_count": 0.0, "step": 9488, "text_loss": 0.4065501093864441 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.55415321397123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 3.1936965258159366e-05, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 15307826.0, "repeat_count": 0.0, "routers_loss": 0.002919224789366126, "skip_count": 1.0, "step": 9490, "text_loss": 0.5183609127998352 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.563545641326684, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0211181640625, "learning_rate": 3.1828208947975615e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 15311420.0, "repeat_count": 0.0, "routers_loss": 0.004961747210472822, "skip_count": 1.0, "step": 9492, "text_loss": 0.1962234377861023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.57293806868213, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 3.171963204210537e-05, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 15314196.0, "repeat_count": 0.0, "routers_loss": 0.0026044815313071012, "skip_count": 0.0, "step": 9494, "text_loss": 0.223251610994339 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 3.161123458215553e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 15317174.0, "repeat_count": 0.0, "routers_loss": 0.0029661289881914854, "skip_count": 0.0, "step": 9496, "text_loss": 0.32970958948135376 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 3.150301660966415e-05, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 15320343.0, "repeat_count": 0.0, "routers_loss": 0.0011696632718667388, "skip_count": 0.0, "step": 9498, "text_loss": 0.8590811491012573 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.60111535074846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 3.13949781661006e-05, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 15324138.0, "repeat_count": 0.0, "routers_loss": 0.0015035583637654781, "skip_count": 0.0, "step": 9500, "text_loss": 0.6658036708831787 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.6105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0201416015625, "learning_rate": 3.1287119292865375e-05, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 15328395.0, "repeat_count": 0.0, "routers_loss": 0.001930502592585981, "skip_count": 0.0, "step": 9502, "text_loss": 0.4104210138320923 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.619900205459345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 3.117944003129025e-05, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 15332196.0, "repeat_count": 0.0, "routers_loss": 0.0010025398805737495, "skip_count": 0.0, "step": 9504, "text_loss": 0.7272399663925171 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 44.629292632814796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 3.107194042263806e-05, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 15335253.0, "repeat_count": 1.0, "routers_loss": 0.004520092159509659, "skip_count": 0.0, "step": 9506, "text_loss": 0.29173022508621216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 3.096462050810284e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 15338129.0, "repeat_count": 0.0, "routers_loss": 0.0009707154240459204, "skip_count": 0.0, "step": 9508, "text_loss": 0.6530287861824036 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.64807748752568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 3.0857480328809916e-05, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 15341487.0, "repeat_count": 0.0, "routers_loss": 0.0008689566748216748, "skip_count": 0.0, "step": 9510, "text_loss": 0.36988505721092224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 3.0750519925815565e-05, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 15344460.0, "repeat_count": 0.0, "routers_loss": 0.0022587007842957973, "skip_count": 0.0, "step": 9512, "text_loss": 0.2447768598794937 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.66686234223657, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0201416015625, "learning_rate": 3.064373934010711e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 15348135.0, "repeat_count": 0.0, "routers_loss": 0.001986770424991846, "skip_count": 0.0, "step": 9514, "text_loss": 0.43159469962120056 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.67625476959201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 3.053713861260321e-05, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 15351073.0, "repeat_count": 0.0, "routers_loss": 0.0003514432755764574, "skip_count": 0.0, "step": 9516, "text_loss": 0.3638324737548828 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.685647196947464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 3.043071778415335e-05, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 15353633.0, "repeat_count": 0.0, "routers_loss": 0.003395392093807459, "skip_count": 0.0, "step": 9518, "text_loss": 0.5728140473365784 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.69503962430291, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018310546875, "learning_rate": 3.03244768955383e-05, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 15357322.0, "repeat_count": 0.0, "routers_loss": 0.0016641782131046057, "skip_count": 0.0, "step": 9520, "text_loss": 0.666814386844635 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0177001953125, "learning_rate": 3.021841598746966e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 15360771.0, "repeat_count": 0.0, "routers_loss": 0.0024721708614379168, "skip_count": 0.0, "step": 9522, "text_loss": 0.7148030400276184 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 3.01125351005902e-05, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 15364281.0, "repeat_count": 0.0, "routers_loss": 0.004133665468543768, "skip_count": 0.0, "step": 9524, "text_loss": 0.2985752820968628 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 3.0006834275473737e-05, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 15367354.0, "repeat_count": 0.0, "routers_loss": 0.003016186412423849, "skip_count": 1.0, "step": 9526, "text_loss": 0.22689883410930634 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 44.73260933372468, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.01531982421875, "learning_rate": 2.9901313552624932e-05, "loss": 0.003, "macro_f1": 1.0, "num_tokens": 15371027.0, "repeat_count": 1.0, "routers_loss": 0.015333639457821846, "skip_count": 7.0, "step": 9528, "text_loss": 0.8308720588684082 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0223388671875, "learning_rate": 2.97959729724796e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 15373948.0, "repeat_count": 0.0, "routers_loss": 0.001420815708115697, "skip_count": 0.0, "step": 9530, "text_loss": 0.5439777970314026 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.751394188435576, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 2.9690812575404456e-05, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 15377366.0, "repeat_count": 0.0, "routers_loss": 0.0007130459416657686, "skip_count": 0.0, "step": 9532, "text_loss": 0.45405295491218567 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.76078661579102, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 2.95858324016971e-05, "loss": 0.0067, "macro_f1": 0.3272727429866791, "num_tokens": 15380115.0, "repeat_count": 1.0, "routers_loss": 0.04256885498762131, "skip_count": 0.0, "step": 9534, "text_loss": 0.39998912811279297 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 44.77017904314646, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 2.9481032491586178e-05, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 15383205.0, "repeat_count": 0.0, "routers_loss": 0.004944019019603729, "skip_count": 4.0, "step": 9536, "text_loss": 0.1882237195968628 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.77957147050191, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 2.937641288523124e-05, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 15386619.0, "repeat_count": 0.0, "routers_loss": 0.007820523343980312, "skip_count": 1.0, "step": 9538, "text_loss": 0.26401394605636597 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.78896389785735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 2.9271973622722603e-05, "loss": 0.0026, "macro_f1": 0.3333333432674408, "num_tokens": 15389135.0, "repeat_count": 0.0, "routers_loss": 0.0010751578956842422, "skip_count": 0.0, "step": 9540, "text_loss": 0.39813846349716187 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.798356325212794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 2.9167714744081643e-05, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 15392150.0, "repeat_count": 0.0, "routers_loss": 0.0031554463785141706, "skip_count": 2.0, "step": 9542, "text_loss": 0.669784665107727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.807748752568244, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0224609375, "learning_rate": 2.9063636289260677e-05, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 15394974.0, "repeat_count": 0.0, "routers_loss": 0.00287301791831851, "skip_count": 1.0, "step": 9544, "text_loss": 0.176493301987648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.81714117992369, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 2.8959738298142635e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 15398432.0, "repeat_count": 0.0, "routers_loss": 0.0011708475649356842, "skip_count": 0.0, "step": 9546, "text_loss": 0.8762983083724976 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 2.885602081054145e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 15401121.0, "repeat_count": 0.0, "routers_loss": 0.003167103510349989, "skip_count": 1.0, "step": 9548, "text_loss": 0.2538717985153198 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 44.835926034634575, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 2.8752483866201885e-05, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 15404105.0, "repeat_count": 1.0, "routers_loss": 0.007552143186330795, "skip_count": 5.0, "step": 9550, "text_loss": 0.37045153975486755 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.84531846199002, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 2.8649127504799423e-05, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 15407232.0, "repeat_count": 1.0, "routers_loss": 0.007718692068010569, "skip_count": 2.0, "step": 9552, "text_loss": 0.15780900418758392 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.85471088934546, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0439453125, "learning_rate": 2.8545951765940547e-05, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 15410425.0, "repeat_count": 0.0, "routers_loss": 0.0003527951193973422, "skip_count": 0.0, "step": 9554, "text_loss": 0.5931823253631592 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 2.8442956689162193e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 15413724.0, "repeat_count": 0.0, "routers_loss": 0.00146177364513278, "skip_count": 0.0, "step": 9556, "text_loss": 0.691118061542511 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 2.8340142313932448e-05, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 15416776.0, "repeat_count": 0.0, "routers_loss": 0.0010256811510771513, "skip_count": 0.0, "step": 9558, "text_loss": 0.40814271569252014 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 2.823750867964997e-05, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 15419815.0, "repeat_count": 0.0, "routers_loss": 0.0047921910881996155, "skip_count": 0.0, "step": 9560, "text_loss": 0.28953713178634644 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.89228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0230712890625, "learning_rate": 2.8135055825644072e-05, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 15422806.0, "repeat_count": 0.0, "routers_loss": 0.002010057680308819, "skip_count": 1.0, "step": 9562, "text_loss": 0.8377944231033325 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 44.90167302612269, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 2.803278379117491e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 15425405.0, "repeat_count": 0.0, "routers_loss": 0.005009239539504051, "skip_count": 1.0, "step": 9564, "text_loss": 0.5936337113380432 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.91106545347813, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 2.793069261543335e-05, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 15428233.0, "repeat_count": 0.0, "routers_loss": 0.007967893034219742, "skip_count": 2.0, "step": 9566, "text_loss": 0.49891290068626404 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.92045788083358, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 2.7828782337540882e-05, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 15431095.0, "repeat_count": 2.0, "routers_loss": 0.00638923142105341, "skip_count": 4.0, "step": 9568, "text_loss": 0.30928006768226624 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 44.929850308189025, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0205078125, "learning_rate": 2.7727052996549763e-05, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 15434933.0, "repeat_count": 0.0, "routers_loss": 0.0060427505522966385, "skip_count": 3.0, "step": 9570, "text_loss": 0.21274788677692413 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.93924273554447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 2.762550463144281e-05, "loss": 0.0031, "macro_f1": 0.3333333432674408, "num_tokens": 15437655.0, "repeat_count": 0.0, "routers_loss": 0.0012480237055569887, "skip_count": 0.0, "step": 9572, "text_loss": 0.31049492955207825 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 2.7524137281133567e-05, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 15440643.0, "repeat_count": 0.0, "routers_loss": 0.005919245071709156, "skip_count": 0.0, "step": 9574, "text_loss": 0.16459886729717255 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 2.7422950984466233e-05, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 15443532.0, "repeat_count": 0.0, "routers_loss": 0.0061412835493683815, "skip_count": 2.0, "step": 9576, "text_loss": 0.7102797031402588 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0203857421875, "learning_rate": 2.7321945780215573e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 15447027.0, "repeat_count": 0.0, "routers_loss": 0.001149018993601203, "skip_count": 0.0, "step": 9578, "text_loss": 0.22778025269508362 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.97681244496625, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 2.722112170708696e-05, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 15450173.0, "repeat_count": 0.0, "routers_loss": 0.002216014079749584, "skip_count": 0.0, "step": 9580, "text_loss": 0.21447396278381348 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 44.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 2.7120478803716264e-05, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 15452838.0, "repeat_count": 0.0, "routers_loss": 0.00498749827966094, "skip_count": 0.0, "step": 9582, "text_loss": 0.1664455235004425 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 44.99559729967714, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 2.7020017108670246e-05, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 15455928.0, "repeat_count": 1.0, "routers_loss": 0.005886784754693508, "skip_count": 3.0, "step": 9584, "text_loss": 0.3929266631603241 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.004696213677725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 2.691973666044589e-05, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 15459447.0, "repeat_count": 0.0, "routers_loss": 0.0029895263724029064, "skip_count": 1.0, "step": 9586, "text_loss": 0.27535343170166016 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 45.01408864103317, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 2.681963749747085e-05, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 15462340.0, "repeat_count": 1.0, "routers_loss": 0.0038893253076821566, "skip_count": 0.0, "step": 9588, "text_loss": 0.6950465440750122 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 45.02348106838861, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 2.671971965810338e-05, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 15465432.0, "repeat_count": 1.0, "routers_loss": 0.0016947018448263407, "skip_count": 0.0, "step": 9590, "text_loss": 0.41451266407966614 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 2.6619983180632134e-05, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 15468300.0, "repeat_count": 0.0, "routers_loss": 0.0011597154662013054, "skip_count": 0.0, "step": 9592, "text_loss": 0.5846080780029297 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 45.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0673828125, "learning_rate": 2.6520428103276316e-05, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 15471084.0, "repeat_count": 0.0, "routers_loss": 0.005555236246436834, "skip_count": 2.0, "step": 9594, "text_loss": 0.4151473939418793 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.05165835045494, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 2.6421054464185633e-05, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 15474348.0, "repeat_count": 0.0, "routers_loss": 0.0015279205981642008, "skip_count": 0.0, "step": 9596, "text_loss": 0.28742483258247375 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.061050777810394, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 2.6321862301440234e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 15477493.0, "repeat_count": 0.0, "routers_loss": 0.0019169533625245094, "skip_count": 0.0, "step": 9598, "text_loss": 0.338019460439682 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.07044320516584, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048095703125, "learning_rate": 2.6222851653050773e-05, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 15480257.0, "repeat_count": 0.0, "routers_loss": 0.0015131557593122125, "skip_count": 1.0, "step": 9600, "text_loss": 0.5982558727264404 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029296875, "learning_rate": 2.612402255695828e-05, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 15482838.0, "repeat_count": 0.0, "routers_loss": 0.0026768618263304234, "skip_count": 0.0, "step": 9602, "text_loss": 0.32012176513671875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 2.6025375051034306e-05, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 15485746.0, "repeat_count": 0.0, "routers_loss": 0.002152341417968273, "skip_count": 0.0, "step": 9604, "text_loss": 0.16942192614078522 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 45.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 2.5926909173080658e-05, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 15488669.0, "repeat_count": 0.0, "routers_loss": 0.003325721947476268, "skip_count": 3.0, "step": 9606, "text_loss": 0.47950080037117004 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.10801291458761, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03564453125, "learning_rate": 2.582862496082977e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 15491512.0, "repeat_count": 0.0, "routers_loss": 0.0023114588111639023, "skip_count": 1.0, "step": 9608, "text_loss": 0.3907585144042969 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 45.117405341943055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 2.5730522451944292e-05, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 15494479.0, "repeat_count": 0.0, "routers_loss": 0.003140041371807456, "skip_count": 2.0, "step": 9610, "text_loss": 0.198005810379982 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.126797769298506, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041748046875, "learning_rate": 2.5632601684017264e-05, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 15497900.0, "repeat_count": 0.0, "routers_loss": 0.0015117402654141188, "skip_count": 0.0, "step": 9612, "text_loss": 0.874154269695282 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 45.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 2.5534862694572114e-05, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 15501817.0, "repeat_count": 0.0, "routers_loss": 0.00551232136785984, "skip_count": 2.0, "step": 9614, "text_loss": 0.1933375597000122 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.14558262400939, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 2.543730552106266e-05, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 15504872.0, "repeat_count": 0.0, "routers_loss": 0.001090583624318242, "skip_count": 0.0, "step": 9616, "text_loss": 0.4030717611312866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 2.533993020087294e-05, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 15507727.0, "repeat_count": 0.0, "routers_loss": 0.007001800462603569, "skip_count": 0.0, "step": 9618, "text_loss": 0.4812186062335968 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.16436747872028, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 2.5242736771317333e-05, "loss": 0.0025, "macro_f1": 0.3333333432674408, "num_tokens": 15510689.0, "repeat_count": 0.0, "routers_loss": 0.0016861478798091412, "skip_count": 0.0, "step": 9620, "text_loss": 0.4578339457511902 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 45.17375990607572, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.05517578125, "learning_rate": 2.514572526964065e-05, "loss": 0.0068, "macro_f1": 0.8817967176437378, "num_tokens": 15513419.0, "repeat_count": 2.0, "routers_loss": 0.050852373242378235, "skip_count": 3.0, "step": 9622, "text_loss": 0.4038950204849243 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.183152333431174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 2.5048895733017772e-05, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 15516289.0, "repeat_count": 0.0, "routers_loss": 0.0015001936117187142, "skip_count": 0.0, "step": 9624, "text_loss": 0.8331962823867798 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.19254476078662, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 2.4952248198554073e-05, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 15519476.0, "repeat_count": 0.0, "routers_loss": 0.0009114370332099497, "skip_count": 1.0, "step": 9626, "text_loss": 0.4997985363006592 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.017822265625, "learning_rate": 2.4855782703284925e-05, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 15523363.0, "repeat_count": 0.0, "routers_loss": 0.0011186953634023666, "skip_count": 0.0, "step": 9628, "text_loss": 0.2572024464607239 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 45.211329615497505, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0262451171875, "learning_rate": 2.4759499284176145e-05, "loss": 0.0059, "macro_f1": 0.6122449040412903, "num_tokens": 15526289.0, "repeat_count": 0.0, "routers_loss": 0.019600817933678627, "skip_count": 4.0, "step": 9630, "text_loss": 0.6323924660682678 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 45.22072204285295, "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 2.466339797812378e-05, "loss": 0.0065, "macro_f1": 0.9265305995941162, "num_tokens": 15530260.0, "repeat_count": 3.0, "routers_loss": 0.02459629252552986, "skip_count": 1.0, "step": 9632, "text_loss": 0.1824527233839035 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 45.23011447020839, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 2.4567478821954038e-05, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 15533916.0, "repeat_count": 2.0, "routers_loss": 0.009077859111130238, "skip_count": 2.0, "step": 9634, "text_loss": 0.4518069326877594 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 45.23950689756384, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 2.4471741852423235e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 15536958.0, "repeat_count": 1.0, "routers_loss": 0.002355317585170269, "skip_count": 0.0, "step": 9636, "text_loss": 0.8873519897460938 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.248899324919286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 2.437618710621803e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 15540544.0, "repeat_count": 0.0, "routers_loss": 0.001198371173813939, "skip_count": 0.0, "step": 9638, "text_loss": 0.4845949709415436 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045166015625, "learning_rate": 2.4280814619955128e-05, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 15543355.0, "repeat_count": 0.0, "routers_loss": 0.0009287866414524615, "skip_count": 0.0, "step": 9640, "text_loss": 0.5979563593864441 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.26768417963017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 2.4185624430181464e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 15547215.0, "repeat_count": 0.0, "routers_loss": 0.0028763876762241125, "skip_count": 0.0, "step": 9642, "text_loss": 0.16279318928718567 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.27707660698562, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0196533203125, "learning_rate": 2.4090616573374135e-05, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 15550412.0, "repeat_count": 0.0, "routers_loss": 0.0013361044693738222, "skip_count": 0.0, "step": 9644, "text_loss": 0.2864333987236023 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 45.28646903434106, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0201416015625, "learning_rate": 2.3995791085940244e-05, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 15553660.0, "repeat_count": 2.0, "routers_loss": 0.0019316677935421467, "skip_count": 0.0, "step": 9646, "text_loss": 0.6333117485046387 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.295861461696504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023193359375, "learning_rate": 2.390114800421722e-05, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 15556287.0, "repeat_count": 0.0, "routers_loss": 0.0011288017267361283, "skip_count": 1.0, "step": 9648, "text_loss": 0.6050677299499512 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.305253889051954, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 2.380668736447239e-05, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 15559246.0, "repeat_count": 0.0, "routers_loss": 0.0014249378582462668, "skip_count": 0.0, "step": 9650, "text_loss": 0.9484158754348755 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 45.3146463164074, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 2.371240920290324e-05, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 15562251.0, "repeat_count": 1.0, "routers_loss": 0.00741320988163352, "skip_count": 4.0, "step": 9652, "text_loss": 0.24387991428375244 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 45.32403874376284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 2.361831355563726e-05, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 15565704.0, "repeat_count": 1.0, "routers_loss": 0.000942508690059185, "skip_count": 0.0, "step": 9654, "text_loss": 0.6523539423942566 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 45.333431171118285, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 2.352440045873233e-05, "loss": 0.0091, "macro_f1": 1.0, "num_tokens": 15568797.0, "repeat_count": 1.0, "routers_loss": 0.0064352210611104965, "skip_count": 4.0, "step": 9656, "text_loss": 0.3206343650817871 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 45.34282359847373, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 2.3430669948175943e-05, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 15571855.0, "repeat_count": 1.0, "routers_loss": 0.0013390982057899237, "skip_count": 0.0, "step": 9658, "text_loss": 0.8397402763366699 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.35221602582917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 2.3337122059885806e-05, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 15575379.0, "repeat_count": 0.0, "routers_loss": 0.0012212366564199328, "skip_count": 0.0, "step": 9660, "text_loss": 0.5116108655929565 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 2.324375682970975e-05, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 15578108.0, "repeat_count": 0.0, "routers_loss": 0.003829900873824954, "skip_count": 0.0, "step": 9662, "text_loss": 0.1423535794019699 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 45.371000880540066, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 2.3150574293425376e-05, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 15581830.0, "repeat_count": 1.0, "routers_loss": 0.012756838463246822, "skip_count": 1.0, "step": 9664, "text_loss": 0.24676625430583954 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 2.3057574486740507e-05, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 15584872.0, "repeat_count": 0.0, "routers_loss": 0.0020642473828047514, "skip_count": 0.0, "step": 9666, "text_loss": 0.4851650893688202 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.38978573525095, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0184326171875, "learning_rate": 2.2964757445292806e-05, "loss": 0.0029, "macro_f1": 1.0, "num_tokens": 15588000.0, "repeat_count": 2.0, "routers_loss": 0.007441115565598011, "skip_count": 3.0, "step": 9668, "text_loss": 0.6416954398155212 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.3991781626064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.017333984375, "learning_rate": 2.287212320464993e-05, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 15591065.0, "repeat_count": 0.0, "routers_loss": 0.0015504831681028008, "skip_count": 0.0, "step": 9670, "text_loss": 0.5852687358856201 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 45.40857058996184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 2.2779671800309433e-05, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 15594631.0, "repeat_count": 2.0, "routers_loss": 0.005648284684866667, "skip_count": 2.0, "step": 9672, "text_loss": 0.7172279357910156 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.41796301731729, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 2.2687403267699024e-05, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 15598664.0, "repeat_count": 1.0, "routers_loss": 0.003756999270990491, "skip_count": 2.0, "step": 9674, "text_loss": 0.18986566364765167 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.427355444672735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 2.259531764217604e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 15601616.0, "repeat_count": 0.0, "routers_loss": 0.002155672525987029, "skip_count": 0.0, "step": 9676, "text_loss": 0.4410690367221832 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 45.43674787202818, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 2.250341495902797e-05, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 15604291.0, "repeat_count": 1.0, "routers_loss": 0.0020037787035107613, "skip_count": 0.0, "step": 9678, "text_loss": 0.5565816164016724 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 2.241169525347203e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 15607203.0, "repeat_count": 0.0, "routers_loss": 0.0014305647928267717, "skip_count": 0.0, "step": 9680, "text_loss": 0.4879189729690552 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 45.455532726739065, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 2.2320158560655447e-05, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 15610475.0, "repeat_count": 1.0, "routers_loss": 0.016029199585318565, "skip_count": 3.0, "step": 9682, "text_loss": 0.36342933773994446 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 2.2228804915655153e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 15613810.0, "repeat_count": 0.0, "routers_loss": 0.0023584216833114624, "skip_count": 0.0, "step": 9684, "text_loss": 0.18480375409126282 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.47431758144996, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 2.2137634353478043e-05, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 15617854.0, "repeat_count": 0.0, "routers_loss": 0.004325680434703827, "skip_count": 1.0, "step": 9686, "text_loss": 0.5345974564552307 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 45.4837100088054, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02001953125, "learning_rate": 2.2046646909060996e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 15620874.0, "repeat_count": 3.0, "routers_loss": 0.006946994923055172, "skip_count": 0.0, "step": 9688, "text_loss": 0.29016008973121643 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.49310243616085, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 2.195584261727046e-05, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 15623875.0, "repeat_count": 0.0, "routers_loss": 0.0034732038620859385, "skip_count": 1.0, "step": 9690, "text_loss": 0.2831312119960785 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02392578125, "learning_rate": 2.1865221512902766e-05, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 15626371.0, "repeat_count": 0.0, "routers_loss": 0.002495788736268878, "skip_count": 1.0, "step": 9692, "text_loss": 0.6090453267097473 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 45.511887290871734, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 2.1774783630684246e-05, "loss": 0.0076, "macro_f1": 0.6598639488220215, "num_tokens": 15630129.0, "repeat_count": 3.0, "routers_loss": 0.017551302909851074, "skip_count": 1.0, "step": 9694, "text_loss": 0.5127915740013123 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 2.168452900527068e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 15633179.0, "repeat_count": 0.0, "routers_loss": 0.0004413482965901494, "skip_count": 0.0, "step": 9696, "text_loss": 0.5901434421539307 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.53067214558262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0308837890625, "learning_rate": 2.159445767124796e-05, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 15636508.0, "repeat_count": 0.0, "routers_loss": 0.005992567166686058, "skip_count": 1.0, "step": 9698, "text_loss": 0.8493689298629761 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 45.54006457293807, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.020751953125, "learning_rate": 2.1504569663131523e-05, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 15639371.0, "repeat_count": 1.0, "routers_loss": 0.0033268092665821314, "skip_count": 0.0, "step": 9700, "text_loss": 0.2814267873764038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 2.1414865015366548e-05, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 15643025.0, "repeat_count": 0.0, "routers_loss": 0.004418607335537672, "skip_count": 0.0, "step": 9702, "text_loss": 0.2619725167751312 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 45.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0220947265625, "learning_rate": 2.1325343762328197e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 15646996.0, "repeat_count": 0.0, "routers_loss": 0.0050115580670535564, "skip_count": 4.0, "step": 9704, "text_loss": 0.8204038143157959 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.5682418550044, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 2.123600593832109e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 15650194.0, "repeat_count": 0.0, "routers_loss": 0.0018730501178652048, "skip_count": 1.0, "step": 9706, "text_loss": 0.694500744342804 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.577634282359845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 2.1146851577579673e-05, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 15653743.0, "repeat_count": 0.0, "routers_loss": 0.0016657712403684855, "skip_count": 0.0, "step": 9708, "text_loss": 0.8211735486984253 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.58702670971529, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 2.1057880714268064e-05, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 15657325.0, "repeat_count": 0.0, "routers_loss": 0.0029736643191426992, "skip_count": 0.0, "step": 9710, "text_loss": 0.2846751809120178 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 45.59641913707074, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 2.0969093382479987e-05, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 15660522.0, "repeat_count": 1.0, "routers_loss": 0.01233653537929058, "skip_count": 4.0, "step": 9712, "text_loss": 0.23991759121418 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 2.0880489616239062e-05, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 15663254.0, "repeat_count": 0.0, "routers_loss": 0.0012792183551937342, "skip_count": 0.0, "step": 9714, "text_loss": 0.6943771243095398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.61520399178163, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 2.0792069449498297e-05, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 15666283.0, "repeat_count": 0.0, "routers_loss": 0.0033134319819509983, "skip_count": 0.0, "step": 9716, "text_loss": 0.4161235988140106 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 45.62459641913707, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 2.0703832916140476e-05, "loss": 0.0034, "macro_f1": 1.0, "num_tokens": 15669774.0, "repeat_count": 2.0, "routers_loss": 0.006201022770255804, "skip_count": 1.0, "step": 9718, "text_loss": 0.42691144347190857 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 2.061578004997805e-05, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 15672943.0, "repeat_count": 0.0, "routers_loss": 0.0033355073537677526, "skip_count": 1.0, "step": 9720, "text_loss": 0.9724727869033813 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 2.0527910884753033e-05, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 15677847.0, "repeat_count": 0.0, "routers_loss": 0.0019593657925724983, "skip_count": 0.0, "step": 9722, "text_loss": 0.417218416929245 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 45.65277370120341, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 2.0440225454137097e-05, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 15681460.0, "repeat_count": 0.0, "routers_loss": 0.007862947881221771, "skip_count": 2.0, "step": 9724, "text_loss": 0.24983589351177216 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 45.66216612855885, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 2.0352723791731364e-05, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 15685496.0, "repeat_count": 1.0, "routers_loss": 0.004811233840882778, "skip_count": 0.0, "step": 9726, "text_loss": 0.32930606603622437 }, { "acc_repeat": 0.0, "acc_skip": 0.8571428656578064, "avg_layers": 22.0, "epoch": 45.671558555914295, "f1_execute": 0.9767441749572754, "f1_repeat": 0.0, "f1_skip": 0.9230769276618958, "grad_norm": 0.045166015625, "learning_rate": 2.0265405931066626e-05, "loss": 0.0057, "macro_f1": 0.633273720741272, "num_tokens": 15688661.0, "repeat_count": 0.0, "routers_loss": 0.02648334763944149, "skip_count": 7.0, "step": 9728, "text_loss": 0.42316386103630066 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 45.68095098326974, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 2.0178271905603395e-05, "loss": 0.0054, "macro_f1": 0.6598639488220215, "num_tokens": 15692778.0, "repeat_count": 1.0, "routers_loss": 0.04439396783709526, "skip_count": 3.0, "step": 9730, "text_loss": 0.32248371839523315 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 45.69034341062518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 2.0091321748731517e-05, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 15695821.0, "repeat_count": 0.0, "routers_loss": 0.0020437403582036495, "skip_count": 2.0, "step": 9732, "text_loss": 0.5959160923957825 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.699735837980626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 2.000455549377045e-05, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 15699324.0, "repeat_count": 0.0, "routers_loss": 0.0002844796108547598, "skip_count": 0.0, "step": 9734, "text_loss": 0.45465928316116333 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.70912826533607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 1.9917973173969204e-05, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 15702044.0, "repeat_count": 0.0, "routers_loss": 0.003548701060935855, "skip_count": 0.0, "step": 9736, "text_loss": 0.7129027843475342 }, { "acc_repeat": 0.0, "acc_skip": 0.8333333134651184, "avg_layers": 23.0, "epoch": 45.71852069269152, "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.0279541015625, "learning_rate": 1.9831574822506248e-05, "loss": 0.0089, "macro_f1": 0.6289562582969666, "num_tokens": 15705474.0, "repeat_count": 0.0, "routers_loss": 0.023800918832421303, "skip_count": 6.0, "step": 9738, "text_loss": 0.28479668498039246 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 45.72791312004696, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 1.9745360472489648e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 15708323.0, "repeat_count": 0.0, "routers_loss": 0.01043168269097805, "skip_count": 2.0, "step": 9740, "text_loss": 0.4760739803314209 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 45.73730554740241, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 1.9659330156956867e-05, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 15711390.0, "repeat_count": 0.0, "routers_loss": 0.006430295296013355, "skip_count": 2.0, "step": 9742, "text_loss": 0.13933971524238586 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 45.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030029296875, "learning_rate": 1.957348390887487e-05, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 15714077.0, "repeat_count": 0.0, "routers_loss": 0.005738302133977413, "skip_count": 3.0, "step": 9744, "text_loss": 0.49661460518836975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.756090402113294, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 1.948782176114017e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 15716818.0, "repeat_count": 0.0, "routers_loss": 0.0011776578612625599, "skip_count": 0.0, "step": 9746, "text_loss": 0.36066678166389465 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 1.9402343746578567e-05, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 15720756.0, "repeat_count": 0.0, "routers_loss": 0.0005322427023202181, "skip_count": 0.0, "step": 9748, "text_loss": 0.5549091696739197 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.77487525682419, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 1.931704989794547e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 15724516.0, "repeat_count": 0.0, "routers_loss": 0.001399765140376985, "skip_count": 0.0, "step": 9750, "text_loss": 0.21269696950912476 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 1.9231940247925572e-05, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 15727142.0, "repeat_count": 0.0, "routers_loss": 0.0018337799701839685, "skip_count": 1.0, "step": 9752, "text_loss": 0.18105024099349976 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 1.914701482913317e-05, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 15730023.0, "repeat_count": 0.0, "routers_loss": 0.0010057559702545404, "skip_count": 0.0, "step": 9754, "text_loss": 0.477859228849411 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 45.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0198974609375, "learning_rate": 1.906227367411173e-05, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 15733108.0, "repeat_count": 0.0, "routers_loss": 0.002486895304173231, "skip_count": 3.0, "step": 9756, "text_loss": 0.4802452027797699 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 45.81244496624596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 1.8977716815334335e-05, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 15736130.0, "repeat_count": 1.0, "routers_loss": 0.004353616386651993, "skip_count": 0.0, "step": 9758, "text_loss": 0.5479429960250854 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.821837393601406, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 1.8893344285203228e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 15738691.0, "repeat_count": 0.0, "routers_loss": 0.0031500225886702538, "skip_count": 1.0, "step": 9760, "text_loss": 0.6871381402015686 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.83122982095686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 1.8809156116050164e-05, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 15741682.0, "repeat_count": 0.0, "routers_loss": 0.0023419202771037817, "skip_count": 0.0, "step": 9762, "text_loss": 0.6725277900695801 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.8406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0191650390625, "learning_rate": 1.8725152340136163e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 15745314.0, "repeat_count": 0.0, "routers_loss": 0.0018769606249406934, "skip_count": 0.0, "step": 9764, "text_loss": 0.4549144506454468 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.850014675667744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 1.864133298965176e-05, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 15747982.0, "repeat_count": 1.0, "routers_loss": 0.0030958254355937243, "skip_count": 2.0, "step": 9766, "text_loss": 0.4970727264881134 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.85940710302319, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 1.8557698096716534e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 15750453.0, "repeat_count": 0.0, "routers_loss": 0.0020812496077269316, "skip_count": 1.0, "step": 9768, "text_loss": 0.7540801167488098 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 1.847424769337963e-05, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 15753857.0, "repeat_count": 0.0, "routers_loss": 0.0031040434259921312, "skip_count": 0.0, "step": 9770, "text_loss": 0.5154248476028442 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.878191957734074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 1.8390981811619356e-05, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 15756742.0, "repeat_count": 0.0, "routers_loss": 0.002128311200067401, "skip_count": 0.0, "step": 9772, "text_loss": 0.7327702045440674 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.887584385089525, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 1.8307900483343354e-05, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 15759833.0, "repeat_count": 0.0, "routers_loss": 0.003279880853369832, "skip_count": 1.0, "step": 9774, "text_loss": 0.2673797607421875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.89697681244497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 1.8225003740388545e-05, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 15762768.0, "repeat_count": 0.0, "routers_loss": 0.004170822445303202, "skip_count": 0.0, "step": 9776, "text_loss": 0.1820847988128662 }, { "acc_repeat": 1.0, "acc_skip": 0.8888888955116272, "avg_layers": 21.0, "epoch": 45.90636923980041, "f1_execute": 0.9729729890823364, "f1_repeat": 1.0, "f1_skip": 0.9411765336990356, "grad_norm": 0.0194091796875, "learning_rate": 1.8142291614521132e-05, "loss": 0.0045, "macro_f1": 0.9713832139968872, "num_tokens": 15766965.0, "repeat_count": 1.0, "routers_loss": 0.022715313360095024, "skip_count": 9.0, "step": 9778, "text_loss": 0.5590897798538208 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 1.8059764137436596e-05, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 15770199.0, "repeat_count": 0.0, "routers_loss": 0.007280370220541954, "skip_count": 1.0, "step": 9780, "text_loss": 0.28117987513542175 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 1.7977421340759582e-05, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 15773367.0, "repeat_count": 0.0, "routers_loss": 0.003529706271365285, "skip_count": 0.0, "step": 9782, "text_loss": 0.18752245604991913 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.93454652186674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 1.7895263256044013e-05, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 15776976.0, "repeat_count": 0.0, "routers_loss": 0.0025916248559951782, "skip_count": 1.0, "step": 9784, "text_loss": 0.6330561637878418 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 45.943938949222186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.042724609375, "learning_rate": 1.781328991477299e-05, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 15780848.0, "repeat_count": 0.0, "routers_loss": 0.0049234069883823395, "skip_count": 1.0, "step": 9786, "text_loss": 0.15685316920280457 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 45.95333137657764, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 1.7731501348358882e-05, "loss": 0.0067, "macro_f1": 0.8823530077934265, "num_tokens": 15783808.0, "repeat_count": 2.0, "routers_loss": 0.011918511241674423, "skip_count": 1.0, "step": 9788, "text_loss": 0.23963648080825806 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 1.7649897588143226e-05, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 15787421.0, "repeat_count": 0.0, "routers_loss": 0.0018508053617551923, "skip_count": 0.0, "step": 9790, "text_loss": 0.49311593174934387 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 1.7568478665396736e-05, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 15790274.0, "repeat_count": 0.0, "routers_loss": 0.0006157457246445119, "skip_count": 0.0, "step": 9792, "text_loss": 0.4567435085773468 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 45.98150865864397, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 1.7487244611319285e-05, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 15794462.0, "repeat_count": 3.0, "routers_loss": 0.0031584864482283592, "skip_count": 0.0, "step": 9794, "text_loss": 0.4325876832008362 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 45.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 1.740619545703992e-05, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 15797775.0, "repeat_count": 0.0, "routers_loss": 0.0028455168940126896, "skip_count": 0.0, "step": 9796, "text_loss": 0.1487245261669159 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 46.0, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.06201171875, "learning_rate": 1.7325331233616847e-05, "loss": 0.0078, "macro_f1": 0.6122449040412903, "num_tokens": 15801092.0, "repeat_count": 0.0, "routers_loss": 0.02560117095708847, "skip_count": 4.0, "step": 9798, "text_loss": 0.5299228429794312 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 46.00939242735544, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 1.7244651972037284e-05, "loss": 0.0046, "macro_f1": 0.6598639488220215, "num_tokens": 15804049.0, "repeat_count": 1.0, "routers_loss": 0.010446238331496716, "skip_count": 3.0, "step": 9800, "text_loss": 0.6591248512268066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 1.7164157703217886e-05, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 15807683.0, "repeat_count": 0.0, "routers_loss": 0.0017791346181184053, "skip_count": 0.0, "step": 9802, "text_loss": 0.45421653985977173 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.02817728206633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 1.7083848458004035e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 15810743.0, "repeat_count": 0.0, "routers_loss": 0.0008831496234051883, "skip_count": 0.0, "step": 9804, "text_loss": 0.5535439848899841 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 1.7003724267170394e-05, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 15813880.0, "repeat_count": 0.0, "routers_loss": 0.002800740534439683, "skip_count": 0.0, "step": 9806, "text_loss": 0.5228974223136902 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 46.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 1.6923785161420845e-05, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 15816808.0, "repeat_count": 0.0, "routers_loss": 0.006823428440839052, "skip_count": 3.0, "step": 9808, "text_loss": 0.48018959164619446 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0255126953125, "learning_rate": 1.6844031171388052e-05, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 15819803.0, "repeat_count": 0.0, "routers_loss": 0.004808149300515652, "skip_count": 0.0, "step": 9810, "text_loss": 0.31094294786453247 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.06574699148811, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0223388671875, "learning_rate": 1.6764462327633955e-05, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 15822861.0, "repeat_count": 0.0, "routers_loss": 0.0026099751703441143, "skip_count": 0.0, "step": 9812, "text_loss": 0.5534207224845886 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.075139418843555, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0478515625, "learning_rate": 1.668507866064939e-05, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 15825960.0, "repeat_count": 1.0, "routers_loss": 0.008356450125575066, "skip_count": 2.0, "step": 9814, "text_loss": 0.40162262320518494 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.084531846199, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0198974609375, "learning_rate": 1.660588020085452e-05, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 15828906.0, "repeat_count": 0.0, "routers_loss": 0.006548966746777296, "skip_count": 2.0, "step": 9816, "text_loss": 0.2071811705827713 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.09392427355445, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 1.652686697859823e-05, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 15831935.0, "repeat_count": 0.0, "routers_loss": 0.0007895465241745114, "skip_count": 0.0, "step": 9818, "text_loss": 0.6879562735557556 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.10331670090989, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02294921875, "learning_rate": 1.6448039024158534e-05, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 15835745.0, "repeat_count": 1.0, "routers_loss": 0.00370208453387022, "skip_count": 2.0, "step": 9820, "text_loss": 0.6139163970947266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.11270912826534, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 1.6369396367742483e-05, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 15838373.0, "repeat_count": 0.0, "routers_loss": 0.002627170644700527, "skip_count": 0.0, "step": 9822, "text_loss": 0.3881947100162506 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.018798828125, "learning_rate": 1.6290939039486084e-05, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 15841156.0, "repeat_count": 0.0, "routers_loss": 0.005191941745579243, "skip_count": 2.0, "step": 9824, "text_loss": 0.6564247608184814 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 46.131493982976224, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 1.621266706945429e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 15843877.0, "repeat_count": 1.0, "routers_loss": 0.003889352548867464, "skip_count": 0.0, "step": 9826, "text_loss": 0.7128682136535645 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 46.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 1.6134580487641047e-05, "loss": 0.0031, "macro_f1": 0.6666666865348816, "num_tokens": 15846880.0, "repeat_count": 0.0, "routers_loss": 0.00674893194809556, "skip_count": 4.0, "step": 9828, "text_loss": 0.30893367528915405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.15027883768712, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 1.6056679323969425e-05, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 15850130.0, "repeat_count": 0.0, "routers_loss": 0.0009898045100271702, "skip_count": 0.0, "step": 9830, "text_loss": 0.6550688743591309 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 46.15967126504256, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 1.5978963608291154e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 15853578.0, "repeat_count": 1.0, "routers_loss": 0.0046016750857234, "skip_count": 0.0, "step": 9832, "text_loss": 0.43872204422950745 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02880859375, "learning_rate": 1.5901433370387132e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 15857939.0, "repeat_count": 0.0, "routers_loss": 0.004589201882481575, "skip_count": 1.0, "step": 9834, "text_loss": 0.41940808296203613 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0208740234375, "learning_rate": 1.5824088639967094e-05, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 15860584.0, "repeat_count": 0.0, "routers_loss": 0.0018899316200986505, "skip_count": 1.0, "step": 9836, "text_loss": 0.5105440616607666 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.18784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 1.5746929446669556e-05, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 15864386.0, "repeat_count": 0.0, "routers_loss": 0.0006366848247125745, "skip_count": 0.0, "step": 9838, "text_loss": 0.5686481595039368 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.197240974464336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.017333984375, "learning_rate": 1.5669955820062254e-05, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 15869103.0, "repeat_count": 0.0, "routers_loss": 0.0043256948702037334, "skip_count": 1.0, "step": 9840, "text_loss": 0.16309607028961182 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.20663340181978, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0216064453125, "learning_rate": 1.5593167789641483e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 15872384.0, "repeat_count": 0.0, "routers_loss": 0.00406000716611743, "skip_count": 1.0, "step": 9842, "text_loss": 0.21662485599517822 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 46.21602582917523, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.029541015625, "learning_rate": 1.551656538483259e-05, "loss": 0.0076, "macro_f1": 0.5492662787437439, "num_tokens": 15875261.0, "repeat_count": 0.0, "routers_loss": 0.020087692886590958, "skip_count": 2.0, "step": 9844, "text_loss": 0.6189377903938293 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.22541825653067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018310546875, "learning_rate": 1.5440148634989826e-05, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 15878132.0, "repeat_count": 0.0, "routers_loss": 0.0005302145145833492, "skip_count": 0.0, "step": 9846, "text_loss": 0.34496018290519714 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.23481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04443359375, "learning_rate": 1.536391756939609e-05, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 15881381.0, "repeat_count": 0.0, "routers_loss": 0.008405420929193497, "skip_count": 2.0, "step": 9848, "text_loss": 0.2865080237388611 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0213623046875, "learning_rate": 1.528787221726341e-05, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 15884621.0, "repeat_count": 0.0, "routers_loss": 0.0016017532907426357, "skip_count": 0.0, "step": 9850, "text_loss": 0.6104921102523804 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.253595538597004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018310546875, "learning_rate": 1.5212012607732528e-05, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 15888157.0, "repeat_count": 0.0, "routers_loss": 0.0015318389050662518, "skip_count": 0.0, "step": 9852, "text_loss": 0.2622036933898926 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.26298796595245, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 1.5136338769872915e-05, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 15891080.0, "repeat_count": 2.0, "routers_loss": 0.006494096480309963, "skip_count": 4.0, "step": 9854, "text_loss": 0.23415961861610413 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.2723803933079, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 1.5060850732682928e-05, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 15895486.0, "repeat_count": 2.0, "routers_loss": 0.007511078380048275, "skip_count": 3.0, "step": 9856, "text_loss": 0.7389219999313354 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 1.4985548525089709e-05, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 15898747.0, "repeat_count": 0.0, "routers_loss": 0.004874013364315033, "skip_count": 2.0, "step": 9858, "text_loss": 0.6853085160255432 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0203857421875, "learning_rate": 1.4910432175949285e-05, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 15902157.0, "repeat_count": 0.0, "routers_loss": 0.0009244410903193057, "skip_count": 0.0, "step": 9860, "text_loss": 0.8172202110290527 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 46.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 1.4835501714046296e-05, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 15905012.0, "repeat_count": 0.0, "routers_loss": 0.00456853536888957, "skip_count": 3.0, "step": 9862, "text_loss": 0.7527797818183899 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.30995010272967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 1.4760757168094275e-05, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 15908302.0, "repeat_count": 0.0, "routers_loss": 0.0009686833946034312, "skip_count": 0.0, "step": 9864, "text_loss": 0.5548131465911865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.319342530085116, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 1.4686198566735531e-05, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 15911923.0, "repeat_count": 0.0, "routers_loss": 0.0008255072170868516, "skip_count": 0.0, "step": 9866, "text_loss": 0.5995872020721436 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.32873495744057, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 1.4611825938540935e-05, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 15914858.0, "repeat_count": 0.0, "routers_loss": 0.002459712326526642, "skip_count": 0.0, "step": 9868, "text_loss": 0.6777655482292175 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.33812738479601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.017578125, "learning_rate": 1.4537639312010298e-05, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 15918091.0, "repeat_count": 0.0, "routers_loss": 0.0014664786867797375, "skip_count": 0.0, "step": 9870, "text_loss": 0.42750120162963867 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 46.347519812151454, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 1.4463638715572103e-05, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 15920943.0, "repeat_count": 1.0, "routers_loss": 0.005549794062972069, "skip_count": 1.0, "step": 9872, "text_loss": 0.27477580308914185 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.3569122395069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 1.4389824177583388e-05, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 15924212.0, "repeat_count": 0.0, "routers_loss": 0.007967505604028702, "skip_count": 2.0, "step": 9874, "text_loss": 0.3174900412559509 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 1.4316195726330139e-05, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 15929143.0, "repeat_count": 0.0, "routers_loss": 0.0014913028571754694, "skip_count": 2.0, "step": 9876, "text_loss": 0.40919792652130127 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.375697094217784, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 1.4242753390026953e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 15931702.0, "repeat_count": 0.0, "routers_loss": 0.0003994424478150904, "skip_count": 0.0, "step": 9878, "text_loss": 0.35346853733062744 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.385089521573235, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0201416015625, "learning_rate": 1.4169497196816983e-05, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 15935225.0, "repeat_count": 1.0, "routers_loss": 0.008424114435911179, "skip_count": 3.0, "step": 9880, "text_loss": 0.230825275182724 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.39448194892868, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 1.4096427174772164e-05, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 15938630.0, "repeat_count": 0.0, "routers_loss": 0.004314251709729433, "skip_count": 1.0, "step": 9882, "text_loss": 0.8749642968177795 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 1.4023543351893043e-05, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 15941779.0, "repeat_count": 0.0, "routers_loss": 0.0008999531855806708, "skip_count": 0.0, "step": 9884, "text_loss": 0.6549318432807922 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 1.3950845756108943e-05, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 15944779.0, "repeat_count": 0.0, "routers_loss": 0.0010829231468960643, "skip_count": 0.0, "step": 9886, "text_loss": 0.5681273341178894 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 1.3878334415277583e-05, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 15947757.0, "repeat_count": 0.0, "routers_loss": 0.0038863453082740307, "skip_count": 1.0, "step": 9888, "text_loss": 0.4282133877277374 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 46.43205165835045, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.017822265625, "learning_rate": 1.3806009357185512e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 15952223.0, "repeat_count": 1.0, "routers_loss": 0.0006428947090171278, "skip_count": 0.0, "step": 9890, "text_loss": 0.4455379247665405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.441444085705896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 1.3733870609547838e-05, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 15955968.0, "repeat_count": 0.0, "routers_loss": 0.00048406270798295736, "skip_count": 0.0, "step": 9892, "text_loss": 0.37554407119750977 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.45083651306135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 1.3661918200008228e-05, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 15959376.0, "repeat_count": 0.0, "routers_loss": 0.004503594245761633, "skip_count": 1.0, "step": 9894, "text_loss": 0.22027169167995453 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 1.3590152156139012e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 15962882.0, "repeat_count": 0.0, "routers_loss": 0.0011738749453797936, "skip_count": 0.0, "step": 9896, "text_loss": 0.4203954041004181 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.469621367772234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 1.3518572505440973e-05, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 15965816.0, "repeat_count": 1.0, "routers_loss": 0.00806320272386074, "skip_count": 2.0, "step": 9898, "text_loss": 0.18884631991386414 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.47901379512768, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0283203125, "learning_rate": 1.3447179275343779e-05, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 15968840.0, "repeat_count": 0.0, "routers_loss": 0.004962162580341101, "skip_count": 1.0, "step": 9900, "text_loss": 0.22457796335220337 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.48840622248312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 1.3375972493205268e-05, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 15972768.0, "repeat_count": 0.0, "routers_loss": 0.0025535912718623877, "skip_count": 0.0, "step": 9902, "text_loss": 0.14859545230865479 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.497798649838565, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 1.3304952186312114e-05, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 15975380.0, "repeat_count": 0.0, "routers_loss": 0.002036662772297859, "skip_count": 0.0, "step": 9904, "text_loss": 0.5820382833480835 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.507191077194015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0191650390625, "learning_rate": 1.3234118381879378e-05, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 15978335.0, "repeat_count": 0.0, "routers_loss": 0.0055219330824911594, "skip_count": 2.0, "step": 9906, "text_loss": 0.29671815037727356 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.51658350454946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 1.316347110705074e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 15982003.0, "repeat_count": 0.0, "routers_loss": 0.005196230486035347, "skip_count": 0.0, "step": 9908, "text_loss": 0.5204919576644897 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.5259759319049, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.023193359375, "learning_rate": 1.3093010388898319e-05, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 15984937.0, "repeat_count": 1.0, "routers_loss": 0.0032779101748019457, "skip_count": 2.0, "step": 9910, "text_loss": 0.6803483366966248 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0208740234375, "learning_rate": 1.3022736254422851e-05, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 15988992.0, "repeat_count": 0.0, "routers_loss": 0.002347869798541069, "skip_count": 0.0, "step": 9912, "text_loss": 0.5335546731948853 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 1.2952648730553462e-05, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 15992828.0, "repeat_count": 0.0, "routers_loss": 0.0011128517799079418, "skip_count": 0.0, "step": 9914, "text_loss": 0.686739981174469 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.55415321397123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 1.288274784414789e-05, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 15995984.0, "repeat_count": 0.0, "routers_loss": 0.0031158174388110638, "skip_count": 0.0, "step": 9916, "text_loss": 0.16102474927902222 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.563545641326684, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.059814453125, "learning_rate": 1.2813033621992264e-05, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 15999606.0, "repeat_count": 0.0, "routers_loss": 0.0029228583443909883, "skip_count": 1.0, "step": 9918, "text_loss": 0.6022558212280273 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.57293806868213, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 1.274350609080116e-05, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 16002456.0, "repeat_count": 0.0, "routers_loss": 0.0031404250767081976, "skip_count": 2.0, "step": 9920, "text_loss": 0.7529577016830444 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 1.2674165277217653e-05, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 16005547.0, "repeat_count": 0.0, "routers_loss": 0.0038669302593916655, "skip_count": 0.0, "step": 9922, "text_loss": 0.47488540410995483 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 1.2605011207813378e-05, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 16009520.0, "repeat_count": 0.0, "routers_loss": 0.004838052671402693, "skip_count": 0.0, "step": 9924, "text_loss": 0.5252779722213745 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 46.60111535074846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.030517578125, "learning_rate": 1.2536043909088191e-05, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 16012730.0, "repeat_count": 1.0, "routers_loss": 0.0017430823063477874, "skip_count": 0.0, "step": 9926, "text_loss": 0.40845534205436707 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.6105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0167236328125, "learning_rate": 1.2467263407470619e-05, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 16015940.0, "repeat_count": 0.0, "routers_loss": 0.0010244545992463827, "skip_count": 0.0, "step": 9928, "text_loss": 0.8465730547904968 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.619900205459345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 1.2398669729317357e-05, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 16018851.0, "repeat_count": 0.0, "routers_loss": 0.0007380630704574287, "skip_count": 0.0, "step": 9930, "text_loss": 0.37603214383125305 }, { "acc_repeat": 0.800000011920929, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.629292632814796, "f1_execute": 0.9729729890823364, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 1.2330262900913657e-05, "loss": 0.0087, "macro_f1": 0.9539539813995361, "num_tokens": 16022351.0, "repeat_count": 5.0, "routers_loss": 0.053848277777433395, "skip_count": 5.0, "step": 9932, "text_loss": 0.2047014981508255 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 46.63868506017024, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 1.2262042948473163e-05, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 16024902.0, "repeat_count": 1.0, "routers_loss": 0.0020845322869718075, "skip_count": 0.0, "step": 9934, "text_loss": 0.6269918084144592 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.64807748752568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 1.2194009898137903e-05, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 16028056.0, "repeat_count": 0.0, "routers_loss": 0.0008686805376783013, "skip_count": 0.0, "step": 9936, "text_loss": 0.4100899398326874 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 46.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 1.212616377597825e-05, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 16032111.0, "repeat_count": 0.0, "routers_loss": 0.004883588291704655, "skip_count": 3.0, "step": 9938, "text_loss": 0.3921346664428711 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.66686234223657, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 1.2058504607993015e-05, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 16035872.0, "repeat_count": 0.0, "routers_loss": 0.0005067490856163204, "skip_count": 0.0, "step": 9940, "text_loss": 0.44368258118629456 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.67625476959201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06787109375, "learning_rate": 1.1991032420109238e-05, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 16038923.0, "repeat_count": 0.0, "routers_loss": 0.005819452460855246, "skip_count": 2.0, "step": 9942, "text_loss": 0.27500197291374207 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.685647196947464, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 1.1923747238182403e-05, "loss": 0.0059, "macro_f1": 0.8817967176437378, "num_tokens": 16041803.0, "repeat_count": 2.0, "routers_loss": 0.035794492810964584, "skip_count": 3.0, "step": 9944, "text_loss": 0.5083543062210083 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.69503962430291, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 1.1856649087996384e-05, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 16045258.0, "repeat_count": 1.0, "routers_loss": 0.002845201175659895, "skip_count": 2.0, "step": 9946, "text_loss": 0.6859534382820129 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 1.1789737995263228e-05, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 16048618.0, "repeat_count": 0.0, "routers_loss": 0.0007575460476800799, "skip_count": 0.0, "step": 9948, "text_loss": 0.4512535333633423 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043212890625, "learning_rate": 1.1723013985623477e-05, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 16051595.0, "repeat_count": 0.0, "routers_loss": 0.002697878750041127, "skip_count": 1.0, "step": 9950, "text_loss": 0.3572070300579071 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 1.16564770846459e-05, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 16054494.0, "repeat_count": 0.0, "routers_loss": 0.0062429774552583694, "skip_count": 1.0, "step": 9952, "text_loss": 0.5479834079742432 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.73260933372468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 1.1590127317827492e-05, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 16057555.0, "repeat_count": 0.0, "routers_loss": 0.0009302232647314668, "skip_count": 0.0, "step": 9954, "text_loss": 0.44800761342048645 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 1.1523964710593637e-05, "loss": 0.0032, "macro_f1": 0.3333333432674408, "num_tokens": 16061072.0, "repeat_count": 0.0, "routers_loss": 0.002112898975610733, "skip_count": 0.0, "step": 9956, "text_loss": 0.3274081349372864 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.751394188435576, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 1.1457989288297942e-05, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 16064165.0, "repeat_count": 0.0, "routers_loss": 0.00028447998920455575, "skip_count": 0.0, "step": 9958, "text_loss": 0.5712385773658752 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.76078661579102, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 1.1392201076222352e-05, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 16067293.0, "repeat_count": 1.0, "routers_loss": 0.009599249809980392, "skip_count": 2.0, "step": 9960, "text_loss": 0.26818037033081055 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.77017904314646, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 1.132660009957709e-05, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 16069852.0, "repeat_count": 0.0, "routers_loss": 0.005338563583791256, "skip_count": 0.0, "step": 9962, "text_loss": 0.6658869981765747 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.77957147050191, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0203857421875, "learning_rate": 1.1261186383500487e-05, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 16072633.0, "repeat_count": 0.0, "routers_loss": 0.001175224082544446, "skip_count": 1.0, "step": 9964, "text_loss": 0.4461731016635895 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 46.78896389785735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 1.1195959953059221e-05, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 16076065.0, "repeat_count": 1.0, "routers_loss": 0.0036650802940130234, "skip_count": 0.0, "step": 9966, "text_loss": 0.6107141971588135 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 46.798356325212794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0302734375, "learning_rate": 1.113092083324818e-05, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 16079309.0, "repeat_count": 0.0, "routers_loss": 0.005924097262322903, "skip_count": 2.0, "step": 9968, "text_loss": 0.5104627013206482 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 46.807748752568244, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 1.1066069048990545e-05, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 16082180.0, "repeat_count": 3.0, "routers_loss": 0.010777595452964306, "skip_count": 0.0, "step": 9970, "text_loss": 0.5205907225608826 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.81714117992369, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056640625, "learning_rate": 1.100140462513749e-05, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 16084654.0, "repeat_count": 0.0, "routers_loss": 0.0019593914039433002, "skip_count": 0.0, "step": 9972, "text_loss": 0.36411789059638977 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 46.82653360727913, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0255126953125, "learning_rate": 1.0936927586468693e-05, "loss": 0.0048, "macro_f1": 0.9452888369560242, "num_tokens": 16087736.0, "repeat_count": 1.0, "routers_loss": 0.0233579371124506, "skip_count": 4.0, "step": 9974, "text_loss": 0.267604261636734 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 1.0872637957691833e-05, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 16090838.0, "repeat_count": 0.0, "routers_loss": 0.00034629934816621244, "skip_count": 0.0, "step": 9976, "text_loss": 0.576068103313446 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 1.0808535763442761e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 16094084.0, "repeat_count": 0.0, "routers_loss": 0.0004253332444932312, "skip_count": 0.0, "step": 9978, "text_loss": 0.5883988738059998 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.85471088934546, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 1.0744621028285662e-05, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 16097432.0, "repeat_count": 0.0, "routers_loss": 0.0005800648941658437, "skip_count": 0.0, "step": 9980, "text_loss": 0.3358926475048065 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 46.86410331670091, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 1.068089377671272e-05, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 16100711.0, "repeat_count": 1.0, "routers_loss": 0.0015245937975123525, "skip_count": 0.0, "step": 9982, "text_loss": 0.6802405714988708 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 1.061735403314429e-05, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 16103952.0, "repeat_count": 0.0, "routers_loss": 0.002281307242810726, "skip_count": 1.0, "step": 9984, "text_loss": 0.3086298406124115 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0191650390625, "learning_rate": 1.055400182192906e-05, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 16107101.0, "repeat_count": 0.0, "routers_loss": 0.0007910717977210879, "skip_count": 0.0, "step": 9986, "text_loss": 0.7036139965057373 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 46.89228059876724, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 1.0490837167343559e-05, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 16110316.0, "repeat_count": 1.0, "routers_loss": 0.0030006880406290293, "skip_count": 1.0, "step": 9988, "text_loss": 0.4638058841228485 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.90167302612269, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 1.04278600935927e-05, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 16113206.0, "repeat_count": 0.0, "routers_loss": 0.0006434856331907213, "skip_count": 0.0, "step": 9990, "text_loss": 0.6155068874359131 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.91106545347813, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 1.0365070624809403e-05, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 16116098.0, "repeat_count": 0.0, "routers_loss": 0.0007891099085099995, "skip_count": 0.0, "step": 9992, "text_loss": 0.4537872076034546 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 46.92045788083358, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0419921875, "learning_rate": 1.0302468785054641e-05, "loss": 0.0054, "macro_f1": 0.8823530077934265, "num_tokens": 16119344.0, "repeat_count": 2.0, "routers_loss": 0.011918486095964909, "skip_count": 1.0, "step": 9994, "text_loss": 0.18828579783439636 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.929850308189025, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 1.0240054598317672e-05, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 16122615.0, "repeat_count": 1.0, "routers_loss": 0.016306765377521515, "skip_count": 2.0, "step": 9996, "text_loss": 0.2876183092594147 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 46.93924273554447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 1.0177828088515694e-05, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 16125506.0, "repeat_count": 0.0, "routers_loss": 0.00393108231946826, "skip_count": 1.0, "step": 9998, "text_loss": 0.6387818455696106 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 46.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 1.011578927949397e-05, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 16128499.0, "repeat_count": 0.0, "routers_loss": 0.001175055862404406, "skip_count": 0.0, "step": 10000, "text_loss": 0.4085952639579773 } ], "logging_steps": 2, "max_steps": 10650, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.738398356854296e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }