{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 28.169063692398005, "eval_steps": 500, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.009392427355444672, "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.25, "learning_rate": 2e-06, "loss": 0.4974, "macro_f1": 0.23255813121795654, "num_tokens": 3175.0, "repeat_count": 0.0, "routers_loss": 0.4339469373226166, "skip_count": 0.0, "step": 2, "text_loss": 0.3330848515033722 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 23.0, "epoch": 0.018784854710889344, "f1_execute": 0.7272726893424988, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8359375, "learning_rate": 6e-06, "loss": 0.4988, "macro_f1": 0.24242423474788666, "num_tokens": 5816.0, "repeat_count": 0.0, "routers_loss": 0.4511934816837311, "skip_count": 1.0, "step": 4, "text_loss": 0.4571273922920227 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.02817728206633402, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.234375, "learning_rate": 1e-05, "loss": 0.5113, "macro_f1": 0.222222238779068, "num_tokens": 9739.0, "repeat_count": 0.0, "routers_loss": 0.49306994676589966, "skip_count": 0.0, "step": 6, "text_loss": 0.41060560941696167 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.03756970942177869, "f1_execute": 0.5641025900840759, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7265625, "learning_rate": 1.4e-05, "loss": 0.4766, "macro_f1": 0.18803420662879944, "num_tokens": 12869.0, "repeat_count": 1.0, "routers_loss": 0.48872503638267517, "skip_count": 2.0, "step": 8, "text_loss": 0.36678561568260193 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.046962136777223364, "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.78125, "learning_rate": 1.8e-05, "loss": 0.4806, "macro_f1": 0.23255813121795654, "num_tokens": 15845.0, "repeat_count": 0.0, "routers_loss": 0.45077216625213623, "skip_count": 0.0, "step": 10, "text_loss": 0.5597779154777527 }, { "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 0.05635456413266804, "f1_execute": 0.7179487347602844, "f1_repeat": 0.2857142984867096, "f1_skip": 0.20000000298023224, "grad_norm": 1.5390625, "learning_rate": 2.2e-05, "loss": 0.4557, "macro_f1": 0.40122103691101074, "num_tokens": 19353.0, "repeat_count": 2.0, "routers_loss": 0.4130440056324005, "skip_count": 3.0, "step": 12, "text_loss": 0.2056603729724884 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.06574699148811271, "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.4375, "learning_rate": 2.6e-05, "loss": 0.5129, "macro_f1": 0.23255813121795654, "num_tokens": 22675.0, "repeat_count": 0.0, "routers_loss": 0.4582902193069458, "skip_count": 0.0, "step": 14, "text_loss": 0.32989829778671265 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 0.07513941884355738, "f1_execute": 0.6829268336296082, "f1_repeat": 0.0, "f1_skip": 0.2222222238779068, "grad_norm": 1.7421875, "learning_rate": 3e-05, "loss": 0.4729, "macro_f1": 0.3017163574695587, "num_tokens": 26022.0, "repeat_count": 0.0, "routers_loss": 0.42910993099212646, "skip_count": 1.0, "step": 16, "text_loss": 0.1353905349969864 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.08453184619900206, "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4765625, "learning_rate": 3.4000000000000007e-05, "loss": 0.4274, "macro_f1": 0.2518518567085266, "num_tokens": 29251.0, "repeat_count": 0.0, "routers_loss": 0.3990713059902191, "skip_count": 0.0, "step": 18, "text_loss": 0.3806765377521515 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.09392427355444673, "f1_execute": 0.6829268336296082, "f1_repeat": 0.2857142984867096, "f1_skip": 0.0, "grad_norm": 1.3125, "learning_rate": 3.8e-05, "loss": 0.4261, "macro_f1": 0.3228803873062134, "num_tokens": 32545.0, "repeat_count": 1.0, "routers_loss": 0.40146592259407043, "skip_count": 0.0, "step": 20, "text_loss": 0.25648367404937744 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.1033167009098914, "f1_execute": 0.7272727489471436, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.625, "learning_rate": 4.2000000000000004e-05, "loss": 0.404, "macro_f1": 0.24242424964904785, "num_tokens": 36560.0, "repeat_count": 0.0, "routers_loss": 0.372715026140213, "skip_count": 0.0, "step": 22, "text_loss": 0.2799522578716278 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.11270912826533608, "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6328125, "learning_rate": 4.6e-05, "loss": 0.4218, "macro_f1": 0.2518518567085266, "num_tokens": 39597.0, "repeat_count": 0.0, "routers_loss": 0.4504941403865814, "skip_count": 0.0, "step": 24, "text_loss": 0.6635695695877075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.12210155562078075, "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7109375, "learning_rate": 5e-05, "loss": 0.3886, "macro_f1": 0.26950353384017944, "num_tokens": 43080.0, "repeat_count": 0.0, "routers_loss": 0.3498791456222534, "skip_count": 0.0, "step": 26, "text_loss": 0.7035041451454163 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.13149398297622542, "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.34375, "learning_rate": 5.4e-05, "loss": 0.3724, "macro_f1": 0.26950353384017944, "num_tokens": 46406.0, "repeat_count": 0.0, "routers_loss": 0.31265875697135925, "skip_count": 0.0, "step": 28, "text_loss": 0.6388277411460876 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.1408864103316701, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2578125, "learning_rate": 5.800000000000001e-05, "loss": 0.341, "macro_f1": 0.2857142686843872, "num_tokens": 49966.0, "repeat_count": 0.0, "routers_loss": 0.3200918138027191, "skip_count": 2.0, "step": 30, "text_loss": 0.17372547090053558 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.15027883768711475, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4140625, "learning_rate": 6.2e-05, "loss": 0.3207, "macro_f1": 0.2857142686843872, "num_tokens": 53378.0, "repeat_count": 1.0, "routers_loss": 0.32304447889328003, "skip_count": 1.0, "step": 32, "text_loss": 0.18196581304073334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.15967126504255943, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.46875, "learning_rate": 6.6e-05, "loss": 0.3304, "macro_f1": 0.3006536364555359, "num_tokens": 56933.0, "repeat_count": 0.0, "routers_loss": 0.24814388155937195, "skip_count": 0.0, "step": 34, "text_loss": 0.28823015093803406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.16906369239800412, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1171875, "learning_rate": 7.000000000000001e-05, "loss": 0.2778, "macro_f1": 0.3006536066532135, "num_tokens": 60744.0, "repeat_count": 1.0, "routers_loss": 0.22411039471626282, "skip_count": 0.0, "step": 36, "text_loss": 0.5260357856750488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.17845611975344877, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.484375, "learning_rate": 7.4e-05, "loss": 0.2738, "macro_f1": 0.2857142984867096, "num_tokens": 64900.0, "repeat_count": 0.0, "routers_loss": 0.44355395436286926, "skip_count": 0.0, "step": 38, "text_loss": 0.5382097363471985 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.18784854710889345, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3828125, "learning_rate": 7.8e-05, "loss": 0.2137, "macro_f1": 0.3076923191547394, "num_tokens": 68000.0, "repeat_count": 0.0, "routers_loss": 0.202330082654953, "skip_count": 0.0, "step": 40, "text_loss": 0.5946118831634521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.19724097446433814, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.78125, "learning_rate": 8.2e-05, "loss": 0.21, "macro_f1": 0.3144654333591461, "num_tokens": 70529.0, "repeat_count": 0.0, "routers_loss": 0.18023855984210968, "skip_count": 0.0, "step": 42, "text_loss": 0.5550904273986816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2066334018197828, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.609375, "learning_rate": 8.599999999999999e-05, "loss": 0.1918, "macro_f1": 0.32098764181137085, "num_tokens": 73427.0, "repeat_count": 2.0, "routers_loss": 0.2101590931415558, "skip_count": 0.0, "step": 44, "text_loss": 0.4636923372745514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.21602582917522747, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.53125, "learning_rate": 8.999999999999999e-05, "loss": 0.1881, "macro_f1": 0.3333333432674408, "num_tokens": 76472.0, "repeat_count": 0.0, "routers_loss": 0.11800424009561539, "skip_count": 0.0, "step": 46, "text_loss": 0.4187001883983612 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.22541825653067216, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.953125, "learning_rate": 9.400000000000001e-05, "loss": 0.1446, "macro_f1": 0.3272727429866791, "num_tokens": 79124.0, "repeat_count": 1.0, "routers_loss": 0.11632519960403442, "skip_count": 0.0, "step": 48, "text_loss": 0.2253919243812561 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.2348106838861168, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.58984375, "learning_rate": 9.800000000000001e-05, "loss": 0.1543, "macro_f1": 0.32098767161369324, "num_tokens": 81980.0, "repeat_count": 1.0, "routers_loss": 0.09669367223978043, "skip_count": 0.0, "step": 50, "text_loss": 0.6053179502487183 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.2442031112415615, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8515625, "learning_rate": 0.000102, "loss": 0.1393, "macro_f1": 0.32098764181137085, "num_tokens": 85236.0, "repeat_count": 0.0, "routers_loss": 0.12471720576286316, "skip_count": 0.0, "step": 52, "text_loss": 0.6027331948280334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2535955385970062, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.421875, "learning_rate": 0.000106, "loss": 0.1473, "macro_f1": 0.32098764181137085, "num_tokens": 88238.0, "repeat_count": 0.0, "routers_loss": 0.1376056969165802, "skip_count": 2.0, "step": 54, "text_loss": 0.2861751616001129 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.26298796595245083, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.35546875, "learning_rate": 0.00011, "loss": 0.1082, "macro_f1": 0.3333333432674408, "num_tokens": 91056.0, "repeat_count": 0.0, "routers_loss": 0.07449393719434738, "skip_count": 0.0, "step": 56, "text_loss": 0.48106974363327026 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.2723803933078955, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.271484375, "learning_rate": 0.000114, "loss": 0.1123, "macro_f1": 0.32098764181137085, "num_tokens": 94987.0, "repeat_count": 0.0, "routers_loss": 0.07064720243215561, "skip_count": 0.0, "step": 58, "text_loss": 0.3554874658584595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2817728206633402, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5390625, "learning_rate": 0.000118, "loss": 0.1234, "macro_f1": 0.32098764181137085, "num_tokens": 97909.0, "repeat_count": 0.0, "routers_loss": 0.16835889220237732, "skip_count": 2.0, "step": 60, "text_loss": 0.5475804805755615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.29116524801878485, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.000122, "loss": 0.1224, "macro_f1": 0.3333333432674408, "num_tokens": 101043.0, "repeat_count": 0.0, "routers_loss": 0.06127442046999931, "skip_count": 0.0, "step": 62, "text_loss": 0.5966938734054565 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3005576753742295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.000126, "loss": 0.0931, "macro_f1": 0.3333333432674408, "num_tokens": 104103.0, "repeat_count": 0.0, "routers_loss": 0.047825805842876434, "skip_count": 0.0, "step": 64, "text_loss": 0.5480486750602722 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3099501027296742, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.00013000000000000002, "loss": 0.1088, "macro_f1": 0.3006536364555359, "num_tokens": 107009.0, "repeat_count": 1.0, "routers_loss": 0.275174081325531, "skip_count": 4.0, "step": 66, "text_loss": 0.41714492440223694 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.31934253008511887, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.000134, "loss": 0.1123, "macro_f1": 0.3333333432674408, "num_tokens": 110486.0, "repeat_count": 0.0, "routers_loss": 0.029025178402662277, "skip_count": 0.0, "step": 68, "text_loss": 0.6775627732276917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3287349574405635, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.314453125, "learning_rate": 0.00013800000000000002, "loss": 0.1049, "macro_f1": 0.3272727429866791, "num_tokens": 113878.0, "repeat_count": 0.0, "routers_loss": 0.10141710191965103, "skip_count": 1.0, "step": 70, "text_loss": 0.6678873896598816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.33812738479600823, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.248046875, "learning_rate": 0.00014199999999999998, "loss": 0.1119, "macro_f1": 0.3272727429866791, "num_tokens": 116989.0, "repeat_count": 0.0, "routers_loss": 0.08002066612243652, "skip_count": 1.0, "step": 72, "text_loss": 0.405692994594574 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3475198121514529, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1787109375, "learning_rate": 0.000146, "loss": 0.0944, "macro_f1": 0.3144654333591461, "num_tokens": 119883.0, "repeat_count": 0.0, "routers_loss": 0.1867009848356247, "skip_count": 3.0, "step": 74, "text_loss": 0.44616150856018066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.35691223950689754, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.333984375, "learning_rate": 0.00015, "loss": 0.1003, "macro_f1": 0.32098764181137085, "num_tokens": 123325.0, "repeat_count": 0.0, "routers_loss": 0.07042168825864792, "skip_count": 2.0, "step": 76, "text_loss": 0.11340200901031494 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.36630466686234225, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26171875, "learning_rate": 0.000154, "loss": 0.1066, "macro_f1": 0.32098764181137085, "num_tokens": 126131.0, "repeat_count": 0.0, "routers_loss": 0.11535373330116272, "skip_count": 2.0, "step": 78, "text_loss": 0.3269135355949402 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3756970942177869, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.000158, "loss": 0.0891, "macro_f1": 0.3272727429866791, "num_tokens": 130349.0, "repeat_count": 0.0, "routers_loss": 0.09497501701116562, "skip_count": 1.0, "step": 80, "text_loss": 0.15273472666740417 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.38508952157323156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.000162, "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 133607.0, "repeat_count": 0.0, "routers_loss": 0.030639523640275, "skip_count": 0.0, "step": 82, "text_loss": 0.282884806394577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3944819489286763, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.00016600000000000002, "loss": 0.1254, "macro_f1": 0.3272727429866791, "num_tokens": 136694.0, "repeat_count": 0.0, "routers_loss": 0.07906441390514374, "skip_count": 1.0, "step": 84, "text_loss": 0.459094375371933 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.40387437628412093, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.00017, "loss": 0.1071, "macro_f1": 0.3144654333591461, "num_tokens": 139966.0, "repeat_count": 1.0, "routers_loss": 0.1124570444226265, "skip_count": 2.0, "step": 86, "text_loss": 0.29985448718070984 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4132668036395656, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25390625, "learning_rate": 0.000174, "loss": 0.1031, "macro_f1": 0.32098764181137085, "num_tokens": 142788.0, "repeat_count": 2.0, "routers_loss": 0.1966402679681778, "skip_count": 0.0, "step": 88, "text_loss": 0.6435291767120361 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4226592309950103, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.349609375, "learning_rate": 0.000178, "loss": 0.0963, "macro_f1": 0.3333333432674408, "num_tokens": 146192.0, "repeat_count": 0.0, "routers_loss": 0.0325632207095623, "skip_count": 0.0, "step": 90, "text_loss": 0.35170626640319824 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.43205165835045495, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2265625, "learning_rate": 0.000182, "loss": 0.1073, "macro_f1": 0.32098764181137085, "num_tokens": 149792.0, "repeat_count": 1.0, "routers_loss": 0.15115146338939667, "skip_count": 1.0, "step": 92, "text_loss": 0.83159339427948 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4414440857058996, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.205078125, "learning_rate": 0.000186, "loss": 0.1073, "macro_f1": 0.3333333432674408, "num_tokens": 152766.0, "repeat_count": 0.0, "routers_loss": 0.043313540518283844, "skip_count": 0.0, "step": 94, "text_loss": 0.49707934260368347 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4508365130613443, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.00019, "loss": 0.0947, "macro_f1": 0.3333333432674408, "num_tokens": 156112.0, "repeat_count": 0.0, "routers_loss": 0.032021280378103256, "skip_count": 0.0, "step": 96, "text_loss": 0.27608928084373474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.46022894041678897, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2099609375, "learning_rate": 0.000194, "loss": 0.0846, "macro_f1": 0.3076923191547394, "num_tokens": 159454.0, "repeat_count": 2.0, "routers_loss": 0.24473154544830322, "skip_count": 2.0, "step": 98, "text_loss": 0.6026689410209656 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4696213677722336, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.271484375, "learning_rate": 0.00019800000000000002, "loss": 0.1028, "macro_f1": 0.32098764181137085, "num_tokens": 163661.0, "repeat_count": 0.0, "routers_loss": 0.11468276381492615, "skip_count": 2.0, "step": 100, "text_loss": 0.46733155846595764 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.47901379512767833, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.000202, "loss": 0.1089, "macro_f1": 0.3333333432674408, "num_tokens": 167134.0, "repeat_count": 0.0, "routers_loss": 0.021144939586520195, "skip_count": 0.0, "step": 102, "text_loss": 0.6362994909286499 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.488406222483123, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.000206, "loss": 0.0621, "macro_f1": 0.3272727429866791, "num_tokens": 170433.0, "repeat_count": 0.0, "routers_loss": 0.06594710797071457, "skip_count": 1.0, "step": 104, "text_loss": 0.4515477120876312 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.49779864983856764, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.00021, "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 173387.0, "repeat_count": 0.0, "routers_loss": 0.032923027873039246, "skip_count": 0.0, "step": 106, "text_loss": 0.6638453006744385 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5071910771940124, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.240234375, "learning_rate": 0.000214, "loss": 0.0883, "macro_f1": 0.3272727429866791, "num_tokens": 176170.0, "repeat_count": 1.0, "routers_loss": 0.08034781366586685, "skip_count": 0.0, "step": 108, "text_loss": 1.186936855316162 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.516583504549457, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000218, "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 179877.0, "repeat_count": 0.0, "routers_loss": 0.07814185321331024, "skip_count": 1.0, "step": 110, "text_loss": 0.5488709211349487 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5259759319049017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.000222, "loss": 0.0946, "macro_f1": 0.3333333432674408, "num_tokens": 182726.0, "repeat_count": 0.0, "routers_loss": 0.01884695515036583, "skip_count": 0.0, "step": 112, "text_loss": 0.5195863842964172 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5353683592603463, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19921875, "learning_rate": 0.00022600000000000002, "loss": 0.0974, "macro_f1": 0.32098764181137085, "num_tokens": 185624.0, "repeat_count": 0.0, "routers_loss": 0.09657823294401169, "skip_count": 2.0, "step": 114, "text_loss": 0.43858134746551514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.544760786615791, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3046875, "learning_rate": 0.00023, "loss": 0.0753, "macro_f1": 0.3333333432674408, "num_tokens": 188155.0, "repeat_count": 0.0, "routers_loss": 0.01463601179420948, "skip_count": 0.0, "step": 116, "text_loss": 0.392981618642807 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5541532139712357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.439453125, "learning_rate": 0.00023400000000000002, "loss": 0.0843, "macro_f1": 0.3333333432674408, "num_tokens": 190970.0, "repeat_count": 0.0, "routers_loss": 0.03859659656882286, "skip_count": 0.0, "step": 118, "text_loss": 0.309179425239563 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5635456413266804, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2255859375, "learning_rate": 0.00023799999999999998, "loss": 0.053, "macro_f1": 0.3333333432674408, "num_tokens": 193988.0, "repeat_count": 0.0, "routers_loss": 0.019092386588454247, "skip_count": 0.0, "step": 120, "text_loss": 0.48543134331703186 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.572938068682125, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.35546875, "learning_rate": 0.000242, "loss": 0.1203, "macro_f1": 0.3272727429866791, "num_tokens": 196475.0, "repeat_count": 0.0, "routers_loss": 0.0619138665497303, "skip_count": 1.0, "step": 122, "text_loss": 0.4615364074707031 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5823304960375697, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1875, "learning_rate": 0.000246, "loss": 0.1002, "macro_f1": 0.3272727429866791, "num_tokens": 200045.0, "repeat_count": 1.0, "routers_loss": 0.09752107411623001, "skip_count": 0.0, "step": 124, "text_loss": 0.15802054107189178 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5917229233930144, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.00025, "loss": 0.0773, "macro_f1": 0.3333333432674408, "num_tokens": 203214.0, "repeat_count": 0.0, "routers_loss": 0.02896115928888321, "skip_count": 0.0, "step": 126, "text_loss": 0.4543360471725464 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.601115350748459, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4296875, "learning_rate": 0.000254, "loss": 0.0973, "macro_f1": 0.3333333432674408, "num_tokens": 206168.0, "repeat_count": 0.0, "routers_loss": 0.011423567309975624, "skip_count": 0.0, "step": 128, "text_loss": 0.4730179011821747 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6105077781039038, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.365234375, "learning_rate": 0.00025800000000000004, "loss": 0.099, "macro_f1": 0.3333333432674408, "num_tokens": 209907.0, "repeat_count": 0.0, "routers_loss": 0.01957600563764572, "skip_count": 0.0, "step": 130, "text_loss": 0.45122358202934265 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6199002054593484, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2060546875, "learning_rate": 0.000262, "loss": 0.0868, "macro_f1": 0.3272727429866791, "num_tokens": 213521.0, "repeat_count": 0.0, "routers_loss": 0.04882373288273811, "skip_count": 1.0, "step": 132, "text_loss": 0.4341491758823395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6292926328147931, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.000266, "loss": 0.0834, "macro_f1": 0.3333333432674408, "num_tokens": 216484.0, "repeat_count": 0.0, "routers_loss": 0.016083380207419395, "skip_count": 0.0, "step": 134, "text_loss": 0.46990111470222473 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6386850601702377, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.220703125, "learning_rate": 0.00027, "loss": 0.0863, "macro_f1": 0.3333333432674408, "num_tokens": 219398.0, "repeat_count": 0.0, "routers_loss": 0.01733536459505558, "skip_count": 0.0, "step": 136, "text_loss": 0.4455361068248749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6480774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.00027400000000000005, "loss": 0.0997, "macro_f1": 0.3333333432674408, "num_tokens": 222430.0, "repeat_count": 0.0, "routers_loss": 0.01332803163677454, "skip_count": 0.0, "step": 138, "text_loss": 0.47699397802352905 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.657469914881127, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.333984375, "learning_rate": 0.00027800000000000004, "loss": 0.0922, "macro_f1": 0.3144654333591461, "num_tokens": 225458.0, "repeat_count": 1.0, "routers_loss": 0.14924728870391846, "skip_count": 2.0, "step": 140, "text_loss": 0.5858222842216492 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6668623422365718, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25, "learning_rate": 0.00028199999999999997, "loss": 0.0798, "macro_f1": 0.3144654333591461, "num_tokens": 229365.0, "repeat_count": 1.0, "routers_loss": 0.1860177218914032, "skip_count": 2.0, "step": 142, "text_loss": 0.5003137588500977 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6762547695920165, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.00028599999999999996, "loss": 0.054, "macro_f1": 0.32098764181137085, "num_tokens": 231787.0, "repeat_count": 1.0, "routers_loss": 0.16498211026191711, "skip_count": 1.0, "step": 144, "text_loss": 0.5026470422744751 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6856471969474611, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.00029, "loss": 0.0936, "macro_f1": 0.32098764181137085, "num_tokens": 235014.0, "repeat_count": 1.0, "routers_loss": 0.11801310628652573, "skip_count": 1.0, "step": 146, "text_loss": 0.611888587474823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6950396243029058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.000294, "loss": 0.0878, "macro_f1": 0.3333333432674408, "num_tokens": 238210.0, "repeat_count": 0.0, "routers_loss": 0.02422776259481907, "skip_count": 0.0, "step": 148, "text_loss": 0.2876914143562317 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7044320516583504, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.000298, "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 241582.0, "repeat_count": 0.0, "routers_loss": 0.07282499223947525, "skip_count": 2.0, "step": 150, "text_loss": 0.3919292390346527 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7138244790137951, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.37890625, "learning_rate": 0.000302, "loss": 0.0797, "macro_f1": 0.32098764181137085, "num_tokens": 244621.0, "repeat_count": 1.0, "routers_loss": 0.20659038424491882, "skip_count": 1.0, "step": 152, "text_loss": 0.4294498860836029 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7232169063692399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1787109375, "learning_rate": 0.000306, "loss": 0.072, "macro_f1": 0.3333333432674408, "num_tokens": 247833.0, "repeat_count": 0.0, "routers_loss": 0.02428400330245495, "skip_count": 0.0, "step": 154, "text_loss": 0.5930765867233276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7326093337246845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.00031, "loss": 0.0772, "macro_f1": 0.3333333432674408, "num_tokens": 251349.0, "repeat_count": 0.0, "routers_loss": 0.0167869683355093, "skip_count": 0.0, "step": 156, "text_loss": 0.41063904762268066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7420017610801292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.000314, "loss": 0.0821, "macro_f1": 0.3333333432674408, "num_tokens": 254886.0, "repeat_count": 0.0, "routers_loss": 0.02531604655086994, "skip_count": 0.0, "step": 158, "text_loss": 0.6739020347595215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7513941884355738, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.201171875, "learning_rate": 0.00031800000000000003, "loss": 0.09, "macro_f1": 0.3333333432674408, "num_tokens": 258260.0, "repeat_count": 0.0, "routers_loss": 0.017772775143384933, "skip_count": 0.0, "step": 160, "text_loss": 0.46873849630355835 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7607866157910185, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.224609375, "learning_rate": 0.000322, "loss": 0.0893, "macro_f1": 0.3272727429866791, "num_tokens": 261846.0, "repeat_count": 0.0, "routers_loss": 0.034902360290288925, "skip_count": 1.0, "step": 162, "text_loss": 0.3727971017360687 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7701790431464631, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.000326, "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 264348.0, "repeat_count": 0.0, "routers_loss": 0.013553355820477009, "skip_count": 0.0, "step": 164, "text_loss": 0.5798237323760986 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7795714705019078, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.408203125, "learning_rate": 0.00033, "loss": 0.0926, "macro_f1": 0.32098764181137085, "num_tokens": 267479.0, "repeat_count": 1.0, "routers_loss": 0.13571743667125702, "skip_count": 1.0, "step": 166, "text_loss": 0.8084776997566223 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7889638978573525, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2431640625, "learning_rate": 0.00033400000000000004, "loss": 0.0817, "macro_f1": 0.32098764181137085, "num_tokens": 270268.0, "repeat_count": 2.0, "routers_loss": 0.19884146749973297, "skip_count": 0.0, "step": 168, "text_loss": 0.7366134524345398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7983563252127972, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.00033800000000000003, "loss": 0.1022, "macro_f1": 0.32098764181137085, "num_tokens": 273518.0, "repeat_count": 1.0, "routers_loss": 0.15469175577163696, "skip_count": 1.0, "step": 170, "text_loss": 0.27204006910324097 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8077487525682419, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.000342, "loss": 0.0865, "macro_f1": 0.32098764181137085, "num_tokens": 277210.0, "repeat_count": 0.0, "routers_loss": 0.08603330701589584, "skip_count": 2.0, "step": 172, "text_loss": 0.7137667536735535 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8171411799236865, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000346, "loss": 0.0902, "macro_f1": 0.3076923191547394, "num_tokens": 280389.0, "repeat_count": 0.0, "routers_loss": 0.17851492762565613, "skip_count": 4.0, "step": 174, "text_loss": 0.5148105621337891 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8265336072791312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.00035, "loss": 0.0853, "macro_f1": 0.3333333432674408, "num_tokens": 283501.0, "repeat_count": 0.0, "routers_loss": 0.021331604570150375, "skip_count": 0.0, "step": 176, "text_loss": 0.301013320684433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8359260346345758, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.000354, "loss": 0.0911, "macro_f1": 0.32098764181137085, "num_tokens": 287154.0, "repeat_count": 0.0, "routers_loss": 0.057273946702480316, "skip_count": 2.0, "step": 178, "text_loss": 0.4740981459617615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8453184619900206, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.240234375, "learning_rate": 0.000358, "loss": 0.0904, "macro_f1": 0.3272727429866791, "num_tokens": 289929.0, "repeat_count": 0.0, "routers_loss": 0.04116598889231682, "skip_count": 1.0, "step": 180, "text_loss": 0.4838573932647705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8547108893454652, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.000362, "loss": 0.0991, "macro_f1": 0.3333333432674408, "num_tokens": 294293.0, "repeat_count": 0.0, "routers_loss": 0.027111956849694252, "skip_count": 0.0, "step": 182, "text_loss": 0.7495553493499756 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8641033167009099, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.158203125, "learning_rate": 0.000366, "loss": 0.1038, "macro_f1": 0.3333333432674408, "num_tokens": 297730.0, "repeat_count": 0.0, "routers_loss": 0.019166452810168266, "skip_count": 0.0, "step": 184, "text_loss": 0.534831166267395 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 0.8734957440563546, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2236328125, "learning_rate": 0.00037, "loss": 0.0784, "macro_f1": 0.5427350401878357, "num_tokens": 300593.0, "repeat_count": 1.0, "routers_loss": 0.2349659502506256, "skip_count": 2.0, "step": 186, "text_loss": 0.3549048602581024 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8828881714117992, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2041015625, "learning_rate": 0.000374, "loss": 0.0827, "macro_f1": 0.3076923191547394, "num_tokens": 303456.0, "repeat_count": 2.0, "routers_loss": 0.22502389550209045, "skip_count": 2.0, "step": 188, "text_loss": 0.8837642073631287 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8922805987672439, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.271484375, "learning_rate": 0.000378, "loss": 0.1085, "macro_f1": 0.3272727429866791, "num_tokens": 306241.0, "repeat_count": 1.0, "routers_loss": 0.12291611731052399, "skip_count": 0.0, "step": 190, "text_loss": 0.73353511095047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9016730261226886, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000382, "loss": 0.0969, "macro_f1": 0.3272727429866791, "num_tokens": 310606.0, "repeat_count": 0.0, "routers_loss": 0.055988848209381104, "skip_count": 1.0, "step": 192, "text_loss": 0.6261917352676392 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9110654534781333, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.34375, "learning_rate": 0.000386, "loss": 0.1055, "macro_f1": 0.3144654333591461, "num_tokens": 313564.0, "repeat_count": 0.0, "routers_loss": 0.12363404780626297, "skip_count": 3.0, "step": 194, "text_loss": 0.2790874242782593 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9204578808335779, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.27734375, "learning_rate": 0.00039000000000000005, "loss": 0.0964, "macro_f1": 0.3076923191547394, "num_tokens": 316958.0, "repeat_count": 2.0, "routers_loss": 0.2718356251716614, "skip_count": 2.0, "step": 196, "text_loss": 0.14428086578845978 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9298503081890226, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2021484375, "learning_rate": 0.00039400000000000004, "loss": 0.0917, "macro_f1": 0.32098764181137085, "num_tokens": 320103.0, "repeat_count": 0.0, "routers_loss": 0.07188102602958679, "skip_count": 2.0, "step": 198, "text_loss": 0.27155816555023193 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9392427355444672, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.201171875, "learning_rate": 0.000398, "loss": 0.0809, "macro_f1": 0.32098764181137085, "num_tokens": 323566.0, "repeat_count": 1.0, "routers_loss": 0.18038256466388702, "skip_count": 1.0, "step": 200, "text_loss": 0.8453494310379028 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9486351628999119, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.000402, "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 326385.0, "repeat_count": 0.0, "routers_loss": 0.014639763161540031, "skip_count": 0.0, "step": 202, "text_loss": 0.5733131766319275 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9580275902553567, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21875, "learning_rate": 0.00040600000000000006, "loss": 0.104, "macro_f1": 0.3333333432674408, "num_tokens": 329266.0, "repeat_count": 0.0, "routers_loss": 0.015269627794623375, "skip_count": 0.0, "step": 204, "text_loss": 0.7355639934539795 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9674200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.27734375, "learning_rate": 0.00041, "loss": 0.0833, "macro_f1": 0.3333333432674408, "num_tokens": 332984.0, "repeat_count": 0.0, "routers_loss": 0.018046971410512924, "skip_count": 0.0, "step": 206, "text_loss": 0.587641179561615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.976812444966246, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.185546875, "learning_rate": 0.000414, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 335739.0, "repeat_count": 1.0, "routers_loss": 0.12791286408901215, "skip_count": 0.0, "step": 208, "text_loss": 0.6538406610488892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9862048723216906, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.24609375, "learning_rate": 0.00041799999999999997, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 338966.0, "repeat_count": 0.0, "routers_loss": 0.050490595400333405, "skip_count": 1.0, "step": 210, "text_loss": 0.4188295602798462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9955972996771353, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.271484375, "learning_rate": 0.000422, "loss": 0.0588, "macro_f1": 0.3144654333591461, "num_tokens": 342063.0, "repeat_count": 0.0, "routers_loss": 0.11652113497257233, "skip_count": 3.0, "step": 212, "text_loss": 0.21822240948677063 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0046962136777224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2060546875, "learning_rate": 0.000426, "loss": 0.0621, "macro_f1": 0.3333333432674408, "num_tokens": 344887.0, "repeat_count": 0.0, "routers_loss": 0.023898238316178322, "skip_count": 0.0, "step": 214, "text_loss": 0.24692800641059875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.014088641033167, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3671875, "learning_rate": 0.00043, "loss": 0.1005, "macro_f1": 0.3272727429866791, "num_tokens": 348700.0, "repeat_count": 1.0, "routers_loss": 0.06414655596017838, "skip_count": 0.0, "step": 216, "text_loss": 0.4744548797607422 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0234810683886117, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.00043400000000000003, "loss": 0.0753, "macro_f1": 0.32098764181137085, "num_tokens": 351507.0, "repeat_count": 1.0, "routers_loss": 0.11702914535999298, "skip_count": 1.0, "step": 218, "text_loss": 0.5614864826202393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0328734957440564, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000438, "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 354484.0, "repeat_count": 0.0, "routers_loss": 0.014991643838584423, "skip_count": 0.0, "step": 220, "text_loss": 0.47209832072257996 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.042265923099501, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.251953125, "learning_rate": 0.000442, "loss": 0.106, "macro_f1": 0.3272727429866791, "num_tokens": 357954.0, "repeat_count": 0.0, "routers_loss": 0.04747112840414047, "skip_count": 1.0, "step": 222, "text_loss": 0.2968728244304657 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0516583504549457, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.40234375, "learning_rate": 0.000446, "loss": 0.0853, "macro_f1": 0.32098764181137085, "num_tokens": 360547.0, "repeat_count": 0.0, "routers_loss": 0.06754162162542343, "skip_count": 2.0, "step": 224, "text_loss": 0.2364148646593094 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0610507778103904, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2412109375, "learning_rate": 0.00045000000000000004, "loss": 0.1016, "macro_f1": 0.3272727429866791, "num_tokens": 364529.0, "repeat_count": 0.0, "routers_loss": 0.07830183953046799, "skip_count": 1.0, "step": 226, "text_loss": 0.4787476360797882 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.070443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.00045400000000000003, "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 367683.0, "repeat_count": 0.0, "routers_loss": 0.015735948458313942, "skip_count": 0.0, "step": 228, "text_loss": 0.37148505449295044 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25, "learning_rate": 0.000458, "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 371402.0, "repeat_count": 0.0, "routers_loss": 0.013354359194636345, "skip_count": 0.0, "step": 230, "text_loss": 0.7464763522148132 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.000462, "loss": 0.0731, "macro_f1": 0.3333333432674408, "num_tokens": 374587.0, "repeat_count": 0.0, "routers_loss": 0.013763721100986004, "skip_count": 0.0, "step": 232, "text_loss": 0.8754443526268005 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.098620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3984375, "learning_rate": 0.00046600000000000005, "loss": 0.0861, "macro_f1": 0.3333333432674408, "num_tokens": 377513.0, "repeat_count": 0.0, "routers_loss": 0.010075435042381287, "skip_count": 0.0, "step": 234, "text_loss": 0.31534913182258606 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1080129145876136, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.00047, "loss": 0.0791, "macro_f1": 0.3272727429866791, "num_tokens": 380736.0, "repeat_count": 0.0, "routers_loss": 0.059825167059898376, "skip_count": 1.0, "step": 236, "text_loss": 0.5936337113380432 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1174053419430585, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000474, "loss": 0.0514, "macro_f1": 0.32098764181137085, "num_tokens": 383236.0, "repeat_count": 0.0, "routers_loss": 0.09134846180677414, "skip_count": 2.0, "step": 238, "text_loss": 0.5976157784461975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1267977692985032, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.208984375, "learning_rate": 0.00047799999999999996, "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 385778.0, "repeat_count": 1.0, "routers_loss": 0.11989791691303253, "skip_count": 1.0, "step": 240, "text_loss": 0.3554210960865021 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1361901966539478, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.000482, "loss": 0.0734, "macro_f1": 0.3333333432674408, "num_tokens": 388777.0, "repeat_count": 0.0, "routers_loss": 0.013591105118393898, "skip_count": 0.0, "step": 242, "text_loss": 0.4829460382461548 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1455826240093925, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12060546875, "learning_rate": 0.000486, "loss": 0.0625, "macro_f1": 0.32098764181137085, "num_tokens": 391797.0, "repeat_count": 0.0, "routers_loss": 0.0920003354549408, "skip_count": 2.0, "step": 244, "text_loss": 0.3085818886756897 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1549750513648371, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.00049, "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 396485.0, "repeat_count": 0.0, "routers_loss": 0.0129330949857831, "skip_count": 0.0, "step": 246, "text_loss": 0.42803969979286194 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1643674787202818, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.296875, "learning_rate": 0.000494, "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 399923.0, "repeat_count": 0.0, "routers_loss": 0.10677755624055862, "skip_count": 3.0, "step": 248, "text_loss": 0.2908555567264557 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1737599060757264, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.203125, "learning_rate": 0.000498, "loss": 0.0812, "macro_f1": 0.3144654333591461, "num_tokens": 403647.0, "repeat_count": 0.0, "routers_loss": 0.1504337340593338, "skip_count": 3.0, "step": 250, "text_loss": 0.333095908164978 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.183152333431171, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.22265625, "learning_rate": 0.0005020000000000001, "loss": 0.0828, "macro_f1": 0.32098764181137085, "num_tokens": 409147.0, "repeat_count": 0.0, "routers_loss": 0.06503184884786606, "skip_count": 2.0, "step": 252, "text_loss": 0.16117942333221436 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1925447607866158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.287109375, "learning_rate": 0.000506, "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 412072.0, "repeat_count": 0.0, "routers_loss": 0.016280122101306915, "skip_count": 0.0, "step": 254, "text_loss": 0.4217492640018463 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2019371881420604, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21484375, "learning_rate": 0.00051, "loss": 0.0803, "macro_f1": 0.3144654333591461, "num_tokens": 415052.0, "repeat_count": 2.0, "routers_loss": 0.2117508500814438, "skip_count": 1.0, "step": 256, "text_loss": 0.5795308947563171 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.211329615497505, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2421875, "learning_rate": 0.000514, "loss": 0.0668, "macro_f1": 0.3272727429866791, "num_tokens": 418099.0, "repeat_count": 1.0, "routers_loss": 0.15002092719078064, "skip_count": 0.0, "step": 258, "text_loss": 0.4840938448905945 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2207220428529497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.000518, "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 422526.0, "repeat_count": 0.0, "routers_loss": 0.012834074907004833, "skip_count": 0.0, "step": 260, "text_loss": 0.36141225695610046 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2301144702083944, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.000522, "loss": 0.085, "macro_f1": 0.3076923191547394, "num_tokens": 425765.0, "repeat_count": 2.0, "routers_loss": 0.23808011412620544, "skip_count": 2.0, "step": 262, "text_loss": 0.27572691440582275 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2395068975638392, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.000526, "loss": 0.0708, "macro_f1": 0.3272727429866791, "num_tokens": 429048.0, "repeat_count": 0.0, "routers_loss": 0.055687375366687775, "skip_count": 1.0, "step": 264, "text_loss": 0.37020301818847656 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.248899324919284, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2080078125, "learning_rate": 0.0005300000000000001, "loss": 0.0839, "macro_f1": 0.3272727429866791, "num_tokens": 431784.0, "repeat_count": 0.0, "routers_loss": 0.0872957780957222, "skip_count": 1.0, "step": 266, "text_loss": 0.5937283039093018 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2582917522747286, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.0005340000000000001, "loss": 0.0733, "macro_f1": 0.32098764181137085, "num_tokens": 434297.0, "repeat_count": 2.0, "routers_loss": 0.23507654666900635, "skip_count": 0.0, "step": 268, "text_loss": 0.3367372453212738 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2676841796301732, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2431640625, "learning_rate": 0.0005380000000000001, "loss": 0.0708, "macro_f1": 0.32098764181137085, "num_tokens": 437586.0, "repeat_count": 0.0, "routers_loss": 0.12860390543937683, "skip_count": 2.0, "step": 270, "text_loss": 0.7149854302406311 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2770766069856179, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2451171875, "learning_rate": 0.0005420000000000001, "loss": 0.1072, "macro_f1": 0.3272727429866791, "num_tokens": 440649.0, "repeat_count": 0.0, "routers_loss": 0.044308312237262726, "skip_count": 1.0, "step": 272, "text_loss": 0.26778292655944824 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2864690343410625, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.44921875, "learning_rate": 0.000546, "loss": 0.0938, "macro_f1": 0.3144654333591461, "num_tokens": 443907.0, "repeat_count": 0.0, "routers_loss": 0.11514109373092651, "skip_count": 3.0, "step": 274, "text_loss": 0.23578761518001556 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.2958614616965072, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2578125, "learning_rate": 0.00055, "loss": 0.0932, "macro_f1": 0.5492662787437439, "num_tokens": 447147.0, "repeat_count": 0.0, "routers_loss": 0.055705297738313675, "skip_count": 2.0, "step": 276, "text_loss": 0.2513524889945984 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3052538890519518, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.29296875, "learning_rate": 0.000554, "loss": 0.0667, "macro_f1": 0.32098764181137085, "num_tokens": 450032.0, "repeat_count": 0.0, "routers_loss": 0.13778971135616302, "skip_count": 2.0, "step": 278, "text_loss": 0.4857243597507477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3146463164073965, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.185546875, "learning_rate": 0.000558, "loss": 0.0672, "macro_f1": 0.3272727429866791, "num_tokens": 453195.0, "repeat_count": 1.0, "routers_loss": 0.0700262188911438, "skip_count": 0.0, "step": 280, "text_loss": 0.7589789628982544 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3240387437628411, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25, "learning_rate": 0.0005620000000000001, "loss": 0.0603, "macro_f1": 0.3144654333591461, "num_tokens": 455942.0, "repeat_count": 1.0, "routers_loss": 0.11706235259771347, "skip_count": 2.0, "step": 282, "text_loss": 0.4783432185649872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3334311711182858, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.265625, "learning_rate": 0.000566, "loss": 0.0793, "macro_f1": 0.3272727429866791, "num_tokens": 458932.0, "repeat_count": 0.0, "routers_loss": 0.07073967158794403, "skip_count": 1.0, "step": 284, "text_loss": 0.7117193937301636 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3428235984737307, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.00057, "loss": 0.0915, "macro_f1": 0.3272727429866791, "num_tokens": 462650.0, "repeat_count": 0.0, "routers_loss": 0.05301115661859512, "skip_count": 1.0, "step": 286, "text_loss": 0.4175460636615753 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.352216025829175, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.000574, "loss": 0.0675, "macro_f1": 0.3272727429866791, "num_tokens": 466290.0, "repeat_count": 0.0, "routers_loss": 0.06356479972600937, "skip_count": 1.0, "step": 288, "text_loss": 0.5832946300506592 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.36160845318462, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28515625, "learning_rate": 0.000578, "loss": 0.0805, "macro_f1": 0.3006536066532135, "num_tokens": 469296.0, "repeat_count": 1.0, "routers_loss": 0.21032999455928802, "skip_count": 3.0, "step": 290, "text_loss": 0.36023473739624023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3710008805400646, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.27734375, "learning_rate": 0.0005819999999999999, "loss": 0.0685, "macro_f1": 0.32098764181137085, "num_tokens": 472272.0, "repeat_count": 1.0, "routers_loss": 0.08062280714511871, "skip_count": 1.0, "step": 292, "text_loss": 0.37197956442832947 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3803933078955093, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28125, "learning_rate": 0.0005859999999999999, "loss": 0.0878, "macro_f1": 0.32098764181137085, "num_tokens": 475864.0, "repeat_count": 0.0, "routers_loss": 0.05023600533604622, "skip_count": 2.0, "step": 294, "text_loss": 0.4765273630619049 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.389785735250954, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2177734375, "learning_rate": 0.00059, "loss": 0.0728, "macro_f1": 0.3333333432674408, "num_tokens": 478916.0, "repeat_count": 0.0, "routers_loss": 0.011689410544931889, "skip_count": 0.0, "step": 296, "text_loss": 0.5878773927688599 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3991781626063986, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000594, "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 482369.0, "repeat_count": 0.0, "routers_loss": 0.010772093199193478, "skip_count": 0.0, "step": 298, "text_loss": 0.4424116313457489 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4085705899618433, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.181640625, "learning_rate": 0.000598, "loss": 0.0787, "macro_f1": 0.3076923191547394, "num_tokens": 486049.0, "repeat_count": 2.0, "routers_loss": 0.23482851684093475, "skip_count": 2.0, "step": 300, "text_loss": 0.21217775344848633 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.417963017317288, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2080078125, "learning_rate": 0.000602, "loss": 0.073, "macro_f1": 0.3076923191547394, "num_tokens": 488683.0, "repeat_count": 1.0, "routers_loss": 0.18843084573745728, "skip_count": 3.0, "step": 302, "text_loss": 0.2109498232603073 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4273554446727326, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.279296875, "learning_rate": 0.000606, "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 492010.0, "repeat_count": 0.0, "routers_loss": 0.17861786484718323, "skip_count": 3.0, "step": 304, "text_loss": 0.8446305394172668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4367478720281772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.00061, "loss": 0.0827, "macro_f1": 0.3333333432674408, "num_tokens": 494764.0, "repeat_count": 0.0, "routers_loss": 0.014124520123004913, "skip_count": 0.0, "step": 306, "text_loss": 0.742735743522644 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4461402993836219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26953125, "learning_rate": 0.000614, "loss": 0.1071, "macro_f1": 0.3333333432674408, "num_tokens": 497820.0, "repeat_count": 0.0, "routers_loss": 0.017968112602829933, "skip_count": 0.0, "step": 308, "text_loss": 0.28305482864379883 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4555327267390665, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1689453125, "learning_rate": 0.0006180000000000001, "loss": 0.0775, "macro_f1": 0.32098764181137085, "num_tokens": 500694.0, "repeat_count": 0.0, "routers_loss": 0.08593655377626419, "skip_count": 2.0, "step": 310, "text_loss": 0.3496848940849304 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4649251540945114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19140625, "learning_rate": 0.000622, "loss": 0.061, "macro_f1": 0.3333333432674408, "num_tokens": 503871.0, "repeat_count": 0.0, "routers_loss": 0.016449492424726486, "skip_count": 0.0, "step": 312, "text_loss": 0.6691372990608215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4743175814499558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.205078125, "learning_rate": 0.000626, "loss": 0.0815, "macro_f1": 0.3333333432674408, "num_tokens": 506730.0, "repeat_count": 0.0, "routers_loss": 0.014532964676618576, "skip_count": 0.0, "step": 314, "text_loss": 0.6118118166923523 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4837100088054007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2216796875, "learning_rate": 0.00063, "loss": 0.0742, "macro_f1": 0.3333333432674408, "num_tokens": 510323.0, "repeat_count": 0.0, "routers_loss": 0.013093139044940472, "skip_count": 0.0, "step": 316, "text_loss": 0.38126271963119507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4931024361608454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.400390625, "learning_rate": 0.000634, "loss": 0.0915, "macro_f1": 0.3333333432674408, "num_tokens": 514075.0, "repeat_count": 0.0, "routers_loss": 0.008627045899629593, "skip_count": 0.0, "step": 318, "text_loss": 0.5983037948608398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.50249486351629, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.000638, "loss": 0.1008, "macro_f1": 0.3272727429866791, "num_tokens": 517418.0, "repeat_count": 0.0, "routers_loss": 0.04561378434300423, "skip_count": 1.0, "step": 320, "text_loss": 0.767257034778595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.5118872908717347, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.259765625, "learning_rate": 0.000642, "loss": 0.0926, "macro_f1": 0.3272727429866791, "num_tokens": 520443.0, "repeat_count": 0.0, "routers_loss": 0.024372953921556473, "skip_count": 0.0, "step": 322, "text_loss": 0.6572105884552002 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5212797182271793, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30078125, "learning_rate": 0.000646, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 523317.0, "repeat_count": 1.0, "routers_loss": 0.08099937438964844, "skip_count": 0.0, "step": 324, "text_loss": 0.205499529838562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.530672145582624, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.0006500000000000001, "loss": 0.0809, "macro_f1": 0.32098767161369324, "num_tokens": 526355.0, "repeat_count": 0.0, "routers_loss": 0.0657225176692009, "skip_count": 1.0, "step": 326, "text_loss": 0.2587239742279053 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5400645729380686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.111328125, "learning_rate": 0.0006540000000000001, "loss": 0.0779, "macro_f1": 0.3333333432674408, "num_tokens": 529689.0, "repeat_count": 0.0, "routers_loss": 0.01849208027124405, "skip_count": 0.0, "step": 328, "text_loss": 0.2172023057937622 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5494570002935135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1845703125, "learning_rate": 0.0006580000000000001, "loss": 0.0758, "macro_f1": 0.3333333432674408, "num_tokens": 532603.0, "repeat_count": 0.0, "routers_loss": 0.016184113919734955, "skip_count": 0.0, "step": 330, "text_loss": 0.5980568528175354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.220703125, "learning_rate": 0.000662, "loss": 0.0439, "macro_f1": 0.3333333432674408, "num_tokens": 536056.0, "repeat_count": 0.0, "routers_loss": 0.01303898449987173, "skip_count": 0.0, "step": 332, "text_loss": 0.5421966314315796 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 1.5682418550044028, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.296875, "learning_rate": 0.000666, "loss": 0.0963, "macro_f1": 0.465986430644989, "num_tokens": 539231.0, "repeat_count": 3.0, "routers_loss": 0.3075675964355469, "skip_count": 3.0, "step": 334, "text_loss": 0.19719554483890533 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5776342823598473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.00067, "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 542038.0, "repeat_count": 0.0, "routers_loss": 0.009116224013268948, "skip_count": 0.0, "step": 336, "text_loss": 0.3407036066055298 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5870267097152921, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2421875, "learning_rate": 0.000674, "loss": 0.0768, "macro_f1": 0.3333333432674408, "num_tokens": 545019.0, "repeat_count": 0.0, "routers_loss": 0.021463042125105858, "skip_count": 0.0, "step": 338, "text_loss": 0.24486012756824493 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5964191370707366, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.0006780000000000001, "loss": 0.0889, "macro_f1": 0.3333333432674408, "num_tokens": 548036.0, "repeat_count": 0.0, "routers_loss": 0.01857556402683258, "skip_count": 0.0, "step": 340, "text_loss": 0.28140124678611755 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6058115644261814, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.0006820000000000001, "loss": 0.0617, "macro_f1": 0.3006536364555359, "num_tokens": 551419.0, "repeat_count": 2.0, "routers_loss": 0.27090007066726685, "skip_count": 3.0, "step": 342, "text_loss": 0.20690307021141052 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.615203991781626, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3046875, "learning_rate": 0.0006860000000000001, "loss": 0.1047, "macro_f1": 0.32098764181137085, "num_tokens": 554037.0, "repeat_count": 0.0, "routers_loss": 0.09231195598840714, "skip_count": 2.0, "step": 344, "text_loss": 0.4479128420352936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6245964191370708, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.00069, "loss": 0.0883, "macro_f1": 0.3333333432674408, "num_tokens": 556672.0, "repeat_count": 0.0, "routers_loss": 0.00935924518853426, "skip_count": 0.0, "step": 346, "text_loss": 0.6377320289611816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6339888464925154, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.000694, "loss": 0.0781, "macro_f1": 0.32098764181137085, "num_tokens": 559756.0, "repeat_count": 0.0, "routers_loss": 0.17641772329807281, "skip_count": 2.0, "step": 348, "text_loss": 0.6097636222839355 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.64338127384796, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.30078125, "learning_rate": 0.0006979999999999999, "loss": 0.0616, "macro_f1": 0.5492662787437439, "num_tokens": 563415.0, "repeat_count": 0.0, "routers_loss": 0.06240406632423401, "skip_count": 2.0, "step": 350, "text_loss": 0.5291631817817688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6527737012034047, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.296875, "learning_rate": 0.0007019999999999999, "loss": 0.1026, "macro_f1": 0.3333333432674408, "num_tokens": 566357.0, "repeat_count": 0.0, "routers_loss": 0.012269247323274612, "skip_count": 0.0, "step": 352, "text_loss": 0.5170195698738098 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6621661285588494, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0007059999999999999, "loss": 0.0815, "macro_f1": 0.32098764181137085, "num_tokens": 569449.0, "repeat_count": 0.0, "routers_loss": 0.07515309751033783, "skip_count": 2.0, "step": 354, "text_loss": 0.34507250785827637 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6715585559142943, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.00071, "loss": 0.0791, "macro_f1": 0.3144654333591461, "num_tokens": 572761.0, "repeat_count": 1.0, "routers_loss": 0.20768006145954132, "skip_count": 2.0, "step": 356, "text_loss": 0.3158532381057739 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6809509832697387, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.000714, "loss": 0.0682, "macro_f1": 0.3333333432674408, "num_tokens": 575909.0, "repeat_count": 0.0, "routers_loss": 0.025329967960715294, "skip_count": 0.0, "step": 358, "text_loss": 0.21455390751361847 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.6903434106251836, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21484375, "learning_rate": 0.000718, "loss": 0.0775, "macro_f1": 0.32098767161369324, "num_tokens": 579186.0, "repeat_count": 1.0, "routers_loss": 0.07676175981760025, "skip_count": 0.0, "step": 360, "text_loss": 0.61895352602005 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.699735837980628, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.197265625, "learning_rate": 0.000722, "loss": 0.0781, "macro_f1": 0.32098767161369324, "num_tokens": 582437.0, "repeat_count": 0.0, "routers_loss": 0.08070661872625351, "skip_count": 1.0, "step": 362, "text_loss": 0.20557661354541779 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7091282653360729, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2216796875, "learning_rate": 0.000726, "loss": 0.11, "macro_f1": 0.3333333432674408, "num_tokens": 586096.0, "repeat_count": 0.0, "routers_loss": 0.015891313552856445, "skip_count": 0.0, "step": 364, "text_loss": 0.597991943359375 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7185206926915173, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.00073, "loss": 0.0573, "macro_f1": 0.3076923191547394, "num_tokens": 589520.0, "repeat_count": 1.0, "routers_loss": 0.12844261527061462, "skip_count": 3.0, "step": 366, "text_loss": 0.2944789230823517 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7279131200469622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.000734, "loss": 0.1005, "macro_f1": 0.3333333432674408, "num_tokens": 592691.0, "repeat_count": 0.0, "routers_loss": 0.02382199838757515, "skip_count": 0.0, "step": 368, "text_loss": 0.23989969491958618 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7373055474024068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.000738, "loss": 0.0661, "macro_f1": 0.3333333432674408, "num_tokens": 596004.0, "repeat_count": 0.0, "routers_loss": 0.018812084570527077, "skip_count": 0.0, "step": 370, "text_loss": 0.22111408412456512 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7466979747578515, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2412109375, "learning_rate": 0.000742, "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 599087.0, "repeat_count": 0.0, "routers_loss": 0.08290331065654755, "skip_count": 1.0, "step": 372, "text_loss": 0.2567356526851654 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7560904021132961, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2412109375, "learning_rate": 0.000746, "loss": 0.0941, "macro_f1": 0.32098764181137085, "num_tokens": 602330.0, "repeat_count": 1.0, "routers_loss": 0.11482042074203491, "skip_count": 1.0, "step": 374, "text_loss": 0.7217292785644531 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7654828294687408, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2265625, "learning_rate": 0.00075, "loss": 0.0728, "macro_f1": 0.3272727429866791, "num_tokens": 605503.0, "repeat_count": 1.0, "routers_loss": 0.11849870532751083, "skip_count": 0.0, "step": 376, "text_loss": 0.5122153759002686 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.7748752568241855, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2333984375, "learning_rate": 0.000754, "loss": 0.0835, "macro_f1": 0.32098767161369324, "num_tokens": 608505.0, "repeat_count": 0.0, "routers_loss": 0.07090992480516434, "skip_count": 1.0, "step": 378, "text_loss": 0.2204965502023697 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.78426768417963, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.000758, "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 611193.0, "repeat_count": 0.0, "routers_loss": 0.03812089189887047, "skip_count": 1.0, "step": 380, "text_loss": 0.44909021258354187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.793660111535075, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1689453125, "learning_rate": 0.000762, "loss": 0.0882, "macro_f1": 0.3272727429866791, "num_tokens": 614231.0, "repeat_count": 1.0, "routers_loss": 0.10270529240369797, "skip_count": 0.0, "step": 382, "text_loss": 0.13624964654445648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8030525388905194, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.330078125, "learning_rate": 0.0007660000000000001, "loss": 0.1107, "macro_f1": 0.32098764181137085, "num_tokens": 617090.0, "repeat_count": 1.0, "routers_loss": 0.11624004691839218, "skip_count": 1.0, "step": 384, "text_loss": 0.7314052581787109 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8124449662459643, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0007700000000000001, "loss": 0.0628, "macro_f1": 0.32098764181137085, "num_tokens": 620596.0, "repeat_count": 0.0, "routers_loss": 0.07114322483539581, "skip_count": 2.0, "step": 386, "text_loss": 0.503322958946228 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8218373936014087, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.0007740000000000001, "loss": 0.0829, "macro_f1": 0.32098764181137085, "num_tokens": 624108.0, "repeat_count": 0.0, "routers_loss": 0.06061873584985733, "skip_count": 2.0, "step": 388, "text_loss": 0.11481904983520508 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8312298209568536, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2099609375, "learning_rate": 0.000778, "loss": 0.0791, "macro_f1": 0.3006536364555359, "num_tokens": 626895.0, "repeat_count": 1.0, "routers_loss": 0.2921771705150604, "skip_count": 4.0, "step": 390, "text_loss": 0.3069624602794647 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8406222483122983, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30859375, "learning_rate": 0.000782, "loss": 0.0605, "macro_f1": 0.3076923191547394, "num_tokens": 630204.0, "repeat_count": 0.0, "routers_loss": 0.202707901597023, "skip_count": 4.0, "step": 392, "text_loss": 0.6022785305976868 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.850014675667743, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.29296875, "learning_rate": 0.000786, "loss": 0.0877, "macro_f1": 0.3333333432674408, "num_tokens": 634373.0, "repeat_count": 0.0, "routers_loss": 0.0221510399132967, "skip_count": 0.0, "step": 394, "text_loss": 0.26787394285202026 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8594071030231876, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.37890625, "learning_rate": 0.00079, "loss": 0.0805, "macro_f1": 0.32098764181137085, "num_tokens": 637442.0, "repeat_count": 2.0, "routers_loss": 0.12636390328407288, "skip_count": 0.0, "step": 396, "text_loss": 0.2799781560897827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8687995303786322, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2080078125, "learning_rate": 0.0007940000000000001, "loss": 0.0724, "macro_f1": 0.32098764181137085, "num_tokens": 641231.0, "repeat_count": 0.0, "routers_loss": 0.07933453470468521, "skip_count": 2.0, "step": 398, "text_loss": 0.2507784366607666 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8781919577340769, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.0007980000000000001, "loss": 0.0909, "macro_f1": 0.3272727429866791, "num_tokens": 644560.0, "repeat_count": 1.0, "routers_loss": 0.10324911028146744, "skip_count": 0.0, "step": 400, "text_loss": 0.7756280303001404 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8875843850895215, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0008020000000000001, "loss": 0.0783, "macro_f1": 0.3144654333591461, "num_tokens": 647393.0, "repeat_count": 1.0, "routers_loss": 0.18546262383460999, "skip_count": 2.0, "step": 402, "text_loss": 0.5013328194618225 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8969768124449664, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0008060000000000001, "loss": 0.0787, "macro_f1": 0.2857142984867096, "num_tokens": 650355.0, "repeat_count": 3.0, "routers_loss": 0.3280293643474579, "skip_count": 4.0, "step": 404, "text_loss": 0.2842077314853668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9063692398004108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.0008100000000000001, "loss": 0.0901, "macro_f1": 0.3333333432674408, "num_tokens": 654280.0, "repeat_count": 0.0, "routers_loss": 0.02623247355222702, "skip_count": 0.0, "step": 406, "text_loss": 0.46742817759513855 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9157616671558557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.0008139999999999999, "loss": 0.0945, "macro_f1": 0.3333333432674408, "num_tokens": 657568.0, "repeat_count": 0.0, "routers_loss": 0.009744114242494106, "skip_count": 0.0, "step": 408, "text_loss": 0.7168047428131104 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9251540945113002, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.0008179999999999999, "loss": 0.1065, "macro_f1": 0.32098764181137085, "num_tokens": 660593.0, "repeat_count": 0.0, "routers_loss": 0.07591600716114044, "skip_count": 2.0, "step": 410, "text_loss": 0.449823260307312 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0008219999999999999, "loss": 0.0795, "macro_f1": 0.3333333432674408, "num_tokens": 663916.0, "repeat_count": 0.0, "routers_loss": 0.02076602540910244, "skip_count": 0.0, "step": 412, "text_loss": 0.4764713943004608 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9439389492221895, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.000826, "loss": 0.0836, "macro_f1": 0.3272727429866791, "num_tokens": 667502.0, "repeat_count": 0.0, "routers_loss": 0.049170155078172684, "skip_count": 1.0, "step": 414, "text_loss": 0.30333325266838074 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9533313765776343, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1513671875, "learning_rate": 0.00083, "loss": 0.1021, "macro_f1": 0.3272727429866791, "num_tokens": 670510.0, "repeat_count": 1.0, "routers_loss": 0.15554003417491913, "skip_count": 0.0, "step": 416, "text_loss": 0.3691870868206024 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.962723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000834, "loss": 0.1013, "macro_f1": 0.3333333432674408, "num_tokens": 674761.0, "repeat_count": 0.0, "routers_loss": 0.024516675621271133, "skip_count": 0.0, "step": 418, "text_loss": 0.32850381731987 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9721162312885236, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.000838, "loss": 0.0649, "macro_f1": 0.3333333432674408, "num_tokens": 678055.0, "repeat_count": 0.0, "routers_loss": 0.011026890948414803, "skip_count": 0.0, "step": 420, "text_loss": 0.6637290716171265 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9815086586439683, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000842, "loss": 0.0771, "macro_f1": 0.3272727429866791, "num_tokens": 680979.0, "repeat_count": 0.0, "routers_loss": 0.07451887428760529, "skip_count": 1.0, "step": 422, "text_loss": 0.27131685614585876 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.990901085999413, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1318359375, "learning_rate": 0.000846, "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 684144.0, "repeat_count": 1.0, "routers_loss": 0.11341800540685654, "skip_count": 1.0, "step": 424, "text_loss": 0.652126669883728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.00085, "loss": 0.0754, "macro_f1": 0.3272727429866791, "num_tokens": 687004.0, "repeat_count": 1.0, "routers_loss": 0.08985847979784012, "skip_count": 0.0, "step": 426, "text_loss": 0.2589428424835205 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.009392427355445, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23828125, "learning_rate": 0.000854, "loss": 0.0866, "macro_f1": 0.3333333432674408, "num_tokens": 689702.0, "repeat_count": 0.0, "routers_loss": 0.011355436407029629, "skip_count": 0.0, "step": 428, "text_loss": 0.8909716010093689 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0187848547108893, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.000858, "loss": 0.0623, "macro_f1": 0.3333333432674408, "num_tokens": 692698.0, "repeat_count": 0.0, "routers_loss": 0.013788948766887188, "skip_count": 0.0, "step": 430, "text_loss": 0.19141142070293427 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.028177282066334, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.000862, "loss": 0.0499, "macro_f1": 0.32098764181137085, "num_tokens": 696007.0, "repeat_count": 0.0, "routers_loss": 0.07998392730951309, "skip_count": 2.0, "step": 432, "text_loss": 0.1611809879541397 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0375697094217786, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.000866, "loss": 0.0541, "macro_f1": 0.32098764181137085, "num_tokens": 700271.0, "repeat_count": 0.0, "routers_loss": 0.06988382339477539, "skip_count": 2.0, "step": 434, "text_loss": 0.37254223227500916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0469621367772235, "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.00087, "loss": 0.0834, "macro_f1": 0.2777777910232544, "num_tokens": 703519.0, "repeat_count": 3.0, "routers_loss": 0.28240787982940674, "skip_count": 5.0, "step": 436, "text_loss": 0.29636648297309875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.056354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.423828125, "learning_rate": 0.000874, "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 706826.0, "repeat_count": 0.0, "routers_loss": 0.013924967497587204, "skip_count": 0.0, "step": 438, "text_loss": 0.20867908000946045 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.065746991488113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.000878, "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 710530.0, "repeat_count": 0.0, "routers_loss": 0.01170142088085413, "skip_count": 0.0, "step": 440, "text_loss": 0.7273373007774353 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0751394188435572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.000882, "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 713503.0, "repeat_count": 0.0, "routers_loss": 0.011930872686207294, "skip_count": 0.0, "step": 442, "text_loss": 0.39314430952072144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.084531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.0008860000000000001, "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 716582.0, "repeat_count": 0.0, "routers_loss": 0.008630385622382164, "skip_count": 0.0, "step": 444, "text_loss": 0.5925271511077881 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.0939242735544465, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23046875, "learning_rate": 0.0008900000000000001, "loss": 0.0811, "macro_f1": 0.3006536066532135, "num_tokens": 719941.0, "repeat_count": 3.0, "routers_loss": 0.3015584945678711, "skip_count": 1.0, "step": 446, "text_loss": 0.5059905052185059 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.1033167009098914, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.203125, "learning_rate": 0.000894, "loss": 0.0822, "macro_f1": 0.31446540355682373, "num_tokens": 723113.0, "repeat_count": 1.0, "routers_loss": 0.10897493362426758, "skip_count": 1.0, "step": 448, "text_loss": 0.19616436958312988 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.112709128265336, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.33984375, "learning_rate": 0.000898, "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 726193.0, "repeat_count": 0.0, "routers_loss": 0.07236456125974655, "skip_count": 2.0, "step": 450, "text_loss": 0.1773054152727127 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1221015556207807, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3203125, "learning_rate": 0.000902, "loss": 0.058, "macro_f1": 0.3272727429866791, "num_tokens": 729275.0, "repeat_count": 1.0, "routers_loss": 0.08184371143579483, "skip_count": 0.0, "step": 452, "text_loss": 0.4927310049533844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1314939829762256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.000906, "loss": 0.0607, "macro_f1": 0.3333333432674408, "num_tokens": 731948.0, "repeat_count": 0.0, "routers_loss": 0.014033539220690727, "skip_count": 0.0, "step": 454, "text_loss": 0.4745742678642273 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.00091, "loss": 0.0651, "macro_f1": 0.3333333432674408, "num_tokens": 735351.0, "repeat_count": 0.0, "routers_loss": 0.0071774693205952644, "skip_count": 0.0, "step": 456, "text_loss": 0.18523462116718292 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.150278837687115, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.400390625, "learning_rate": 0.0009140000000000001, "loss": 0.0738, "macro_f1": 0.5492662787437439, "num_tokens": 738587.0, "repeat_count": 0.0, "routers_loss": 0.07781517505645752, "skip_count": 2.0, "step": 458, "text_loss": 0.3459635376930237 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.1596712650425594, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28125, "learning_rate": 0.0009180000000000001, "loss": 0.0723, "macro_f1": 0.3076923191547394, "num_tokens": 741779.0, "repeat_count": 0.0, "routers_loss": 0.09529037028551102, "skip_count": 2.0, "step": 460, "text_loss": 0.20197433233261108 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1690636923980042, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1865234375, "learning_rate": 0.0009220000000000001, "loss": 0.0519, "macro_f1": 0.3333333432674408, "num_tokens": 745355.0, "repeat_count": 0.0, "routers_loss": 0.009765669703483582, "skip_count": 0.0, "step": 462, "text_loss": 0.7031404376029968 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1784561197534487, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0009260000000000001, "loss": 0.0527, "macro_f1": 0.3272727429866791, "num_tokens": 748628.0, "repeat_count": 0.0, "routers_loss": 0.03344850242137909, "skip_count": 1.0, "step": 464, "text_loss": 0.21274663507938385 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1878485471088935, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.00093, "loss": 0.0534, "macro_f1": 0.3076923191547394, "num_tokens": 751472.0, "repeat_count": 2.0, "routers_loss": 0.1354292333126068, "skip_count": 2.0, "step": 466, "text_loss": 0.5350717306137085 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.197240974464338, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.000934, "loss": 0.0598, "macro_f1": 0.3272727429866791, "num_tokens": 754479.0, "repeat_count": 0.0, "routers_loss": 0.056420840322971344, "skip_count": 1.0, "step": 468, "text_loss": 0.28153330087661743 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.206633401819783, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.234375, "learning_rate": 0.0009379999999999999, "loss": 0.0597, "macro_f1": 0.31446540355682373, "num_tokens": 757872.0, "repeat_count": 1.0, "routers_loss": 0.1622387170791626, "skip_count": 1.0, "step": 470, "text_loss": 0.22956843674182892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2160258291752273, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5, "learning_rate": 0.000942, "loss": 0.0953, "macro_f1": 0.32098764181137085, "num_tokens": 760468.0, "repeat_count": 0.0, "routers_loss": 0.05146972835063934, "skip_count": 2.0, "step": 472, "text_loss": 0.4513966739177704 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.225418256530672, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.000946, "loss": 0.0592, "macro_f1": 0.3272727429866791, "num_tokens": 763519.0, "repeat_count": 1.0, "routers_loss": 0.09022669494152069, "skip_count": 0.0, "step": 474, "text_loss": 0.25758957862854004 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.234810683886117, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.00095, "loss": 0.0498, "macro_f1": 0.3272727429866791, "num_tokens": 767391.0, "repeat_count": 0.0, "routers_loss": 0.03044828027486801, "skip_count": 1.0, "step": 476, "text_loss": 0.21366681158542633 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2442031112415615, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.291015625, "learning_rate": 0.000954, "loss": 0.0802, "macro_f1": 0.3272727429866791, "num_tokens": 770338.0, "repeat_count": 0.0, "routers_loss": 0.10397060960531235, "skip_count": 1.0, "step": 478, "text_loss": 1.0396177768707275 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.2535955385970063, "f1_execute": 0.8571429252624512, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000958, "loss": 0.1099, "macro_f1": 0.285714328289032, "num_tokens": 773699.0, "repeat_count": 2.0, "routers_loss": 0.22604143619537354, "skip_count": 4.0, "step": 480, "text_loss": 0.2570283114910126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.2629879659524508, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.000962, "loss": 0.0667, "macro_f1": 0.32098767161369324, "num_tokens": 777473.0, "repeat_count": 0.0, "routers_loss": 0.048258859664201736, "skip_count": 1.0, "step": 482, "text_loss": 0.2540103495121002 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2723803933078957, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.197265625, "learning_rate": 0.000966, "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 780833.0, "repeat_count": 0.0, "routers_loss": 0.023018671199679375, "skip_count": 0.0, "step": 484, "text_loss": 0.38524550199508667 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.28177282066334, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.314453125, "learning_rate": 0.0009699999999999999, "loss": 0.0709, "macro_f1": 0.3272727429866791, "num_tokens": 783656.0, "repeat_count": 0.0, "routers_loss": 0.044845327734947205, "skip_count": 1.0, "step": 486, "text_loss": 0.5859048366546631 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000974, "loss": 0.0615, "macro_f1": 0.3333333432674408, "num_tokens": 787173.0, "repeat_count": 0.0, "routers_loss": 0.010898692533373833, "skip_count": 0.0, "step": 488, "text_loss": 0.3456067442893982 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3005576753742294, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000978, "loss": 0.0796, "macro_f1": 0.32098764181137085, "num_tokens": 790395.0, "repeat_count": 0.0, "routers_loss": 0.06497956812381744, "skip_count": 2.0, "step": 490, "text_loss": 0.3751123249530792 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3099501027296743, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.000982, "loss": 0.0772, "macro_f1": 0.3272727429866791, "num_tokens": 793137.0, "repeat_count": 0.0, "routers_loss": 0.07763728499412537, "skip_count": 1.0, "step": 492, "text_loss": 0.43296709656715393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3193425300851187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009860000000000001, "loss": 0.0819, "macro_f1": 0.3333333432674408, "num_tokens": 796497.0, "repeat_count": 0.0, "routers_loss": 0.02127906307578087, "skip_count": 0.0, "step": 494, "text_loss": 0.4841311275959015 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3287349574405636, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2138671875, "learning_rate": 0.00099, "loss": 0.073, "macro_f1": 0.3272727429866791, "num_tokens": 799361.0, "repeat_count": 1.0, "routers_loss": 0.09518691152334213, "skip_count": 0.0, "step": 496, "text_loss": 0.5094487071037292 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.3381273847960085, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.130859375, "learning_rate": 0.000994, "loss": 0.0789, "macro_f1": 0.5492662787437439, "num_tokens": 802629.0, "repeat_count": 0.0, "routers_loss": 0.0563947930932045, "skip_count": 2.0, "step": 498, "text_loss": 0.42783617973327637 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.347519812151453, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1865234375, "learning_rate": 0.000998, "loss": 0.0476, "macro_f1": 0.3272727429866791, "num_tokens": 805881.0, "repeat_count": 1.0, "routers_loss": 0.10570426285266876, "skip_count": 0.0, "step": 500, "text_loss": 0.28395503759384155 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.3569122395068973, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2275390625, "learning_rate": 0.0009999999760498814, "loss": 0.0849, "macro_f1": 0.5492662787437439, "num_tokens": 809283.0, "repeat_count": 0.0, "routers_loss": 0.031202208250761032, "skip_count": 2.0, "step": 502, "text_loss": 0.32970911264419556 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.366304666862342, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0009999997844489475, "loss": 0.0574, "macro_f1": 0.3272727429866791, "num_tokens": 812440.0, "repeat_count": 0.0, "routers_loss": 0.07647835463285446, "skip_count": 1.0, "step": 504, "text_loss": 0.4901447296142578 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.375697094217787, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25, "learning_rate": 0.000999999401247153, "loss": 0.0668, "macro_f1": 0.32098764181137085, "num_tokens": 815716.0, "repeat_count": 0.0, "routers_loss": 0.08515176922082901, "skip_count": 2.0, "step": 506, "text_loss": 0.6157599687576294 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3850895215732315, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25390625, "learning_rate": 0.0009999988264446445, "loss": 0.0686, "macro_f1": 0.3333333432674408, "num_tokens": 819086.0, "repeat_count": 0.0, "routers_loss": 0.00946938619017601, "skip_count": 0.0, "step": 508, "text_loss": 0.5053519010543823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3944819489286764, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1640625, "learning_rate": 0.0009999980600416424, "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 822268.0, "repeat_count": 0.0, "routers_loss": 0.01058756373822689, "skip_count": 0.0, "step": 510, "text_loss": 0.5570021867752075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.403874376284121, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.000999997102038441, "loss": 0.0678, "macro_f1": 0.3333333432674408, "num_tokens": 825728.0, "repeat_count": 0.0, "routers_loss": 0.008705209009349346, "skip_count": 0.0, "step": 512, "text_loss": 0.6519040465354919 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.4132668036395657, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.220703125, "learning_rate": 0.0009999959524354064, "loss": 0.083, "macro_f1": 0.3272727429866791, "num_tokens": 829459.0, "repeat_count": 0.0, "routers_loss": 0.04024193435907364, "skip_count": 1.0, "step": 514, "text_loss": 0.5290043950080872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25390625, "learning_rate": 0.00099999461123298, "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 832291.0, "repeat_count": 0.0, "routers_loss": 0.015742862597107887, "skip_count": 0.0, "step": 516, "text_loss": 0.7910057902336121 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.432051658350455, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.000999993078431675, "loss": 0.0759, "macro_f1": 0.3076923191547394, "num_tokens": 835399.0, "repeat_count": 1.0, "routers_loss": 0.16753782331943512, "skip_count": 3.0, "step": 518, "text_loss": 0.45196083188056946 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.4414440857058994, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.236328125, "learning_rate": 0.0009999913540320792, "loss": 0.0968, "macro_f1": 0.31446540355682373, "num_tokens": 838993.0, "repeat_count": 0.0, "routers_loss": 0.09357143193483353, "skip_count": 2.0, "step": 520, "text_loss": 0.5499435663223267 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.4508365130613443, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2451171875, "learning_rate": 0.0009999894380348536, "loss": 0.0821, "macro_f1": 0.5492662787437439, "num_tokens": 842652.0, "repeat_count": 0.0, "routers_loss": 0.056803856045007706, "skip_count": 2.0, "step": 522, "text_loss": 0.197520449757576 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 2.4602289404167887, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.2333984375, "learning_rate": 0.000999987330440732, "loss": 0.0725, "macro_f1": 0.4871794879436493, "num_tokens": 847061.0, "repeat_count": 0.0, "routers_loss": 0.08962195366621017, "skip_count": 3.0, "step": 524, "text_loss": 0.27509039640426636 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.4696213677722336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000999985031250522, "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 850780.0, "repeat_count": 0.0, "routers_loss": 0.022930558770895004, "skip_count": 0.0, "step": 526, "text_loss": 0.13291706144809723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.4790137951276785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.197265625, "learning_rate": 0.0009999825404651053, "loss": 0.0614, "macro_f1": 0.3333333432674408, "num_tokens": 853886.0, "repeat_count": 0.0, "routers_loss": 0.017097990959882736, "skip_count": 0.0, "step": 528, "text_loss": 0.21706295013427734 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.0009999798580854356, "loss": 0.0724, "macro_f1": 0.3333333432674408, "num_tokens": 857364.0, "repeat_count": 0.0, "routers_loss": 0.02831801027059555, "skip_count": 0.0, "step": 530, "text_loss": 0.9035662412643433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.497798649838568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.000999976984112541, "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 860661.0, "repeat_count": 0.0, "routers_loss": 0.019671892747282982, "skip_count": 0.0, "step": 532, "text_loss": 0.8354863524436951 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 2.5071910771940122, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.2890625, "learning_rate": 0.0009999739185475231, "loss": 0.0963, "macro_f1": 0.47333335876464844, "num_tokens": 864124.0, "repeat_count": 2.0, "routers_loss": 0.21383361518383026, "skip_count": 3.0, "step": 534, "text_loss": 0.23422949016094208 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.516583504549457, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.0009999706613915565, "loss": 0.0598, "macro_f1": 0.32098767161369324, "num_tokens": 866976.0, "repeat_count": 0.0, "routers_loss": 0.07158871740102768, "skip_count": 1.0, "step": 536, "text_loss": 0.11800774186849594 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.5259759319049016, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26953125, "learning_rate": 0.0009999672126458894, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 870549.0, "repeat_count": 0.0, "routers_loss": 0.08185924589633942, "skip_count": 1.0, "step": 538, "text_loss": 0.19232480227947235 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.5353683592603464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.000999963572311843, "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 873733.0, "repeat_count": 0.0, "routers_loss": 0.01633382774889469, "skip_count": 0.0, "step": 540, "text_loss": 0.3725031912326813 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.544760786615791, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0009999597403908128, "loss": 0.0761, "macro_f1": 0.3272727429866791, "num_tokens": 877099.0, "repeat_count": 0.0, "routers_loss": 0.0782657191157341, "skip_count": 1.0, "step": 542, "text_loss": 0.17589199542999268 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.5541532139712357, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2177734375, "learning_rate": 0.0009999557168842669, "loss": 0.0716, "macro_f1": 0.5492662787437439, "num_tokens": 879883.0, "repeat_count": 0.0, "routers_loss": 0.05275818333029747, "skip_count": 2.0, "step": 544, "text_loss": 0.26448264718055725 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.56354564132668, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.0009999515017937468, "loss": 0.071, "macro_f1": 0.32098764181137085, "num_tokens": 882223.0, "repeat_count": 0.0, "routers_loss": 0.09335892647504807, "skip_count": 2.0, "step": 546, "text_loss": 0.208544060587883 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.572938068682125, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.376953125, "learning_rate": 0.0009999470951208684, "loss": 0.0855, "macro_f1": 0.32098764181137085, "num_tokens": 885241.0, "repeat_count": 2.0, "routers_loss": 0.22983254492282867, "skip_count": 0.0, "step": 548, "text_loss": 0.6612338423728943 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.58233049603757, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.00099994249686732, "loss": 0.0786, "macro_f1": 0.3272727429866791, "num_tokens": 887897.0, "repeat_count": 1.0, "routers_loss": 0.12858282029628754, "skip_count": 0.0, "step": 550, "text_loss": 0.4673548936843872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.5917229233930144, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0009999377070348638, "loss": 0.0944, "macro_f1": 0.3333333432674408, "num_tokens": 891224.0, "repeat_count": 0.0, "routers_loss": 0.017421770840883255, "skip_count": 0.0, "step": 552, "text_loss": 0.6419258117675781 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.601115350748459, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000999932725625335, "loss": 0.0791, "macro_f1": 0.32098764181137085, "num_tokens": 894578.0, "repeat_count": 0.0, "routers_loss": 0.07890026271343231, "skip_count": 2.0, "step": 554, "text_loss": 0.5970752239227295 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.6105077781039037, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.0009999275526406427, "loss": 0.0796, "macro_f1": 0.31446540355682373, "num_tokens": 897145.0, "repeat_count": 1.0, "routers_loss": 0.09836960583925247, "skip_count": 1.0, "step": 556, "text_loss": 0.752425491809845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6199002054593485, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1875, "learning_rate": 0.0009999221880827693, "loss": 0.0882, "macro_f1": 0.3333333432674408, "num_tokens": 900565.0, "repeat_count": 0.0, "routers_loss": 0.017694659531116486, "skip_count": 0.0, "step": 558, "text_loss": 0.195619136095047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.629292632814793, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2021484375, "learning_rate": 0.0009999166319537703, "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 903506.0, "repeat_count": 0.0, "routers_loss": 0.019375264644622803, "skip_count": 0.0, "step": 560, "text_loss": 0.4603337347507477 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 2.638685060170238, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.146484375, "learning_rate": 0.0009999108842557748, "loss": 0.0953, "macro_f1": 0.4871794879436493, "num_tokens": 906380.0, "repeat_count": 0.0, "routers_loss": 0.12013207376003265, "skip_count": 3.0, "step": 562, "text_loss": 0.6279402375221252 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6480774875256823, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.0009999049449909854, "loss": 0.0799, "macro_f1": 0.3272727429866791, "num_tokens": 909116.0, "repeat_count": 0.0, "routers_loss": 0.06441342830657959, "skip_count": 1.0, "step": 564, "text_loss": 0.23741699755191803 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.657469914881127, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0009998988141616781, "loss": 0.064, "macro_f1": 0.32098767161369324, "num_tokens": 912189.0, "repeat_count": 0.0, "routers_loss": 0.08309414982795715, "skip_count": 1.0, "step": 566, "text_loss": 0.27780941128730774 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6668623422365716, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009998924917702023, "loss": 0.0876, "macro_f1": 0.3272727429866791, "num_tokens": 916279.0, "repeat_count": 1.0, "routers_loss": 0.07197169959545135, "skip_count": 0.0, "step": 568, "text_loss": 0.6371755599975586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6762547695920165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2255859375, "learning_rate": 0.0009998859778189806, "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 919490.0, "repeat_count": 0.0, "routers_loss": 0.008022273890674114, "skip_count": 0.0, "step": 570, "text_loss": 0.6028938889503479 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6856471969474613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.000999879272310509, "loss": 0.084, "macro_f1": 0.3333333432674408, "num_tokens": 923694.0, "repeat_count": 0.0, "routers_loss": 0.01634674146771431, "skip_count": 0.0, "step": 572, "text_loss": 0.7177054286003113 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.695039624302906, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.0009998723752473574, "loss": 0.0716, "macro_f1": 0.3272727429866791, "num_tokens": 926933.0, "repeat_count": 0.0, "routers_loss": 0.060559045523405075, "skip_count": 1.0, "step": 574, "text_loss": 0.5203254818916321 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.185546875, "learning_rate": 0.0009998652866321687, "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 929832.0, "repeat_count": 0.0, "routers_loss": 0.011485611088573933, "skip_count": 0.0, "step": 576, "text_loss": 0.6147452592849731 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.713824479013795, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.000999858006467659, "loss": 0.0649, "macro_f1": 0.29333335161209106, "num_tokens": 933266.0, "repeat_count": 2.0, "routers_loss": 0.2929030954837799, "skip_count": 4.0, "step": 578, "text_loss": 0.1720666140317917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.72321690636924, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.24609375, "learning_rate": 0.0009998505347566186, "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 937545.0, "repeat_count": 0.0, "routers_loss": 0.053780000656843185, "skip_count": 2.0, "step": 580, "text_loss": 0.3258405327796936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7326093337246844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.00099984287150191, "loss": 0.0582, "macro_f1": 0.3333333432674408, "num_tokens": 941001.0, "repeat_count": 0.0, "routers_loss": 0.02637636847794056, "skip_count": 0.0, "step": 582, "text_loss": 0.23762771487236023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7420017610801293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009998350167064705, "loss": 0.0672, "macro_f1": 0.3333333432674408, "num_tokens": 943989.0, "repeat_count": 0.0, "routers_loss": 0.01637580618262291, "skip_count": 0.0, "step": 584, "text_loss": 0.7460582852363586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7513941884355737, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0009998269703733096, "loss": 0.0686, "macro_f1": 0.3272727429866791, "num_tokens": 947245.0, "repeat_count": 1.0, "routers_loss": 0.13934117555618286, "skip_count": 0.0, "step": 586, "text_loss": 0.5284690260887146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7607866157910186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.13671875, "learning_rate": 0.0009998187325055106, "loss": 0.0667, "macro_f1": 0.3333333432674408, "num_tokens": 950116.0, "repeat_count": 0.0, "routers_loss": 0.02138397842645645, "skip_count": 0.0, "step": 588, "text_loss": 0.3920256197452545 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0009998103031062305, "loss": 0.0778, "macro_f1": 0.3333333432674408, "num_tokens": 953277.0, "repeat_count": 0.0, "routers_loss": 0.007098200265318155, "skip_count": 0.0, "step": 590, "text_loss": 0.7472905516624451 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.779571470501908, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.318359375, "learning_rate": 0.0009998016821786994, "loss": 0.0872, "macro_f1": 0.32098764181137085, "num_tokens": 958229.0, "repeat_count": 1.0, "routers_loss": 0.07946522533893585, "skip_count": 1.0, "step": 592, "text_loss": 0.5506448745727539 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.7889638978573528, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.000999792869726221, "loss": 0.0523, "macro_f1": 0.3272727429866791, "num_tokens": 961016.0, "repeat_count": 0.0, "routers_loss": 0.0850791186094284, "skip_count": 1.0, "step": 594, "text_loss": 0.3824431002140045 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.0009997838657521717, "loss": 0.0632, "macro_f1": 0.3333333432674408, "num_tokens": 963847.0, "repeat_count": 0.0, "routers_loss": 0.016370445489883423, "skip_count": 0.0, "step": 596, "text_loss": 0.2139475792646408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.8077487525682416, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009997746702600026, "loss": 0.0702, "macro_f1": 0.307692289352417, "num_tokens": 966619.0, "repeat_count": 0.0, "routers_loss": 0.1310746818780899, "skip_count": 3.0, "step": 598, "text_loss": 0.3651018440723419 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8171411799236865, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23828125, "learning_rate": 0.0009997652832532372, "loss": 0.0792, "macro_f1": 0.3272727429866791, "num_tokens": 970418.0, "repeat_count": 1.0, "routers_loss": 0.14303378760814667, "skip_count": 0.0, "step": 600, "text_loss": 0.7094736099243164 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8265336072791314, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009997557047354722, "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 973491.0, "repeat_count": 0.0, "routers_loss": 0.03334212675690651, "skip_count": 1.0, "step": 602, "text_loss": 0.4812237024307251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.835926034634576, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.0009997459347103783, "loss": 0.0956, "macro_f1": 0.3272727429866791, "num_tokens": 976672.0, "repeat_count": 0.0, "routers_loss": 0.02831871062517166, "skip_count": 0.0, "step": 604, "text_loss": 0.21737146377563477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8453184619900207, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0009997359731816998, "loss": 0.0646, "macro_f1": 0.3333333432674408, "num_tokens": 979898.0, "repeat_count": 0.0, "routers_loss": 0.017968013882637024, "skip_count": 0.0, "step": 606, "text_loss": 0.5458008050918579 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.854710889345465, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.224609375, "learning_rate": 0.0009997258201532536, "loss": 0.0751, "macro_f1": 0.3333333432674408, "num_tokens": 982811.0, "repeat_count": 0.0, "routers_loss": 0.016256732866168022, "skip_count": 0.0, "step": 608, "text_loss": 0.8643257021903992 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0009997154756289303, "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 985245.0, "repeat_count": 0.0, "routers_loss": 0.021214161068201065, "skip_count": 0.0, "step": 610, "text_loss": 0.2204967886209488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8734957440563544, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.000999704939612694, "loss": 0.0636, "macro_f1": 0.3006536364555359, "num_tokens": 988539.0, "repeat_count": 3.0, "routers_loss": 0.23249399662017822, "skip_count": 2.0, "step": 612, "text_loss": 0.32489025592803955 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8828881714117993, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009996942121085824, "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 991660.0, "repeat_count": 0.0, "routers_loss": 0.010706410743296146, "skip_count": 0.0, "step": 614, "text_loss": 0.4551754891872406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8922805987672437, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3671875, "learning_rate": 0.000999683293120706, "loss": 0.1016, "macro_f1": 0.3333333432674408, "num_tokens": 994828.0, "repeat_count": 0.0, "routers_loss": 0.006676184479147196, "skip_count": 0.0, "step": 616, "text_loss": 0.6212068200111389 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9016730261226886, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.408203125, "learning_rate": 0.0009996721826532491, "loss": 0.0976, "macro_f1": 0.3076923191547394, "num_tokens": 997951.0, "repeat_count": 2.0, "routers_loss": 0.2148125320672989, "skip_count": 2.0, "step": 618, "text_loss": 0.26514527201652527 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.911065453478133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1904296875, "learning_rate": 0.000999660880710469, "loss": 0.0909, "macro_f1": 0.3333333432674408, "num_tokens": 1001139.0, "repeat_count": 0.0, "routers_loss": 0.022332455962896347, "skip_count": 0.0, "step": 620, "text_loss": 0.26131340861320496 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.920457880833578, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009996493872966971, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 1003678.0, "repeat_count": 1.0, "routers_loss": 0.08348730951547623, "skip_count": 0.0, "step": 622, "text_loss": 0.19151706993579865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.0009996377024163374, "loss": 0.0822, "macro_f1": 0.3333333432674408, "num_tokens": 1007082.0, "repeat_count": 0.0, "routers_loss": 0.028577150776982307, "skip_count": 0.0, "step": 624, "text_loss": 0.305387407541275 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9392427355444672, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.0009996258260738676, "loss": 0.0892, "macro_f1": 0.3272727429866791, "num_tokens": 1010064.0, "repeat_count": 1.0, "routers_loss": 0.08312026411294937, "skip_count": 0.0, "step": 626, "text_loss": 0.49436143040657043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9486351628999117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009996137582738388, "loss": 0.0591, "macro_f1": 0.3333333432674408, "num_tokens": 1013462.0, "repeat_count": 0.0, "routers_loss": 0.013337327167391777, "skip_count": 0.0, "step": 628, "text_loss": 0.6515294313430786 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9580275902553566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.000999601499020875, "loss": 0.0537, "macro_f1": 0.3333333432674408, "num_tokens": 1016246.0, "repeat_count": 0.0, "routers_loss": 0.029126765206456184, "skip_count": 0.0, "step": 630, "text_loss": 0.18834827840328217 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9674200176108014, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009995890483196746, "loss": 0.0602, "macro_f1": 0.3272727429866791, "num_tokens": 1019286.0, "repeat_count": 0.0, "routers_loss": 0.054844800382852554, "skip_count": 1.0, "step": 632, "text_loss": 0.6988179087638855 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.322265625, "learning_rate": 0.0009995764061750086, "loss": 0.0767, "macro_f1": 0.3333333432674408, "num_tokens": 1022207.0, "repeat_count": 0.0, "routers_loss": 0.010095693171024323, "skip_count": 0.0, "step": 634, "text_loss": 0.558451771736145 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9862048723216907, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.000999563572591721, "loss": 0.0521, "macro_f1": 0.32098764181137085, "num_tokens": 1025319.0, "repeat_count": 1.0, "routers_loss": 0.0698433518409729, "skip_count": 1.0, "step": 636, "text_loss": 0.5961872935295105 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.995597299677135, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11083984375, "learning_rate": 0.0009995505475747302, "loss": 0.0849, "macro_f1": 0.3272727429866791, "num_tokens": 1028362.0, "repeat_count": 0.0, "routers_loss": 0.040211405605077744, "skip_count": 1.0, "step": 638, "text_loss": 0.546863317489624 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.004696213677722, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.119140625, "learning_rate": 0.0009995373311290272, "loss": 0.0709, "macro_f1": 0.3144654333591461, "num_tokens": 1032199.0, "repeat_count": 2.0, "routers_loss": 0.1457643061876297, "skip_count": 1.0, "step": 640, "text_loss": 0.2137298285961151 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0009995239232596764, "loss": 0.0545, "macro_f1": 0.3333333432674408, "num_tokens": 1035801.0, "repeat_count": 0.0, "routers_loss": 0.011394930072128773, "skip_count": 0.0, "step": 642, "text_loss": 0.43054503202438354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0234810683886115, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0009995103239718163, "loss": 0.0665, "macro_f1": 0.3333333432674408, "num_tokens": 1039223.0, "repeat_count": 0.0, "routers_loss": 0.00997432041913271, "skip_count": 0.0, "step": 644, "text_loss": 0.7749615907669067 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0328734957440564, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0009994965332706573, "loss": 0.0755, "macro_f1": 0.3144654333591461, "num_tokens": 1042154.0, "repeat_count": 3.0, "routers_loss": 0.10589150339365005, "skip_count": 0.0, "step": 646, "text_loss": 0.7812211513519287 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.042265923099501, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.0009994825511614846, "loss": 0.0383, "macro_f1": 0.3272727429866791, "num_tokens": 1045250.0, "repeat_count": 0.0, "routers_loss": 0.0748734176158905, "skip_count": 1.0, "step": 648, "text_loss": 0.844803512096405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0516583504549457, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1220703125, "learning_rate": 0.0009994683776496562, "loss": 0.0433, "macro_f1": 0.3272727429866791, "num_tokens": 1048446.0, "repeat_count": 0.0, "routers_loss": 0.03742415830492973, "skip_count": 1.0, "step": 650, "text_loss": 0.2098839282989502 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0610507778103906, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009994540127406034, "loss": 0.0591, "macro_f1": 0.32098764181137085, "num_tokens": 1051840.0, "repeat_count": 0.0, "routers_loss": 0.06025516986846924, "skip_count": 2.0, "step": 652, "text_loss": 0.27727583050727844 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.070443205165835, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.181640625, "learning_rate": 0.0009994394564398306, "loss": 0.0519, "macro_f1": 0.521541953086853, "num_tokens": 1055142.0, "repeat_count": 4.0, "routers_loss": 0.22807340323925018, "skip_count": 2.0, "step": 654, "text_loss": 0.9672397971153259 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.0009994247087529158, "loss": 0.0618, "macro_f1": 0.3333333432674408, "num_tokens": 1057698.0, "repeat_count": 0.0, "routers_loss": 0.01348950993269682, "skip_count": 0.0, "step": 656, "text_loss": 0.6375506520271301 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.0892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.0009994097696855106, "loss": 0.0412, "macro_f1": 0.3333333432674408, "num_tokens": 1060624.0, "repeat_count": 0.0, "routers_loss": 0.009649243205785751, "skip_count": 0.0, "step": 658, "text_loss": 0.5315385460853577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.098620487232169, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2041015625, "learning_rate": 0.0009993946392433395, "loss": 0.0609, "macro_f1": 0.307692289352417, "num_tokens": 1065076.0, "repeat_count": 0.0, "routers_loss": 0.1250980943441391, "skip_count": 3.0, "step": 660, "text_loss": 0.25780341029167175 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1080129145876136, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1640625, "learning_rate": 0.0009993793174322006, "loss": 0.0471, "macro_f1": 0.3333333432674408, "num_tokens": 1068365.0, "repeat_count": 0.0, "routers_loss": 0.011544390581548214, "skip_count": 0.0, "step": 662, "text_loss": 0.34876301884651184 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1174053419430585, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009993638042579654, "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1071693.0, "repeat_count": 0.0, "routers_loss": 0.03777370601892471, "skip_count": 1.0, "step": 664, "text_loss": 0.21811571717262268 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.126797769298503, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.203125, "learning_rate": 0.0009993480997265783, "loss": 0.0475, "macro_f1": 0.5492662787437439, "num_tokens": 1074733.0, "repeat_count": 0.0, "routers_loss": 0.049949806183576584, "skip_count": 2.0, "step": 666, "text_loss": 0.38410288095474243 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.136190196653948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10302734375, "learning_rate": 0.0009993322038440572, "loss": 0.0605, "macro_f1": 0.3333333432674408, "num_tokens": 1077993.0, "repeat_count": 0.0, "routers_loss": 0.0247171800583601, "skip_count": 0.0, "step": 668, "text_loss": 0.25576895475387573 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1455826240093923, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.000999316116616494, "loss": 0.0619, "macro_f1": 0.3333333432674408, "num_tokens": 1080491.0, "repeat_count": 0.0, "routers_loss": 0.008118715137243271, "skip_count": 0.0, "step": 670, "text_loss": 0.6269792914390564 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.154975051364837, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.0009992998380500527, "loss": 0.0462, "macro_f1": 0.3272727429866791, "num_tokens": 1083817.0, "repeat_count": 0.0, "routers_loss": 0.03366057574748993, "skip_count": 1.0, "step": 672, "text_loss": 0.26891493797302246 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1643674787202816, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009992833681509716, "loss": 0.0529, "macro_f1": 0.3333333432674408, "num_tokens": 1087368.0, "repeat_count": 0.0, "routers_loss": 0.020552074536681175, "skip_count": 0.0, "step": 674, "text_loss": 0.14421936869621277 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.1737599060757264, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.18359375, "learning_rate": 0.0009992667069255619, "loss": 0.0696, "macro_f1": 0.31446540355682373, "num_tokens": 1090452.0, "repeat_count": 0.0, "routers_loss": 0.06937336176633835, "skip_count": 2.0, "step": 676, "text_loss": 0.24999259412288666 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1831523334311713, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.0009992498543802085, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 1093996.0, "repeat_count": 1.0, "routers_loss": 0.0380021296441555, "skip_count": 0.0, "step": 678, "text_loss": 0.42473849654197693 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 3.1925447607866158, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.2119140625, "learning_rate": 0.0009992328105213688, "loss": 0.0411, "macro_f1": 0.4400000274181366, "num_tokens": 1096837.0, "repeat_count": 1.0, "routers_loss": 0.20885063707828522, "skip_count": 4.0, "step": 680, "text_loss": 0.3829527199268341 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.2019371881420606, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1474609375, "learning_rate": 0.0009992155753555747, "loss": 0.0722, "macro_f1": 0.5492662787437439, "num_tokens": 1100320.0, "repeat_count": 0.0, "routers_loss": 0.018230699002742767, "skip_count": 2.0, "step": 682, "text_loss": 0.6190969944000244 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.211329615497505, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30859375, "learning_rate": 0.0009991981488894303, "loss": 0.0681, "macro_f1": 0.32098767161369324, "num_tokens": 1103682.0, "repeat_count": 0.0, "routers_loss": 0.05550144240260124, "skip_count": 1.0, "step": 684, "text_loss": 0.44418027997016907 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.22072204285295, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2158203125, "learning_rate": 0.0009991805311296133, "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1106427.0, "repeat_count": 0.0, "routers_loss": 0.07990608364343643, "skip_count": 2.0, "step": 686, "text_loss": 0.5577231645584106 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2301144702083944, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009991627220828753, "loss": 0.0568, "macro_f1": 0.32098764181137085, "num_tokens": 1109314.0, "repeat_count": 0.0, "routers_loss": 0.05167485028505325, "skip_count": 2.0, "step": 688, "text_loss": 0.27325430512428284 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.2395068975638392, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10693359375, "learning_rate": 0.0009991447217560408, "loss": 0.0521, "macro_f1": 0.5492662787437439, "num_tokens": 1112748.0, "repeat_count": 0.0, "routers_loss": 0.04621964320540428, "skip_count": 2.0, "step": 690, "text_loss": 0.5288321375846863 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.2488993249192837, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.000999126530156007, "loss": 0.0499, "macro_f1": 0.307692289352417, "num_tokens": 1116965.0, "repeat_count": 1.0, "routers_loss": 0.11950276792049408, "skip_count": 2.0, "step": 692, "text_loss": 0.14215624332427979 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2582917522747286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.0009991081472897454, "loss": 0.0722, "macro_f1": 0.3333333432674408, "num_tokens": 1120570.0, "repeat_count": 0.0, "routers_loss": 0.01905500330030918, "skip_count": 0.0, "step": 694, "text_loss": 0.41862696409225464 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.267684179630173, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.0009990895731643002, "loss": 0.0464, "macro_f1": 0.3272727429866791, "num_tokens": 1124009.0, "repeat_count": 1.0, "routers_loss": 0.06974572688341141, "skip_count": 0.0, "step": 696, "text_loss": 0.41160130500793457 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.277076606985618, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.000999070807786789, "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 1127370.0, "repeat_count": 1.0, "routers_loss": 0.07055293023586273, "skip_count": 0.0, "step": 698, "text_loss": 0.48068273067474365 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2864690343410627, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.197265625, "learning_rate": 0.000999051851164403, "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1130234.0, "repeat_count": 1.0, "routers_loss": 0.12506946921348572, "skip_count": 1.0, "step": 700, "text_loss": 0.47925490140914917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1943359375, "learning_rate": 0.000999032703304406, "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 1132874.0, "repeat_count": 0.0, "routers_loss": 0.00809287466108799, "skip_count": 0.0, "step": 702, "text_loss": 0.47433632612228394 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.305253889051952, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1064453125, "learning_rate": 0.0009990133642141358, "loss": 0.0497, "macro_f1": 0.5492662787437439, "num_tokens": 1136011.0, "repeat_count": 0.0, "routers_loss": 0.0319170281291008, "skip_count": 2.0, "step": 704, "text_loss": 0.6574832201004028 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3146463164073965, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.33984375, "learning_rate": 0.000998993833901003, "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1139674.0, "repeat_count": 0.0, "routers_loss": 0.09850362688302994, "skip_count": 2.0, "step": 706, "text_loss": 0.7660127282142639 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3240387437628414, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12158203125, "learning_rate": 0.0009989741123724919, "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 1143558.0, "repeat_count": 0.0, "routers_loss": 0.006673311349004507, "skip_count": 0.0, "step": 708, "text_loss": 0.5976111888885498 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.333431171118286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009989541996361594, "loss": 0.045, "macro_f1": 0.3333333432674408, "num_tokens": 1146122.0, "repeat_count": 0.0, "routers_loss": 0.004988791421055794, "skip_count": 0.0, "step": 710, "text_loss": 0.5256119966506958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3428235984737307, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1044921875, "learning_rate": 0.0009989340956996367, "loss": 0.0528, "macro_f1": 0.3333333432674408, "num_tokens": 1149546.0, "repeat_count": 0.0, "routers_loss": 0.0067769973538815975, "skip_count": 0.0, "step": 712, "text_loss": 0.5040497779846191 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.352216025829175, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26953125, "learning_rate": 0.0009989138005706273, "loss": 0.0735, "macro_f1": 0.32098764181137085, "num_tokens": 1153195.0, "repeat_count": 0.0, "routers_loss": 0.09899546951055527, "skip_count": 2.0, "step": 714, "text_loss": 0.20803412795066833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.000998893314256908, "loss": 0.064, "macro_f1": 0.3333333432674408, "num_tokens": 1157081.0, "repeat_count": 0.0, "routers_loss": 0.010492355562746525, "skip_count": 0.0, "step": 716, "text_loss": 0.23077639937400818 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3710008805400644, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0009988726367663298, "loss": 0.0539, "macro_f1": 0.3333333432674408, "num_tokens": 1160079.0, "repeat_count": 0.0, "routers_loss": 0.01063773687928915, "skip_count": 0.0, "step": 718, "text_loss": 0.6085864901542664 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3803933078955093, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1640625, "learning_rate": 0.0009988517681068163, "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1163249.0, "repeat_count": 1.0, "routers_loss": 0.05981874838471413, "skip_count": 0.0, "step": 720, "text_loss": 0.4047050476074219 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3897857352509537, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.0009988307082863638, "loss": 0.0361, "macro_f1": 0.3333333432674408, "num_tokens": 1166259.0, "repeat_count": 0.0, "routers_loss": 0.009750043973326683, "skip_count": 0.0, "step": 722, "text_loss": 0.5306474566459656 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.3991781626063986, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.240234375, "learning_rate": 0.0009988094573130434, "loss": 0.063, "macro_f1": 0.5359477400779724, "num_tokens": 1168887.0, "repeat_count": 2.0, "routers_loss": 0.18601104617118835, "skip_count": 2.0, "step": 724, "text_loss": 0.53528892993927 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.408570589961843, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009987880151949974, "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1172625.0, "repeat_count": 0.0, "routers_loss": 0.02845010720193386, "skip_count": 1.0, "step": 726, "text_loss": 0.4760453701019287 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.417963017317288, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2177734375, "learning_rate": 0.0009987663819404434, "loss": 0.06, "macro_f1": 0.5492662787437439, "num_tokens": 1176580.0, "repeat_count": 0.0, "routers_loss": 0.017596980556845665, "skip_count": 2.0, "step": 728, "text_loss": 0.5146099328994751 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.427355444672733, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1318359375, "learning_rate": 0.000998744557557671, "loss": 0.0484, "macro_f1": 0.3272727429866791, "num_tokens": 1179804.0, "repeat_count": 0.0, "routers_loss": 0.0625474750995636, "skip_count": 1.0, "step": 730, "text_loss": 0.27738022804260254 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.436747872028177, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.203125, "learning_rate": 0.0009987225420550433, "loss": 0.0796, "macro_f1": 0.307692289352417, "num_tokens": 1182658.0, "repeat_count": 1.0, "routers_loss": 0.16188351809978485, "skip_count": 2.0, "step": 732, "text_loss": 0.23231445252895355 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.446140299383622, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2001953125, "learning_rate": 0.0009987003354409965, "loss": 0.0626, "macro_f1": 0.3272727429866791, "num_tokens": 1185451.0, "repeat_count": 0.0, "routers_loss": 0.02391529455780983, "skip_count": 0.0, "step": 734, "text_loss": 0.4496627151966095 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.4555327267390665, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.234375, "learning_rate": 0.0009986779377240405, "loss": 0.0513, "macro_f1": 0.32098767161369324, "num_tokens": 1188666.0, "repeat_count": 0.0, "routers_loss": 0.08435963839292526, "skip_count": 1.0, "step": 736, "text_loss": 0.4950787127017975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.4649251540945114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1220703125, "learning_rate": 0.000998655348912758, "loss": 0.0515, "macro_f1": 0.3333333432674408, "num_tokens": 1193035.0, "repeat_count": 0.0, "routers_loss": 0.01648722216486931, "skip_count": 0.0, "step": 738, "text_loss": 0.24761848151683807 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1513671875, "learning_rate": 0.0009986325690158051, "loss": 0.0435, "macro_f1": 0.3333333432674408, "num_tokens": 1196840.0, "repeat_count": 0.0, "routers_loss": 0.013143910095095634, "skip_count": 0.0, "step": 740, "text_loss": 0.15662719309329987 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.4837100088054007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009986095980419113, "loss": 0.0757, "macro_f1": 0.3333333432674408, "num_tokens": 1200573.0, "repeat_count": 0.0, "routers_loss": 0.026706280186772346, "skip_count": 0.0, "step": 742, "text_loss": 0.16725164651870728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.493102436160845, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1982421875, "learning_rate": 0.0009985864359998787, "loss": 0.0795, "macro_f1": 0.3006536364555359, "num_tokens": 1203589.0, "repeat_count": 2.0, "routers_loss": 0.28607678413391113, "skip_count": 3.0, "step": 744, "text_loss": 0.6350882053375244 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.50249486351629, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009985630828985835, "loss": 0.0572, "macro_f1": 0.3272727429866791, "num_tokens": 1206422.0, "repeat_count": 0.0, "routers_loss": 0.05685260891914368, "skip_count": 1.0, "step": 746, "text_loss": 0.33779552578926086 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.5118872908717345, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.09814453125, "learning_rate": 0.0009985395387469742, "loss": 0.0458, "macro_f1": 0.5492662787437439, "num_tokens": 1211588.0, "repeat_count": 0.0, "routers_loss": 0.0437830351293087, "skip_count": 2.0, "step": 748, "text_loss": 0.28664472699165344 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5212797182271793, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.0009985158035540735, "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 1214580.0, "repeat_count": 2.0, "routers_loss": 0.07074898481369019, "skip_count": 0.0, "step": 750, "text_loss": 0.3939313292503357 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21484375, "learning_rate": 0.0009984918773289762, "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1217388.0, "repeat_count": 0.0, "routers_loss": 0.009757856838405132, "skip_count": 0.0, "step": 752, "text_loss": 0.37641215324401855 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5400645729380686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.0009984677600808512, "loss": 0.054, "macro_f1": 0.3333333432674408, "num_tokens": 1219960.0, "repeat_count": 0.0, "routers_loss": 0.02515069581568241, "skip_count": 0.0, "step": 754, "text_loss": 0.155938982963562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5494570002935135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30078125, "learning_rate": 0.0009984434518189405, "loss": 0.0764, "macro_f1": 0.3333333432674408, "num_tokens": 1223234.0, "repeat_count": 0.0, "routers_loss": 0.025766927748918533, "skip_count": 0.0, "step": 756, "text_loss": 0.691118061542511 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.558849427648958, "f1_execute": 0.9411765336990356, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009984189525525584, "loss": 0.0451, "macro_f1": 0.5359477400779724, "num_tokens": 1225764.0, "repeat_count": 2.0, "routers_loss": 0.1782722771167755, "skip_count": 2.0, "step": 758, "text_loss": 0.3592209219932556 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.568241855004403, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.0009983942622910935, "loss": 0.0659, "macro_f1": 0.3333333432674408, "num_tokens": 1230097.0, "repeat_count": 0.0, "routers_loss": 0.00825568474829197, "skip_count": 0.0, "step": 760, "text_loss": 0.4646475315093994 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5776342823598473, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009983693810440074, "loss": 0.0477, "macro_f1": 0.32098764181137085, "num_tokens": 1233140.0, "repeat_count": 0.0, "routers_loss": 0.04156976938247681, "skip_count": 2.0, "step": 762, "text_loss": 0.298682302236557 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.587026709715292, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3515625, "learning_rate": 0.000998344308820834, "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 1236305.0, "repeat_count": 0.0, "routers_loss": 0.05697929114103317, "skip_count": 1.0, "step": 764, "text_loss": 0.5249121189117432 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5964191370707366, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.18359375, "learning_rate": 0.0009983190456311817, "loss": 0.0592, "macro_f1": 0.3144654333591461, "num_tokens": 1239673.0, "repeat_count": 0.0, "routers_loss": 0.09547408670186996, "skip_count": 3.0, "step": 766, "text_loss": 0.41277334094047546 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.6058115644261814, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.185546875, "learning_rate": 0.000998293591484731, "loss": 0.0484, "macro_f1": 0.5492662787437439, "num_tokens": 1242292.0, "repeat_count": 0.0, "routers_loss": 0.030693158507347107, "skip_count": 2.0, "step": 768, "text_loss": 0.1583656519651413 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.615203991781626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.000998267946391236, "loss": 0.051, "macro_f1": 0.3333333432674408, "num_tokens": 1244661.0, "repeat_count": 0.0, "routers_loss": 0.01211300864815712, "skip_count": 0.0, "step": 770, "text_loss": 0.4629349112510681 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6245964191370708, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.0009982421103605238, "loss": 0.0441, "macro_f1": 0.32098764181137085, "num_tokens": 1248688.0, "repeat_count": 0.0, "routers_loss": 0.0665968507528305, "skip_count": 2.0, "step": 772, "text_loss": 0.4019293785095215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6339888464925156, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.000998216083402495, "loss": 0.0613, "macro_f1": 0.32098764181137085, "num_tokens": 1251395.0, "repeat_count": 0.0, "routers_loss": 0.07186859846115112, "skip_count": 2.0, "step": 774, "text_loss": 0.4659276604652405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.302734375, "learning_rate": 0.0009981898655271235, "loss": 0.0488, "macro_f1": 0.3333333432674408, "num_tokens": 1254888.0, "repeat_count": 0.0, "routers_loss": 0.007823926396667957, "skip_count": 0.0, "step": 776, "text_loss": 0.5160359740257263 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 3.6527737012034045, "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.11962890625, "learning_rate": 0.0009981634567444557, "loss": 0.0775, "macro_f1": 0.590062141418457, "num_tokens": 1258250.0, "repeat_count": 3.0, "routers_loss": 0.24624499678611755, "skip_count": 4.0, "step": 778, "text_loss": 0.29319918155670166 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6621661285588494, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.0009981368570646115, "loss": 0.0885, "macro_f1": 0.3272727429866791, "num_tokens": 1260916.0, "repeat_count": 0.0, "routers_loss": 0.030730176717042923, "skip_count": 1.0, "step": 780, "text_loss": 0.624981164932251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6715585559142943, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009981100664977838, "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1264004.0, "repeat_count": 0.0, "routers_loss": 0.006829176563769579, "skip_count": 0.0, "step": 782, "text_loss": 0.6137266159057617 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6809509832697387, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009980830850542391, "loss": 0.058, "macro_f1": 0.3333333432674408, "num_tokens": 1267130.0, "repeat_count": 0.0, "routers_loss": 0.018471000716090202, "skip_count": 0.0, "step": 784, "text_loss": 0.15213175117969513 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6903434106251836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.0009980559127443166, "loss": 0.052, "macro_f1": 0.3333333432674408, "num_tokens": 1271129.0, "repeat_count": 0.0, "routers_loss": 0.007903140969574451, "skip_count": 0.0, "step": 786, "text_loss": 0.5768613219261169 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.699735837980628, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.000998028549578429, "loss": 0.0719, "macro_f1": 0.307692289352417, "num_tokens": 1274232.0, "repeat_count": 0.0, "routers_loss": 0.06737866252660751, "skip_count": 3.0, "step": 788, "text_loss": 0.2877073585987091 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.709128265336073, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009980009955670615, "loss": 0.0698, "macro_f1": 0.3144654333591461, "num_tokens": 1277193.0, "repeat_count": 0.0, "routers_loss": 0.10194934904575348, "skip_count": 3.0, "step": 790, "text_loss": 0.11860492825508118 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.7185206926915173, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.000997973250720773, "loss": 0.0552, "macro_f1": 0.32098764181137085, "num_tokens": 1280960.0, "repeat_count": 0.0, "routers_loss": 0.10297708213329315, "skip_count": 2.0, "step": 792, "text_loss": 0.13477706909179688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.727913120046962, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009979453150501954, "loss": 0.0663, "macro_f1": 0.32098764181137085, "num_tokens": 1284611.0, "repeat_count": 1.0, "routers_loss": 0.06122037023305893, "skip_count": 1.0, "step": 794, "text_loss": 0.40569379925727844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.737305547402407, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.000997917188566034, "loss": 0.062, "macro_f1": 0.32098764181137085, "num_tokens": 1287834.0, "repeat_count": 0.0, "routers_loss": 0.061135001480579376, "skip_count": 2.0, "step": 796, "text_loss": 0.2829287648200989 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.7466979747578515, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.109375, "learning_rate": 0.0009978888712790664, "loss": 0.0654, "macro_f1": 0.3272727429866791, "num_tokens": 1291666.0, "repeat_count": 0.0, "routers_loss": 0.04841872677206993, "skip_count": 1.0, "step": 798, "text_loss": 1.011757254600525 }, { "acc_repeat": 0.0, "acc_skip": 0.20000000298023224, "avg_layers": 27.0, "epoch": 3.756090402113296, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.3333333134651184, "grad_norm": 0.14453125, "learning_rate": 0.0009978603632001444, "loss": 0.0636, "macro_f1": 0.4104308485984802, "num_tokens": 1294627.0, "repeat_count": 1.0, "routers_loss": 0.15698759257793427, "skip_count": 5.0, "step": 800, "text_loss": 0.4457623362541199 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.765482829468741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0009978316643401916, "loss": 0.0688, "macro_f1": 0.3333333432674408, "num_tokens": 1297711.0, "repeat_count": 0.0, "routers_loss": 0.018952010199427605, "skip_count": 0.0, "step": 802, "text_loss": 0.2069481462240219 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.7748752568241857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.0009978027747102062, "loss": 0.0479, "macro_f1": 0.3333333432674408, "num_tokens": 1300569.0, "repeat_count": 0.0, "routers_loss": 0.014538386836647987, "skip_count": 0.0, "step": 804, "text_loss": 0.4983852505683899 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.78426768417963, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2109375, "learning_rate": 0.0009977736943212584, "loss": 0.0721, "macro_f1": 0.32098764181137085, "num_tokens": 1303969.0, "repeat_count": 0.0, "routers_loss": 0.11164087057113647, "skip_count": 2.0, "step": 806, "text_loss": 0.2910642921924591 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.793660111535075, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.000997744423184492, "loss": 0.0424, "macro_f1": 0.3272727429866791, "num_tokens": 1307263.0, "repeat_count": 0.0, "routers_loss": 0.06073406711220741, "skip_count": 1.0, "step": 808, "text_loss": 0.18831779062747955 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 3.8030525388905194, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.26171875, "learning_rate": 0.0009977149613111236, "loss": 0.0486, "macro_f1": 0.4400000274181366, "num_tokens": 1309953.0, "repeat_count": 1.0, "routers_loss": 0.11035524308681488, "skip_count": 4.0, "step": 810, "text_loss": 0.7872759699821472 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8124449662459643, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.0009976853087124433, "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1313243.0, "repeat_count": 0.0, "routers_loss": 0.021804286167025566, "skip_count": 0.0, "step": 812, "text_loss": 0.22349292039871216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.8218373936014087, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28125, "learning_rate": 0.0009976554653998138, "loss": 0.0612, "macro_f1": 0.31446540355682373, "num_tokens": 1316165.0, "repeat_count": 0.0, "routers_loss": 0.10715524107217789, "skip_count": 2.0, "step": 814, "text_loss": 0.18035532534122467 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8312298209568536, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.000997625431384671, "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1319206.0, "repeat_count": 0.0, "routers_loss": 0.007173649035394192, "skip_count": 0.0, "step": 816, "text_loss": 0.48928648233413696 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8406222483122985, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.0009975952066785243, "loss": 0.0655, "macro_f1": 0.3006536364555359, "num_tokens": 1322549.0, "repeat_count": 1.0, "routers_loss": 0.22308112680912018, "skip_count": 4.0, "step": 818, "text_loss": 0.5211259722709656 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.850014675667743, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.0009975647912929557, "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1325213.0, "repeat_count": 0.0, "routers_loss": 0.00998698640614748, "skip_count": 0.0, "step": 820, "text_loss": 0.7117052674293518 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8594071030231873, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0009975341852396205, "loss": 0.0723, "macro_f1": 0.32098764181137085, "num_tokens": 1328383.0, "repeat_count": 0.0, "routers_loss": 0.07454588264226913, "skip_count": 2.0, "step": 822, "text_loss": 0.34539610147476196 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8687995303786322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009975033885302469, "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 1331406.0, "repeat_count": 0.0, "routers_loss": 0.009157589636743069, "skip_count": 0.0, "step": 824, "text_loss": 0.7484824657440186 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.878191957734077, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.0009974724011766363, "loss": 0.0474, "macro_f1": 0.3272727429866791, "num_tokens": 1334410.0, "repeat_count": 1.0, "routers_loss": 0.17149391770362854, "skip_count": 0.0, "step": 826, "text_loss": 0.5913820266723633 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8875843850895215, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0009974412231906632, "loss": 0.058, "macro_f1": 0.32098764181137085, "num_tokens": 1337653.0, "repeat_count": 1.0, "routers_loss": 0.09743282198905945, "skip_count": 1.0, "step": 828, "text_loss": 0.2505693733692169 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8969768124449664, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0009974098545842748, "loss": 0.0638, "macro_f1": 0.3272727429866791, "num_tokens": 1340860.0, "repeat_count": 0.0, "routers_loss": 0.041490405797958374, "skip_count": 1.0, "step": 830, "text_loss": 0.5585370063781738 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.906369239800411, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.193359375, "learning_rate": 0.0009973782953694918, "loss": 0.0746, "macro_f1": 0.3006536066532135, "num_tokens": 1344232.0, "repeat_count": 1.0, "routers_loss": 0.16080693900585175, "skip_count": 3.0, "step": 832, "text_loss": 0.4782734513282776 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9157616671558557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.000997346545558408, "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1347667.0, "repeat_count": 0.0, "routers_loss": 0.01173500344157219, "skip_count": 0.0, "step": 834, "text_loss": 0.25036177039146423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.0009973146051631895, "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1350707.0, "repeat_count": 0.0, "routers_loss": 0.011477196589112282, "skip_count": 0.0, "step": 836, "text_loss": 0.5482863187789917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.0009972824741960764, "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1353704.0, "repeat_count": 0.0, "routers_loss": 0.010528896935284138, "skip_count": 0.0, "step": 838, "text_loss": 0.6732596158981323 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9439389492221895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1181640625, "learning_rate": 0.000997250152669381, "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1356608.0, "repeat_count": 0.0, "routers_loss": 0.010678744874894619, "skip_count": 0.0, "step": 840, "text_loss": 0.5479338765144348 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9533313765776343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.181640625, "learning_rate": 0.000997217640595489, "loss": 0.0631, "macro_f1": 0.3333333432674408, "num_tokens": 1359809.0, "repeat_count": 0.0, "routers_loss": 0.00835978239774704, "skip_count": 0.0, "step": 842, "text_loss": 0.42543259263038635 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9627238039330788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.0009971849379868593, "loss": 0.0653, "macro_f1": 0.3333333432674408, "num_tokens": 1362201.0, "repeat_count": 0.0, "routers_loss": 0.009930923581123352, "skip_count": 0.0, "step": 844, "text_loss": 0.720462441444397 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.9721162312885236, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.0009971520448560235, "loss": 0.0615, "macro_f1": 0.3272727429866791, "num_tokens": 1365790.0, "repeat_count": 0.0, "routers_loss": 0.06344373524188995, "skip_count": 1.0, "step": 846, "text_loss": 0.8423607349395752 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 3.9815086586439685, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.16796875, "learning_rate": 0.000997118961215586, "loss": 0.0674, "macro_f1": 0.4533333480358124, "num_tokens": 1368387.0, "repeat_count": 1.0, "routers_loss": 0.14688406884670258, "skip_count": 3.0, "step": 848, "text_loss": 0.3933577537536621 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000997085687078225, "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1371189.0, "repeat_count": 0.0, "routers_loss": 0.009953443892300129, "skip_count": 0.0, "step": 850, "text_loss": 0.41469162702560425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.0, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.0009970522224566909, "loss": 0.0555, "macro_f1": 0.32098767161369324, "num_tokens": 1374008.0, "repeat_count": 0.0, "routers_loss": 0.048870690166950226, "skip_count": 1.0, "step": 852, "text_loss": 0.613615870475769 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.009392427355444, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0009970185673638075, "loss": 0.0629, "macro_f1": 0.32098764181137085, "num_tokens": 1376662.0, "repeat_count": 1.0, "routers_loss": 0.06865929812192917, "skip_count": 1.0, "step": 854, "text_loss": 0.4392736256122589 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.01878485471089, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.162109375, "learning_rate": 0.0009969847218124716, "loss": 0.0506, "macro_f1": 0.5492662787437439, "num_tokens": 1380049.0, "repeat_count": 0.0, "routers_loss": 0.02382219396531582, "skip_count": 1.0, "step": 856, "text_loss": 0.19115346670150757 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.028177282066334, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0009969506858156527, "loss": 0.0344, "macro_f1": 0.3272727429866791, "num_tokens": 1383008.0, "repeat_count": 0.0, "routers_loss": 0.03907281160354614, "skip_count": 1.0, "step": 858, "text_loss": 0.34842637181282043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.037569709421779, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12060546875, "learning_rate": 0.0009969164593863935, "loss": 0.0365, "macro_f1": 0.3333333432674408, "num_tokens": 1387051.0, "repeat_count": 0.0, "routers_loss": 0.007645803038030863, "skip_count": 0.0, "step": 860, "text_loss": 0.3810436725616455 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.046962136777223, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0009968820425378098, "loss": 0.0463, "macro_f1": 0.3272727429866791, "num_tokens": 1390244.0, "repeat_count": 1.0, "routers_loss": 0.04435238987207413, "skip_count": 0.0, "step": 862, "text_loss": 0.34853485226631165 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.056354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28515625, "learning_rate": 0.00099684743528309, "loss": 0.0424, "macro_f1": 0.3333333432674408, "num_tokens": 1392976.0, "repeat_count": 0.0, "routers_loss": 0.006071661598980427, "skip_count": 0.0, "step": 864, "text_loss": 0.6395178437232971 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.065746991488113, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0810546875, "learning_rate": 0.0009968126376354958, "loss": 0.0477, "macro_f1": 0.5492662787437439, "num_tokens": 1396061.0, "repeat_count": 0.0, "routers_loss": 0.05011235550045967, "skip_count": 2.0, "step": 866, "text_loss": 0.09103966504335403 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.075139418843557, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009967776496083616, "loss": 0.0509, "macro_f1": 0.3272727429866791, "num_tokens": 1398993.0, "repeat_count": 1.0, "routers_loss": 0.03979124873876572, "skip_count": 0.0, "step": 868, "text_loss": 0.27257058024406433 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.084531846199002, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.14453125, "learning_rate": 0.000996742471215095, "loss": 0.0516, "macro_f1": 0.5492662787437439, "num_tokens": 1402080.0, "repeat_count": 0.0, "routers_loss": 0.030823837965726852, "skip_count": 2.0, "step": 870, "text_loss": 0.7047103047370911 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.093924273554447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009967071024691763, "loss": 0.0461, "macro_f1": 0.3333333432674408, "num_tokens": 1404890.0, "repeat_count": 0.0, "routers_loss": 0.009721715934574604, "skip_count": 0.0, "step": 872, "text_loss": 0.959106981754303 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1142578125, "learning_rate": 0.000996671543384159, "loss": 0.05, "macro_f1": 0.3333333432674408, "num_tokens": 1407853.0, "repeat_count": 0.0, "routers_loss": 0.006025883834809065, "skip_count": 0.0, "step": 874, "text_loss": 0.47571972012519836 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.112709128265336, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09765625, "learning_rate": 0.0009966357939736692, "loss": 0.0416, "macro_f1": 0.3272727429866791, "num_tokens": 1410723.0, "repeat_count": 0.0, "routers_loss": 0.025964925065636635, "skip_count": 0.0, "step": 876, "text_loss": 0.4964611530303955 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.122101555620781, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.0009965998542514065, "loss": 0.0415, "macro_f1": 0.32098764181137085, "num_tokens": 1414008.0, "repeat_count": 0.0, "routers_loss": 0.09509637206792831, "skip_count": 2.0, "step": 878, "text_loss": 0.621494710445404 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 4.131493982976226, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11083984375, "learning_rate": 0.0009965637242311427, "loss": 0.0472, "macro_f1": 0.542222261428833, "num_tokens": 1417447.0, "repeat_count": 0.0, "routers_loss": 0.02520318515598774, "skip_count": 4.0, "step": 880, "text_loss": 0.40209758281707764 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 4.14088641033167, "f1_execute": 0.936170220375061, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.263671875, "learning_rate": 0.000996527403926723, "loss": 0.0495, "macro_f1": 0.5342789888381958, "num_tokens": 1419905.0, "repeat_count": 0.0, "routers_loss": 0.13183781504631042, "skip_count": 6.0, "step": 882, "text_loss": 0.642185389995575 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.1502788376871145, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1201171875, "learning_rate": 0.0009964908933520655, "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 1423436.0, "repeat_count": 0.0, "routers_loss": 0.009429510682821274, "skip_count": 0.0, "step": 884, "text_loss": 0.48232755064964294 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.15967126504256, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1669921875, "learning_rate": 0.0009964541925211613, "loss": 0.0349, "macro_f1": 0.32098764181137085, "num_tokens": 1426842.0, "repeat_count": 0.0, "routers_loss": 0.07629609107971191, "skip_count": 2.0, "step": 886, "text_loss": 0.16620934009552002 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.169063692398004, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0927734375, "learning_rate": 0.0009964173014480738, "loss": 0.0348, "macro_f1": 0.5492662787437439, "num_tokens": 1430430.0, "repeat_count": 0.0, "routers_loss": 0.036814019083976746, "skip_count": 2.0, "step": 888, "text_loss": 0.4866008758544922 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.178456119753449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.0009963802201469398, "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1433821.0, "repeat_count": 0.0, "routers_loss": 0.0041250260546803474, "skip_count": 0.0, "step": 890, "text_loss": 0.578216552734375 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.187848547108893, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2373046875, "learning_rate": 0.0009963429486319693, "loss": 0.0463, "macro_f1": 0.32098764181137085, "num_tokens": 1436976.0, "repeat_count": 0.0, "routers_loss": 0.06213559955358505, "skip_count": 2.0, "step": 892, "text_loss": 0.221701517701149 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 4.197240974464338, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.361328125, "learning_rate": 0.0009963054869174446, "loss": 0.0313, "macro_f1": 0.4871794879436493, "num_tokens": 1440397.0, "repeat_count": 0.0, "routers_loss": 0.07532428950071335, "skip_count": 2.0, "step": 894, "text_loss": 0.6922838091850281 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.206633401819783, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.0009962678350177209, "loss": 0.0472, "macro_f1": 0.3272727429866791, "num_tokens": 1443604.0, "repeat_count": 0.0, "routers_loss": 0.0419243648648262, "skip_count": 1.0, "step": 896, "text_loss": 0.22092342376708984 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.216025829175227, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0009962299929472268, "loss": 0.034, "macro_f1": 0.32098764181137085, "num_tokens": 1446257.0, "repeat_count": 2.0, "routers_loss": 0.10849297791719437, "skip_count": 0.0, "step": 898, "text_loss": 0.26394811272621155 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.225418256530672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.000996191960720463, "loss": 0.0394, "macro_f1": 0.3333333432674408, "num_tokens": 1449669.0, "repeat_count": 0.0, "routers_loss": 0.0092767970636487, "skip_count": 0.0, "step": 900, "text_loss": 0.5338577628135681 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.234810683886117, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009961537383520042, "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1452450.0, "repeat_count": 1.0, "routers_loss": 0.02985367365181446, "skip_count": 0.0, "step": 902, "text_loss": 0.5875228047370911 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.2442031112415615, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.0009961153258564966, "loss": 0.0378, "macro_f1": 0.3144654333591461, "num_tokens": 1456909.0, "repeat_count": 0.0, "routers_loss": 0.06794842332601547, "skip_count": 3.0, "step": 904, "text_loss": 0.40959444642066956 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.253595538597006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009960767232486604, "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1461712.0, "repeat_count": 0.0, "routers_loss": 0.0023562447167932987, "skip_count": 0.0, "step": 906, "text_loss": 0.3932875096797943 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.262987965952451, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.000996037930543288, "loss": 0.0505, "macro_f1": 0.3272727429866791, "num_tokens": 1464817.0, "repeat_count": 0.0, "routers_loss": 0.03880339860916138, "skip_count": 1.0, "step": 908, "text_loss": 0.17482402920722961 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.272380393307896, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.000995998947755245, "loss": 0.0479, "macro_f1": 0.3272727429866791, "num_tokens": 1467810.0, "repeat_count": 0.0, "routers_loss": 0.01736828312277794, "skip_count": 1.0, "step": 910, "text_loss": 0.4140470325946808 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009959597748994695, "loss": 0.0752, "macro_f1": 0.3333333432674408, "num_tokens": 1470802.0, "repeat_count": 0.0, "routers_loss": 0.011824851855635643, "skip_count": 0.0, "step": 912, "text_loss": 0.7153383493423462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.2911652480187845, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0009959204119909726, "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1474539.0, "repeat_count": 0.0, "routers_loss": 0.025456594303250313, "skip_count": 0.0, "step": 914, "text_loss": 0.42812058329582214 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009958808590448385, "loss": 0.0489, "macro_f1": 0.3333333432674408, "num_tokens": 1477552.0, "repeat_count": 0.0, "routers_loss": 0.006795851048082113, "skip_count": 0.0, "step": 916, "text_loss": 0.5402814149856567 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.309950102729674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0009958411160762234, "loss": 0.039, "macro_f1": 0.3333333432674408, "num_tokens": 1482547.0, "repeat_count": 0.0, "routers_loss": 0.015615932643413544, "skip_count": 0.0, "step": 918, "text_loss": 0.3836168050765991 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.319342530085119, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08984375, "learning_rate": 0.0009958011831003577, "loss": 0.0448, "macro_f1": 0.3272727429866791, "num_tokens": 1485807.0, "repeat_count": 0.0, "routers_loss": 0.043541423976421356, "skip_count": 1.0, "step": 920, "text_loss": 0.4333936274051666 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.328734957440563, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.000995761060132543, "loss": 0.0418, "macro_f1": 0.6538461446762085, "num_tokens": 1488941.0, "repeat_count": 1.0, "routers_loss": 0.05866432189941406, "skip_count": 2.0, "step": 922, "text_loss": 0.4106994867324829 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.3381273847960085, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1630859375, "learning_rate": 0.0009957207471881552, "loss": 0.0531, "macro_f1": 0.5492662787437439, "num_tokens": 1492026.0, "repeat_count": 0.0, "routers_loss": 0.02714901603758335, "skip_count": 2.0, "step": 924, "text_loss": 0.542091429233551 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.347519812151453, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.0009956802442826415, "loss": 0.0386, "macro_f1": 0.3272727429866791, "num_tokens": 1494543.0, "repeat_count": 1.0, "routers_loss": 0.0563737191259861, "skip_count": 0.0, "step": 926, "text_loss": 0.47209203243255615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.356912239506897, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0009956395514315235, "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1497831.0, "repeat_count": 1.0, "routers_loss": 0.03285066783428192, "skip_count": 0.0, "step": 928, "text_loss": 0.6628931164741516 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.366304666862343, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009955986686503943, "loss": 0.0466, "macro_f1": 0.3272727429866791, "num_tokens": 1501375.0, "repeat_count": 0.0, "routers_loss": 0.024297121912240982, "skip_count": 1.0, "step": 930, "text_loss": 0.495676189661026 }, { "acc_repeat": 1.0, "acc_skip": 0.25, "avg_layers": 28.0, "epoch": 4.375697094217787, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.11181640625, "learning_rate": 0.0009955575959549202, "loss": 0.0424, "macro_f1": 0.7795917987823486, "num_tokens": 1504363.0, "repeat_count": 1.0, "routers_loss": 0.12196464836597443, "skip_count": 4.0, "step": 932, "text_loss": 0.26123273372650146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.3850895215732315, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.0009955163333608408, "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1507178.0, "repeat_count": 0.0, "routers_loss": 0.012947078794240952, "skip_count": 0.0, "step": 934, "text_loss": 0.32552677392959595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.394481948928676, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009954748808839674, "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 1509910.0, "repeat_count": 0.0, "routers_loss": 0.008946365676820278, "skip_count": 0.0, "step": 936, "text_loss": 0.533141016960144 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.403874376284121, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.000995433238540185, "loss": 0.0466, "macro_f1": 0.6538461446762085, "num_tokens": 1512826.0, "repeat_count": 1.0, "routers_loss": 0.029975678771734238, "skip_count": 1.0, "step": 938, "text_loss": 0.2953577935695648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.413266803639566, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.0009953914063454512, "loss": 0.0497, "macro_f1": 0.3144654333591461, "num_tokens": 1517230.0, "repeat_count": 1.0, "routers_loss": 0.0889134630560875, "skip_count": 2.0, "step": 940, "text_loss": 0.5368834733963013 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.193359375, "learning_rate": 0.000995349384315796, "loss": 0.0413, "macro_f1": 0.3333333432674408, "num_tokens": 1519876.0, "repeat_count": 0.0, "routers_loss": 0.013458753935992718, "skip_count": 0.0, "step": 942, "text_loss": 0.2005518227815628 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.432051658350455, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.000995307172467322, "loss": 0.0444, "macro_f1": 0.31446540355682373, "num_tokens": 1522998.0, "repeat_count": 1.0, "routers_loss": 0.08850377053022385, "skip_count": 1.0, "step": 944, "text_loss": 0.227926567196846 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.4414440857059, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009952647708162054, "loss": 0.0503, "macro_f1": 0.3272727429866791, "num_tokens": 1527100.0, "repeat_count": 0.0, "routers_loss": 0.03199794515967369, "skip_count": 1.0, "step": 946, "text_loss": 0.4859686493873596 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.450836513061344, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009952221793786942, "loss": 0.0354, "macro_f1": 0.3333333432674408, "num_tokens": 1530028.0, "repeat_count": 0.0, "routers_loss": 0.006507779937237501, "skip_count": 0.0, "step": 948, "text_loss": 0.6855354905128479 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.460228940416789, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.0009951793981711097, "loss": 0.0584, "macro_f1": 0.6538461446762085, "num_tokens": 1533254.0, "repeat_count": 1.0, "routers_loss": 0.06175103038549423, "skip_count": 1.0, "step": 950, "text_loss": 0.7590400576591492 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.469621367772234, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1025390625, "learning_rate": 0.0009951364272098458, "loss": 0.0295, "macro_f1": 0.5492662787437439, "num_tokens": 1536239.0, "repeat_count": 0.0, "routers_loss": 0.03773383051156998, "skip_count": 2.0, "step": 952, "text_loss": 0.669784665107727 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.4790137951276785, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009950932665113688, "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1539682.0, "repeat_count": 0.0, "routers_loss": 0.07280613481998444, "skip_count": 2.0, "step": 954, "text_loss": 0.3365570902824402 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.0009950499160922184, "loss": 0.0541, "macro_f1": 0.3333333432674408, "num_tokens": 1542875.0, "repeat_count": 0.0, "routers_loss": 0.01770266517996788, "skip_count": 0.0, "step": 956, "text_loss": 0.0921545997262001 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.497798649838567, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09375, "learning_rate": 0.000995006375969006, "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1547135.0, "repeat_count": 1.0, "routers_loss": 0.07672002166509628, "skip_count": 0.0, "step": 958, "text_loss": 0.5887606739997864 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.507191077194013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009949626461584165, "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 1550100.0, "repeat_count": 0.0, "routers_loss": 0.006247182376682758, "skip_count": 0.0, "step": 960, "text_loss": 0.5777931213378906 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.516583504549457, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.119140625, "learning_rate": 0.0009949187266772076, "loss": 0.0366, "macro_f1": 0.5492662787437439, "num_tokens": 1553192.0, "repeat_count": 0.0, "routers_loss": 0.030319908633828163, "skip_count": 2.0, "step": 962, "text_loss": 0.2370252162218094 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.5259759319049016, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009948746175422088, "loss": 0.0511, "macro_f1": 0.3333333432674408, "num_tokens": 1556318.0, "repeat_count": 0.0, "routers_loss": 0.006004320923238993, "skip_count": 0.0, "step": 964, "text_loss": 0.6271032094955444 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.000994830318770323, "loss": 0.0514, "macro_f1": 0.3333333432674408, "num_tokens": 1559195.0, "repeat_count": 0.0, "routers_loss": 0.011544366367161274, "skip_count": 0.0, "step": 966, "text_loss": 0.47256720066070557 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.544760786615791, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.171875, "learning_rate": 0.0009947858303785255, "loss": 0.0374, "macro_f1": 0.6603773832321167, "num_tokens": 1561813.0, "repeat_count": 1.0, "routers_loss": 0.05258861929178238, "skip_count": 1.0, "step": 968, "text_loss": 0.7703132629394531 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.554153213971236, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1142578125, "learning_rate": 0.0009947411523838648, "loss": 0.0453, "macro_f1": 0.3333333432674408, "num_tokens": 1564634.0, "repeat_count": 0.0, "routers_loss": 0.011216280050575733, "skip_count": 0.0, "step": 970, "text_loss": 0.4666804075241089 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0009946962848034608, "loss": 0.0696, "macro_f1": 0.3333333432674408, "num_tokens": 1567959.0, "repeat_count": 0.0, "routers_loss": 0.009387624450027943, "skip_count": 0.0, "step": 972, "text_loss": 0.4067264199256897 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.5729380686821255, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.203125, "learning_rate": 0.0009946512276545075, "loss": 0.0397, "macro_f1": 0.3272727429866791, "num_tokens": 1571221.0, "repeat_count": 1.0, "routers_loss": 0.041713520884513855, "skip_count": 0.0, "step": 974, "text_loss": 0.5242366194725037 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 4.58233049603757, "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 0.228515625, "learning_rate": 0.0009946059809542705, "loss": 0.0487, "macro_f1": 0.7644445300102234, "num_tokens": 1575033.0, "repeat_count": 2.0, "routers_loss": 0.05748331546783447, "skip_count": 2.0, "step": 976, "text_loss": 0.5704690217971802 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.591722923393014, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0009945605447200887, "loss": 0.0445, "macro_f1": 0.3272727429866791, "num_tokens": 1579050.0, "repeat_count": 0.0, "routers_loss": 0.016765203326940536, "skip_count": 0.0, "step": 978, "text_loss": 0.4804173707962036 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.601115350748459, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1337890625, "learning_rate": 0.0009945149189693732, "loss": 0.0406, "macro_f1": 0.5492662787437439, "num_tokens": 1582967.0, "repeat_count": 0.0, "routers_loss": 0.021518222987651825, "skip_count": 2.0, "step": 980, "text_loss": 0.4138598144054413 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.610507778103904, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.0009944691037196078, "loss": 0.0456, "macro_f1": 0.3333333432674408, "num_tokens": 1586282.0, "repeat_count": 0.0, "routers_loss": 0.012246460653841496, "skip_count": 0.0, "step": 982, "text_loss": 0.22561736404895782 }, { "acc_repeat": 0.5, "acc_skip": 0.800000011920929, "avg_layers": 24.0, "epoch": 4.6199002054593485, "f1_execute": 0.930232584476471, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8000000715255737, "grad_norm": 0.1455078125, "learning_rate": 0.0009944230989883491, "loss": 0.0456, "macro_f1": 0.7989664077758789, "num_tokens": 1589279.0, "repeat_count": 2.0, "routers_loss": 0.09344895929098129, "skip_count": 5.0, "step": 984, "text_loss": 0.4416656494140625 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.629292632814793, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.111328125, "learning_rate": 0.0009943769047932264, "loss": 0.0404, "macro_f1": 0.5359477400779724, "num_tokens": 1592398.0, "repeat_count": 2.0, "routers_loss": 0.08916857838630676, "skip_count": 2.0, "step": 986, "text_loss": 0.5536438822746277 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.638685060170237, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.000994330521151941, "loss": 0.039, "macro_f1": 0.32098764181137085, "num_tokens": 1596213.0, "repeat_count": 1.0, "routers_loss": 0.06114347651600838, "skip_count": 1.0, "step": 988, "text_loss": 0.5835405588150024 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.000994283948082267, "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1598827.0, "repeat_count": 0.0, "routers_loss": 0.0017335431184619665, "skip_count": 0.0, "step": 990, "text_loss": 0.5857380032539368 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.657469914881127, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10693359375, "learning_rate": 0.0009942371856020522, "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1602915.0, "repeat_count": 0.0, "routers_loss": 0.014606470242142677, "skip_count": 0.0, "step": 992, "text_loss": 0.6939892768859863 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 4.666862342236572, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.0009941902337292155, "loss": 0.06, "macro_f1": 0.6598639488220215, "num_tokens": 1605776.0, "repeat_count": 3.0, "routers_loss": 0.06297315657138824, "skip_count": 1.0, "step": 994, "text_loss": 0.37616831064224243 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.676254769592017, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1083984375, "learning_rate": 0.0009941430924817487, "loss": 0.0572, "macro_f1": 0.5492662787437439, "num_tokens": 1609856.0, "repeat_count": 0.0, "routers_loss": 0.03297794610261917, "skip_count": 2.0, "step": 996, "text_loss": 0.2098303586244583 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.685647196947461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.000994095761877717, "loss": 0.0499, "macro_f1": 0.3333333432674408, "num_tokens": 1612904.0, "repeat_count": 0.0, "routers_loss": 0.012901155278086662, "skip_count": 0.0, "step": 998, "text_loss": 0.20103533565998077 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.695039624302906, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.259765625, "learning_rate": 0.000994048241935257, "loss": 0.0535, "macro_f1": 0.3272727429866791, "num_tokens": 1615540.0, "repeat_count": 0.0, "routers_loss": 0.020434845238924026, "skip_count": 0.0, "step": 1000, "text_loss": 0.32709044218063354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.70443205165835, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1669921875, "learning_rate": 0.0009940005326725789, "loss": 0.0453, "macro_f1": 0.32098764181137085, "num_tokens": 1618786.0, "repeat_count": 0.0, "routers_loss": 0.07831378281116486, "skip_count": 2.0, "step": 1002, "text_loss": 0.5789632797241211 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.713824479013795, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21875, "learning_rate": 0.0009939526341079647, "loss": 0.0511, "macro_f1": 0.32098764181137085, "num_tokens": 1621736.0, "repeat_count": 2.0, "routers_loss": 0.04863874986767769, "skip_count": 0.0, "step": 1004, "text_loss": 0.6128849387168884 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009939045462597693, "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1624649.0, "repeat_count": 0.0, "routers_loss": 0.00677989237010479, "skip_count": 0.0, "step": 1006, "text_loss": 0.6168264150619507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.732609333724684, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009938562691464202, "loss": 0.0524, "macro_f1": 0.3333333432674408, "num_tokens": 1627700.0, "repeat_count": 0.0, "routers_loss": 0.019490402191877365, "skip_count": 0.0, "step": 1008, "text_loss": 0.17463822662830353 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.742001761080129, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.000993807802786417, "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1630714.0, "repeat_count": 0.0, "routers_loss": 0.0019022391643375158, "skip_count": 0.0, "step": 1010, "text_loss": 0.5675593018531799 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 4.751394188435574, "f1_execute": 0.9599999785423279, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 0.1640625, "learning_rate": 0.0009937591471983322, "loss": 0.0501, "macro_f1": 0.7644444704055786, "num_tokens": 1633770.0, "repeat_count": 1.0, "routers_loss": 0.042485643178224564, "skip_count": 2.0, "step": 1012, "text_loss": 0.42387229204177856 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.760786615791019, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0009937103024008109, "loss": 0.0545, "macro_f1": 0.3272727429866791, "num_tokens": 1637120.0, "repeat_count": 0.0, "routers_loss": 0.09427817165851593, "skip_count": 1.0, "step": 1014, "text_loss": 0.49511051177978516 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009936612684125702, "loss": 0.0503, "macro_f1": 0.3333333432674408, "num_tokens": 1640165.0, "repeat_count": 0.0, "routers_loss": 0.005106127820909023, "skip_count": 0.0, "step": 1016, "text_loss": 0.5398799180984497 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.7795714705019074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2734375, "learning_rate": 0.0009936120452524004, "loss": 0.0506, "macro_f1": 0.3333333432674408, "num_tokens": 1643251.0, "repeat_count": 0.0, "routers_loss": 0.016914300620555878, "skip_count": 0.0, "step": 1018, "text_loss": 0.20882178843021393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.788963897857353, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009935626329391637, "loss": 0.0537, "macro_f1": 0.32098764181137085, "num_tokens": 1646560.0, "repeat_count": 0.0, "routers_loss": 0.13481520116329193, "skip_count": 2.0, "step": 1020, "text_loss": 0.5719883441925049 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.798356325212797, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1552734375, "learning_rate": 0.0009935130314917948, "loss": 0.0602, "macro_f1": 0.5492662787437439, "num_tokens": 1649538.0, "repeat_count": 0.0, "routers_loss": 0.07700438797473907, "skip_count": 2.0, "step": 1022, "text_loss": 0.1303367167711258 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.807748752568242, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0009934632409293015, "loss": 0.0611, "macro_f1": 0.32098764181137085, "num_tokens": 1652397.0, "repeat_count": 1.0, "routers_loss": 0.11416907608509064, "skip_count": 1.0, "step": 1024, "text_loss": 0.24076920747756958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.817141179923686, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.0009934132612707631, "loss": 0.0507, "macro_f1": 0.31446540355682373, "num_tokens": 1654938.0, "repeat_count": 0.0, "routers_loss": 0.09484589844942093, "skip_count": 2.0, "step": 1026, "text_loss": 0.1652517318725586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009933630925353324, "loss": 0.0395, "macro_f1": 0.3333333432674408, "num_tokens": 1658536.0, "repeat_count": 0.0, "routers_loss": 0.00741987070068717, "skip_count": 0.0, "step": 1028, "text_loss": 0.49296700954437256 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.835926034634576, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1845703125, "learning_rate": 0.0009933127347422337, "loss": 0.0602, "macro_f1": 0.32098764181137085, "num_tokens": 1661446.0, "repeat_count": 0.0, "routers_loss": 0.08399344235658646, "skip_count": 2.0, "step": 1030, "text_loss": 0.22363591194152832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.158203125, "learning_rate": 0.0009932621879107648, "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1664612.0, "repeat_count": 0.0, "routers_loss": 0.0031781597062945366, "skip_count": 0.0, "step": 1032, "text_loss": 0.36083245277404785 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.854710889345466, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.000993211452060295, "loss": 0.042, "macro_f1": 0.3272727429866791, "num_tokens": 1667467.0, "repeat_count": 0.0, "routers_loss": 0.03595469892024994, "skip_count": 1.0, "step": 1034, "text_loss": 0.16372856497764587 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.86410331670091, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000993160527210266, "loss": 0.061, "macro_f1": 0.3144654333591461, "num_tokens": 1670675.0, "repeat_count": 3.0, "routers_loss": 0.1597205102443695, "skip_count": 0.0, "step": 1036, "text_loss": 0.6049913763999939 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2197265625, "learning_rate": 0.000993109413380193, "loss": 0.0562, "macro_f1": 0.3333333432674408, "num_tokens": 1673477.0, "repeat_count": 0.0, "routers_loss": 0.009756010957062244, "skip_count": 0.0, "step": 1038, "text_loss": 0.7034620642662048 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.882888171411799, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.0009930581105896624, "loss": 0.0559, "macro_f1": 0.3272727429866791, "num_tokens": 1676809.0, "repeat_count": 0.0, "routers_loss": 0.020718922838568687, "skip_count": 0.0, "step": 1040, "text_loss": 0.2814720571041107 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.892280598767244, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.0009930066188583338, "loss": 0.0445, "macro_f1": 0.32098764181137085, "num_tokens": 1679398.0, "repeat_count": 1.0, "routers_loss": 0.04755603149533272, "skip_count": 1.0, "step": 1042, "text_loss": 0.5445759296417236 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.901673026122689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.0009929549382059388, "loss": 0.0509, "macro_f1": 0.3333333432674408, "num_tokens": 1682269.0, "repeat_count": 0.0, "routers_loss": 0.01040949858725071, "skip_count": 0.0, "step": 1044, "text_loss": 0.2876914143562317 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.911065453478133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0009929030686522816, "loss": 0.0363, "macro_f1": 0.3333333432674408, "num_tokens": 1685428.0, "repeat_count": 0.0, "routers_loss": 0.008158888667821884, "skip_count": 0.0, "step": 1046, "text_loss": 0.49053525924682617 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.9204578808335775, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009928510102172386, "loss": 0.0498, "macro_f1": 0.3333333432674408, "num_tokens": 1688252.0, "repeat_count": 0.0, "routers_loss": 0.005102572031319141, "skip_count": 0.0, "step": 1048, "text_loss": 0.5274341106414795 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0009927987629207587, "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1691289.0, "repeat_count": 0.0, "routers_loss": 0.016768503934144974, "skip_count": 0.0, "step": 1050, "text_loss": 0.9935035109519958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.939242735544467, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0009927463267828634, "loss": 0.0488, "macro_f1": 0.3333333432674408, "num_tokens": 1694148.0, "repeat_count": 0.0, "routers_loss": 0.010905829258263111, "skip_count": 0.0, "step": 1052, "text_loss": 0.20895758271217346 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.948635162899912, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.000992693701823646, "loss": 0.0624, "macro_f1": 0.3272727429866791, "num_tokens": 1698543.0, "repeat_count": 1.0, "routers_loss": 0.10533971339464188, "skip_count": 0.0, "step": 1054, "text_loss": 0.5776236653327942 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.958027590255357, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.0009926408880632726, "loss": 0.0556, "macro_f1": 0.3272727429866791, "num_tokens": 1702460.0, "repeat_count": 0.0, "routers_loss": 0.026313411071896553, "skip_count": 1.0, "step": 1056, "text_loss": 0.34990596771240234 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.967420017610801, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.099609375, "learning_rate": 0.0009925878855219818, "loss": 0.0391, "macro_f1": 0.3333333432674408, "num_tokens": 1705686.0, "repeat_count": 0.0, "routers_loss": 0.007763393223285675, "skip_count": 0.0, "step": 1058, "text_loss": 0.4980163276195526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.976812444966246, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.177734375, "learning_rate": 0.000992534694220084, "loss": 0.0613, "macro_f1": 0.3272727429866791, "num_tokens": 1708739.0, "repeat_count": 0.0, "routers_loss": 0.03998444974422455, "skip_count": 1.0, "step": 1060, "text_loss": 0.29092350602149963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.98620487232169, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.000992481314177962, "loss": 0.0312, "macro_f1": 0.32098764181137085, "num_tokens": 1711903.0, "repeat_count": 1.0, "routers_loss": 0.06966045498847961, "skip_count": 1.0, "step": 1062, "text_loss": 0.6267179250717163 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.995597299677136, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.244140625, "learning_rate": 0.0009924277454160717, "loss": 0.0548, "macro_f1": 0.3272727429866791, "num_tokens": 1715974.0, "repeat_count": 0.0, "routers_loss": 0.05536063387989998, "skip_count": 1.0, "step": 1064, "text_loss": 0.5813798904418945 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.004696213677723, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0009923739879549402, "loss": 0.0423, "macro_f1": 0.3333333432674408, "num_tokens": 1718828.0, "repeat_count": 0.0, "routers_loss": 0.020993782207369804, "skip_count": 0.0, "step": 1066, "text_loss": 0.22665327787399292 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0888671875, "learning_rate": 0.0009923200418151677, "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 1722419.0, "repeat_count": 0.0, "routers_loss": 0.007351701147854328, "skip_count": 0.0, "step": 1068, "text_loss": 0.5796169638633728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.0234810683886115, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009922659070174264, "loss": 0.0452, "macro_f1": 0.3272727429866791, "num_tokens": 1725663.0, "repeat_count": 1.0, "routers_loss": 0.026033315807580948, "skip_count": 0.0, "step": 1070, "text_loss": 0.25742828845977783 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009922115835824612, "loss": 0.041, "macro_f1": 0.3333333432674408, "num_tokens": 1729239.0, "repeat_count": 0.0, "routers_loss": 0.0118600158020854, "skip_count": 0.0, "step": 1072, "text_loss": 0.21630282700061798 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.042265923099501, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12158203125, "learning_rate": 0.0009921570715310884, "loss": 0.0364, "macro_f1": 0.6666666865348816, "num_tokens": 1732507.0, "repeat_count": 1.0, "routers_loss": 0.016118815168738365, "skip_count": 0.0, "step": 1074, "text_loss": 0.5639925003051758 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.051658350454946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009921023708841974, "loss": 0.0407, "macro_f1": 0.3333333432674408, "num_tokens": 1736182.0, "repeat_count": 0.0, "routers_loss": 0.004275390412658453, "skip_count": 0.0, "step": 1076, "text_loss": 0.5758615136146545 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.0009920474816627496, "loss": 0.037, "macro_f1": 0.3333333432674408, "num_tokens": 1739559.0, "repeat_count": 0.0, "routers_loss": 0.01299292128533125, "skip_count": 0.0, "step": 1078, "text_loss": 0.18221625685691833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.0704432051658355, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009919924038877788, "loss": 0.0343, "macro_f1": 0.32098764181137085, "num_tokens": 1742890.0, "repeat_count": 0.0, "routers_loss": 0.038295745849609375, "skip_count": 2.0, "step": 1080, "text_loss": 0.17354349792003632 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 29.0, "epoch": 5.07983563252128, "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.1884765625, "learning_rate": 0.0009919371375803905, "loss": 0.0455, "macro_f1": 0.8194444179534912, "num_tokens": 1746433.0, "repeat_count": 2.0, "routers_loss": 0.04052971675992012, "skip_count": 3.0, "step": 1082, "text_loss": 0.2250112146139145 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009918816827617632, "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 1750802.0, "repeat_count": 0.0, "routers_loss": 0.009114136919379234, "skip_count": 0.0, "step": 1084, "text_loss": 0.2526719272136688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.098620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.000991826039453147, "loss": 0.0392, "macro_f1": 0.3333333432674408, "num_tokens": 1754272.0, "repeat_count": 0.0, "routers_loss": 0.004904678091406822, "skip_count": 0.0, "step": 1086, "text_loss": 0.7308789491653442 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 5.108012914587614, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.138671875, "learning_rate": 0.000991770207675865, "loss": 0.0327, "macro_f1": 0.6666666865348816, "num_tokens": 1757231.0, "repeat_count": 0.0, "routers_loss": 0.02129189297556877, "skip_count": 2.0, "step": 1088, "text_loss": 0.21764220297336578 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.1174053419430585, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009917141874513113, "loss": 0.0315, "macro_f1": 0.3333333432674408, "num_tokens": 1760003.0, "repeat_count": 0.0, "routers_loss": 0.01310618408024311, "skip_count": 0.0, "step": 1090, "text_loss": 0.33892181515693665 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.126797769298503, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.171875, "learning_rate": 0.0009916579788009537, "loss": 0.0457, "macro_f1": 0.5492662787437439, "num_tokens": 1763052.0, "repeat_count": 0.0, "routers_loss": 0.02059309557080269, "skip_count": 2.0, "step": 1092, "text_loss": 0.6551769375801086 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.136190196653947, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10546875, "learning_rate": 0.0009916015817463312, "loss": 0.0385, "macro_f1": 0.5492662787437439, "num_tokens": 1766655.0, "repeat_count": 0.0, "routers_loss": 0.0274797435849905, "skip_count": 2.0, "step": 1094, "text_loss": 0.3984372019767761 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.000991544996309055, "loss": 0.0271, "macro_f1": 0.3333333432674408, "num_tokens": 1769997.0, "repeat_count": 0.0, "routers_loss": 0.01437368243932724, "skip_count": 0.0, "step": 1096, "text_loss": 0.4203338921070099 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.154975051364837, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.000991488222510809, "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 1773130.0, "repeat_count": 0.0, "routers_loss": 0.001382062560878694, "skip_count": 0.0, "step": 1098, "text_loss": 0.43132516741752625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.164367478720282, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.000991431260373349, "loss": 0.0329, "macro_f1": 0.3144654333591461, "num_tokens": 1775682.0, "repeat_count": 1.0, "routers_loss": 0.1115434318780899, "skip_count": 2.0, "step": 1100, "text_loss": 0.3218227028846741 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.111328125, "learning_rate": 0.000991374109918503, "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 1778407.0, "repeat_count": 0.0, "routers_loss": 0.009529678151011467, "skip_count": 0.0, "step": 1102, "text_loss": 0.17183731496334076 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.183152333431171, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1142578125, "learning_rate": 0.000991316771168171, "loss": 0.044, "macro_f1": 0.5492662787437439, "num_tokens": 1781518.0, "repeat_count": 0.0, "routers_loss": 0.018668074160814285, "skip_count": 2.0, "step": 1104, "text_loss": 1.1324785947799683 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.192544760786616, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.125, "learning_rate": 0.0009912592441443258, "loss": 0.0411, "macro_f1": 0.3272727429866791, "num_tokens": 1784878.0, "repeat_count": 0.0, "routers_loss": 0.04145100712776184, "skip_count": 1.0, "step": 1106, "text_loss": 0.6082063317298889 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.20193718814206, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08984375, "learning_rate": 0.0009912015288690112, "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1788978.0, "repeat_count": 0.0, "routers_loss": 0.021450644358992577, "skip_count": 1.0, "step": 1108, "text_loss": 0.5597621202468872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.2113296154975055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.083984375, "learning_rate": 0.0009911436253643444, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 1792321.0, "repeat_count": 0.0, "routers_loss": 0.017405325546860695, "skip_count": 0.0, "step": 1110, "text_loss": 0.2560598850250244 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.0009910855336525137, "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1795182.0, "repeat_count": 0.0, "routers_loss": 0.007162237539887428, "skip_count": 0.0, "step": 1112, "text_loss": 0.3438240587711334 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 5.230114470208394, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.115234375, "learning_rate": 0.00099102725375578, "loss": 0.0326, "macro_f1": 0.480392187833786, "num_tokens": 1798987.0, "repeat_count": 1.0, "routers_loss": 0.11149197816848755, "skip_count": 3.0, "step": 1114, "text_loss": 0.20455503463745117 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.239506897563839, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.0009909687856964767, "loss": 0.035, "macro_f1": 0.3006536364555359, "num_tokens": 1802064.0, "repeat_count": 2.0, "routers_loss": 0.12679415941238403, "skip_count": 3.0, "step": 1116, "text_loss": 0.11996729671955109 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.248899324919284, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.12451171875, "learning_rate": 0.0009909101294970082, "loss": 0.0365, "macro_f1": 0.5492662787437439, "num_tokens": 1805412.0, "repeat_count": 0.0, "routers_loss": 0.05108053982257843, "skip_count": 2.0, "step": 1118, "text_loss": 0.13224145770072937 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.258291752274729, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.123046875, "learning_rate": 0.0009908512851798522, "loss": 0.0455, "macro_f1": 0.6603773832321167, "num_tokens": 1808196.0, "repeat_count": 1.0, "routers_loss": 0.02131766639649868, "skip_count": 1.0, "step": 1120, "text_loss": 0.7824069261550903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.0009907922527675576, "loss": 0.0405, "macro_f1": 0.3333333432674408, "num_tokens": 1811622.0, "repeat_count": 0.0, "routers_loss": 0.006226244382560253, "skip_count": 0.0, "step": 1122, "text_loss": 0.5419743061065674 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.277076606985618, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.12890625, "learning_rate": 0.000990733032282746, "loss": 0.0535, "macro_f1": 0.5492662787437439, "num_tokens": 1814628.0, "repeat_count": 0.0, "routers_loss": 0.03088250942528248, "skip_count": 2.0, "step": 1124, "text_loss": 0.37100958824157715 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.286469034341063, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0810546875, "learning_rate": 0.000990673623748111, "loss": 0.0348, "macro_f1": 0.32098767161369324, "num_tokens": 1817205.0, "repeat_count": 0.0, "routers_loss": 0.05495348572731018, "skip_count": 1.0, "step": 1126, "text_loss": 0.20241330564022064 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.295861461696507, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.0927734375, "learning_rate": 0.0009906140271864173, "loss": 0.0433, "macro_f1": 0.4871794879436493, "num_tokens": 1820141.0, "repeat_count": 0.0, "routers_loss": 0.037809282541275024, "skip_count": 2.0, "step": 1128, "text_loss": 0.32965806126594543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.305253889051952, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0908203125, "learning_rate": 0.0009905542426205032, "loss": 0.0348, "macro_f1": 0.32098767161369324, "num_tokens": 1824011.0, "repeat_count": 0.0, "routers_loss": 0.03320181369781494, "skip_count": 1.0, "step": 1130, "text_loss": 0.36329755187034607 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.314646316407397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009904942700732777, "loss": 0.0335, "macro_f1": 0.3333333432674408, "num_tokens": 1826873.0, "repeat_count": 0.0, "routers_loss": 0.004102326463907957, "skip_count": 0.0, "step": 1132, "text_loss": 0.6692602038383484 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.324038743762841, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08544921875, "learning_rate": 0.0009904341095677226, "loss": 0.03, "macro_f1": 0.29333335161209106, "num_tokens": 1830103.0, "repeat_count": 2.0, "routers_loss": 0.2376193106174469, "skip_count": 4.0, "step": 1134, "text_loss": 0.19212862849235535 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.333431171118286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.119140625, "learning_rate": 0.0009903737611268919, "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 1833201.0, "repeat_count": 0.0, "routers_loss": 0.005253395065665245, "skip_count": 0.0, "step": 1136, "text_loss": 0.6773360371589661 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.34282359847373, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009903132247739107, "loss": 0.0305, "macro_f1": 0.3076923191547394, "num_tokens": 1836045.0, "repeat_count": 1.0, "routers_loss": 0.14382585883140564, "skip_count": 3.0, "step": 1138, "text_loss": 0.2882297933101654 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.3522160258291755, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.150390625, "learning_rate": 0.0009902525005319766, "loss": 0.04, "macro_f1": 0.5427350401878357, "num_tokens": 1839721.0, "repeat_count": 1.0, "routers_loss": 0.04033960774540901, "skip_count": 2.0, "step": 1140, "text_loss": 0.7172559499740601 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.36160845318462, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12109375, "learning_rate": 0.0009901915884243597, "loss": 0.0351, "macro_f1": 0.6666666865348816, "num_tokens": 1842614.0, "repeat_count": 1.0, "routers_loss": 0.005162308923900127, "skip_count": 0.0, "step": 1142, "text_loss": 0.42892804741859436 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.371000880540064, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009901304884744014, "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1845444.0, "repeat_count": 1.0, "routers_loss": 0.10117656737565994, "skip_count": 2.0, "step": 1144, "text_loss": 0.20806430280208588 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.380393307895509, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.0009900692007055152, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 1848558.0, "repeat_count": 0.0, "routers_loss": 0.014107038266956806, "skip_count": 0.0, "step": 1146, "text_loss": 0.5355974435806274 }, { "acc_repeat": 0.25, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 5.389785735250954, "f1_execute": 0.9166666865348816, "f1_repeat": 0.4000000059604645, "f1_skip": 0.6666666865348816, "grad_norm": 0.16015625, "learning_rate": 0.000990007725141187, "loss": 0.0449, "macro_f1": 0.6611111164093018, "num_tokens": 1852723.0, "repeat_count": 4.0, "routers_loss": 0.15537866950035095, "skip_count": 2.0, "step": 1148, "text_loss": 0.6388513445854187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.399178162606399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1181640625, "learning_rate": 0.0009899460618049741, "loss": 0.0397, "macro_f1": 0.3333333432674408, "num_tokens": 1856181.0, "repeat_count": 0.0, "routers_loss": 0.011800912208855152, "skip_count": 0.0, "step": 1150, "text_loss": 0.6113069653511047 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 5.408570589961843, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.000989884210720506, "loss": 0.0331, "macro_f1": 0.6666666865348816, "num_tokens": 1859685.0, "repeat_count": 2.0, "routers_loss": 0.022900646552443504, "skip_count": 0.0, "step": 1152, "text_loss": 0.25718021392822266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.4179630173172875, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009898221719114844, "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1862505.0, "repeat_count": 0.0, "routers_loss": 0.026814989745616913, "skip_count": 1.0, "step": 1154, "text_loss": 0.5426549911499023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.427355444672733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0009897599454016823, "loss": 0.0401, "macro_f1": 0.3333333432674408, "num_tokens": 1866266.0, "repeat_count": 0.0, "routers_loss": 0.0032623792067170143, "skip_count": 0.0, "step": 1156, "text_loss": 0.37752896547317505 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.436747872028177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07080078125, "learning_rate": 0.0009896975312149454, "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 1870216.0, "repeat_count": 0.0, "routers_loss": 0.015617577359080315, "skip_count": 0.0, "step": 1158, "text_loss": 0.18207129836082458 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.446140299383622, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009896349293751906, "loss": 0.0423, "macro_f1": 0.3272727429866791, "num_tokens": 1873338.0, "repeat_count": 0.0, "routers_loss": 0.02250153198838234, "skip_count": 1.0, "step": 1160, "text_loss": 0.548884391784668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.455532726739067, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0009895721399064072, "loss": 0.0388, "macro_f1": 0.32098764181137085, "num_tokens": 1876470.0, "repeat_count": 1.0, "routers_loss": 0.055204521864652634, "skip_count": 1.0, "step": 1162, "text_loss": 0.48052409291267395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.464925154094511, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.0009895091628326564, "loss": 0.0293, "macro_f1": 0.3333333432674408, "num_tokens": 1879354.0, "repeat_count": 0.0, "routers_loss": 0.009093789383769035, "skip_count": 0.0, "step": 1164, "text_loss": 0.3908069431781769 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.474317581449956, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.000989445998178071, "loss": 0.0323, "macro_f1": 0.3272727429866791, "num_tokens": 1881941.0, "repeat_count": 0.0, "routers_loss": 0.015086972154676914, "skip_count": 1.0, "step": 1166, "text_loss": 0.4884725511074066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.4837100088054, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0009893826459668558, "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1885374.0, "repeat_count": 0.0, "routers_loss": 0.06587666273117065, "skip_count": 3.0, "step": 1168, "text_loss": 0.12760137021541595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0009893191062232873, "loss": 0.0322, "macro_f1": 0.3333333432674408, "num_tokens": 1888612.0, "repeat_count": 0.0, "routers_loss": 0.006088624242693186, "skip_count": 0.0, "step": 1170, "text_loss": 0.4821319580078125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0009892553789717143, "loss": 0.0389, "macro_f1": 0.3333333432674408, "num_tokens": 1891463.0, "repeat_count": 0.0, "routers_loss": 0.010113578289747238, "skip_count": 0.0, "step": 1172, "text_loss": 0.3613642454147339 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.5118872908717345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0009891914642365573, "loss": 0.0404, "macro_f1": 0.3333333432674408, "num_tokens": 1894230.0, "repeat_count": 0.0, "routers_loss": 0.004947459790855646, "skip_count": 0.0, "step": 1174, "text_loss": 0.5037549138069153 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.521279718227179, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.0009891273620423083, "loss": 0.0428, "macro_f1": 0.3272727429866791, "num_tokens": 1897294.0, "repeat_count": 1.0, "routers_loss": 0.026075217872858047, "skip_count": 0.0, "step": 1176, "text_loss": 0.32558977603912354 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.530672145582624, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12158203125, "learning_rate": 0.0009890630724135314, "loss": 0.0351, "macro_f1": 0.3272727429866791, "num_tokens": 1901553.0, "repeat_count": 0.0, "routers_loss": 0.06650999188423157, "skip_count": 1.0, "step": 1178, "text_loss": 0.23473620414733887 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.540064572938069, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009889985953748625, "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 1904556.0, "repeat_count": 0.0, "routers_loss": 0.010361116379499435, "skip_count": 1.0, "step": 1180, "text_loss": 0.6927042007446289 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.549457000293513, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.103515625, "learning_rate": 0.0009889339309510094, "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 1908053.0, "repeat_count": 0.0, "routers_loss": 0.013286533765494823, "skip_count": 0.0, "step": 1182, "text_loss": 0.19977325201034546 }, { "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 5.558849427648958, "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, "grad_norm": 0.058837890625, "learning_rate": 0.0009888690791667518, "loss": 0.0204, "macro_f1": 0.7018141150474548, "num_tokens": 1911754.0, "repeat_count": 2.0, "routers_loss": 0.11920545995235443, "skip_count": 3.0, "step": 1184, "text_loss": 0.4072858691215515 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.568241855004403, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11083984375, "learning_rate": 0.0009888040400469408, "loss": 0.0391, "macro_f1": 0.3272727429866791, "num_tokens": 1914862.0, "repeat_count": 0.0, "routers_loss": 0.03652849420905113, "skip_count": 1.0, "step": 1186, "text_loss": 0.2654043138027191 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.577634282359847, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1689453125, "learning_rate": 0.0009887388136164996, "loss": 0.0336, "macro_f1": 0.5492662787437439, "num_tokens": 1918542.0, "repeat_count": 0.0, "routers_loss": 0.03991910070180893, "skip_count": 2.0, "step": 1188, "text_loss": 0.21130657196044922 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.587026709715292, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09521484375, "learning_rate": 0.000988673399900423, "loss": 0.0429, "macro_f1": 0.3272727429866791, "num_tokens": 1921589.0, "repeat_count": 0.0, "routers_loss": 0.014900135807693005, "skip_count": 0.0, "step": 1190, "text_loss": 0.5519335865974426 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.596419137070737, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0009886077989237777, "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 1924320.0, "repeat_count": 0.0, "routers_loss": 0.06271552294492722, "skip_count": 1.0, "step": 1192, "text_loss": 0.213813915848732 }, { "acc_repeat": 0.0, "acc_skip": 0.800000011920929, "avg_layers": 24.0, "epoch": 5.6058115644261814, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 0.1875, "learning_rate": 0.000988542010711702, "loss": 0.0342, "macro_f1": 0.6225374937057495, "num_tokens": 1927178.0, "repeat_count": 0.0, "routers_loss": 0.03081391751766205, "skip_count": 5.0, "step": 1194, "text_loss": 0.7524349093437195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.615203991781626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.0009884760352894064, "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1930216.0, "repeat_count": 0.0, "routers_loss": 0.008556773886084557, "skip_count": 0.0, "step": 1196, "text_loss": 0.28230375051498413 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.62459641913707, "f1_execute": 0.9615384340286255, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 0.1064453125, "learning_rate": 0.0009884098726821726, "loss": 0.0472, "macro_f1": 0.4871794879436493, "num_tokens": 1933312.0, "repeat_count": 3.0, "routers_loss": 0.05344727262854576, "skip_count": 0.0, "step": 1198, "text_loss": 0.5509607195854187 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 5.633988846492516, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.1298828125, "learning_rate": 0.000988343522915354, "loss": 0.0441, "macro_f1": 0.480392187833786, "num_tokens": 1936160.0, "repeat_count": 1.0, "routers_loss": 0.07324771583080292, "skip_count": 3.0, "step": 1200, "text_loss": 0.30565372109413147 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 25.0, "epoch": 5.64338127384796, "f1_execute": 0.8936169743537903, "f1_repeat": 0.0, "f1_skip": 0.444444477558136, "grad_norm": 0.2470703125, "learning_rate": 0.0009882769860143764, "loss": 0.0317, "macro_f1": 0.4460204839706421, "num_tokens": 1939266.0, "repeat_count": 0.0, "routers_loss": 0.18620699644088745, "skip_count": 6.0, "step": 1202, "text_loss": 0.976121723651886 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.6527737012034045, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1318359375, "learning_rate": 0.000988210262004737, "loss": 0.0474, "macro_f1": 0.6666666865348816, "num_tokens": 1942173.0, "repeat_count": 0.0, "routers_loss": 0.007703613489866257, "skip_count": 1.0, "step": 1204, "text_loss": 0.5647401809692383 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.66216612855885, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1484375, "learning_rate": 0.0009881433509120036, "loss": 0.0376, "macro_f1": 0.5492662787437439, "num_tokens": 1945071.0, "repeat_count": 0.0, "routers_loss": 0.02162683941423893, "skip_count": 2.0, "step": 1206, "text_loss": 0.24229218065738678 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.671558555914294, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0966796875, "learning_rate": 0.0009880762527618176, "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1949060.0, "repeat_count": 0.0, "routers_loss": 0.017667081207036972, "skip_count": 0.0, "step": 1208, "text_loss": 0.4035970866680145 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009880089675798908, "loss": 0.0367, "macro_f1": 0.3333333432674408, "num_tokens": 1951698.0, "repeat_count": 0.0, "routers_loss": 0.006405784282833338, "skip_count": 0.0, "step": 1210, "text_loss": 0.5319879055023193 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.690343410625183, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009879414953920071, "loss": 0.0294, "macro_f1": 0.3333333432674408, "num_tokens": 1955266.0, "repeat_count": 0.0, "routers_loss": 0.009859707206487656, "skip_count": 0.0, "step": 1212, "text_loss": 0.6687407493591309 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.699735837980628, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.130859375, "learning_rate": 0.0009878738362240219, "loss": 0.045, "macro_f1": 0.5492662787437439, "num_tokens": 1958538.0, "repeat_count": 0.0, "routers_loss": 0.030890554189682007, "skip_count": 2.0, "step": 1214, "text_loss": 0.20820017158985138 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 5.709128265336073, "f1_execute": 0.9200000166893005, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.000987805990101862, "loss": 0.0317, "macro_f1": 0.47333335876464844, "num_tokens": 1961419.0, "repeat_count": 2.0, "routers_loss": 0.10383198410272598, "skip_count": 2.0, "step": 1216, "text_loss": 0.8664976358413696 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.718520692691517, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009877379570515268, "loss": 0.0366, "macro_f1": 0.3333333432674408, "num_tokens": 1964836.0, "repeat_count": 0.0, "routers_loss": 0.013376163318753242, "skip_count": 0.0, "step": 1218, "text_loss": 0.4223395884037018 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.727913120046962, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0859375, "learning_rate": 0.0009876697370990865, "loss": 0.0343, "macro_f1": 0.3333333432674408, "num_tokens": 1967620.0, "repeat_count": 0.0, "routers_loss": 0.008577900938689709, "skip_count": 0.0, "step": 1220, "text_loss": 0.4789901375770569 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.0009876013302706828, "loss": 0.049, "macro_f1": 0.3333333432674408, "num_tokens": 1971100.0, "repeat_count": 0.0, "routers_loss": 0.004730266984552145, "skip_count": 0.0, "step": 1222, "text_loss": 0.6799837946891785 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.7466979747578515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009875327365925295, "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1974408.0, "repeat_count": 0.0, "routers_loss": 0.010849526152014732, "skip_count": 0.0, "step": 1224, "text_loss": 0.18967926502227783 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.756090402113296, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009874639560909118, "loss": 0.0498, "macro_f1": 0.32098767161369324, "num_tokens": 1977046.0, "repeat_count": 0.0, "routers_loss": 0.04841252416372299, "skip_count": 1.0, "step": 1226, "text_loss": 0.6133310198783875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.765482829468741, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1318359375, "learning_rate": 0.0009873949887921867, "loss": 0.0402, "macro_f1": 0.3272727429866791, "num_tokens": 1980330.0, "repeat_count": 0.0, "routers_loss": 0.029638588428497314, "skip_count": 1.0, "step": 1228, "text_loss": 0.15649555623531342 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.774875256824186, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.0009873258347227823, "loss": 0.0331, "macro_f1": 0.3272727429866791, "num_tokens": 1983173.0, "repeat_count": 0.0, "routers_loss": 0.009955910965800285, "skip_count": 0.0, "step": 1230, "text_loss": 0.4741005599498749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009872564939091989, "loss": 0.0342, "macro_f1": 0.3333333432674408, "num_tokens": 1986825.0, "repeat_count": 0.0, "routers_loss": 0.010205300524830818, "skip_count": 0.0, "step": 1232, "text_loss": 0.5315462350845337 }, { "acc_repeat": 1.0, "acc_skip": 0.5714285969734192, "avg_layers": 25.0, "epoch": 5.7936601115350745, "f1_execute": 0.9302325248718262, "f1_repeat": 1.0, "f1_skip": 0.7272727489471436, "grad_norm": 0.11865234375, "learning_rate": 0.0009871869663780077, "loss": 0.0336, "macro_f1": 0.8858351111412048, "num_tokens": 1990448.0, "repeat_count": 1.0, "routers_loss": 0.09120134264230728, "skip_count": 7.0, "step": 1234, "text_loss": 0.6187508702278137 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.125, "learning_rate": 0.0009871172521558522, "loss": 0.0475, "macro_f1": 0.6666666865348816, "num_tokens": 1993474.0, "repeat_count": 0.0, "routers_loss": 0.016188839450478554, "skip_count": 1.0, "step": 1236, "text_loss": 0.20783066749572754 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 5.812444966245964, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.216796875, "learning_rate": 0.0009870473512694465, "loss": 0.0373, "macro_f1": 0.5934640765190125, "num_tokens": 1996536.0, "repeat_count": 0.0, "routers_loss": 0.05046704784035683, "skip_count": 3.0, "step": 1238, "text_loss": 0.247748002409935 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.821837393601409, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.09033203125, "learning_rate": 0.0009869772637455772, "loss": 0.0251, "macro_f1": 0.4871794879436493, "num_tokens": 1999530.0, "repeat_count": 0.0, "routers_loss": 0.044926248490810394, "skip_count": 2.0, "step": 1240, "text_loss": 0.26001980900764465 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.831229820956853, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1513671875, "learning_rate": 0.000986906989611102, "loss": 0.0446, "macro_f1": 0.3272727429866791, "num_tokens": 2002782.0, "repeat_count": 0.0, "routers_loss": 0.025911526754498482, "skip_count": 0.0, "step": 1242, "text_loss": 0.9009982943534851 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.8406222483122985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0009868365288929492, "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2005331.0, "repeat_count": 0.0, "routers_loss": 0.0043760035187006, "skip_count": 0.0, "step": 1244, "text_loss": 0.5547386407852173 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.850014675667743, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.0009867658816181206, "loss": 0.0374, "macro_f1": 0.3333333432674408, "num_tokens": 2008115.0, "repeat_count": 0.0, "routers_loss": 0.009227181784808636, "skip_count": 0.0, "step": 1246, "text_loss": 1.0067731142044067 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.859407103023187, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.000986695047813688, "loss": 0.0261, "macro_f1": 0.3272727429866791, "num_tokens": 2011137.0, "repeat_count": 1.0, "routers_loss": 0.023822437971830368, "skip_count": 0.0, "step": 1248, "text_loss": 0.30058956146240234 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 5.868799530378633, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.1044921875, "learning_rate": 0.0009866240275067948, "loss": 0.044, "macro_f1": 0.47333335876464844, "num_tokens": 2014159.0, "repeat_count": 2.0, "routers_loss": 0.21523773670196533, "skip_count": 3.0, "step": 1250, "text_loss": 0.39072203636169434 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.878191957734077, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1201171875, "learning_rate": 0.0009865528207246563, "loss": 0.0351, "macro_f1": 0.5492662787437439, "num_tokens": 2017731.0, "repeat_count": 0.0, "routers_loss": 0.06184682995080948, "skip_count": 2.0, "step": 1252, "text_loss": 0.35751575231552124 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.8875843850895215, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.166015625, "learning_rate": 0.000986481427494559, "loss": 0.0336, "macro_f1": 0.3333333432674408, "num_tokens": 2020485.0, "repeat_count": 0.0, "routers_loss": 0.007573372684419155, "skip_count": 0.0, "step": 1254, "text_loss": 0.4061077833175659 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.896976812444966, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.000986409847843861, "loss": 0.0382, "macro_f1": 0.3272727429866791, "num_tokens": 2024149.0, "repeat_count": 1.0, "routers_loss": 0.07447971403598785, "skip_count": 0.0, "step": 1256, "text_loss": 0.41876497864723206 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.906369239800411, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.000986338081799992, "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 2026545.0, "repeat_count": 0.0, "routers_loss": 0.006609147880226374, "skip_count": 0.0, "step": 1258, "text_loss": 0.4673794209957123 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.915761667155856, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.0009862661293904523, "loss": 0.0498, "macro_f1": 0.32098764181137085, "num_tokens": 2029581.0, "repeat_count": 0.0, "routers_loss": 0.10624702274799347, "skip_count": 2.0, "step": 1260, "text_loss": 0.3483233153820038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1201171875, "learning_rate": 0.0009861939906428145, "loss": 0.0525, "macro_f1": 0.3333333432674408, "num_tokens": 2033936.0, "repeat_count": 0.0, "routers_loss": 0.007944886572659016, "skip_count": 0.0, "step": 1262, "text_loss": 0.16362667083740234 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.934546521866745, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009861216655847225, "loss": 0.0376, "macro_f1": 0.6666666865348816, "num_tokens": 2037876.0, "repeat_count": 1.0, "routers_loss": 0.007004092447459698, "skip_count": 0.0, "step": 1264, "text_loss": 0.43228110671043396 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.94393894922219, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.0009860491542438912, "loss": 0.047, "macro_f1": 0.3272727429866791, "num_tokens": 2040842.0, "repeat_count": 0.0, "routers_loss": 0.026916226372122765, "skip_count": 1.0, "step": 1266, "text_loss": 0.5901188850402832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.953331376577634, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.000985976456648107, "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 2043890.0, "repeat_count": 0.0, "routers_loss": 0.007325216196477413, "skip_count": 0.0, "step": 1268, "text_loss": 0.8780109882354736 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.962723803933079, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.10205078125, "learning_rate": 0.000985903572825228, "loss": 0.0306, "macro_f1": 0.4871794879436493, "num_tokens": 2048848.0, "repeat_count": 0.0, "routers_loss": 0.05007527023553848, "skip_count": 2.0, "step": 1270, "text_loss": 0.5863722562789917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.972116231288524, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.000985830502803183, "loss": 0.0396, "macro_f1": 0.3272727429866791, "num_tokens": 2051561.0, "repeat_count": 0.0, "routers_loss": 0.023995524272322655, "skip_count": 0.0, "step": 1272, "text_loss": 0.7460709810256958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.9815086586439685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.0009857572466099732, "loss": 0.0431, "macro_f1": 0.3333333432674408, "num_tokens": 2054752.0, "repeat_count": 0.0, "routers_loss": 0.006928362417966127, "skip_count": 0.0, "step": 1274, "text_loss": 0.5130293369293213 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.162109375, "learning_rate": 0.0009856838042736698, "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 2058151.0, "repeat_count": 0.0, "routers_loss": 0.006969396956264973, "skip_count": 0.0, "step": 1276, "text_loss": 0.5911393761634827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.0009856101758224166, "loss": 0.0441, "macro_f1": 0.3333333432674408, "num_tokens": 2061012.0, "repeat_count": 0.0, "routers_loss": 0.003499418031424284, "skip_count": 0.0, "step": 1278, "text_loss": 0.25347545742988586 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.000985536361284428, "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2064597.0, "repeat_count": 0.0, "routers_loss": 0.007856054231524467, "skip_count": 0.0, "step": 1280, "text_loss": 0.7476963400840759 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.01878485471089, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0888671875, "learning_rate": 0.0009854623606879898, "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2067972.0, "repeat_count": 0.0, "routers_loss": 0.02617792971432209, "skip_count": 1.0, "step": 1282, "text_loss": 0.5775872468948364 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.028177282066334, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09033203125, "learning_rate": 0.000985388174061459, "loss": 0.0356, "macro_f1": 0.32098767161369324, "num_tokens": 2071812.0, "repeat_count": 0.0, "routers_loss": 0.035979997366666794, "skip_count": 1.0, "step": 1284, "text_loss": 0.2933400869369507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.037569709421779, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08447265625, "learning_rate": 0.0009853138014332646, "loss": 0.0273, "macro_f1": 0.3333333432674408, "num_tokens": 2074868.0, "repeat_count": 0.0, "routers_loss": 0.005142854526638985, "skip_count": 0.0, "step": 1286, "text_loss": 0.29085102677345276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.046962136777223, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09033203125, "learning_rate": 0.0009852392428319058, "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 2078225.0, "repeat_count": 0.0, "routers_loss": 0.0032799106556922197, "skip_count": 0.0, "step": 1288, "text_loss": 0.7293626070022583 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 6.056354564132668, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.08935546875, "learning_rate": 0.0009851644982859537, "loss": 0.0273, "macro_f1": 0.480392187833786, "num_tokens": 2081495.0, "repeat_count": 1.0, "routers_loss": 0.12224318832159042, "skip_count": 3.0, "step": 1290, "text_loss": 0.26125892996788025 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.065746991488113, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0009850895678240508, "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2084390.0, "repeat_count": 1.0, "routers_loss": 0.010662888176739216, "skip_count": 0.0, "step": 1292, "text_loss": 0.3510764539241791 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.075139418843557, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1689453125, "learning_rate": 0.0009850144514749104, "loss": 0.0332, "macro_f1": 0.5492662787437439, "num_tokens": 2087210.0, "repeat_count": 0.0, "routers_loss": 0.01979079470038414, "skip_count": 2.0, "step": 1294, "text_loss": 0.40202176570892334 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.084531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11669921875, "learning_rate": 0.000984939149267317, "loss": 0.0253, "macro_f1": 0.6666666865348816, "num_tokens": 2090777.0, "repeat_count": 0.0, "routers_loss": 0.005172552540898323, "skip_count": 1.0, "step": 1296, "text_loss": 0.5275651216506958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.093924273554447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009848636612301272, "loss": 0.0299, "macro_f1": 0.3333333432674408, "num_tokens": 2094248.0, "repeat_count": 0.0, "routers_loss": 0.0029599082190543413, "skip_count": 0.0, "step": 1298, "text_loss": 0.4517653286457062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23046875, "learning_rate": 0.0009847879873922675, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2097139.0, "repeat_count": 0.0, "routers_loss": 0.011455860920250416, "skip_count": 0.0, "step": 1300, "text_loss": 0.16888445615768433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.112709128265336, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09619140625, "learning_rate": 0.0009847121277827366, "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2100415.0, "repeat_count": 0.0, "routers_loss": 0.008091195486485958, "skip_count": 0.0, "step": 1302, "text_loss": 0.40061676502227783 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.122101555620781, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.000984636082430604, "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2103285.0, "repeat_count": 0.0, "routers_loss": 0.009593960829079151, "skip_count": 0.0, "step": 1304, "text_loss": 0.7211073637008667 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.131493982976226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.107421875, "learning_rate": 0.0009845598513650103, "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2106255.0, "repeat_count": 0.0, "routers_loss": 0.0023068038281053305, "skip_count": 0.0, "step": 1306, "text_loss": 0.7077119946479797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.0009844834346151674, "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 2109305.0, "repeat_count": 0.0, "routers_loss": 0.007703019306063652, "skip_count": 0.0, "step": 1308, "text_loss": 0.3534316122531891 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.1502788376871145, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0009844068322103585, "loss": 0.0287, "macro_f1": 0.3272727429866791, "num_tokens": 2112216.0, "repeat_count": 0.0, "routers_loss": 0.023549847304821014, "skip_count": 1.0, "step": 1310, "text_loss": 0.6792599558830261 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.0009843300441799378, "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 2114925.0, "repeat_count": 0.0, "routers_loss": 0.007605871185660362, "skip_count": 0.0, "step": 1312, "text_loss": 0.1571389138698578 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.169063692398004, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0009842530705533304, "loss": 0.0253, "macro_f1": 0.3272727429866791, "num_tokens": 2117744.0, "repeat_count": 0.0, "routers_loss": 0.014964760281145573, "skip_count": 0.0, "step": 1314, "text_loss": 0.7840361595153809 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.178456119753449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.000984175911360033, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2120848.0, "repeat_count": 0.0, "routers_loss": 0.004663798492401838, "skip_count": 0.0, "step": 1316, "text_loss": 0.536246120929718 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.187848547108893, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1201171875, "learning_rate": 0.000984098566629613, "loss": 0.0288, "macro_f1": 0.5492662787437439, "num_tokens": 2123651.0, "repeat_count": 0.0, "routers_loss": 0.022852955386042595, "skip_count": 2.0, "step": 1318, "text_loss": 0.43372172117233276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.197240974464338, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0009840210363917087, "loss": 0.0216, "macro_f1": 0.3333333432674408, "num_tokens": 2128011.0, "repeat_count": 0.0, "routers_loss": 0.012578422203660011, "skip_count": 0.0, "step": 1320, "text_loss": 0.28190380334854126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.0009839433206760306, "loss": 0.0204, "macro_f1": 0.3333333432674408, "num_tokens": 2131035.0, "repeat_count": 0.0, "routers_loss": 0.006863643880933523, "skip_count": 0.0, "step": 1322, "text_loss": 0.6340444087982178 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.216025829175227, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.0009838654195123589, "loss": 0.0243, "macro_f1": 0.3333333432674408, "num_tokens": 2133856.0, "repeat_count": 0.0, "routers_loss": 0.00468854233622551, "skip_count": 0.0, "step": 1324, "text_loss": 0.5138425827026367 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.225418256530672, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0009837873329305458, "loss": 0.0396, "macro_f1": 0.6666666865348816, "num_tokens": 2136451.0, "repeat_count": 1.0, "routers_loss": 0.005731126759201288, "skip_count": 0.0, "step": 1326, "text_loss": 0.742124617099762 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.000983709060960514, "loss": 0.0416, "macro_f1": 0.3333333432674408, "num_tokens": 2139496.0, "repeat_count": 0.0, "routers_loss": 0.0056343949399888515, "skip_count": 0.0, "step": 1328, "text_loss": 0.7317464351654053 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.2442031112415615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.0009836306036322576, "loss": 0.0312, "macro_f1": 0.3333333432674408, "num_tokens": 2143120.0, "repeat_count": 0.0, "routers_loss": 0.005127966403961182, "skip_count": 0.0, "step": 1330, "text_loss": 0.538652241230011 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 6.253595538597006, "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.11083984375, "learning_rate": 0.0009835519609758415, "loss": 0.0301, "macro_f1": 0.590062141418457, "num_tokens": 2145807.0, "repeat_count": 3.0, "routers_loss": 0.1673707216978073, "skip_count": 4.0, "step": 1332, "text_loss": 0.3498198091983795 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.262987965952451, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0009834731330214017, "loss": 0.0293, "macro_f1": 0.3272727429866791, "num_tokens": 2148397.0, "repeat_count": 1.0, "routers_loss": 0.04026653990149498, "skip_count": 0.0, "step": 1334, "text_loss": 0.8153424859046936 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 27.0, "epoch": 6.272380393307896, "f1_execute": 0.8999999761581421, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8000000715255737, "grad_norm": 0.16015625, "learning_rate": 0.0009833941197991455, "loss": 0.0329, "macro_f1": 0.7888889312744141, "num_tokens": 2152226.0, "repeat_count": 2.0, "routers_loss": 0.05481519177556038, "skip_count": 5.0, "step": 1336, "text_loss": 0.7802760004997253 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.28177282066334, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009833149213393506, "loss": 0.0304, "macro_f1": 0.3272727429866791, "num_tokens": 2156023.0, "repeat_count": 0.0, "routers_loss": 0.01760484278202057, "skip_count": 0.0, "step": 1338, "text_loss": 0.19721226394176483 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.2911652480187845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.000983235537672366, "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2160037.0, "repeat_count": 0.0, "routers_loss": 0.013206037692725658, "skip_count": 0.0, "step": 1340, "text_loss": 0.5003817081451416 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.30055767537423, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.000983155968828612, "loss": 0.0315, "macro_f1": 0.6666666865348816, "num_tokens": 2163910.0, "repeat_count": 1.0, "routers_loss": 0.01256406120955944, "skip_count": 0.0, "step": 1342, "text_loss": 0.5996923446655273 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.309950102729674, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11962890625, "learning_rate": 0.0009830762148385793, "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2166921.0, "repeat_count": 0.0, "routers_loss": 0.015086234547197819, "skip_count": 1.0, "step": 1344, "text_loss": 0.45356282591819763 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.319342530085119, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08447265625, "learning_rate": 0.0009829962757328297, "loss": 0.0223, "macro_f1": 0.32098764181137085, "num_tokens": 2170135.0, "repeat_count": 0.0, "routers_loss": 0.07909081131219864, "skip_count": 2.0, "step": 1346, "text_loss": 0.2874644994735718 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 6.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009829161515419959, "loss": 0.0246, "macro_f1": 0.6666666865348816, "num_tokens": 2173029.0, "repeat_count": 0.0, "routers_loss": 0.013569854199886322, "skip_count": 2.0, "step": 1348, "text_loss": 0.25533875823020935 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.3381273847960085, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0009828358422967823, "loss": 0.0226, "macro_f1": 0.32098764181137085, "num_tokens": 2176605.0, "repeat_count": 1.0, "routers_loss": 0.08111091703176498, "skip_count": 1.0, "step": 1350, "text_loss": 0.32827726006507874 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 6.347519812151453, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.091796875, "learning_rate": 0.0009827553480279627, "loss": 0.03, "macro_f1": 0.5427350401878357, "num_tokens": 2179406.0, "repeat_count": 0.0, "routers_loss": 0.026550088077783585, "skip_count": 2.0, "step": 1352, "text_loss": 0.2966301143169403 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009826746687663832, "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2182353.0, "repeat_count": 0.0, "routers_loss": 0.003914554137736559, "skip_count": 0.0, "step": 1354, "text_loss": 0.7596251964569092 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 6.366304666862343, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0859375, "learning_rate": 0.0009825938045429602, "loss": 0.0324, "macro_f1": 0.5866667032241821, "num_tokens": 2185786.0, "repeat_count": 1.0, "routers_loss": 0.059612665325403214, "skip_count": 3.0, "step": 1356, "text_loss": 0.12325898557901382 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.375697094217787, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10009765625, "learning_rate": 0.0009825127553886807, "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 2190157.0, "repeat_count": 0.0, "routers_loss": 0.0071132429875433445, "skip_count": 0.0, "step": 1358, "text_loss": 0.9287898540496826 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.3850895215732315, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0009824315213346033, "loss": 0.0348, "macro_f1": 0.3333333432674408, "num_tokens": 2193077.0, "repeat_count": 0.0, "routers_loss": 0.009611099027097225, "skip_count": 0.0, "step": 1360, "text_loss": 0.20427259802818298 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.394481948928676, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.0009823501024118569, "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2196494.0, "repeat_count": 0.0, "routers_loss": 0.006913455203175545, "skip_count": 0.0, "step": 1362, "text_loss": 0.574759840965271 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.403874376284121, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009822684986516411, "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 2199839.0, "repeat_count": 0.0, "routers_loss": 0.009208920411765575, "skip_count": 0.0, "step": 1364, "text_loss": 0.42422571778297424 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.413266803639566, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.000982186710085227, "loss": 0.0208, "macro_f1": 0.32098764181137085, "num_tokens": 2203212.0, "repeat_count": 1.0, "routers_loss": 0.059975091367959976, "skip_count": 1.0, "step": 1366, "text_loss": 0.29213017225265503 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 6.42265923099501, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.181640625, "learning_rate": 0.0009821047367439561, "loss": 0.0358, "macro_f1": 0.44705885648727417, "num_tokens": 2206240.0, "repeat_count": 0.0, "routers_loss": 0.048244867473840714, "skip_count": 4.0, "step": 1368, "text_loss": 0.3072395324707031 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.432051658350455, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.0009820225786592405, "loss": 0.0375, "macro_f1": 0.3272727429866791, "num_tokens": 2209903.0, "repeat_count": 1.0, "routers_loss": 0.026068156585097313, "skip_count": 0.0, "step": 1370, "text_loss": 0.5961400270462036 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.4414440857059, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.109375, "learning_rate": 0.0009819402358625634, "loss": 0.0366, "macro_f1": 0.3272727429866791, "num_tokens": 2213439.0, "repeat_count": 0.0, "routers_loss": 0.022615568712353706, "skip_count": 1.0, "step": 1372, "text_loss": 0.19375644624233246 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.450836513061344, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.000981857708385479, "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2216457.0, "repeat_count": 0.0, "routers_loss": 0.005855285096913576, "skip_count": 0.0, "step": 1374, "text_loss": 0.5123368501663208 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.460228940416789, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.0009817749962596114, "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2219975.0, "repeat_count": 1.0, "routers_loss": 0.0651634931564331, "skip_count": 0.0, "step": 1376, "text_loss": 0.5999220609664917 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.469621367772234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0009816920995166568, "loss": 0.0371, "macro_f1": 0.6666666865348816, "num_tokens": 2222833.0, "repeat_count": 1.0, "routers_loss": 0.011408994905650616, "skip_count": 0.0, "step": 1378, "text_loss": 0.5323230624198914 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.4790137951276785, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.205078125, "learning_rate": 0.0009816090181883807, "loss": 0.0313, "macro_f1": 0.32098764181137085, "num_tokens": 2225842.0, "repeat_count": 0.0, "routers_loss": 0.039720915257930756, "skip_count": 2.0, "step": 1380, "text_loss": 0.23363439738750458 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.0009815257523066204, "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 2229430.0, "repeat_count": 0.0, "routers_loss": 0.002765297656878829, "skip_count": 0.0, "step": 1382, "text_loss": 0.718977689743042 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.497798649838567, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.0009814423019032835, "loss": 0.0396, "macro_f1": 0.5492662787437439, "num_tokens": 2232594.0, "repeat_count": 2.0, "routers_loss": 0.05362323671579361, "skip_count": 0.0, "step": 1384, "text_loss": 0.6392166614532471 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.507191077194013, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.0009813586670103483, "loss": 0.0426, "macro_f1": 0.6603773832321167, "num_tokens": 2236327.0, "repeat_count": 1.0, "routers_loss": 0.031728316098451614, "skip_count": 1.0, "step": 1386, "text_loss": 0.5951619148254395 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.516583504549457, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.126953125, "learning_rate": 0.0009812748476598638, "loss": 0.031, "macro_f1": 0.5492662787437439, "num_tokens": 2239746.0, "repeat_count": 0.0, "routers_loss": 0.03981253132224083, "skip_count": 2.0, "step": 1388, "text_loss": 0.22756551206111908 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.5259759319049016, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.12451171875, "learning_rate": 0.0009811908438839498, "loss": 0.0331, "macro_f1": 0.5492662787437439, "num_tokens": 2242786.0, "repeat_count": 0.0, "routers_loss": 0.04617162421345711, "skip_count": 2.0, "step": 1390, "text_loss": 0.3233799934387207 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.535368359260346, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.000981106655714797, "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2245696.0, "repeat_count": 0.0, "routers_loss": 0.046828847378492355, "skip_count": 1.0, "step": 1392, "text_loss": 0.24273279309272766 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.544760786615791, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.07373046875, "learning_rate": 0.0009810222831846656, "loss": 0.0307, "macro_f1": 0.5492662787437439, "num_tokens": 2249326.0, "repeat_count": 0.0, "routers_loss": 0.010921589098870754, "skip_count": 2.0, "step": 1394, "text_loss": 0.3921460807323456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.554153213971236, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.0009809377263258882, "loss": 0.0315, "macro_f1": 0.32098767161369324, "num_tokens": 2253393.0, "repeat_count": 0.0, "routers_loss": 0.04564022272825241, "skip_count": 1.0, "step": 1396, "text_loss": 0.582602858543396 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.56354564132668, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.103515625, "learning_rate": 0.000980852985170867, "loss": 0.0328, "macro_f1": 0.3272727429866791, "num_tokens": 2256626.0, "repeat_count": 0.0, "routers_loss": 0.013289985246956348, "skip_count": 0.0, "step": 1398, "text_loss": 0.41031694412231445 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.5729380686821255, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.0009807680597520745, "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2259326.0, "repeat_count": 0.0, "routers_loss": 0.0065213534981012344, "skip_count": 0.0, "step": 1400, "text_loss": 0.2888098657131195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.58233049603757, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23046875, "learning_rate": 0.0009806829501020546, "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2262344.0, "repeat_count": 0.0, "routers_loss": 0.04199840500950813, "skip_count": 1.0, "step": 1402, "text_loss": 0.31973034143447876 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.591722923393014, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08935546875, "learning_rate": 0.0009805976562534215, "loss": 0.0317, "macro_f1": 0.6603773832321167, "num_tokens": 2266354.0, "repeat_count": 1.0, "routers_loss": 0.015434930101037025, "skip_count": 1.0, "step": 1404, "text_loss": 0.508630633354187 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 6.601115350748459, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.0009805121782388599, "loss": 0.0339, "macro_f1": 0.6533333659172058, "num_tokens": 2269660.0, "repeat_count": 2.0, "routers_loss": 0.0720924660563469, "skip_count": 2.0, "step": 1406, "text_loss": 0.40927737951278687 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.610507778103904, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0634765625, "learning_rate": 0.0009804265160911253, "loss": 0.0266, "macro_f1": 0.5492662787437439, "num_tokens": 2273335.0, "repeat_count": 0.0, "routers_loss": 0.02400495670735836, "skip_count": 2.0, "step": 1408, "text_loss": 0.1777762621641159 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.6199002054593485, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2314453125, "learning_rate": 0.0009803406698430433, "loss": 0.0371, "macro_f1": 0.3272727429866791, "num_tokens": 2277107.0, "repeat_count": 0.0, "routers_loss": 0.02560107782483101, "skip_count": 1.0, "step": 1410, "text_loss": 0.17955881357192993 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.629292632814793, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009802546395275104, "loss": 0.0349, "macro_f1": 0.3333333432674408, "num_tokens": 2281638.0, "repeat_count": 0.0, "routers_loss": 0.006655813194811344, "skip_count": 0.0, "step": 1412, "text_loss": 0.20882295072078705 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 6.638685060170237, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.08740234375, "learning_rate": 0.000980168425177494, "loss": 0.0342, "macro_f1": 0.8200000524520874, "num_tokens": 2284876.0, "repeat_count": 1.0, "routers_loss": 0.06325097382068634, "skip_count": 3.0, "step": 1414, "text_loss": 0.26035264134407043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.648077487525683, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.000980082026826031, "loss": 0.0315, "macro_f1": 0.3272727429866791, "num_tokens": 2288938.0, "repeat_count": 1.0, "routers_loss": 0.013436575420200825, "skip_count": 0.0, "step": 1416, "text_loss": 0.5502325892448425 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.657469914881127, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.0009799954445062296, "loss": 0.0193, "macro_f1": 0.6603773832321167, "num_tokens": 2292317.0, "repeat_count": 1.0, "routers_loss": 0.011264479719102383, "skip_count": 1.0, "step": 1418, "text_loss": 0.48075684905052185 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.666862342236572, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1611328125, "learning_rate": 0.0009799086782512686, "loss": 0.0292, "macro_f1": 0.5492662787437439, "num_tokens": 2295935.0, "repeat_count": 0.0, "routers_loss": 0.02833271212875843, "skip_count": 2.0, "step": 1420, "text_loss": 0.18221206963062286 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09375, "learning_rate": 0.0009798217280943967, "loss": 0.0356, "macro_f1": 0.6666666865348816, "num_tokens": 2298927.0, "repeat_count": 0.0, "routers_loss": 0.009208574891090393, "skip_count": 1.0, "step": 1422, "text_loss": 0.48686322569847107 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.685647196947461, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.0009797345940689335, "loss": 0.0267, "macro_f1": 0.3272727429866791, "num_tokens": 2301541.0, "repeat_count": 0.0, "routers_loss": 0.015011847950518131, "skip_count": 0.0, "step": 1424, "text_loss": 0.49446266889572144 }, { "acc_repeat": 0.0, "acc_skip": 0.4000000059604645, "avg_layers": 26.0, "epoch": 6.695039624302906, "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, "f1_skip": 0.5714285969734192, "grad_norm": 0.1337890625, "learning_rate": 0.0009796472762082687, "loss": 0.0338, "macro_f1": 0.5034013986587524, "num_tokens": 2304589.0, "repeat_count": 0.0, "routers_loss": 0.05912091210484505, "skip_count": 5.0, "step": 1426, "text_loss": 0.23945684731006622 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.70443205165835, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09765625, "learning_rate": 0.000979559774545863, "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 2307860.0, "repeat_count": 0.0, "routers_loss": 0.021242303773760796, "skip_count": 1.0, "step": 1428, "text_loss": 0.531273365020752 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.713824479013795, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.000979472089115247, "loss": 0.0276, "macro_f1": 0.32098764181137085, "num_tokens": 2311581.0, "repeat_count": 0.0, "routers_loss": 0.02768544852733612, "skip_count": 2.0, "step": 1430, "text_loss": 0.2497459501028061 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.000979384219950022, "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2314639.0, "repeat_count": 0.0, "routers_loss": 0.008678150363266468, "skip_count": 0.0, "step": 1432, "text_loss": 0.6579355001449585 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.732609333724684, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08056640625, "learning_rate": 0.0009792961670838595, "loss": 0.0362, "macro_f1": 0.3272727429866791, "num_tokens": 2317927.0, "repeat_count": 1.0, "routers_loss": 0.03325597569346428, "skip_count": 0.0, "step": 1434, "text_loss": 0.5209436416625977 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.742001761080129, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0009792079305505016, "loss": 0.0306, "macro_f1": 0.3272727429866791, "num_tokens": 2321065.0, "repeat_count": 1.0, "routers_loss": 0.019228918477892876, "skip_count": 0.0, "step": 1436, "text_loss": 0.41087067127227783 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.751394188435574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.000979119510383761, "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2323714.0, "repeat_count": 0.0, "routers_loss": 0.017071325331926346, "skip_count": 0.0, "step": 1438, "text_loss": 0.21490029990673065 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.760786615791019, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2060546875, "learning_rate": 0.00097903090661752, "loss": 0.0309, "macro_f1": 0.3333333432674408, "num_tokens": 2326454.0, "repeat_count": 0.0, "routers_loss": 0.00991755723953247, "skip_count": 0.0, "step": 1440, "text_loss": 0.23847346007823944 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.770179043146463, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.232421875, "learning_rate": 0.000978942119285732, "loss": 0.0404, "macro_f1": 0.3272727429866791, "num_tokens": 2329462.0, "repeat_count": 0.0, "routers_loss": 0.04908733069896698, "skip_count": 1.0, "step": 1442, "text_loss": 0.23343028128147125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.7795714705019074, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1044921875, "learning_rate": 0.0009788531484224204, "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2332146.0, "repeat_count": 0.0, "routers_loss": 0.0032628148328512907, "skip_count": 0.0, "step": 1444, "text_loss": 0.47423800826072693 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 29.0, "epoch": 6.788963897857353, "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, "grad_norm": 0.10693359375, "learning_rate": 0.0009787639940616788, "loss": 0.0405, "macro_f1": 0.7018141150474548, "num_tokens": 2335738.0, "repeat_count": 1.0, "routers_loss": 0.14336998760700226, "skip_count": 3.0, "step": 1446, "text_loss": 0.21837592124938965 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.189453125, "learning_rate": 0.0009786746562376717, "loss": 0.0241, "macro_f1": 0.6666666865348816, "num_tokens": 2338488.0, "repeat_count": 0.0, "routers_loss": 0.010542908683419228, "skip_count": 1.0, "step": 1448, "text_loss": 1.0614757537841797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.807748752568242, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.0009785851349846334, "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2342074.0, "repeat_count": 0.0, "routers_loss": 0.005998016335070133, "skip_count": 0.0, "step": 1450, "text_loss": 0.4269719421863556 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 6.817141179923686, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.1083984375, "learning_rate": 0.0009784954303368686, "loss": 0.0384, "macro_f1": 0.44705885648727417, "num_tokens": 2345838.0, "repeat_count": 0.0, "routers_loss": 0.0959126204252243, "skip_count": 3.0, "step": 1452, "text_loss": 0.3315916955471039 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.0009784055423287521, "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 2348939.0, "repeat_count": 0.0, "routers_loss": 0.0025467623490840197, "skip_count": 0.0, "step": 1454, "text_loss": 0.6162732839584351 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.835926034634576, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0009783154709947293, "loss": 0.0256, "macro_f1": 0.3272727429866791, "num_tokens": 2352232.0, "repeat_count": 0.0, "routers_loss": 0.01860538125038147, "skip_count": 1.0, "step": 1456, "text_loss": 0.23928768932819366 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.84531846199002, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0009782252163693158, "loss": 0.0201, "macro_f1": 0.3272727429866791, "num_tokens": 2355159.0, "repeat_count": 0.0, "routers_loss": 0.04412713274359703, "skip_count": 1.0, "step": 1458, "text_loss": 0.3371323347091675 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21484375, "learning_rate": 0.0009781347784870973, "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 2358175.0, "repeat_count": 0.0, "routers_loss": 0.006809141952544451, "skip_count": 0.0, "step": 1460, "text_loss": 0.547267735004425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.86410331670091, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009780441573827296, "loss": 0.03, "macro_f1": 0.3076923191547394, "num_tokens": 2360991.0, "repeat_count": 0.0, "routers_loss": 0.08924390375614166, "skip_count": 4.0, "step": 1462, "text_loss": 0.7026563882827759 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1865234375, "learning_rate": 0.000977953353090939, "loss": 0.0272, "macro_f1": 0.3333333432674408, "num_tokens": 2363894.0, "repeat_count": 0.0, "routers_loss": 0.021858472377061844, "skip_count": 0.0, "step": 1464, "text_loss": 0.2718065083026886 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.882888171411799, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.0009778623656465219, "loss": 0.0338, "macro_f1": 0.32098764181137085, "num_tokens": 2367265.0, "repeat_count": 0.0, "routers_loss": 0.044781096279621124, "skip_count": 0.0, "step": 1466, "text_loss": 0.5008095502853394 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.892280598767244, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009777711950843448, "loss": 0.0212, "macro_f1": 0.3333333432674408, "num_tokens": 2370186.0, "repeat_count": 0.0, "routers_loss": 0.0040459707379341125, "skip_count": 0.0, "step": 1468, "text_loss": 0.5242461562156677 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 6.901673026122689, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.134765625, "learning_rate": 0.0009776798414393446, "loss": 0.0279, "macro_f1": 0.6598639488220215, "num_tokens": 2373314.0, "repeat_count": 1.0, "routers_loss": 0.0708528608083725, "skip_count": 3.0, "step": 1470, "text_loss": 0.2821732461452484 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.911065453478133, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1328125, "learning_rate": 0.0009775883047465279, "loss": 0.0414, "macro_f1": 0.31446540355682373, "num_tokens": 2376435.0, "repeat_count": 1.0, "routers_loss": 0.0290578193962574, "skip_count": 1.0, "step": 1472, "text_loss": 0.8438440561294556 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.9204578808335775, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10546875, "learning_rate": 0.000977496585040972, "loss": 0.0373, "macro_f1": 0.3333333432674408, "num_tokens": 2380244.0, "repeat_count": 0.0, "routers_loss": 0.010360375046730042, "skip_count": 0.0, "step": 1474, "text_loss": 0.4356135427951813 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.929850308189023, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.000977404682357824, "loss": 0.0294, "macro_f1": 0.3272727429866791, "num_tokens": 2383498.0, "repeat_count": 0.0, "routers_loss": 0.023518972098827362, "skip_count": 0.0, "step": 1476, "text_loss": 0.25195425748825073 }, { "acc_repeat": 0.800000011920929, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 6.939242735544467, "f1_execute": 0.9743589162826538, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, "grad_norm": 0.11181640625, "learning_rate": 0.000977312596732301, "loss": 0.0375, "macro_f1": 0.9544159770011902, "num_tokens": 2386414.0, "repeat_count": 5.0, "routers_loss": 0.08190606534481049, "skip_count": 4.0, "step": 1478, "text_loss": 0.6586798429489136 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.948635162899912, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10546875, "learning_rate": 0.0009772203281996905, "loss": 0.0336, "macro_f1": 1.0, "num_tokens": 2389399.0, "repeat_count": 1.0, "routers_loss": 0.016441475600004196, "skip_count": 2.0, "step": 1480, "text_loss": 0.3671986758708954 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.958027590255357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009771278767953502, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2392400.0, "repeat_count": 0.0, "routers_loss": 0.019211363047361374, "skip_count": 0.0, "step": 1482, "text_loss": 0.27418580651283264 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.967420017610801, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0947265625, "learning_rate": 0.0009770352425547072, "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 2395123.0, "repeat_count": 0.0, "routers_loss": 0.015800386667251587, "skip_count": 0.0, "step": 1484, "text_loss": 0.19896622002124786 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.976812444966246, "f1_execute": 0.9615384340286255, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009769424255132596, "loss": 0.0256, "macro_f1": 0.4871794879436493, "num_tokens": 2397359.0, "repeat_count": 3.0, "routers_loss": 0.06670158356428146, "skip_count": 0.0, "step": 1486, "text_loss": 0.4229799509048462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.98620487232169, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1162109375, "learning_rate": 0.0009768494257065747, "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 2400387.0, "repeat_count": 0.0, "routers_loss": 0.011144762858748436, "skip_count": 1.0, "step": 1488, "text_loss": 0.4264226257801056 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.995597299677136, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12353515625, "learning_rate": 0.0009767562431702904, "loss": 0.0387, "macro_f1": 0.3006536364555359, "num_tokens": 2403241.0, "repeat_count": 2.0, "routers_loss": 0.12339717149734497, "skip_count": 3.0, "step": 1490, "text_loss": 0.2850193977355957 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.004696213677723, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07177734375, "learning_rate": 0.0009766628779401142, "loss": 0.0215, "macro_f1": 0.6666666865348816, "num_tokens": 2406087.0, "repeat_count": 0.0, "routers_loss": 0.008174685761332512, "skip_count": 1.0, "step": 1492, "text_loss": 0.6756544709205627 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.000976569330051824, "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 2409312.0, "repeat_count": 0.0, "routers_loss": 0.0021256296895444393, "skip_count": 0.0, "step": 1494, "text_loss": 0.4789894223213196 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.0234810683886115, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.053955078125, "learning_rate": 0.0009764755995412677, "loss": 0.0193, "macro_f1": 0.3333333432674408, "num_tokens": 2412758.0, "repeat_count": 0.0, "routers_loss": 0.003944927826523781, "skip_count": 0.0, "step": 1496, "text_loss": 0.5157490968704224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.032873495744056, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009763816864443627, "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2416079.0, "repeat_count": 1.0, "routers_loss": 0.03893325850367546, "skip_count": 0.0, "step": 1498, "text_loss": 0.28045418858528137 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.042265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0009762875907970968, "loss": 0.0199, "macro_f1": 0.3333333432674408, "num_tokens": 2420340.0, "repeat_count": 0.0, "routers_loss": 0.0017725443467497826, "skip_count": 0.0, "step": 1500, "text_loss": 0.35550856590270996 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.051658350454946, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06298828125, "learning_rate": 0.0009761933126355277, "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2424735.0, "repeat_count": 0.0, "routers_loss": 0.01393749937415123, "skip_count": 1.0, "step": 1502, "text_loss": 0.38840189576148987 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009760988519957828, "loss": 0.0249, "macro_f1": 0.6666666865348816, "num_tokens": 2428132.0, "repeat_count": 0.0, "routers_loss": 0.01687910407781601, "skip_count": 2.0, "step": 1504, "text_loss": 0.3031681478023529 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.0704432051658355, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0009760042089140598, "loss": 0.0193, "macro_f1": 0.3144654333591461, "num_tokens": 2431592.0, "repeat_count": 1.0, "routers_loss": 0.04704280197620392, "skip_count": 2.0, "step": 1506, "text_loss": 0.16355200111865997 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0009759093834266259, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2434236.0, "repeat_count": 0.0, "routers_loss": 0.0016075772000476718, "skip_count": 0.0, "step": 1508, "text_loss": 0.6080073118209839 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0009758143755698186, "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 2437170.0, "repeat_count": 0.0, "routers_loss": 0.008451299741864204, "skip_count": 0.0, "step": 1510, "text_loss": 0.22100484371185303 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 7.098620487232169, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.06689453125, "learning_rate": 0.0009757191853800449, "loss": 0.0227, "macro_f1": 0.5866667032241821, "num_tokens": 2441187.0, "repeat_count": 1.0, "routers_loss": 0.046565692871809006, "skip_count": 3.0, "step": 1512, "text_loss": 0.25098952651023865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.108012914587614, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.000975623812893782, "loss": 0.0276, "macro_f1": 0.3272727429866791, "num_tokens": 2444664.0, "repeat_count": 0.0, "routers_loss": 0.02872578240931034, "skip_count": 1.0, "step": 1514, "text_loss": 0.4952253997325897 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.1174053419430585, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1142578125, "learning_rate": 0.0009755282581475768, "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2447748.0, "repeat_count": 0.0, "routers_loss": 0.002055214950814843, "skip_count": 0.0, "step": 1516, "text_loss": 0.7465500831604004 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.126797769298503, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10302734375, "learning_rate": 0.000975432521178046, "loss": 0.0216, "macro_f1": 0.3272727429866791, "num_tokens": 2450834.0, "repeat_count": 1.0, "routers_loss": 0.04498551785945892, "skip_count": 0.0, "step": 1518, "text_loss": 0.28144413232803345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0009753366020218763, "loss": 0.0234, "macro_f1": 0.3333333432674408, "num_tokens": 2454233.0, "repeat_count": 0.0, "routers_loss": 0.003669742727652192, "skip_count": 0.0, "step": 1520, "text_loss": 0.5667551755905151 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009752405007158238, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2457331.0, "repeat_count": 0.0, "routers_loss": 0.010455607436597347, "skip_count": 0.0, "step": 1522, "text_loss": 0.19575810432434082 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 7.154975051364837, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0009751442172967151, "loss": 0.0193, "macro_f1": 0.8823530077934265, "num_tokens": 2459935.0, "repeat_count": 2.0, "routers_loss": 0.025189083069562912, "skip_count": 1.0, "step": 1524, "text_loss": 0.45453405380249023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.164367478720282, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.000975047751801446, "loss": 0.0187, "macro_f1": 0.3272727429866791, "num_tokens": 2463008.0, "repeat_count": 0.0, "routers_loss": 0.012297490611672401, "skip_count": 0.0, "step": 1526, "text_loss": 0.31437572836875916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1044921875, "learning_rate": 0.0009749511042669823, "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2466475.0, "repeat_count": 0.0, "routers_loss": 0.011026266030967236, "skip_count": 0.0, "step": 1528, "text_loss": 0.46604859828948975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.183152333431171, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009748542747303595, "loss": 0.0182, "macro_f1": 0.3272727429866791, "num_tokens": 2469320.0, "repeat_count": 0.0, "routers_loss": 0.011934996582567692, "skip_count": 1.0, "step": 1530, "text_loss": 0.7764923572540283 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.192544760786616, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0966796875, "learning_rate": 0.0009747572632286827, "loss": 0.0203, "macro_f1": 0.3333333432674408, "num_tokens": 2472468.0, "repeat_count": 0.0, "routers_loss": 0.005786920432001352, "skip_count": 0.0, "step": 1532, "text_loss": 0.3555782437324524 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.20193718814206, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009746600697991271, "loss": 0.02, "macro_f1": 0.6666666865348816, "num_tokens": 2475736.0, "repeat_count": 1.0, "routers_loss": 0.0026990731712430716, "skip_count": 0.0, "step": 1534, "text_loss": 0.49561792612075806 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 7.2113296154975055, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0556640625, "learning_rate": 0.0009745626944789375, "loss": 0.0204, "macro_f1": 0.8823530077934265, "num_tokens": 2478887.0, "repeat_count": 1.0, "routers_loss": 0.020221207290887833, "skip_count": 2.0, "step": 1536, "text_loss": 0.5375416278839111 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.22072204285295, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12158203125, "learning_rate": 0.0009744651373054279, "loss": 0.0286, "macro_f1": 0.3272727429866791, "num_tokens": 2481293.0, "repeat_count": 0.0, "routers_loss": 0.03131086751818657, "skip_count": 1.0, "step": 1538, "text_loss": 0.5241039395332336 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 7.230114470208394, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.08984375, "learning_rate": 0.0009743673983159828, "loss": 0.0241, "macro_f1": 0.6122449040412903, "num_tokens": 2484403.0, "repeat_count": 0.0, "routers_loss": 0.04448170214891434, "skip_count": 4.0, "step": 1540, "text_loss": 0.7465724349021912 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08935546875, "learning_rate": 0.0009742694775480557, "loss": 0.0265, "macro_f1": 0.6666666865348816, "num_tokens": 2487952.0, "repeat_count": 0.0, "routers_loss": 0.007171491626650095, "skip_count": 1.0, "step": 1542, "text_loss": 0.2877117097377777 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.248899324919284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0009741713750391703, "loss": 0.0171, "macro_f1": 0.6666666865348816, "num_tokens": 2490815.0, "repeat_count": 1.0, "routers_loss": 0.004559285007417202, "skip_count": 0.0, "step": 1544, "text_loss": 0.6097800135612488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.258291752274729, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.0009740730908269193, "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 2494727.0, "repeat_count": 0.0, "routers_loss": 0.005271553061902523, "skip_count": 0.0, "step": 1546, "text_loss": 0.5431114435195923 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0703125, "learning_rate": 0.0009739746249489658, "loss": 0.0239, "macro_f1": 0.3333333432674408, "num_tokens": 2499266.0, "repeat_count": 0.0, "routers_loss": 0.0015409323386847973, "skip_count": 0.0, "step": 1548, "text_loss": 0.4702678322792053 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.277076606985618, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1171875, "learning_rate": 0.0009738759774430417, "loss": 0.0216, "macro_f1": 0.32098764181137085, "num_tokens": 2502273.0, "repeat_count": 1.0, "routers_loss": 0.030183158814907074, "skip_count": 1.0, "step": 1550, "text_loss": 0.3239189088344574 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.286469034341063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0009737771483469493, "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2507624.0, "repeat_count": 0.0, "routers_loss": 0.005410848651081324, "skip_count": 0.0, "step": 1552, "text_loss": 0.4014642834663391 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009736781376985598, "loss": 0.0168, "macro_f1": 0.6666666865348816, "num_tokens": 2510366.0, "repeat_count": 0.0, "routers_loss": 0.0066976165398955345, "skip_count": 1.0, "step": 1554, "text_loss": 0.5924848914146423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.305253889051952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.13671875, "learning_rate": 0.0009735789455358144, "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2513317.0, "repeat_count": 0.0, "routers_loss": 0.002763477386906743, "skip_count": 0.0, "step": 1556, "text_loss": 0.3222943842411041 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.314646316407397, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11767578125, "learning_rate": 0.0009734795718967237, "loss": 0.0283, "macro_f1": 0.32098764181137085, "num_tokens": 2516628.0, "repeat_count": 0.0, "routers_loss": 0.061566028743982315, "skip_count": 2.0, "step": 1558, "text_loss": 0.3249334692955017 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.324038743762841, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.095703125, "learning_rate": 0.0009733800168193679, "loss": 0.0228, "macro_f1": 1.0, "num_tokens": 2519424.0, "repeat_count": 2.0, "routers_loss": 0.017976421862840652, "skip_count": 4.0, "step": 1560, "text_loss": 0.3341919481754303 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.333431171118286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.0009732802803418966, "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2522922.0, "repeat_count": 0.0, "routers_loss": 0.002525332849472761, "skip_count": 0.0, "step": 1562, "text_loss": 0.3176332712173462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.34282359847373, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07861328125, "learning_rate": 0.0009731803625025292, "loss": 0.0196, "macro_f1": 0.3272727429866791, "num_tokens": 2525811.0, "repeat_count": 0.0, "routers_loss": 0.015524424612522125, "skip_count": 1.0, "step": 1564, "text_loss": 0.532774031162262 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.3522160258291755, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10205078125, "learning_rate": 0.0009730802633395541, "loss": 0.0257, "macro_f1": 0.6603773832321167, "num_tokens": 2529157.0, "repeat_count": 1.0, "routers_loss": 0.08138631284236908, "skip_count": 1.0, "step": 1566, "text_loss": 0.529487133026123 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009729799828913298, "loss": 0.0223, "macro_f1": 0.3333333432674408, "num_tokens": 2532249.0, "repeat_count": 0.0, "routers_loss": 0.0035867292899638414, "skip_count": 0.0, "step": 1568, "text_loss": 0.503160297870636 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.371000880540064, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06884765625, "learning_rate": 0.0009728795211962838, "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2535904.0, "repeat_count": 0.0, "routers_loss": 0.02987455204129219, "skip_count": 2.0, "step": 1570, "text_loss": 0.9170270562171936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.380393307895509, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11865234375, "learning_rate": 0.0009727788782929131, "loss": 0.0273, "macro_f1": 0.3272727429866791, "num_tokens": 2538943.0, "repeat_count": 1.0, "routers_loss": 0.04676021635532379, "skip_count": 0.0, "step": 1572, "text_loss": 0.29146310687065125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.389785735250954, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0009726780542197844, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 2541805.0, "repeat_count": 0.0, "routers_loss": 0.002127803163602948, "skip_count": 0.0, "step": 1574, "text_loss": 1.0126502513885498 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.399178162606399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0009725770490155338, "loss": 0.0262, "macro_f1": 0.3333333432674408, "num_tokens": 2546213.0, "repeat_count": 0.0, "routers_loss": 0.007609677035361528, "skip_count": 0.0, "step": 1576, "text_loss": 0.190168559551239 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.408570589961843, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.083984375, "learning_rate": 0.0009724758627188665, "loss": 0.0356, "macro_f1": 0.3272727429866791, "num_tokens": 2549554.0, "repeat_count": 0.0, "routers_loss": 0.033554721623659134, "skip_count": 1.0, "step": 1578, "text_loss": 0.2977406084537506 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.4179630173172875, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.0009723744953685572, "loss": 0.028, "macro_f1": 0.3272727429866791, "num_tokens": 2552785.0, "repeat_count": 1.0, "routers_loss": 0.027864238247275352, "skip_count": 0.0, "step": 1580, "text_loss": 0.2700682580471039 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.427355444672733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19921875, "learning_rate": 0.0009722729470034503, "loss": 0.0224, "macro_f1": 0.3333333432674408, "num_tokens": 2556550.0, "repeat_count": 0.0, "routers_loss": 0.004798175301402807, "skip_count": 0.0, "step": 1582, "text_loss": 0.6559903025627136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.436747872028177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.078125, "learning_rate": 0.0009721712176624591, "loss": 0.0242, "macro_f1": 0.3333333432674408, "num_tokens": 2559862.0, "repeat_count": 0.0, "routers_loss": 0.013764148578047752, "skip_count": 0.0, "step": 1584, "text_loss": 0.2257535308599472 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.446140299383622, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10986328125, "learning_rate": 0.0009720693073845667, "loss": 0.032, "macro_f1": 0.5492662787437439, "num_tokens": 2562766.0, "repeat_count": 0.0, "routers_loss": 0.01937069371342659, "skip_count": 2.0, "step": 1586, "text_loss": 0.178413525223732 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.455532726739067, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.0009719672162088252, "loss": 0.0306, "macro_f1": 0.32098767161369324, "num_tokens": 2566583.0, "repeat_count": 1.0, "routers_loss": 0.06224144622683525, "skip_count": 0.0, "step": 1588, "text_loss": 0.3992367684841156 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 7.464925154094511, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.185546875, "learning_rate": 0.0009718649441743559, "loss": 0.0239, "macro_f1": 0.9449735879898071, "num_tokens": 2569516.0, "repeat_count": 2.0, "routers_loss": 0.06937911361455917, "skip_count": 4.0, "step": 1590, "text_loss": 0.1945122629404068 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.00097176249132035, "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2572418.0, "repeat_count": 0.0, "routers_loss": 0.0034326619934290648, "skip_count": 0.0, "step": 1592, "text_loss": 0.6259906888008118 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.4837100088054, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08642578125, "learning_rate": 0.0009716598576860676, "loss": 0.0278, "macro_f1": 0.6666666865348816, "num_tokens": 2575235.0, "repeat_count": 1.0, "routers_loss": 0.004557516425848007, "skip_count": 0.0, "step": 1594, "text_loss": 0.6638736724853516 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 7.493102436160846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.193359375, "learning_rate": 0.0009715570433108378, "loss": 0.0198, "macro_f1": 1.0, "num_tokens": 2578157.0, "repeat_count": 1.0, "routers_loss": 0.015363055281341076, "skip_count": 1.0, "step": 1596, "text_loss": 0.6530464887619019 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.50249486351629, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0009714540482340595, "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 2581801.0, "repeat_count": 1.0, "routers_loss": 0.01257144846022129, "skip_count": 0.0, "step": 1598, "text_loss": 0.5916110277175903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.5118872908717345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058837890625, "learning_rate": 0.0009713508724952006, "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2585204.0, "repeat_count": 0.0, "routers_loss": 0.003175645601004362, "skip_count": 0.0, "step": 1600, "text_loss": 0.27901601791381836 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12353515625, "learning_rate": 0.0009712475161337981, "loss": 0.0261, "macro_f1": 0.3333333432674408, "num_tokens": 2588286.0, "repeat_count": 0.0, "routers_loss": 0.004122321493923664, "skip_count": 0.0, "step": 1602, "text_loss": 0.42420244216918945 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009711439791894585, "loss": 0.0341, "macro_f1": 0.6666666865348816, "num_tokens": 2591476.0, "repeat_count": 0.0, "routers_loss": 0.011215819045901299, "skip_count": 1.0, "step": 1604, "text_loss": 0.5549933910369873 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.540064572938069, "f1_execute": 0.9599999785423279, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.0703125, "learning_rate": 0.0009710402617018574, "loss": 0.0172, "macro_f1": 0.8200000524520874, "num_tokens": 2594336.0, "repeat_count": 1.0, "routers_loss": 0.02916567400097847, "skip_count": 2.0, "step": 1606, "text_loss": 0.3263779282569885 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.549457000293513, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009709363637107393, "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 2597462.0, "repeat_count": 0.0, "routers_loss": 0.015897957608103752, "skip_count": 1.0, "step": 1608, "text_loss": 0.20917139947414398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009708322852559184, "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2601543.0, "repeat_count": 0.0, "routers_loss": 0.002211357234045863, "skip_count": 0.0, "step": 1610, "text_loss": 0.450550377368927 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.568241855004403, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1728515625, "learning_rate": 0.0009707280263772776, "loss": 0.0277, "macro_f1": 0.6666666865348816, "num_tokens": 2604462.0, "repeat_count": 0.0, "routers_loss": 0.01615734025835991, "skip_count": 2.0, "step": 1612, "text_loss": 0.6908381581306458 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.577634282359847, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0888671875, "learning_rate": 0.0009706235871147688, "loss": 0.0241, "macro_f1": 0.5492662787437439, "num_tokens": 2607484.0, "repeat_count": 0.0, "routers_loss": 0.022048067301511765, "skip_count": 2.0, "step": 1614, "text_loss": 0.36691340804100037 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.587026709715292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10546875, "learning_rate": 0.0009705189675084138, "loss": 0.0176, "macro_f1": 0.6666666865348816, "num_tokens": 2610204.0, "repeat_count": 0.0, "routers_loss": 0.008503952994942665, "skip_count": 1.0, "step": 1616, "text_loss": 0.5226598381996155 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.596419137070737, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009704141675983029, "loss": 0.0248, "macro_f1": 0.3333333432674408, "num_tokens": 2613128.0, "repeat_count": 0.0, "routers_loss": 0.0019020626787096262, "skip_count": 0.0, "step": 1618, "text_loss": 0.6465088725090027 }, { "acc_repeat": 0.0, "acc_skip": 0.5714285969734192, "avg_layers": 24.0, "epoch": 7.6058115644261814, "f1_execute": 0.9333333373069763, "f1_repeat": 0.0, "f1_skip": 0.7272727489471436, "grad_norm": 0.107421875, "learning_rate": 0.0009703091874245956, "loss": 0.032, "macro_f1": 0.5535354018211365, "num_tokens": 2616360.0, "repeat_count": 0.0, "routers_loss": 0.11837691068649292, "skip_count": 7.0, "step": 1620, "text_loss": 0.2987039089202881 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.615203991781626, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009702040270275204, "loss": 0.0181, "macro_f1": 0.3333333432674408, "num_tokens": 2619606.0, "repeat_count": 0.0, "routers_loss": 0.0065958453342318535, "skip_count": 0.0, "step": 1622, "text_loss": 0.6262096166610718 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.103515625, "learning_rate": 0.000970098686447375, "loss": 0.0257, "macro_f1": 0.6666666865348816, "num_tokens": 2622499.0, "repeat_count": 0.0, "routers_loss": 0.013632026500999928, "skip_count": 1.0, "step": 1624, "text_loss": 0.2392602562904358 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.633988846492516, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.125, "learning_rate": 0.0009699931657245264, "loss": 0.0245, "macro_f1": 0.5492662787437439, "num_tokens": 2626002.0, "repeat_count": 0.0, "routers_loss": 0.012147823348641396, "skip_count": 2.0, "step": 1626, "text_loss": 0.4742976129055023 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.64338127384796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009698874648994098, "loss": 0.0285, "macro_f1": 1.0, "num_tokens": 2629847.0, "repeat_count": 1.0, "routers_loss": 0.010692884214222431, "skip_count": 3.0, "step": 1628, "text_loss": 0.5090685486793518 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.6527737012034045, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009697815840125304, "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2633529.0, "repeat_count": 0.0, "routers_loss": 0.011442207731306553, "skip_count": 0.0, "step": 1630, "text_loss": 0.1874329298734665 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.66216612855885, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.0009696755231044618, "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2636321.0, "repeat_count": 0.0, "routers_loss": 0.0026681360322982073, "skip_count": 0.0, "step": 1632, "text_loss": 0.7650400400161743 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.671558555914294, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10498046875, "learning_rate": 0.0009695692822158466, "loss": 0.0242, "macro_f1": 0.3272727429866791, "num_tokens": 2638840.0, "repeat_count": 1.0, "routers_loss": 0.033965807408094406, "skip_count": 0.0, "step": 1634, "text_loss": 0.6175784468650818 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0009694628613873968, "loss": 0.018, "macro_f1": 0.3333333432674408, "num_tokens": 2641886.0, "repeat_count": 0.0, "routers_loss": 0.007568214554339647, "skip_count": 0.0, "step": 1636, "text_loss": 0.43139931559562683 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.690343410625183, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.193359375, "learning_rate": 0.0009693562606598929, "loss": 0.025, "macro_f1": 0.3333333432674408, "num_tokens": 2645028.0, "repeat_count": 0.0, "routers_loss": 0.004973865579813719, "skip_count": 0.0, "step": 1638, "text_loss": 0.6430339217185974 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.699735837980628, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06982421875, "learning_rate": 0.0009692494800741844, "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2648209.0, "repeat_count": 1.0, "routers_loss": 0.049863800406455994, "skip_count": 0.0, "step": 1640, "text_loss": 0.28138160705566406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.709128265336073, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08544921875, "learning_rate": 0.0009691425196711901, "loss": 0.0398, "macro_f1": 0.3272727429866791, "num_tokens": 2651171.0, "repeat_count": 0.0, "routers_loss": 0.02112230286002159, "skip_count": 0.0, "step": 1642, "text_loss": 0.3745322525501251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.718520692691517, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0703125, "learning_rate": 0.0009690353794918971, "loss": 0.0275, "macro_f1": 0.3333333432674408, "num_tokens": 2654093.0, "repeat_count": 0.0, "routers_loss": 0.0024304776452481747, "skip_count": 0.0, "step": 1644, "text_loss": 0.4275154173374176 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.727913120046962, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.000968928059577362, "loss": 0.0244, "macro_f1": 0.6666666865348816, "num_tokens": 2657079.0, "repeat_count": 0.0, "routers_loss": 0.009320619516074657, "skip_count": 1.0, "step": 1646, "text_loss": 0.46650025248527527 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.737305547402407, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009688205599687099, "loss": 0.0209, "macro_f1": 0.3272727429866791, "num_tokens": 2660951.0, "repeat_count": 0.0, "routers_loss": 0.011913162656128407, "skip_count": 0.0, "step": 1648, "text_loss": 0.46644100546836853 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.7466979747578515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0009687128807071347, "loss": 0.0284, "macro_f1": 0.3333333432674408, "num_tokens": 2663823.0, "repeat_count": 0.0, "routers_loss": 0.013754756189882755, "skip_count": 0.0, "step": 1650, "text_loss": 0.40808847546577454 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.103515625, "learning_rate": 0.0009686050218338996, "loss": 0.0286, "macro_f1": 0.3333333432674408, "num_tokens": 2667079.0, "repeat_count": 0.0, "routers_loss": 0.009099726565182209, "skip_count": 0.0, "step": 1652, "text_loss": 0.2389989197254181 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.765482829468741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08837890625, "learning_rate": 0.0009684969833903359, "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2670162.0, "repeat_count": 0.0, "routers_loss": 0.0034928603563457727, "skip_count": 1.0, "step": 1654, "text_loss": 0.6930749416351318 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.774875256824186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0009683887654178445, "loss": 0.0261, "macro_f1": 0.6666666865348816, "num_tokens": 2673031.0, "repeat_count": 0.0, "routers_loss": 0.008340462110936642, "skip_count": 1.0, "step": 1656, "text_loss": 0.277752548456192 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06884765625, "learning_rate": 0.0009682803679578947, "loss": 0.0259, "macro_f1": 0.3333333432674408, "num_tokens": 2676092.0, "repeat_count": 0.0, "routers_loss": 0.004337446764111519, "skip_count": 0.0, "step": 1658, "text_loss": 0.5176776051521301 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.7936601115350745, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009681717910520244, "loss": 0.0242, "macro_f1": 0.32098764181137085, "num_tokens": 2679479.0, "repeat_count": 0.0, "routers_loss": 0.034611742943525314, "skip_count": 2.0, "step": 1660, "text_loss": 0.21485982835292816 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.80305253889052, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.07958984375, "learning_rate": 0.0009680630347418406, "loss": 0.022, "macro_f1": 0.5492662787437439, "num_tokens": 2683289.0, "repeat_count": 0.0, "routers_loss": 0.03297121450304985, "skip_count": 2.0, "step": 1662, "text_loss": 0.33801013231277466 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.812444966245964, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.000967954099069019, "loss": 0.0411, "macro_f1": 0.32098764181137085, "num_tokens": 2685879.0, "repeat_count": 1.0, "routers_loss": 0.04551183059811592, "skip_count": 1.0, "step": 1664, "text_loss": 0.41123488545417786 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.821837393601409, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009678449840753038, "loss": 0.0324, "macro_f1": 0.32098764181137085, "num_tokens": 2688910.0, "repeat_count": 0.0, "routers_loss": 0.05866450071334839, "skip_count": 2.0, "step": 1666, "text_loss": 0.1740892380475998 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009677356898025082, "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2691680.0, "repeat_count": 0.0, "routers_loss": 0.009243223816156387, "skip_count": 0.0, "step": 1668, "text_loss": 0.2512350380420685 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.8406222483122985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09619140625, "learning_rate": 0.000967626216292514, "loss": 0.0195, "macro_f1": 0.3333333432674408, "num_tokens": 2694895.0, "repeat_count": 0.0, "routers_loss": 0.005576452240347862, "skip_count": 0.0, "step": 1670, "text_loss": 0.43294376134872437 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 7.850014675667743, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.09130859375, "learning_rate": 0.0009675165635872715, "loss": 0.0306, "macro_f1": 0.44705885648727417, "num_tokens": 2697806.0, "repeat_count": 0.0, "routers_loss": 0.05372785031795502, "skip_count": 3.0, "step": 1672, "text_loss": 0.1614082306623459 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.859407103023187, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009674067317288, "loss": 0.0296, "macro_f1": 0.6666666865348816, "num_tokens": 2700529.0, "repeat_count": 1.0, "routers_loss": 0.018131591379642487, "skip_count": 0.0, "step": 1674, "text_loss": 0.2093173861503601 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.868799530378633, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0009672967207591869, "loss": 0.0257, "macro_f1": 0.3272727429866791, "num_tokens": 2703650.0, "repeat_count": 0.0, "routers_loss": 0.0673515796661377, "skip_count": 1.0, "step": 1676, "text_loss": 0.3029400110244751 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.878191957734077, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009671865307205892, "loss": 0.021, "macro_f1": 0.32098767161369324, "num_tokens": 2707615.0, "repeat_count": 0.0, "routers_loss": 0.03821169584989548, "skip_count": 1.0, "step": 1678, "text_loss": 0.2262786477804184 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 25.0, "epoch": 7.8875843850895215, "f1_execute": 0.9756097793579102, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.1396484375, "learning_rate": 0.0009670761616552315, "loss": 0.0465, "macro_f1": 0.9615669250488281, "num_tokens": 2710894.0, "repeat_count": 2.0, "routers_loss": 0.042625464498996735, "skip_count": 6.0, "step": 1680, "text_loss": 0.29623574018478394 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.896976812444966, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009669656136054074, "loss": 0.0289, "macro_f1": 0.3333333432674408, "num_tokens": 2714330.0, "repeat_count": 0.0, "routers_loss": 0.0037571541033685207, "skip_count": 0.0, "step": 1682, "text_loss": 0.7510389089584351 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.906369239800411, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07421875, "learning_rate": 0.0009668548866134795, "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2717176.0, "repeat_count": 0.0, "routers_loss": 0.004142968449741602, "skip_count": 0.0, "step": 1684, "text_loss": 0.3273485600948334 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07373046875, "learning_rate": 0.0009667439807218783, "loss": 0.0233, "macro_f1": 0.6666666865348816, "num_tokens": 2720628.0, "repeat_count": 0.0, "routers_loss": 0.008753842674195766, "skip_count": 2.0, "step": 1686, "text_loss": 0.4314708709716797 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.9251540945113, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0732421875, "learning_rate": 0.0009666328959731033, "loss": 0.0211, "macro_f1": 0.6603773832321167, "num_tokens": 2723739.0, "repeat_count": 1.0, "routers_loss": 0.022674910724163055, "skip_count": 1.0, "step": 1688, "text_loss": 0.25734150409698486 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 7.934546521866745, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1552734375, "learning_rate": 0.0009665216324097222, "loss": 0.0324, "macro_f1": 0.5934640765190125, "num_tokens": 2726644.0, "repeat_count": 0.0, "routers_loss": 0.03932750225067139, "skip_count": 3.0, "step": 1690, "text_loss": 0.24511034786701202 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.94393894922219, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09765625, "learning_rate": 0.0009664101900743714, "loss": 0.0255, "macro_f1": 0.3272727429866791, "num_tokens": 2729662.0, "repeat_count": 0.0, "routers_loss": 0.012672754004597664, "skip_count": 1.0, "step": 1692, "text_loss": 0.39431414008140564 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.953331376577634, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.076171875, "learning_rate": 0.000966298569009756, "loss": 0.0231, "macro_f1": 0.5492662787437439, "num_tokens": 2732578.0, "repeat_count": 0.0, "routers_loss": 0.01548632513731718, "skip_count": 2.0, "step": 1694, "text_loss": 0.12439999729394913 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.962723803933079, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009661867692586494, "loss": 0.0153, "macro_f1": 0.32098764181137085, "num_tokens": 2735887.0, "repeat_count": 0.0, "routers_loss": 0.05622401833534241, "skip_count": 2.0, "step": 1696, "text_loss": 0.29024389386177063 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.972116231288524, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.0009660747908638933, "loss": 0.0205, "macro_f1": 0.3272727429866791, "num_tokens": 2739293.0, "repeat_count": 0.0, "routers_loss": 0.041060201823711395, "skip_count": 1.0, "step": 1698, "text_loss": 0.39461007714271545 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.9815086586439685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1767578125, "learning_rate": 0.0009659626338683981, "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 2742468.0, "repeat_count": 0.0, "routers_loss": 0.007251353468745947, "skip_count": 0.0, "step": 1700, "text_loss": 0.2751767635345459 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.990901085999413, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009658502983151427, "loss": 0.0186, "macro_f1": 0.3272727429866791, "num_tokens": 2745123.0, "repeat_count": 0.0, "routers_loss": 0.012847424484789371, "skip_count": 1.0, "step": 1702, "text_loss": 0.4756404757499695 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11767578125, "learning_rate": 0.0009657377842471742, "loss": 0.0313, "macro_f1": 0.6666666865348816, "num_tokens": 2748016.0, "repeat_count": 0.0, "routers_loss": 0.007060411386191845, "skip_count": 1.0, "step": 1704, "text_loss": 0.9571210145950317 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.009392427355445, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10009765625, "learning_rate": 0.0009656250917076081, "loss": 0.0188, "macro_f1": 0.5492662787437439, "num_tokens": 2750717.0, "repeat_count": 0.0, "routers_loss": 0.016748681664466858, "skip_count": 2.0, "step": 1706, "text_loss": 0.14542843401432037 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.018784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.0009655122207396285, "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2753635.0, "repeat_count": 0.0, "routers_loss": 0.013607042841613293, "skip_count": 0.0, "step": 1708, "text_loss": 0.21836471557617188 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0009653991713864878, "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2756643.0, "repeat_count": 0.0, "routers_loss": 0.0012097888393327594, "skip_count": 0.0, "step": 1710, "text_loss": 0.635187029838562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1171875, "learning_rate": 0.0009652859436915066, "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2759432.0, "repeat_count": 0.0, "routers_loss": 0.006196760106831789, "skip_count": 0.0, "step": 1712, "text_loss": 0.5629420876502991 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.046962136777223, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0009651725376980743, "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2762538.0, "repeat_count": 0.0, "routers_loss": 0.0042513771913945675, "skip_count": 0.0, "step": 1714, "text_loss": 0.39522525668144226 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 29.0, "epoch": 8.056354564132668, "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.1494140625, "learning_rate": 0.0009650589534496479, "loss": 0.0194, "macro_f1": 0.8194444179534912, "num_tokens": 2765571.0, "repeat_count": 2.0, "routers_loss": 0.03596706688404083, "skip_count": 3.0, "step": 1716, "text_loss": 0.6252416968345642 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04833984375, "learning_rate": 0.0009649451909897532, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 2769206.0, "repeat_count": 0.0, "routers_loss": 0.0025788163766264915, "skip_count": 0.0, "step": 1718, "text_loss": 0.8851634860038757 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.075139418843557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.0009648312503619843, "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2772488.0, "repeat_count": 0.0, "routers_loss": 0.004443451762199402, "skip_count": 0.0, "step": 1720, "text_loss": 0.8568580746650696 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 8.084531846199003, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1552734375, "learning_rate": 0.0009647171316100034, "loss": 0.0265, "macro_f1": 0.9265305995941162, "num_tokens": 2776482.0, "repeat_count": 1.0, "routers_loss": 0.022948263213038445, "skip_count": 3.0, "step": 1722, "text_loss": 0.13431036472320557 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1572265625, "learning_rate": 0.0009646028347775409, "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 2778966.0, "repeat_count": 0.0, "routers_loss": 0.011328035034239292, "skip_count": 1.0, "step": 1724, "text_loss": 0.2085491120815277 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08984375, "learning_rate": 0.0009644883599083958, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2781968.0, "repeat_count": 0.0, "routers_loss": 0.002208018908277154, "skip_count": 0.0, "step": 1726, "text_loss": 0.4948323965072632 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.112709128265337, "f1_execute": 0.9411764740943909, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009643737070464349, "loss": 0.0158, "macro_f1": 0.6470588445663452, "num_tokens": 2784666.0, "repeat_count": 1.0, "routers_loss": 0.04391832649707794, "skip_count": 2.0, "step": 1728, "text_loss": 0.39060094952583313 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046630859375, "learning_rate": 0.0009642588762355935, "loss": 0.0212, "macro_f1": 0.6666666865348816, "num_tokens": 2787558.0, "repeat_count": 0.0, "routers_loss": 0.004497280344367027, "skip_count": 1.0, "step": 1730, "text_loss": 0.34908708930015564 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.131493982976226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0009641438675198748, "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2790474.0, "repeat_count": 0.0, "routers_loss": 0.00583475548774004, "skip_count": 0.0, "step": 1732, "text_loss": 0.5720033049583435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.140886410331671, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08154296875, "learning_rate": 0.0009640286809433508, "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2793272.0, "repeat_count": 0.0, "routers_loss": 0.007826375775039196, "skip_count": 0.0, "step": 1734, "text_loss": 0.32181721925735474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.0009639133165501606, "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2797726.0, "repeat_count": 0.0, "routers_loss": 0.0019055595621466637, "skip_count": 0.0, "step": 1736, "text_loss": 0.620936393737793 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0009637977743845124, "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2800706.0, "repeat_count": 0.0, "routers_loss": 0.0028302327264100313, "skip_count": 0.0, "step": 1738, "text_loss": 0.6473138332366943 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.169063692398003, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009636820544906823, "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 2803847.0, "repeat_count": 1.0, "routers_loss": 0.01105099730193615, "skip_count": 2.0, "step": 1740, "text_loss": 0.4401201903820038 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 8.178456119753449, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1455078125, "learning_rate": 0.0009635661569130141, "loss": 0.0195, "macro_f1": 0.5934640765190125, "num_tokens": 2807235.0, "repeat_count": 0.0, "routers_loss": 0.02619045600295067, "skip_count": 3.0, "step": 1742, "text_loss": 0.459264874458313 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.187848547108894, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0009634500816959202, "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2810396.0, "repeat_count": 0.0, "routers_loss": 0.007915694266557693, "skip_count": 2.0, "step": 1744, "text_loss": 0.5084020495414734 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.197240974464338, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.1748046875, "learning_rate": 0.0009633338288838805, "loss": 0.0271, "macro_f1": 0.5492662787437439, "num_tokens": 2813215.0, "repeat_count": 2.0, "routers_loss": 0.08364596217870712, "skip_count": 0.0, "step": 1746, "text_loss": 0.27681824564933777 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 30.0, "epoch": 8.206633401819783, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.051025390625, "learning_rate": 0.0009632173985214438, "loss": 0.0156, "macro_f1": 0.8817967176437378, "num_tokens": 2816452.0, "repeat_count": 3.0, "routers_loss": 0.028805451467633247, "skip_count": 2.0, "step": 1748, "text_loss": 0.4678419530391693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.216025829175228, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0625, "learning_rate": 0.000963100790653226, "loss": 0.0188, "macro_f1": 0.3272727429866791, "num_tokens": 2819364.0, "repeat_count": 0.0, "routers_loss": 0.03056817688047886, "skip_count": 1.0, "step": 1750, "text_loss": 0.3078109920024872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.225418256530672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009629840053239116, "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2823469.0, "repeat_count": 0.0, "routers_loss": 0.0019477814203128219, "skip_count": 0.0, "step": 1752, "text_loss": 0.45501336455345154 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057373046875, "learning_rate": 0.000962867042578253, "loss": 0.0173, "macro_f1": 0.3333333432674408, "num_tokens": 2826716.0, "repeat_count": 0.0, "routers_loss": 0.0032963966950774193, "skip_count": 0.0, "step": 1754, "text_loss": 0.49234694242477417 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.244203111241562, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0947265625, "learning_rate": 0.0009627499024610707, "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2829733.0, "repeat_count": 0.0, "routers_loss": 0.010289114899933338, "skip_count": 1.0, "step": 1756, "text_loss": 0.22335539758205414 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.253595538597006, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0888671875, "learning_rate": 0.0009626325850172527, "loss": 0.0174, "macro_f1": 0.3272727429866791, "num_tokens": 2833350.0, "repeat_count": 0.0, "routers_loss": 0.03249066323041916, "skip_count": 1.0, "step": 1758, "text_loss": 0.6581931114196777 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.262987965952451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0703125, "learning_rate": 0.0009625150902917555, "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 2836558.0, "repeat_count": 0.0, "routers_loss": 0.00870000571012497, "skip_count": 0.0, "step": 1760, "text_loss": 0.22938725352287292 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0009623974183296031, "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2840560.0, "repeat_count": 0.0, "routers_loss": 0.007767196744680405, "skip_count": 0.0, "step": 1762, "text_loss": 0.24473799765110016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009622795691758876, "loss": 0.0244, "macro_f1": 0.3333333432674408, "num_tokens": 2843548.0, "repeat_count": 0.0, "routers_loss": 0.0021693643648177385, "skip_count": 0.0, "step": 1764, "text_loss": 0.3084608018398285 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0009621615428757693, "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 2847076.0, "repeat_count": 0.0, "routers_loss": 0.0024727333802729845, "skip_count": 0.0, "step": 1766, "text_loss": 0.5251734852790833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.300557675374229, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.000962043339474476, "loss": 0.0194, "macro_f1": 0.3333333432674408, "num_tokens": 2849751.0, "repeat_count": 0.0, "routers_loss": 0.005174890160560608, "skip_count": 0.0, "step": 1768, "text_loss": 0.4410129189491272 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.309950102729674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06103515625, "learning_rate": 0.0009619249590173032, "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 2853916.0, "repeat_count": 0.0, "routers_loss": 0.006785830482840538, "skip_count": 2.0, "step": 1770, "text_loss": 0.550076425075531 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 8.31934253008512, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.06591796875, "learning_rate": 0.0009618064015496149, "loss": 0.0192, "macro_f1": 0.5934640765190125, "num_tokens": 2857372.0, "repeat_count": 0.0, "routers_loss": 0.021370256319642067, "skip_count": 3.0, "step": 1772, "text_loss": 0.1988629847764969 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.072265625, "learning_rate": 0.0009616876671168423, "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2861028.0, "repeat_count": 0.0, "routers_loss": 0.004313841462135315, "skip_count": 1.0, "step": 1774, "text_loss": 0.42581331729888916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.338127384796008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.0009615687557644847, "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2864847.0, "repeat_count": 0.0, "routers_loss": 0.0025742491707205772, "skip_count": 0.0, "step": 1776, "text_loss": 0.46510905027389526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0009614496675381093, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2867392.0, "repeat_count": 0.0, "routers_loss": 0.0016813480760902166, "skip_count": 0.0, "step": 1778, "text_loss": 0.5922174453735352 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0810546875, "learning_rate": 0.0009613304024833507, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 2871273.0, "repeat_count": 0.0, "routers_loss": 0.004948933608829975, "skip_count": 0.0, "step": 1780, "text_loss": 0.6776977777481079 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.366304666862343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009612109606459117, "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 2874172.0, "repeat_count": 1.0, "routers_loss": 0.016950147226452827, "skip_count": 2.0, "step": 1782, "text_loss": 0.48758944869041443 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.375697094217786, "f1_execute": 0.9599999785423279, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 0.08251953125, "learning_rate": 0.0009610913420715623, "loss": 0.0237, "macro_f1": 0.7644444704055786, "num_tokens": 2877528.0, "repeat_count": 2.0, "routers_loss": 0.04880943149328232, "skip_count": 1.0, "step": 1784, "text_loss": 0.4404778480529785 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.385089521573232, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0009609715468061411, "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2880627.0, "repeat_count": 0.0, "routers_loss": 0.004678630735725164, "skip_count": 0.0, "step": 1786, "text_loss": 0.7295402884483337 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.394481948928677, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0009608515748955535, "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2883333.0, "repeat_count": 0.0, "routers_loss": 0.0026695074047893286, "skip_count": 0.0, "step": 1788, "text_loss": 0.9697831273078918 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 8.40387437628412, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.107421875, "learning_rate": 0.000960731426385773, "loss": 0.0157, "macro_f1": 0.4871794879436493, "num_tokens": 2887444.0, "repeat_count": 0.0, "routers_loss": 0.029743613675236702, "skip_count": 2.0, "step": 1790, "text_loss": 0.4737568199634552 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.0009606111013228407, "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2890221.0, "repeat_count": 0.0, "routers_loss": 0.0016153788892552257, "skip_count": 0.0, "step": 1792, "text_loss": 0.6693558096885681 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.422659230995011, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009604905997528655, "loss": 0.02, "macro_f1": 0.3272727429866791, "num_tokens": 2893262.0, "repeat_count": 0.0, "routers_loss": 0.01965433731675148, "skip_count": 1.0, "step": 1794, "text_loss": 0.45227760076522827 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.432051658350455, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08642578125, "learning_rate": 0.0009603699217220239, "loss": 0.0117, "macro_f1": 0.6601307392120361, "num_tokens": 2896823.0, "repeat_count": 1.0, "routers_loss": 0.024017298594117165, "skip_count": 2.0, "step": 1796, "text_loss": 0.48865509033203125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.0009602490672765597, "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 2899707.0, "repeat_count": 0.0, "routers_loss": 0.0012420224957168102, "skip_count": 0.0, "step": 1798, "text_loss": 0.43292415142059326 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07861328125, "learning_rate": 0.0009601280364627848, "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2902795.0, "repeat_count": 0.0, "routers_loss": 0.0020389219280332327, "skip_count": 0.0, "step": 1800, "text_loss": 0.41021591424942017 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.460228940416789, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009600068293270783, "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 2905769.0, "repeat_count": 0.0, "routers_loss": 0.002006303984671831, "skip_count": 0.0, "step": 1802, "text_loss": 0.46892106533050537 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.000959885445915887, "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2909475.0, "repeat_count": 0.0, "routers_loss": 0.003734810510650277, "skip_count": 0.0, "step": 1804, "text_loss": 0.45364710688591003 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 8.479013795127678, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11669921875, "learning_rate": 0.0009597638862757254, "loss": 0.0182, "macro_f1": 0.8823530077934265, "num_tokens": 2914348.0, "repeat_count": 1.0, "routers_loss": 0.038971323519945145, "skip_count": 2.0, "step": 1806, "text_loss": 0.42913779616355896 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.488406222483123, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.0009596421504531751, "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2917467.0, "repeat_count": 1.0, "routers_loss": 0.04800829663872719, "skip_count": 0.0, "step": 1808, "text_loss": 0.17332297563552856 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.497798649838568, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0009595202384948858, "loss": 0.0227, "macro_f1": 0.6666666865348816, "num_tokens": 2920223.0, "repeat_count": 1.0, "routers_loss": 0.009164143353700638, "skip_count": 0.0, "step": 1810, "text_loss": 0.33740702271461487 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0947265625, "learning_rate": 0.0009593981504475742, "loss": 0.0275, "macro_f1": 0.6666666865348816, "num_tokens": 2923780.0, "repeat_count": 0.0, "routers_loss": 0.011236993595957756, "skip_count": 2.0, "step": 1812, "text_loss": 0.1609916388988495 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.516583504549457, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10595703125, "learning_rate": 0.0009592758863580248, "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2926259.0, "repeat_count": 0.0, "routers_loss": 0.019026532769203186, "skip_count": 2.0, "step": 1814, "text_loss": 0.6460903882980347 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.525975931904902, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009591534462730894, "loss": 0.0206, "macro_f1": 0.5492662787437439, "num_tokens": 2929173.0, "repeat_count": 2.0, "routers_loss": 0.0608333982527256, "skip_count": 0.0, "step": 1816, "text_loss": 0.476126492023468 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.000959030830239687, "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2932703.0, "repeat_count": 0.0, "routers_loss": 0.0093300249427557, "skip_count": 0.0, "step": 1818, "text_loss": 0.5471875667572021 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.544760786615791, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2001953125, "learning_rate": 0.0009589080383048048, "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2936195.0, "repeat_count": 0.0, "routers_loss": 0.010434109717607498, "skip_count": 0.0, "step": 1820, "text_loss": 0.5068115592002869 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0009587850705154964, "loss": 0.0291, "macro_f1": 0.3333333432674408, "num_tokens": 2939412.0, "repeat_count": 0.0, "routers_loss": 0.004347751382738352, "skip_count": 0.0, "step": 1822, "text_loss": 0.4241984784603119 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.56354564132668, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0859375, "learning_rate": 0.0009586619269188836, "loss": 0.0224, "macro_f1": 0.32098767161369324, "num_tokens": 2942318.0, "repeat_count": 0.0, "routers_loss": 0.034238871186971664, "skip_count": 1.0, "step": 1824, "text_loss": 0.2328975349664688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.572938068682125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.0009585386075621553, "loss": 0.027, "macro_f1": 0.3333333432674408, "num_tokens": 2945731.0, "repeat_count": 0.0, "routers_loss": 0.006097695790231228, "skip_count": 0.0, "step": 1826, "text_loss": 0.22816994786262512 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.582330496037569, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0908203125, "learning_rate": 0.0009584151124925676, "loss": 0.0208, "macro_f1": 0.3272727429866791, "num_tokens": 2948944.0, "repeat_count": 0.0, "routers_loss": 0.007790776435285807, "skip_count": 1.0, "step": 1828, "text_loss": 0.5009413361549377 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07275390625, "learning_rate": 0.0009582914417574438, "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 2951723.0, "repeat_count": 0.0, "routers_loss": 0.009144559502601624, "skip_count": 2.0, "step": 1830, "text_loss": 0.1402502954006195 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.60111535074846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0009581675954041751, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 2954726.0, "repeat_count": 1.0, "routers_loss": 0.006593191530555487, "skip_count": 0.0, "step": 1832, "text_loss": 0.4871736466884613 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.610507778103903, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009580435734802196, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2957853.0, "repeat_count": 0.0, "routers_loss": 0.01241068821400404, "skip_count": 0.0, "step": 1834, "text_loss": 0.30100154876708984 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.619900205459349, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0009579193760331027, "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2960783.0, "repeat_count": 0.0, "routers_loss": 0.002219218760728836, "skip_count": 0.0, "step": 1836, "text_loss": 0.4961516559123993 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.629292632814794, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12255859375, "learning_rate": 0.0009577950031104169, "loss": 0.0166, "macro_f1": 0.6601307392120361, "num_tokens": 2963328.0, "repeat_count": 1.0, "routers_loss": 0.029363535344600677, "skip_count": 2.0, "step": 1838, "text_loss": 0.42814353108406067 }, { "acc_repeat": 1.0, "acc_skip": 0.25, "avg_layers": 28.0, "epoch": 8.638685060170237, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.1044921875, "learning_rate": 0.0009576704547598226, "loss": 0.0257, "macro_f1": 0.7795917987823486, "num_tokens": 2966108.0, "repeat_count": 1.0, "routers_loss": 0.0579402856528759, "skip_count": 4.0, "step": 1840, "text_loss": 0.20523512363433838 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.648077487525683, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0625, "learning_rate": 0.0009575457310290463, "loss": 0.0121, "macro_f1": 0.3272727429866791, "num_tokens": 2969137.0, "repeat_count": 0.0, "routers_loss": 0.008810589089989662, "skip_count": 0.0, "step": 1842, "text_loss": 0.6199528574943542 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009574208319658831, "loss": 0.0208, "macro_f1": 0.6666666865348816, "num_tokens": 2972407.0, "repeat_count": 0.0, "routers_loss": 0.0012295129708945751, "skip_count": 1.0, "step": 1844, "text_loss": 0.66938316822052 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 8.666862342236572, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.1474609375, "learning_rate": 0.000957295757618194, "loss": 0.0152, "macro_f1": 0.4871794879436493, "num_tokens": 2976045.0, "repeat_count": 0.0, "routers_loss": 0.06162935495376587, "skip_count": 2.0, "step": 1846, "text_loss": 0.5381782650947571 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009571705080339079, "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 2979025.0, "repeat_count": 0.0, "routers_loss": 0.003950524143874645, "skip_count": 0.0, "step": 1848, "text_loss": 0.5831671357154846 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11376953125, "learning_rate": 0.0009570450832610208, "loss": 0.0209, "macro_f1": 0.3333333432674408, "num_tokens": 2982276.0, "repeat_count": 0.0, "routers_loss": 0.010354886762797832, "skip_count": 0.0, "step": 1850, "text_loss": 0.27448201179504395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.695039624302906, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061279296875, "learning_rate": 0.0009569194833475956, "loss": 0.0199, "macro_f1": 0.3272727429866791, "num_tokens": 2985691.0, "repeat_count": 0.0, "routers_loss": 0.010167439468204975, "skip_count": 0.0, "step": 1852, "text_loss": 0.5264663696289062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.704432051658351, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1328125, "learning_rate": 0.0009567937083417624, "loss": 0.0194, "macro_f1": 0.3272727429866791, "num_tokens": 2989126.0, "repeat_count": 0.0, "routers_loss": 0.0371871180832386, "skip_count": 1.0, "step": 1854, "text_loss": 0.2008018046617508 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.0009566677582917185, "loss": 0.0184, "macro_f1": 0.3333333432674408, "num_tokens": 2992814.0, "repeat_count": 0.0, "routers_loss": 0.010190588422119617, "skip_count": 0.0, "step": 1856, "text_loss": 0.749717116355896 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.72321690636924, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.0009565416332457282, "loss": 0.0132, "macro_f1": 0.6538461446762085, "num_tokens": 2995729.0, "repeat_count": 1.0, "routers_loss": 0.022285036742687225, "skip_count": 1.0, "step": 1858, "text_loss": 0.5870219469070435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.732609333724685, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009564153332521228, "loss": 0.0224, "macro_f1": 0.3272727429866791, "num_tokens": 2998812.0, "repeat_count": 0.0, "routers_loss": 0.011050296947360039, "skip_count": 1.0, "step": 1860, "text_loss": 0.8444408774375916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.742001761080129, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06005859375, "learning_rate": 0.0009562888583593005, "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3001799.0, "repeat_count": 0.0, "routers_loss": 0.007125461008399725, "skip_count": 0.0, "step": 1862, "text_loss": 0.41510361433029175 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.751394188435574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06884765625, "learning_rate": 0.0009561622086157272, "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3005088.0, "repeat_count": 0.0, "routers_loss": 0.0049054501578211784, "skip_count": 0.0, "step": 1864, "text_loss": 0.3801248073577881 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.760786615791018, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.054443359375, "learning_rate": 0.000956035384069935, "loss": 0.0238, "macro_f1": 1.0, "num_tokens": 3008178.0, "repeat_count": 1.0, "routers_loss": 0.005162427201867104, "skip_count": 1.0, "step": 1866, "text_loss": 0.2687684893608093 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.770179043146463, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10400390625, "learning_rate": 0.0009559083847705233, "loss": 0.0214, "macro_f1": 0.3272727429866791, "num_tokens": 3010923.0, "repeat_count": 0.0, "routers_loss": 0.028984658420085907, "skip_count": 1.0, "step": 1868, "text_loss": 0.6277349591255188 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.779571470501908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009557812107661584, "loss": 0.0208, "macro_f1": 1.0, "num_tokens": 3015030.0, "repeat_count": 1.0, "routers_loss": 0.012200530618429184, "skip_count": 1.0, "step": 1870, "text_loss": 0.6293368339538574 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.788963897857352, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11962890625, "learning_rate": 0.0009556538621055739, "loss": 0.0268, "macro_f1": 0.3272727429866791, "num_tokens": 3019067.0, "repeat_count": 0.0, "routers_loss": 0.06365182995796204, "skip_count": 1.0, "step": 1872, "text_loss": 0.39046618342399597 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.115234375, "learning_rate": 0.0009555263388375699, "loss": 0.014, "macro_f1": 0.6666666865348816, "num_tokens": 3022166.0, "repeat_count": 0.0, "routers_loss": 0.0041703456081449986, "skip_count": 1.0, "step": 1874, "text_loss": 0.42232340574264526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.807748752568243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11572265625, "learning_rate": 0.0009553986410110134, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3025865.0, "repeat_count": 0.0, "routers_loss": 0.005841755773872137, "skip_count": 0.0, "step": 1876, "text_loss": 0.37600573897361755 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.817141179923686, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0009552707686748388, "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3029950.0, "repeat_count": 0.0, "routers_loss": 0.05165952071547508, "skip_count": 1.0, "step": 1878, "text_loss": 0.33717799186706543 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009551427218780467, "loss": 0.0219, "macro_f1": 0.6666666865348816, "num_tokens": 3033649.0, "repeat_count": 0.0, "routers_loss": 0.020680008456110954, "skip_count": 2.0, "step": 1880, "text_loss": 0.5011783838272095 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.835926034634575, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.0009550145006697048, "loss": 0.0217, "macro_f1": 0.32098764181137085, "num_tokens": 3036847.0, "repeat_count": 0.0, "routers_loss": 0.07626450061798096, "skip_count": 2.0, "step": 1882, "text_loss": 0.3066408336162567 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.84531846199002, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.056396484375, "learning_rate": 0.0009548861050989482, "loss": 0.0136, "macro_f1": 1.0, "num_tokens": 3040353.0, "repeat_count": 1.0, "routers_loss": 0.010884666815400124, "skip_count": 1.0, "step": 1884, "text_loss": 0.49779415130615234 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0908203125, "learning_rate": 0.0009547575352149778, "loss": 0.0213, "macro_f1": 0.6666666865348816, "num_tokens": 3043504.0, "repeat_count": 0.0, "routers_loss": 0.006704333238303661, "skip_count": 2.0, "step": 1886, "text_loss": 0.12284614145755768 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.86410331670091, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11474609375, "learning_rate": 0.0009546287910670621, "loss": 0.0211, "macro_f1": 0.5427350401878357, "num_tokens": 3046422.0, "repeat_count": 1.0, "routers_loss": 0.04799000173807144, "skip_count": 2.0, "step": 1888, "text_loss": 0.1824081838130951 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0009544998727045361, "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 3049819.0, "repeat_count": 0.0, "routers_loss": 0.008139612153172493, "skip_count": 0.0, "step": 1890, "text_loss": 0.18929053843021393 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 8.8828881714118, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.09375, "learning_rate": 0.0009543707801768015, "loss": 0.0175, "macro_f1": 0.5934640765190125, "num_tokens": 3052766.0, "repeat_count": 0.0, "routers_loss": 0.02966771461069584, "skip_count": 3.0, "step": 1892, "text_loss": 0.247748002409935 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 8.892280598767243, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.06689453125, "learning_rate": 0.0009542415135333267, "loss": 0.0193, "macro_f1": 0.44705885648727417, "num_tokens": 3056427.0, "repeat_count": 0.0, "routers_loss": 0.03637036308646202, "skip_count": 2.0, "step": 1894, "text_loss": 0.2583999037742615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.901673026122689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0595703125, "learning_rate": 0.0009541120728236472, "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3059497.0, "repeat_count": 0.0, "routers_loss": 0.007026574574410915, "skip_count": 0.0, "step": 1896, "text_loss": 0.5222375988960266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.076171875, "learning_rate": 0.0009539824580973646, "loss": 0.0219, "macro_f1": 0.3333333432674408, "num_tokens": 3062187.0, "repeat_count": 0.0, "routers_loss": 0.003449335927143693, "skip_count": 0.0, "step": 1898, "text_loss": 0.5736427307128906 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0009538526694041477, "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3066100.0, "repeat_count": 0.0, "routers_loss": 0.0035463871899992228, "skip_count": 0.0, "step": 1900, "text_loss": 0.5471583604812622 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.929850308189023, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.080078125, "learning_rate": 0.0009537227067937318, "loss": 0.0233, "macro_f1": 1.0, "num_tokens": 3068737.0, "repeat_count": 3.0, "routers_loss": 0.00597514258697629, "skip_count": 3.0, "step": 1902, "text_loss": 0.36644190549850464 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.939242735544468, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.166015625, "learning_rate": 0.0009535925703159186, "loss": 0.0301, "macro_f1": 0.32098764181137085, "num_tokens": 3071686.0, "repeat_count": 0.0, "routers_loss": 0.025420479476451874, "skip_count": 2.0, "step": 1904, "text_loss": 0.535789966583252 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.948635162899912, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009534622600205769, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3074954.0, "repeat_count": 0.0, "routers_loss": 0.014377486892044544, "skip_count": 0.0, "step": 1906, "text_loss": 0.19009549915790558 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.958027590255357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11083984375, "learning_rate": 0.0009533317759576416, "loss": 0.0197, "macro_f1": 0.3333333432674408, "num_tokens": 3077540.0, "repeat_count": 0.0, "routers_loss": 0.004848944488912821, "skip_count": 0.0, "step": 1908, "text_loss": 0.5022001266479492 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009532011181771148, "loss": 0.0217, "macro_f1": 0.6666666865348816, "num_tokens": 3080445.0, "repeat_count": 0.0, "routers_loss": 0.009480170905590057, "skip_count": 2.0, "step": 1910, "text_loss": 0.35135936737060547 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10400390625, "learning_rate": 0.0009530702867290644, "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 3083657.0, "repeat_count": 0.0, "routers_loss": 0.0019353039097040892, "skip_count": 0.0, "step": 1912, "text_loss": 0.5123994946479797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.986204872321691, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0009529392816636256, "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 3086837.0, "repeat_count": 0.0, "routers_loss": 0.0010921972570940852, "skip_count": 0.0, "step": 1914, "text_loss": 0.44477662444114685 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.995597299677135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19140625, "learning_rate": 0.0009528081030309995, "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 3089892.0, "repeat_count": 0.0, "routers_loss": 0.0018027103506028652, "skip_count": 0.0, "step": 1916, "text_loss": 0.7356183528900146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.004696213677722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009526767508814542, "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3093058.0, "repeat_count": 0.0, "routers_loss": 0.003243023296818137, "skip_count": 0.0, "step": 1918, "text_loss": 0.48823556303977966 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.0009525452252653239, "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 3096404.0, "repeat_count": 0.0, "routers_loss": 0.009360014460980892, "skip_count": 0.0, "step": 1920, "text_loss": 0.21498437225818634 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 9.023481068388612, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.140625, "learning_rate": 0.0009524135262330098, "loss": 0.0224, "macro_f1": 0.9265305995941162, "num_tokens": 3099520.0, "repeat_count": 1.0, "routers_loss": 0.017444295808672905, "skip_count": 3.0, "step": 1922, "text_loss": 0.27608850598335266 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.032873495744056, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.050537109375, "learning_rate": 0.0009522816538349789, "loss": 0.0162, "macro_f1": 0.5492662787437439, "num_tokens": 3102956.0, "repeat_count": 0.0, "routers_loss": 0.06424452364444733, "skip_count": 2.0, "step": 1924, "text_loss": 0.21558666229248047 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.042265923099501, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0009521496081217651, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3106565.0, "repeat_count": 1.0, "routers_loss": 0.002270506462082267, "skip_count": 0.0, "step": 1926, "text_loss": 0.5641813278198242 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.051658350454945, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.095703125, "learning_rate": 0.0009520173891439684, "loss": 0.0216, "macro_f1": 0.6666666865348816, "num_tokens": 3109314.0, "repeat_count": 0.0, "routers_loss": 0.011512448079884052, "skip_count": 1.0, "step": 1928, "text_loss": 0.6351624727249146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009518849969522556, "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 3112956.0, "repeat_count": 0.0, "routers_loss": 0.003883908037096262, "skip_count": 0.0, "step": 1930, "text_loss": 0.35160085558891296 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.070443205165835, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0009517524315973595, "loss": 0.019, "macro_f1": 1.0, "num_tokens": 3115593.0, "repeat_count": 1.0, "routers_loss": 0.009479222819209099, "skip_count": 3.0, "step": 1932, "text_loss": 0.2900560200214386 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.079835632521279, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.0009516196931300794, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 3118516.0, "repeat_count": 0.0, "routers_loss": 0.017834696918725967, "skip_count": 2.0, "step": 1934, "text_loss": 0.20094378292560577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0009514867816012809, "loss": 0.02, "macro_f1": 0.3333333432674408, "num_tokens": 3122242.0, "repeat_count": 0.0, "routers_loss": 0.0017964740982279181, "skip_count": 0.0, "step": 1936, "text_loss": 0.6498590707778931 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.0009513536970618961, "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3125645.0, "repeat_count": 0.0, "routers_loss": 0.007437168620526791, "skip_count": 2.0, "step": 1938, "text_loss": 0.25863033533096313 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.108012914587613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0625, "learning_rate": 0.0009512204395629232, "loss": 0.0184, "macro_f1": 0.6666666865348816, "num_tokens": 3128740.0, "repeat_count": 0.0, "routers_loss": 0.0008759932243265212, "skip_count": 1.0, "step": 1940, "text_loss": 0.5638351440429688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.117405341943059, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06884765625, "learning_rate": 0.0009510870091554264, "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3131742.0, "repeat_count": 1.0, "routers_loss": 0.019906625151634216, "skip_count": 0.0, "step": 1942, "text_loss": 0.8410717844963074 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.126797769298504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.0009509534058905369, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3134407.0, "repeat_count": 0.0, "routers_loss": 0.0009229081333614886, "skip_count": 0.0, "step": 1944, "text_loss": 0.47506049275398254 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0009508196298194517, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3137053.0, "repeat_count": 0.0, "routers_loss": 0.003630586201325059, "skip_count": 0.0, "step": 1946, "text_loss": 0.32225799560546875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009506856809934338, "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 3140943.0, "repeat_count": 0.0, "routers_loss": 0.007580445148050785, "skip_count": 0.0, "step": 1948, "text_loss": 0.3120577931404114 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0009505515594638127, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3144298.0, "repeat_count": 0.0, "routers_loss": 0.004471861757338047, "skip_count": 0.0, "step": 1950, "text_loss": 0.22052447497844696 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 9.164367478720282, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09130859375, "learning_rate": 0.0009504172652819843, "loss": 0.023, "macro_f1": 1.0, "num_tokens": 3147069.0, "repeat_count": 1.0, "routers_loss": 0.009606664068996906, "skip_count": 1.0, "step": 1952, "text_loss": 0.34773921966552734 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0625, "learning_rate": 0.0009502827984994099, "loss": 0.0148, "macro_f1": 0.6666666865348816, "num_tokens": 3149992.0, "repeat_count": 0.0, "routers_loss": 0.006443799939006567, "skip_count": 1.0, "step": 1954, "text_loss": 0.6442171335220337 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.0009501481591676177, "loss": 0.0188, "macro_f1": 0.3333333432674408, "num_tokens": 3153167.0, "repeat_count": 0.0, "routers_loss": 0.003219039412215352, "skip_count": 0.0, "step": 1956, "text_loss": 0.43369221687316895 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.192544760786616, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.000950013347338202, "loss": 0.0152, "macro_f1": 0.3272727429866791, "num_tokens": 3156590.0, "repeat_count": 0.0, "routers_loss": 0.025551019236445427, "skip_count": 1.0, "step": 1958, "text_loss": 0.294479101896286 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.201937188142061, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009498783630628225, "loss": 0.0158, "macro_f1": 1.0, "num_tokens": 3159451.0, "repeat_count": 1.0, "routers_loss": 0.013802438974380493, "skip_count": 2.0, "step": 1960, "text_loss": 0.20888492465019226 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.211329615497505, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009497432063932057, "loss": 0.0137, "macro_f1": 0.6601307392120361, "num_tokens": 3162889.0, "repeat_count": 1.0, "routers_loss": 0.02852988988161087, "skip_count": 2.0, "step": 1962, "text_loss": 0.5027125477790833 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045166015625, "learning_rate": 0.0009496078773811437, "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 3165979.0, "repeat_count": 0.0, "routers_loss": 0.01784522272646427, "skip_count": 2.0, "step": 1964, "text_loss": 0.1696339100599289 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.000949472376078495, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3168683.0, "repeat_count": 0.0, "routers_loss": 0.0017019887454807758, "skip_count": 0.0, "step": 1966, "text_loss": 0.48905447125434875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.051025390625, "learning_rate": 0.000949336702537184, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 3171968.0, "repeat_count": 0.0, "routers_loss": 0.004817947279661894, "skip_count": 2.0, "step": 1968, "text_loss": 0.20984773337841034 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.248899324919284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.0009492008568092007, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3175947.0, "repeat_count": 0.0, "routers_loss": 0.0012963006738573313, "skip_count": 0.0, "step": 1970, "text_loss": 0.5215106010437012 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 9.258291752274728, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.044921875, "learning_rate": 0.0009490648389466019, "loss": 0.0135, "macro_f1": 0.4871794879436493, "num_tokens": 3179348.0, "repeat_count": 0.0, "routers_loss": 0.03950481489300728, "skip_count": 2.0, "step": 1972, "text_loss": 0.24640929698944092 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09326171875, "learning_rate": 0.0009489286490015097, "loss": 0.0183, "macro_f1": 0.6666666865348816, "num_tokens": 3182640.0, "repeat_count": 0.0, "routers_loss": 0.0043345349840819836, "skip_count": 2.0, "step": 1974, "text_loss": 0.6362852454185486 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.277076606985618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0009487922870261122, "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3185657.0, "repeat_count": 0.0, "routers_loss": 0.0015687479171901941, "skip_count": 0.0, "step": 1976, "text_loss": 0.8977144360542297 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.286469034341062, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061279296875, "learning_rate": 0.0009486557530726638, "loss": 0.0139, "macro_f1": 0.3333333432674408, "num_tokens": 3188772.0, "repeat_count": 0.0, "routers_loss": 0.0010977238416671753, "skip_count": 0.0, "step": 1978, "text_loss": 0.38512736558914185 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 9.295861461696507, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.0009485190471934844, "loss": 0.0196, "macro_f1": 0.6666666865348816, "num_tokens": 3193131.0, "repeat_count": 2.0, "routers_loss": 0.002264744369313121, "skip_count": 0.0, "step": 1980, "text_loss": 0.4171289801597595 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.305253889051952, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09033203125, "learning_rate": 0.00094838216944096, "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3196668.0, "repeat_count": 0.0, "routers_loss": 0.042320676147937775, "skip_count": 1.0, "step": 1982, "text_loss": 0.19008000195026398 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 9.314646316407396, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0009482451198675424, "loss": 0.0151, "macro_f1": 0.32098767161369324, "num_tokens": 3200282.0, "repeat_count": 0.0, "routers_loss": 0.01796630397439003, "skip_count": 1.0, "step": 1984, "text_loss": 0.5009249448776245 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.324038743762841, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061767578125, "learning_rate": 0.0009481078985257494, "loss": 0.0147, "macro_f1": 0.6666666865348816, "num_tokens": 3204439.0, "repeat_count": 0.0, "routers_loss": 0.01052347756922245, "skip_count": 1.0, "step": 1986, "text_loss": 0.15319275856018066 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.333431171118287, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0009479705054681644, "loss": 0.015, "macro_f1": 0.3076923191547394, "num_tokens": 3207590.0, "repeat_count": 1.0, "routers_loss": 0.09640293568372726, "skip_count": 3.0, "step": 1988, "text_loss": 0.3654652535915375 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.34282359847373, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06689453125, "learning_rate": 0.0009478329407474366, "loss": 0.0183, "macro_f1": 0.5492662787437439, "num_tokens": 3211172.0, "repeat_count": 0.0, "routers_loss": 0.012670112773776054, "skip_count": 1.0, "step": 1990, "text_loss": 0.5817596316337585 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.352216025829176, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05859375, "learning_rate": 0.000947695204416281, "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 3214050.0, "repeat_count": 1.0, "routers_loss": 0.005263707600533962, "skip_count": 0.0, "step": 1992, "text_loss": 0.5985888242721558 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.361608453184619, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009475572965274787, "loss": 0.0144, "macro_f1": 0.3272727429866791, "num_tokens": 3217318.0, "repeat_count": 1.0, "routers_loss": 0.0682850033044815, "skip_count": 0.0, "step": 1994, "text_loss": 0.316506564617157 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.371000880540064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0595703125, "learning_rate": 0.000947419217133876, "loss": 0.019, "macro_f1": 0.6666666865348816, "num_tokens": 3220012.0, "repeat_count": 0.0, "routers_loss": 0.008508823812007904, "skip_count": 2.0, "step": 1996, "text_loss": 0.09665893763303757 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.38039330789551, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.053466796875, "learning_rate": 0.0009472809662883852, "loss": 0.0155, "macro_f1": 1.0, "num_tokens": 3223019.0, "repeat_count": 1.0, "routers_loss": 0.01100847590714693, "skip_count": 2.0, "step": 1998, "text_loss": 0.4938808083534241 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.389785735250953, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0009471425440439844, "loss": 0.0135, "macro_f1": 0.8817967176437378, "num_tokens": 3226013.0, "repeat_count": 2.0, "routers_loss": 0.04953207075595856, "skip_count": 3.0, "step": 2000, "text_loss": 0.22258254885673523 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 9.399178162606399, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009470039504537173, "loss": 0.0186, "macro_f1": 0.31446540355682373, "num_tokens": 3230031.0, "repeat_count": 0.0, "routers_loss": 0.052884332835674286, "skip_count": 2.0, "step": 2002, "text_loss": 0.1741616576910019 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.408570589961844, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009468651855706931, "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 3232991.0, "repeat_count": 1.0, "routers_loss": 0.008056716993451118, "skip_count": 0.0, "step": 2004, "text_loss": 0.3173636198043823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0009467262494480868, "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3236390.0, "repeat_count": 0.0, "routers_loss": 0.0053409393876791, "skip_count": 0.0, "step": 2006, "text_loss": 0.5806330442428589 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.427355444672733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.000946587142139139, "loss": 0.0147, "macro_f1": 0.3333333432674408, "num_tokens": 3239267.0, "repeat_count": 0.0, "routers_loss": 0.0015652200672775507, "skip_count": 0.0, "step": 2008, "text_loss": 0.6214317679405212 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.436747872028178, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11376953125, "learning_rate": 0.000946447863697156, "loss": 0.0151, "macro_f1": 0.6601307392120361, "num_tokens": 3242569.0, "repeat_count": 1.0, "routers_loss": 0.011673987843096256, "skip_count": 2.0, "step": 2010, "text_loss": 0.532565712928772 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.446140299383622, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0009463084141755093, "loss": 0.0159, "macro_f1": 0.3272727429866791, "num_tokens": 3245669.0, "repeat_count": 0.0, "routers_loss": 0.028480790555477142, "skip_count": 1.0, "step": 2012, "text_loss": 0.25210800766944885 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.455532726739067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009461687936276364, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3248751.0, "repeat_count": 0.0, "routers_loss": 0.007234727032482624, "skip_count": 0.0, "step": 2014, "text_loss": 0.35922971367836 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.46492515409451, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.0009460290021070402, "loss": 0.0195, "macro_f1": 0.6666666865348816, "num_tokens": 3252614.0, "repeat_count": 1.0, "routers_loss": 0.014691276475787163, "skip_count": 0.0, "step": 2016, "text_loss": 0.2747853398323059 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051513671875, "learning_rate": 0.0009458890396672888, "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3256374.0, "repeat_count": 0.0, "routers_loss": 0.002385235857218504, "skip_count": 0.0, "step": 2018, "text_loss": 0.5268719792366028 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 9.483710008805401, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.04443359375, "learning_rate": 0.0009457489063620164, "loss": 0.0133, "macro_f1": 0.8823530077934265, "num_tokens": 3259792.0, "repeat_count": 1.0, "routers_loss": 0.047268565744161606, "skip_count": 2.0, "step": 2020, "text_loss": 0.7785539627075195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.493102436160845, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0009456086022449221, "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 3262833.0, "repeat_count": 0.0, "routers_loss": 0.015878718346357346, "skip_count": 1.0, "step": 2022, "text_loss": 0.42270028591156006 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.50249486351629, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08935546875, "learning_rate": 0.0009454681273697711, "loss": 0.0117, "macro_f1": 0.3272727429866791, "num_tokens": 3265718.0, "repeat_count": 1.0, "routers_loss": 0.030749641358852386, "skip_count": 0.0, "step": 2024, "text_loss": 0.18668225407600403 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.511887290871735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0009453274817903931, "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 3268158.0, "repeat_count": 0.0, "routers_loss": 0.011538166552782059, "skip_count": 1.0, "step": 2026, "text_loss": 0.34090787172317505 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.099609375, "learning_rate": 0.000945186665560684, "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 3271082.0, "repeat_count": 0.0, "routers_loss": 0.009527760557830334, "skip_count": 0.0, "step": 2028, "text_loss": 0.2110334187746048 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.530672145582624, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.119140625, "learning_rate": 0.000945045678734605, "loss": 0.0175, "macro_f1": 0.3144654333591461, "num_tokens": 3273488.0, "repeat_count": 0.0, "routers_loss": 0.03317151218652725, "skip_count": 3.0, "step": 2030, "text_loss": 0.2233227640390396 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.540064572938068, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12451171875, "learning_rate": 0.0009449045213661822, "loss": 0.0201, "macro_f1": 0.3272727429866791, "num_tokens": 3276646.0, "repeat_count": 0.0, "routers_loss": 0.018510591238737106, "skip_count": 1.0, "step": 2032, "text_loss": 0.16100332140922546 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 9.549457000293513, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.1318359375, "learning_rate": 0.0009447631935095077, "loss": 0.0185, "macro_f1": 0.9452888369560242, "num_tokens": 3279441.0, "repeat_count": 1.0, "routers_loss": 0.028113311156630516, "skip_count": 4.0, "step": 2034, "text_loss": 0.29208317399024963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.0009446216952187384, "loss": 0.0164, "macro_f1": 0.3333333432674408, "num_tokens": 3282697.0, "repeat_count": 0.0, "routers_loss": 0.008379172533750534, "skip_count": 0.0, "step": 2036, "text_loss": 0.16026398539543152 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06298828125, "learning_rate": 0.0009444800265480967, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3285574.0, "repeat_count": 0.0, "routers_loss": 0.00941354501992464, "skip_count": 0.0, "step": 2038, "text_loss": 0.29523080587387085 }, { "acc_repeat": 0.75, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 9.577634282359847, "f1_execute": 0.9230769276618958, "f1_repeat": 0.8571428656578064, "f1_skip": 0.800000011920929, "grad_norm": 0.076171875, "learning_rate": 0.0009443381875518703, "loss": 0.0197, "macro_f1": 0.8600732684135437, "num_tokens": 3289159.0, "repeat_count": 4.0, "routers_loss": 0.04974055662751198, "skip_count": 6.0, "step": 2040, "text_loss": 0.23033179342746735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.587026709715293, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0537109375, "learning_rate": 0.0009441961782844123, "loss": 0.0146, "macro_f1": 0.3272727429866791, "num_tokens": 3293598.0, "repeat_count": 0.0, "routers_loss": 0.022241825237870216, "skip_count": 1.0, "step": 2042, "text_loss": 0.8299165368080139 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.0009440539988001408, "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3296648.0, "repeat_count": 0.0, "routers_loss": 0.011019332334399223, "skip_count": 0.0, "step": 2044, "text_loss": 0.18207129836082458 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.605811564426181, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0009439116491535394, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3300058.0, "repeat_count": 0.0, "routers_loss": 0.002889640862122178, "skip_count": 0.0, "step": 2046, "text_loss": 0.7051978707313538 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 9.615203991781627, "f1_execute": 0.9333333373069763, "f1_repeat": 0.5, "f1_skip": 0.8571428656578064, "grad_norm": 0.078125, "learning_rate": 0.0009437691293991563, "loss": 0.0192, "macro_f1": 0.7634921073913574, "num_tokens": 3303296.0, "repeat_count": 3.0, "routers_loss": 0.07741832733154297, "skip_count": 4.0, "step": 2048, "text_loss": 0.15563532710075378 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09521484375, "learning_rate": 0.0009436264395916061, "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 3306204.0, "repeat_count": 0.0, "routers_loss": 0.014225383289158344, "skip_count": 2.0, "step": 2050, "text_loss": 0.18117287755012512 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.633988846492516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009434835797855672, "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 3309444.0, "repeat_count": 0.0, "routers_loss": 0.0023932650219649076, "skip_count": 0.0, "step": 2052, "text_loss": 0.4645874798297882 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.643381273847961, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0009433405500357839, "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3312488.0, "repeat_count": 0.0, "routers_loss": 0.03193361684679985, "skip_count": 1.0, "step": 2054, "text_loss": 0.5291082859039307 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0009431973503970655, "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3315765.0, "repeat_count": 0.0, "routers_loss": 0.0020529816392809153, "skip_count": 0.0, "step": 2056, "text_loss": 0.5877931118011475 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.66216612855885, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0009430539809242864, "loss": 0.0185, "macro_f1": 0.32098764181137085, "num_tokens": 3318877.0, "repeat_count": 2.0, "routers_loss": 0.07907948642969131, "skip_count": 0.0, "step": 2058, "text_loss": 0.3836737871170044 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 9.671558555914293, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0009429104416723862, "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 3322576.0, "repeat_count": 2.0, "routers_loss": 0.003006070153787732, "skip_count": 0.0, "step": 2060, "text_loss": 0.3480920195579529 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045166015625, "learning_rate": 0.0009427667326963689, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3325974.0, "repeat_count": 0.0, "routers_loss": 0.005013179033994675, "skip_count": 0.0, "step": 2062, "text_loss": 0.931358814239502 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.690343410625184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0009426228540513047, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 3329398.0, "repeat_count": 0.0, "routers_loss": 0.0059848143719136715, "skip_count": 0.0, "step": 2064, "text_loss": 0.47568953037261963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.699735837980628, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009424788057923277, "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3332029.0, "repeat_count": 0.0, "routers_loss": 0.00783882662653923, "skip_count": 0.0, "step": 2066, "text_loss": 0.22887596487998962 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.709128265336073, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0712890625, "learning_rate": 0.0009423345879746376, "loss": 0.0128, "macro_f1": 0.5492662787437439, "num_tokens": 3334858.0, "repeat_count": 0.0, "routers_loss": 0.01866884157061577, "skip_count": 2.0, "step": 2068, "text_loss": 0.17724967002868652 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.718520692691518, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06591796875, "learning_rate": 0.000942190200653499, "loss": 0.0162, "macro_f1": 0.32098764181137085, "num_tokens": 3338094.0, "repeat_count": 0.0, "routers_loss": 0.028636593371629715, "skip_count": 2.0, "step": 2070, "text_loss": 0.34344956278800964 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.727913120046962, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.07568359375, "learning_rate": 0.0009420456438842413, "loss": 0.0165, "macro_f1": 0.5492662787437439, "num_tokens": 3340526.0, "repeat_count": 0.0, "routers_loss": 0.023245645686984062, "skip_count": 2.0, "step": 2072, "text_loss": 0.7276164293289185 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.737305547402407, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11328125, "learning_rate": 0.000941900917722259, "loss": 0.0143, "macro_f1": 0.3272727429866791, "num_tokens": 3343303.0, "repeat_count": 1.0, "routers_loss": 0.01565689593553543, "skip_count": 0.0, "step": 2074, "text_loss": 0.5665070414543152 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1201171875, "learning_rate": 0.0009417560222230115, "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 3346409.0, "repeat_count": 0.0, "routers_loss": 0.0035056080669164658, "skip_count": 0.0, "step": 2076, "text_loss": 0.5112795233726501 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06982421875, "learning_rate": 0.0009416109574420229, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3349220.0, "repeat_count": 0.0, "routers_loss": 0.0027565446216613054, "skip_count": 0.0, "step": 2078, "text_loss": 0.5240910053253174 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 9.765482829468741, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08203125, "learning_rate": 0.0009414657234348823, "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 3352627.0, "repeat_count": 3.0, "routers_loss": 0.01652451977133751, "skip_count": 2.0, "step": 2080, "text_loss": 1.0217112302780151 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.774875256824185, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0009413203202572438, "loss": 0.0179, "macro_f1": 0.32098764181137085, "num_tokens": 3355392.0, "repeat_count": 0.0, "routers_loss": 0.1012420505285263, "skip_count": 2.0, "step": 2082, "text_loss": 0.4085482358932495 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08251953125, "learning_rate": 0.000941174747964826, "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3358425.0, "repeat_count": 0.0, "routers_loss": 0.004962718114256859, "skip_count": 0.0, "step": 2084, "text_loss": 0.5833504796028137 }, { "acc_repeat": 0.5, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 9.793660111535075, "f1_execute": 0.9583333134651184, "f1_repeat": 0.6666666865348816, "f1_skip": 0.800000011920929, "grad_norm": 0.11376953125, "learning_rate": 0.0009410290066134124, "loss": 0.0211, "macro_f1": 0.8083333373069763, "num_tokens": 3361925.0, "repeat_count": 2.0, "routers_loss": 0.07889176905155182, "skip_count": 3.0, "step": 2086, "text_loss": 0.38126569986343384 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.803052538890519, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.051513671875, "learning_rate": 0.0009408830962588517, "loss": 0.0195, "macro_f1": 0.6601307392120361, "num_tokens": 3365963.0, "repeat_count": 1.0, "routers_loss": 0.033715736120939255, "skip_count": 2.0, "step": 2088, "text_loss": 0.23213914036750793 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.812444966245964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0009407370169570567, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3369422.0, "repeat_count": 0.0, "routers_loss": 0.0014188943896442652, "skip_count": 0.0, "step": 2090, "text_loss": 0.4648318886756897 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.82183739360141, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0009405907687640054, "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 3372506.0, "repeat_count": 0.0, "routers_loss": 0.015339684672653675, "skip_count": 1.0, "step": 2092, "text_loss": 0.2563800811767578 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 9.831229820956853, "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.054443359375, "learning_rate": 0.0009404443517357404, "loss": 0.0146, "macro_f1": 0.542222261428833, "num_tokens": 3375653.0, "repeat_count": 4.0, "routers_loss": 0.06562861055135727, "skip_count": 0.0, "step": 2094, "text_loss": 0.797835111618042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.840622248312298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.000940297765928369, "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3379018.0, "repeat_count": 0.0, "routers_loss": 0.005745889153331518, "skip_count": 0.0, "step": 2096, "text_loss": 0.4238114655017853 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0009401510113980631, "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 3382855.0, "repeat_count": 0.0, "routers_loss": 0.0026634482201188803, "skip_count": 0.0, "step": 2098, "text_loss": 0.4967166483402252 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009400040882010592, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 3386386.0, "repeat_count": 0.0, "routers_loss": 0.0020642587915062904, "skip_count": 0.0, "step": 2100, "text_loss": 0.44390562176704407 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.868799530378633, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056640625, "learning_rate": 0.0009398569963936589, "loss": 0.017, "macro_f1": 0.3272727429866791, "num_tokens": 3389958.0, "repeat_count": 0.0, "routers_loss": 0.013722737319767475, "skip_count": 1.0, "step": 2102, "text_loss": 0.7207565903663635 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.878191957734076, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.0009397097360322276, "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3392892.0, "repeat_count": 0.0, "routers_loss": 0.002051608171314001, "skip_count": 0.0, "step": 2104, "text_loss": 0.3196398913860321 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.887584385089522, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.000939562307173196, "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 3396636.0, "repeat_count": 0.0, "routers_loss": 0.007085663266479969, "skip_count": 0.0, "step": 2106, "text_loss": 0.5663776397705078 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.896976812444967, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11328125, "learning_rate": 0.0009394147098730592, "loss": 0.02, "macro_f1": 0.5492662787437439, "num_tokens": 3399475.0, "repeat_count": 0.0, "routers_loss": 0.019473131746053696, "skip_count": 2.0, "step": 2108, "text_loss": 0.7708223462104797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0009392669441883767, "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3402350.0, "repeat_count": 0.0, "routers_loss": 0.0028328890912234783, "skip_count": 0.0, "step": 2110, "text_loss": 0.5888006091117859 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10693359375, "learning_rate": 0.0009391190101757724, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3405561.0, "repeat_count": 0.0, "routers_loss": 0.023098422214388847, "skip_count": 2.0, "step": 2112, "text_loss": 0.09865197539329529 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.925154094511301, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.000938970907891935, "loss": 0.0247, "macro_f1": 0.3333333432674408, "num_tokens": 3408513.0, "repeat_count": 0.0, "routers_loss": 0.002896632067859173, "skip_count": 0.0, "step": 2114, "text_loss": 0.6613234281539917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0947265625, "learning_rate": 0.0009388226373936179, "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 3411195.0, "repeat_count": 0.0, "routers_loss": 0.015814457088708878, "skip_count": 0.0, "step": 2116, "text_loss": 0.17363053560256958 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.94393894922219, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12451171875, "learning_rate": 0.0009386741987376381, "loss": 0.015, "macro_f1": 0.6603773832321167, "num_tokens": 3414875.0, "repeat_count": 1.0, "routers_loss": 0.02676783688366413, "skip_count": 0.0, "step": 2118, "text_loss": 0.674056887626648 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0009385255919808778, "loss": 0.0203, "macro_f1": 0.6666666865348816, "num_tokens": 3418410.0, "repeat_count": 0.0, "routers_loss": 0.01022857241332531, "skip_count": 1.0, "step": 2120, "text_loss": 0.235092431306839 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.962723803933079, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0888671875, "learning_rate": 0.0009383768171802836, "loss": 0.0244, "macro_f1": 0.5492662787437439, "num_tokens": 3421289.0, "repeat_count": 0.0, "routers_loss": 0.013572212308645248, "skip_count": 2.0, "step": 2122, "text_loss": 0.5992844104766846 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.064453125, "learning_rate": 0.0009382278743928659, "loss": 0.0201, "macro_f1": 0.6666666865348816, "num_tokens": 3424781.0, "repeat_count": 0.0, "routers_loss": 0.0051873656921088696, "skip_count": 2.0, "step": 2124, "text_loss": 0.29915499687194824 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 9.981508658643968, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.07421875, "learning_rate": 0.0009380787636757001, "loss": 0.0155, "macro_f1": 0.6122449040412903, "num_tokens": 3427942.0, "repeat_count": 0.0, "routers_loss": 0.030079292133450508, "skip_count": 4.0, "step": 2126, "text_loss": 0.24181491136550903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0009379294850859256, "loss": 0.0141, "macro_f1": 0.3333333432674408, "num_tokens": 3431314.0, "repeat_count": 0.0, "routers_loss": 0.002675612922757864, "skip_count": 0.0, "step": 2128, "text_loss": 0.4669873118400574 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0009377800386807465, "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 3435020.0, "repeat_count": 0.0, "routers_loss": 0.009334275498986244, "skip_count": 0.0, "step": 2130, "text_loss": 0.6478219628334045 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.009392427355445, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.134765625, "learning_rate": 0.0009376304245174306, "loss": 0.0137, "macro_f1": 0.6000000238418579, "num_tokens": 3438276.0, "repeat_count": 1.0, "routers_loss": 0.038227908313274384, "skip_count": 2.0, "step": 2132, "text_loss": 0.4401201903820038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.018784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041748046875, "learning_rate": 0.0009374806426533104, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3440938.0, "repeat_count": 0.0, "routers_loss": 0.006901399698108435, "skip_count": 0.0, "step": 2134, "text_loss": 0.5948942303657532 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.0009373306931457827, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3444028.0, "repeat_count": 0.0, "routers_loss": 0.0037061909679323435, "skip_count": 0.0, "step": 2136, "text_loss": 0.5349751114845276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056884765625, "learning_rate": 0.0009371805760523086, "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 3448331.0, "repeat_count": 0.0, "routers_loss": 0.0025877030566334724, "skip_count": 0.0, "step": 2138, "text_loss": 0.4591051936149597 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 10.046962136777223, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.07373046875, "learning_rate": 0.0009370302914304129, "loss": 0.0144, "macro_f1": 0.5934640765190125, "num_tokens": 3451434.0, "repeat_count": 0.0, "routers_loss": 0.018742674961686134, "skip_count": 3.0, "step": 2140, "text_loss": 0.23470863699913025 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.056354564132668, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009368798393376851, "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 3454375.0, "repeat_count": 0.0, "routers_loss": 0.02382594160735607, "skip_count": 1.0, "step": 2142, "text_loss": 0.6077954769134521 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.065746991488112, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05517578125, "learning_rate": 0.0009367292198317787, "loss": 0.0164, "macro_f1": 0.5492662787437439, "num_tokens": 3457591.0, "repeat_count": 0.0, "routers_loss": 0.03331060707569122, "skip_count": 2.0, "step": 2144, "text_loss": 0.3691073954105377 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.075139418843557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0009365784329704115, "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3460895.0, "repeat_count": 0.0, "routers_loss": 0.0016955457394942641, "skip_count": 0.0, "step": 2146, "text_loss": 0.3947436511516571 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.084531846199003, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0009364274788113651, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 3464101.0, "repeat_count": 1.0, "routers_loss": 0.006169239990413189, "skip_count": 0.0, "step": 2148, "text_loss": 0.3348555266857147 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 10.093924273554446, "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009362763574124858, "loss": 0.019, "macro_f1": 0.9265305995941162, "num_tokens": 3467417.0, "repeat_count": 3.0, "routers_loss": 0.024033790454268456, "skip_count": 1.0, "step": 2150, "text_loss": 0.496633380651474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0009361250688316829, "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3470917.0, "repeat_count": 0.0, "routers_loss": 0.0024986129719763994, "skip_count": 0.0, "step": 2152, "text_loss": 0.6857671737670898 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0546875, "learning_rate": 0.0009359736131269312, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 3473624.0, "repeat_count": 0.0, "routers_loss": 0.008183322846889496, "skip_count": 1.0, "step": 2154, "text_loss": 0.13883116841316223 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06640625, "learning_rate": 0.0009358219903562684, "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 3476472.0, "repeat_count": 0.0, "routers_loss": 0.011198793537914753, "skip_count": 3.0, "step": 2156, "text_loss": 0.24243666231632233 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.131493982976226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0009356702005777969, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3479688.0, "repeat_count": 0.0, "routers_loss": 0.002520184963941574, "skip_count": 0.0, "step": 2158, "text_loss": 0.6407818794250488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.140886410331671, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009355182438496825, "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3482598.0, "repeat_count": 0.0, "routers_loss": 0.0011065017897635698, "skip_count": 0.0, "step": 2160, "text_loss": 0.7214245796203613 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0009353661202301557, "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 3486271.0, "repeat_count": 0.0, "routers_loss": 0.0017824085662141442, "skip_count": 0.0, "step": 2162, "text_loss": 0.5140969157218933 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.053466796875, "learning_rate": 0.0009352138297775101, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3489206.0, "repeat_count": 0.0, "routers_loss": 0.001542879967018962, "skip_count": 0.0, "step": 2164, "text_loss": 0.7956416606903076 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.169063692398003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.000935061372550104, "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 3492003.0, "repeat_count": 0.0, "routers_loss": 0.01420794241130352, "skip_count": 3.0, "step": 2166, "text_loss": 0.27489882707595825 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.178456119753449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0009349087486063594, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3494784.0, "repeat_count": 0.0, "routers_loss": 0.003614309709519148, "skip_count": 1.0, "step": 2168, "text_loss": 0.2962227761745453 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.187848547108894, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.1259765625, "learning_rate": 0.0009347559580047618, "loss": 0.0175, "macro_f1": 0.8814815282821655, "num_tokens": 3497886.0, "repeat_count": 2.0, "routers_loss": 0.02122853323817253, "skip_count": 4.0, "step": 2170, "text_loss": 0.5919580459594727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.197240974464338, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06396484375, "learning_rate": 0.000934603000803861, "loss": 0.0135, "macro_f1": 0.5492662787437439, "num_tokens": 3500939.0, "repeat_count": 0.0, "routers_loss": 0.02042219042778015, "skip_count": 1.0, "step": 2172, "text_loss": 0.28722381591796875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009344498770622704, "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3504852.0, "repeat_count": 0.0, "routers_loss": 0.004345106892287731, "skip_count": 0.0, "step": 2174, "text_loss": 0.603236734867096 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.216025829175228, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1064453125, "learning_rate": 0.0009342965868386673, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 3508320.0, "repeat_count": 0.0, "routers_loss": 0.00368050136603415, "skip_count": 0.0, "step": 2176, "text_loss": 0.6020491719245911 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.225418256530672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.000934143130191793, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 3511278.0, "repeat_count": 0.0, "routers_loss": 0.013425769284367561, "skip_count": 0.0, "step": 2178, "text_loss": 0.5954724550247192 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060546875, "learning_rate": 0.000933989507180452, "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 3514361.0, "repeat_count": 0.0, "routers_loss": 0.002896249992772937, "skip_count": 0.0, "step": 2180, "text_loss": 0.39175131916999817 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.244203111241562, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.052978515625, "learning_rate": 0.0009338357178635135, "loss": 0.0147, "macro_f1": 0.6603773832321167, "num_tokens": 3517962.0, "repeat_count": 1.0, "routers_loss": 0.011538350023329258, "skip_count": 1.0, "step": 2182, "text_loss": 0.4482830762863159 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.253595538597006, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009336817622999093, "loss": 0.011, "macro_f1": 0.3272727429866791, "num_tokens": 3521299.0, "repeat_count": 1.0, "routers_loss": 0.022787930443882942, "skip_count": 0.0, "step": 2184, "text_loss": 0.35177817940711975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.262987965952451, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009335276405486357, "loss": 0.0139, "macro_f1": 0.3272727429866791, "num_tokens": 3524611.0, "repeat_count": 0.0, "routers_loss": 0.011597735807299614, "skip_count": 1.0, "step": 2186, "text_loss": 0.24868851900100708 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.0009333733526687524, "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 3528012.0, "repeat_count": 0.0, "routers_loss": 0.014253967441618443, "skip_count": 0.0, "step": 2188, "text_loss": 0.3970910310745239 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.054931640625, "learning_rate": 0.000933218898719383, "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3530908.0, "repeat_count": 0.0, "routers_loss": 0.001659149187617004, "skip_count": 0.0, "step": 2190, "text_loss": 0.7618573307991028 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009330642787597141, "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3533993.0, "repeat_count": 0.0, "routers_loss": 0.005574346985667944, "skip_count": 0.0, "step": 2192, "text_loss": 0.16470147669315338 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.300557675374229, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0009329094928489969, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3537310.0, "repeat_count": 0.0, "routers_loss": 0.0026400673668831587, "skip_count": 0.0, "step": 2194, "text_loss": 0.3400416374206543 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.309950102729674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0849609375, "learning_rate": 0.0009327545410465452, "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3540045.0, "repeat_count": 0.0, "routers_loss": 0.008448398672044277, "skip_count": 3.0, "step": 2196, "text_loss": 0.3110542297363281 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.31934253008512, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.0009325994234117372, "loss": 0.0122, "macro_f1": 0.32098764181137085, "num_tokens": 3544097.0, "repeat_count": 0.0, "routers_loss": 0.037553198635578156, "skip_count": 2.0, "step": 2198, "text_loss": 0.36126700043678284 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.328734957440563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09716796875, "learning_rate": 0.000932444140004014, "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3547054.0, "repeat_count": 1.0, "routers_loss": 0.006464479025453329, "skip_count": 0.0, "step": 2200, "text_loss": 0.4947047233581543 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.338127384796008, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0009322886908828805, "loss": 0.0138, "macro_f1": 0.6666666865348816, "num_tokens": 3549903.0, "repeat_count": 1.0, "routers_loss": 0.005384812597185373, "skip_count": 0.0, "step": 2202, "text_loss": 0.5923738479614258 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.0009321330761079052, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3553745.0, "repeat_count": 0.0, "routers_loss": 0.015346619300544262, "skip_count": 2.0, "step": 2204, "text_loss": 0.1904175877571106 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 10.356912239506897, "f1_execute": 0.9268292784690857, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, "grad_norm": 0.06494140625, "learning_rate": 0.00093197729573872, "loss": 0.0203, "macro_f1": 0.8422764539718628, "num_tokens": 3557235.0, "repeat_count": 3.0, "routers_loss": 0.1207597479224205, "skip_count": 6.0, "step": 2206, "text_loss": 0.3904837667942047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.366304666862343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0771484375, "learning_rate": 0.0009318213498350202, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3560795.0, "repeat_count": 0.0, "routers_loss": 0.003334777895361185, "skip_count": 0.0, "step": 2208, "text_loss": 0.4268290102481842 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.375697094217786, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0537109375, "learning_rate": 0.0009316652384565645, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3563754.0, "repeat_count": 0.0, "routers_loss": 0.004230072256177664, "skip_count": 0.0, "step": 2210, "text_loss": 0.40049710869789124 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.385089521573232, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046875, "learning_rate": 0.0009315089616631751, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 3567173.0, "repeat_count": 0.0, "routers_loss": 0.0006645230459980667, "skip_count": 0.0, "step": 2212, "text_loss": 0.42568323016166687 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.394481948928677, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009313525195147376, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3570831.0, "repeat_count": 0.0, "routers_loss": 0.0097877848893404, "skip_count": 0.0, "step": 2214, "text_loss": 0.45808279514312744 }, { "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 10.40387437628412, "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, "grad_norm": 0.076171875, "learning_rate": 0.000931195912071201, "loss": 0.0187, "macro_f1": 0.7018141150474548, "num_tokens": 3573745.0, "repeat_count": 2.0, "routers_loss": 0.07351134717464447, "skip_count": 3.0, "step": 2216, "text_loss": 0.285696804523468 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009310391393925775, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3576785.0, "repeat_count": 0.0, "routers_loss": 0.0033160944003611803, "skip_count": 0.0, "step": 2218, "text_loss": 0.17516443133354187 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.422659230995011, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.047119140625, "learning_rate": 0.0009308822015389424, "loss": 0.0241, "macro_f1": 0.5427350401878357, "num_tokens": 3580695.0, "repeat_count": 1.0, "routers_loss": 0.052930232137441635, "skip_count": 1.0, "step": 2220, "text_loss": 0.5918155908584595 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 10.432051658350455, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.072265625, "learning_rate": 0.0009307250985704352, "loss": 0.0128, "macro_f1": 0.6122449040412903, "num_tokens": 3583729.0, "repeat_count": 0.0, "routers_loss": 0.025454653427004814, "skip_count": 4.0, "step": 2222, "text_loss": 0.2652169466018677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0009305678305472575, "loss": 0.0158, "macro_f1": 0.3333333432674408, "num_tokens": 3586775.0, "repeat_count": 0.0, "routers_loss": 0.011279845610260963, "skip_count": 0.0, "step": 2224, "text_loss": 0.3511691987514496 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.000930410397529675, "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3589676.0, "repeat_count": 0.0, "routers_loss": 0.002700264798477292, "skip_count": 0.0, "step": 2226, "text_loss": 0.24045433104038239 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.460228940416789, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.048095703125, "learning_rate": 0.000930252799578016, "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 3593242.0, "repeat_count": 1.0, "routers_loss": 0.00826631672680378, "skip_count": 2.0, "step": 2228, "text_loss": 0.3777645528316498 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.469621367772234, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0009300950367526728, "loss": 0.0131, "macro_f1": 0.8820862174034119, "num_tokens": 3596807.0, "repeat_count": 2.0, "routers_loss": 0.036221496760845184, "skip_count": 2.0, "step": 2230, "text_loss": 0.502962589263916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0703125, "learning_rate": 0.0009299371091141001, "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3600150.0, "repeat_count": 0.0, "routers_loss": 0.006449893582612276, "skip_count": 0.0, "step": 2232, "text_loss": 0.20256924629211426 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04638671875, "learning_rate": 0.0009297790167228161, "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 3602988.0, "repeat_count": 0.0, "routers_loss": 0.007872486487030983, "skip_count": 2.0, "step": 2234, "text_loss": 0.42476826906204224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.497798649838568, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0009296207596394022, "loss": 0.0101, "macro_f1": 0.32098764181137085, "num_tokens": 3606071.0, "repeat_count": 0.0, "routers_loss": 0.027397040277719498, "skip_count": 2.0, "step": 2236, "text_loss": 0.23432791233062744 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0595703125, "learning_rate": 0.0009294623379245028, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3609389.0, "repeat_count": 0.0, "routers_loss": 0.01042645052075386, "skip_count": 0.0, "step": 2238, "text_loss": 0.16665785014629364 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.516583504549457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0009293037516388252, "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3612105.0, "repeat_count": 0.0, "routers_loss": 0.0012458425480872393, "skip_count": 0.0, "step": 2240, "text_loss": 0.59421306848526 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 10.525975931904902, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0009291450008431404, "loss": 0.0185, "macro_f1": 1.0, "num_tokens": 3615439.0, "repeat_count": 1.0, "routers_loss": 0.005781981628388166, "skip_count": 1.0, "step": 2242, "text_loss": 0.510798454284668 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 10.535368359260346, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.0966796875, "learning_rate": 0.0009289860855982814, "loss": 0.0166, "macro_f1": 0.4871794879436493, "num_tokens": 3618842.0, "repeat_count": 0.0, "routers_loss": 0.031195320188999176, "skip_count": 3.0, "step": 2244, "text_loss": 0.7574363350868225 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.544760786615791, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04931640625, "learning_rate": 0.0009288270059651454, "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 3621823.0, "repeat_count": 0.0, "routers_loss": 0.001746491645462811, "skip_count": 0.0, "step": 2246, "text_loss": 0.5125683546066284 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.554153213971237, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.220703125, "learning_rate": 0.0009286677620046918, "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3624502.0, "repeat_count": 0.0, "routers_loss": 0.03792348504066467, "skip_count": 2.0, "step": 2248, "text_loss": 0.7533677220344543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009285083537779429, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3627057.0, "repeat_count": 0.0, "routers_loss": 0.0009684451506473124, "skip_count": 0.0, "step": 2250, "text_loss": 0.2219279706478119 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.572938068682125, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11767578125, "learning_rate": 0.0009283487813459845, "loss": 0.0148, "macro_f1": 0.5492662787437439, "num_tokens": 3629720.0, "repeat_count": 0.0, "routers_loss": 0.022757573053240776, "skip_count": 2.0, "step": 2252, "text_loss": 0.6903313994407654 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.582330496037569, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009281890447699652, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 3633234.0, "repeat_count": 1.0, "routers_loss": 0.003613058477640152, "skip_count": 0.0, "step": 2254, "text_loss": 0.6278893351554871 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0009280291441110961, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3636289.0, "repeat_count": 0.0, "routers_loss": 0.006214062683284283, "skip_count": 0.0, "step": 2256, "text_loss": 0.3011114001274109 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.60111535074846, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.041015625, "learning_rate": 0.0009278690794306517, "loss": 0.014, "macro_f1": 0.5492662787437439, "num_tokens": 3640251.0, "repeat_count": 0.0, "routers_loss": 0.052556321024894714, "skip_count": 2.0, "step": 2258, "text_loss": 0.19894185662269592 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 10.610507778103903, "f1_execute": 0.978723406791687, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, "grad_norm": 0.08251953125, "learning_rate": 0.0009277088507899689, "loss": 0.0163, "macro_f1": 0.9452888369560242, "num_tokens": 3643527.0, "repeat_count": 4.0, "routers_loss": 0.0572301521897316, "skip_count": 1.0, "step": 2260, "text_loss": 0.5593410134315491 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.619900205459349, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0009275484582504475, "loss": 0.0104, "macro_f1": 0.3333333432674408, "num_tokens": 3646959.0, "repeat_count": 0.0, "routers_loss": 0.008010074496269226, "skip_count": 0.0, "step": 2262, "text_loss": 0.2128177285194397 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 10.629292632814794, "f1_execute": 0.95652174949646, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, "grad_norm": 0.05419921875, "learning_rate": 0.0009273879018735505, "loss": 0.0138, "macro_f1": 0.8521739840507507, "num_tokens": 3651298.0, "repeat_count": 3.0, "routers_loss": 0.035729870200157166, "skip_count": 3.0, "step": 2264, "text_loss": 0.2987811267375946 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.638685060170237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009272271817208031, "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 3655609.0, "repeat_count": 0.0, "routers_loss": 0.002379779238253832, "skip_count": 0.0, "step": 2266, "text_loss": 0.6024088263511658 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0009270662978537939, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 3658444.0, "repeat_count": 0.0, "routers_loss": 0.008943650871515274, "skip_count": 0.0, "step": 2268, "text_loss": 0.1741207242012024 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 10.657469914881126, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0009269052503341736, "loss": 0.0161, "macro_f1": 0.6595745086669922, "num_tokens": 3662282.0, "repeat_count": 1.0, "routers_loss": 0.030201267451047897, "skip_count": 4.0, "step": 2270, "text_loss": 0.7300035953521729 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.666862342236572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.0009267440392236562, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 3665531.0, "repeat_count": 0.0, "routers_loss": 0.0026635683607310057, "skip_count": 0.0, "step": 2272, "text_loss": 0.31535038352012634 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0009265826645840178, "loss": 0.0151, "macro_f1": 0.3333333432674408, "num_tokens": 3668407.0, "repeat_count": 0.0, "routers_loss": 0.004258926957845688, "skip_count": 0.0, "step": 2274, "text_loss": 0.7272579073905945 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 10.68564719694746, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.125, "learning_rate": 0.0009264211264770976, "loss": 0.0154, "macro_f1": 0.6122449040412903, "num_tokens": 3671503.0, "repeat_count": 0.0, "routers_loss": 0.038987524807453156, "skip_count": 4.0, "step": 2276, "text_loss": 0.7488982677459717 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.695039624302906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.099609375, "learning_rate": 0.0009262594249647975, "loss": 0.0164, "macro_f1": 0.6666666865348816, "num_tokens": 3674107.0, "repeat_count": 0.0, "routers_loss": 0.007211760152131319, "skip_count": 1.0, "step": 2278, "text_loss": 0.1992369294166565 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 10.704432051658351, "f1_execute": 0.9767441749572754, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, "grad_norm": 0.0546875, "learning_rate": 0.0009260975601090815, "loss": 0.0112, "macro_f1": 0.9446290731430054, "num_tokens": 3677184.0, "repeat_count": 4.0, "routers_loss": 0.02538592554628849, "skip_count": 3.0, "step": 2280, "text_loss": 0.46402135491371155 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0009259355319719768, "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3680683.0, "repeat_count": 0.0, "routers_loss": 0.0038464947137981653, "skip_count": 0.0, "step": 2282, "text_loss": 0.5804527401924133 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009257733406155726, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3683928.0, "repeat_count": 0.0, "routers_loss": 0.004841136280447245, "skip_count": 0.0, "step": 2284, "text_loss": 0.4834538400173187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0009256109861020212, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 3687101.0, "repeat_count": 0.0, "routers_loss": 0.002191900508478284, "skip_count": 0.0, "step": 2286, "text_loss": 0.8199604749679565 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.742001761080129, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0927734375, "learning_rate": 0.000925448468493537, "loss": 0.0162, "macro_f1": 0.5427350401878357, "num_tokens": 3690490.0, "repeat_count": 1.0, "routers_loss": 0.03488675877451897, "skip_count": 2.0, "step": 2288, "text_loss": 0.33263635635375977 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.751394188435574, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0009252857878523971, "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 3694109.0, "repeat_count": 1.0, "routers_loss": 0.002897309372201562, "skip_count": 0.0, "step": 2290, "text_loss": 0.47494807839393616 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.760786615791018, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05810546875, "learning_rate": 0.000925122944240941, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 3697233.0, "repeat_count": 0.0, "routers_loss": 0.01842675730586052, "skip_count": 2.0, "step": 2292, "text_loss": 0.14693495631217957 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 10.770179043146463, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.045654296875, "learning_rate": 0.0009249599377215707, "loss": 0.0146, "macro_f1": 0.5866667032241821, "num_tokens": 3700376.0, "repeat_count": 1.0, "routers_loss": 0.04169808700680733, "skip_count": 3.0, "step": 2294, "text_loss": 0.38051268458366394 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.779571470501908, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05908203125, "learning_rate": 0.0009247967683567507, "loss": 0.0112, "macro_f1": 0.3272727429866791, "num_tokens": 3703212.0, "repeat_count": 0.0, "routers_loss": 0.012183113023638725, "skip_count": 1.0, "step": 2296, "text_loss": 0.23789077997207642 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 10.788963897857352, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05712890625, "learning_rate": 0.0009246334362090077, "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3706490.0, "repeat_count": 1.0, "routers_loss": 0.01880069635808468, "skip_count": 2.0, "step": 2298, "text_loss": 0.29067978262901306 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.798356325212797, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.000924469941340931, "loss": 0.0173, "macro_f1": 0.3272727429866791, "num_tokens": 3709804.0, "repeat_count": 1.0, "routers_loss": 0.027359159663319588, "skip_count": 0.0, "step": 2300, "text_loss": 0.67828369140625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.807748752568243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.000924306283815172, "loss": 0.0153, "macro_f1": 0.3333333432674408, "num_tokens": 3712824.0, "repeat_count": 0.0, "routers_loss": 0.003152279881760478, "skip_count": 0.0, "step": 2302, "text_loss": 0.8333184719085693 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.817141179923686, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0703125, "learning_rate": 0.0009241424636944445, "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3715385.0, "repeat_count": 0.0, "routers_loss": 0.0442950464785099, "skip_count": 2.0, "step": 2304, "text_loss": 0.41893699765205383 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 10.826533607279131, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.058837890625, "learning_rate": 0.0009239784810415249, "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3719080.0, "repeat_count": 1.0, "routers_loss": 0.015729321166872978, "skip_count": 2.0, "step": 2306, "text_loss": 0.13360483944416046 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 10.835926034634575, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.06787109375, "learning_rate": 0.0009238143359192514, "loss": 0.0136, "macro_f1": 0.5934640765190125, "num_tokens": 3722439.0, "repeat_count": 0.0, "routers_loss": 0.028816604986786842, "skip_count": 3.0, "step": 2308, "text_loss": 0.39594101905822754 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05419921875, "learning_rate": 0.000923650028390525, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3725092.0, "repeat_count": 0.0, "routers_loss": 0.0036455015651881695, "skip_count": 2.0, "step": 2310, "text_loss": 0.6169708371162415 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09814453125, "learning_rate": 0.0009234855585183086, "loss": 0.014, "macro_f1": 0.6666666865348816, "num_tokens": 3728412.0, "repeat_count": 0.0, "routers_loss": 0.007565604057163, "skip_count": 1.0, "step": 2312, "text_loss": 0.21257059276103973 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 10.86410331670091, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0517578125, "learning_rate": 0.0009233209263656273, "loss": 0.0184, "macro_f1": 0.9262410998344421, "num_tokens": 3731467.0, "repeat_count": 2.0, "routers_loss": 0.02510629966855049, "skip_count": 3.0, "step": 2314, "text_loss": 0.21639840304851532 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057861328125, "learning_rate": 0.0009231561319955684, "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3734906.0, "repeat_count": 0.0, "routers_loss": 0.00872227642685175, "skip_count": 0.0, "step": 2316, "text_loss": 0.35639774799346924 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0009229911754712815, "loss": 0.0176, "macro_f1": 0.3333333432674408, "num_tokens": 3737943.0, "repeat_count": 0.0, "routers_loss": 0.004695790819823742, "skip_count": 0.0, "step": 2318, "text_loss": 0.5269573330879211 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.892280598767243, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.0009228260568559781, "loss": 0.0115, "macro_f1": 0.3272727429866791, "num_tokens": 3741833.0, "repeat_count": 1.0, "routers_loss": 0.0217357836663723, "skip_count": 0.0, "step": 2320, "text_loss": 0.5110208988189697 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.901673026122689, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.0009226607762129322, "loss": 0.0201, "macro_f1": 0.32098764181137085, "num_tokens": 3744642.0, "repeat_count": 1.0, "routers_loss": 0.05595960095524788, "skip_count": 1.0, "step": 2322, "text_loss": 0.6291998624801636 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056884765625, "learning_rate": 0.0009224953336054796, "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3748127.0, "repeat_count": 0.0, "routers_loss": 0.0071634589694440365, "skip_count": 0.0, "step": 2324, "text_loss": 0.7404762506484985 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.000922329729097018, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3751373.0, "repeat_count": 0.0, "routers_loss": 0.0011676300782710314, "skip_count": 0.0, "step": 2326, "text_loss": 0.2915459871292114 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061279296875, "learning_rate": 0.0009221639627510075, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3754518.0, "repeat_count": 0.0, "routers_loss": 0.01039792038500309, "skip_count": 0.0, "step": 2328, "text_loss": 0.22066321969032288 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0751953125, "learning_rate": 0.0009219980346309702, "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3757621.0, "repeat_count": 0.0, "routers_loss": 0.0032070958986878395, "skip_count": 0.0, "step": 2330, "text_loss": 0.5558560490608215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.948635162899912, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.076171875, "learning_rate": 0.0009218319448004899, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3760885.0, "repeat_count": 0.0, "routers_loss": 0.007085457909852266, "skip_count": 0.0, "step": 2332, "text_loss": 0.4348253607749939 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.958027590255357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1103515625, "learning_rate": 0.0009216656933232129, "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 3764462.0, "repeat_count": 0.0, "routers_loss": 0.005504854489117861, "skip_count": 1.0, "step": 2334, "text_loss": 0.35828644037246704 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0009214992802628463, "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3767159.0, "repeat_count": 0.0, "routers_loss": 0.0013970810687169433, "skip_count": 0.0, "step": 2336, "text_loss": 0.2956557869911194 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.976812444966246, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0009213327056831607, "loss": 0.0181, "macro_f1": 0.3272727429866791, "num_tokens": 3770408.0, "repeat_count": 0.0, "routers_loss": 0.0427570566534996, "skip_count": 1.0, "step": 2338, "text_loss": 0.14883014559745789 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.986204872321691, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0009211659696479875, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 3773474.0, "repeat_count": 0.0, "routers_loss": 0.0011273405980318785, "skip_count": 0.0, "step": 2340, "text_loss": 0.26011669635772705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.995597299677135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.00092099907222122, "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3776909.0, "repeat_count": 0.0, "routers_loss": 0.0016178421210497618, "skip_count": 0.0, "step": 2342, "text_loss": 0.49078530073165894 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.004696213677722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.000920832013466814, "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 3780741.0, "repeat_count": 0.0, "routers_loss": 0.005510095041245222, "skip_count": 0.0, "step": 2344, "text_loss": 0.4870249927043915 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.014088641033167, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0009206647934487866, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3784673.0, "repeat_count": 1.0, "routers_loss": 0.0047357892617583275, "skip_count": 0.0, "step": 2346, "text_loss": 0.3251725733280182 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05615234375, "learning_rate": 0.0009204974122312167, "loss": 0.0142, "macro_f1": 0.6666666865348816, "num_tokens": 3787503.0, "repeat_count": 0.0, "routers_loss": 0.00795028731226921, "skip_count": 1.0, "step": 2348, "text_loss": 0.18282145261764526 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.032873495744056, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.060546875, "learning_rate": 0.0009203298698782452, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 3790528.0, "repeat_count": 1.0, "routers_loss": 0.0009506374481134117, "skip_count": 0.0, "step": 2350, "text_loss": 0.4093080461025238 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.042265923099501, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.047607421875, "learning_rate": 0.0009201621664540747, "loss": 0.0155, "macro_f1": 0.6666666865348816, "num_tokens": 3794134.0, "repeat_count": 1.0, "routers_loss": 0.005159572698175907, "skip_count": 0.0, "step": 2352, "text_loss": 0.5451981425285339 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.051658350454945, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009199943020229694, "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3797414.0, "repeat_count": 0.0, "routers_loss": 0.002356168581172824, "skip_count": 0.0, "step": 2354, "text_loss": 0.3070453405380249 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0810546875, "learning_rate": 0.0009198262766492554, "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 3800094.0, "repeat_count": 0.0, "routers_loss": 0.0051761893555521965, "skip_count": 1.0, "step": 2356, "text_loss": 0.5880904197692871 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.070443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.00091965809039732, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3803280.0, "repeat_count": 0.0, "routers_loss": 0.0025952060241252184, "skip_count": 0.0, "step": 2358, "text_loss": 0.5210731625556946 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.079835632521279, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06787109375, "learning_rate": 0.0009194897433316127, "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 3805866.0, "repeat_count": 0.0, "routers_loss": 0.0042560105212032795, "skip_count": 2.0, "step": 2360, "text_loss": 0.6472984552383423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009193212355166446, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3808952.0, "repeat_count": 0.0, "routers_loss": 0.0026232977397739887, "skip_count": 0.0, "step": 2362, "text_loss": 0.450063556432724 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0009191525670169881, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3812080.0, "repeat_count": 0.0, "routers_loss": 0.0034355956595391035, "skip_count": 0.0, "step": 2364, "text_loss": 0.49727216362953186 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.108012914587613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.000918983737897277, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3815282.0, "repeat_count": 0.0, "routers_loss": 0.0055653867311775684, "skip_count": 1.0, "step": 2366, "text_loss": 0.6336377859115601 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.117405341943059, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0009188147482222071, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 3818106.0, "repeat_count": 2.0, "routers_loss": 0.011016021482646465, "skip_count": 2.0, "step": 2368, "text_loss": 0.22513329982757568 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.126797769298504, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0009186455980565358, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3821228.0, "repeat_count": 1.0, "routers_loss": 0.014039464294910431, "skip_count": 0.0, "step": 2370, "text_loss": 0.21331638097763062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009184762874650816, "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3825048.0, "repeat_count": 0.0, "routers_loss": 0.001088051125407219, "skip_count": 0.0, "step": 2372, "text_loss": 0.6031543612480164 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.095703125, "learning_rate": 0.0009183068165127245, "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3828781.0, "repeat_count": 0.0, "routers_loss": 0.006263940595090389, "skip_count": 1.0, "step": 2374, "text_loss": 0.6249601244926453 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.154975051364836, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06982421875, "learning_rate": 0.0009181371852644062, "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 3832507.0, "repeat_count": 1.0, "routers_loss": 0.001987969037145376, "skip_count": 0.0, "step": 2376, "text_loss": 0.37972065806388855 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.164367478720282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0908203125, "learning_rate": 0.0009179673937851299, "loss": 0.0158, "macro_f1": 0.6666666865348816, "num_tokens": 3835644.0, "repeat_count": 0.0, "routers_loss": 0.007635094691067934, "skip_count": 1.0, "step": 2378, "text_loss": 0.46319663524627686 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0830078125, "learning_rate": 0.0009177974421399598, "loss": 0.0137, "macro_f1": 0.6666666865348816, "num_tokens": 3838700.0, "repeat_count": 0.0, "routers_loss": 0.01617279462516308, "skip_count": 2.0, "step": 2380, "text_loss": 0.32141056656837463 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056396484375, "learning_rate": 0.0009176273303940217, "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 3841953.0, "repeat_count": 0.0, "routers_loss": 0.0022273799404501915, "skip_count": 2.0, "step": 2382, "text_loss": 0.5908139944076538 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.192544760786616, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0009174570586125026, "loss": 0.0122, "macro_f1": 0.32098767161369324, "num_tokens": 3845763.0, "repeat_count": 1.0, "routers_loss": 0.030915161594748497, "skip_count": 0.0, "step": 2384, "text_loss": 0.41400137543678284 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.201937188142061, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04248046875, "learning_rate": 0.0009172866268606513, "loss": 0.0122, "macro_f1": 0.6666666865348816, "num_tokens": 3848984.0, "repeat_count": 0.0, "routers_loss": 0.010480951517820358, "skip_count": 2.0, "step": 2386, "text_loss": 0.2560874819755554 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056396484375, "learning_rate": 0.0009171160352037775, "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3852118.0, "repeat_count": 0.0, "routers_loss": 0.00809961836785078, "skip_count": 1.0, "step": 2388, "text_loss": 0.28236693143844604 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.22072204285295, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0009169452837072521, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 3855314.0, "repeat_count": 1.0, "routers_loss": 0.005569872446358204, "skip_count": 1.0, "step": 2390, "text_loss": 0.4578137695789337 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1123046875, "learning_rate": 0.0009167743724365073, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3858301.0, "repeat_count": 0.0, "routers_loss": 0.0038610948249697685, "skip_count": 1.0, "step": 2392, "text_loss": 0.14082716405391693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009166033014570368, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3861296.0, "repeat_count": 0.0, "routers_loss": 0.0017607157351449132, "skip_count": 0.0, "step": 2394, "text_loss": 0.384442001581192 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 11.248899324919284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.0009164320708343954, "loss": 0.0131, "macro_f1": 0.6666666865348816, "num_tokens": 3863985.0, "repeat_count": 2.0, "routers_loss": 0.009627950377762318, "skip_count": 0.0, "step": 2396, "text_loss": 0.6969521045684814 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.258291752274728, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0009162606806341989, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 3866636.0, "repeat_count": 0.0, "routers_loss": 0.006915586534887552, "skip_count": 0.0, "step": 2398, "text_loss": 0.48069697618484497 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.267684179630173, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.0009160891309221242, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3870867.0, "repeat_count": 1.0, "routers_loss": 0.0013031222624704242, "skip_count": 0.0, "step": 2400, "text_loss": 0.3882075846195221 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.277076606985618, "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0009159174217639096, "loss": 0.0112, "macro_f1": 0.5427350401878357, "num_tokens": 3873663.0, "repeat_count": 2.0, "routers_loss": 0.06621067970991135, "skip_count": 1.0, "step": 2402, "text_loss": 0.5740041136741638 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.286469034341062, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0009157455532253547, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3876788.0, "repeat_count": 1.0, "routers_loss": 0.005957918707281351, "skip_count": 0.0, "step": 2404, "text_loss": 0.26025933027267456 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 11.295861461696507, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.08642578125, "learning_rate": 0.0009155735253723191, "loss": 0.0126, "macro_f1": 0.9452888369560242, "num_tokens": 3879942.0, "repeat_count": 1.0, "routers_loss": 0.039429809898138046, "skip_count": 4.0, "step": 2406, "text_loss": 1.1349908113479614 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.305253889051952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047607421875, "learning_rate": 0.0009154013382707251, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3882682.0, "repeat_count": 0.0, "routers_loss": 0.0012570557883009315, "skip_count": 0.0, "step": 2408, "text_loss": 0.5611135363578796 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.314646316407396, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0009152289919865543, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3886425.0, "repeat_count": 0.0, "routers_loss": 0.0017455556662753224, "skip_count": 0.0, "step": 2410, "text_loss": 0.7523751854896545 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.324038743762841, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0009150564865858506, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3889273.0, "repeat_count": 0.0, "routers_loss": 0.011178011074662209, "skip_count": 1.0, "step": 2412, "text_loss": 0.26942551136016846 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 11.333431171118287, "f1_execute": 0.9803921580314636, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.0009148838221347182, "loss": 0.0107, "macro_f1": 0.5934640765190125, "num_tokens": 3892199.0, "repeat_count": 3.0, "routers_loss": 0.019628092646598816, "skip_count": 0.0, "step": 2414, "text_loss": 0.5492315888404846 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.34282359847373, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04541015625, "learning_rate": 0.0009147109986993225, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 3895362.0, "repeat_count": 1.0, "routers_loss": 0.012255983427166939, "skip_count": 0.0, "step": 2416, "text_loss": 0.23798216879367828 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0009145380163458899, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3898476.0, "repeat_count": 0.0, "routers_loss": 0.007018954027444124, "skip_count": 0.0, "step": 2418, "text_loss": 0.1923145055770874 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.361608453184619, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0009143648751407074, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 3901817.0, "repeat_count": 0.0, "routers_loss": 0.0008574824314564466, "skip_count": 0.0, "step": 2420, "text_loss": 0.4001806974411011 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 11.371000880540064, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11328125, "learning_rate": 0.0009141915751501231, "loss": 0.0102, "macro_f1": 0.5492662787437439, "num_tokens": 3905461.0, "repeat_count": 0.0, "routers_loss": 0.01572350226342678, "skip_count": 2.0, "step": 2422, "text_loss": 0.19519129395484924 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0009140181164405458, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3908878.0, "repeat_count": 0.0, "routers_loss": 0.0005503420252352953, "skip_count": 0.0, "step": 2424, "text_loss": 0.6937088370323181 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.0009138444990784454, "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3912053.0, "repeat_count": 0.0, "routers_loss": 0.007556677330285311, "skip_count": 0.0, "step": 2426, "text_loss": 0.35431069135665894 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.399178162606399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.000913670723130352, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3915192.0, "repeat_count": 0.0, "routers_loss": 0.0013609991874545813, "skip_count": 0.0, "step": 2428, "text_loss": 0.5171207189559937 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.408570589961844, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.050048828125, "learning_rate": 0.0009134967886628573, "loss": 0.0115, "macro_f1": 1.0, "num_tokens": 3917927.0, "repeat_count": 2.0, "routers_loss": 0.010895746760070324, "skip_count": 2.0, "step": 2430, "text_loss": 0.2852934002876282 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.417963017317287, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009133226957426133, "loss": 0.0132, "macro_f1": 0.5492662787437439, "num_tokens": 3921460.0, "repeat_count": 2.0, "routers_loss": 0.04196908697485924, "skip_count": 0.0, "step": 2432, "text_loss": 0.4864770770072937 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.427355444672733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0009131484444363324, "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3924662.0, "repeat_count": 0.0, "routers_loss": 0.004484197124838829, "skip_count": 0.0, "step": 2434, "text_loss": 0.7568684220314026 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0009129740348107882, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3927337.0, "repeat_count": 0.0, "routers_loss": 0.004351360257714987, "skip_count": 2.0, "step": 2436, "text_loss": 0.5953161716461182 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 11.446140299383622, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.04736328125, "learning_rate": 0.0009127994669328151, "loss": 0.0085, "macro_f1": 0.6122449040412903, "num_tokens": 3930407.0, "repeat_count": 0.0, "routers_loss": 0.01664198748767376, "skip_count": 4.0, "step": 2438, "text_loss": 0.5320524573326111 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.455532726739067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0595703125, "learning_rate": 0.0009126247408693071, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3933184.0, "repeat_count": 0.0, "routers_loss": 0.0017819046042859554, "skip_count": 1.0, "step": 2440, "text_loss": 0.6051273345947266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0009124498566872204, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 3936620.0, "repeat_count": 0.0, "routers_loss": 0.005519696045666933, "skip_count": 0.0, "step": 2442, "text_loss": 0.12987950444221497 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.474317581449956, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0009122748144535704, "loss": 0.0111, "macro_f1": 0.32098764181137085, "num_tokens": 3940010.0, "repeat_count": 0.0, "routers_loss": 0.04543351009488106, "skip_count": 2.0, "step": 2444, "text_loss": 0.4642033576965332 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.483710008805401, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0009120996142354338, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3943135.0, "repeat_count": 0.0, "routers_loss": 0.00550565542653203, "skip_count": 0.0, "step": 2446, "text_loss": 0.5697627067565918 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.493102436160845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.0009119242560999477, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3946650.0, "repeat_count": 0.0, "routers_loss": 0.008842485956847668, "skip_count": 0.0, "step": 2448, "text_loss": 0.17046524584293365 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.50249486351629, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08154296875, "learning_rate": 0.0009117487401143095, "loss": 0.0154, "macro_f1": 0.6666666865348816, "num_tokens": 3949470.0, "repeat_count": 1.0, "routers_loss": 0.005900127813220024, "skip_count": 0.0, "step": 2450, "text_loss": 0.37260866165161133 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.511887290871735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0009115730663457773, "loss": 0.0137, "macro_f1": 1.0, "num_tokens": 3952546.0, "repeat_count": 1.0, "routers_loss": 0.003409258322790265, "skip_count": 1.0, "step": 2452, "text_loss": 0.5308008193969727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05224609375, "learning_rate": 0.0009113972348616698, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 3955817.0, "repeat_count": 0.0, "routers_loss": 0.010098597034811974, "skip_count": 1.0, "step": 2454, "text_loss": 0.39226648211479187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 11.530672145582624, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1640625, "learning_rate": 0.0009112212457293658, "loss": 0.0102, "macro_f1": 0.3272727429866791, "num_tokens": 3958911.0, "repeat_count": 0.0, "routers_loss": 0.08184818178415298, "skip_count": 0.0, "step": 2456, "text_loss": 0.45411455631256104 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0009110450990163047, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3962584.0, "repeat_count": 0.0, "routers_loss": 0.0009352223132736981, "skip_count": 0.0, "step": 2458, "text_loss": 0.47292324900627136 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.549457000293513, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.0009108687947899863, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 3965597.0, "repeat_count": 1.0, "routers_loss": 0.008150188252329826, "skip_count": 2.0, "step": 2460, "text_loss": 0.33208340406417847 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 11.558849427648958, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.043212890625, "learning_rate": 0.0009106923331179707, "loss": 0.0125, "macro_f1": 0.5492662787437439, "num_tokens": 3968664.0, "repeat_count": 0.0, "routers_loss": 0.050999004393815994, "skip_count": 2.0, "step": 2462, "text_loss": 0.2459995150566101 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009105157140678782, "loss": 0.0126, "macro_f1": 0.6666666865348816, "num_tokens": 3971772.0, "repeat_count": 0.0, "routers_loss": 0.006196586415171623, "skip_count": 1.0, "step": 2464, "text_loss": 0.23956991732120514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.577634282359847, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009103389377073896, "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 3976224.0, "repeat_count": 0.0, "routers_loss": 0.008181816898286343, "skip_count": 0.0, "step": 2466, "text_loss": 0.3235875070095062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057373046875, "learning_rate": 0.0009101620041042462, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3978876.0, "repeat_count": 0.0, "routers_loss": 0.0015451472718268633, "skip_count": 0.0, "step": 2468, "text_loss": 0.4038759469985962 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.596419137070736, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09130859375, "learning_rate": 0.000909984913326249, "loss": 0.0131, "macro_f1": 0.3272727429866791, "num_tokens": 3981992.0, "repeat_count": 0.0, "routers_loss": 0.021785033866763115, "skip_count": 1.0, "step": 2470, "text_loss": 0.6346460580825806 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.605811564426181, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0009098076654412595, "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 3984560.0, "repeat_count": 0.0, "routers_loss": 0.0011462471447885036, "skip_count": 0.0, "step": 2472, "text_loss": 0.3449646532535553 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.0009096302605171996, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3987548.0, "repeat_count": 0.0, "routers_loss": 0.0014367027906700969, "skip_count": 0.0, "step": 2474, "text_loss": 0.5918350219726562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 0.0009094526986220513, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 3990727.0, "repeat_count": 0.0, "routers_loss": 0.0008977655088528991, "skip_count": 0.0, "step": 2476, "text_loss": 0.463350385427475 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.633988846492516, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.0009092749798238563, "loss": 0.015, "macro_f1": 0.3272727429866791, "num_tokens": 3993757.0, "repeat_count": 1.0, "routers_loss": 0.016712551936507225, "skip_count": 0.0, "step": 2478, "text_loss": 0.5621229410171509 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.643381273847961, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.000909097104190717, "loss": 0.0172, "macro_f1": 0.32098764181137085, "num_tokens": 3997259.0, "repeat_count": 0.0, "routers_loss": 0.04134179651737213, "skip_count": 2.0, "step": 2480, "text_loss": 0.375476598739624 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0009089190717907956, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4000563.0, "repeat_count": 0.0, "routers_loss": 0.003462378401309252, "skip_count": 0.0, "step": 2482, "text_loss": 0.5553798675537109 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.66216612855885, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.0009087408826923146, "loss": 0.0182, "macro_f1": 0.6666666865348816, "num_tokens": 4004065.0, "repeat_count": 0.0, "routers_loss": 0.008057428523898125, "skip_count": 2.0, "step": 2484, "text_loss": 0.4329465329647064 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.671558555914293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0009085625369635564, "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4007119.0, "repeat_count": 0.0, "routers_loss": 0.005759050603955984, "skip_count": 0.0, "step": 2486, "text_loss": 0.501268744468689 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.680950983269739, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009083840346728631, "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 4010547.0, "repeat_count": 1.0, "routers_loss": 0.020763102918863297, "skip_count": 0.0, "step": 2488, "text_loss": 0.480196475982666 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.690343410625184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0009082053758886374, "loss": 0.0117, "macro_f1": 0.6666666865348816, "num_tokens": 4014600.0, "repeat_count": 0.0, "routers_loss": 0.005801836494356394, "skip_count": 1.0, "step": 2490, "text_loss": 0.18249782919883728 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.699735837980628, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.062255859375, "learning_rate": 0.0009080265606793416, "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 4017964.0, "repeat_count": 1.0, "routers_loss": 0.004226063843816519, "skip_count": 1.0, "step": 2492, "text_loss": 0.6573076248168945 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.049072265625, "learning_rate": 0.000907847589113498, "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 4020694.0, "repeat_count": 0.0, "routers_loss": 0.004281101748347282, "skip_count": 2.0, "step": 2494, "text_loss": 0.3944586217403412 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.718520692691518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061279296875, "learning_rate": 0.000907668461259689, "loss": 0.0152, "macro_f1": 0.6666666865348816, "num_tokens": 4023757.0, "repeat_count": 0.0, "routers_loss": 0.008786370046436787, "skip_count": 1.0, "step": 2496, "text_loss": 0.6452898979187012 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.727913120046962, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0009074891771865566, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4026601.0, "repeat_count": 0.0, "routers_loss": 0.005209595896303654, "skip_count": 0.0, "step": 2498, "text_loss": 0.9633619785308838 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 11.737305547402407, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0009073097369628028, "loss": 0.013, "macro_f1": 1.0, "num_tokens": 4030321.0, "repeat_count": 3.0, "routers_loss": 0.00860709697008133, "skip_count": 1.0, "step": 2500, "text_loss": 0.48566827178001404 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0009071301406571893, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4033234.0, "repeat_count": 0.0, "routers_loss": 0.0035277456045150757, "skip_count": 0.0, "step": 2502, "text_loss": 0.3771554231643677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.000906950388338538, "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 4036417.0, "repeat_count": 0.0, "routers_loss": 0.0013424850767478347, "skip_count": 0.0, "step": 2504, "text_loss": 0.8962806463241577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.765482829468741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0009067704800757301, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4039564.0, "repeat_count": 0.0, "routers_loss": 0.0010423909407109022, "skip_count": 0.0, "step": 2506, "text_loss": 0.43170279264450073 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.774875256824185, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.000906590415937707, "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 4043212.0, "repeat_count": 0.0, "routers_loss": 0.021780289709568024, "skip_count": 1.0, "step": 2508, "text_loss": 0.41495826840400696 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.78426768417963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0009064101959934696, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4046687.0, "repeat_count": 0.0, "routers_loss": 0.007261929102241993, "skip_count": 1.0, "step": 2510, "text_loss": 0.21821187436580658 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.057861328125, "learning_rate": 0.0009062298203120783, "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4050735.0, "repeat_count": 0.0, "routers_loss": 0.007447180338203907, "skip_count": 2.0, "step": 2512, "text_loss": 0.1818767935037613 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.803052538890519, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0009060492889626535, "loss": 0.0142, "macro_f1": 0.3272727429866791, "num_tokens": 4054426.0, "repeat_count": 1.0, "routers_loss": 0.0718490406870842, "skip_count": 0.0, "step": 2514, "text_loss": 0.22798970341682434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.812444966245964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.099609375, "learning_rate": 0.0009058686020143753, "loss": 0.0183, "macro_f1": 0.3333333432674408, "num_tokens": 4057615.0, "repeat_count": 0.0, "routers_loss": 0.0052676633931696415, "skip_count": 0.0, "step": 2516, "text_loss": 0.1712338626384735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0009056877595364832, "loss": 0.0137, "macro_f1": 0.3333333432674408, "num_tokens": 4060338.0, "repeat_count": 0.0, "routers_loss": 0.0018052728846669197, "skip_count": 0.0, "step": 2518, "text_loss": 0.6811438798904419 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.083984375, "learning_rate": 0.0009055067615982761, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4062887.0, "repeat_count": 0.0, "routers_loss": 0.0009029926732182503, "skip_count": 0.0, "step": 2520, "text_loss": 0.5480356812477112 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.840622248312298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051025390625, "learning_rate": 0.0009053256082691133, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 4065357.0, "repeat_count": 0.0, "routers_loss": 0.0027515271212905645, "skip_count": 0.0, "step": 2522, "text_loss": 0.5234101414680481 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0009051442996184127, "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 4068111.0, "repeat_count": 0.0, "routers_loss": 0.002199822571128607, "skip_count": 0.0, "step": 2524, "text_loss": 0.2418575882911682 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0625, "learning_rate": 0.0009049628357156521, "loss": 0.0143, "macro_f1": 0.6666666865348816, "num_tokens": 4071284.0, "repeat_count": 0.0, "routers_loss": 0.006303096655756235, "skip_count": 2.0, "step": 2526, "text_loss": 0.7948065996170044 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.868799530378633, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.000904781216630369, "loss": 0.0068, "macro_f1": 0.6601307392120361, "num_tokens": 4074750.0, "repeat_count": 1.0, "routers_loss": 0.01791904680430889, "skip_count": 2.0, "step": 2528, "text_loss": 0.809726357460022 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 11.878191957734076, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0576171875, "learning_rate": 0.0009045994424321602, "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4078617.0, "repeat_count": 2.0, "routers_loss": 0.016553178429603577, "skip_count": 2.0, "step": 2530, "text_loss": 0.8755000829696655 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.887584385089522, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061767578125, "learning_rate": 0.0009044175131906817, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 4080936.0, "repeat_count": 0.0, "routers_loss": 0.00884837657213211, "skip_count": 0.0, "step": 2532, "text_loss": 0.795871913433075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.896976812444967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.0009042354289756491, "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4084459.0, "repeat_count": 0.0, "routers_loss": 0.0024387789890170097, "skip_count": 0.0, "step": 2534, "text_loss": 0.18875400722026825 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0625, "learning_rate": 0.0009040531898568379, "loss": 0.0171, "macro_f1": 0.3333333432674408, "num_tokens": 4088464.0, "repeat_count": 0.0, "routers_loss": 0.00491489190608263, "skip_count": 0.0, "step": 2536, "text_loss": 0.334369033575058 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 11.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.091796875, "learning_rate": 0.000903870795904082, "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 4091659.0, "repeat_count": 0.0, "routers_loss": 0.004592662677168846, "skip_count": 2.0, "step": 2538, "text_loss": 0.21298295259475708 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 11.925154094511301, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0458984375, "learning_rate": 0.000903688247187275, "loss": 0.0137, "macro_f1": 0.5492662787437439, "num_tokens": 4095496.0, "repeat_count": 0.0, "routers_loss": 0.011647242121398449, "skip_count": 2.0, "step": 2540, "text_loss": 0.2985081672668457 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0009035055437763704, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4098663.0, "repeat_count": 0.0, "routers_loss": 0.0021238960325717926, "skip_count": 0.0, "step": 2542, "text_loss": 0.35359489917755127 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 11.94393894922219, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05859375, "learning_rate": 0.0009033226857413803, "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 4101588.0, "repeat_count": 1.0, "routers_loss": 0.0024701557122170925, "skip_count": 0.0, "step": 2544, "text_loss": 1.1577601432800293 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.000903139673152376, "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4104643.0, "repeat_count": 0.0, "routers_loss": 0.002499542199075222, "skip_count": 0.0, "step": 2546, "text_loss": 1.0173401832580566 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.962723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.0009029565060794885, "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 4109247.0, "repeat_count": 0.0, "routers_loss": 0.0034200598020106554, "skip_count": 0.0, "step": 2548, "text_loss": 0.5690504312515259 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 11.972116231288524, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06884765625, "learning_rate": 0.0009027731845929079, "loss": 0.0155, "macro_f1": 0.8823530077934265, "num_tokens": 4112597.0, "repeat_count": 1.0, "routers_loss": 0.015981333330273628, "skip_count": 1.0, "step": 2550, "text_loss": 0.294549822807312 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 11.981508658643968, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.06103515625, "learning_rate": 0.0009025897087628829, "loss": 0.0064, "macro_f1": 0.5492662787437439, "num_tokens": 4115844.0, "repeat_count": 0.0, "routers_loss": 0.02606951631605625, "skip_count": 2.0, "step": 2552, "text_loss": 0.22692419588565826 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 11.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.0009024060786597222, "loss": 0.0202, "macro_f1": 0.3333333432674408, "num_tokens": 4118634.0, "repeat_count": 0.0, "routers_loss": 0.001026194542646408, "skip_count": 0.0, "step": 2554, "text_loss": 0.6807059645652771 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.000902222294353793, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4122024.0, "repeat_count": 0.0, "routers_loss": 0.001974924933165312, "skip_count": 0.0, "step": 2556, "text_loss": 0.7373668551445007 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.009392427355445, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04833984375, "learning_rate": 0.0009020383559155219, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 4124803.0, "repeat_count": 1.0, "routers_loss": 0.004662613850086927, "skip_count": 2.0, "step": 2558, "text_loss": 0.21808166801929474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.018784854710889, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.0009018542634153943, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 4127680.0, "repeat_count": 0.0, "routers_loss": 0.006881687790155411, "skip_count": 0.0, "step": 2560, "text_loss": 0.25192978978157043 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 12.028177282066334, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 0.0009016700169239551, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 4130431.0, "repeat_count": 1.0, "routers_loss": 0.005977808032184839, "skip_count": 1.0, "step": 2562, "text_loss": 0.4700816869735718 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009014856165118075, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 4133535.0, "repeat_count": 0.0, "routers_loss": 0.007005698047578335, "skip_count": 1.0, "step": 2564, "text_loss": 0.6558199524879456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.046962136777223, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030517578125, "learning_rate": 0.0009013010622496144, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4136534.0, "repeat_count": 0.0, "routers_loss": 0.007262171246111393, "skip_count": 0.0, "step": 2566, "text_loss": 0.2565421462059021 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 12.056354564132668, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.043212890625, "learning_rate": 0.0009011163542080971, "loss": 0.0088, "macro_f1": 0.5934640765190125, "num_tokens": 4139762.0, "repeat_count": 0.0, "routers_loss": 0.05431923270225525, "skip_count": 3.0, "step": 2568, "text_loss": 0.19896510243415833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.0009009314924580363, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4143398.0, "repeat_count": 0.0, "routers_loss": 0.003667369019240141, "skip_count": 0.0, "step": 2570, "text_loss": 0.6581419110298157 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.075139418843557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052978515625, "learning_rate": 0.0009007464770702712, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4146248.0, "repeat_count": 0.0, "routers_loss": 0.00132099783513695, "skip_count": 0.0, "step": 2572, "text_loss": 0.5316711068153381 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0009005613081157002, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4149455.0, "repeat_count": 0.0, "routers_loss": 0.0020061524119228125, "skip_count": 0.0, "step": 2574, "text_loss": 0.5400773882865906 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05517578125, "learning_rate": 0.0009003759856652802, "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4152774.0, "repeat_count": 0.0, "routers_loss": 0.002621434163302183, "skip_count": 1.0, "step": 2576, "text_loss": 0.3672606945037842 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051513671875, "learning_rate": 0.0009001905097900273, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4155835.0, "repeat_count": 0.0, "routers_loss": 0.005290219560265541, "skip_count": 0.0, "step": 2578, "text_loss": 0.8159038424491882 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0009000048805610161, "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 4158874.0, "repeat_count": 0.0, "routers_loss": 0.0013576085912063718, "skip_count": 0.0, "step": 2580, "text_loss": 0.5518951416015625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.00089981909804938, "loss": 0.0143, "macro_f1": 0.3333333432674408, "num_tokens": 4162076.0, "repeat_count": 0.0, "routers_loss": 0.0021483441814780235, "skip_count": 0.0, "step": 2582, "text_loss": 0.43552228808403015 }, { "acc_repeat": 1.0, "acc_skip": 0.25, "avg_layers": 28.0, "epoch": 12.131493982976226, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.068359375, "learning_rate": 0.0008996331623263114, "loss": 0.0117, "macro_f1": 0.7795917987823486, "num_tokens": 4165041.0, "repeat_count": 1.0, "routers_loss": 0.0544300302863121, "skip_count": 4.0, "step": 2584, "text_loss": 0.24812501668930054 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.140886410331671, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047607421875, "learning_rate": 0.0008994470734630611, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4168290.0, "repeat_count": 0.0, "routers_loss": 0.0017150711501017213, "skip_count": 0.0, "step": 2586, "text_loss": 0.6392097473144531 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0615234375, "learning_rate": 0.0008992608315309388, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4171310.0, "repeat_count": 0.0, "routers_loss": 0.0046473173424601555, "skip_count": 2.0, "step": 2588, "text_loss": 0.6534156799316406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.15967126504256, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06591796875, "learning_rate": 0.0008990744366013125, "loss": 0.0105, "macro_f1": 0.3144654333591461, "num_tokens": 4174042.0, "repeat_count": 2.0, "routers_loss": 0.060913100838661194, "skip_count": 1.0, "step": 2590, "text_loss": 0.5365690588951111 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 12.169063692398003, "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.055419921875, "learning_rate": 0.0008988878887456093, "loss": 0.0118, "macro_f1": 0.6051587462425232, "num_tokens": 4177666.0, "repeat_count": 1.0, "routers_loss": 0.06268956512212753, "skip_count": 4.0, "step": 2592, "text_loss": 0.226226806640625 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.178456119753449, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008987011880353149, "loss": 0.0089, "macro_f1": 0.32098764181137085, "num_tokens": 4180490.0, "repeat_count": 0.0, "routers_loss": 0.030141465365886688, "skip_count": 2.0, "step": 2594, "text_loss": 0.2581401765346527 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 12.187848547108894, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.044677734375, "learning_rate": 0.0008985143345419729, "loss": 0.0082, "macro_f1": 0.5492662787437439, "num_tokens": 4183300.0, "repeat_count": 0.0, "routers_loss": 0.018745863810181618, "skip_count": 2.0, "step": 2596, "text_loss": 0.7778542637825012 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 12.197240974464338, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.064453125, "learning_rate": 0.0008983273283371862, "loss": 0.0096, "macro_f1": 0.5492662787437439, "num_tokens": 4186535.0, "repeat_count": 0.0, "routers_loss": 0.026792079210281372, "skip_count": 2.0, "step": 2598, "text_loss": 0.34700271487236023 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008981401694926159, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4189082.0, "repeat_count": 0.0, "routers_loss": 0.001914160675369203, "skip_count": 0.0, "step": 2600, "text_loss": 0.6879339218139648 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.216025829175228, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0008979528580799815, "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 4192330.0, "repeat_count": 0.0, "routers_loss": 0.007978348061442375, "skip_count": 2.0, "step": 2602, "text_loss": 0.3524550497531891 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 12.225418256530672, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.0008977653941710613, "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4196117.0, "repeat_count": 2.0, "routers_loss": 0.0035376469604671, "skip_count": 0.0, "step": 2604, "text_loss": 0.42356348037719727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05810546875, "learning_rate": 0.0008975777778376916, "loss": 0.0156, "macro_f1": 0.6666666865348816, "num_tokens": 4200423.0, "repeat_count": 0.0, "routers_loss": 0.008262477815151215, "skip_count": 1.0, "step": 2606, "text_loss": 0.5272893905639648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.244203111241562, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0008973900091517675, "loss": 0.0114, "macro_f1": 0.3272727429866791, "num_tokens": 4203257.0, "repeat_count": 0.0, "routers_loss": 0.022957922890782356, "skip_count": 1.0, "step": 2608, "text_loss": 0.2713734805583954 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.253595538597006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043701171875, "learning_rate": 0.000897202088185242, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 4206243.0, "repeat_count": 0.0, "routers_loss": 0.006623407825827599, "skip_count": 2.0, "step": 2610, "text_loss": 0.5920525789260864 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.262987965952451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0008970140150101274, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4209264.0, "repeat_count": 0.0, "routers_loss": 0.0008602747693657875, "skip_count": 0.0, "step": 2612, "text_loss": 0.33421996235847473 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0008968257896984932, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 4212058.0, "repeat_count": 0.0, "routers_loss": 0.0024653903674334288, "skip_count": 1.0, "step": 2614, "text_loss": 0.37923356890678406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.28177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06298828125, "learning_rate": 0.0008966374123224677, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4214929.0, "repeat_count": 0.0, "routers_loss": 0.010878405533730984, "skip_count": 0.0, "step": 2616, "text_loss": 0.4350503981113434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.291165248018785, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0008964488829542376, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4219170.0, "repeat_count": 0.0, "routers_loss": 0.02864212542772293, "skip_count": 1.0, "step": 2618, "text_loss": 0.26250728964805603 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.300557675374229, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.062255859375, "learning_rate": 0.0008962602016660478, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4222077.0, "repeat_count": 0.0, "routers_loss": 0.010444172658026218, "skip_count": 2.0, "step": 2620, "text_loss": 0.4718937575817108 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.309950102729674, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0478515625, "learning_rate": 0.0008960713685302011, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4225383.0, "repeat_count": 0.0, "routers_loss": 0.006409442983567715, "skip_count": 1.0, "step": 2622, "text_loss": 0.30420538783073425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.31934253008512, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.0008958823836190588, "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 4228349.0, "repeat_count": 0.0, "routers_loss": 0.009996986016631126, "skip_count": 1.0, "step": 2624, "text_loss": 0.5392362475395203 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0008956932470050404, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 4232007.0, "repeat_count": 0.0, "routers_loss": 0.0014383369125425816, "skip_count": 0.0, "step": 2626, "text_loss": 0.7112401127815247 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 12.338127384796008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.058349609375, "learning_rate": 0.0008955039587606233, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4235122.0, "repeat_count": 0.0, "routers_loss": 0.00781513936817646, "skip_count": 3.0, "step": 2628, "text_loss": 0.17802883684635162 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 12.347519812151454, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0439453125, "learning_rate": 0.0008953145189583429, "loss": 0.0126, "macro_f1": 0.542222261428833, "num_tokens": 4238248.0, "repeat_count": 0.0, "routers_loss": 0.062252625823020935, "skip_count": 4.0, "step": 2630, "text_loss": 0.5551572442054749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0008951249276707933, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4241042.0, "repeat_count": 0.0, "routers_loss": 0.0011421777307987213, "skip_count": 0.0, "step": 2632, "text_loss": 0.7092233896255493 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.366304666862343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.0008949351849706261, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4243939.0, "repeat_count": 0.0, "routers_loss": 0.0032689040526747704, "skip_count": 0.0, "step": 2634, "text_loss": 0.19925718009471893 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.375697094217786, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 0.0008947452909305509, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4247535.0, "repeat_count": 1.0, "routers_loss": 0.002066014800220728, "skip_count": 0.0, "step": 2636, "text_loss": 0.5249715447425842 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 12.385089521573232, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.09326171875, "learning_rate": 0.0008945552456233356, "loss": 0.0169, "macro_f1": 0.8820862174034119, "num_tokens": 4251441.0, "repeat_count": 2.0, "routers_loss": 0.029332537204027176, "skip_count": 2.0, "step": 2638, "text_loss": 0.19229578971862793 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.394481948928677, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.078125, "learning_rate": 0.0008943650491218058, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4254314.0, "repeat_count": 0.0, "routers_loss": 0.0075911120511591434, "skip_count": 0.0, "step": 2640, "text_loss": 0.27059751749038696 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.0008941747014988453, "loss": 0.0156, "macro_f1": 0.3333333432674408, "num_tokens": 4257442.0, "repeat_count": 0.0, "routers_loss": 0.009030844084918499, "skip_count": 0.0, "step": 2642, "text_loss": 0.36747801303863525 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.123046875, "learning_rate": 0.0008939842028273956, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4260386.0, "repeat_count": 0.0, "routers_loss": 0.007844001986086369, "skip_count": 1.0, "step": 2644, "text_loss": 0.6397647857666016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.422659230995011, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0008937935531804562, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4263516.0, "repeat_count": 0.0, "routers_loss": 0.0018789108144119382, "skip_count": 0.0, "step": 2646, "text_loss": 0.4795534908771515 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.432051658350455, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0008936027526310844, "loss": 0.0098, "macro_f1": 0.3272727429866791, "num_tokens": 4266744.0, "repeat_count": 0.0, "routers_loss": 0.0348590686917305, "skip_count": 1.0, "step": 2648, "text_loss": 0.27691999077796936 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07275390625, "learning_rate": 0.000893411801252395, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4269766.0, "repeat_count": 0.0, "routers_loss": 0.004543309565633535, "skip_count": 1.0, "step": 2650, "text_loss": 0.18867231905460358 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.0008932206991175615, "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 4273513.0, "repeat_count": 0.0, "routers_loss": 0.0035277456045150757, "skip_count": 1.0, "step": 2652, "text_loss": 0.45613357424736023 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.460228940416789, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008930294462998143, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4276878.0, "repeat_count": 1.0, "routers_loss": 0.011337592266499996, "skip_count": 0.0, "step": 2654, "text_loss": 0.24733254313468933 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0869140625, "learning_rate": 0.0008928380428724419, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4279915.0, "repeat_count": 0.0, "routers_loss": 0.0010295971296727657, "skip_count": 1.0, "step": 2656, "text_loss": 0.41722849011421204 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0008926464889087903, "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 4282888.0, "repeat_count": 0.0, "routers_loss": 0.0017198545392602682, "skip_count": 2.0, "step": 2658, "text_loss": 0.738322377204895 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.0008924547844822634, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4285805.0, "repeat_count": 0.0, "routers_loss": 0.001339946174994111, "skip_count": 0.0, "step": 2660, "text_loss": 0.4802379906177521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.497798649838568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05322265625, "learning_rate": 0.000892262929666323, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4290282.0, "repeat_count": 0.0, "routers_loss": 0.0022340165451169014, "skip_count": 0.0, "step": 2662, "text_loss": 0.6503544449806213 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008920709245344878, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4294106.0, "repeat_count": 0.0, "routers_loss": 0.005288850050419569, "skip_count": 1.0, "step": 2664, "text_loss": 0.12312037497758865 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.516583504549457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 0.0008918787691603347, "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 4298013.0, "repeat_count": 0.0, "routers_loss": 0.004259659443050623, "skip_count": 1.0, "step": 2666, "text_loss": 0.3070000112056732 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.000891686463617498, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4300799.0, "repeat_count": 0.0, "routers_loss": 0.009489355608820915, "skip_count": 1.0, "step": 2668, "text_loss": 0.18535588681697845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008914940079796696, "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4304641.0, "repeat_count": 0.0, "routers_loss": 0.0025417013093829155, "skip_count": 0.0, "step": 2670, "text_loss": 0.482585072517395 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.544760786615791, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008913014023205988, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4307462.0, "repeat_count": 0.0, "routers_loss": 0.006371749565005302, "skip_count": 0.0, "step": 2672, "text_loss": 0.7064456939697266 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008911086467140925, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4310396.0, "repeat_count": 0.0, "routers_loss": 0.0027512952219694853, "skip_count": 0.0, "step": 2674, "text_loss": 0.23532851040363312 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05712890625, "learning_rate": 0.000890915741234015, "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 4314781.0, "repeat_count": 0.0, "routers_loss": 0.008253013715147972, "skip_count": 1.0, "step": 2676, "text_loss": 0.30950358510017395 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.572938068682125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0008907226859542879, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4317988.0, "repeat_count": 0.0, "routers_loss": 0.005409995559602976, "skip_count": 2.0, "step": 2678, "text_loss": 0.4930732846260071 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 12.582330496037569, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.060546875, "learning_rate": 0.0008905294809488907, "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 4321014.0, "repeat_count": 1.0, "routers_loss": 0.0029942214023321867, "skip_count": 1.0, "step": 2680, "text_loss": 0.6224040389060974 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.0008903361262918595, "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 4324268.0, "repeat_count": 0.0, "routers_loss": 0.008411120623350143, "skip_count": 1.0, "step": 2682, "text_loss": 0.16296671330928802 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 12.60111535074846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.05126953125, "learning_rate": 0.0008901426220572884, "loss": 0.0138, "macro_f1": 1.0, "num_tokens": 4327494.0, "repeat_count": 2.0, "routers_loss": 0.01039006095379591, "skip_count": 4.0, "step": 2684, "text_loss": 0.43866512179374695 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.610507778103903, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060791015625, "learning_rate": 0.0008899489683193286, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4330936.0, "repeat_count": 0.0, "routers_loss": 0.0009329111780971289, "skip_count": 0.0, "step": 2686, "text_loss": 0.44250962138175964 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.619900205459349, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07421875, "learning_rate": 0.0008897551651521885, "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 4334123.0, "repeat_count": 0.0, "routers_loss": 0.003197216661646962, "skip_count": 0.0, "step": 2688, "text_loss": 0.48313501477241516 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.629292632814794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09716796875, "learning_rate": 0.0008895612126301339, "loss": 0.0157, "macro_f1": 0.3333333432674408, "num_tokens": 4337610.0, "repeat_count": 0.0, "routers_loss": 0.0033548236824572086, "skip_count": 0.0, "step": 2690, "text_loss": 0.4715327322483063 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.638685060170237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051513671875, "learning_rate": 0.0008893671108274877, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4341026.0, "repeat_count": 0.0, "routers_loss": 0.0024757643695920706, "skip_count": 0.0, "step": 2692, "text_loss": 0.43402785062789917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0008891728598186302, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 4344422.0, "repeat_count": 0.0, "routers_loss": 0.003317243419587612, "skip_count": 0.0, "step": 2694, "text_loss": 0.8498559594154358 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 12.657469914881126, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0400390625, "learning_rate": 0.0008889784596779986, "loss": 0.009, "macro_f1": 0.5934640765190125, "num_tokens": 4347507.0, "repeat_count": 0.0, "routers_loss": 0.01577926240861416, "skip_count": 3.0, "step": 2696, "text_loss": 0.5646669864654541 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.666862342236572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11328125, "learning_rate": 0.0008887839104800876, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4350414.0, "repeat_count": 0.0, "routers_loss": 0.002953822258859873, "skip_count": 0.0, "step": 2698, "text_loss": 0.5145012140274048 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.0008885892122994486, "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4354110.0, "repeat_count": 0.0, "routers_loss": 0.005849295295774937, "skip_count": 0.0, "step": 2700, "text_loss": 0.580982506275177 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.68564719694746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008883943652106903, "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 4357323.0, "repeat_count": 1.0, "routers_loss": 0.012347398325800896, "skip_count": 2.0, "step": 2702, "text_loss": 0.2234988808631897 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.695039624302906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0673828125, "learning_rate": 0.0008881993692884787, "loss": 0.0128, "macro_f1": 0.6666666865348816, "num_tokens": 4360228.0, "repeat_count": 0.0, "routers_loss": 0.003574999049305916, "skip_count": 1.0, "step": 2704, "text_loss": 0.4261806607246399 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.704432051658351, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008880042246075365, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4363905.0, "repeat_count": 0.0, "routers_loss": 0.0031574300955981016, "skip_count": 0.0, "step": 2706, "text_loss": 0.691118061542511 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008878089312426433, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4366736.0, "repeat_count": 0.0, "routers_loss": 0.003195564029738307, "skip_count": 0.0, "step": 2708, "text_loss": 0.613926112651825 }, { "acc_repeat": 0.0, "acc_skip": 0.6000000238418579, "avg_layers": 25.0, "epoch": 12.72321690636924, "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.75, "grad_norm": 0.054443359375, "learning_rate": 0.0008876134892686363, "loss": 0.011, "macro_f1": 0.5694444179534912, "num_tokens": 4370146.0, "repeat_count": 0.0, "routers_loss": 0.038784291595220566, "skip_count": 5.0, "step": 2710, "text_loss": 0.2723451852798462 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.000887417898760409, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 4373653.0, "repeat_count": 0.0, "routers_loss": 0.0006457131239585578, "skip_count": 0.0, "step": 2712, "text_loss": 0.31667640805244446 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.742001761080129, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10498046875, "learning_rate": 0.000887222159792912, "loss": 0.0155, "macro_f1": 0.6603773832321167, "num_tokens": 4376993.0, "repeat_count": 1.0, "routers_loss": 0.045078590512275696, "skip_count": 1.0, "step": 2714, "text_loss": 0.5872798562049866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.751394188435574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0008870262724411528, "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4380160.0, "repeat_count": 0.0, "routers_loss": 0.003628545207902789, "skip_count": 0.0, "step": 2716, "text_loss": 0.7468157410621643 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 12.760786615791018, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11181640625, "learning_rate": 0.0008868302367801962, "loss": 0.0118, "macro_f1": 0.6598639488220215, "num_tokens": 4383100.0, "repeat_count": 1.0, "routers_loss": 0.05404464527964592, "skip_count": 3.0, "step": 2718, "text_loss": 0.2970244884490967 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0008866340528851629, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4386700.0, "repeat_count": 0.0, "routers_loss": 0.007000274024903774, "skip_count": 0.0, "step": 2720, "text_loss": 0.34521186351776123 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 12.779571470501908, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.052978515625, "learning_rate": 0.0008864377208312313, "loss": 0.0082, "macro_f1": 0.8823530077934265, "num_tokens": 4390299.0, "repeat_count": 1.0, "routers_loss": 0.02025366574525833, "skip_count": 2.0, "step": 2722, "text_loss": 1.0536936521530151 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.788963897857352, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.000886241240693636, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 4393353.0, "repeat_count": 0.0, "routers_loss": 0.00251673418097198, "skip_count": 0.0, "step": 2724, "text_loss": 0.5678093433380127 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.798356325212797, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0008860446125476686, "loss": 0.0135, "macro_f1": 0.6666666865348816, "num_tokens": 4396446.0, "repeat_count": 1.0, "routers_loss": 0.009532532654702663, "skip_count": 0.0, "step": 2726, "text_loss": 0.23775041103363037 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.807748752568243, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.091796875, "learning_rate": 0.0008858478364686776, "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 4399977.0, "repeat_count": 1.0, "routers_loss": 0.008062181062996387, "skip_count": 0.0, "step": 2728, "text_loss": 0.18888695538043976 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.817141179923686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0008856509125320678, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4404406.0, "repeat_count": 0.0, "routers_loss": 0.0007731119985692203, "skip_count": 0.0, "step": 2730, "text_loss": 0.47331541776657104 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0498046875, "learning_rate": 0.0008854538408133006, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 4407165.0, "repeat_count": 0.0, "routers_loss": 0.003115242812782526, "skip_count": 1.0, "step": 2732, "text_loss": 0.491370290517807 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0008852566213878947, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4410101.0, "repeat_count": 0.0, "routers_loss": 0.0008958528051152825, "skip_count": 0.0, "step": 2734, "text_loss": 0.42188262939453125 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 12.84531846199002, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07763671875, "learning_rate": 0.0008850592543314246, "loss": 0.0118, "macro_f1": 1.0, "num_tokens": 4413015.0, "repeat_count": 1.0, "routers_loss": 0.01139112375676632, "skip_count": 1.0, "step": 2736, "text_loss": 0.4716498553752899 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.854710889345466, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0008848617397195218, "loss": 0.0084, "macro_f1": 0.6603773832321167, "num_tokens": 4416404.0, "repeat_count": 1.0, "routers_loss": 0.01609630137681961, "skip_count": 1.0, "step": 2738, "text_loss": 0.19490821659564972 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0008846640776278745, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 4419408.0, "repeat_count": 0.0, "routers_loss": 0.001489170710556209, "skip_count": 0.0, "step": 2740, "text_loss": 0.6443108320236206 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.873495744056354, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0008844662681322269, "loss": 0.0144, "macro_f1": 0.6666666865348816, "num_tokens": 4422067.0, "repeat_count": 1.0, "routers_loss": 0.0014755792217329144, "skip_count": 0.0, "step": 2742, "text_loss": 0.9150356650352478 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0008842683113083801, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 4425647.0, "repeat_count": 0.0, "routers_loss": 0.008962674997746944, "skip_count": 1.0, "step": 2744, "text_loss": 0.7103227972984314 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 12.892280598767243, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0008840702072321915, "loss": 0.0104, "macro_f1": 0.6598639488220215, "num_tokens": 4428855.0, "repeat_count": 1.0, "routers_loss": 0.02554207295179367, "skip_count": 3.0, "step": 2746, "text_loss": 0.27141591906547546 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.901673026122689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0008838719559795751, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4432838.0, "repeat_count": 0.0, "routers_loss": 0.0011747616808861494, "skip_count": 0.0, "step": 2748, "text_loss": 0.4007738530635834 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 12.911065453478134, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03466796875, "learning_rate": 0.0008836735576265009, "loss": 0.0073, "macro_f1": 0.5492662787437439, "num_tokens": 4435793.0, "repeat_count": 0.0, "routers_loss": 0.017564335837960243, "skip_count": 2.0, "step": 2750, "text_loss": 0.5972410440444946 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.920457880833577, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.044921875, "learning_rate": 0.0008834750122489956, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 4438871.0, "repeat_count": 1.0, "routers_loss": 0.007004009559750557, "skip_count": 0.0, "step": 2752, "text_loss": 0.2294853925704956 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0008832763199231423, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4441846.0, "repeat_count": 0.0, "routers_loss": 0.0014562139986082911, "skip_count": 0.0, "step": 2754, "text_loss": 0.722432017326355 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.939242735544468, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0751953125, "learning_rate": 0.0008830774807250802, "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 4444786.0, "repeat_count": 1.0, "routers_loss": 0.024773593991994858, "skip_count": 0.0, "step": 2756, "text_loss": 0.507905125617981 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 12.948635162899912, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.049072265625, "learning_rate": 0.0008828784947310049, "loss": 0.0129, "macro_f1": 0.8823530077934265, "num_tokens": 4448442.0, "repeat_count": 1.0, "routers_loss": 0.04959975928068161, "skip_count": 2.0, "step": 2758, "text_loss": 0.3617522418498993 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 12.958027590255357, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1025390625, "learning_rate": 0.000882679362017168, "loss": 0.0149, "macro_f1": 1.0, "num_tokens": 4451401.0, "repeat_count": 1.0, "routers_loss": 0.005783245898783207, "skip_count": 2.0, "step": 2760, "text_loss": 0.49187400937080383 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.9674200176108, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0008824800826598778, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 4454537.0, "repeat_count": 0.0, "routers_loss": 0.00656260596588254, "skip_count": 0.0, "step": 2762, "text_loss": 0.6823583245277405 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 12.976812444966246, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0546875, "learning_rate": 0.0008822806567354983, "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4457706.0, "repeat_count": 1.0, "routers_loss": 0.005298966076225042, "skip_count": 0.0, "step": 2764, "text_loss": 0.554322361946106 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.986204872321691, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046630859375, "learning_rate": 0.0008820810843204501, "loss": 0.0096, "macro_f1": 0.3272727429866791, "num_tokens": 4460710.0, "repeat_count": 0.0, "routers_loss": 0.03164982795715332, "skip_count": 1.0, "step": 2766, "text_loss": 0.1656961441040039 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 12.995597299677135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.072265625, "learning_rate": 0.0008818813654912095, "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4464001.0, "repeat_count": 0.0, "routers_loss": 0.000715116853825748, "skip_count": 0.0, "step": 2768, "text_loss": 0.5818144083023071 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.004696213677722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056396484375, "learning_rate": 0.0008816815003243093, "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 4467364.0, "repeat_count": 0.0, "routers_loss": 0.002851625671610236, "skip_count": 0.0, "step": 2770, "text_loss": 0.6068631410598755 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0008814814888963383, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4470681.0, "repeat_count": 0.0, "routers_loss": 0.004729873035103083, "skip_count": 1.0, "step": 2772, "text_loss": 0.5386646389961243 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.000881281331283941, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4473734.0, "repeat_count": 0.0, "routers_loss": 0.0031853127293288708, "skip_count": 1.0, "step": 2774, "text_loss": 0.5695263147354126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008810810275638182, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4478404.0, "repeat_count": 0.0, "routers_loss": 0.0008977465913631022, "skip_count": 0.0, "step": 2776, "text_loss": 0.4750773310661316 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.042265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0008808805778127269, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4481287.0, "repeat_count": 0.0, "routers_loss": 0.00469845999032259, "skip_count": 0.0, "step": 2778, "text_loss": 0.14078612625598907 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 13.051658350454945, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.049560546875, "learning_rate": 0.0008806799821074796, "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 4483929.0, "repeat_count": 0.0, "routers_loss": 0.01789761893451214, "skip_count": 2.0, "step": 2780, "text_loss": 0.2167191207408905 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056396484375, "learning_rate": 0.0008804792405249451, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 4487468.0, "repeat_count": 0.0, "routers_loss": 0.001018838956952095, "skip_count": 0.0, "step": 2782, "text_loss": 0.5424665212631226 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 13.070443205165835, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.07373046875, "learning_rate": 0.000880278353142048, "loss": 0.0077, "macro_f1": 0.8200000524520874, "num_tokens": 4490942.0, "repeat_count": 1.0, "routers_loss": 0.03260354697704315, "skip_count": 3.0, "step": 2784, "text_loss": 0.20994654297828674 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.079835632521279, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05322265625, "learning_rate": 0.0008800773200357683, "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4493986.0, "repeat_count": 0.0, "routers_loss": 0.003019835101440549, "skip_count": 0.0, "step": 2786, "text_loss": 0.5709528923034668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0008798761412831429, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4498232.0, "repeat_count": 0.0, "routers_loss": 0.00285192858427763, "skip_count": 0.0, "step": 2788, "text_loss": 0.5103896260261536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044921875, "learning_rate": 0.0008796748169612634, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4501231.0, "repeat_count": 0.0, "routers_loss": 0.0012469831854104996, "skip_count": 0.0, "step": 2790, "text_loss": 0.43669697642326355 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.108012914587613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.0008794733471472778, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4504208.0, "repeat_count": 0.0, "routers_loss": 0.011512776836752892, "skip_count": 1.0, "step": 2792, "text_loss": 0.2299770563840866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.117405341943059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0008792717319183899, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4507013.0, "repeat_count": 0.0, "routers_loss": 0.00834917277097702, "skip_count": 0.0, "step": 2794, "text_loss": 0.2130603939294815 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.126797769298504, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0008790699713518587, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 4510286.0, "repeat_count": 0.0, "routers_loss": 0.008616939187049866, "skip_count": 2.0, "step": 2796, "text_loss": 0.4377101957798004 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.0008788680655249994, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4513762.0, "repeat_count": 0.0, "routers_loss": 0.003408568911254406, "skip_count": 0.0, "step": 2798, "text_loss": 0.435138463973999 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 13.145582624009393, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0008786660145151826, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4516696.0, "repeat_count": 1.0, "routers_loss": 0.0029398901388049126, "skip_count": 0.0, "step": 2800, "text_loss": 0.3195655047893524 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0008784638183998348, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4519760.0, "repeat_count": 0.0, "routers_loss": 0.0013777425047010183, "skip_count": 0.0, "step": 2802, "text_loss": 0.8129430413246155 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.164367478720282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0008782614772564379, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4522106.0, "repeat_count": 0.0, "routers_loss": 0.0031694830395281315, "skip_count": 0.0, "step": 2804, "text_loss": 0.18083660304546356 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0008780589911625293, "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4525743.0, "repeat_count": 0.0, "routers_loss": 0.002161208540201187, "skip_count": 0.0, "step": 2806, "text_loss": 0.8228182792663574 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07177734375, "learning_rate": 0.0008778563601957021, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 4529573.0, "repeat_count": 0.0, "routers_loss": 0.0028444856870919466, "skip_count": 1.0, "step": 2808, "text_loss": 0.3715563118457794 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.192544760786616, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0008776535844336049, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4532452.0, "repeat_count": 0.0, "routers_loss": 0.003807213855907321, "skip_count": 0.0, "step": 2810, "text_loss": 0.6012523174285889 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.201937188142061, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0008774506639539417, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4536077.0, "repeat_count": 0.0, "routers_loss": 0.006698979996144772, "skip_count": 0.0, "step": 2812, "text_loss": 0.27097949385643005 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.099609375, "learning_rate": 0.0008772475988344722, "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 4539057.0, "repeat_count": 0.0, "routers_loss": 0.004849409218877554, "skip_count": 1.0, "step": 2814, "text_loss": 1.026973843574524 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 13.22072204285295, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.041748046875, "learning_rate": 0.0008770443891530109, "loss": 0.0115, "macro_f1": 0.5934640765190125, "num_tokens": 4542253.0, "repeat_count": 0.0, "routers_loss": 0.019148651510477066, "skip_count": 3.0, "step": 2816, "text_loss": 0.2717585563659668 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.230114470208395, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.052490234375, "learning_rate": 0.0008768410349874286, "loss": 0.0098, "macro_f1": 0.6601307392120361, "num_tokens": 4545047.0, "repeat_count": 1.0, "routers_loss": 0.02231316640973091, "skip_count": 2.0, "step": 2818, "text_loss": 0.274346262216568 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043212890625, "learning_rate": 0.0008766375364156508, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4548371.0, "repeat_count": 0.0, "routers_loss": 0.008014129474759102, "skip_count": 2.0, "step": 2820, "text_loss": 0.22850871086120605 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.248899324919284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.0008764338935156586, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4551276.0, "repeat_count": 0.0, "routers_loss": 0.0014544493751600385, "skip_count": 0.0, "step": 2822, "text_loss": 0.6308462023735046 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 13.258291752274728, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.000876230106365488, "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 4554143.0, "repeat_count": 0.0, "routers_loss": 0.00818584579974413, "skip_count": 3.0, "step": 2824, "text_loss": 0.3484207093715668 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 13.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0008760261750432312, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 4557256.0, "repeat_count": 0.0, "routers_loss": 0.006275608204305172, "skip_count": 3.0, "step": 2826, "text_loss": 0.1927330046892166 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.277076606985618, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 0.0008758220996270348, "loss": 0.0103, "macro_f1": 1.0, "num_tokens": 4560202.0, "repeat_count": 2.0, "routers_loss": 0.0055974251590669155, "skip_count": 2.0, "step": 2828, "text_loss": 0.7796496748924255 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.286469034341062, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046142578125, "learning_rate": 0.0008756178801951007, "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 4563508.0, "repeat_count": 0.0, "routers_loss": 0.0019799957517534494, "skip_count": 0.0, "step": 2830, "text_loss": 0.49633297324180603 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0458984375, "learning_rate": 0.0008754135168256865, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4566776.0, "repeat_count": 0.0, "routers_loss": 0.004538947716355324, "skip_count": 0.0, "step": 2832, "text_loss": 0.5346745252609253 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.305253889051952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0008752090095971044, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4569787.0, "repeat_count": 0.0, "routers_loss": 0.001663343166001141, "skip_count": 0.0, "step": 2834, "text_loss": 0.5524004697799683 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.314646316407396, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.000875004358587722, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 4572813.0, "repeat_count": 0.0, "routers_loss": 0.0022988212294876575, "skip_count": 0.0, "step": 2836, "text_loss": 0.4232870042324066 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.324038743762841, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.000874799563875962, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4575563.0, "repeat_count": 0.0, "routers_loss": 0.007781553082168102, "skip_count": 1.0, "step": 2838, "text_loss": 0.19239822030067444 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 13.333431171118287, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03515625, "learning_rate": 0.0008745946255403021, "loss": 0.0072, "macro_f1": 0.5492662787437439, "num_tokens": 4578117.0, "repeat_count": 0.0, "routers_loss": 0.01872488670051098, "skip_count": 2.0, "step": 2840, "text_loss": 0.2148810178041458 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.34282359847373, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0008743895436592749, "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 4582330.0, "repeat_count": 1.0, "routers_loss": 0.005634195636957884, "skip_count": 1.0, "step": 2842, "text_loss": 0.4929640591144562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048583984375, "learning_rate": 0.0008741843183114685, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4585765.0, "repeat_count": 0.0, "routers_loss": 0.0008928569150157273, "skip_count": 0.0, "step": 2844, "text_loss": 0.32702967524528503 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 13.361608453184619, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 0.0008739789495755253, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4589000.0, "repeat_count": 0.0, "routers_loss": 0.014715569093823433, "skip_count": 4.0, "step": 2846, "text_loss": 0.25125816464424133 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.371000880540064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.0008737734375301433, "loss": 0.0135, "macro_f1": 0.3333333432674408, "num_tokens": 4592391.0, "repeat_count": 0.0, "routers_loss": 0.0017551190685480833, "skip_count": 0.0, "step": 2848, "text_loss": 0.6595172882080078 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0008735677822540749, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 4596662.0, "repeat_count": 0.0, "routers_loss": 0.0006456313421949744, "skip_count": 0.0, "step": 2850, "text_loss": 0.6290773153305054 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0008733619838261276, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 4599682.0, "repeat_count": 0.0, "routers_loss": 0.00765060493722558, "skip_count": 2.0, "step": 2852, "text_loss": 0.3268161416053772 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.399178162606399, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.0008731560423251637, "loss": 0.01, "macro_f1": 1.0, "num_tokens": 4603324.0, "repeat_count": 1.0, "routers_loss": 0.01161442045122385, "skip_count": 2.0, "step": 2854, "text_loss": 0.3029932975769043 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 25.0, "epoch": 13.408570589961844, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, "grad_norm": 0.0419921875, "learning_rate": 0.0008729499578301005, "loss": 0.0098, "macro_f1": 0.9555556178092957, "num_tokens": 4606975.0, "repeat_count": 1.0, "routers_loss": 0.02055389992892742, "skip_count": 5.0, "step": 2856, "text_loss": 0.6268532872200012 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05078125, "learning_rate": 0.00087274373041991, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4609629.0, "repeat_count": 0.0, "routers_loss": 0.0013911726418882608, "skip_count": 0.0, "step": 2858, "text_loss": 0.534355640411377 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 13.427355444672733, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.053955078125, "learning_rate": 0.0008725373601736188, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 4612913.0, "repeat_count": 2.0, "routers_loss": 0.01010701060295105, "skip_count": 0.0, "step": 2860, "text_loss": 0.3391380310058594 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0008723308471703085, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4616718.0, "repeat_count": 0.0, "routers_loss": 0.005969462916254997, "skip_count": 1.0, "step": 2862, "text_loss": 0.47250816226005554 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.446140299383622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046630859375, "learning_rate": 0.0008721241914891152, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4619680.0, "repeat_count": 0.0, "routers_loss": 0.0027780034579336643, "skip_count": 0.0, "step": 2864, "text_loss": 0.3249278664588928 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.455532726739067, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0439453125, "learning_rate": 0.0008719173932092295, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 4622700.0, "repeat_count": 0.0, "routers_loss": 0.0015912104863673449, "skip_count": 0.0, "step": 2866, "text_loss": 0.7789985537528992 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05126953125, "learning_rate": 0.0008717104524098973, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4626637.0, "repeat_count": 0.0, "routers_loss": 0.0036539011634886265, "skip_count": 0.0, "step": 2868, "text_loss": 0.619088351726532 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10400390625, "learning_rate": 0.0008715033691704187, "loss": 0.0118, "macro_f1": 0.6666666865348816, "num_tokens": 4629863.0, "repeat_count": 0.0, "routers_loss": 0.008402476087212563, "skip_count": 1.0, "step": 2870, "text_loss": 0.5550018548965454 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.483710008805401, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0008712961435701479, "loss": 0.0161, "macro_f1": 0.6666666865348816, "num_tokens": 4632657.0, "repeat_count": 0.0, "routers_loss": 0.01400839351117611, "skip_count": 1.0, "step": 2872, "text_loss": 0.17368625104427338 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.493102436160845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008710887756884947, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4635885.0, "repeat_count": 0.0, "routers_loss": 0.0014573842054232955, "skip_count": 0.0, "step": 2874, "text_loss": 0.5138643383979797 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008708812656049225, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 4639341.0, "repeat_count": 0.0, "routers_loss": 0.002810224425047636, "skip_count": 1.0, "step": 2876, "text_loss": 0.70310378074646 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 13.511887290871735, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.03564453125, "learning_rate": 0.0008706736133989497, "loss": 0.0105, "macro_f1": 0.9449735879898071, "num_tokens": 4642163.0, "repeat_count": 2.0, "routers_loss": 0.029783209785819054, "skip_count": 4.0, "step": 2878, "text_loss": 0.26898008584976196 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008704658191501491, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4645858.0, "repeat_count": 0.0, "routers_loss": 0.0009193966398015618, "skip_count": 0.0, "step": 2880, "text_loss": 0.6047570705413818 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 13.530672145582624, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0008702578829381475, "loss": 0.0131, "macro_f1": 0.8814815282821655, "num_tokens": 4649237.0, "repeat_count": 2.0, "routers_loss": 0.05698608607053757, "skip_count": 4.0, "step": 2882, "text_loss": 0.10695219784975052 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0008700498048426269, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4652362.0, "repeat_count": 0.0, "routers_loss": 0.0011786938412114978, "skip_count": 0.0, "step": 2884, "text_loss": 0.4442957937717438 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 13.549457000293513, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.046142578125, "learning_rate": 0.0008698415849433229, "loss": 0.0092, "macro_f1": 0.5492662787437439, "num_tokens": 4655616.0, "repeat_count": 2.0, "routers_loss": 0.02142646163702011, "skip_count": 0.0, "step": 2886, "text_loss": 0.5820964574813843 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0008696332233200262, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4659294.0, "repeat_count": 0.0, "routers_loss": 0.004038636106997728, "skip_count": 0.0, "step": 2888, "text_loss": 0.11847645789384842 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 0.0008694247200525806, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4662512.0, "repeat_count": 0.0, "routers_loss": 0.0013256469974294305, "skip_count": 0.0, "step": 2890, "text_loss": 0.4873582720756531 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.577634282359847, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008692160752208856, "loss": 0.0129, "macro_f1": 0.3272727429866791, "num_tokens": 4666190.0, "repeat_count": 0.0, "routers_loss": 0.04477972164750099, "skip_count": 1.0, "step": 2892, "text_loss": 0.44243401288986206 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.587026709715293, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09521484375, "learning_rate": 0.0008690072889048941, "loss": 0.0127, "macro_f1": 1.0, "num_tokens": 4668884.0, "repeat_count": 1.0, "routers_loss": 0.004407547414302826, "skip_count": 2.0, "step": 2894, "text_loss": 0.6847127079963684 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008687983611846133, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4672093.0, "repeat_count": 0.0, "routers_loss": 0.005245382897555828, "skip_count": 1.0, "step": 2896, "text_loss": 0.25583332777023315 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.605811564426181, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0458984375, "learning_rate": 0.0008685892921401049, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4674917.0, "repeat_count": 0.0, "routers_loss": 0.0010470855049788952, "skip_count": 0.0, "step": 2898, "text_loss": 0.41998377442359924 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0008683800818514844, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4677739.0, "repeat_count": 0.0, "routers_loss": 0.009026622399687767, "skip_count": 2.0, "step": 2900, "text_loss": 0.303053081035614 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09619140625, "learning_rate": 0.0008681707303989215, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4680721.0, "repeat_count": 0.0, "routers_loss": 0.004500916693359613, "skip_count": 0.0, "step": 2902, "text_loss": 0.5573288798332214 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.633988846492516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.0008679612378626404, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 4683339.0, "repeat_count": 0.0, "routers_loss": 0.005047840531915426, "skip_count": 1.0, "step": 2904, "text_loss": 0.321353554725647 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.643381273847961, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0008677516043229187, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4686453.0, "repeat_count": 0.0, "routers_loss": 0.010256914421916008, "skip_count": 1.0, "step": 2906, "text_loss": 0.4300784468650818 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 13.652773701203404, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.0008675418298600883, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4689645.0, "repeat_count": 1.0, "routers_loss": 0.0022669637110084295, "skip_count": 0.0, "step": 2908, "text_loss": 0.5064885020256042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.66216612855885, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008673319145545358, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4692320.0, "repeat_count": 0.0, "routers_loss": 0.0011188550852239132, "skip_count": 0.0, "step": 2910, "text_loss": 0.7114819884300232 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.671558555914293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0008671218584867003, "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4695116.0, "repeat_count": 0.0, "routers_loss": 0.002966561820358038, "skip_count": 2.0, "step": 2912, "text_loss": 0.5662392973899841 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047607421875, "learning_rate": 0.0008669116617370762, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4698040.0, "repeat_count": 0.0, "routers_loss": 0.0012894890969619155, "skip_count": 0.0, "step": 2914, "text_loss": 0.718977689743042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.690343410625184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0008667013243862111, "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4700963.0, "repeat_count": 0.0, "routers_loss": 0.0007232456118799746, "skip_count": 0.0, "step": 2916, "text_loss": 0.3447718024253845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.699735837980628, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.000866490846514707, "loss": 0.0075, "macro_f1": 0.3272727429866791, "num_tokens": 4704471.0, "repeat_count": 1.0, "routers_loss": 0.015166680328547955, "skip_count": 0.0, "step": 2918, "text_loss": 0.454946368932724 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.709128265336073, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04736328125, "learning_rate": 0.000866280228203219, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 4707238.0, "repeat_count": 1.0, "routers_loss": 0.0061312485486269, "skip_count": 1.0, "step": 2920, "text_loss": 0.721788227558136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.718520692691518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008660694695324564, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4711323.0, "repeat_count": 0.0, "routers_loss": 0.00169933564029634, "skip_count": 0.0, "step": 2922, "text_loss": 0.7562121748924255 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.727913120046962, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0008658585705831829, "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 4714417.0, "repeat_count": 0.0, "routers_loss": 0.0022731393110007048, "skip_count": 0.0, "step": 2924, "text_loss": 0.5726147890090942 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.737305547402407, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0008656475314362148, "loss": 0.0131, "macro_f1": 0.8817967176437378, "num_tokens": 4717445.0, "repeat_count": 2.0, "routers_loss": 0.06477782875299454, "skip_count": 3.0, "step": 2926, "text_loss": 0.4505867660045624 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 13.74669797475785, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.06396484375, "learning_rate": 0.0008654363521724229, "loss": 0.0129, "macro_f1": 0.9449735879898071, "num_tokens": 4722253.0, "repeat_count": 2.0, "routers_loss": 0.027405790984630585, "skip_count": 4.0, "step": 2928, "text_loss": 0.24767601490020752 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0537109375, "learning_rate": 0.0008652250328727315, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4725465.0, "repeat_count": 0.0, "routers_loss": 0.006544729229062796, "skip_count": 2.0, "step": 2930, "text_loss": 0.4478724002838135 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 13.765482829468741, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0008650135736181184, "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4729213.0, "repeat_count": 1.0, "routers_loss": 0.0055119614116847515, "skip_count": 0.0, "step": 2932, "text_loss": 0.6749323010444641 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.774875256824185, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045166015625, "learning_rate": 0.0008648019744896154, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4732280.0, "repeat_count": 0.0, "routers_loss": 0.008374541997909546, "skip_count": 0.0, "step": 2934, "text_loss": 0.4647359251976013 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 13.78426768417963, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06201171875, "learning_rate": 0.0008645902355683077, "loss": 0.0091, "macro_f1": 0.6595745086669922, "num_tokens": 4736244.0, "repeat_count": 1.0, "routers_loss": 0.068686343729496, "skip_count": 4.0, "step": 2936, "text_loss": 0.5356017351150513 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 13.793660111535075, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0008643783569353339, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4739810.0, "repeat_count": 2.0, "routers_loss": 0.017954571172595024, "skip_count": 0.0, "step": 2938, "text_loss": 0.3145926296710968 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.803052538890519, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.054443359375, "learning_rate": 0.0008641663386718863, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4742720.0, "repeat_count": 0.0, "routers_loss": 0.006261351052671671, "skip_count": 1.0, "step": 2940, "text_loss": 0.3200613856315613 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.812444966245964, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008639541808592109, "loss": 0.0093, "macro_f1": 1.0, "num_tokens": 4745870.0, "repeat_count": 1.0, "routers_loss": 0.0025341357104480267, "skip_count": 1.0, "step": 2942, "text_loss": 0.5020416378974915 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0008637418835786067, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4748943.0, "repeat_count": 0.0, "routers_loss": 0.008970048278570175, "skip_count": 2.0, "step": 2944, "text_loss": 0.14517110586166382 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008635294469114265, "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4751360.0, "repeat_count": 0.0, "routers_loss": 0.002133632078766823, "skip_count": 0.0, "step": 2946, "text_loss": 0.5367856025695801 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.840622248312298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.0008633168709390766, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4754403.0, "repeat_count": 0.0, "routers_loss": 0.0011866620043292642, "skip_count": 0.0, "step": 2948, "text_loss": 0.38302522897720337 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 13.850014675667744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0008631041557430163, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4757867.0, "repeat_count": 2.0, "routers_loss": 0.0026854004245251417, "skip_count": 0.0, "step": 2950, "text_loss": 0.43433454632759094 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05859375, "learning_rate": 0.0008628913014047585, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 4761171.0, "repeat_count": 0.0, "routers_loss": 0.002433479530736804, "skip_count": 0.0, "step": 2952, "text_loss": 0.4725971519947052 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.868799530378633, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0008626783080058696, "loss": 0.0066, "macro_f1": 0.3272727429866791, "num_tokens": 4764752.0, "repeat_count": 1.0, "routers_loss": 0.017182493582367897, "skip_count": 0.0, "step": 2954, "text_loss": 0.460641473531723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.878191957734076, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12353515625, "learning_rate": 0.0008624651756279687, "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 4767453.0, "repeat_count": 0.0, "routers_loss": 0.0018134774873033166, "skip_count": 0.0, "step": 2956, "text_loss": 0.4091459810733795 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 13.887584385089522, "f1_execute": 0.9777777791023254, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.053466796875, "learning_rate": 0.000862251904352729, "loss": 0.0108, "macro_f1": 0.9259259104728699, "num_tokens": 4771110.0, "repeat_count": 3.0, "routers_loss": 0.0365753099322319, "skip_count": 3.0, "step": 2958, "text_loss": 0.22408585250377655 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.896976812444967, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05029296875, "learning_rate": 0.000862038494261876, "loss": 0.0109, "macro_f1": 0.3272727429866791, "num_tokens": 4774464.0, "repeat_count": 0.0, "routers_loss": 0.024343067780137062, "skip_count": 1.0, "step": 2960, "text_loss": 0.16483014822006226 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0008618249454371891, "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 4777894.0, "repeat_count": 0.0, "routers_loss": 0.0008310087723657489, "skip_count": 0.0, "step": 2962, "text_loss": 0.5573428869247437 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0008616112579605006, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4781116.0, "repeat_count": 0.0, "routers_loss": 0.0065494864247739315, "skip_count": 0.0, "step": 2964, "text_loss": 0.18816794455051422 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.925154094511301, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.0008613974319136957, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4784886.0, "repeat_count": 0.0, "routers_loss": 0.0019726944155991077, "skip_count": 0.0, "step": 2966, "text_loss": 0.5097305774688721 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0008611834673787134, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4787563.0, "repeat_count": 0.0, "routers_loss": 0.006327496841549873, "skip_count": 0.0, "step": 2968, "text_loss": 0.6953814029693604 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 13.94393894922219, "f1_execute": 0.9600000381469727, "f1_repeat": 0.5, "f1_skip": 1.0, "grad_norm": 0.056884765625, "learning_rate": 0.0008609693644375449, "loss": 0.0086, "macro_f1": 0.8200000524520874, "num_tokens": 4790421.0, "repeat_count": 3.0, "routers_loss": 0.042896661907434464, "skip_count": 1.0, "step": 2970, "text_loss": 0.2573051154613495 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 13.953331376577633, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.14453125, "learning_rate": 0.000860755123172235, "loss": 0.0096, "macro_f1": 1.0, "num_tokens": 4793786.0, "repeat_count": 2.0, "routers_loss": 0.013228793628513813, "skip_count": 1.0, "step": 2972, "text_loss": 0.46614497900009155 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 13.962723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0008605407436648815, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4796864.0, "repeat_count": 0.0, "routers_loss": 0.007294759154319763, "skip_count": 2.0, "step": 2974, "text_loss": 0.21555091440677643 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 13.972116231288524, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.057861328125, "learning_rate": 0.0008603262259976348, "loss": 0.0129, "macro_f1": 1.0, "num_tokens": 4800080.0, "repeat_count": 1.0, "routers_loss": 0.0024024227168411016, "skip_count": 5.0, "step": 2976, "text_loss": 0.7855485081672668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0008601115702526987, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4802899.0, "repeat_count": 0.0, "routers_loss": 0.001433031284250319, "skip_count": 0.0, "step": 2978, "text_loss": 0.6777765154838562 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 13.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04931640625, "learning_rate": 0.0008598967765123293, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4805835.0, "repeat_count": 0.0, "routers_loss": 0.003073975909501314, "skip_count": 0.0, "step": 2980, "text_loss": 0.5926910638809204 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 14.0, "f1_execute": 0.9333333373069763, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05322265625, "learning_rate": 0.0008596818448588364, "loss": 0.0139, "macro_f1": 0.8666667342185974, "num_tokens": 4809028.0, "repeat_count": 1.0, "routers_loss": 0.06438573449850082, "skip_count": 6.0, "step": 2982, "text_loss": 0.23975612223148346 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.009392427355445, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0008594667753745821, "loss": 0.0054, "macro_f1": 0.3272727429866791, "num_tokens": 4812831.0, "repeat_count": 0.0, "routers_loss": 0.014817612245678902, "skip_count": 1.0, "step": 2984, "text_loss": 0.17292268574237823 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.018784854710889, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.07421875, "learning_rate": 0.0008592515681419813, "loss": 0.0078, "macro_f1": 0.5492662787437439, "num_tokens": 4816005.0, "repeat_count": 2.0, "routers_loss": 0.025407327339053154, "skip_count": 0.0, "step": 2986, "text_loss": 0.6403061151504517 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0008590362232435018, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4818901.0, "repeat_count": 0.0, "routers_loss": 0.006826757453382015, "skip_count": 0.0, "step": 2988, "text_loss": 0.2572069466114044 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.03756970942178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008588207407616644, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4823120.0, "repeat_count": 0.0, "routers_loss": 0.0009054148104041815, "skip_count": 0.0, "step": 2990, "text_loss": 0.4827076196670532 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.046962136777223, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0008586051207790422, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 4825774.0, "repeat_count": 0.0, "routers_loss": 0.0012294676853343844, "skip_count": 0.0, "step": 2992, "text_loss": 0.40157821774482727 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 14.056354564132668, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.052734375, "learning_rate": 0.0008583893633782612, "loss": 0.0084, "macro_f1": 0.5492662787437439, "num_tokens": 4828841.0, "repeat_count": 0.0, "routers_loss": 0.011474622413516045, "skip_count": 2.0, "step": 2994, "text_loss": 0.14842072129249573 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.058837890625, "learning_rate": 0.0008581734686419999, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4831458.0, "repeat_count": 0.0, "routers_loss": 0.009154081344604492, "skip_count": 2.0, "step": 2996, "text_loss": 0.365400105714798 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.075139418843557, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.00085795743665299, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4834609.0, "repeat_count": 0.0, "routers_loss": 0.002899336162954569, "skip_count": 0.0, "step": 2998, "text_loss": 0.5574684143066406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0008577412674940152, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4838324.0, "repeat_count": 0.0, "routers_loss": 0.0034664268605411053, "skip_count": 0.0, "step": 3000, "text_loss": 0.6752855777740479 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03466796875, "learning_rate": 0.0008575249612479117, "loss": 0.0127, "macro_f1": 0.6666666865348816, "num_tokens": 4841877.0, "repeat_count": 0.0, "routers_loss": 0.0036425739526748657, "skip_count": 2.0, "step": 3002, "text_loss": 0.6332980394363403 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.103316700909891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048095703125, "learning_rate": 0.0008573085179975685, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4845840.0, "repeat_count": 0.0, "routers_loss": 0.0013783496106043458, "skip_count": 0.0, "step": 3004, "text_loss": 0.4219617545604706 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0008570919378259274, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4848766.0, "repeat_count": 0.0, "routers_loss": 0.004823608323931694, "skip_count": 1.0, "step": 3006, "text_loss": 0.7987180948257446 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.000856875220815982, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4852310.0, "repeat_count": 0.0, "routers_loss": 0.0014760984340682626, "skip_count": 0.0, "step": 3008, "text_loss": 0.35592713952064514 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.131493982976226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0008566583670507788, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4856146.0, "repeat_count": 0.0, "routers_loss": 0.0031717263627797365, "skip_count": 1.0, "step": 3010, "text_loss": 0.19379083812236786 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.140886410331671, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0008564413766134164, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 4859386.0, "repeat_count": 0.0, "routers_loss": 0.003361492184922099, "skip_count": 0.0, "step": 3012, "text_loss": 0.39129266142845154 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048583984375, "learning_rate": 0.0008562242495870463, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4862661.0, "repeat_count": 0.0, "routers_loss": 0.0010563990799710155, "skip_count": 0.0, "step": 3014, "text_loss": 0.5966938734054565 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.15967126504256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0008560069860548716, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4865410.0, "repeat_count": 0.0, "routers_loss": 0.001233913702890277, "skip_count": 0.0, "step": 3016, "text_loss": 0.3386077880859375 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.169063692398003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055419921875, "learning_rate": 0.0008557895861001484, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4868931.0, "repeat_count": 0.0, "routers_loss": 0.0018066301709041, "skip_count": 0.0, "step": 3018, "text_loss": 0.5222050547599792 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.178456119753449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008555720498061845, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4873492.0, "repeat_count": 0.0, "routers_loss": 0.0050385501235723495, "skip_count": 1.0, "step": 3020, "text_loss": 0.4558849334716797 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.187848547108894, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008553543772563403, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4877026.0, "repeat_count": 0.0, "routers_loss": 0.004828717093914747, "skip_count": 0.0, "step": 3022, "text_loss": 0.36598992347717285 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 25.0, "epoch": 14.197240974464338, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, "grad_norm": 0.06103515625, "learning_rate": 0.0008551365685340285, "loss": 0.0084, "macro_f1": 0.9555556178092957, "num_tokens": 4879655.0, "repeat_count": 1.0, "routers_loss": 0.02049369551241398, "skip_count": 5.0, "step": 3024, "text_loss": 0.5069093704223633 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 14.206633401819783, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.043212890625, "learning_rate": 0.0008549186237227138, "loss": 0.0088, "macro_f1": 0.8823530077934265, "num_tokens": 4882606.0, "repeat_count": 1.0, "routers_loss": 0.03947242721915245, "skip_count": 2.0, "step": 3026, "text_loss": 0.2600715458393097 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 14.216025829175228, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.0008547005429059128, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4885246.0, "repeat_count": 2.0, "routers_loss": 0.0026363315992057323, "skip_count": 0.0, "step": 3028, "text_loss": 0.37642326951026917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.225418256530672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048828125, "learning_rate": 0.0008544823261671948, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 4888109.0, "repeat_count": 0.0, "routers_loss": 0.003858231008052826, "skip_count": 0.0, "step": 3030, "text_loss": 0.5875385999679565 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 14.234810683886117, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.061279296875, "learning_rate": 0.0008542639735901804, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 4891168.0, "repeat_count": 1.0, "routers_loss": 0.004789089784026146, "skip_count": 1.0, "step": 3032, "text_loss": 0.6417325139045715 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.244203111241562, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0008540454852585434, "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 4894355.0, "repeat_count": 0.0, "routers_loss": 0.007334680762141943, "skip_count": 2.0, "step": 3034, "text_loss": 0.23697198927402496 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 14.253595538597006, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.034423828125, "learning_rate": 0.0008538268612560084, "loss": 0.0058, "macro_f1": 0.4871794879436493, "num_tokens": 4897543.0, "repeat_count": 0.0, "routers_loss": 0.022096361964941025, "skip_count": 3.0, "step": 3036, "text_loss": 0.1989550143480301 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.262987965952451, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.047119140625, "learning_rate": 0.0008536081016663527, "loss": 0.0101, "macro_f1": 1.0, "num_tokens": 4900752.0, "repeat_count": 1.0, "routers_loss": 0.0037680594250559807, "skip_count": 2.0, "step": 3038, "text_loss": 0.5001366138458252 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0400390625, "learning_rate": 0.0008533892065734055, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4903581.0, "repeat_count": 0.0, "routers_loss": 0.0032373068388551474, "skip_count": 1.0, "step": 3040, "text_loss": 0.5019411444664001 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.28177282066334, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.042724609375, "learning_rate": 0.0008531701760610476, "loss": 0.0121, "macro_f1": 1.0, "num_tokens": 4907108.0, "repeat_count": 1.0, "routers_loss": 0.0078013185411691666, "skip_count": 2.0, "step": 3042, "text_loss": 0.3460627794265747 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 14.291165248018785, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.04833984375, "learning_rate": 0.000852951010213212, "loss": 0.0089, "macro_f1": 0.8200000524520874, "num_tokens": 4911269.0, "repeat_count": 1.0, "routers_loss": 0.03576689213514328, "skip_count": 3.0, "step": 3044, "text_loss": 0.268994003534317 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 14.300557675374229, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0008527317091138835, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 4914203.0, "repeat_count": 1.0, "routers_loss": 0.0032140621915459633, "skip_count": 1.0, "step": 3046, "text_loss": 0.9998719692230225 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.309950102729674, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.040771484375, "learning_rate": 0.0008525122728470987, "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4918562.0, "repeat_count": 1.0, "routers_loss": 0.008559177629649639, "skip_count": 3.0, "step": 3048, "text_loss": 0.3062439560890198 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0008522927014969459, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 4921940.0, "repeat_count": 0.0, "routers_loss": 0.008735597133636475, "skip_count": 2.0, "step": 3050, "text_loss": 0.3637430965900421 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05517578125, "learning_rate": 0.0008520729951475652, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 4925416.0, "repeat_count": 0.0, "routers_loss": 0.0012709591537714005, "skip_count": 0.0, "step": 3052, "text_loss": 0.542036235332489 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.338127384796008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06640625, "learning_rate": 0.0008518531538831488, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4928695.0, "repeat_count": 0.0, "routers_loss": 0.0010660928674042225, "skip_count": 1.0, "step": 3054, "text_loss": 0.43144503235816956 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.059326171875, "learning_rate": 0.00085163317778794, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4931504.0, "repeat_count": 0.0, "routers_loss": 0.004558971151709557, "skip_count": 2.0, "step": 3056, "text_loss": 0.5257010459899902 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04931640625, "learning_rate": 0.0008514130669462341, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4934935.0, "repeat_count": 0.0, "routers_loss": 0.010774781927466393, "skip_count": 2.0, "step": 3058, "text_loss": 0.26061776280403137 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.366304666862343, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0008511928214423782, "loss": 0.0103, "macro_f1": 0.6601307392120361, "num_tokens": 4938047.0, "repeat_count": 1.0, "routers_loss": 0.014763157814741135, "skip_count": 2.0, "step": 3060, "text_loss": 0.2856905460357666 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.375697094217786, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0008509724413607705, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 4941041.0, "repeat_count": 1.0, "routers_loss": 0.004613345488905907, "skip_count": 0.0, "step": 3062, "text_loss": 0.2870287001132965 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.385089521573232, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0008507519267858612, "loss": 0.015, "macro_f1": 1.0, "num_tokens": 4944708.0, "repeat_count": 1.0, "routers_loss": 0.008584189228713512, "skip_count": 2.0, "step": 3064, "text_loss": 0.15828095376491547 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.394481948928677, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.0008505312778021519, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 4948295.0, "repeat_count": 0.0, "routers_loss": 0.0014670816017314792, "skip_count": 0.0, "step": 3066, "text_loss": 0.36697930097579956 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.40387437628412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0927734375, "learning_rate": 0.0008503104944941958, "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 4951983.0, "repeat_count": 0.0, "routers_loss": 0.005348859820514917, "skip_count": 2.0, "step": 3068, "text_loss": 0.21612997353076935 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0008500895769465972, "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 4955023.0, "repeat_count": 0.0, "routers_loss": 0.0013203793205320835, "skip_count": 0.0, "step": 3070, "text_loss": 0.9757798314094543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.422659230995011, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 0.0008498685252440124, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 4957600.0, "repeat_count": 0.0, "routers_loss": 0.006907356437295675, "skip_count": 0.0, "step": 3072, "text_loss": 0.356107234954834 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.432051658350455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061279296875, "learning_rate": 0.0008496473394711487, "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 4960746.0, "repeat_count": 0.0, "routers_loss": 0.0027704904787242413, "skip_count": 1.0, "step": 3074, "text_loss": 0.6812908053398132 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0576171875, "learning_rate": 0.0008494260197127649, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 4963845.0, "repeat_count": 0.0, "routers_loss": 0.0036796489730477333, "skip_count": 2.0, "step": 3076, "text_loss": 0.7215370535850525 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0556640625, "learning_rate": 0.0008492045660536712, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 4966887.0, "repeat_count": 0.0, "routers_loss": 0.0037137691397219896, "skip_count": 1.0, "step": 3078, "text_loss": 0.8700299859046936 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 14.460228940416789, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03857421875, "learning_rate": 0.0008489829785787291, "loss": 0.0078, "macro_f1": 0.8823530077934265, "num_tokens": 4969859.0, "repeat_count": 1.0, "routers_loss": 0.016492314636707306, "skip_count": 2.0, "step": 3080, "text_loss": 0.6520360112190247 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043701171875, "learning_rate": 0.0008487612573728513, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4972628.0, "repeat_count": 0.0, "routers_loss": 0.004022917244583368, "skip_count": 2.0, "step": 3082, "text_loss": 0.17498187720775604 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044677734375, "learning_rate": 0.0008485394025210016, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 4975475.0, "repeat_count": 0.0, "routers_loss": 0.009141159243881702, "skip_count": 1.0, "step": 3084, "text_loss": 0.5975366234779358 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.488406222483123, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045166015625, "learning_rate": 0.0008483174141081956, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4978858.0, "repeat_count": 0.0, "routers_loss": 0.0031561285723000765, "skip_count": 0.0, "step": 3086, "text_loss": 0.18748866021633148 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.497798649838568, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008480952922194991, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4982142.0, "repeat_count": 0.0, "routers_loss": 0.0007894713780842721, "skip_count": 0.0, "step": 3088, "text_loss": 0.42083197832107544 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008478730369400302, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4984872.0, "repeat_count": 0.0, "routers_loss": 0.0005908289458602667, "skip_count": 0.0, "step": 3090, "text_loss": 0.45337188243865967 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.516583504549457, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02392578125, "learning_rate": 0.0008476506483549573, "loss": 0.0101, "macro_f1": 1.0, "num_tokens": 4988137.0, "repeat_count": 1.0, "routers_loss": 0.0016509373672306538, "skip_count": 2.0, "step": 3092, "text_loss": 0.6397262811660767 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0008474281265495002, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 4991164.0, "repeat_count": 0.0, "routers_loss": 0.004088304936885834, "skip_count": 1.0, "step": 3094, "text_loss": 0.18352322280406952 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0008472054716089295, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4993876.0, "repeat_count": 0.0, "routers_loss": 0.005200014915317297, "skip_count": 0.0, "step": 3096, "text_loss": 0.2776511013507843 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.544760786615791, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.0008469826836185673, "loss": 0.01, "macro_f1": 0.6601307392120361, "num_tokens": 4997068.0, "repeat_count": 1.0, "routers_loss": 0.012686059810221195, "skip_count": 2.0, "step": 3098, "text_loss": 0.23209233582019806 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.554153213971237, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.055419921875, "learning_rate": 0.0008467597626637858, "loss": 0.0074, "macro_f1": 1.0, "num_tokens": 5000038.0, "repeat_count": 1.0, "routers_loss": 0.006401528604328632, "skip_count": 2.0, "step": 3100, "text_loss": 0.45936745405197144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.56354564132668, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008465367088300093, "loss": 0.0075, "macro_f1": 0.3272727429866791, "num_tokens": 5002870.0, "repeat_count": 0.0, "routers_loss": 0.016640547662973404, "skip_count": 1.0, "step": 3102, "text_loss": 0.44502779841423035 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.572938068682125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0008463135222027124, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5006357.0, "repeat_count": 0.0, "routers_loss": 0.008411331102252007, "skip_count": 2.0, "step": 3104, "text_loss": 0.3414570391178131 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.582330496037569, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0008460902028674204, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5009059.0, "repeat_count": 0.0, "routers_loss": 0.0010406570509076118, "skip_count": 0.0, "step": 3106, "text_loss": 0.5931221842765808 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0008458667509097098, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 5012327.0, "repeat_count": 0.0, "routers_loss": 0.001959054498001933, "skip_count": 0.0, "step": 3108, "text_loss": 0.5191171169281006 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.60111535074846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0008456431664152078, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 5015472.0, "repeat_count": 0.0, "routers_loss": 0.000994380097836256, "skip_count": 0.0, "step": 3110, "text_loss": 0.4455361068248749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.610507778103903, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0008454194494695923, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 5018901.0, "repeat_count": 0.0, "routers_loss": 0.0037662344984710217, "skip_count": 0.0, "step": 3112, "text_loss": 0.5335362553596497 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 14.619900205459349, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02294921875, "learning_rate": 0.0008451956001585923, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5022520.0, "repeat_count": 0.0, "routers_loss": 0.008664715103805065, "skip_count": 3.0, "step": 3114, "text_loss": 0.16230148077011108 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.629292632814794, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.000844971618567987, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 5025505.0, "repeat_count": 0.0, "routers_loss": 0.0015904927859082818, "skip_count": 0.0, "step": 3116, "text_loss": 0.6989432573318481 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.638685060170237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0008447475047836068, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 5028767.0, "repeat_count": 0.0, "routers_loss": 0.005853322334587574, "skip_count": 1.0, "step": 3118, "text_loss": 0.31420737504959106 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 14.648077487525683, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008445232588913325, "loss": 0.0115, "macro_f1": 0.3272727429866791, "num_tokens": 5032577.0, "repeat_count": 0.0, "routers_loss": 0.012760105542838573, "skip_count": 0.0, "step": 3120, "text_loss": 0.5534627437591553 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.0008442988809770953, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 5035381.0, "repeat_count": 0.0, "routers_loss": 0.0022257440723478794, "skip_count": 0.0, "step": 3122, "text_loss": 0.42492759227752686 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.666862342236572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.0008440743711268775, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 5038743.0, "repeat_count": 0.0, "routers_loss": 0.004648433532565832, "skip_count": 0.0, "step": 3124, "text_loss": 0.16404685378074646 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.0008438497294267117, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5041492.0, "repeat_count": 0.0, "routers_loss": 0.006313877180218697, "skip_count": 0.0, "step": 3126, "text_loss": 0.23191484808921814 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.68564719694746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0008436249559626807, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5043955.0, "repeat_count": 1.0, "routers_loss": 0.0036270488053560257, "skip_count": 0.0, "step": 3128, "text_loss": 0.5782018303871155 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.695039624302906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04345703125, "learning_rate": 0.0008434000508209187, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5047571.0, "repeat_count": 0.0, "routers_loss": 0.003809858812019229, "skip_count": 1.0, "step": 3130, "text_loss": 0.7129825949668884 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.704432051658351, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.0008431750140876092, "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 5051608.0, "repeat_count": 0.0, "routers_loss": 0.0022369057405740023, "skip_count": 0.0, "step": 3132, "text_loss": 0.4433445930480957 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.713824479013795, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.000842949845848987, "loss": 0.0135, "macro_f1": 0.32098764181137085, "num_tokens": 5054656.0, "repeat_count": 0.0, "routers_loss": 0.0425117202103138, "skip_count": 2.0, "step": 3134, "text_loss": 0.38721024990081787 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.72321690636924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0008427245461913368, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 5059108.0, "repeat_count": 0.0, "routers_loss": 0.0018077283166348934, "skip_count": 0.0, "step": 3136, "text_loss": 0.7496368885040283 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.732609333724685, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.12109375, "learning_rate": 0.0008424991152009941, "loss": 0.0111, "macro_f1": 1.0, "num_tokens": 5062371.0, "repeat_count": 1.0, "routers_loss": 0.008801834657788277, "skip_count": 2.0, "step": 3138, "text_loss": 0.5337086319923401 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 14.742001761080129, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0008422735529643444, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5065593.0, "repeat_count": 0.0, "routers_loss": 0.00548676960170269, "skip_count": 3.0, "step": 3140, "text_loss": 0.2561623156070709 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.751394188435574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0008420478595678233, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5068271.0, "repeat_count": 0.0, "routers_loss": 0.006389956455677748, "skip_count": 0.0, "step": 3142, "text_loss": 0.15605193376541138 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.760786615791018, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07958984375, "learning_rate": 0.0008418220350979175, "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 5071358.0, "repeat_count": 1.0, "routers_loss": 0.012387622147798538, "skip_count": 2.0, "step": 3144, "text_loss": 0.3085838258266449 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008415960796411628, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5075584.0, "repeat_count": 0.0, "routers_loss": 0.00311864772811532, "skip_count": 1.0, "step": 3146, "text_loss": 0.4786977469921112 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.779571470501908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0008413699932841461, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5078388.0, "repeat_count": 0.0, "routers_loss": 0.0030679800547659397, "skip_count": 0.0, "step": 3148, "text_loss": 0.5222916603088379 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.788963897857352, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0008411437761135039, "loss": 0.011, "macro_f1": 1.0, "num_tokens": 5081584.0, "repeat_count": 1.0, "routers_loss": 0.012907958589494228, "skip_count": 2.0, "step": 3150, "text_loss": 0.5369884371757507 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0008409174282159232, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 5084450.0, "repeat_count": 0.0, "routers_loss": 0.012314042076468468, "skip_count": 2.0, "step": 3152, "text_loss": 0.25685277581214905 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.807748752568243, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.000840690949678141, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 5087865.0, "repeat_count": 1.0, "routers_loss": 0.00899206381291151, "skip_count": 0.0, "step": 3154, "text_loss": 0.1717093288898468 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.817141179923686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06103515625, "learning_rate": 0.0008404643405869441, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 5090857.0, "repeat_count": 0.0, "routers_loss": 0.0013312003575265408, "skip_count": 0.0, "step": 3156, "text_loss": 0.27446436882019043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.826533607279131, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0008402376010291695, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 5093917.0, "repeat_count": 0.0, "routers_loss": 0.002653320087119937, "skip_count": 0.0, "step": 3158, "text_loss": 0.4237489402294159 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045654296875, "learning_rate": 0.0008400107310917045, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5096656.0, "repeat_count": 0.0, "routers_loss": 0.012976993806660175, "skip_count": 2.0, "step": 3160, "text_loss": 0.42361980676651 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.84531846199002, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.000839783730861486, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5099582.0, "repeat_count": 0.0, "routers_loss": 0.006936746649444103, "skip_count": 2.0, "step": 3162, "text_loss": 0.26656073331832886 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0008395566004255008, "loss": 0.0127, "macro_f1": 0.6666666865348816, "num_tokens": 5102908.0, "repeat_count": 0.0, "routers_loss": 0.006619359832257032, "skip_count": 1.0, "step": 3164, "text_loss": 0.590774416923523 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06884765625, "learning_rate": 0.0008393293398707858, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 5105829.0, "repeat_count": 0.0, "routers_loss": 0.010120268911123276, "skip_count": 2.0, "step": 3166, "text_loss": 0.605930507183075 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.873495744056354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0419921875, "learning_rate": 0.0008391019492844275, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 5109850.0, "repeat_count": 0.0, "routers_loss": 0.004940980114042759, "skip_count": 2.0, "step": 3168, "text_loss": 0.12973152101039886 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0008388744287535627, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 5113353.0, "repeat_count": 0.0, "routers_loss": 0.0031777634285390377, "skip_count": 1.0, "step": 3170, "text_loss": 0.18577200174331665 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0008386467783653775, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 5116421.0, "repeat_count": 0.0, "routers_loss": 0.005431659985333681, "skip_count": 0.0, "step": 3172, "text_loss": 0.2302747517824173 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 14.901673026122689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.000838418998207108, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5119457.0, "repeat_count": 0.0, "routers_loss": 0.0077286697924137115, "skip_count": 4.0, "step": 3174, "text_loss": 0.19606637954711914 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0008381910883660399, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5123201.0, "repeat_count": 0.0, "routers_loss": 0.003982985392212868, "skip_count": 0.0, "step": 3176, "text_loss": 0.716376006603241 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 14.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09423828125, "learning_rate": 0.0008379630489295089, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 5126035.0, "repeat_count": 0.0, "routers_loss": 0.005626026075333357, "skip_count": 1.0, "step": 3178, "text_loss": 0.5144625902175903 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.929850308189023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008377348799849, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 5129179.0, "repeat_count": 0.0, "routers_loss": 0.015458245761692524, "skip_count": 2.0, "step": 3180, "text_loss": 0.29887503385543823 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 14.939242735544468, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.062255859375, "learning_rate": 0.0008375065816196479, "loss": 0.0086, "macro_f1": 0.5492662787437439, "num_tokens": 5132149.0, "repeat_count": 0.0, "routers_loss": 0.012210468761622906, "skip_count": 2.0, "step": 3182, "text_loss": 0.8981851935386658 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.948635162899912, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0008372781539212371, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5135287.0, "repeat_count": 0.0, "routers_loss": 0.0052537876181304455, "skip_count": 0.0, "step": 3184, "text_loss": 0.4245666563510895 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 14.958027590255357, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0008370495969772014, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5138589.0, "repeat_count": 0.0, "routers_loss": 0.012873421423137188, "skip_count": 2.0, "step": 3186, "text_loss": 0.40581050515174866 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 14.9674200176108, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0008368209108751244, "loss": 0.0127, "macro_f1": 0.6521739363670349, "num_tokens": 5141635.0, "repeat_count": 2.0, "routers_loss": 0.07720445841550827, "skip_count": 4.0, "step": 3188, "text_loss": 0.3755173981189728 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0008365920957026389, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5144728.0, "repeat_count": 0.0, "routers_loss": 0.001440995605662465, "skip_count": 0.0, "step": 3190, "text_loss": 0.5067034363746643 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 14.986204872321691, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.041748046875, "learning_rate": 0.0008363631515474275, "loss": 0.0089, "macro_f1": 0.6538461446762085, "num_tokens": 5147963.0, "repeat_count": 1.0, "routers_loss": 0.018752984702587128, "skip_count": 2.0, "step": 3192, "text_loss": 0.20224551856517792 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 14.995597299677135, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0008361340784972217, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 5151184.0, "repeat_count": 0.0, "routers_loss": 0.0005360354552976787, "skip_count": 0.0, "step": 3194, "text_loss": 0.4588058292865753 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.004696213677722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0008359048766398031, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5153889.0, "repeat_count": 0.0, "routers_loss": 0.0009184491937048733, "skip_count": 1.0, "step": 3196, "text_loss": 0.2980220317840576 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.014088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.000835675546063002, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5156758.0, "repeat_count": 0.0, "routers_loss": 0.001252970308996737, "skip_count": 0.0, "step": 3198, "text_loss": 0.6775755882263184 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0008354460868546985, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5160247.0, "repeat_count": 0.0, "routers_loss": 0.0037315806839615107, "skip_count": 0.0, "step": 3200, "text_loss": 0.35867011547088623 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.032873495744056, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0008352164991028217, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 5163456.0, "repeat_count": 1.0, "routers_loss": 0.001497485558502376, "skip_count": 0.0, "step": 3202, "text_loss": 0.690290093421936 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.042265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.0008349867828953501, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 5166139.0, "repeat_count": 0.0, "routers_loss": 0.001051135826855898, "skip_count": 0.0, "step": 3204, "text_loss": 0.3340415954589844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.051658350454945, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0008347569383203113, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 5169009.0, "repeat_count": 0.0, "routers_loss": 0.0010544003453105688, "skip_count": 0.0, "step": 3206, "text_loss": 0.8584878444671631 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.06105077781039, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008345269654657823, "loss": 0.0085, "macro_f1": 1.0, "num_tokens": 5172618.0, "repeat_count": 1.0, "routers_loss": 0.007312417030334473, "skip_count": 1.0, "step": 3208, "text_loss": 0.19500218331813812 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.070443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0008342968644198892, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 5175857.0, "repeat_count": 0.0, "routers_loss": 0.00276504410430789, "skip_count": 0.0, "step": 3210, "text_loss": 0.5446314215660095 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.079835632521279, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0008340666352708068, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 5178585.0, "repeat_count": 0.0, "routers_loss": 0.002669303445145488, "skip_count": 0.0, "step": 3212, "text_loss": 0.3687484860420227 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0008338362781067596, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5181777.0, "repeat_count": 0.0, "routers_loss": 0.0031585274264216423, "skip_count": 0.0, "step": 3214, "text_loss": 0.27325859665870667 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.09862048723217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.000833605793016021, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 5184312.0, "repeat_count": 0.0, "routers_loss": 0.008807534351944923, "skip_count": 2.0, "step": 3216, "text_loss": 0.4466548562049866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.108012914587613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008333751800869133, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5187497.0, "repeat_count": 0.0, "routers_loss": 0.003171310294419527, "skip_count": 0.0, "step": 3218, "text_loss": 0.5423526763916016 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.117405341943059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0008331444394078076, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 5190982.0, "repeat_count": 0.0, "routers_loss": 0.0016481258207932115, "skip_count": 2.0, "step": 3220, "text_loss": 0.48984917998313904 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.126797769298504, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.000832913571067124, "loss": 0.0107, "macro_f1": 1.0, "num_tokens": 5194044.0, "repeat_count": 1.0, "routers_loss": 0.003957313951104879, "skip_count": 1.0, "step": 3222, "text_loss": 0.4533331096172333 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.136190196653947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0008326825751533322, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5197092.0, "repeat_count": 0.0, "routers_loss": 0.0016904744552448392, "skip_count": 0.0, "step": 3224, "text_loss": 0.5538802742958069 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05224609375, "learning_rate": 0.0008324514517549501, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5199941.0, "repeat_count": 0.0, "routers_loss": 0.005608258303254843, "skip_count": 1.0, "step": 3226, "text_loss": 0.416242778301239 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 15.154975051364836, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.040771484375, "learning_rate": 0.0008322202009605444, "loss": 0.0072, "macro_f1": 0.8823530077934265, "num_tokens": 5202618.0, "repeat_count": 1.0, "routers_loss": 0.020965175703167915, "skip_count": 2.0, "step": 3228, "text_loss": 0.17496295273303986 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 15.164367478720282, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008319888228587311, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 5206414.0, "repeat_count": 1.0, "routers_loss": 0.021259209141135216, "skip_count": 5.0, "step": 3230, "text_loss": 0.22471418976783752 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 0.0008317573175381745, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 5209768.0, "repeat_count": 0.0, "routers_loss": 0.0018647604156285524, "skip_count": 0.0, "step": 3232, "text_loss": 0.4415269196033478 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0008315256850875881, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5213257.0, "repeat_count": 0.0, "routers_loss": 0.002345515415072441, "skip_count": 0.0, "step": 3234, "text_loss": 0.347247838973999 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 15.192544760786616, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0008312939255957336, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 5215800.0, "repeat_count": 0.0, "routers_loss": 0.007112892810255289, "skip_count": 3.0, "step": 3236, "text_loss": 0.31091734766960144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.201937188142061, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0008310620391514219, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5219205.0, "repeat_count": 0.0, "routers_loss": 0.00432228296995163, "skip_count": 0.0, "step": 3238, "text_loss": 0.3421775996685028 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.0008308300258435124, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 5222422.0, "repeat_count": 0.0, "routers_loss": 0.0076514314860105515, "skip_count": 2.0, "step": 3240, "text_loss": 0.22378318011760712 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.22072204285295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0008305978857609128, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5225625.0, "repeat_count": 0.0, "routers_loss": 0.0007617069641128182, "skip_count": 0.0, "step": 3242, "text_loss": 0.5880323648452759 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0008303656189925799, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5229113.0, "repeat_count": 0.0, "routers_loss": 0.0017418119823560119, "skip_count": 0.0, "step": 3244, "text_loss": 0.3302813768386841 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.239506897563839, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0008301332256275183, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5232061.0, "repeat_count": 0.0, "routers_loss": 0.0026667986530810595, "skip_count": 0.0, "step": 3246, "text_loss": 0.5679706335067749 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.248899324919284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.058349609375, "learning_rate": 0.0008299007057547821, "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 5235279.0, "repeat_count": 1.0, "routers_loss": 0.011016624979674816, "skip_count": 2.0, "step": 3248, "text_loss": 0.5081504583358765 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.258291752274728, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0008296680594634731, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5239655.0, "repeat_count": 1.0, "routers_loss": 0.005492044147104025, "skip_count": 0.0, "step": 3250, "text_loss": 0.14675180613994598 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0008294352868427418, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5243579.0, "repeat_count": 0.0, "routers_loss": 0.00404445780441165, "skip_count": 1.0, "step": 3252, "text_loss": 0.4201085865497589 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.277076606985618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0008292023879817871, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 5247059.0, "repeat_count": 0.0, "routers_loss": 0.006886140909045935, "skip_count": 1.0, "step": 3254, "text_loss": 0.2289208322763443 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.286469034341062, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057861328125, "learning_rate": 0.0008289693629698564, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 5249940.0, "repeat_count": 0.0, "routers_loss": 0.0005736657767556608, "skip_count": 0.0, "step": 3256, "text_loss": 0.5670450925827026 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.295861461696507, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0008287362118962452, "loss": 0.006, "macro_f1": 0.3272727429866791, "num_tokens": 5253580.0, "repeat_count": 0.0, "routers_loss": 0.011349895037710667, "skip_count": 1.0, "step": 3258, "text_loss": 0.5042323470115662 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.305253889051952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0008285029348502973, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5257080.0, "repeat_count": 0.0, "routers_loss": 0.0013626761501654983, "skip_count": 0.0, "step": 3260, "text_loss": 0.3227672874927521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.314646316407396, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0008282695319214053, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5259951.0, "repeat_count": 0.0, "routers_loss": 0.00471635302528739, "skip_count": 0.0, "step": 3262, "text_loss": 0.20773714780807495 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.324038743762841, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.0008280360031990093, "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 5263314.0, "repeat_count": 0.0, "routers_loss": 0.010472415015101433, "skip_count": 2.0, "step": 3264, "text_loss": 0.34397366642951965 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.333431171118287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.000827802348772598, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 5267358.0, "repeat_count": 0.0, "routers_loss": 0.0007814752752892673, "skip_count": 0.0, "step": 3266, "text_loss": 0.747342586517334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.34282359847373, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0008275685687317084, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5270400.0, "repeat_count": 0.0, "routers_loss": 0.000902949133887887, "skip_count": 0.0, "step": 3268, "text_loss": 0.43782034516334534 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0008273346631659252, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5273147.0, "repeat_count": 0.0, "routers_loss": 0.00043462219764478505, "skip_count": 0.0, "step": 3270, "text_loss": 0.6358205080032349 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.361608453184619, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008271006321648816, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 5277638.0, "repeat_count": 0.0, "routers_loss": 0.002211218234151602, "skip_count": 0.0, "step": 3272, "text_loss": 0.20220105350017548 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.371000880540064, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.0008268664758182589, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5280638.0, "repeat_count": 1.0, "routers_loss": 0.010536720044910908, "skip_count": 0.0, "step": 3274, "text_loss": 0.7579061388969421 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0439453125, "learning_rate": 0.0008266321942157859, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5283847.0, "repeat_count": 0.0, "routers_loss": 0.0017158017726615071, "skip_count": 0.0, "step": 3276, "text_loss": 0.669302761554718 }, { "acc_repeat": 0.800000011920929, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.389785735250953, "f1_execute": 0.9743589162826538, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, "grad_norm": 0.06005859375, "learning_rate": 0.0008263977874472399, "loss": 0.0088, "macro_f1": 0.9544159770011902, "num_tokens": 5286627.0, "repeat_count": 5.0, "routers_loss": 0.011220700107514858, "skip_count": 4.0, "step": 3278, "text_loss": 0.8703984022140503 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.399178162606399, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0008261632556024461, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5289766.0, "repeat_count": 0.0, "routers_loss": 0.0020442772656679153, "skip_count": 0.0, "step": 3280, "text_loss": 0.5009346008300781 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.0008259285987712774, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5293010.0, "repeat_count": 0.0, "routers_loss": 0.005645765457302332, "skip_count": 0.0, "step": 3282, "text_loss": 0.2546011209487915 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.042236328125, "learning_rate": 0.0008256938170436549, "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 5296732.0, "repeat_count": 0.0, "routers_loss": 0.0027385836001485586, "skip_count": 2.0, "step": 3284, "text_loss": 0.5244000554084778 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.427355444672733, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0008254589105095473, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 5299926.0, "repeat_count": 1.0, "routers_loss": 0.007451715879142284, "skip_count": 1.0, "step": 3286, "text_loss": 0.28979742527008057 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0008252238792589711, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5303006.0, "repeat_count": 0.0, "routers_loss": 0.004805843345820904, "skip_count": 2.0, "step": 3288, "text_loss": 0.5131978392601013 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.446140299383622, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.000824988723381991, "loss": 0.0091, "macro_f1": 0.3272727429866791, "num_tokens": 5306953.0, "repeat_count": 0.0, "routers_loss": 0.010639613494277, "skip_count": 1.0, "step": 3290, "text_loss": 0.4901447296142578 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 15.455532726739067, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.044189453125, "learning_rate": 0.0008247534429687191, "loss": 0.007, "macro_f1": 0.5492662787437439, "num_tokens": 5310516.0, "repeat_count": 0.0, "routers_loss": 0.013625577092170715, "skip_count": 2.0, "step": 3292, "text_loss": 0.2124534696340561 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.46492515409451, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.0008245180381093152, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 5313959.0, "repeat_count": 0.0, "routers_loss": 0.004958513658493757, "skip_count": 1.0, "step": 3294, "text_loss": 0.46682238578796387 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0008242825088939867, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5316609.0, "repeat_count": 0.0, "routers_loss": 0.003962756600230932, "skip_count": 0.0, "step": 3296, "text_loss": 0.7010108232498169 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.483710008805401, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008240468554129892, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5319638.0, "repeat_count": 0.0, "routers_loss": 0.0006996620795689523, "skip_count": 0.0, "step": 3298, "text_loss": 0.4966355860233307 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.493102436160845, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0008238110777566255, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 5323019.0, "repeat_count": 0.0, "routers_loss": 0.0016031896229833364, "skip_count": 0.0, "step": 3300, "text_loss": 0.38668957352638245 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.50249486351629, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0008235751760152459, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 5326099.0, "repeat_count": 2.0, "routers_loss": 0.00344281829893589, "skip_count": 2.0, "step": 3302, "text_loss": 0.5330720543861389 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.511887290871735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06005859375, "learning_rate": 0.0008233391502792484, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5328993.0, "repeat_count": 0.0, "routers_loss": 0.007886730134487152, "skip_count": 1.0, "step": 3304, "text_loss": 0.5470269322395325 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.521279718227179, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0008231030006390786, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 5331554.0, "repeat_count": 0.0, "routers_loss": 0.008180000819265842, "skip_count": 1.0, "step": 3306, "text_loss": 0.4023340344429016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0008228667271852294, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5335712.0, "repeat_count": 0.0, "routers_loss": 0.0002942821884062141, "skip_count": 0.0, "step": 3308, "text_loss": 0.5306711792945862 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05908203125, "learning_rate": 0.0008226303300082414, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5338701.0, "repeat_count": 0.0, "routers_loss": 0.0006134595023468137, "skip_count": 0.0, "step": 3310, "text_loss": 0.5906263589859009 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.549457000293513, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0008223938091987022, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5342274.0, "repeat_count": 0.0, "routers_loss": 0.0016656654188409448, "skip_count": 0.0, "step": 3312, "text_loss": 0.5201764106750488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.558849427648958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0008221571648472472, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5345185.0, "repeat_count": 0.0, "routers_loss": 0.0038612703792750835, "skip_count": 0.0, "step": 3314, "text_loss": 0.36633720993995667 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.568241855004402, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0008219203970445589, "loss": 0.011, "macro_f1": 0.3272727429866791, "num_tokens": 5348804.0, "repeat_count": 0.0, "routers_loss": 0.009782899171113968, "skip_count": 1.0, "step": 3316, "text_loss": 0.3117460012435913 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.577634282359847, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055908203125, "learning_rate": 0.0008216835058813672, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 5351896.0, "repeat_count": 0.0, "routers_loss": 0.007713229861110449, "skip_count": 0.0, "step": 3318, "text_loss": 0.253496378660202 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0008214464914484492, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5355058.0, "repeat_count": 0.0, "routers_loss": 0.006227815989404917, "skip_count": 2.0, "step": 3320, "text_loss": 0.32693132758140564 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0008212093538366292, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 5358365.0, "repeat_count": 0.0, "routers_loss": 0.002601418411359191, "skip_count": 0.0, "step": 3322, "text_loss": 0.40394455194473267 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 15.605811564426181, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.000820972093136779, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5360981.0, "repeat_count": 0.0, "routers_loss": 0.005545300897210836, "skip_count": 3.0, "step": 3324, "text_loss": 0.6758295893669128 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.615203991781627, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.05078125, "learning_rate": 0.0008207347094398172, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 5364018.0, "repeat_count": 1.0, "routers_loss": 0.001924700103700161, "skip_count": 0.0, "step": 3326, "text_loss": 0.5196860432624817 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0008204972028367097, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5366986.0, "repeat_count": 0.0, "routers_loss": 0.012254828587174416, "skip_count": 1.0, "step": 3328, "text_loss": 0.24661913514137268 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.633988846492516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0008202595734184694, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5371463.0, "repeat_count": 0.0, "routers_loss": 0.005094083491712809, "skip_count": 0.0, "step": 3330, "text_loss": 0.2525769770145416 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.643381273847961, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0008200218212761566, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 5374823.0, "repeat_count": 1.0, "routers_loss": 0.0025883198250085115, "skip_count": 0.0, "step": 3332, "text_loss": 0.21849912405014038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.000819783946500878, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5377640.0, "repeat_count": 0.0, "routers_loss": 0.008240507915616035, "skip_count": 0.0, "step": 3334, "text_loss": 0.2662734091281891 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 15.66216612855885, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.050537109375, "learning_rate": 0.000819545949183788, "loss": 0.01, "macro_f1": 0.5934640765190125, "num_tokens": 5380593.0, "repeat_count": 0.0, "routers_loss": 0.038378193974494934, "skip_count": 3.0, "step": 3336, "text_loss": 0.2431795746088028 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.671558555914293, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.040283203125, "learning_rate": 0.0008193078294160874, "loss": 0.0097, "macro_f1": 1.0, "num_tokens": 5384487.0, "repeat_count": 1.0, "routers_loss": 0.005926199723035097, "skip_count": 1.0, "step": 3338, "text_loss": 0.5663705468177795 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.680950983269739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0008190695872890242, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5387511.0, "repeat_count": 0.0, "routers_loss": 0.010842559859156609, "skip_count": 2.0, "step": 3340, "text_loss": 0.11517292261123657 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.690343410625184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0008188312228938933, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 5390698.0, "repeat_count": 0.0, "routers_loss": 0.001304097007960081, "skip_count": 0.0, "step": 3342, "text_loss": 0.4827076196670532 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.699735837980628, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0008185927363220363, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5393778.0, "repeat_count": 1.0, "routers_loss": 0.005354117136448622, "skip_count": 0.0, "step": 3344, "text_loss": 0.44467049837112427 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.040771484375, "learning_rate": 0.0008183541276648418, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5396925.0, "repeat_count": 0.0, "routers_loss": 0.004800073802471161, "skip_count": 2.0, "step": 3346, "text_loss": 0.2032834142446518 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.718520692691518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0008181153970137449, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5400522.0, "repeat_count": 0.0, "routers_loss": 0.0021674633026123047, "skip_count": 0.0, "step": 3348, "text_loss": 0.4507528841495514 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.727913120046962, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.051513671875, "learning_rate": 0.0008178765444602278, "loss": 0.0117, "macro_f1": 0.8820862174034119, "num_tokens": 5403526.0, "repeat_count": 2.0, "routers_loss": 0.04263930395245552, "skip_count": 2.0, "step": 3350, "text_loss": 0.3606615960597992 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 15.737305547402407, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008176375700958194, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5407127.0, "repeat_count": 1.0, "routers_loss": 0.006953123956918716, "skip_count": 0.0, "step": 3352, "text_loss": 0.2290353775024414 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0008173984740120948, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5410829.0, "repeat_count": 0.0, "routers_loss": 0.0014363783411681652, "skip_count": 0.0, "step": 3354, "text_loss": 0.4220392405986786 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.756090402113296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0008171592563006762, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5414152.0, "repeat_count": 0.0, "routers_loss": 0.00202389364130795, "skip_count": 1.0, "step": 3356, "text_loss": 0.37729766964912415 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.765482829468741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.0008169199170532323, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 5417312.0, "repeat_count": 0.0, "routers_loss": 0.006253739818930626, "skip_count": 2.0, "step": 3358, "text_loss": 0.1304289996623993 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.774875256824185, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0703125, "learning_rate": 0.0008166804563614785, "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 5421227.0, "repeat_count": 2.0, "routers_loss": 0.01622140221297741, "skip_count": 2.0, "step": 3360, "text_loss": 0.298664391040802 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.78426768417963, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.0008164408743171763, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 5424646.0, "repeat_count": 1.0, "routers_loss": 0.0037176944315433502, "skip_count": 2.0, "step": 3362, "text_loss": 0.12147632241249084 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046630859375, "learning_rate": 0.0008162011710121339, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 5427897.0, "repeat_count": 0.0, "routers_loss": 0.0020403533708304167, "skip_count": 1.0, "step": 3364, "text_loss": 0.2656533420085907 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.803052538890519, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041748046875, "learning_rate": 0.0008159613465382066, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5430474.0, "repeat_count": 0.0, "routers_loss": 0.0018634048756211996, "skip_count": 0.0, "step": 3366, "text_loss": 0.9133086204528809 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.812444966245964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.0008157214009872951, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5433113.0, "repeat_count": 0.0, "routers_loss": 0.012944488786160946, "skip_count": 2.0, "step": 3368, "text_loss": 0.24352453649044037 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05712890625, "learning_rate": 0.0008154813344513472, "loss": 0.0143, "macro_f1": 0.6666666865348816, "num_tokens": 5436259.0, "repeat_count": 0.0, "routers_loss": 0.002347963862121105, "skip_count": 2.0, "step": 3370, "text_loss": 0.7601244449615479 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0008152411470223568, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 5439126.0, "repeat_count": 0.0, "routers_loss": 0.0016609140438959002, "skip_count": 0.0, "step": 3372, "text_loss": 0.5551947355270386 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.840622248312298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0008150008387923643, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5442739.0, "repeat_count": 0.0, "routers_loss": 0.008321396075189114, "skip_count": 0.0, "step": 3374, "text_loss": 0.25028282403945923 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 15.850014675667744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08544921875, "learning_rate": 0.000814760409853456, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 5445247.0, "repeat_count": 2.0, "routers_loss": 0.009738070890307426, "skip_count": 1.0, "step": 3376, "text_loss": 0.37271201610565186 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0008145198602977651, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5449044.0, "repeat_count": 0.0, "routers_loss": 0.0028421466704458, "skip_count": 0.0, "step": 3378, "text_loss": 0.1458655595779419 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.868799530378633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.0008142791902174701, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 5453063.0, "repeat_count": 0.0, "routers_loss": 0.0015170135302469134, "skip_count": 0.0, "step": 3380, "text_loss": 0.5548722743988037 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 15.878191957734076, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0008140383997047966, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 5455814.0, "repeat_count": 0.0, "routers_loss": 0.0022444510832428932, "skip_count": 1.0, "step": 3382, "text_loss": 0.8034513592720032 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.887584385089522, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.000813797488852016, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5459392.0, "repeat_count": 0.0, "routers_loss": 0.00038578867679461837, "skip_count": 0.0, "step": 3384, "text_loss": 0.6940088868141174 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.896976812444967, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0008135564577514458, "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 5462413.0, "repeat_count": 0.0, "routers_loss": 0.0019727381877601147, "skip_count": 0.0, "step": 3386, "text_loss": 0.5124650597572327 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.90636923980041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.099609375, "learning_rate": 0.0008133153064954495, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 5465552.0, "repeat_count": 0.0, "routers_loss": 0.0019896167796105146, "skip_count": 0.0, "step": 3388, "text_loss": 0.4292517900466919 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 15.915761667155856, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0008130740351764367, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 5468573.0, "repeat_count": 1.0, "routers_loss": 0.0030118159484118223, "skip_count": 1.0, "step": 3390, "text_loss": 0.48903173208236694 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 15.925154094511301, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0216064453125, "learning_rate": 0.000812832643886863, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5471547.0, "repeat_count": 0.0, "routers_loss": 0.005084246397018433, "skip_count": 2.0, "step": 3392, "text_loss": 0.35789889097213745 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.934546521866745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.0008125911327192299, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5474331.0, "repeat_count": 0.0, "routers_loss": 0.0008874498889781535, "skip_count": 0.0, "step": 3394, "text_loss": 0.6267408728599548 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0008123495017660851, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5477633.0, "repeat_count": 0.0, "routers_loss": 0.001794386887922883, "skip_count": 0.0, "step": 3396, "text_loss": 0.3701885938644409 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0008121077511200221, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5481277.0, "repeat_count": 0.0, "routers_loss": 0.002140481723472476, "skip_count": 0.0, "step": 3398, "text_loss": 0.6362857818603516 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.962723803933079, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0556640625, "learning_rate": 0.00081186588087368, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 5484237.0, "repeat_count": 0.0, "routers_loss": 0.000867189432028681, "skip_count": 0.0, "step": 3400, "text_loss": 1.0847382545471191 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0008116238911197442, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5487423.0, "repeat_count": 0.0, "routers_loss": 0.0029817656613886356, "skip_count": 0.0, "step": 3402, "text_loss": 0.3813740313053131 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049560546875, "learning_rate": 0.0008113817819509454, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5490155.0, "repeat_count": 0.0, "routers_loss": 0.0035141287371516228, "skip_count": 0.0, "step": 3404, "text_loss": 0.2113083451986313 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 15.990901085999413, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0008111395534600603, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5493415.0, "repeat_count": 0.0, "routers_loss": 0.003317659953609109, "skip_count": 0.0, "step": 3406, "text_loss": 0.5869330167770386 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.052001953125, "learning_rate": 0.0008108972057399114, "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 5496032.0, "repeat_count": 0.0, "routers_loss": 0.003833734430372715, "skip_count": 2.0, "step": 3408, "text_loss": 0.2938928008079529 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11328125, "learning_rate": 0.0008106547388833669, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5498890.0, "repeat_count": 0.0, "routers_loss": 0.002622978063300252, "skip_count": 1.0, "step": 3410, "text_loss": 0.3130980432033539 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.01878485471089, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0008104121529833402, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5502010.0, "repeat_count": 1.0, "routers_loss": 0.007447598036378622, "skip_count": 0.0, "step": 3412, "text_loss": 0.4413072466850281 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.000810169448132791, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5505212.0, "repeat_count": 0.0, "routers_loss": 0.0031087708193808794, "skip_count": 1.0, "step": 3414, "text_loss": 0.2910428047180176 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.037569709421778, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0008099266244247243, "loss": 0.0082, "macro_f1": 0.3272727429866791, "num_tokens": 5508755.0, "repeat_count": 0.0, "routers_loss": 0.02510393038392067, "skip_count": 1.0, "step": 3416, "text_loss": 0.33022749423980713 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008096836819521903, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5512034.0, "repeat_count": 0.0, "routers_loss": 0.0020537273958325386, "skip_count": 1.0, "step": 3418, "text_loss": 0.4731218218803406 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0008094406208082853, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5515707.0, "repeat_count": 0.0, "routers_loss": 0.004218162503093481, "skip_count": 2.0, "step": 3420, "text_loss": 0.23429590463638306 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 16.065746991488112, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0869140625, "learning_rate": 0.0008091974410861507, "loss": 0.0069, "macro_f1": 0.9265305995941162, "num_tokens": 5518436.0, "repeat_count": 1.0, "routers_loss": 0.013488355092704296, "skip_count": 3.0, "step": 3422, "text_loss": 0.45768749713897705 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.07513941884356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0008089541428789733, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5522368.0, "repeat_count": 0.0, "routers_loss": 0.0010335417464375496, "skip_count": 1.0, "step": 3424, "text_loss": 0.43423423171043396 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0008087107262799855, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 5526061.0, "repeat_count": 0.0, "routers_loss": 0.002134323585778475, "skip_count": 0.0, "step": 3426, "text_loss": 0.4031757414340973 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1318359375, "learning_rate": 0.0008084671913824651, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5529284.0, "repeat_count": 0.0, "routers_loss": 0.0097216060385108, "skip_count": 2.0, "step": 3428, "text_loss": 0.2836039960384369 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 0.000808223538279735, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5532159.0, "repeat_count": 0.0, "routers_loss": 0.001684269867837429, "skip_count": 0.0, "step": 3430, "text_loss": 0.5804527401924133 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.112709128265337, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0008079797670651637, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 5536050.0, "repeat_count": 1.0, "routers_loss": 0.013918434269726276, "skip_count": 1.0, "step": 3432, "text_loss": 0.31325826048851013 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0008077358778321647, "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 5538885.0, "repeat_count": 0.0, "routers_loss": 0.0007751787197776139, "skip_count": 0.0, "step": 3434, "text_loss": 0.783108115196228 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.131493982976224, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0008074918706741966, "loss": 0.0063, "macro_f1": 0.9262410998344421, "num_tokens": 5541909.0, "repeat_count": 3.0, "routers_loss": 0.021819550544023514, "skip_count": 2.0, "step": 3436, "text_loss": 0.6558083295822144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.14088641033167, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0008072477456847638, "loss": 0.0057, "macro_f1": 0.3272727429866791, "num_tokens": 5545101.0, "repeat_count": 1.0, "routers_loss": 0.03309348225593567, "skip_count": 0.0, "step": 3438, "text_loss": 0.9877075552940369 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.150278837687114, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04931640625, "learning_rate": 0.0008070035029574151, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 5548971.0, "repeat_count": 1.0, "routers_loss": 0.008696741424500942, "skip_count": 1.0, "step": 3440, "text_loss": 0.24766330420970917 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 16.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.000806759142585745, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 5552174.0, "repeat_count": 0.0, "routers_loss": 0.004240929149091244, "skip_count": 3.0, "step": 3442, "text_loss": 0.37255001068115234 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05322265625, "learning_rate": 0.0008065146646633927, "loss": 0.0088, "macro_f1": 0.6666666865348816, "num_tokens": 5555005.0, "repeat_count": 0.0, "routers_loss": 0.014345484785735607, "skip_count": 1.0, "step": 3444, "text_loss": 0.26157206296920776 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.17845611975345, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06005859375, "learning_rate": 0.0008062700692840428, "loss": 0.0083, "macro_f1": 1.0, "num_tokens": 5559127.0, "repeat_count": 1.0, "routers_loss": 0.008315163664519787, "skip_count": 2.0, "step": 3446, "text_loss": 0.21971040964126587 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 16.187848547108892, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.056396484375, "learning_rate": 0.0008060253565414246, "loss": 0.009, "macro_f1": 0.5934640765190125, "num_tokens": 5562254.0, "repeat_count": 0.0, "routers_loss": 0.009582413360476494, "skip_count": 3.0, "step": 3448, "text_loss": 0.6758295893669128 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.19724097446434, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0008057805265293124, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5565515.0, "repeat_count": 0.0, "routers_loss": 0.002429503947496414, "skip_count": 0.0, "step": 3450, "text_loss": 0.696592390537262 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0008055355793415257, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5568392.0, "repeat_count": 0.0, "routers_loss": 0.0007724192109890282, "skip_count": 0.0, "step": 3452, "text_loss": 0.7092870473861694 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0008052905150719285, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 5571090.0, "repeat_count": 0.0, "routers_loss": 0.0010859938338398933, "skip_count": 0.0, "step": 3454, "text_loss": 0.6593860387802124 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.225418256530673, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008050453338144301, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 5574552.0, "repeat_count": 1.0, "routers_loss": 0.0030258705373853445, "skip_count": 1.0, "step": 3456, "text_loss": 0.3479384481906891 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 0.0008048000356629844, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 5577484.0, "repeat_count": 0.0, "routers_loss": 0.005052885971963406, "skip_count": 2.0, "step": 3458, "text_loss": 0.21858671307563782 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.24420311124156, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029541015625, "learning_rate": 0.0008045546207115901, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 5581605.0, "repeat_count": 1.0, "routers_loss": 0.009976249188184738, "skip_count": 3.0, "step": 3460, "text_loss": 0.16868001222610474 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0008043090890542904, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5584994.0, "repeat_count": 0.0, "routers_loss": 0.00270817126147449, "skip_count": 0.0, "step": 3462, "text_loss": 0.785690426826477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0008040634407851739, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5588067.0, "repeat_count": 0.0, "routers_loss": 0.0018436965765431523, "skip_count": 0.0, "step": 3464, "text_loss": 0.5006644129753113 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.0008038176759983731, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5590789.0, "repeat_count": 0.0, "routers_loss": 0.008516279980540276, "skip_count": 2.0, "step": 3466, "text_loss": 0.20963478088378906 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.281772820663342, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0008035717947880659, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 5593472.0, "repeat_count": 0.0, "routers_loss": 0.0016293043736368418, "skip_count": 0.0, "step": 3468, "text_loss": 0.7376078963279724 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.0008033257972484742, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5596108.0, "repeat_count": 0.0, "routers_loss": 0.002364142332226038, "skip_count": 0.0, "step": 3470, "text_loss": 0.5156455039978027 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0008030796834738649, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5599103.0, "repeat_count": 0.0, "routers_loss": 0.008872323669493198, "skip_count": 0.0, "step": 3472, "text_loss": 0.2996419668197632 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 16.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043701171875, "learning_rate": 0.0008028334535585491, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5602410.0, "repeat_count": 0.0, "routers_loss": 0.011508257128298283, "skip_count": 3.0, "step": 3474, "text_loss": 0.25438693165779114 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.31934253008512, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.0008025871075968827, "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 5605424.0, "repeat_count": 2.0, "routers_loss": 0.017225435003638268, "skip_count": 2.0, "step": 3476, "text_loss": 0.2549574077129364 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.328734957440563, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0008023406456832657, "loss": 0.0111, "macro_f1": 0.9262410998344421, "num_tokens": 5608266.0, "repeat_count": 3.0, "routers_loss": 0.039165645837783813, "skip_count": 2.0, "step": 3478, "text_loss": 0.1797947734594345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0008020940679121429, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5611471.0, "repeat_count": 0.0, "routers_loss": 0.0009718866203911602, "skip_count": 0.0, "step": 3480, "text_loss": 0.8267702460289001 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0008018473743780036, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5615046.0, "repeat_count": 0.0, "routers_loss": 0.006087122485041618, "skip_count": 2.0, "step": 3482, "text_loss": 0.7267677187919617 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.000801600565175381, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5618350.0, "repeat_count": 0.0, "routers_loss": 0.0007539413054473698, "skip_count": 0.0, "step": 3484, "text_loss": 0.5910211801528931 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046142578125, "learning_rate": 0.0008013536403988529, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 5621381.0, "repeat_count": 0.0, "routers_loss": 0.0008076327503658831, "skip_count": 0.0, "step": 3486, "text_loss": 0.30616798996925354 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 16.375697094217788, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.049072265625, "learning_rate": 0.0008011066001430412, "loss": 0.0086, "macro_f1": 0.6122449040412903, "num_tokens": 5624617.0, "repeat_count": 0.0, "routers_loss": 0.023835813626646996, "skip_count": 4.0, "step": 3488, "text_loss": 0.3376443088054657 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.38508952157323, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0008008594445026122, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 5627989.0, "repeat_count": 0.0, "routers_loss": 0.004226419143378735, "skip_count": 2.0, "step": 3490, "text_loss": 0.8185343146324158 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.394481948928675, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0008006121735722767, "loss": 0.0084, "macro_f1": 0.32098764181137085, "num_tokens": 5632286.0, "repeat_count": 0.0, "routers_loss": 0.0366671048104763, "skip_count": 2.0, "step": 3492, "text_loss": 0.2209547609090805 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.403874376284122, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0008003647874467892, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 5635368.0, "repeat_count": 1.0, "routers_loss": 0.012956378981471062, "skip_count": 0.0, "step": 3494, "text_loss": 0.20468664169311523 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.413266803639566, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.0008001172862209485, "loss": 0.0103, "macro_f1": 0.6666666865348816, "num_tokens": 5638440.0, "repeat_count": 1.0, "routers_loss": 0.0017375422175973654, "skip_count": 0.0, "step": 3496, "text_loss": 0.6647221446037292 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 16.42265923099501, "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 0.0007998696699895976, "loss": 0.0091, "macro_f1": 0.6592592597007751, "num_tokens": 5641996.0, "repeat_count": 1.0, "routers_loss": 0.025240756571292877, "skip_count": 5.0, "step": 3498, "text_loss": 0.23892143368721008 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021728515625, "learning_rate": 0.0007996219388476236, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5645071.0, "repeat_count": 0.0, "routers_loss": 0.007436830550432205, "skip_count": 1.0, "step": 3500, "text_loss": 0.7580804228782654 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0007993740928899571, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 5648175.0, "repeat_count": 0.0, "routers_loss": 0.001126602990552783, "skip_count": 0.0, "step": 3502, "text_loss": 0.5281378626823425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0007991261322115737, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5650973.0, "repeat_count": 0.0, "routers_loss": 0.0007907263352535665, "skip_count": 0.0, "step": 3504, "text_loss": 0.25220927596092224 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.46022894041679, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.000798878056907492, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 5654252.0, "repeat_count": 2.0, "routers_loss": 0.006263538729399443, "skip_count": 2.0, "step": 3506, "text_loss": 0.46569153666496277 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 16.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0703125, "learning_rate": 0.0007986298670727752, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 5657229.0, "repeat_count": 0.0, "routers_loss": 0.004049144219607115, "skip_count": 3.0, "step": 3508, "text_loss": 0.15174436569213867 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 16.479013795127678, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0791015625, "learning_rate": 0.0007983815628025301, "loss": 0.0074, "macro_f1": 0.9262410998344421, "num_tokens": 5659974.0, "repeat_count": 2.0, "routers_loss": 0.0471976138651371, "skip_count": 3.0, "step": 3510, "text_loss": 0.39072203636169434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.488406222483125, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.000798133144191907, "loss": 0.0082, "macro_f1": 0.3272727429866791, "num_tokens": 5662893.0, "repeat_count": 0.0, "routers_loss": 0.04030488431453705, "skip_count": 1.0, "step": 3512, "text_loss": 0.3562147617340088 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0595703125, "learning_rate": 0.0007978846113361009, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5666476.0, "repeat_count": 0.0, "routers_loss": 0.007475079502910376, "skip_count": 1.0, "step": 3514, "text_loss": 0.26518192887306213 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044189453125, "learning_rate": 0.0007976359643303497, "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 5669647.0, "repeat_count": 0.0, "routers_loss": 0.00558585487306118, "skip_count": 2.0, "step": 3516, "text_loss": 0.29284560680389404 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.516583504549455, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0007973872032699354, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 5673491.0, "repeat_count": 1.0, "routers_loss": 0.0026981087867170572, "skip_count": 1.0, "step": 3518, "text_loss": 0.35089045763015747 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.525975931904902, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.000797138328250184, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 5676529.0, "repeat_count": 1.0, "routers_loss": 0.0027328627184033394, "skip_count": 0.0, "step": 3520, "text_loss": 0.41077399253845215 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 16.535368359260346, "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0007968893393664646, "loss": 0.01, "macro_f1": 0.6592592597007751, "num_tokens": 5679987.0, "repeat_count": 1.0, "routers_loss": 0.02695014327764511, "skip_count": 5.0, "step": 3522, "text_loss": 0.44942837953567505 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007966402367141903, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 5683185.0, "repeat_count": 0.0, "routers_loss": 0.00817026849836111, "skip_count": 2.0, "step": 3524, "text_loss": 0.14528048038482666 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0007963910203888176, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 5686544.0, "repeat_count": 0.0, "routers_loss": 0.0021973433904349804, "skip_count": 0.0, "step": 3526, "text_loss": 0.22358648478984833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.56354564132668, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0007961416904858469, "loss": 0.0078, "macro_f1": 0.3272727429866791, "num_tokens": 5689579.0, "repeat_count": 0.0, "routers_loss": 0.033712416887283325, "skip_count": 1.0, "step": 3528, "text_loss": 0.3083649277687073 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0007958922471008217, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5692869.0, "repeat_count": 0.0, "routers_loss": 0.011182719841599464, "skip_count": 2.0, "step": 3530, "text_loss": 0.21288011968135834 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0007956426903293292, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5696007.0, "repeat_count": 0.0, "routers_loss": 0.0015808293828740716, "skip_count": 0.0, "step": 3532, "text_loss": 0.6068631410598755 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.591722923393014, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0007953930202670001, "loss": 0.0062, "macro_f1": 0.5492662787437439, "num_tokens": 5699474.0, "repeat_count": 2.0, "routers_loss": 0.03205178305506706, "skip_count": 0.0, "step": 3534, "text_loss": 0.4317135512828827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.601115350748458, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0007951432370095084, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 5703483.0, "repeat_count": 0.0, "routers_loss": 0.003518853336572647, "skip_count": 0.0, "step": 3536, "text_loss": 0.5432273149490356 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 16.610507778103905, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11083984375, "learning_rate": 0.0007948933406525715, "loss": 0.01, "macro_f1": 1.0, "num_tokens": 5707301.0, "repeat_count": 1.0, "routers_loss": 0.004982157610356808, "skip_count": 1.0, "step": 3538, "text_loss": 0.40061065554618835 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.61990020545935, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0751953125, "learning_rate": 0.0007946433312919502, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5710847.0, "repeat_count": 0.0, "routers_loss": 0.003067734418436885, "skip_count": 0.0, "step": 3540, "text_loss": 0.5396234393119812 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 16.629292632814792, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05224609375, "learning_rate": 0.0007943932090234486, "loss": 0.0097, "macro_f1": 0.5492662787437439, "num_tokens": 5713683.0, "repeat_count": 0.0, "routers_loss": 0.03728383034467697, "skip_count": 2.0, "step": 3542, "text_loss": 0.18310914933681488 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 16.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007941429739429138, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 5716397.0, "repeat_count": 0.0, "routers_loss": 0.0025092530995607376, "skip_count": 3.0, "step": 3544, "text_loss": 0.5806207060813904 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007938926261462366, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5719984.0, "repeat_count": 0.0, "routers_loss": 0.002493767999112606, "skip_count": 0.0, "step": 3546, "text_loss": 0.38606807589530945 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 16.657469914881126, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05078125, "learning_rate": 0.0007936421657293507, "loss": 0.0094, "macro_f1": 0.8823530077934265, "num_tokens": 5723571.0, "repeat_count": 1.0, "routers_loss": 0.014810923486948013, "skip_count": 2.0, "step": 3548, "text_loss": 0.49558472633361816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.666862342236573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.0007933915927882327, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5726405.0, "repeat_count": 0.0, "routers_loss": 0.00152928801253438, "skip_count": 0.0, "step": 3550, "text_loss": 0.8674797415733337 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.000793140907418903, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5729955.0, "repeat_count": 0.0, "routers_loss": 0.005522782914340496, "skip_count": 2.0, "step": 3552, "text_loss": 0.3274473249912262 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.0007928901097174248, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5733030.0, "repeat_count": 0.0, "routers_loss": 0.009207013063132763, "skip_count": 2.0, "step": 3554, "text_loss": 0.18237128853797913 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0007926391997799039, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5735978.0, "repeat_count": 0.0, "routers_loss": 0.00695531303063035, "skip_count": 0.0, "step": 3556, "text_loss": 0.3266434967517853 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05419921875, "learning_rate": 0.0007923881777024898, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5738901.0, "repeat_count": 0.0, "routers_loss": 0.002743212040513754, "skip_count": 1.0, "step": 3558, "text_loss": 0.4971913695335388 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.713824479013795, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04931640625, "learning_rate": 0.0007921370435813741, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5741946.0, "repeat_count": 1.0, "routers_loss": 0.007037297356873751, "skip_count": 0.0, "step": 3560, "text_loss": 0.5645473599433899 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.0007918857975127924, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5744987.0, "repeat_count": 0.0, "routers_loss": 0.0030746585689485073, "skip_count": 0.0, "step": 3562, "text_loss": 0.17717665433883667 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.0007916344395930224, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 5747837.0, "repeat_count": 0.0, "routers_loss": 0.004522138275206089, "skip_count": 0.0, "step": 3564, "text_loss": 0.7676118612289429 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.000791382969918385, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5750716.0, "repeat_count": 0.0, "routers_loss": 0.0026240211445838213, "skip_count": 0.0, "step": 3566, "text_loss": 0.4975173771381378 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.751394188435572, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.000791131388585244, "loss": 0.011, "macro_f1": 0.8820862174034119, "num_tokens": 5754368.0, "repeat_count": 2.0, "routers_loss": 0.021831991150975227, "skip_count": 2.0, "step": 3568, "text_loss": 0.9670342206954956 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.76078661579102, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0007908796956900055, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5757076.0, "repeat_count": 1.0, "routers_loss": 0.0017586691537871957, "skip_count": 0.0, "step": 3570, "text_loss": 0.3057977259159088 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.000790627891329119, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5760613.0, "repeat_count": 0.0, "routers_loss": 0.005515786819159985, "skip_count": 0.0, "step": 3572, "text_loss": 0.5860086679458618 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0007903759755990763, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 5763557.0, "repeat_count": 0.0, "routers_loss": 0.004096484277397394, "skip_count": 0.0, "step": 3574, "text_loss": 0.17175781726837158 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.788963897857354, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04541015625, "learning_rate": 0.000790123948596412, "loss": 0.0119, "macro_f1": 0.6666666865348816, "num_tokens": 5767430.0, "repeat_count": 1.0, "routers_loss": 0.005216122139245272, "skip_count": 0.0, "step": 3576, "text_loss": 0.7520374059677124 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.0007898718104177031, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 5770175.0, "repeat_count": 0.0, "routers_loss": 0.0037980107590556145, "skip_count": 0.0, "step": 3578, "text_loss": 0.18117885291576385 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007896195611595699, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5773032.0, "repeat_count": 0.0, "routers_loss": 0.003672175807878375, "skip_count": 2.0, "step": 3580, "text_loss": 0.7241058349609375 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.817141179923688, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0615234375, "learning_rate": 0.0007893672009186744, "loss": 0.0083, "macro_f1": 1.0, "num_tokens": 5776077.0, "repeat_count": 1.0, "routers_loss": 0.01229850109666586, "skip_count": 3.0, "step": 3582, "text_loss": 0.29140418767929077 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.82653360727913, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007891147297917216, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5779088.0, "repeat_count": 1.0, "routers_loss": 0.0035251814406365156, "skip_count": 0.0, "step": 3584, "text_loss": 0.1727485954761505 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.055908203125, "learning_rate": 0.000788862147875459, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 5782201.0, "repeat_count": 0.0, "routers_loss": 0.004725661128759384, "skip_count": 2.0, "step": 3586, "text_loss": 0.43512848019599915 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06396484375, "learning_rate": 0.0007886094552666765, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5785039.0, "repeat_count": 0.0, "routers_loss": 0.005632172804325819, "skip_count": 0.0, "step": 3588, "text_loss": 0.3534786105155945 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0556640625, "learning_rate": 0.0007883566520622062, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 5788017.0, "repeat_count": 0.0, "routers_loss": 0.006249965168535709, "skip_count": 1.0, "step": 3590, "text_loss": 0.2089710384607315 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.0007881037383589229, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 5791168.0, "repeat_count": 0.0, "routers_loss": 0.0013797614956274629, "skip_count": 0.0, "step": 3592, "text_loss": 0.4349329471588135 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.0007878507142537436, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 5793927.0, "repeat_count": 0.0, "routers_loss": 0.0019719740375876427, "skip_count": 1.0, "step": 3594, "text_loss": 0.6087368726730347 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.8828881714118, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.0007875975798436274, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 5797214.0, "repeat_count": 1.0, "routers_loss": 0.0037070370744913816, "skip_count": 0.0, "step": 3596, "text_loss": 0.4258122444152832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.048583984375, "learning_rate": 0.0007873443352255764, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5800691.0, "repeat_count": 0.0, "routers_loss": 0.008431311696767807, "skip_count": 0.0, "step": 3598, "text_loss": 0.6006711721420288 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.055419921875, "learning_rate": 0.0007870909804966337, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5804712.0, "repeat_count": 0.0, "routers_loss": 0.0017720256000757217, "skip_count": 0.0, "step": 3600, "text_loss": 0.6055042743682861 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.911065453478134, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0007868375157538861, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 5807670.0, "repeat_count": 1.0, "routers_loss": 0.010697763413190842, "skip_count": 0.0, "step": 3602, "text_loss": 0.8039056658744812 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.920457880833577, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0007865839410944611, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5810880.0, "repeat_count": 1.0, "routers_loss": 0.0030022128485143185, "skip_count": 0.0, "step": 3604, "text_loss": 0.596110463142395 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 16.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0007863302566155295, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5814171.0, "repeat_count": 0.0, "routers_loss": 0.006257854867726564, "skip_count": 2.0, "step": 3606, "text_loss": 0.5700319409370422 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.939242735544468, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 0.0007860764624143031, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5817607.0, "repeat_count": 1.0, "routers_loss": 0.004838473163545132, "skip_count": 0.0, "step": 3608, "text_loss": 0.8319530487060547 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 16.94863516289991, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.08154296875, "learning_rate": 0.0007858225585880369, "loss": 0.0067, "macro_f1": 0.8823530077934265, "num_tokens": 5821452.0, "repeat_count": 1.0, "routers_loss": 0.02173662930727005, "skip_count": 2.0, "step": 3610, "text_loss": 0.3738477826118469 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007855685452340269, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5824683.0, "repeat_count": 0.0, "routers_loss": 0.0032719180453568697, "skip_count": 0.0, "step": 3612, "text_loss": 0.4054839015007019 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.967420017610802, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0007853144224496118, "loss": 0.0093, "macro_f1": 0.3272727429866791, "num_tokens": 5827860.0, "repeat_count": 1.0, "routers_loss": 0.032171256840229034, "skip_count": 0.0, "step": 3614, "text_loss": 0.18112395703792572 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 16.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0458984375, "learning_rate": 0.0007850601903321716, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5831651.0, "repeat_count": 0.0, "routers_loss": 0.013230946846306324, "skip_count": 1.0, "step": 3616, "text_loss": 0.2698844075202942 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 16.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.000784805848979129, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5834369.0, "repeat_count": 0.0, "routers_loss": 0.00162619655020535, "skip_count": 0.0, "step": 3618, "text_loss": 0.2430931180715561 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 16.995597299677137, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0007845513984879477, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5838102.0, "repeat_count": 1.0, "routers_loss": 0.002781603019684553, "skip_count": 0.0, "step": 3620, "text_loss": 0.4968300759792328 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0007842968389561337, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5841029.0, "repeat_count": 0.0, "routers_loss": 0.0023873315658420324, "skip_count": 0.0, "step": 3622, "text_loss": 0.5842974781990051 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0007840421704812346, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 5845158.0, "repeat_count": 0.0, "routers_loss": 0.00400173757225275, "skip_count": 1.0, "step": 3624, "text_loss": 0.8312450647354126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.00078378739316084, "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 5849175.0, "repeat_count": 0.0, "routers_loss": 0.0004974664188921452, "skip_count": 0.0, "step": 3626, "text_loss": 0.48637253046035767 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 25.0, "epoch": 17.032873495744056, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, "grad_norm": 0.10693359375, "learning_rate": 0.000783532507092581, "loss": 0.0079, "macro_f1": 0.9555556178092957, "num_tokens": 5852020.0, "repeat_count": 1.0, "routers_loss": 0.02555239573121071, "skip_count": 5.0, "step": 3628, "text_loss": 0.5407033562660217 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0007832775123741306, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5854873.0, "repeat_count": 0.0, "routers_loss": 0.0025962977670133114, "skip_count": 0.0, "step": 3630, "text_loss": 0.618230938911438 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.000783022409103203, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5858086.0, "repeat_count": 0.0, "routers_loss": 0.0029271875973790884, "skip_count": 0.0, "step": 3632, "text_loss": 0.21259798109531403 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0007827671973775542, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 5860886.0, "repeat_count": 0.0, "routers_loss": 0.004102068953216076, "skip_count": 0.0, "step": 3634, "text_loss": 0.4991208016872406 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.070443205165834, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0007825118772949819, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5864291.0, "repeat_count": 0.0, "routers_loss": 0.0023497689981013536, "skip_count": 1.0, "step": 3636, "text_loss": 0.3878401517868042 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0007822564489533255, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 5867155.0, "repeat_count": 0.0, "routers_loss": 0.007680345326662064, "skip_count": 2.0, "step": 3638, "text_loss": 0.6132124066352844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.053466796875, "learning_rate": 0.0007820009124504653, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5870325.0, "repeat_count": 0.0, "routers_loss": 0.0008242831099778414, "skip_count": 0.0, "step": 3640, "text_loss": 0.3552473187446594 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.098620487232168, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0007817452678843236, "loss": 0.0073, "macro_f1": 0.6601307392120361, "num_tokens": 5873301.0, "repeat_count": 1.0, "routers_loss": 0.023831043392419815, "skip_count": 2.0, "step": 3642, "text_loss": 0.18363867700099945 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.0007814895153528635, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5876225.0, "repeat_count": 0.0, "routers_loss": 0.001999989850446582, "skip_count": 0.0, "step": 3644, "text_loss": 0.17581747472286224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0007812336549540903, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5879501.0, "repeat_count": 0.0, "routers_loss": 0.001098626758903265, "skip_count": 0.0, "step": 3646, "text_loss": 0.5040884613990784 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.126797769298502, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0007809776867860499, "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 5882608.0, "repeat_count": 0.0, "routers_loss": 0.012210183776915073, "skip_count": 1.0, "step": 3648, "text_loss": 0.27114811539649963 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00078072161094683, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 5886106.0, "repeat_count": 0.0, "routers_loss": 0.005191771313548088, "skip_count": 2.0, "step": 3650, "text_loss": 0.5167917609214783 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0007804654275345591, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5889122.0, "repeat_count": 0.0, "routers_loss": 0.0016411367105320096, "skip_count": 1.0, "step": 3652, "text_loss": 0.7691274285316467 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 17.154975051364836, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0007802091366474074, "loss": 0.005, "macro_f1": 0.8823530077934265, "num_tokens": 5892313.0, "repeat_count": 2.0, "routers_loss": 0.015627093613147736, "skip_count": 1.0, "step": 3654, "text_loss": 0.4646325409412384 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0007799527383835858, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5895577.0, "repeat_count": 0.0, "routers_loss": 0.0009879748104140162, "skip_count": 0.0, "step": 3656, "text_loss": 0.5587969422340393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0007796962328413469, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5898546.0, "repeat_count": 0.0, "routers_loss": 0.004864919930696487, "skip_count": 0.0, "step": 3658, "text_loss": 0.6981375813484192 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.18315233343117, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0007794396201189839, "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 5901618.0, "repeat_count": 1.0, "routers_loss": 0.006617432460188866, "skip_count": 2.0, "step": 3660, "text_loss": 0.22521957755088806 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.192544760786618, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0007791829003148312, "loss": 0.0098, "macro_f1": 0.6601307392120361, "num_tokens": 5904540.0, "repeat_count": 1.0, "routers_loss": 0.0782252699136734, "skip_count": 2.0, "step": 3662, "text_loss": 0.2649642825126648 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0007789260735272647, "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 5907827.0, "repeat_count": 0.0, "routers_loss": 0.0012057392159476876, "skip_count": 0.0, "step": 3664, "text_loss": 0.6943771243095398 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.018310546875, "learning_rate": 0.0007786691398547005, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 5911163.0, "repeat_count": 0.0, "routers_loss": 0.007476957980543375, "skip_count": 2.0, "step": 3666, "text_loss": 0.1502683162689209 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.220722042852948, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0007784120993955962, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5913948.0, "repeat_count": 1.0, "routers_loss": 0.004082011990249157, "skip_count": 0.0, "step": 3668, "text_loss": 0.4127517640590668 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 17.230114470208395, "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 0.0007781549522484503, "loss": 0.0066, "macro_f1": 0.9265305995941162, "num_tokens": 5917360.0, "repeat_count": 3.0, "routers_loss": 0.027505695819854736, "skip_count": 1.0, "step": 3670, "text_loss": 0.23892618715763092 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0007778976985118018, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 5920524.0, "repeat_count": 0.0, "routers_loss": 0.0024977331049740314, "skip_count": 2.0, "step": 3672, "text_loss": 0.5076471567153931 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0576171875, "learning_rate": 0.0007776403382842312, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5923632.0, "repeat_count": 0.0, "routers_loss": 0.0015700991498306394, "skip_count": 0.0, "step": 3674, "text_loss": 0.6287924647331238 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.25829175227473, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05810546875, "learning_rate": 0.0007773828716643591, "loss": 0.0085, "macro_f1": 0.3272727429866791, "num_tokens": 5926438.0, "repeat_count": 1.0, "routers_loss": 0.05108916014432907, "skip_count": 0.0, "step": 3676, "text_loss": 0.26517006754875183 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007771252987508474, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5930081.0, "repeat_count": 0.0, "routers_loss": 0.003439917229115963, "skip_count": 0.0, "step": 3678, "text_loss": 0.5189079642295837 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 17.277076606985617, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.056884765625, "learning_rate": 0.0007768676196423984, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 5933463.0, "repeat_count": 1.0, "routers_loss": 0.001935846172273159, "skip_count": 1.0, "step": 3680, "text_loss": 0.6703575849533081 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 17.286469034341064, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007766098344377553, "loss": 0.0082, "macro_f1": 0.31446540355682373, "num_tokens": 5937098.0, "repeat_count": 0.0, "routers_loss": 0.0384826585650444, "skip_count": 2.0, "step": 3682, "text_loss": 0.6424444913864136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0007763519432357018, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 5940436.0, "repeat_count": 0.0, "routers_loss": 0.0008654671837575734, "skip_count": 0.0, "step": 3684, "text_loss": 0.4189988672733307 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0007760939461350623, "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 5943731.0, "repeat_count": 0.0, "routers_loss": 0.007468715775758028, "skip_count": 2.0, "step": 3686, "text_loss": 0.2875453233718872 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.314646316407398, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0007758358432347019, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5946707.0, "repeat_count": 0.0, "routers_loss": 0.001252831774763763, "skip_count": 0.0, "step": 3688, "text_loss": 0.5093055367469788 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007755776346335259, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5949833.0, "repeat_count": 0.0, "routers_loss": 0.001680848654359579, "skip_count": 0.0, "step": 3690, "text_loss": 0.4031114876270294 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0007753193204304807, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 5953095.0, "repeat_count": 0.0, "routers_loss": 0.0047258250415325165, "skip_count": 2.0, "step": 3692, "text_loss": 0.17632785439491272 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.342823598473732, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0007750609007245524, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 5955971.0, "repeat_count": 2.0, "routers_loss": 0.001980359200388193, "skip_count": 4.0, "step": 3694, "text_loss": 0.3423727750778198 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0007748023756147679, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 5958948.0, "repeat_count": 0.0, "routers_loss": 0.00511702848598361, "skip_count": 0.0, "step": 3696, "text_loss": 0.28279972076416016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007745437452001949, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5961819.0, "repeat_count": 0.0, "routers_loss": 0.0005220443126745522, "skip_count": 0.0, "step": 3698, "text_loss": 0.4793325662612915 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.371000880540066, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0007742850095799408, "loss": 0.0084, "macro_f1": 0.3272727429866791, "num_tokens": 5964625.0, "repeat_count": 1.0, "routers_loss": 0.06411020457744598, "skip_count": 0.0, "step": 3700, "text_loss": 0.2825184464454651 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0007740261688531536, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 5967134.0, "repeat_count": 0.0, "routers_loss": 0.004408109001815319, "skip_count": 3.0, "step": 3702, "text_loss": 0.690429151058197 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0007737672231190215, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 5969831.0, "repeat_count": 0.0, "routers_loss": 0.0006747521692886949, "skip_count": 0.0, "step": 3704, "text_loss": 0.32556024193763733 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.399178162606397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0007735081724767732, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5973015.0, "repeat_count": 0.0, "routers_loss": 0.0020414739847183228, "skip_count": 0.0, "step": 3706, "text_loss": 0.5876469612121582 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.408570589961844, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.072265625, "learning_rate": 0.0007732490170256769, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 5975778.0, "repeat_count": 1.0, "routers_loss": 0.005610425490885973, "skip_count": 0.0, "step": 3708, "text_loss": 0.2968577444553375 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05419921875, "learning_rate": 0.0007729897568650422, "loss": 0.0097, "macro_f1": 0.3333333432674408, "num_tokens": 5979115.0, "repeat_count": 0.0, "routers_loss": 0.001248046406544745, "skip_count": 0.0, "step": 3710, "text_loss": 0.626361608505249 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06787109375, "learning_rate": 0.0007727303920942176, "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 5982213.0, "repeat_count": 0.0, "routers_loss": 0.005791695322841406, "skip_count": 2.0, "step": 3712, "text_loss": 0.4133484661579132 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 17.436747872028178, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.08740234375, "learning_rate": 0.0007724709228125922, "loss": 0.0105, "macro_f1": 0.5492662787437439, "num_tokens": 5984930.0, "repeat_count": 0.0, "routers_loss": 0.02114664763212204, "skip_count": 2.0, "step": 3714, "text_loss": 0.4646461308002472 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.44614029938362, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.0007722113491195952, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 5988017.0, "repeat_count": 2.0, "routers_loss": 0.005913930479437113, "skip_count": 5.0, "step": 3716, "text_loss": 0.15474505722522736 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0007719516711146957, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5991562.0, "repeat_count": 0.0, "routers_loss": 0.0075925313867628574, "skip_count": 2.0, "step": 3718, "text_loss": 0.5293686985969543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.000771691888897403, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 5994675.0, "repeat_count": 0.0, "routers_loss": 0.0012335237115621567, "skip_count": 0.0, "step": 3720, "text_loss": 0.5210637450218201 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.0007714320025672657, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 5999070.0, "repeat_count": 0.0, "routers_loss": 0.010582062415778637, "skip_count": 2.0, "step": 3722, "text_loss": 0.2783571779727936 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.4837100088054, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.000771172012223873, "loss": 0.0078, "macro_f1": 0.6598639488220215, "num_tokens": 6002702.0, "repeat_count": 1.0, "routers_loss": 0.015008784830570221, "skip_count": 3.0, "step": 3724, "text_loss": 0.358705073595047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0007709119179668538, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6005517.0, "repeat_count": 0.0, "routers_loss": 0.00111615180503577, "skip_count": 0.0, "step": 3726, "text_loss": 0.45202162861824036 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 17.50249486351629, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0007706517198958764, "loss": 0.0096, "macro_f1": 0.6595745086669922, "num_tokens": 6009111.0, "repeat_count": 1.0, "routers_loss": 0.05215252563357353, "skip_count": 4.0, "step": 3728, "text_loss": 0.20360413193702698 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0007703914181106497, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6012989.0, "repeat_count": 0.0, "routers_loss": 0.010039499960839748, "skip_count": 3.0, "step": 3730, "text_loss": 0.20334361493587494 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.52127971822718, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0007701310127109211, "loss": 0.0062, "macro_f1": 0.3272727429866791, "num_tokens": 6016420.0, "repeat_count": 0.0, "routers_loss": 0.01090205181390047, "skip_count": 1.0, "step": 3732, "text_loss": 0.47959551215171814 }, { "acc_repeat": 0.0, "acc_skip": 0.800000011920929, "avg_layers": 24.0, "epoch": 17.530672145582624, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 0.0341796875, "learning_rate": 0.0007698705037964791, "loss": 0.0076, "macro_f1": 0.6225374937057495, "num_tokens": 6019551.0, "repeat_count": 0.0, "routers_loss": 0.02677762135863304, "skip_count": 5.0, "step": 3734, "text_loss": 0.2621438801288605 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.540064572938068, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.056640625, "learning_rate": 0.000769609891467151, "loss": 0.0119, "macro_f1": 0.6666666865348816, "num_tokens": 6022262.0, "repeat_count": 1.0, "routers_loss": 0.00460716662928462, "skip_count": 0.0, "step": 3736, "text_loss": 0.3433022201061249 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037109375, "learning_rate": 0.0007693491758228037, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6025723.0, "repeat_count": 0.0, "routers_loss": 0.0036111194640398026, "skip_count": 2.0, "step": 3738, "text_loss": 0.38703784346580505 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007690883569633442, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6028652.0, "repeat_count": 0.0, "routers_loss": 0.003299296135082841, "skip_count": 0.0, "step": 3740, "text_loss": 0.24203069508075714 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0007688274349887188, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 6032280.0, "repeat_count": 0.0, "routers_loss": 0.003173880511894822, "skip_count": 0.0, "step": 3742, "text_loss": 0.2827291488647461 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0007685664099989131, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6035111.0, "repeat_count": 0.0, "routers_loss": 0.0008576177642680705, "skip_count": 0.0, "step": 3744, "text_loss": 0.43613526225090027 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0007683052820939524, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6038428.0, "repeat_count": 0.0, "routers_loss": 0.004335585981607437, "skip_count": 2.0, "step": 3746, "text_loss": 1.0385624170303345 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0007680440513739015, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6041185.0, "repeat_count": 0.0, "routers_loss": 0.0008210531086660922, "skip_count": 0.0, "step": 3748, "text_loss": 0.7070431709289551 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 17.60581156442618, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.056640625, "learning_rate": 0.0007677827179388646, "loss": 0.0089, "macro_f1": 1.0, "num_tokens": 6046333.0, "repeat_count": 1.0, "routers_loss": 0.003778942162171006, "skip_count": 1.0, "step": 3750, "text_loss": 0.3682238757610321 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 17.615203991781627, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.000767521281888985, "loss": 0.009, "macro_f1": 1.0, "num_tokens": 6049528.0, "repeat_count": 1.0, "routers_loss": 0.002767334459349513, "skip_count": 1.0, "step": 3752, "text_loss": 0.7619418501853943 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.0007672597433244455, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 6053202.0, "repeat_count": 0.0, "routers_loss": 0.004796457476913929, "skip_count": 2.0, "step": 3754, "text_loss": 0.4157083034515381 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0007669981023454682, "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 6056609.0, "repeat_count": 0.0, "routers_loss": 0.0013067846884950995, "skip_count": 0.0, "step": 3756, "text_loss": 0.4529118537902832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0007667363590523142, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 6060504.0, "repeat_count": 0.0, "routers_loss": 0.0010285493917763233, "skip_count": 0.0, "step": 3758, "text_loss": 0.8363246321678162 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.055419921875, "learning_rate": 0.0007664745135452844, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 6063526.0, "repeat_count": 0.0, "routers_loss": 0.006289863493293524, "skip_count": 3.0, "step": 3760, "text_loss": 0.5313657522201538 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05517578125, "learning_rate": 0.0007662125659247183, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6067147.0, "repeat_count": 0.0, "routers_loss": 0.0028537956532090902, "skip_count": 0.0, "step": 3762, "text_loss": 0.5668109059333801 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0007659505162909949, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6070350.0, "repeat_count": 0.0, "routers_loss": 0.0026814753655344248, "skip_count": 0.0, "step": 3764, "text_loss": 0.4983512759208679 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056884765625, "learning_rate": 0.0007656883647445318, "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 6073091.0, "repeat_count": 0.0, "routers_loss": 0.005981382913887501, "skip_count": 1.0, "step": 3766, "text_loss": 0.30372318625450134 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0007654261113857863, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6076244.0, "repeat_count": 0.0, "routers_loss": 0.000803640519734472, "skip_count": 0.0, "step": 3768, "text_loss": 0.6100738048553467 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0007651637563152539, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 6078936.0, "repeat_count": 0.0, "routers_loss": 0.0013324898900464177, "skip_count": 0.0, "step": 3770, "text_loss": 0.4733821153640747 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.709128265336073, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 0.0007649012996334701, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6081951.0, "repeat_count": 1.0, "routers_loss": 0.0021543330512940884, "skip_count": 0.0, "step": 3772, "text_loss": 0.6794875860214233 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007646387414410085, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 6085165.0, "repeat_count": 0.0, "routers_loss": 0.0005426189745776355, "skip_count": 0.0, "step": 3774, "text_loss": 0.5886107683181763 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0007643760818384819, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6088370.0, "repeat_count": 0.0, "routers_loss": 0.002537576947361231, "skip_count": 0.0, "step": 3776, "text_loss": 0.23591920733451843 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0007641133209265423, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6092319.0, "repeat_count": 0.0, "routers_loss": 0.002613696036860347, "skip_count": 0.0, "step": 3778, "text_loss": 0.3217754662036896 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052978515625, "learning_rate": 0.0007638504588058796, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 6095799.0, "repeat_count": 0.0, "routers_loss": 0.0007219464750960469, "skip_count": 0.0, "step": 3780, "text_loss": 0.4276983141899109 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 17.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.0007635874955772234, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6098789.0, "repeat_count": 0.0, "routers_loss": 0.005965052172541618, "skip_count": 3.0, "step": 3782, "text_loss": 0.30936646461486816 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.0007633244313413417, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6101631.0, "repeat_count": 0.0, "routers_loss": 0.0007469559786841273, "skip_count": 0.0, "step": 3784, "text_loss": 0.44460123777389526 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.774875256824185, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045654296875, "learning_rate": 0.0007630612661990412, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 6105097.0, "repeat_count": 0.0, "routers_loss": 0.004300760570913553, "skip_count": 1.0, "step": 3786, "text_loss": 0.41950157284736633 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007627980002511672, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6107847.0, "repeat_count": 0.0, "routers_loss": 0.0023050960153341293, "skip_count": 1.0, "step": 3788, "text_loss": 0.48561373353004456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0007625346335986039, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6110546.0, "repeat_count": 0.0, "routers_loss": 0.0018124044872820377, "skip_count": 0.0, "step": 3790, "text_loss": 0.20882295072078705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0007622711663422735, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6113600.0, "repeat_count": 0.0, "routers_loss": 0.0007613401976414025, "skip_count": 0.0, "step": 3792, "text_loss": 0.31751760840415955 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0400390625, "learning_rate": 0.0007620075985831375, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 6116916.0, "repeat_count": 0.0, "routers_loss": 0.005452962126582861, "skip_count": 2.0, "step": 3794, "text_loss": 0.3246645927429199 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 17.82183739360141, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0007617439304221956, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6120056.0, "repeat_count": 2.0, "routers_loss": 0.0043787881731987, "skip_count": 0.0, "step": 3796, "text_loss": 0.4859195947647095 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.0007614801619604856, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6122668.0, "repeat_count": 0.0, "routers_loss": 0.0033891722559928894, "skip_count": 0.0, "step": 3798, "text_loss": 0.48194369673728943 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.840622248312297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0007612162932990845, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6126792.0, "repeat_count": 0.0, "routers_loss": 0.001883238204754889, "skip_count": 0.0, "step": 3800, "text_loss": 0.3740062117576599 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0007609523245391068, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 6129801.0, "repeat_count": 0.0, "routers_loss": 0.00882677361369133, "skip_count": 2.0, "step": 3802, "text_loss": 0.5759486556053162 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007606882557817062, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6133613.0, "repeat_count": 0.0, "routers_loss": 0.009537030011415482, "skip_count": 2.0, "step": 3804, "text_loss": 0.3217554986476898 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 0.0007604240871280742, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6137784.0, "repeat_count": 0.0, "routers_loss": 0.0023913346230983734, "skip_count": 0.0, "step": 3806, "text_loss": 0.3718445599079132 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.878191957734078, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007601598186794407, "loss": 0.0081, "macro_f1": 0.6603773832321167, "num_tokens": 6141356.0, "repeat_count": 1.0, "routers_loss": 0.033796411007642746, "skip_count": 1.0, "step": 3808, "text_loss": 0.2717749774456024 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.000759895450537074, "loss": 0.01, "macro_f1": 0.6666666865348816, "num_tokens": 6144448.0, "repeat_count": 0.0, "routers_loss": 0.0037919918540865183, "skip_count": 2.0, "step": 3810, "text_loss": 0.5935076475143433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007596309828022803, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6147526.0, "repeat_count": 0.0, "routers_loss": 0.0008182782912626863, "skip_count": 0.0, "step": 3812, "text_loss": 0.449336439371109 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 17.906369239800412, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0007593664155764044, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6150620.0, "repeat_count": 1.0, "routers_loss": 0.001734903547912836, "skip_count": 0.0, "step": 3814, "text_loss": 0.6647221446037292 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.915761667155856, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0007591017489608286, "loss": 0.0088, "macro_f1": 0.3272727429866791, "num_tokens": 6153714.0, "repeat_count": 1.0, "routers_loss": 0.04721754416823387, "skip_count": 0.0, "step": 3816, "text_loss": 0.25481200218200684 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007588369830569738, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6156974.0, "repeat_count": 0.0, "routers_loss": 0.0002484306460246444, "skip_count": 0.0, "step": 3818, "text_loss": 0.7195295691490173 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.934546521866746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0007585721179662988, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6159660.0, "repeat_count": 0.0, "routers_loss": 0.0051363613456487656, "skip_count": 2.0, "step": 3820, "text_loss": 0.5073586702346802 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0007583071537903005, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6163146.0, "repeat_count": 0.0, "routers_loss": 0.006719176657497883, "skip_count": 0.0, "step": 3822, "text_loss": 0.6950558423995972 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 17.953331376577633, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0007580420906305136, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 6166257.0, "repeat_count": 1.0, "routers_loss": 0.00871267355978489, "skip_count": 3.0, "step": 3824, "text_loss": 0.2549148201942444 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.0007577769285885109, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 6169624.0, "repeat_count": 0.0, "routers_loss": 0.0015642556827515364, "skip_count": 0.0, "step": 3826, "text_loss": 0.3720305860042572 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0007575116677659029, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6172673.0, "repeat_count": 0.0, "routers_loss": 0.0011551049537956715, "skip_count": 0.0, "step": 3828, "text_loss": 0.6819429397583008 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 17.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0007572463082643377, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 6175414.0, "repeat_count": 0.0, "routers_loss": 0.0008922060951590538, "skip_count": 0.0, "step": 3830, "text_loss": 0.5424665212631226 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 17.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0007569808501855023, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6178701.0, "repeat_count": 0.0, "routers_loss": 0.004167596809566021, "skip_count": 1.0, "step": 3832, "text_loss": 0.4429764151573181 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04931640625, "learning_rate": 0.00075671529363112, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 6183036.0, "repeat_count": 0.0, "routers_loss": 0.0008732969872653484, "skip_count": 0.0, "step": 3834, "text_loss": 0.8015334010124207 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007564496387029531, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6186325.0, "repeat_count": 0.0, "routers_loss": 0.0021374202333390713, "skip_count": 1.0, "step": 3836, "text_loss": 0.4233771562576294 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.01878485471089, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.000756183885502801, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6189919.0, "repeat_count": 1.0, "routers_loss": 0.004017227329313755, "skip_count": 0.0, "step": 3838, "text_loss": 0.33691394329071045 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.018310546875, "learning_rate": 0.0007559180341325005, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6193412.0, "repeat_count": 0.0, "routers_loss": 0.0013120946241542697, "skip_count": 0.0, "step": 3840, "text_loss": 0.14970099925994873 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 18.037569709421778, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.031982421875, "learning_rate": 0.0007556520846939265, "loss": 0.0061, "macro_f1": 0.5492662787437439, "num_tokens": 6196588.0, "repeat_count": 0.0, "routers_loss": 0.011793316341936588, "skip_count": 2.0, "step": 3842, "text_loss": 0.2714047133922577 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 18.046962136777225, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0007553860372889914, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 6200841.0, "repeat_count": 1.0, "routers_loss": 0.019968654960393906, "skip_count": 4.0, "step": 3844, "text_loss": 0.23680976033210754 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 18.05635456413267, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.052490234375, "learning_rate": 0.0007551198920196452, "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 6203797.0, "repeat_count": 0.0, "routers_loss": 0.013615630567073822, "skip_count": 2.0, "step": 3846, "text_loss": 0.25839608907699585 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0546875, "learning_rate": 0.000754853648987875, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6206790.0, "repeat_count": 0.0, "routers_loss": 0.002420815173536539, "skip_count": 1.0, "step": 3848, "text_loss": 0.5358025431632996 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 18.07513941884356, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.032470703125, "learning_rate": 0.0007545873082957057, "loss": 0.0072, "macro_f1": 0.9265305995941162, "num_tokens": 6209791.0, "repeat_count": 1.0, "routers_loss": 0.018236197531223297, "skip_count": 3.0, "step": 3850, "text_loss": 0.1463700383901596 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 18.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0007543208700451998, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6212792.0, "repeat_count": 0.0, "routers_loss": 0.006242573726922274, "skip_count": 3.0, "step": 3852, "text_loss": 0.9441591501235962 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.093924273554446, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0007540543343384565, "loss": 0.0062, "macro_f1": 0.3272727429866791, "num_tokens": 6215747.0, "repeat_count": 0.0, "routers_loss": 0.01451140083372593, "skip_count": 1.0, "step": 3854, "text_loss": 0.41610902547836304 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007537877012776132, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6218593.0, "repeat_count": 0.0, "routers_loss": 0.00037674361374229193, "skip_count": 0.0, "step": 3856, "text_loss": 0.6048852205276489 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.112709128265337, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0007535209709648439, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 6221315.0, "repeat_count": 1.0, "routers_loss": 0.005776284262537956, "skip_count": 3.0, "step": 3858, "text_loss": 0.35627537965774536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0007532541435023605, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6225012.0, "repeat_count": 0.0, "routers_loss": 0.0009280376834794879, "skip_count": 0.0, "step": 3860, "text_loss": 0.6440183520317078 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0007529872189924114, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6227650.0, "repeat_count": 0.0, "routers_loss": 0.0009876530384644866, "skip_count": 0.0, "step": 3862, "text_loss": 0.35507893562316895 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.14088641033167, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.0007527201975372827, "loss": 0.0045, "macro_f1": 0.6603773832321167, "num_tokens": 6230557.0, "repeat_count": 1.0, "routers_loss": 0.013780162669718266, "skip_count": 1.0, "step": 3864, "text_loss": 0.38958442211151123 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 18.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04638671875, "learning_rate": 0.0007524530792392977, "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 6233371.0, "repeat_count": 0.0, "routers_loss": 0.004849869292229414, "skip_count": 3.0, "step": 3866, "text_loss": 0.3826720714569092 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0191650390625, "learning_rate": 0.0007521858642008163, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6236770.0, "repeat_count": 0.0, "routers_loss": 0.008618295192718506, "skip_count": 1.0, "step": 3868, "text_loss": 0.3596078157424927 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.0007519185525242363, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6239661.0, "repeat_count": 0.0, "routers_loss": 0.0013421972980722785, "skip_count": 0.0, "step": 3870, "text_loss": 0.5585550665855408 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 0.0007516511443119916, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6242459.0, "repeat_count": 0.0, "routers_loss": 0.0038009448908269405, "skip_count": 1.0, "step": 3872, "text_loss": 0.4418395757675171 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.187848547108892, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0007513836396665534, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 6245489.0, "repeat_count": 1.0, "routers_loss": 0.002785376040264964, "skip_count": 2.0, "step": 3874, "text_loss": 0.551510751247406 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.19724097446434, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0234375, "learning_rate": 0.0007511160386904305, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6249014.0, "repeat_count": 0.0, "routers_loss": 0.0021424589212983847, "skip_count": 1.0, "step": 3876, "text_loss": 1.0502676963806152 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0007508483414861679, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6252357.0, "repeat_count": 0.0, "routers_loss": 0.0085759861394763, "skip_count": 1.0, "step": 3878, "text_loss": 0.49212515354156494 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0007505805481563477, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6254975.0, "repeat_count": 0.0, "routers_loss": 0.0010723904706537724, "skip_count": 0.0, "step": 3880, "text_loss": 0.7022985816001892 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.225418256530673, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0007503126588035887, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 6258001.0, "repeat_count": 1.0, "routers_loss": 0.012809890322387218, "skip_count": 2.0, "step": 3882, "text_loss": 0.1829151213169098 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 0.0007500446735305466, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 6261795.0, "repeat_count": 0.0, "routers_loss": 0.0026790346018970013, "skip_count": 1.0, "step": 3884, "text_loss": 0.20436066389083862 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.24420311124156, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.000749776592439914, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 6265585.0, "repeat_count": 1.0, "routers_loss": 0.005243788007646799, "skip_count": 2.0, "step": 3886, "text_loss": 0.4479229748249054 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.00074950841563442, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6269039.0, "repeat_count": 0.0, "routers_loss": 0.007998534478247166, "skip_count": 1.0, "step": 3888, "text_loss": 0.2154676914215088 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0007492401432168303, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6272315.0, "repeat_count": 0.0, "routers_loss": 0.004648822825402021, "skip_count": 1.0, "step": 3890, "text_loss": 0.3375042676925659 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.272380393307895, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0007489717752899477, "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 6275342.0, "repeat_count": 0.0, "routers_loss": 0.012154200114309788, "skip_count": 1.0, "step": 3892, "text_loss": 0.1964082419872284 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0267333984375, "learning_rate": 0.000748703311956611, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6278700.0, "repeat_count": 1.0, "routers_loss": 0.004610476549714804, "skip_count": 2.0, "step": 3894, "text_loss": 0.26545581221580505 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06201171875, "learning_rate": 0.0007484347533196961, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 6281864.0, "repeat_count": 0.0, "routers_loss": 0.0075586591847240925, "skip_count": 2.0, "step": 3896, "text_loss": 0.3106999397277832 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02099609375, "learning_rate": 0.0007481660994821151, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6284676.0, "repeat_count": 0.0, "routers_loss": 0.007845268584787846, "skip_count": 1.0, "step": 3898, "text_loss": 0.4094304144382477 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.309950102729672, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0007478973505468165, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 6287470.0, "repeat_count": 1.0, "routers_loss": 0.011116391979157925, "skip_count": 2.0, "step": 3900, "text_loss": 0.1838909536600113 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.31934253008512, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0007476285066167857, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 6290432.0, "repeat_count": 1.0, "routers_loss": 0.004599364474415779, "skip_count": 0.0, "step": 3902, "text_loss": 0.25872838497161865 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.0007473595677950439, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 6293557.0, "repeat_count": 0.0, "routers_loss": 0.0016367282951250672, "skip_count": 1.0, "step": 3904, "text_loss": 0.5272360444068909 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0007470905341846492, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 6295979.0, "repeat_count": 0.0, "routers_loss": 0.0004760588926728815, "skip_count": 0.0, "step": 3906, "text_loss": 0.666959822177887 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007468214058886956, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6299215.0, "repeat_count": 0.0, "routers_loss": 0.000524883100297302, "skip_count": 0.0, "step": 3908, "text_loss": 0.5144801139831543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0007465521830103137, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6302320.0, "repeat_count": 0.0, "routers_loss": 0.0016085522947832942, "skip_count": 0.0, "step": 3910, "text_loss": 0.14342890679836273 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007462828656526702, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6305212.0, "repeat_count": 0.0, "routers_loss": 0.002720315707847476, "skip_count": 2.0, "step": 3912, "text_loss": 0.31109121441841125 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.375697094217788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06884765625, "learning_rate": 0.0007460134539189681, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 6308964.0, "repeat_count": 0.0, "routers_loss": 0.0010418406454846263, "skip_count": 1.0, "step": 3914, "text_loss": 0.5662030577659607 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.38508952157323, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0007457439479124459, "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 6313195.0, "repeat_count": 0.0, "routers_loss": 0.0020303844939917326, "skip_count": 0.0, "step": 3916, "text_loss": 0.6358339190483093 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.394481948928675, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0007454743477363797, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6315949.0, "repeat_count": 0.0, "routers_loss": 0.0006592223653569818, "skip_count": 0.0, "step": 3918, "text_loss": 0.35648423433303833 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.403874376284122, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0007452046534940803, "loss": 0.0075, "macro_f1": 0.6603773832321167, "num_tokens": 6319024.0, "repeat_count": 1.0, "routers_loss": 0.024555351585149765, "skip_count": 1.0, "step": 3920, "text_loss": 0.21955153346061707 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0007449348652888952, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6321633.0, "repeat_count": 0.0, "routers_loss": 0.003606822807341814, "skip_count": 1.0, "step": 3922, "text_loss": 0.6079489588737488 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007446649832242075, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 6325209.0, "repeat_count": 0.0, "routers_loss": 0.0035831446293741465, "skip_count": 1.0, "step": 3924, "text_loss": 0.2774808406829834 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0007443950074034368, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6327822.0, "repeat_count": 0.0, "routers_loss": 0.006809544749557972, "skip_count": 2.0, "step": 3926, "text_loss": 0.48236769437789917 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.4414440857059, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.049072265625, "learning_rate": 0.0007441249379300381, "loss": 0.007, "macro_f1": 0.6601307392120361, "num_tokens": 6331662.0, "repeat_count": 1.0, "routers_loss": 0.023832591250538826, "skip_count": 2.0, "step": 3928, "text_loss": 0.7287537455558777 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.450836513061343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0007438547749075028, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 6335801.0, "repeat_count": 1.0, "routers_loss": 0.011755098588764668, "skip_count": 3.0, "step": 3930, "text_loss": 0.17253030836582184 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.46022894041679, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0007435845184393577, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6338747.0, "repeat_count": 1.0, "routers_loss": 0.005972472485154867, "skip_count": 0.0, "step": 3932, "text_loss": 0.6400216817855835 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0007433141686291657, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 6342772.0, "repeat_count": 0.0, "routers_loss": 0.0030393085908144712, "skip_count": 1.0, "step": 3934, "text_loss": 0.6865074038505554 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 0.0007430437255805252, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6345957.0, "repeat_count": 0.0, "routers_loss": 0.0006984061910770833, "skip_count": 0.0, "step": 3936, "text_loss": 0.40398702025413513 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.488406222483125, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0007427731893970706, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 6349162.0, "repeat_count": 1.0, "routers_loss": 0.005219762213528156, "skip_count": 0.0, "step": 3938, "text_loss": 0.5951031446456909 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 18.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007425025601824717, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 6352655.0, "repeat_count": 0.0, "routers_loss": 0.015575960278511047, "skip_count": 3.0, "step": 3940, "text_loss": 0.26689088344573975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007422318380404346, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6355890.0, "repeat_count": 0.0, "routers_loss": 0.0012208883417770267, "skip_count": 0.0, "step": 3942, "text_loss": 0.570725679397583 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.516583504549455, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0007419610230746999, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6358891.0, "repeat_count": 1.0, "routers_loss": 0.0029412026051431894, "skip_count": 0.0, "step": 3944, "text_loss": 0.5521301031112671 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0007416901153890448, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6361586.0, "repeat_count": 0.0, "routers_loss": 0.0010283910669386387, "skip_count": 0.0, "step": 3946, "text_loss": 0.4046417772769928 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0007414191150872818, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6364954.0, "repeat_count": 0.0, "routers_loss": 0.008222512900829315, "skip_count": 2.0, "step": 3948, "text_loss": 0.2803446352481842 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0007411480222732583, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6367660.0, "repeat_count": 0.0, "routers_loss": 0.001304348581470549, "skip_count": 0.0, "step": 3950, "text_loss": 0.45553359389305115 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.0007408768370508576, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6371585.0, "repeat_count": 0.0, "routers_loss": 0.0016345062758773565, "skip_count": 0.0, "step": 3952, "text_loss": 0.25424402952194214 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.0007406055595239986, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6374365.0, "repeat_count": 0.0, "routers_loss": 0.0005097290268167853, "skip_count": 0.0, "step": 3954, "text_loss": 0.5856026411056519 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.060546875, "learning_rate": 0.0007403341897966356, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6377335.0, "repeat_count": 0.0, "routers_loss": 0.002482263371348381, "skip_count": 1.0, "step": 3956, "text_loss": 0.5145615339279175 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0007400627279727574, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 6380799.0, "repeat_count": 0.0, "routers_loss": 0.0011743451468646526, "skip_count": 0.0, "step": 3958, "text_loss": 0.31868961453437805 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.591722923393014, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0286865234375, "learning_rate": 0.0007397911741563892, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6383963.0, "repeat_count": 1.0, "routers_loss": 0.009861881844699383, "skip_count": 0.0, "step": 3960, "text_loss": 0.21192194521427155 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.601115350748458, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0007395195284515905, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 6387410.0, "repeat_count": 1.0, "routers_loss": 0.004189098719507456, "skip_count": 0.0, "step": 3962, "text_loss": 0.5809708833694458 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.610507778103905, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.0007392477909624567, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6390670.0, "repeat_count": 0.0, "routers_loss": 0.001853612600825727, "skip_count": 0.0, "step": 3964, "text_loss": 0.48985618352890015 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.61990020545935, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 0.0007389759617931182, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6393609.0, "repeat_count": 1.0, "routers_loss": 0.003303771372884512, "skip_count": 0.0, "step": 3966, "text_loss": 0.28729453682899475 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 18.629292632814792, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.10595703125, "learning_rate": 0.0007387040410477404, "loss": 0.0058, "macro_f1": 0.9452888369560242, "num_tokens": 6396608.0, "repeat_count": 1.0, "routers_loss": 0.01791577786207199, "skip_count": 4.0, "step": 3968, "text_loss": 0.30386820435523987 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 0.0007384320288305235, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6399793.0, "repeat_count": 0.0, "routers_loss": 0.0005771282012574375, "skip_count": 0.0, "step": 3970, "text_loss": 0.47285011410713196 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0007381599252457037, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6403365.0, "repeat_count": 0.0, "routers_loss": 0.003010645741596818, "skip_count": 0.0, "step": 3972, "text_loss": 0.5313063859939575 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.657469914881126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.000737887730397551, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6406205.0, "repeat_count": 1.0, "routers_loss": 0.006457438692450523, "skip_count": 0.0, "step": 3974, "text_loss": 0.2323843240737915 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.666862342236573, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.0007376154443903713, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6409552.0, "repeat_count": 1.0, "routers_loss": 0.010693981312215328, "skip_count": 0.0, "step": 3976, "text_loss": 0.6304101943969727 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.676254769592017, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007373430673285051, "loss": 0.008, "macro_f1": 0.3272727429866791, "num_tokens": 6412386.0, "repeat_count": 1.0, "routers_loss": 0.03116440214216709, "skip_count": 0.0, "step": 3978, "text_loss": 0.23448467254638672 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.68564719694746, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10009765625, "learning_rate": 0.0007370705993163278, "loss": 0.0111, "macro_f1": 0.3272727429866791, "num_tokens": 6416054.0, "repeat_count": 1.0, "routers_loss": 0.011973714455962181, "skip_count": 0.0, "step": 3980, "text_loss": 0.6371755599975586 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.695039624302908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.05224609375, "learning_rate": 0.0007367980404582497, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 6419238.0, "repeat_count": 1.0, "routers_loss": 0.005117347463965416, "skip_count": 2.0, "step": 3982, "text_loss": 0.19822923839092255 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0007365253908587158, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6422122.0, "repeat_count": 0.0, "routers_loss": 0.0010648667812347412, "skip_count": 0.0, "step": 3984, "text_loss": 0.566700279712677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.0007362526506222058, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6425313.0, "repeat_count": 0.0, "routers_loss": 0.005726494826376438, "skip_count": 0.0, "step": 3986, "text_loss": 0.6568437814712524 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 18.723216906369238, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0007359798198532343, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 6428422.0, "repeat_count": 1.0, "routers_loss": 0.004504100419580936, "skip_count": 0.0, "step": 3988, "text_loss": 0.598754346370697 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0007357068986563509, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 6431512.0, "repeat_count": 0.0, "routers_loss": 0.0019837068393826485, "skip_count": 1.0, "step": 3990, "text_loss": 0.7152895927429199 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0007354338871361393, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6434358.0, "repeat_count": 0.0, "routers_loss": 0.0026031541638076305, "skip_count": 1.0, "step": 3992, "text_loss": 0.4986513555049896 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.751394188435572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.000735160785397218, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6438175.0, "repeat_count": 0.0, "routers_loss": 0.0024831905029714108, "skip_count": 2.0, "step": 3994, "text_loss": 0.4406205713748932 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007348875935442401, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6441228.0, "repeat_count": 0.0, "routers_loss": 0.0008635876583866775, "skip_count": 0.0, "step": 3996, "text_loss": 0.48884135484695435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007346143116818932, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6444318.0, "repeat_count": 0.0, "routers_loss": 0.004007008858025074, "skip_count": 0.0, "step": 3998, "text_loss": 0.6669428944587708 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0007343409399148994, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6448317.0, "repeat_count": 0.0, "routers_loss": 0.0031380734872072935, "skip_count": 0.0, "step": 4000, "text_loss": 0.6468493938446045 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.788963897857354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.0007340674783480154, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 6451673.0, "repeat_count": 0.0, "routers_loss": 0.004996029660105705, "skip_count": 0.0, "step": 4002, "text_loss": 0.28135430812835693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.798356325212797, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007337939270860323, "loss": 0.009, "macro_f1": 0.3272727429866791, "num_tokens": 6456372.0, "repeat_count": 1.0, "routers_loss": 0.03784399852156639, "skip_count": 0.0, "step": 4004, "text_loss": 0.41668644547462463 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007335202862337753, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6459047.0, "repeat_count": 0.0, "routers_loss": 0.0011750755365937948, "skip_count": 0.0, "step": 4006, "text_loss": 0.6853910684585571 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 18.817141179923688, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.05908203125, "learning_rate": 0.000733246555896104, "loss": 0.0062, "macro_f1": 0.9452888369560242, "num_tokens": 6462390.0, "repeat_count": 1.0, "routers_loss": 0.01630394533276558, "skip_count": 4.0, "step": 4008, "text_loss": 0.7110592126846313 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 0.0007329727361779124, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6466057.0, "repeat_count": 0.0, "routers_loss": 0.0052404399029910564, "skip_count": 2.0, "step": 4010, "text_loss": 0.13856995105743408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.000732698827184129, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6468878.0, "repeat_count": 0.0, "routers_loss": 0.002138581359758973, "skip_count": 0.0, "step": 4012, "text_loss": 0.3999565839767456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.000732424829019716, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6472364.0, "repeat_count": 0.0, "routers_loss": 0.0037466560024768114, "skip_count": 0.0, "step": 4014, "text_loss": 0.28161346912384033 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0007321507417896699, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 6475379.0, "repeat_count": 0.0, "routers_loss": 0.0010469373082742095, "skip_count": 0.0, "step": 4016, "text_loss": 1.0490952730178833 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06591796875, "learning_rate": 0.0007318765655990218, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6478585.0, "repeat_count": 0.0, "routers_loss": 0.009968385100364685, "skip_count": 2.0, "step": 4018, "text_loss": 0.31696680188179016 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 18.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0007316023005528362, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 6484153.0, "repeat_count": 0.0, "routers_loss": 0.002349073765799403, "skip_count": 1.0, "step": 4020, "text_loss": 0.30981555581092834 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 18.8828881714118, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0299072265625, "learning_rate": 0.0007313279467562124, "loss": 0.0053, "macro_f1": 0.9452888369560242, "num_tokens": 6487029.0, "repeat_count": 1.0, "routers_loss": 0.011854278855025768, "skip_count": 4.0, "step": 4022, "text_loss": 0.9689550399780273 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.892280598767243, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007310535043142829, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 6490315.0, "repeat_count": 1.0, "routers_loss": 0.00908346101641655, "skip_count": 3.0, "step": 4024, "text_loss": 0.1705625057220459 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039306640625, "learning_rate": 0.0007307789733322146, "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 6493921.0, "repeat_count": 0.0, "routers_loss": 0.0007360641611739993, "skip_count": 0.0, "step": 4026, "text_loss": 0.6252996325492859 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.0007305043539152083, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6496689.0, "repeat_count": 0.0, "routers_loss": 0.0017757206223905087, "skip_count": 0.0, "step": 4028, "text_loss": 0.40533265471458435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.000730229646168499, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6500090.0, "repeat_count": 0.0, "routers_loss": 0.0022657213266938925, "skip_count": 0.0, "step": 4030, "text_loss": 0.25954708456993103 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0007299548501973548, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6503023.0, "repeat_count": 0.0, "routers_loss": 0.0021747269202023745, "skip_count": 0.0, "step": 4032, "text_loss": 0.6223418712615967 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 18.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0007296799661070782, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6506382.0, "repeat_count": 0.0, "routers_loss": 0.006400502752512693, "skip_count": 4.0, "step": 4034, "text_loss": 0.6873653531074524 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.94863516289991, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0007294049940030055, "loss": 0.0065, "macro_f1": 0.3272727429866791, "num_tokens": 6509194.0, "repeat_count": 0.0, "routers_loss": 0.0197185929864645, "skip_count": 1.0, "step": 4036, "text_loss": 0.16156800091266632 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0007291299339905059, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6512271.0, "repeat_count": 0.0, "routers_loss": 0.0009541353792883456, "skip_count": 0.0, "step": 4038, "text_loss": 0.5038442015647888 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.967420017610802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0007288547861749838, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 6516403.0, "repeat_count": 0.0, "routers_loss": 0.008226391859352589, "skip_count": 2.0, "step": 4040, "text_loss": 0.3706657588481903 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.976812444966246, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0007285795506618758, "loss": 0.0063, "macro_f1": 0.3272727429866791, "num_tokens": 6519310.0, "repeat_count": 0.0, "routers_loss": 0.017001887783408165, "skip_count": 1.0, "step": 4042, "text_loss": 0.24296723306179047 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 18.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0615234375, "learning_rate": 0.0007283042275566528, "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 6521979.0, "repeat_count": 0.0, "routers_loss": 0.01666323095560074, "skip_count": 2.0, "step": 4044, "text_loss": 0.36904850602149963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 18.995597299677137, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 0.0007280288169648192, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 6524976.0, "repeat_count": 0.0, "routers_loss": 0.0007593175978399813, "skip_count": 0.0, "step": 4046, "text_loss": 0.7312731146812439 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 19.00469621367772, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0007277533189919127, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 6528638.0, "repeat_count": 1.0, "routers_loss": 0.005652119871228933, "skip_count": 1.0, "step": 4048, "text_loss": 0.23326151072978973 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0286865234375, "learning_rate": 0.0007274777337435046, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6532193.0, "repeat_count": 0.0, "routers_loss": 0.010509157553315163, "skip_count": 2.0, "step": 4050, "text_loss": 0.23918013274669647 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037841796875, "learning_rate": 0.0007272020613251999, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 6534994.0, "repeat_count": 0.0, "routers_loss": 0.002153293928131461, "skip_count": 0.0, "step": 4052, "text_loss": 0.5890526175498962 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.032873495744056, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0007269263018426367, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 6537469.0, "repeat_count": 1.0, "routers_loss": 0.0018494052346795797, "skip_count": 2.0, "step": 4054, "text_loss": 0.36058738827705383 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0007266504554014866, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6541271.0, "repeat_count": 0.0, "routers_loss": 0.0007579320226795971, "skip_count": 0.0, "step": 4056, "text_loss": 0.4089007079601288 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.051658350454947, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0007263745221074545, "loss": 0.0086, "macro_f1": 0.6601307392120361, "num_tokens": 6544293.0, "repeat_count": 1.0, "routers_loss": 0.06202420964837074, "skip_count": 2.0, "step": 4058, "text_loss": 0.2226305454969406 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 19.06105077781039, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0286865234375, "learning_rate": 0.0007260985020662784, "loss": 0.0049, "macro_f1": 0.5934640765190125, "num_tokens": 6547640.0, "repeat_count": 0.0, "routers_loss": 0.044639844447374344, "skip_count": 3.0, "step": 4060, "text_loss": 0.23004353046417236 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 19.070443205165834, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0206298828125, "learning_rate": 0.0007258223953837298, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6550840.0, "repeat_count": 1.0, "routers_loss": 0.004215611144900322, "skip_count": 0.0, "step": 4062, "text_loss": 0.2891770601272583 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0007255462021656132, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6554122.0, "repeat_count": 0.0, "routers_loss": 0.0011056234361603856, "skip_count": 0.0, "step": 4064, "text_loss": 0.7485370635986328 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007252699225177666, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6557138.0, "repeat_count": 0.0, "routers_loss": 0.008258933201432228, "skip_count": 2.0, "step": 4066, "text_loss": 0.25219282507896423 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.0007249935565460606, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6560654.0, "repeat_count": 0.0, "routers_loss": 0.005102175287902355, "skip_count": 0.0, "step": 4068, "text_loss": 0.5553314089775085 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0007247171043563994, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6563814.0, "repeat_count": 0.0, "routers_loss": 0.01283820066601038, "skip_count": 2.0, "step": 4070, "text_loss": 0.15729956328868866 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0211181640625, "learning_rate": 0.0007244405660547199, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6567060.0, "repeat_count": 0.0, "routers_loss": 0.0009684927063062787, "skip_count": 0.0, "step": 4072, "text_loss": 0.3725031912326813 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01953125, "learning_rate": 0.000724163941746992, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 6571608.0, "repeat_count": 0.0, "routers_loss": 0.0007890827837400138, "skip_count": 0.0, "step": 4074, "text_loss": 0.8438301682472229 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 19.13619019665395, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0007238872315392189, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 6575214.0, "repeat_count": 1.0, "routers_loss": 0.0040600355714559555, "skip_count": 1.0, "step": 4076, "text_loss": 0.5923112034797668 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.0007236104355374363, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 6578383.0, "repeat_count": 0.0, "routers_loss": 0.0024899677373468876, "skip_count": 2.0, "step": 4078, "text_loss": 0.20302526652812958 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05517578125, "learning_rate": 0.000723333553847713, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6582175.0, "repeat_count": 0.0, "routers_loss": 0.006120906211435795, "skip_count": 2.0, "step": 4080, "text_loss": 0.5400223731994629 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.0007230565865761504, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6585516.0, "repeat_count": 0.0, "routers_loss": 0.0029941233806312084, "skip_count": 0.0, "step": 4082, "text_loss": 0.19460804760456085 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07373046875, "learning_rate": 0.0007227795338288831, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 6588266.0, "repeat_count": 0.0, "routers_loss": 0.009357884526252747, "skip_count": 2.0, "step": 4084, "text_loss": 0.35237613320350647 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0007225023957120782, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 6591009.0, "repeat_count": 0.0, "routers_loss": 0.0023083325941115618, "skip_count": 2.0, "step": 4086, "text_loss": 0.4336731433868408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0211181640625, "learning_rate": 0.0007222251723319356, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 6594472.0, "repeat_count": 0.0, "routers_loss": 0.0008416616474278271, "skip_count": 0.0, "step": 4088, "text_loss": 0.6390535831451416 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045166015625, "learning_rate": 0.0007219478637946877, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6597477.0, "repeat_count": 0.0, "routers_loss": 0.004390760324895382, "skip_count": 1.0, "step": 4090, "text_loss": 0.525839626789093 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0007216704702065997, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6600431.0, "repeat_count": 0.0, "routers_loss": 0.0010311100631952286, "skip_count": 0.0, "step": 4092, "text_loss": 0.5310423374176025 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0007213929916739695, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 6603899.0, "repeat_count": 0.0, "routers_loss": 0.0032497600186616182, "skip_count": 1.0, "step": 4094, "text_loss": 0.2775326073169708 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.230114470208395, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.044189453125, "learning_rate": 0.000721115428303127, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 6606544.0, "repeat_count": 1.0, "routers_loss": 0.004692315589636564, "skip_count": 3.0, "step": 4096, "text_loss": 0.6667124032974243 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0007208377802004353, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6610097.0, "repeat_count": 0.0, "routers_loss": 0.0007263485458679497, "skip_count": 0.0, "step": 4098, "text_loss": 0.6916406750679016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0007205600474722897, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 6613836.0, "repeat_count": 0.0, "routers_loss": 0.0017989488551393151, "skip_count": 0.0, "step": 4100, "text_loss": 0.5257929563522339 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 0.000720282230225118, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6616780.0, "repeat_count": 0.0, "routers_loss": 0.0011308686807751656, "skip_count": 1.0, "step": 4102, "text_loss": 0.4410906732082367 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0007200043285653799, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6620110.0, "repeat_count": 0.0, "routers_loss": 0.002058265497907996, "skip_count": 2.0, "step": 4104, "text_loss": 0.8581191897392273 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 19.277076606985617, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007197263425995681, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 6622585.0, "repeat_count": 1.0, "routers_loss": 0.0017528717871755362, "skip_count": 0.0, "step": 4106, "text_loss": 0.5000449419021606 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.286469034341064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0007194482724342075, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6626356.0, "repeat_count": 0.0, "routers_loss": 0.0021995846182107925, "skip_count": 0.0, "step": 4108, "text_loss": 0.401346892118454 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0007191701181758547, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6629738.0, "repeat_count": 0.0, "routers_loss": 0.0014869922306388617, "skip_count": 0.0, "step": 4110, "text_loss": 0.9598422050476074 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0007188918799310993, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 6632807.0, "repeat_count": 0.0, "routers_loss": 0.0012853415682911873, "skip_count": 0.0, "step": 4112, "text_loss": 0.3996548354625702 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.314646316407398, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029296875, "learning_rate": 0.0007186135578065627, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6636227.0, "repeat_count": 0.0, "routers_loss": 0.0009887361666187644, "skip_count": 0.0, "step": 4114, "text_loss": 0.4127283990383148 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007183351519088982, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6639443.0, "repeat_count": 0.0, "routers_loss": 0.006282114889472723, "skip_count": 1.0, "step": 4116, "text_loss": 0.20028606057167053 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.333431171118285, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.061767578125, "learning_rate": 0.0007180566623447917, "loss": 0.0114, "macro_f1": 0.6603773832321167, "num_tokens": 6642127.0, "repeat_count": 1.0, "routers_loss": 0.008101986721158028, "skip_count": 0.0, "step": 4118, "text_loss": 0.763931155204773 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0007177780892209607, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6645376.0, "repeat_count": 0.0, "routers_loss": 0.001953610684722662, "skip_count": 0.0, "step": 4120, "text_loss": 0.42317715287208557 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0007174994326441551, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6648150.0, "repeat_count": 0.0, "routers_loss": 0.003279355587437749, "skip_count": 0.0, "step": 4122, "text_loss": 0.19656142592430115 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0007172206927211567, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6650935.0, "repeat_count": 0.0, "routers_loss": 0.0032076311763375998, "skip_count": 0.0, "step": 4124, "text_loss": 0.13608409464359283 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0007169418695587791, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6654464.0, "repeat_count": 0.0, "routers_loss": 0.004065621178597212, "skip_count": 2.0, "step": 4126, "text_loss": 0.4882086217403412 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0007166629632638678, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6657749.0, "repeat_count": 0.0, "routers_loss": 0.0009243001695722342, "skip_count": 0.0, "step": 4128, "text_loss": 0.31632331013679504 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.0007163839739433003, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6660997.0, "repeat_count": 0.0, "routers_loss": 0.0018459554994478822, "skip_count": 0.0, "step": 4130, "text_loss": 0.6123947501182556 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.399178162606397, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0007161049017039857, "loss": 0.0073, "macro_f1": 0.8820862174034119, "num_tokens": 6663542.0, "repeat_count": 2.0, "routers_loss": 0.030032536014914513, "skip_count": 2.0, "step": 4132, "text_loss": 0.6985659003257751 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 0.0007158257466528652, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6666178.0, "repeat_count": 0.0, "routers_loss": 0.0013813833938911557, "skip_count": 0.0, "step": 4134, "text_loss": 0.38380664587020874 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 19.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021484375, "learning_rate": 0.0007155465088969114, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 6668852.0, "repeat_count": 0.0, "routers_loss": 0.00513424864038825, "skip_count": 3.0, "step": 4136, "text_loss": 0.49724283814430237 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0007152671885431288, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6671430.0, "repeat_count": 0.0, "routers_loss": 0.0005165594047866762, "skip_count": 0.0, "step": 4138, "text_loss": 0.666959822177887 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047119140625, "learning_rate": 0.0007149877856985535, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6675215.0, "repeat_count": 0.0, "routers_loss": 0.001685218419879675, "skip_count": 0.0, "step": 4140, "text_loss": 0.3127259612083435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.000714708300470253, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6678505.0, "repeat_count": 0.0, "routers_loss": 0.004025314934551716, "skip_count": 0.0, "step": 4142, "text_loss": 0.3179470896720886 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 19.455532726739065, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0007144287329653269, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 6681127.0, "repeat_count": 1.0, "routers_loss": 0.005965690594166517, "skip_count": 0.0, "step": 4144, "text_loss": 0.3862907886505127 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.464925154094512, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0007141490832909058, "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 6683968.0, "repeat_count": 0.0, "routers_loss": 0.012896374799311161, "skip_count": 1.0, "step": 4146, "text_loss": 0.48156118392944336 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0007138693515541519, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6687196.0, "repeat_count": 0.0, "routers_loss": 0.0006367767928168178, "skip_count": 1.0, "step": 4148, "text_loss": 0.676702082157135 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 19.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030029296875, "learning_rate": 0.0007135895378622592, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 6689972.0, "repeat_count": 0.0, "routers_loss": 0.004532640799880028, "skip_count": 3.0, "step": 4150, "text_loss": 0.5865558981895447 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.493102436160846, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007133096423224526, "loss": 0.0081, "macro_f1": 0.3272727429866791, "num_tokens": 6693568.0, "repeat_count": 1.0, "routers_loss": 0.0377078577876091, "skip_count": 0.0, "step": 4152, "text_loss": 0.2790502607822418 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056640625, "learning_rate": 0.0007130296650419885, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6696468.0, "repeat_count": 0.0, "routers_loss": 0.004455826710909605, "skip_count": 1.0, "step": 4154, "text_loss": 0.5869500041007996 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0007127496061281551, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6699307.0, "repeat_count": 0.0, "routers_loss": 0.001998464809730649, "skip_count": 0.0, "step": 4156, "text_loss": 0.6931945085525513 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 19.52127971822718, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0007124694656882713, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 6702647.0, "repeat_count": 3.0, "routers_loss": 0.004117495380342007, "skip_count": 0.0, "step": 4158, "text_loss": 0.4325876832008362 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0205078125, "learning_rate": 0.0007121892438296874, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6705964.0, "repeat_count": 0.0, "routers_loss": 0.0014713290147483349, "skip_count": 0.0, "step": 4160, "text_loss": 0.3672060966491699 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04345703125, "learning_rate": 0.0007119089406597849, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6710182.0, "repeat_count": 0.0, "routers_loss": 0.0037311650812625885, "skip_count": 1.0, "step": 4162, "text_loss": 0.6643805503845215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007116285562859767, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 6713410.0, "repeat_count": 0.0, "routers_loss": 0.006017287727445364, "skip_count": 0.0, "step": 4164, "text_loss": 0.4606415927410126 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 19.55884942764896, "f1_execute": 0.9545454382896423, "f1_repeat": 0.5, "f1_skip": 1.0, "grad_norm": 0.05419921875, "learning_rate": 0.0007113480908157065, "loss": 0.0108, "macro_f1": 0.8181818723678589, "num_tokens": 6716056.0, "repeat_count": 3.0, "routers_loss": 0.08640352636575699, "skip_count": 4.0, "step": 4166, "text_loss": 0.3139408528804779 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0007110675443564491, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6719497.0, "repeat_count": 0.0, "routers_loss": 0.0012731150491163135, "skip_count": 0.0, "step": 4168, "text_loss": 0.7283861637115479 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0007107869170157108, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6722297.0, "repeat_count": 0.0, "routers_loss": 0.0021509863436222076, "skip_count": 2.0, "step": 4170, "text_loss": 0.5767703056335449 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0380859375, "learning_rate": 0.000710506208901028, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6725762.0, "repeat_count": 0.0, "routers_loss": 0.00257494836114347, "skip_count": 1.0, "step": 4172, "text_loss": 0.33571913838386536 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.596419137070736, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.000710225420119969, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 6728436.0, "repeat_count": 1.0, "routers_loss": 0.00943201594054699, "skip_count": 3.0, "step": 4174, "text_loss": 0.6849368810653687 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0007099445507801323, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6731427.0, "repeat_count": 0.0, "routers_loss": 0.01046718005090952, "skip_count": 2.0, "step": 4176, "text_loss": 0.3346157670021057 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0007096636009891477, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6734800.0, "repeat_count": 0.0, "routers_loss": 0.0007813365664333105, "skip_count": 0.0, "step": 4178, "text_loss": 0.49989959597587585 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.000709382570854676, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6738244.0, "repeat_count": 0.0, "routers_loss": 0.002825600327923894, "skip_count": 0.0, "step": 4180, "text_loss": 0.15744923055171967 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007091014604844078, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6741695.0, "repeat_count": 0.0, "routers_loss": 0.0017124463338404894, "skip_count": 0.0, "step": 4182, "text_loss": 0.3752405643463135 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.64338127384796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0007088202699860655, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 6744882.0, "repeat_count": 1.0, "routers_loss": 0.005134924780577421, "skip_count": 3.0, "step": 4184, "text_loss": 0.18534569442272186 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 0.000708538999467402, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6747811.0, "repeat_count": 0.0, "routers_loss": 0.002371585462242365, "skip_count": 1.0, "step": 4186, "text_loss": 0.6251029968261719 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.0007082576490362004, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6750765.0, "repeat_count": 0.0, "routers_loss": 0.002088436856865883, "skip_count": 0.0, "step": 4188, "text_loss": 0.35471436381340027 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0255126953125, "learning_rate": 0.000707976218800275, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6754021.0, "repeat_count": 0.0, "routers_loss": 0.0012272283202037215, "skip_count": 0.0, "step": 4190, "text_loss": 0.5737302899360657 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.0007076947088674701, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6756793.0, "repeat_count": 0.0, "routers_loss": 0.0026050808373838663, "skip_count": 0.0, "step": 4192, "text_loss": 0.526336669921875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.054931640625, "learning_rate": 0.000707413119345661, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 6760221.0, "repeat_count": 0.0, "routers_loss": 0.0013151296880096197, "skip_count": 0.0, "step": 4194, "text_loss": 0.5678895711898804 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0007071314503427532, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6763721.0, "repeat_count": 0.0, "routers_loss": 0.001528652966953814, "skip_count": 0.0, "step": 4196, "text_loss": 0.7640175223350525 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0007068497019666829, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6768581.0, "repeat_count": 0.0, "routers_loss": 0.0019202446565032005, "skip_count": 0.0, "step": 4198, "text_loss": 0.41878414154052734 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.051513671875, "learning_rate": 0.0007065678743254167, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6772758.0, "repeat_count": 0.0, "routers_loss": 0.004667408298701048, "skip_count": 1.0, "step": 4200, "text_loss": 0.3550313413143158 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 19.727913120046964, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0007062859675269513, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6776671.0, "repeat_count": 3.0, "routers_loss": 0.00568761583417654, "skip_count": 0.0, "step": 4202, "text_loss": 0.1707649976015091 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0007060039816793141, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6780284.0, "repeat_count": 0.0, "routers_loss": 0.0030401297844946384, "skip_count": 0.0, "step": 4204, "text_loss": 0.2686377167701721 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 19.74669797475785, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04541015625, "learning_rate": 0.0007057219168905625, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 6783525.0, "repeat_count": 1.0, "routers_loss": 0.003353122156113386, "skip_count": 5.0, "step": 4206, "text_loss": 0.5235374569892883 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.000705439773268784, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6787691.0, "repeat_count": 0.0, "routers_loss": 0.0016532237641513348, "skip_count": 1.0, "step": 4208, "text_loss": 0.5002681612968445 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0007051575509220972, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 6790833.0, "repeat_count": 0.0, "routers_loss": 0.0011808308772742748, "skip_count": 0.0, "step": 4210, "text_loss": 0.7251001596450806 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.774875256824185, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04443359375, "learning_rate": 0.0007048752499586497, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6794260.0, "repeat_count": 0.0, "routers_loss": 0.006246297620236874, "skip_count": 2.0, "step": 4212, "text_loss": 0.2430499643087387 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0419921875, "learning_rate": 0.00070459287048662, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6797413.0, "repeat_count": 0.0, "routers_loss": 0.0012964420020580292, "skip_count": 0.0, "step": 4214, "text_loss": 0.48889362812042236 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0007043104126142163, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6800815.0, "repeat_count": 0.0, "routers_loss": 0.0018109704833477736, "skip_count": 0.0, "step": 4216, "text_loss": 0.5617026686668396 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 19.80305253889052, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 0.0007040278764496771, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 6803937.0, "repeat_count": 2.0, "routers_loss": 0.0028699536342173815, "skip_count": 1.0, "step": 4218, "text_loss": 0.548405647277832 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0007037452621012708, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6806946.0, "repeat_count": 0.0, "routers_loss": 0.0007951617590151727, "skip_count": 0.0, "step": 4220, "text_loss": 0.5702725648880005 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0007034625696772958, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6810083.0, "repeat_count": 0.0, "routers_loss": 0.003436052706092596, "skip_count": 2.0, "step": 4222, "text_loss": 0.3898725211620331 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.00070317979928608, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6812845.0, "repeat_count": 0.0, "routers_loss": 0.0005070401239208877, "skip_count": 0.0, "step": 4224, "text_loss": 0.5244157910346985 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.840622248312297, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.000702896951035982, "loss": 0.0101, "macro_f1": 0.3272727429866791, "num_tokens": 6815801.0, "repeat_count": 0.0, "routers_loss": 0.01560303382575512, "skip_count": 1.0, "step": 4226, "text_loss": 0.26503118872642517 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03564453125, "learning_rate": 0.0007026140250353896, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 6819464.0, "repeat_count": 0.0, "routers_loss": 0.009310240857303143, "skip_count": 2.0, "step": 4228, "text_loss": 0.15597499907016754 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0007023310213927208, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6822657.0, "repeat_count": 0.0, "routers_loss": 0.005309136584401131, "skip_count": 0.0, "step": 4230, "text_loss": 0.5271651148796082 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046875, "learning_rate": 0.0007020479402164226, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 6825661.0, "repeat_count": 0.0, "routers_loss": 0.005936166271567345, "skip_count": 2.0, "step": 4232, "text_loss": 0.6105108857154846 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.878191957734078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0007017647816149727, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6828688.0, "repeat_count": 0.0, "routers_loss": 0.001653556595556438, "skip_count": 0.0, "step": 4234, "text_loss": 0.6966437101364136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.000701481545696878, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 6831850.0, "repeat_count": 0.0, "routers_loss": 0.0013501866487786174, "skip_count": 0.0, "step": 4236, "text_loss": 1.259678840637207 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.059814453125, "learning_rate": 0.0007011982325706747, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6834862.0, "repeat_count": 0.0, "routers_loss": 0.008970130234956741, "skip_count": 1.0, "step": 4238, "text_loss": 0.24906545877456665 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.906369239800412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0007009148423449292, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6838148.0, "repeat_count": 0.0, "routers_loss": 0.0026013399474322796, "skip_count": 0.0, "step": 4240, "text_loss": 0.291467547416687 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.915761667155856, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0007006313751282371, "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 6841142.0, "repeat_count": 0.0, "routers_loss": 0.021415632218122482, "skip_count": 1.0, "step": 4242, "text_loss": 0.507606029510498 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0007003478310292236, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6844042.0, "repeat_count": 0.0, "routers_loss": 0.0023636550176888704, "skip_count": 0.0, "step": 4244, "text_loss": 0.11626995354890823 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.934546521866746, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0007000642101565433, "loss": 0.008, "macro_f1": 0.3272727429866791, "num_tokens": 6847359.0, "repeat_count": 1.0, "routers_loss": 0.025154776871204376, "skip_count": 0.0, "step": 4246, "text_loss": 0.42898693680763245 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038330078125, "learning_rate": 0.0006997805126188803, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6850443.0, "repeat_count": 0.0, "routers_loss": 0.00540317315608263, "skip_count": 0.0, "step": 4248, "text_loss": 0.18085283041000366 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.000699496738524948, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 6853495.0, "repeat_count": 0.0, "routers_loss": 0.0014433214673772454, "skip_count": 0.0, "step": 4250, "text_loss": 0.5524004697799683 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 19.96272380393308, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0006992128879834891, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 6856774.0, "repeat_count": 1.0, "routers_loss": 0.013381492346525192, "skip_count": 3.0, "step": 4252, "text_loss": 0.19605717062950134 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04248046875, "learning_rate": 0.0006989289611032758, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 6860313.0, "repeat_count": 0.0, "routers_loss": 0.007140172645449638, "skip_count": 1.0, "step": 4254, "text_loss": 0.3182447552680969 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 19.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006986449579931091, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6863683.0, "repeat_count": 0.0, "routers_loss": 0.006486213766038418, "skip_count": 1.0, "step": 4256, "text_loss": 0.19250160455703735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 19.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.0006983608787618201, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6867609.0, "repeat_count": 0.0, "routers_loss": 0.001465818495489657, "skip_count": 0.0, "step": 4258, "text_loss": 0.5912898182868958 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.000698076723518268, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6870040.0, "repeat_count": 0.0, "routers_loss": 0.0031106441747397184, "skip_count": 0.0, "step": 4260, "text_loss": 0.13542121648788452 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.0006977924923713418, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6873441.0, "repeat_count": 0.0, "routers_loss": 0.0005377951893024147, "skip_count": 0.0, "step": 4262, "text_loss": 0.352464497089386 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0006975081854299594, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 6876637.0, "repeat_count": 0.0, "routers_loss": 0.007052485831081867, "skip_count": 0.0, "step": 4264, "text_loss": 0.5023844242095947 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.0006972238028030678, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6879928.0, "repeat_count": 0.0, "routers_loss": 0.0013608322478830814, "skip_count": 0.0, "step": 4266, "text_loss": 0.8664718270301819 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0006969393445996429, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6883425.0, "repeat_count": 0.0, "routers_loss": 0.0007607188890688121, "skip_count": 0.0, "step": 4268, "text_loss": 0.5131992101669312 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0006966548109286897, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6886790.0, "repeat_count": 0.0, "routers_loss": 0.00035804163780994713, "skip_count": 0.0, "step": 4270, "text_loss": 0.5352054834365845 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.000696370201899242, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6889747.0, "repeat_count": 0.0, "routers_loss": 0.004451376851648092, "skip_count": 1.0, "step": 4272, "text_loss": 0.47865036129951477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0006960855176203623, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6892604.0, "repeat_count": 0.0, "routers_loss": 0.0015342880506068468, "skip_count": 0.0, "step": 4274, "text_loss": 0.36278650164604187 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.07513941884356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.0006958007582011425, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6895563.0, "repeat_count": 0.0, "routers_loss": 0.0022974940948188305, "skip_count": 2.0, "step": 4276, "text_loss": 0.6695618629455566 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0006955159237507027, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6898591.0, "repeat_count": 0.0, "routers_loss": 0.00859096460044384, "skip_count": 1.0, "step": 4278, "text_loss": 0.44284722208976746 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.093924273554446, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0006952310143781921, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6903119.0, "repeat_count": 1.0, "routers_loss": 0.007919861935079098, "skip_count": 3.0, "step": 4280, "text_loss": 0.5006136298179626 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0006949460301927886, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6906394.0, "repeat_count": 0.0, "routers_loss": 0.0008476210059598088, "skip_count": 0.0, "step": 4282, "text_loss": 0.8153555989265442 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048095703125, "learning_rate": 0.0006946609713036985, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 6909136.0, "repeat_count": 0.0, "routers_loss": 0.006711610127240419, "skip_count": 2.0, "step": 4284, "text_loss": 0.43136683106422424 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0185546875, "learning_rate": 0.0006943758378201571, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 6912734.0, "repeat_count": 0.0, "routers_loss": 0.0038677838165313005, "skip_count": 0.0, "step": 4286, "text_loss": 0.2693749964237213 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.0006940906298514278, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6915838.0, "repeat_count": 0.0, "routers_loss": 0.0012188015971332788, "skip_count": 0.0, "step": 4288, "text_loss": 0.5809219479560852 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0006938053475068031, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6919225.0, "repeat_count": 0.0, "routers_loss": 0.001955829095095396, "skip_count": 0.0, "step": 4290, "text_loss": 0.5116089582443237 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.150278837687114, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.0006935199908956037, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 6922495.0, "repeat_count": 1.0, "routers_loss": 0.0035709093790501356, "skip_count": 0.0, "step": 4292, "text_loss": 0.2745901644229889 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0006932345601271786, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6925317.0, "repeat_count": 0.0, "routers_loss": 0.0005745319649577141, "skip_count": 0.0, "step": 4294, "text_loss": 0.6039219498634338 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 20.169063692398005, "f1_execute": 0.9743589162826538, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.0693359375, "learning_rate": 0.0006929490553109056, "loss": 0.0107, "macro_f1": 0.9247862696647644, "num_tokens": 6928054.0, "repeat_count": 3.0, "routers_loss": 0.061689916998147964, "skip_count": 6.0, "step": 4296, "text_loss": 0.3904837667942047 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0006926634765561907, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 6931348.0, "repeat_count": 0.0, "routers_loss": 0.002007248578593135, "skip_count": 0.0, "step": 4298, "text_loss": 0.5170742273330688 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.187848547108892, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.000692377823972468, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6934411.0, "repeat_count": 0.0, "routers_loss": 0.0005786226247437298, "skip_count": 0.0, "step": 4300, "text_loss": 0.8032443523406982 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.19724097446434, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006920920976692004, "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 6938153.0, "repeat_count": 1.0, "routers_loss": 0.024602646008133888, "skip_count": 0.0, "step": 4302, "text_loss": 0.446534663438797 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.0006918062977558784, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6940731.0, "repeat_count": 0.0, "routers_loss": 0.005759815219789743, "skip_count": 2.0, "step": 4304, "text_loss": 0.15479247272014618 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0006915204243420214, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6943246.0, "repeat_count": 0.0, "routers_loss": 0.005315347574651241, "skip_count": 1.0, "step": 4306, "text_loss": 0.22127842903137207 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.225418256530673, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0006912344775371765, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6947197.0, "repeat_count": 0.0, "routers_loss": 0.0012061651796102524, "skip_count": 0.0, "step": 4308, "text_loss": 0.7058854103088379 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0006909484574509191, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6951817.0, "repeat_count": 0.0, "routers_loss": 0.0029203309677541256, "skip_count": 0.0, "step": 4310, "text_loss": 0.6014000773429871 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0245361328125, "learning_rate": 0.0006906623641928525, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6955094.0, "repeat_count": 0.0, "routers_loss": 0.005703397560864687, "skip_count": 2.0, "step": 4312, "text_loss": 0.5923848152160645 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.253595538597008, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08154296875, "learning_rate": 0.0006903761978726084, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 6958127.0, "repeat_count": 1.0, "routers_loss": 0.004489895887672901, "skip_count": 2.0, "step": 4314, "text_loss": 0.36911651492118835 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.000690089958599846, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 6960871.0, "repeat_count": 0.0, "routers_loss": 0.003871412482112646, "skip_count": 2.0, "step": 4316, "text_loss": 0.442545086145401 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.272380393307895, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.000689803646484253, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6963980.0, "repeat_count": 1.0, "routers_loss": 0.008667866699397564, "skip_count": 2.0, "step": 4318, "text_loss": 0.1987489014863968 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.030517578125, "learning_rate": 0.0006895172616355446, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6967132.0, "repeat_count": 1.0, "routers_loss": 0.00843339879065752, "skip_count": 0.0, "step": 4320, "text_loss": 0.48267918825149536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0006892308041634639, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6969971.0, "repeat_count": 0.0, "routers_loss": 0.0004312851815484464, "skip_count": 0.0, "step": 4322, "text_loss": 0.3662732243537903 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 20.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0006889442741777822, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6973114.0, "repeat_count": 0.0, "routers_loss": 0.004588035400956869, "skip_count": 3.0, "step": 4324, "text_loss": 0.6707104444503784 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.309950102729672, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 0.0006886576717882982, "loss": 0.0057, "macro_f1": 0.8817967176437378, "num_tokens": 6976013.0, "repeat_count": 2.0, "routers_loss": 0.0687296912074089, "skip_count": 3.0, "step": 4326, "text_loss": 0.1662217676639557 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0006883709971048384, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6979200.0, "repeat_count": 0.0, "routers_loss": 0.002950174268335104, "skip_count": 0.0, "step": 4328, "text_loss": 0.21168152987957 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0006880842502372572, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6982640.0, "repeat_count": 0.0, "routers_loss": 0.0032158740796148777, "skip_count": 0.0, "step": 4330, "text_loss": 0.26790961623191833 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.0006877974312954365, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6985917.0, "repeat_count": 0.0, "routers_loss": 0.0005083635332994163, "skip_count": 0.0, "step": 4332, "text_loss": 0.9736502170562744 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.347519812151454, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.000687510540389286, "loss": 0.0053, "macro_f1": 0.32098764181137085, "num_tokens": 6988388.0, "repeat_count": 0.0, "routers_loss": 0.03473830223083496, "skip_count": 2.0, "step": 4334, "text_loss": 0.21662230789661407 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0006872235776287425, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6991360.0, "repeat_count": 0.0, "routers_loss": 0.002206524135544896, "skip_count": 0.0, "step": 4336, "text_loss": 0.6026972532272339 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.36630466686234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.038330078125, "learning_rate": 0.0006869365431237711, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6995080.0, "repeat_count": 1.0, "routers_loss": 0.000969731598161161, "skip_count": 0.0, "step": 4338, "text_loss": 0.5833017230033875 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.375697094217788, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0006866494369843635, "loss": 0.0054, "macro_f1": 0.8820862174034119, "num_tokens": 6998526.0, "repeat_count": 2.0, "routers_loss": 0.013962293043732643, "skip_count": 2.0, "step": 4340, "text_loss": 0.41465985774993896 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 20.38508952157323, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0294189453125, "learning_rate": 0.0006863622593205397, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 7001494.0, "repeat_count": 0.0, "routers_loss": 0.0064964210614562035, "skip_count": 3.0, "step": 4342, "text_loss": 0.3774271011352539 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 20.394481948928675, "f1_execute": 0.9767441749572754, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0006860750102423464, "loss": 0.0062, "macro_f1": 0.6589147448539734, "num_tokens": 7005544.0, "repeat_count": 1.0, "routers_loss": 0.023250726982951164, "skip_count": 6.0, "step": 4344, "text_loss": 0.2732464373111725 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.403874376284122, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 0.0006857876898598582, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 7008847.0, "repeat_count": 0.0, "routers_loss": 0.0038170060142874718, "skip_count": 2.0, "step": 4346, "text_loss": 0.29610875248908997 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0006855002982831769, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 7012577.0, "repeat_count": 0.0, "routers_loss": 0.0012856025714427233, "skip_count": 0.0, "step": 4348, "text_loss": 0.6098502278327942 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061767578125, "learning_rate": 0.0006852128356224314, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7015650.0, "repeat_count": 0.0, "routers_loss": 0.008162742480635643, "skip_count": 1.0, "step": 4350, "text_loss": 0.20868146419525146 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.432051658350456, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.023193359375, "learning_rate": 0.0006849253019877778, "loss": 0.0074, "macro_f1": 0.8817967176437378, "num_tokens": 7019925.0, "repeat_count": 2.0, "routers_loss": 0.023544032126665115, "skip_count": 3.0, "step": 4352, "text_loss": 0.628226101398468 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0006846376974893996, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 7023130.0, "repeat_count": 0.0, "routers_loss": 0.004982319660484791, "skip_count": 2.0, "step": 4354, "text_loss": 0.7037544250488281 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.450836513061343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0006843500222375074, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7026422.0, "repeat_count": 1.0, "routers_loss": 0.004015266429632902, "skip_count": 0.0, "step": 4356, "text_loss": 0.22352729737758636 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 20.46022894041679, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.042724609375, "learning_rate": 0.0006840622763423391, "loss": 0.0071, "macro_f1": 0.9449735879898071, "num_tokens": 7029077.0, "repeat_count": 2.0, "routers_loss": 0.021162014454603195, "skip_count": 4.0, "step": 4358, "text_loss": 0.2431403249502182 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0006837744599141591, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7032582.0, "repeat_count": 0.0, "routers_loss": 0.0007044129306450486, "skip_count": 0.0, "step": 4360, "text_loss": 0.26667487621307373 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0006834865730632594, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7035642.0, "repeat_count": 0.0, "routers_loss": 0.0067853196524083614, "skip_count": 1.0, "step": 4362, "text_loss": 0.20965275168418884 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.488406222483125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0006831986158999588, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7038601.0, "repeat_count": 0.0, "routers_loss": 0.00899333506822586, "skip_count": 2.0, "step": 4364, "text_loss": 0.26860126852989197 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.000682910588534603, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7042274.0, "repeat_count": 0.0, "routers_loss": 0.0019194348715245724, "skip_count": 0.0, "step": 4366, "text_loss": 0.14046810567378998 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.507191077194012, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0006826224910775647, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 7045268.0, "repeat_count": 1.0, "routers_loss": 0.006915684789419174, "skip_count": 3.0, "step": 4368, "text_loss": 0.5900366306304932 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.516583504549455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0006823343236392432, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 7049407.0, "repeat_count": 0.0, "routers_loss": 0.001678116386756301, "skip_count": 0.0, "step": 4370, "text_loss": 0.7868026494979858 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.000682046086330065, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 7052783.0, "repeat_count": 0.0, "routers_loss": 0.0003459530707914382, "skip_count": 0.0, "step": 4372, "text_loss": 0.6349637508392334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0006817577792604831, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7055757.0, "repeat_count": 0.0, "routers_loss": 0.0011729507241398096, "skip_count": 0.0, "step": 4374, "text_loss": 0.43258991837501526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0006814694025409773, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 7058684.0, "repeat_count": 0.0, "routers_loss": 0.0006664610700681806, "skip_count": 0.0, "step": 4376, "text_loss": 0.5307940244674683 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.091796875, "learning_rate": 0.0006811809562820542, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 7061902.0, "repeat_count": 0.0, "routers_loss": 0.004595907870680094, "skip_count": 2.0, "step": 4378, "text_loss": 0.5830042362213135 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0006808924405942467, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7065100.0, "repeat_count": 0.0, "routers_loss": 0.0032026609405875206, "skip_count": 0.0, "step": 4380, "text_loss": 0.20797798037528992 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.572938068682124, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0184326171875, "learning_rate": 0.0006806038555881148, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 7068556.0, "repeat_count": 1.0, "routers_loss": 0.0024626904632896185, "skip_count": 0.0, "step": 4382, "text_loss": 0.5791074633598328 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.58233049603757, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.040283203125, "learning_rate": 0.0006803152013742448, "loss": 0.0075, "macro_f1": 1.0, "num_tokens": 7071284.0, "repeat_count": 1.0, "routers_loss": 0.010723610408604145, "skip_count": 2.0, "step": 4384, "text_loss": 0.13227243721485138 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.591722923393014, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.0006800264780632495, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7074428.0, "repeat_count": 1.0, "routers_loss": 0.0011231007520109415, "skip_count": 0.0, "step": 4386, "text_loss": 0.4360627233982086 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 20.601115350748458, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0006797376857657681, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 7078313.0, "repeat_count": 2.0, "routers_loss": 0.008419238030910492, "skip_count": 1.0, "step": 4388, "text_loss": 0.5183924436569214 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.610507778103905, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.0006794488245924664, "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 7081258.0, "repeat_count": 1.0, "routers_loss": 0.006582668516784906, "skip_count": 3.0, "step": 4390, "text_loss": 0.2797473669052124 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.61990020545935, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046630859375, "learning_rate": 0.0006791598946540368, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 7084527.0, "repeat_count": 0.0, "routers_loss": 0.00557357631623745, "skip_count": 2.0, "step": 4392, "text_loss": 0.39495575428009033 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.629292632814792, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06005859375, "learning_rate": 0.0006788708960611975, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 7087675.0, "repeat_count": 0.0, "routers_loss": 0.007155992556363344, "skip_count": 0.0, "step": 4394, "text_loss": 0.3785299062728882 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01806640625, "learning_rate": 0.0006785818289246934, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7090171.0, "repeat_count": 0.0, "routers_loss": 0.0009265039698220789, "skip_count": 0.0, "step": 4396, "text_loss": 0.42634522914886475 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 20.648077487525683, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.0006782926933552955, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 7092529.0, "repeat_count": 1.0, "routers_loss": 0.008679097518324852, "skip_count": 7.0, "step": 4398, "text_loss": 0.4283660054206848 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0006780034894638014, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 7095141.0, "repeat_count": 0.0, "routers_loss": 0.002363949315622449, "skip_count": 0.0, "step": 4400, "text_loss": 0.481539249420166 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 20.666862342236573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.040283203125, "learning_rate": 0.000677714217361034, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7098208.0, "repeat_count": 0.0, "routers_loss": 0.004005146212875843, "skip_count": 3.0, "step": 4402, "text_loss": 0.6443291902542114 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0006774248771578435, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7101681.0, "repeat_count": 0.0, "routers_loss": 0.0026864963583648205, "skip_count": 0.0, "step": 4404, "text_loss": 0.16315312683582306 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 20.68564719694746, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0322265625, "learning_rate": 0.0006771354689651054, "loss": 0.005, "macro_f1": 0.9449735879898071, "num_tokens": 7104719.0, "repeat_count": 2.0, "routers_loss": 0.02719845622777939, "skip_count": 4.0, "step": 4406, "text_loss": 0.37855592370033264 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.0006768459928937213, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7108697.0, "repeat_count": 0.0, "routers_loss": 0.010488593950867653, "skip_count": 0.0, "step": 4408, "text_loss": 0.23133711516857147 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 20.70443205165835, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.0006765564490546193, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7111426.0, "repeat_count": 1.0, "routers_loss": 0.0013637891970574856, "skip_count": 0.0, "step": 4410, "text_loss": 0.41399383544921875 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0006762668375587528, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 7114241.0, "repeat_count": 0.0, "routers_loss": 0.000900395680218935, "skip_count": 0.0, "step": 4412, "text_loss": 0.6460412740707397 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0006759771585171016, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7117031.0, "repeat_count": 0.0, "routers_loss": 0.0024001260753721, "skip_count": 0.0, "step": 4414, "text_loss": 0.7645824551582336 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.732609333724685, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0006756874120406714, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 7120766.0, "repeat_count": 3.0, "routers_loss": 0.005034091416746378, "skip_count": 4.0, "step": 4416, "text_loss": 0.31753066182136536 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0006753975982404934, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7125243.0, "repeat_count": 0.0, "routers_loss": 0.002483269665390253, "skip_count": 0.0, "step": 4418, "text_loss": 0.5304268002510071 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.751394188435572, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0006751077172276249, "loss": 0.0052, "macro_f1": 0.3272727429866791, "num_tokens": 7127795.0, "repeat_count": 0.0, "routers_loss": 0.02676006779074669, "skip_count": 1.0, "step": 4420, "text_loss": 0.22011354565620422 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06201171875, "learning_rate": 0.000674817769113149, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7130837.0, "repeat_count": 0.0, "routers_loss": 0.003267093561589718, "skip_count": 2.0, "step": 4422, "text_loss": 0.2906076908111572 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 20.770179043146463, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.027099609375, "learning_rate": 0.000674527754008174, "loss": 0.0045, "macro_f1": 0.5934640765190125, "num_tokens": 7135090.0, "repeat_count": 0.0, "routers_loss": 0.022510390728712082, "skip_count": 3.0, "step": 4424, "text_loss": 0.2544902563095093 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0006742376720238345, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 7138751.0, "repeat_count": 0.0, "routers_loss": 0.0011178571730852127, "skip_count": 0.0, "step": 4426, "text_loss": 0.6811438798904419 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 20.788963897857354, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0006739475232712904, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 7141762.0, "repeat_count": 2.0, "routers_loss": 0.005595206283032894, "skip_count": 1.0, "step": 4428, "text_loss": 0.38743990659713745 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0006736573078617272, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7145235.0, "repeat_count": 0.0, "routers_loss": 0.002793942578136921, "skip_count": 2.0, "step": 4430, "text_loss": 0.21894219517707825 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 20.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.0006733670259063561, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 7149042.0, "repeat_count": 0.0, "routers_loss": 0.006146818865090609, "skip_count": 3.0, "step": 4432, "text_loss": 0.17822015285491943 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 20.817141179923688, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.042236328125, "learning_rate": 0.0006730766775164136, "loss": 0.0061, "macro_f1": 0.5492662787437439, "num_tokens": 7152166.0, "repeat_count": 0.0, "routers_loss": 0.026045087724924088, "skip_count": 2.0, "step": 4434, "text_loss": 0.2910420000553131 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 20.82653360727913, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03466796875, "learning_rate": 0.0006727862628031618, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 7155506.0, "repeat_count": 2.0, "routers_loss": 0.0022973387967795134, "skip_count": 0.0, "step": 4436, "text_loss": 0.3502544164657593 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022705078125, "learning_rate": 0.0006724957818778882, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7158739.0, "repeat_count": 0.0, "routers_loss": 0.002357073128223419, "skip_count": 1.0, "step": 4438, "text_loss": 0.26200664043426514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0006722052348519054, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 7161776.0, "repeat_count": 0.0, "routers_loss": 0.0005521026905626059, "skip_count": 0.0, "step": 4440, "text_loss": 0.3922915458679199 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 20.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044189453125, "learning_rate": 0.000671914621836552, "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 7164763.0, "repeat_count": 0.0, "routers_loss": 0.007691344246268272, "skip_count": 2.0, "step": 4442, "text_loss": 0.6021351218223572 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.000671623942943191, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7167924.0, "repeat_count": 0.0, "routers_loss": 0.0032181134447455406, "skip_count": 0.0, "step": 4444, "text_loss": 0.23639555275440216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.873495744056356, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.030029296875, "learning_rate": 0.0006713331982832113, "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 7170743.0, "repeat_count": 1.0, "routers_loss": 0.024979131296277046, "skip_count": 0.0, "step": 4446, "text_loss": 0.4957772493362427 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0006710423879680271, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7174660.0, "repeat_count": 0.0, "routers_loss": 0.002571308286860585, "skip_count": 0.0, "step": 4448, "text_loss": 0.47968071699142456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.000670751512109077, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7177965.0, "repeat_count": 0.0, "routers_loss": 0.00212799571454525, "skip_count": 0.0, "step": 4450, "text_loss": 0.6550716161727905 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041748046875, "learning_rate": 0.0006704605708178252, "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 7181512.0, "repeat_count": 0.0, "routers_loss": 0.004176430404186249, "skip_count": 1.0, "step": 4452, "text_loss": 0.36959558725357056 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0006701695642057613, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7184555.0, "repeat_count": 0.0, "routers_loss": 0.0010968588758260012, "skip_count": 0.0, "step": 4454, "text_loss": 0.6686749458312988 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0006698784923843993, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7187474.0, "repeat_count": 0.0, "routers_loss": 0.0014241471653804183, "skip_count": 0.0, "step": 4456, "text_loss": 0.6147221922874451 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0006695873554652784, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7190649.0, "repeat_count": 0.0, "routers_loss": 0.008801907300949097, "skip_count": 0.0, "step": 4458, "text_loss": 0.26381927728652954 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04638671875, "learning_rate": 0.0006692961535599634, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 7193961.0, "repeat_count": 0.0, "routers_loss": 0.009027508087456226, "skip_count": 1.0, "step": 4460, "text_loss": 0.1926470547914505 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0006690048867800427, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7197456.0, "repeat_count": 0.0, "routers_loss": 0.0022697453387081623, "skip_count": 0.0, "step": 4462, "text_loss": 0.6736721992492676 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0006687135552371305, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7200290.0, "repeat_count": 0.0, "routers_loss": 0.006747903767973185, "skip_count": 1.0, "step": 4464, "text_loss": 0.2026437371969223 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.967420017610802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0006684221590428657, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 7203320.0, "repeat_count": 0.0, "routers_loss": 0.0011565096210688353, "skip_count": 0.0, "step": 4466, "text_loss": 0.7587730288505554 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 20.976812444966246, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.058349609375, "learning_rate": 0.0006681306983089121, "loss": 0.0083, "macro_f1": 0.8820862174034119, "num_tokens": 7206411.0, "repeat_count": 2.0, "routers_loss": 0.023645581677556038, "skip_count": 2.0, "step": 4468, "text_loss": 0.8981561660766602 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 20.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0006678391731469575, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 7209421.0, "repeat_count": 0.0, "routers_loss": 0.0035848666448146105, "skip_count": 0.0, "step": 4470, "text_loss": 0.1522839516401291 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 20.995597299677137, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0006675475836687152, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 7212267.0, "repeat_count": 1.0, "routers_loss": 0.005046425387263298, "skip_count": 1.0, "step": 4472, "text_loss": 0.46007999777793884 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0006672559299859228, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7215195.0, "repeat_count": 0.0, "routers_loss": 0.0019333874806761742, "skip_count": 0.0, "step": 4474, "text_loss": 1.0859547853469849 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0006669642122103423, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7217941.0, "repeat_count": 0.0, "routers_loss": 0.0005401032394729555, "skip_count": 0.0, "step": 4476, "text_loss": 0.9754356145858765 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.023481068388612, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.0006666724304537611, "loss": 0.0053, "macro_f1": 0.3272727429866791, "num_tokens": 7222494.0, "repeat_count": 1.0, "routers_loss": 0.015569722279906273, "skip_count": 0.0, "step": 4478, "text_loss": 0.2896423637866974 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0006663805848279898, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7225292.0, "repeat_count": 0.0, "routers_loss": 0.0020135147497057915, "skip_count": 0.0, "step": 4480, "text_loss": 0.8492724299430847 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 21.0422659230995, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0006660886754448648, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 7229184.0, "repeat_count": 1.0, "routers_loss": 0.002355351345613599, "skip_count": 0.0, "step": 4482, "text_loss": 0.189764603972435 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02099609375, "learning_rate": 0.0006657967024162459, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7232906.0, "repeat_count": 0.0, "routers_loss": 0.003044391982257366, "skip_count": 0.0, "step": 4484, "text_loss": 0.4239847660064697 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0006655046658540179, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 7235996.0, "repeat_count": 0.0, "routers_loss": 0.00602696230635047, "skip_count": 2.0, "step": 4486, "text_loss": 0.217103973031044 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.070443205165834, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0169677734375, "learning_rate": 0.0006652125658700896, "loss": 0.0031, "macro_f1": 0.6666666865348816, "num_tokens": 7238882.0, "repeat_count": 0.0, "routers_loss": 0.001470155781134963, "skip_count": 1.0, "step": 4488, "text_loss": 0.6090770363807678 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.07983563252128, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0006649204025763945, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 7241815.0, "repeat_count": 1.0, "routers_loss": 0.008737480267882347, "skip_count": 2.0, "step": 4490, "text_loss": 0.48314425349235535 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0177001953125, "learning_rate": 0.0006646281760848902, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 7244848.0, "repeat_count": 0.0, "routers_loss": 0.0008257135050371289, "skip_count": 0.0, "step": 4492, "text_loss": 0.5884748101234436 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0006643358865075581, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7247930.0, "repeat_count": 0.0, "routers_loss": 0.0016262239078059793, "skip_count": 0.0, "step": 4494, "text_loss": 0.21444730460643768 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0006640435339564042, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7251776.0, "repeat_count": 0.0, "routers_loss": 0.001315156347118318, "skip_count": 0.0, "step": 4496, "text_loss": 0.6890370845794678 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.11740534194306, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0006637511185434588, "loss": 0.0091, "macro_f1": 1.0, "num_tokens": 7255070.0, "repeat_count": 1.0, "routers_loss": 0.007614497095346451, "skip_count": 3.0, "step": 4498, "text_loss": 0.516417920589447 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 21.126797769298502, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0006634586403807758, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 7258115.0, "repeat_count": 3.0, "routers_loss": 0.004906686954200268, "skip_count": 2.0, "step": 4500, "text_loss": 0.577463686466217 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.13619019665395, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0927734375, "learning_rate": 0.0006631660995804334, "loss": 0.0067, "macro_f1": 0.6601307392120361, "num_tokens": 7260769.0, "repeat_count": 1.0, "routers_loss": 0.013337121345102787, "skip_count": 2.0, "step": 4502, "text_loss": 0.37124839425086975 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.145582624009393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0006628734962545339, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 7263908.0, "repeat_count": 0.0, "routers_loss": 0.0023418180644512177, "skip_count": 0.0, "step": 4504, "text_loss": 0.17937727272510529 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0006625808305152033, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 7267391.0, "repeat_count": 0.0, "routers_loss": 0.0006556165171787143, "skip_count": 0.0, "step": 4506, "text_loss": 0.45344987511634827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0006622881024745919, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 7271402.0, "repeat_count": 0.0, "routers_loss": 0.0021988123189657927, "skip_count": 0.0, "step": 4508, "text_loss": 0.5842905640602112 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029052734375, "learning_rate": 0.0006619953122448734, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 7274354.0, "repeat_count": 0.0, "routers_loss": 0.00774174090474844, "skip_count": 2.0, "step": 4510, "text_loss": 0.27159228920936584 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0006617024599382456, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 7277378.0, "repeat_count": 0.0, "routers_loss": 0.0006942499312572181, "skip_count": 0.0, "step": 4512, "text_loss": 0.4464176297187805 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0006614095456669302, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7280526.0, "repeat_count": 0.0, "routers_loss": 0.003003394464030862, "skip_count": 0.0, "step": 4514, "text_loss": 0.31188079714775085 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0006611165695431725, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7283916.0, "repeat_count": 0.0, "routers_loss": 0.0006948060472495854, "skip_count": 0.0, "step": 4516, "text_loss": 0.5266574025154114 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0006608235316792413, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7286843.0, "repeat_count": 0.0, "routers_loss": 0.0014080886030569673, "skip_count": 0.0, "step": 4518, "text_loss": 0.5880120396614075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0006605304321874295, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 7289940.0, "repeat_count": 0.0, "routers_loss": 0.0016894340515136719, "skip_count": 0.0, "step": 4520, "text_loss": 0.6623797416687012 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0006602372711800531, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7292869.0, "repeat_count": 0.0, "routers_loss": 0.003522444050759077, "skip_count": 0.0, "step": 4522, "text_loss": 0.5488807559013367 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0006599440487694521, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 7296618.0, "repeat_count": 0.0, "routers_loss": 0.0011981099378317595, "skip_count": 0.0, "step": 4524, "text_loss": 0.4128517210483551 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.248899324919282, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.00065965076506799, "loss": 0.0047, "macro_f1": 0.9262410998344421, "num_tokens": 7300481.0, "repeat_count": 3.0, "routers_loss": 0.010548194870352745, "skip_count": 2.0, "step": 4526, "text_loss": 0.26450902223587036 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.0006593574201880536, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7303272.0, "repeat_count": 0.0, "routers_loss": 0.005642973352223635, "skip_count": 1.0, "step": 4528, "text_loss": 0.35269856452941895 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.000659064014242053, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 7306615.0, "repeat_count": 0.0, "routers_loss": 0.004171932581812143, "skip_count": 1.0, "step": 4530, "text_loss": 0.18814080953598022 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0006587705473424223, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 7310368.0, "repeat_count": 0.0, "routers_loss": 0.002289367141202092, "skip_count": 2.0, "step": 4532, "text_loss": 0.7363705635070801 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.286469034341064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.000658477019601618, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 7313788.0, "repeat_count": 0.0, "routers_loss": 0.004440625663846731, "skip_count": 1.0, "step": 4534, "text_loss": 0.8126176595687866 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006581834311321211, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 7317864.0, "repeat_count": 0.0, "routers_loss": 0.0013160990783944726, "skip_count": 2.0, "step": 4536, "text_loss": 0.7015916109085083 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04736328125, "learning_rate": 0.000657889782046435, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7320693.0, "repeat_count": 0.0, "routers_loss": 0.0032275544945150614, "skip_count": 2.0, "step": 4538, "text_loss": 0.6481677293777466 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.314646316407398, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0006575960724570865, "loss": 0.0054, "macro_f1": 0.3272727429866791, "num_tokens": 7324335.0, "repeat_count": 0.0, "routers_loss": 0.009769129566848278, "skip_count": 1.0, "step": 4540, "text_loss": 0.22194676101207733 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.32403874376284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.042724609375, "learning_rate": 0.0006573023024766258, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 7327431.0, "repeat_count": 2.0, "routers_loss": 0.0036973082460463047, "skip_count": 4.0, "step": 4542, "text_loss": 0.475127637386322 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.000657008472217626, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7330262.0, "repeat_count": 0.0, "routers_loss": 0.0007046440150588751, "skip_count": 0.0, "step": 4544, "text_loss": 0.2649917006492615 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04443359375, "learning_rate": 0.0006567145817926836, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7333110.0, "repeat_count": 0.0, "routers_loss": 0.0026714997366070747, "skip_count": 0.0, "step": 4546, "text_loss": 0.5490524768829346 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0006564206313144175, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 7336101.0, "repeat_count": 0.0, "routers_loss": 0.006552211008965969, "skip_count": 0.0, "step": 4548, "text_loss": 0.14098678529262543 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.0006561266208954707, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 7339435.0, "repeat_count": 0.0, "routers_loss": 0.0035560601390898228, "skip_count": 2.0, "step": 4550, "text_loss": 0.20412275195121765 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0006558325506485081, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7342609.0, "repeat_count": 0.0, "routers_loss": 0.0020106974989175797, "skip_count": 1.0, "step": 4552, "text_loss": 0.6184256076812744 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0006555384206862183, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 7345614.0, "repeat_count": 0.0, "routers_loss": 0.0014235252747312188, "skip_count": 0.0, "step": 4554, "text_loss": 1.0108838081359863 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.389785735250953, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0006552442311213121, "loss": 0.0041, "macro_f1": 0.3272727429866791, "num_tokens": 7348957.0, "repeat_count": 1.0, "routers_loss": 0.01703745685517788, "skip_count": 0.0, "step": 4556, "text_loss": 0.21315747499465942 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 21.399178162606397, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0269775390625, "learning_rate": 0.0006549499820665237, "loss": 0.0077, "macro_f1": 0.5934640765190125, "num_tokens": 7352724.0, "repeat_count": 0.0, "routers_loss": 0.013315381482243538, "skip_count": 3.0, "step": 4558, "text_loss": 0.34369465708732605 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 0.00065465567363461, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7356592.0, "repeat_count": 0.0, "routers_loss": 0.0017354936571791768, "skip_count": 0.0, "step": 4560, "text_loss": 0.6267461180686951 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0006543613059383503, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 7359774.0, "repeat_count": 0.0, "routers_loss": 0.011646085418760777, "skip_count": 2.0, "step": 4562, "text_loss": 0.4400193989276886 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006540668790905471, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7362765.0, "repeat_count": 0.0, "routers_loss": 0.0019345436012372375, "skip_count": 0.0, "step": 4564, "text_loss": 0.49204275012016296 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0006537723932040251, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7366337.0, "repeat_count": 0.0, "routers_loss": 0.00562885170802474, "skip_count": 1.0, "step": 4566, "text_loss": 0.22566382586956024 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.44614029938362, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0006534778483916319, "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 7369851.0, "repeat_count": 2.0, "routers_loss": 0.005508176051080227, "skip_count": 2.0, "step": 4568, "text_loss": 0.8057850003242493 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006531832447662377, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 7373918.0, "repeat_count": 0.0, "routers_loss": 0.006460923235863447, "skip_count": 2.0, "step": 4570, "text_loss": 0.5141497254371643 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0006528885824407351, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 7376674.0, "repeat_count": 0.0, "routers_loss": 0.0032120654359459877, "skip_count": 0.0, "step": 4572, "text_loss": 0.1281338930130005 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052490234375, "learning_rate": 0.0006525938615280394, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 7379791.0, "repeat_count": 0.0, "routers_loss": 0.00443810923025012, "skip_count": 0.0, "step": 4574, "text_loss": 0.268352210521698 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.000652299082141088, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 7382886.0, "repeat_count": 0.0, "routers_loss": 0.008284369483590126, "skip_count": 2.0, "step": 4576, "text_loss": 0.30193832516670227 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.493102436160846, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006520042443928411, "loss": 0.0068, "macro_f1": 0.8823530077934265, "num_tokens": 7386036.0, "repeat_count": 2.0, "routers_loss": 0.03383317217230797, "skip_count": 1.0, "step": 4578, "text_loss": 0.23106542229652405 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0419921875, "learning_rate": 0.000651709348396281, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7388908.0, "repeat_count": 0.0, "routers_loss": 0.0017075951909646392, "skip_count": 1.0, "step": 4580, "text_loss": 0.386099249124527 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0006514143942644124, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7392004.0, "repeat_count": 0.0, "routers_loss": 0.009516917169094086, "skip_count": 1.0, "step": 4582, "text_loss": 0.3162059485912323 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.051513671875, "learning_rate": 0.0006511193821102623, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 7395538.0, "repeat_count": 0.0, "routers_loss": 0.0031392278615385294, "skip_count": 0.0, "step": 4584, "text_loss": 0.5536221861839294 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0006508243120468799, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7398461.0, "repeat_count": 0.0, "routers_loss": 0.0014138511614874005, "skip_count": 0.0, "step": 4586, "text_loss": 0.7934318780899048 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0006505291841873367, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7401611.0, "repeat_count": 0.0, "routers_loss": 0.0005265916115604341, "skip_count": 0.0, "step": 4588, "text_loss": 0.4569905698299408 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.000650233998644726, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7404641.0, "repeat_count": 0.0, "routers_loss": 0.0024988956283777952, "skip_count": 0.0, "step": 4590, "text_loss": 0.49998772144317627 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0006499387555321636, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7407574.0, "repeat_count": 0.0, "routers_loss": 0.004110113717615604, "skip_count": 1.0, "step": 4592, "text_loss": 0.5679413676261902 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006496434549627874, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7410806.0, "repeat_count": 0.0, "routers_loss": 0.0032845588866621256, "skip_count": 0.0, "step": 4594, "text_loss": 0.35515281558036804 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0006493480970497568, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7413402.0, "repeat_count": 0.0, "routers_loss": 0.010577172972261906, "skip_count": 1.0, "step": 4596, "text_loss": 0.26111698150634766 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.587026709715293, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0006490526819062537, "loss": 0.0091, "macro_f1": 1.0, "num_tokens": 7417236.0, "repeat_count": 1.0, "routers_loss": 0.002054794691503048, "skip_count": 2.0, "step": 4598, "text_loss": 0.6480993628501892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0006487572096454818, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7420278.0, "repeat_count": 0.0, "routers_loss": 0.0017989084590226412, "skip_count": 0.0, "step": 4600, "text_loss": 0.4935401678085327 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0006484616803806665, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7423866.0, "repeat_count": 0.0, "routers_loss": 0.006671485956758261, "skip_count": 1.0, "step": 4602, "text_loss": 0.15030258893966675 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 21.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0006481660942250552, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7426884.0, "repeat_count": 0.0, "routers_loss": 0.008334980346262455, "skip_count": 3.0, "step": 4604, "text_loss": 0.29933279752731323 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 21.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0006478704512919173, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 7431017.0, "repeat_count": 0.0, "routers_loss": 0.011923984624445438, "skip_count": 3.0, "step": 4606, "text_loss": 0.35141825675964355 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 21.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0006475747516945432, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7434406.0, "repeat_count": 0.0, "routers_loss": 0.0031092462595552206, "skip_count": 3.0, "step": 4608, "text_loss": 0.21021464467048645 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 21.64338127384796, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02978515625, "learning_rate": 0.000647278995546246, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7437204.0, "repeat_count": 1.0, "routers_loss": 0.0006713552866131067, "skip_count": 0.0, "step": 4610, "text_loss": 0.4052635431289673 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0006469831829603598, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7439741.0, "repeat_count": 0.0, "routers_loss": 0.0022583482787013054, "skip_count": 2.0, "step": 4612, "text_loss": 0.5443860292434692 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044677734375, "learning_rate": 0.0006466873140502407, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 7443619.0, "repeat_count": 0.0, "routers_loss": 0.004187075886875391, "skip_count": 2.0, "step": 4614, "text_loss": 0.30709847807884216 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0006463913889292661, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 7446696.0, "repeat_count": 0.0, "routers_loss": 0.008314833045005798, "skip_count": 0.0, "step": 4616, "text_loss": 0.22949637472629547 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0006460954077108353, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7450377.0, "repeat_count": 0.0, "routers_loss": 0.001277514616958797, "skip_count": 0.0, "step": 4618, "text_loss": 0.37715134024620056 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0006457993705083684, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 7453271.0, "repeat_count": 0.0, "routers_loss": 0.0022756033577024937, "skip_count": 2.0, "step": 4620, "text_loss": 0.7373883128166199 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02099609375, "learning_rate": 0.0006455032774353078, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 7456492.0, "repeat_count": 0.0, "routers_loss": 0.0039057908579707146, "skip_count": 2.0, "step": 4622, "text_loss": 0.5058769583702087 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0203857421875, "learning_rate": 0.0006452071286051169, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 7459619.0, "repeat_count": 0.0, "routers_loss": 0.0019458672031760216, "skip_count": 0.0, "step": 4624, "text_loss": 0.5110082030296326 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0006449109241312802, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 7462552.0, "repeat_count": 0.0, "routers_loss": 0.0002716891176532954, "skip_count": 1.0, "step": 4626, "text_loss": 0.6197522878646851 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.045654296875, "learning_rate": 0.0006446146641273042, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7466769.0, "repeat_count": 0.0, "routers_loss": 0.0037578947376459837, "skip_count": 2.0, "step": 4628, "text_loss": 0.1653924286365509 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.000644318348706716, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7470216.0, "repeat_count": 0.0, "routers_loss": 0.0012791058979928493, "skip_count": 0.0, "step": 4630, "text_loss": 0.7114694118499756 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.74669797475785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.0006440219779830643, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 7472975.0, "repeat_count": 0.0, "routers_loss": 0.00736592011526227, "skip_count": 2.0, "step": 4632, "text_loss": 0.26601463556289673 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.000643725552069919, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7475672.0, "repeat_count": 0.0, "routers_loss": 0.00045455715735442936, "skip_count": 0.0, "step": 4634, "text_loss": 0.5028402805328369 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022705078125, "learning_rate": 0.0006434290710808711, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7478850.0, "repeat_count": 0.0, "routers_loss": 0.004247233271598816, "skip_count": 2.0, "step": 4636, "text_loss": 0.12746070325374603 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 21.774875256824185, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.04052734375, "learning_rate": 0.0006431325351295324, "loss": 0.0083, "macro_f1": 0.5427350401878357, "num_tokens": 7481747.0, "repeat_count": 1.0, "routers_loss": 0.047564394772052765, "skip_count": 2.0, "step": 4638, "text_loss": 0.24056802690029144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0006428359443295362, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7484885.0, "repeat_count": 0.0, "routers_loss": 0.0011175100225955248, "skip_count": 0.0, "step": 4640, "text_loss": 0.6265338063240051 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 21.793660111535075, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.035400390625, "learning_rate": 0.0006425392987945369, "loss": 0.0086, "macro_f1": 0.5492662787437439, "num_tokens": 7487973.0, "repeat_count": 0.0, "routers_loss": 0.016879938542842865, "skip_count": 2.0, "step": 4642, "text_loss": 0.2523447275161743 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 21.80305253889052, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.032958984375, "learning_rate": 0.0006422425986382093, "loss": 0.0055, "macro_f1": 0.5934640765190125, "num_tokens": 7491024.0, "repeat_count": 0.0, "routers_loss": 0.018616504967212677, "skip_count": 3.0, "step": 4644, "text_loss": 0.38890624046325684 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.812444966245963, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0006419458439742496, "loss": 0.0056, "macro_f1": 0.3272727429866791, "num_tokens": 7494199.0, "repeat_count": 0.0, "routers_loss": 0.023129139095544815, "skip_count": 1.0, "step": 4646, "text_loss": 0.4060848355293274 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0006416490349163747, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 7497287.0, "repeat_count": 0.0, "routers_loss": 0.0018601802876219153, "skip_count": 0.0, "step": 4648, "text_loss": 0.3387545943260193 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0006413521715783225, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 7500598.0, "repeat_count": 0.0, "routers_loss": 0.0017482215771451592, "skip_count": 0.0, "step": 4650, "text_loss": 0.4290996193885803 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.840622248312297, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0006410552540738514, "loss": 0.007, "macro_f1": 0.3272727429866791, "num_tokens": 7503252.0, "repeat_count": 1.0, "routers_loss": 0.0420118011534214, "skip_count": 0.0, "step": 4652, "text_loss": 0.439496248960495 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.850014675667744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 0.000640758282516741, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 7506382.0, "repeat_count": 1.0, "routers_loss": 0.0017782216891646385, "skip_count": 1.0, "step": 4654, "text_loss": 0.8513308167457581 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 21.859407103023187, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0439453125, "learning_rate": 0.0006404612570207911, "loss": 0.0102, "macro_f1": 0.3272727429866791, "num_tokens": 7510423.0, "repeat_count": 0.0, "routers_loss": 0.010385853238403797, "skip_count": 0.0, "step": 4656, "text_loss": 0.7159742712974548 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0006401641776998223, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 7513394.0, "repeat_count": 0.0, "routers_loss": 0.0011917101219296455, "skip_count": 0.0, "step": 4658, "text_loss": 0.6165401339530945 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.878191957734078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006398670446676766, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 7516828.0, "repeat_count": 3.0, "routers_loss": 0.008860073052346706, "skip_count": 4.0, "step": 4660, "text_loss": 0.923275887966156 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0006395698580382153, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7519764.0, "repeat_count": 0.0, "routers_loss": 0.000505418807733804, "skip_count": 0.0, "step": 4662, "text_loss": 0.6143050789833069 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 0.0006392726179253212, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7522390.0, "repeat_count": 0.0, "routers_loss": 0.004020806401968002, "skip_count": 1.0, "step": 4664, "text_loss": 0.6935067176818848 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.906369239800412, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.052001953125, "learning_rate": 0.0006389753244428972, "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 7525821.0, "repeat_count": 1.0, "routers_loss": 0.00957963801920414, "skip_count": 2.0, "step": 4666, "text_loss": 0.3350338637828827 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.915761667155856, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.0006386779777048666, "loss": 0.0063, "macro_f1": 0.6601307392120361, "num_tokens": 7529513.0, "repeat_count": 1.0, "routers_loss": 0.020673364400863647, "skip_count": 2.0, "step": 4668, "text_loss": 0.47800472378730774 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0257568359375, "learning_rate": 0.0006383805778251735, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7533450.0, "repeat_count": 0.0, "routers_loss": 0.007217096630483866, "skip_count": 1.0, "step": 4670, "text_loss": 0.4506106972694397 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 21.934546521866746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.0006380831249177817, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 7536287.0, "repeat_count": 1.0, "routers_loss": 0.007001714315265417, "skip_count": 0.0, "step": 4672, "text_loss": 0.4081715941429138 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0006377856190966762, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 7539442.0, "repeat_count": 0.0, "routers_loss": 0.0015112817054614425, "skip_count": 0.0, "step": 4674, "text_loss": 0.21451139450073242 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 21.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0006374880604758615, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 7542594.0, "repeat_count": 0.0, "routers_loss": 0.007311929017305374, "skip_count": 2.0, "step": 4676, "text_loss": 0.14785248041152954 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 21.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0006371904491693626, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7545780.0, "repeat_count": 0.0, "routers_loss": 0.007489737123250961, "skip_count": 1.0, "step": 4678, "text_loss": 0.2248108983039856 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 21.972116231288524, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0006368927852912247, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 7548287.0, "repeat_count": 1.0, "routers_loss": 0.009772555902600288, "skip_count": 1.0, "step": 4680, "text_loss": 0.1566995233297348 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 21.981508658643968, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0006365950689555133, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7551424.0, "repeat_count": 0.0, "routers_loss": 0.002134992741048336, "skip_count": 0.0, "step": 4682, "text_loss": 0.7322417497634888 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 21.99090108599941, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0006362973002763139, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 7554182.0, "repeat_count": 1.0, "routers_loss": 0.008511497639119625, "skip_count": 4.0, "step": 4684, "text_loss": 0.24387991428375244 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04931640625, "learning_rate": 0.0006359994793677319, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 7557044.0, "repeat_count": 0.0, "routers_loss": 0.004151526838541031, "skip_count": 2.0, "step": 4686, "text_loss": 0.6139411330223083 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0006357016063438928, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 7560231.0, "repeat_count": 0.0, "routers_loss": 0.0009724601986818016, "skip_count": 0.0, "step": 4688, "text_loss": 0.7875718474388123 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 0.0006354036813189421, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 7562953.0, "repeat_count": 0.0, "routers_loss": 0.0008926765876822174, "skip_count": 0.0, "step": 4690, "text_loss": 0.5195512771606445 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.0006351057044070455, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 7566137.0, "repeat_count": 0.0, "routers_loss": 0.0031294538639485836, "skip_count": 0.0, "step": 4692, "text_loss": 0.7288873195648193 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.0006348076757223877, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 7569073.0, "repeat_count": 0.0, "routers_loss": 0.0015065820189192891, "skip_count": 2.0, "step": 4694, "text_loss": 0.7242236137390137 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0006345095953791746, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7573025.0, "repeat_count": 0.0, "routers_loss": 0.0005603441968560219, "skip_count": 0.0, "step": 4696, "text_loss": 0.34443899989128113 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02490234375, "learning_rate": 0.0006342114634916307, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7576546.0, "repeat_count": 0.0, "routers_loss": 0.0011047758162021637, "skip_count": 0.0, "step": 4698, "text_loss": 0.4892682731151581 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02490234375, "learning_rate": 0.0006339132801740008, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 7580711.0, "repeat_count": 0.0, "routers_loss": 0.0019803126342594624, "skip_count": 2.0, "step": 4700, "text_loss": 0.4479489028453827 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.07513941884356, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0458984375, "learning_rate": 0.0006336150455405494, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 7583385.0, "repeat_count": 1.0, "routers_loss": 0.0005326359532773495, "skip_count": 0.0, "step": 4702, "text_loss": 0.627504825592041 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0006333167597055604, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 7586584.0, "repeat_count": 0.0, "routers_loss": 0.0005587987834587693, "skip_count": 0.0, "step": 4704, "text_loss": 0.43891432881355286 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.0006330184227833376, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 7590408.0, "repeat_count": 0.0, "routers_loss": 0.007053783163428307, "skip_count": 2.0, "step": 4706, "text_loss": 0.19946859776973724 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.10331670090989, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0006327200348882043, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7593857.0, "repeat_count": 1.0, "routers_loss": 0.0009479080326855183, "skip_count": 0.0, "step": 4708, "text_loss": 0.7973214387893677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0006324215961345032, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7596429.0, "repeat_count": 0.0, "routers_loss": 0.0012403312139213085, "skip_count": 0.0, "step": 4710, "text_loss": 0.48477989435195923 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.0006321231066365966, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7599618.0, "repeat_count": 0.0, "routers_loss": 0.0005520360427908599, "skip_count": 0.0, "step": 4712, "text_loss": 0.44222453236579895 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0006318245665088665, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 7603180.0, "repeat_count": 0.0, "routers_loss": 0.0015553623670712113, "skip_count": 0.0, "step": 4714, "text_loss": 0.5132410526275635 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.0006315259758657138, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 7606457.0, "repeat_count": 0.0, "routers_loss": 0.004210884217172861, "skip_count": 1.0, "step": 4716, "text_loss": 0.39850690960884094 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.150278837687114, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.0006312273348215589, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 7609317.0, "repeat_count": 1.0, "routers_loss": 0.001220117206685245, "skip_count": 0.0, "step": 4718, "text_loss": 0.3509018123149872 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0006309286434908419, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 7613076.0, "repeat_count": 0.0, "routers_loss": 0.007768960203975439, "skip_count": 2.0, "step": 4720, "text_loss": 0.33361560106277466 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0006306299019880217, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7616242.0, "repeat_count": 0.0, "routers_loss": 0.006226699333637953, "skip_count": 0.0, "step": 4722, "text_loss": 0.23661087453365326 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.17845611975345, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0006303311104275766, "loss": 0.0073, "macro_f1": 0.6603773832321167, "num_tokens": 7619069.0, "repeat_count": 1.0, "routers_loss": 0.015590761788189411, "skip_count": 1.0, "step": 4724, "text_loss": 0.23373056948184967 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.187848547108892, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006300322689240041, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 7622581.0, "repeat_count": 1.0, "routers_loss": 0.006862971931695938, "skip_count": 2.0, "step": 4726, "text_loss": 0.8301828503608704 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.19724097446434, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.038818359375, "learning_rate": 0.0006297333775918209, "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 7625566.0, "repeat_count": 1.0, "routers_loss": 0.006256614346057177, "skip_count": 1.0, "step": 4728, "text_loss": 0.3756707012653351 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.206633401819783, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0006294344365455626, "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 7629047.0, "repeat_count": 1.0, "routers_loss": 0.009151885285973549, "skip_count": 2.0, "step": 4730, "text_loss": 0.33362850546836853 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0006291354458997841, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 7631847.0, "repeat_count": 0.0, "routers_loss": 0.0009307434665970504, "skip_count": 0.0, "step": 4732, "text_loss": 0.4572524130344391 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.225418256530673, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0006288364057690591, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7635181.0, "repeat_count": 0.0, "routers_loss": 0.00041220212006010115, "skip_count": 0.0, "step": 4734, "text_loss": 0.40211325883865356 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0006285373162679804, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7637752.0, "repeat_count": 0.0, "routers_loss": 0.0006696670898236334, "skip_count": 2.0, "step": 4736, "text_loss": 0.7588053345680237 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 22.24420311124156, "f1_execute": 0.9777777791023254, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0006282381775111597, "loss": 0.0081, "macro_f1": 0.9449735879898071, "num_tokens": 7640719.0, "repeat_count": 4.0, "routers_loss": 0.016283133998513222, "skip_count": 2.0, "step": 4738, "text_loss": 0.5697863101959229 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 22.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0006279389896132274, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7643524.0, "repeat_count": 0.0, "routers_loss": 0.00763951288536191, "skip_count": 3.0, "step": 4740, "text_loss": 0.548592209815979 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 22.26298796595245, "f1_execute": 0.9756097793579102, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0006276397526888329, "loss": 0.0094, "macro_f1": 0.925203263759613, "num_tokens": 7646919.0, "repeat_count": 3.0, "routers_loss": 0.038590483367443085, "skip_count": 5.0, "step": 4742, "text_loss": 0.27226054668426514 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037109375, "learning_rate": 0.0006273404668526443, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7650404.0, "repeat_count": 0.0, "routers_loss": 0.0012555639259517193, "skip_count": 0.0, "step": 4744, "text_loss": 0.47892290353775024 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0006270411322193488, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7652942.0, "repeat_count": 1.0, "routers_loss": 0.0015356402145698667, "skip_count": 0.0, "step": 4746, "text_loss": 0.5515767931938171 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.0006267417489036517, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7656269.0, "repeat_count": 0.0, "routers_loss": 0.005182140972465277, "skip_count": 0.0, "step": 4748, "text_loss": 0.3496028184890747 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.0006264423170202773, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7658664.0, "repeat_count": 0.0, "routers_loss": 0.004144361708313227, "skip_count": 0.0, "step": 4750, "text_loss": 0.2786032557487488 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0006261428366839685, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 7661471.0, "repeat_count": 0.0, "routers_loss": 0.00035335420398041606, "skip_count": 0.0, "step": 4752, "text_loss": 0.4838487505912781 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0006258433080094868, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7664593.0, "repeat_count": 0.0, "routers_loss": 0.0103341368958354, "skip_count": 2.0, "step": 4754, "text_loss": 0.24325360357761383 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0006255437311116119, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 7667573.0, "repeat_count": 0.0, "routers_loss": 0.014633853919804096, "skip_count": 2.0, "step": 4756, "text_loss": 0.21569855511188507 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.0006252441061051426, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7671171.0, "repeat_count": 0.0, "routers_loss": 0.004900569561868906, "skip_count": 0.0, "step": 4758, "text_loss": 0.12832018733024597 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006249444331048955, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 7673932.0, "repeat_count": 0.0, "routers_loss": 0.0020371589343994856, "skip_count": 0.0, "step": 4760, "text_loss": 0.38652482628822327 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.000624644712225706, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7677396.0, "repeat_count": 0.0, "routers_loss": 0.0028059002943336964, "skip_count": 2.0, "step": 4762, "text_loss": 0.7937633395195007 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.36630466686234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.0006243449435824276, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7680392.0, "repeat_count": 0.0, "routers_loss": 0.0007225095760077238, "skip_count": 0.0, "step": 4764, "text_loss": 0.5690395832061768 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.375697094217788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0006240451272899321, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 7684121.0, "repeat_count": 0.0, "routers_loss": 0.002052050782367587, "skip_count": 1.0, "step": 4766, "text_loss": 0.5321336984634399 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.38508952157323, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.0006237452634631099, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 7687236.0, "repeat_count": 1.0, "routers_loss": 0.0039039517287164927, "skip_count": 0.0, "step": 4768, "text_loss": 0.30823320150375366 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 22.394481948928675, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0303955078125, "learning_rate": 0.0006234453522168694, "loss": 0.0084, "macro_f1": 0.5492662787437439, "num_tokens": 7690355.0, "repeat_count": 0.0, "routers_loss": 0.014570238068699837, "skip_count": 2.0, "step": 4770, "text_loss": 0.21501587331295013 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.8333333134651184, "avg_layers": 25.0, "epoch": 22.403874376284122, "f1_execute": 0.949999988079071, "f1_repeat": 0.800000011920929, "f1_skip": 0.9090909361839294, "grad_norm": 0.04541015625, "learning_rate": 0.000623145393666137, "loss": 0.0069, "macro_f1": 0.886363685131073, "num_tokens": 7693559.0, "repeat_count": 3.0, "routers_loss": 0.061707716435194016, "skip_count": 6.0, "step": 4772, "text_loss": 0.24371100962162018 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0006228453879258576, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 7696422.0, "repeat_count": 0.0, "routers_loss": 0.005053870379924774, "skip_count": 2.0, "step": 4774, "text_loss": 0.237778440117836 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.0006225453351109934, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 7700460.0, "repeat_count": 0.0, "routers_loss": 0.0017990898340940475, "skip_count": 0.0, "step": 4776, "text_loss": 0.612456738948822 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03466796875, "learning_rate": 0.000622245235336526, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7703330.0, "repeat_count": 0.0, "routers_loss": 0.004507021512836218, "skip_count": 2.0, "step": 4778, "text_loss": 0.36898812651634216 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006219450887174537, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7707243.0, "repeat_count": 0.0, "routers_loss": 0.006295828148722649, "skip_count": 1.0, "step": 4780, "text_loss": 0.14474599063396454 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.0006216448953687932, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7711121.0, "repeat_count": 0.0, "routers_loss": 0.005049831233918667, "skip_count": 0.0, "step": 4782, "text_loss": 0.4696790277957916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0006213446554055795, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7714889.0, "repeat_count": 0.0, "routers_loss": 0.0006010758224874735, "skip_count": 0.0, "step": 4784, "text_loss": 0.46253830194473267 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 22.469621367772234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006210443689428649, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 7718420.0, "repeat_count": 3.0, "routers_loss": 0.006691234186291695, "skip_count": 1.0, "step": 4786, "text_loss": 0.579987645149231 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.00062074403609572, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7721720.0, "repeat_count": 0.0, "routers_loss": 0.001864895923063159, "skip_count": 0.0, "step": 4788, "text_loss": 0.325242817401886 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.488406222483125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0006204436569792324, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 7724916.0, "repeat_count": 0.0, "routers_loss": 0.00202955212444067, "skip_count": 0.0, "step": 4790, "text_loss": 0.49637556076049805 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 22.49779864983857, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0006201432317085083, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 7728081.0, "repeat_count": 1.0, "routers_loss": 0.0037843603640794754, "skip_count": 0.0, "step": 4792, "text_loss": 0.38812628388404846 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 22.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0006198427603986711, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7731457.0, "repeat_count": 0.0, "routers_loss": 0.012036679312586784, "skip_count": 3.0, "step": 4794, "text_loss": 0.2996312379837036 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.516583504549455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0006195422431648623, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 7734595.0, "repeat_count": 0.0, "routers_loss": 0.0008874868508428335, "skip_count": 1.0, "step": 4796, "text_loss": 0.3203189969062805 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.525975931904902, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0283203125, "learning_rate": 0.0006192416801222403, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 7737565.0, "repeat_count": 1.0, "routers_loss": 0.0032894534524530172, "skip_count": 1.0, "step": 4798, "text_loss": 0.3283322751522064 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.053955078125, "learning_rate": 0.0006189410713859815, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 7740439.0, "repeat_count": 0.0, "routers_loss": 0.009667043574154377, "skip_count": 2.0, "step": 4800, "text_loss": 0.25219282507896423 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 22.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0006186404170712797, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 7743813.0, "repeat_count": 0.0, "routers_loss": 0.012643060646951199, "skip_count": 4.0, "step": 4802, "text_loss": 0.22567439079284668 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0006183397172933462, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 7747182.0, "repeat_count": 0.0, "routers_loss": 0.002678517485037446, "skip_count": 0.0, "step": 4804, "text_loss": 0.19188879430294037 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0006180389721674101, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 7750735.0, "repeat_count": 0.0, "routers_loss": 0.0013385121710598469, "skip_count": 0.0, "step": 4806, "text_loss": 0.5860441327095032 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 0.000617738181808717, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7753843.0, "repeat_count": 0.0, "routers_loss": 0.0034869094379246235, "skip_count": 1.0, "step": 4808, "text_loss": 0.4366260766983032 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 0.0006174373463325306, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7757039.0, "repeat_count": 0.0, "routers_loss": 0.0013648992171511054, "skip_count": 0.0, "step": 4810, "text_loss": 0.5217258334159851 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.591722923393014, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0006171364658541314, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 7760016.0, "repeat_count": 1.0, "routers_loss": 0.0038017008919268847, "skip_count": 2.0, "step": 4812, "text_loss": 0.8130963444709778 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.601115350748458, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03466796875, "learning_rate": 0.0006168355404888177, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 7762961.0, "repeat_count": 0.0, "routers_loss": 0.006867518648505211, "skip_count": 2.0, "step": 4814, "text_loss": 0.17822521924972534 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.610507778103905, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0006165345703519043, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7766399.0, "repeat_count": 0.0, "routers_loss": 0.0004653502255678177, "skip_count": 0.0, "step": 4816, "text_loss": 0.5316070914268494 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.61990020545935, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0006162335555587238, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 7769039.0, "repeat_count": 1.0, "routers_loss": 0.0016906452365219593, "skip_count": 1.0, "step": 4818, "text_loss": 0.5680997967720032 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.629292632814792, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05615234375, "learning_rate": 0.0006159324962246257, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 7772768.0, "repeat_count": 0.0, "routers_loss": 0.002541248919442296, "skip_count": 0.0, "step": 4820, "text_loss": 0.6169226169586182 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0006156313924649762, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7775545.0, "repeat_count": 0.0, "routers_loss": 0.008644679561257362, "skip_count": 2.0, "step": 4822, "text_loss": 0.2211475968360901 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02880859375, "learning_rate": 0.0006153302443951589, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7778837.0, "repeat_count": 0.0, "routers_loss": 0.0041346061043441296, "skip_count": 2.0, "step": 4824, "text_loss": 0.5369775891304016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 0.0006150290521305746, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 7782309.0, "repeat_count": 0.0, "routers_loss": 0.0012756052892655134, "skip_count": 0.0, "step": 4826, "text_loss": 0.5294989943504333 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.666862342236573, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0006147278157866403, "loss": 0.0046, "macro_f1": 0.3272727429866791, "num_tokens": 7785565.0, "repeat_count": 0.0, "routers_loss": 0.029718991369009018, "skip_count": 1.0, "step": 4828, "text_loss": 0.6920449733734131 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0006144265354787906, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7788218.0, "repeat_count": 0.0, "routers_loss": 0.004829924553632736, "skip_count": 0.0, "step": 4830, "text_loss": 0.17072243988513947 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0006141252113224767, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7790788.0, "repeat_count": 0.0, "routers_loss": 0.00254037044942379, "skip_count": 0.0, "step": 4832, "text_loss": 0.20075996220111847 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01519775390625, "learning_rate": 0.0006138238434331666, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7793913.0, "repeat_count": 0.0, "routers_loss": 0.0004426188243087381, "skip_count": 0.0, "step": 4834, "text_loss": 0.695742130279541 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.70443205165835, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.022216796875, "learning_rate": 0.000613522431926345, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 7796932.0, "repeat_count": 1.0, "routers_loss": 0.005176798906177282, "skip_count": 3.0, "step": 4836, "text_loss": 0.4910822808742523 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.713824479013795, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0006132209769175132, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7800686.0, "repeat_count": 0.0, "routers_loss": 0.004120545461773872, "skip_count": 0.0, "step": 4838, "text_loss": 0.3701378405094147 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0006129194785221894, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7804765.0, "repeat_count": 0.0, "routers_loss": 0.0043835826218128204, "skip_count": 0.0, "step": 4840, "text_loss": 0.343635618686676 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0006126179368559086, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 7807498.0, "repeat_count": 0.0, "routers_loss": 0.001394893741235137, "skip_count": 1.0, "step": 4842, "text_loss": 0.47756674885749817 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.000612316352034222, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7810784.0, "repeat_count": 0.0, "routers_loss": 0.0031262130942195654, "skip_count": 2.0, "step": 4844, "text_loss": 0.13077901303768158 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.751394188435572, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.058349609375, "learning_rate": 0.0006120147241726972, "loss": 0.0081, "macro_f1": 0.8823530077934265, "num_tokens": 7814754.0, "repeat_count": 2.0, "routers_loss": 0.016139274463057518, "skip_count": 1.0, "step": 4846, "text_loss": 0.18850074708461761 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041015625, "learning_rate": 0.0006117130533869189, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7818245.0, "repeat_count": 0.0, "routers_loss": 0.0009124451316893101, "skip_count": 0.0, "step": 4848, "text_loss": 0.42503559589385986 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0006114113397924878, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7822214.0, "repeat_count": 0.0, "routers_loss": 0.0015132242115214467, "skip_count": 0.0, "step": 4850, "text_loss": 0.16767354309558868 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.779571470501907, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0006111095835050212, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 7825019.0, "repeat_count": 2.0, "routers_loss": 0.006253300234675407, "skip_count": 2.0, "step": 4852, "text_loss": 0.44826745986938477 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.788963897857354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.0006108077846401524, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 7828113.0, "repeat_count": 0.0, "routers_loss": 0.0024391328915953636, "skip_count": 0.0, "step": 4854, "text_loss": 0.2009880244731903 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.798356325212797, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0006105059433135317, "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 7831177.0, "repeat_count": 1.0, "routers_loss": 0.0020866121631115675, "skip_count": 1.0, "step": 4856, "text_loss": 0.7082528471946716 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025390625, "learning_rate": 0.0006102040596408251, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 7834485.0, "repeat_count": 0.0, "routers_loss": 0.004373365081846714, "skip_count": 1.0, "step": 4858, "text_loss": 0.2541539669036865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.817141179923688, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0006099021337377148, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7837749.0, "repeat_count": 0.0, "routers_loss": 0.004309024661779404, "skip_count": 0.0, "step": 4860, "text_loss": 0.3163885176181793 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 22.82653360727913, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.049072265625, "learning_rate": 0.0006096001657198995, "loss": 0.0065, "macro_f1": 0.6122449040412903, "num_tokens": 7840979.0, "repeat_count": 0.0, "routers_loss": 0.023044804111123085, "skip_count": 4.0, "step": 4862, "text_loss": 0.49609798192977905 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 22.835926034634575, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0250244140625, "learning_rate": 0.0006092981557030941, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 7844905.0, "repeat_count": 1.0, "routers_loss": 0.010683654807507992, "skip_count": 3.0, "step": 4864, "text_loss": 0.16866883635520935 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0006089961038030291, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 7847800.0, "repeat_count": 0.0, "routers_loss": 0.0011224723421037197, "skip_count": 0.0, "step": 4866, "text_loss": 0.5093055367469788 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0006086940101354515, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7850983.0, "repeat_count": 0.0, "routers_loss": 0.003944621421396732, "skip_count": 1.0, "step": 4868, "text_loss": 0.5753747224807739 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 22.86410331670091, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0289306640625, "learning_rate": 0.0006083918748161244, "loss": 0.0069, "macro_f1": 0.5492662787437439, "num_tokens": 7855041.0, "repeat_count": 0.0, "routers_loss": 0.02532145567238331, "skip_count": 2.0, "step": 4870, "text_loss": 0.8082366585731506 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.0006080896979608262, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7858058.0, "repeat_count": 0.0, "routers_loss": 0.0007558314246125519, "skip_count": 0.0, "step": 4872, "text_loss": 0.6476574540138245 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0615234375, "learning_rate": 0.000607787479685352, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7861223.0, "repeat_count": 0.0, "routers_loss": 0.0009224560926668346, "skip_count": 0.0, "step": 4874, "text_loss": 0.5012133717536926 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03515625, "learning_rate": 0.0006074852201055121, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 7864180.0, "repeat_count": 0.0, "routers_loss": 0.0028308273758739233, "skip_count": 0.0, "step": 4876, "text_loss": 0.7447214722633362 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.052734375, "learning_rate": 0.0006071829193371331, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7866726.0, "repeat_count": 0.0, "routers_loss": 0.0021505290642380714, "skip_count": 0.0, "step": 4878, "text_loss": 0.5444929599761963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11376953125, "learning_rate": 0.0006068805774960573, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7870166.0, "repeat_count": 0.0, "routers_loss": 0.0021109723020344973, "skip_count": 0.0, "step": 4880, "text_loss": 0.3577263355255127 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0308837890625, "learning_rate": 0.0006065781946981425, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7873028.0, "repeat_count": 0.0, "routers_loss": 0.0027144821360707283, "skip_count": 0.0, "step": 4882, "text_loss": 0.28464797139167786 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0006062757710592624, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7876747.0, "repeat_count": 0.0, "routers_loss": 0.0004638207610696554, "skip_count": 0.0, "step": 4884, "text_loss": 0.381534606218338 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.939242735544468, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0006059733066953066, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 7879524.0, "repeat_count": 1.0, "routers_loss": 0.002225410658866167, "skip_count": 2.0, "step": 4886, "text_loss": 0.5167883634567261 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0006056708017221796, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 7882809.0, "repeat_count": 0.0, "routers_loss": 0.00419368501752615, "skip_count": 1.0, "step": 4888, "text_loss": 0.22688335180282593 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.000605368256255802, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7886310.0, "repeat_count": 0.0, "routers_loss": 0.0017340193735435605, "skip_count": 1.0, "step": 4890, "text_loss": 1.0128135681152344 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 22.967420017610802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0006050656704121098, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 7889483.0, "repeat_count": 0.0, "routers_loss": 0.0016647159354761243, "skip_count": 0.0, "step": 4892, "text_loss": 0.2213262915611267 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 22.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0006047630443070547, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7892615.0, "repeat_count": 0.0, "routers_loss": 0.0038971947506070137, "skip_count": 3.0, "step": 4894, "text_loss": 0.45751357078552246 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 22.98620487232169, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0006044603780566032, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 7895747.0, "repeat_count": 1.0, "routers_loss": 0.0036852145567536354, "skip_count": 1.0, "step": 4896, "text_loss": 0.13489919900894165 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 22.995597299677137, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0006041576717767379, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7899155.0, "repeat_count": 0.0, "routers_loss": 0.007661987561732531, "skip_count": 1.0, "step": 4898, "text_loss": 0.281853586435318 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.00469621367772, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0006038549255834563, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 7901667.0, "repeat_count": 2.0, "routers_loss": 0.01836695335805416, "skip_count": 5.0, "step": 4900, "text_loss": 0.24879895150661469 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.014088641033165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.000603552139592771, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7904506.0, "repeat_count": 0.0, "routers_loss": 0.0011829182039946318, "skip_count": 0.0, "step": 4902, "text_loss": 0.7550268769264221 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 23.023481068388612, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0006032493139207106, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7907316.0, "repeat_count": 1.0, "routers_loss": 0.0022891140542924404, "skip_count": 0.0, "step": 4904, "text_loss": 0.37596020102500916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0006029464486833186, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 7911283.0, "repeat_count": 0.0, "routers_loss": 0.001990227960050106, "skip_count": 0.0, "step": 4906, "text_loss": 0.5879577994346619 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0211181640625, "learning_rate": 0.0006026435439966531, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 7913907.0, "repeat_count": 0.0, "routers_loss": 0.0026039890944957733, "skip_count": 1.0, "step": 4908, "text_loss": 0.41484713554382324 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0006023405999767879, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7916772.0, "repeat_count": 0.0, "routers_loss": 0.009183229878544807, "skip_count": 1.0, "step": 4910, "text_loss": 0.20732562243938446 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0302734375, "learning_rate": 0.0006020376167398116, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7919346.0, "repeat_count": 0.0, "routers_loss": 0.005508727394044399, "skip_count": 1.0, "step": 4912, "text_loss": 0.41416165232658386 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 23.070443205165834, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0006017345944018284, "loss": 0.0051, "macro_f1": 0.3272727429866791, "num_tokens": 7922404.0, "repeat_count": 0.0, "routers_loss": 0.008651934564113617, "skip_count": 0.0, "step": 4914, "text_loss": 0.4290519952774048 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0006014315330789563, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 7925165.0, "repeat_count": 0.0, "routers_loss": 0.003601635340601206, "skip_count": 1.0, "step": 4916, "text_loss": 0.8447931408882141 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.089228059876724, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0006011284328873296, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 7928146.0, "repeat_count": 1.0, "routers_loss": 0.0049415635876357555, "skip_count": 2.0, "step": 4918, "text_loss": 0.32237401604652405 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0006008252939430967, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7931163.0, "repeat_count": 0.0, "routers_loss": 0.0024150956887751818, "skip_count": 0.0, "step": 4920, "text_loss": 0.2251713126897812 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.108012914587615, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04150390625, "learning_rate": 0.0006005221163624209, "loss": 0.0057, "macro_f1": 0.3272727429866791, "num_tokens": 7934084.0, "repeat_count": 1.0, "routers_loss": 0.03181030973792076, "skip_count": 0.0, "step": 4922, "text_loss": 0.4962928593158722 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.054931640625, "learning_rate": 0.0006002189002614806, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 7937021.0, "repeat_count": 0.0, "routers_loss": 0.00227518193423748, "skip_count": 2.0, "step": 4924, "text_loss": 0.34440335631370544 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0005999156457564685, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 7940205.0, "repeat_count": 0.0, "routers_loss": 0.004331593867391348, "skip_count": 1.0, "step": 4926, "text_loss": 0.14114083349704742 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0005996123529635925, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7945174.0, "repeat_count": 0.0, "routers_loss": 0.000612895586527884, "skip_count": 0.0, "step": 4928, "text_loss": 0.3895469009876251 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.145582624009393, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.000599309021999075, "loss": 0.006, "macro_f1": 0.3272727429866791, "num_tokens": 7948716.0, "repeat_count": 0.0, "routers_loss": 0.02319233864545822, "skip_count": 1.0, "step": 4930, "text_loss": 0.38103172183036804 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0005990056529791528, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7952497.0, "repeat_count": 0.0, "routers_loss": 0.003423231653869152, "skip_count": 0.0, "step": 4932, "text_loss": 0.30447322130203247 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.017822265625, "learning_rate": 0.0005987022460200778, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7955578.0, "repeat_count": 0.0, "routers_loss": 0.0007005351362749934, "skip_count": 0.0, "step": 4934, "text_loss": 0.49621838331222534 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 23.173759906075727, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0234375, "learning_rate": 0.0005983988012381159, "loss": 0.0061, "macro_f1": 0.8823530077934265, "num_tokens": 7958741.0, "repeat_count": 2.0, "routers_loss": 0.03962617367506027, "skip_count": 1.0, "step": 4936, "text_loss": 0.1920493096113205 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.022216796875, "learning_rate": 0.0005980953187495476, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 7962236.0, "repeat_count": 0.0, "routers_loss": 0.0026006060652434826, "skip_count": 3.0, "step": 4938, "text_loss": 0.5286803841590881 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0224609375, "learning_rate": 0.0005977917986706681, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7965631.0, "repeat_count": 0.0, "routers_loss": 0.005010952707380056, "skip_count": 0.0, "step": 4940, "text_loss": 0.3507745563983917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0005974882411177871, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 7968516.0, "repeat_count": 0.0, "routers_loss": 0.0023964287247508764, "skip_count": 0.0, "step": 4942, "text_loss": 0.9110504388809204 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.000597184646207228, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7971310.0, "repeat_count": 0.0, "routers_loss": 0.0026230409275740385, "skip_count": 1.0, "step": 4944, "text_loss": 0.4131232798099518 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.0005968810140553292, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 7974809.0, "repeat_count": 0.0, "routers_loss": 0.0007397596491500735, "skip_count": 0.0, "step": 4946, "text_loss": 0.5130466222763062 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.230114470208395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0005965773447784431, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7977800.0, "repeat_count": 0.0, "routers_loss": 0.0009955473942682147, "skip_count": 0.0, "step": 4948, "text_loss": 0.5366153717041016 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01373291015625, "learning_rate": 0.0005962736384929362, "loss": 0.0026, "macro_f1": 0.3333333432674408, "num_tokens": 7981027.0, "repeat_count": 0.0, "routers_loss": 0.0049227322451770306, "skip_count": 0.0, "step": 4950, "text_loss": 0.17266370356082916 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0005959698953151895, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7983580.0, "repeat_count": 0.0, "routers_loss": 0.0009975163266062737, "skip_count": 0.0, "step": 4952, "text_loss": 0.2474549114704132 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0255126953125, "learning_rate": 0.0005956661153615979, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 7986711.0, "repeat_count": 0.0, "routers_loss": 0.0006475782720372081, "skip_count": 0.0, "step": 4954, "text_loss": 0.5748327970504761 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02294921875, "learning_rate": 0.0005953622987485703, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7990194.0, "repeat_count": 0.0, "routers_loss": 0.001449751085601747, "skip_count": 0.0, "step": 4956, "text_loss": 0.5163559317588806 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.0005950584455925301, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7993050.0, "repeat_count": 0.0, "routers_loss": 0.0017087773885577917, "skip_count": 0.0, "step": 4958, "text_loss": 0.15892620384693146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.286469034341064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0005947545560099142, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 7996383.0, "repeat_count": 0.0, "routers_loss": 0.0044417232275009155, "skip_count": 0.0, "step": 4960, "text_loss": 0.48022928833961487 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 23.295861461696507, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.031982421875, "learning_rate": 0.0005944506301171734, "loss": 0.0066, "macro_f1": 0.5492662787437439, "num_tokens": 7999843.0, "repeat_count": 0.0, "routers_loss": 0.010093312710523605, "skip_count": 2.0, "step": 4962, "text_loss": 0.5050316452980042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.30525388905195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0005941466680307732, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8003504.0, "repeat_count": 0.0, "routers_loss": 0.009699694812297821, "skip_count": 0.0, "step": 4964, "text_loss": 0.30474427342414856 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 23.314646316407398, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.040771484375, "learning_rate": 0.0005938426698671922, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 8007427.0, "repeat_count": 1.0, "routers_loss": 0.0016759657301008701, "skip_count": 0.0, "step": 4966, "text_loss": 0.25060293078422546 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.32403874376284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04443359375, "learning_rate": 0.0005935386357429232, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 8010265.0, "repeat_count": 2.0, "routers_loss": 0.006916914135217667, "skip_count": 3.0, "step": 4968, "text_loss": 0.49084481596946716 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 23.333431171118285, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.0005932345657744723, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 8013733.0, "repeat_count": 1.0, "routers_loss": 0.017182426527142525, "skip_count": 5.0, "step": 4970, "text_loss": 0.2705717980861664 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.00059293046007836, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8017068.0, "repeat_count": 0.0, "routers_loss": 0.008485594764351845, "skip_count": 2.0, "step": 4972, "text_loss": 0.18570218980312347 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03515625, "learning_rate": 0.0005926263187711201, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 8020185.0, "repeat_count": 0.0, "routers_loss": 0.0021750847809016705, "skip_count": 2.0, "step": 4974, "text_loss": 0.4457069933414459 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.0005923221419693001, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 8023038.0, "repeat_count": 0.0, "routers_loss": 0.0020193420350551605, "skip_count": 0.0, "step": 4976, "text_loss": 0.7394505143165588 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.054931640625, "learning_rate": 0.0005920179297894613, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 8026236.0, "repeat_count": 0.0, "routers_loss": 0.001450369250960648, "skip_count": 1.0, "step": 4978, "text_loss": 0.5914503335952759 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.38039330789551, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.000591713682348178, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8028765.0, "repeat_count": 0.0, "routers_loss": 0.0017808573320508003, "skip_count": 0.0, "step": 4980, "text_loss": 0.19231407344341278 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005914093997620388, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8032043.0, "repeat_count": 0.0, "routers_loss": 0.0018225493840873241, "skip_count": 0.0, "step": 4982, "text_loss": 0.3567875325679779 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.399178162606397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0005911050821476449, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8035086.0, "repeat_count": 0.0, "routers_loss": 0.0016285666497424245, "skip_count": 0.0, "step": 4984, "text_loss": 0.34609633684158325 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0220947265625, "learning_rate": 0.0005908007296216119, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8038193.0, "repeat_count": 0.0, "routers_loss": 0.0014699801104143262, "skip_count": 0.0, "step": 4986, "text_loss": 0.4492359757423401 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0245361328125, "learning_rate": 0.000590496342300568, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 8041099.0, "repeat_count": 0.0, "routers_loss": 0.002442725468426943, "skip_count": 0.0, "step": 4988, "text_loss": 0.5162975788116455 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0005901919203011548, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 8044350.0, "repeat_count": 0.0, "routers_loss": 0.008624207228422165, "skip_count": 2.0, "step": 4990, "text_loss": 0.2533033490180969 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.0005898874637400279, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 8047467.0, "repeat_count": 0.0, "routers_loss": 0.0015421364223584533, "skip_count": 0.0, "step": 4992, "text_loss": 0.4890289306640625 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.44614029938362, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0005895829727338552, "loss": 0.0065, "macro_f1": 1.0, "num_tokens": 8050626.0, "repeat_count": 1.0, "routers_loss": 0.0024516626726835966, "skip_count": 2.0, "step": 4994, "text_loss": 0.50797039270401 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0005892784473993184, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 8053386.0, "repeat_count": 0.0, "routers_loss": 0.0018553845584392548, "skip_count": 2.0, "step": 4996, "text_loss": 0.628828763961792 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0286865234375, "learning_rate": 0.000588973887853112, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8055941.0, "repeat_count": 0.0, "routers_loss": 0.004258487373590469, "skip_count": 0.0, "step": 4998, "text_loss": 0.2643229067325592 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.474317581449956, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.0005886692942119441, "loss": 0.0062, "macro_f1": 0.8820862174034119, "num_tokens": 8058638.0, "repeat_count": 2.0, "routers_loss": 0.019064312800765038, "skip_count": 2.0, "step": 5000, "text_loss": 0.4925006031990051 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.0005883646665925353, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 8062097.0, "repeat_count": 0.0, "routers_loss": 0.0007969749276526272, "skip_count": 0.0, "step": 5002, "text_loss": 0.49412909150123596 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0390625, "learning_rate": 0.0005880600051116196, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 8065202.0, "repeat_count": 0.0, "routers_loss": 0.005813780706375837, "skip_count": 2.0, "step": 5004, "text_loss": 0.5681346654891968 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04052734375, "learning_rate": 0.0005877553098859439, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 8068574.0, "repeat_count": 0.0, "routers_loss": 0.005012941546738148, "skip_count": 0.0, "step": 5006, "text_loss": 0.2682424485683441 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0005874505810322678, "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 8071834.0, "repeat_count": 0.0, "routers_loss": 0.005859757773578167, "skip_count": 3.0, "step": 5008, "text_loss": 0.6460036039352417 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.000587145818667364, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 8074687.0, "repeat_count": 0.0, "routers_loss": 0.002868571551516652, "skip_count": 2.0, "step": 5010, "text_loss": 0.2405751347541809 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033203125, "learning_rate": 0.0005868410229080181, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8077617.0, "repeat_count": 0.0, "routers_loss": 0.0021759893279522657, "skip_count": 1.0, "step": 5012, "text_loss": 0.7455595135688782 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.0005865361938710286, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8080734.0, "repeat_count": 0.0, "routers_loss": 0.0008311949786730111, "skip_count": 0.0, "step": 5014, "text_loss": 0.44876906275749207 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 25.0, "epoch": 23.549457000293515, "f1_execute": 0.9756097793579102, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.0390625, "learning_rate": 0.0005862313316732063, "loss": 0.0054, "macro_f1": 0.9615669250488281, "num_tokens": 8085092.0, "repeat_count": 2.0, "routers_loss": 0.012511664070189, "skip_count": 6.0, "step": 5016, "text_loss": 0.26010942459106445 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.000585926436431375, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 8088333.0, "repeat_count": 0.0, "routers_loss": 0.0035441694781184196, "skip_count": 0.0, "step": 5018, "text_loss": 0.28225192427635193 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 23.568241855004402, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.031494140625, "learning_rate": 0.0005856215082623711, "loss": 0.0093, "macro_f1": 0.8823530077934265, "num_tokens": 8091298.0, "repeat_count": 1.0, "routers_loss": 0.023543989285826683, "skip_count": 2.0, "step": 5020, "text_loss": 0.5757577419281006 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020263671875, "learning_rate": 0.0005853165472830439, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8094361.0, "repeat_count": 0.0, "routers_loss": 0.003124240320175886, "skip_count": 0.0, "step": 5022, "text_loss": 0.4021305739879608 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037841796875, "learning_rate": 0.0005850115536102546, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8097514.0, "repeat_count": 0.0, "routers_loss": 0.008170558139681816, "skip_count": 1.0, "step": 5024, "text_loss": 0.18926584720611572 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 23.596419137070736, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0289306640625, "learning_rate": 0.0005847065273608777, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 8100525.0, "repeat_count": 1.0, "routers_loss": 0.02127663604915142, "skip_count": 5.0, "step": 5026, "text_loss": 0.18827557563781738 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 0.0005844014686517998, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 8104016.0, "repeat_count": 0.0, "routers_loss": 0.00272122910246253, "skip_count": 0.0, "step": 5028, "text_loss": 0.15534701943397522 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 23.615203991781627, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.0005840963775999199, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 8106697.0, "repeat_count": 5.0, "routers_loss": 0.008979840204119682, "skip_count": 4.0, "step": 5030, "text_loss": 0.8123718500137329 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0005837912543221493, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 8110986.0, "repeat_count": 0.0, "routers_loss": 0.005006929859519005, "skip_count": 0.0, "step": 5032, "text_loss": 0.26128846406936646 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0005834860989354121, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 8114010.0, "repeat_count": 0.0, "routers_loss": 0.0005531277856789529, "skip_count": 0.0, "step": 5034, "text_loss": 0.5100266933441162 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.64338127384796, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0556640625, "learning_rate": 0.0005831809115566442, "loss": 0.0073, "macro_f1": 0.6538461446762085, "num_tokens": 8117168.0, "repeat_count": 2.0, "routers_loss": 0.04978533461689949, "skip_count": 1.0, "step": 5036, "text_loss": 0.41049885749816895 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0005828756923027941, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8119900.0, "repeat_count": 0.0, "routers_loss": 0.0006322385743260384, "skip_count": 0.0, "step": 5038, "text_loss": 0.5584380626678467 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.037353515625, "learning_rate": 0.0005825704412908225, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 8123928.0, "repeat_count": 0.0, "routers_loss": 0.001000594231300056, "skip_count": 0.0, "step": 5040, "text_loss": 0.6460791230201721 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 0.0005822651586377019, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 8127926.0, "repeat_count": 0.0, "routers_loss": 0.011595834977924824, "skip_count": 2.0, "step": 5042, "text_loss": 0.3131820261478424 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0284423828125, "learning_rate": 0.0005819598444604173, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 8131092.0, "repeat_count": 0.0, "routers_loss": 0.004449303261935711, "skip_count": 3.0, "step": 5044, "text_loss": 0.2774372696876526 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0005816544988759658, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 8134051.0, "repeat_count": 0.0, "routers_loss": 0.0007877505850046873, "skip_count": 0.0, "step": 5046, "text_loss": 0.39496293663978577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025390625, "learning_rate": 0.0005813491220013563, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 8138725.0, "repeat_count": 0.0, "routers_loss": 0.002868623472750187, "skip_count": 0.0, "step": 5048, "text_loss": 0.3779948651790619 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.709128265336073, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0005810437139536098, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 8141913.0, "repeat_count": 2.0, "routers_loss": 0.006244937423616648, "skip_count": 4.0, "step": 5050, "text_loss": 0.4512978494167328 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06396484375, "learning_rate": 0.0005807382748497592, "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 8146193.0, "repeat_count": 0.0, "routers_loss": 0.0011013929033651948, "skip_count": 0.0, "step": 5052, "text_loss": 0.6194499731063843 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.0005804328048068493, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8149701.0, "repeat_count": 0.0, "routers_loss": 0.005505079869180918, "skip_count": 1.0, "step": 5054, "text_loss": 0.2932305335998535 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 23.737305547402407, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005801273039419368, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 8152861.0, "repeat_count": 1.0, "routers_loss": 0.0057641929015517235, "skip_count": 1.0, "step": 5056, "text_loss": 0.2631317973136902 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 23.74669797475785, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0005798217723720904, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 8155843.0, "repeat_count": 1.0, "routers_loss": 0.0021671492140740156, "skip_count": 5.0, "step": 5058, "text_loss": 0.2889988422393799 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0005795162102143902, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 8158812.0, "repeat_count": 0.0, "routers_loss": 0.004476628266274929, "skip_count": 1.0, "step": 5060, "text_loss": 0.48028868436813354 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.76548282946874, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.0005792106175859283, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 8162719.0, "repeat_count": 1.0, "routers_loss": 0.0038497636560350657, "skip_count": 3.0, "step": 5062, "text_loss": 0.4559471607208252 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.774875256824185, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0250244140625, "learning_rate": 0.0005789049946038083, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 8165692.0, "repeat_count": 0.0, "routers_loss": 0.004451582673937082, "skip_count": 0.0, "step": 5064, "text_loss": 0.3782602548599243 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.0005785993413851456, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8168900.0, "repeat_count": 0.0, "routers_loss": 0.002951978938654065, "skip_count": 0.0, "step": 5066, "text_loss": 0.32392629981040955 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0255126953125, "learning_rate": 0.000578293658047067, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 8171661.0, "repeat_count": 0.0, "routers_loss": 0.011171254329383373, "skip_count": 2.0, "step": 5068, "text_loss": 0.24492619931697845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0005779879447067109, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 8175075.0, "repeat_count": 0.0, "routers_loss": 0.0016067599644884467, "skip_count": 0.0, "step": 5070, "text_loss": 0.7738823294639587 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.000577682201481227, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8178515.0, "repeat_count": 0.0, "routers_loss": 0.009113503620028496, "skip_count": 1.0, "step": 5072, "text_loss": 0.2082248032093048 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 23.82183739360141, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.039306640625, "learning_rate": 0.0005773764284877774, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 8181790.0, "repeat_count": 1.0, "routers_loss": 0.007332196459174156, "skip_count": 1.0, "step": 5074, "text_loss": 0.4557662904262543 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0537109375, "learning_rate": 0.0005770706258435342, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8184854.0, "repeat_count": 0.0, "routers_loss": 0.0016252279747277498, "skip_count": 0.0, "step": 5076, "text_loss": 0.2888098657131195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.840622248312297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0005767647936656818, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 8187860.0, "repeat_count": 0.0, "routers_loss": 0.003406575648114085, "skip_count": 0.0, "step": 5078, "text_loss": 0.6533790230751038 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0005764589320714158, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 8191683.0, "repeat_count": 0.0, "routers_loss": 0.0006520140450447798, "skip_count": 0.0, "step": 5080, "text_loss": 0.6903796195983887 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.030517578125, "learning_rate": 0.0005761530411779426, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 8195109.0, "repeat_count": 0.0, "routers_loss": 0.01188349537551403, "skip_count": 1.0, "step": 5082, "text_loss": 0.20460398495197296 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.083984375, "learning_rate": 0.0005758471211024804, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 8198340.0, "repeat_count": 0.0, "routers_loss": 0.004826809279620647, "skip_count": 3.0, "step": 5084, "text_loss": 0.2203969657421112 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.878191957734078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020751953125, "learning_rate": 0.0005755411719622584, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8200882.0, "repeat_count": 0.0, "routers_loss": 0.0019170823507010937, "skip_count": 0.0, "step": 5086, "text_loss": 0.6744595170021057 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.0005752351938745167, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 8203777.0, "repeat_count": 0.0, "routers_loss": 0.002110893838107586, "skip_count": 1.0, "step": 5088, "text_loss": 0.4137859046459198 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0194091796875, "learning_rate": 0.000574929186956507, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 8207627.0, "repeat_count": 0.0, "routers_loss": 0.0018580821342766285, "skip_count": 1.0, "step": 5090, "text_loss": 0.4830456078052521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.906369239800412, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0005746231513254912, "loss": 0.0066, "macro_f1": 0.3272727429866791, "num_tokens": 8210263.0, "repeat_count": 1.0, "routers_loss": 0.0194723978638649, "skip_count": 0.0, "step": 5092, "text_loss": 0.17383277416229248 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 23.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005743170870987433, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 8214166.0, "repeat_count": 0.0, "routers_loss": 0.006944256369024515, "skip_count": 2.0, "step": 5094, "text_loss": 0.20003484189510345 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0005740109943935472, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8217545.0, "repeat_count": 0.0, "routers_loss": 0.002044794149696827, "skip_count": 1.0, "step": 5096, "text_loss": 0.5117167830467224 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 23.934546521866746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06494140625, "learning_rate": 0.0005737048733271986, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 8220673.0, "repeat_count": 1.0, "routers_loss": 0.009966124780476093, "skip_count": 2.0, "step": 5098, "text_loss": 0.2705996036529541 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033203125, "learning_rate": 0.0005733987240170035, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 8223796.0, "repeat_count": 0.0, "routers_loss": 0.0009675708715803921, "skip_count": 0.0, "step": 5100, "text_loss": 0.7016357183456421 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0208740234375, "learning_rate": 0.0005730925465802788, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8227048.0, "repeat_count": 0.0, "routers_loss": 0.0009548200177960098, "skip_count": 0.0, "step": 5102, "text_loss": 0.30823078751564026 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0005727863411343526, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8229971.0, "repeat_count": 0.0, "routers_loss": 0.0005767418188042939, "skip_count": 0.0, "step": 5104, "text_loss": 0.6897505521774292 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 23.972116231288524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0005724801077965629, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8232758.0, "repeat_count": 0.0, "routers_loss": 0.009297889657318592, "skip_count": 3.0, "step": 5106, "text_loss": 0.21293514966964722 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 23.981508658643968, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0005721738466842592, "loss": 0.0079, "macro_f1": 0.3272727429866791, "num_tokens": 8238154.0, "repeat_count": 1.0, "routers_loss": 0.013964693062007427, "skip_count": 0.0, "step": 5108, "text_loss": 0.7273620367050171 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 23.99090108599941, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0005718675579148014, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 8240818.0, "repeat_count": 3.0, "routers_loss": 0.007218098267912865, "skip_count": 1.0, "step": 5110, "text_loss": 0.5607150793075562 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0005715612416055598, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 8244048.0, "repeat_count": 0.0, "routers_loss": 0.007558444049209356, "skip_count": 2.0, "step": 5112, "text_loss": 0.23694385588169098 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.009392427355444, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.042724609375, "learning_rate": 0.0005712548978739154, "loss": 0.0072, "macro_f1": 0.6603773832321167, "num_tokens": 8247240.0, "repeat_count": 1.0, "routers_loss": 0.015726923942565918, "skip_count": 1.0, "step": 5114, "text_loss": 0.6032099723815918 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.01878485471089, "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.019775390625, "learning_rate": 0.0005709485268372598, "loss": 0.0046, "macro_f1": 0.9262410998344421, "num_tokens": 8250585.0, "repeat_count": 3.0, "routers_loss": 0.011148860678076744, "skip_count": 2.0, "step": 5116, "text_loss": 0.6825997233390808 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0005706421286129948, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 8254240.0, "repeat_count": 0.0, "routers_loss": 0.006977916229516268, "skip_count": 0.0, "step": 5118, "text_loss": 0.2532844543457031 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0005703357033185328, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 8257133.0, "repeat_count": 0.0, "routers_loss": 0.006415650714188814, "skip_count": 2.0, "step": 5120, "text_loss": 0.6132124066352844 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.046962136777225, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.0005700292510712967, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 8261076.0, "repeat_count": 1.0, "routers_loss": 0.0044475216418504715, "skip_count": 1.0, "step": 5122, "text_loss": 0.4277699887752533 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0291748046875, "learning_rate": 0.0005697227719887194, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 8264607.0, "repeat_count": 0.0, "routers_loss": 0.005743155721575022, "skip_count": 2.0, "step": 5124, "text_loss": 0.2570968270301819 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.065746991488112, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0005694162661882444, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8267992.0, "repeat_count": 0.0, "routers_loss": 0.0007581565878354013, "skip_count": 0.0, "step": 5126, "text_loss": 0.5850184559822083 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.07513941884356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0005691097337873252, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 8271010.0, "repeat_count": 0.0, "routers_loss": 0.0036611228715628386, "skip_count": 0.0, "step": 5128, "text_loss": 0.660999059677124 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0005688031749034258, "loss": 0.0032, "macro_f1": 0.3333333432674408, "num_tokens": 8273638.0, "repeat_count": 0.0, "routers_loss": 0.0039906189776957035, "skip_count": 0.0, "step": 5130, "text_loss": 0.5839648246765137 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.093924273554446, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.037109375, "learning_rate": 0.0005684965896540198, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 8276504.0, "repeat_count": 1.0, "routers_loss": 0.007539632264524698, "skip_count": 3.0, "step": 5132, "text_loss": 0.27675092220306396 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 24.10331670090989, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0005681899781565915, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 8279977.0, "repeat_count": 2.0, "routers_loss": 0.0026953567285090685, "skip_count": 0.0, "step": 5134, "text_loss": 0.532974123954773 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.000567883340528635, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 8282781.0, "repeat_count": 0.0, "routers_loss": 0.005754240322858095, "skip_count": 1.0, "step": 5136, "text_loss": 0.31100207567214966 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005675766768876542, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 8286533.0, "repeat_count": 0.0, "routers_loss": 0.0051517849788069725, "skip_count": 0.0, "step": 5138, "text_loss": 0.5734741687774658 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005672699873511635, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 8289858.0, "repeat_count": 0.0, "routers_loss": 0.0025852699764072895, "skip_count": 2.0, "step": 5140, "text_loss": 0.37045374512672424 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005669632720366868, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 8293038.0, "repeat_count": 0.0, "routers_loss": 0.0038520018570125103, "skip_count": 0.0, "step": 5142, "text_loss": 0.25952374935150146 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0005666565310617577, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8295717.0, "repeat_count": 0.0, "routers_loss": 0.00026914477348327637, "skip_count": 0.0, "step": 5144, "text_loss": 0.32531213760375977 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 0.0005663497645439203, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 8299750.0, "repeat_count": 0.0, "routers_loss": 0.0055860537104308605, "skip_count": 2.0, "step": 5146, "text_loss": 0.2520618438720703 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.0005660429726007279, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 8303075.0, "repeat_count": 0.0, "routers_loss": 0.004446739796549082, "skip_count": 1.0, "step": 5148, "text_loss": 0.43672287464141846 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.17845611975345, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.07080078125, "learning_rate": 0.000565736155349744, "loss": 0.0076, "macro_f1": 0.8814815282821655, "num_tokens": 8306268.0, "repeat_count": 2.0, "routers_loss": 0.046915046870708466, "skip_count": 4.0, "step": 5150, "text_loss": 0.35405927896499634 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 24.187848547108892, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0005654293129085412, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8310480.0, "repeat_count": 0.0, "routers_loss": 0.010549088008701801, "skip_count": 4.0, "step": 5152, "text_loss": 0.3523249626159668 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.19724097446434, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0005651224453947023, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8313367.0, "repeat_count": 1.0, "routers_loss": 0.002893900265917182, "skip_count": 0.0, "step": 5154, "text_loss": 0.4503810703754425 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0005648155529258195, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 8318006.0, "repeat_count": 0.0, "routers_loss": 0.0018450213829055429, "skip_count": 0.0, "step": 5156, "text_loss": 0.5687127113342285 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.216025829175226, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.047119140625, "learning_rate": 0.0005645086356194943, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8320646.0, "repeat_count": 0.0, "routers_loss": 0.0026727779768407345, "skip_count": 0.0, "step": 5158, "text_loss": 0.38920050859451294 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.225418256530673, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0005642016935933385, "loss": 0.0035, "macro_f1": 1.0, "num_tokens": 8323915.0, "repeat_count": 1.0, "routers_loss": 0.00611621281132102, "skip_count": 2.0, "step": 5160, "text_loss": 0.3003547787666321 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 24.0, "epoch": 24.234810683886117, "f1_execute": 0.9767441749572754, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.0257568359375, "learning_rate": 0.0005638947269649726, "loss": 0.0063, "macro_f1": 0.9619450569152832, "num_tokens": 8327073.0, "repeat_count": 1.0, "routers_loss": 0.028447439894080162, "skip_count": 6.0, "step": 5162, "text_loss": 0.24053414165973663 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.24420311124156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029541015625, "learning_rate": 0.0005635877358520268, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8330388.0, "repeat_count": 0.0, "routers_loss": 0.0013072624569758773, "skip_count": 0.0, "step": 5164, "text_loss": 0.43772217631340027 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0005632807203721406, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 8333241.0, "repeat_count": 0.0, "routers_loss": 0.0009456822881475091, "skip_count": 0.0, "step": 5166, "text_loss": 0.5217573046684265 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.020751953125, "learning_rate": 0.000562973680642963, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8337257.0, "repeat_count": 0.0, "routers_loss": 0.0023840824142098427, "skip_count": 0.0, "step": 5168, "text_loss": 0.31814974546432495 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0196533203125, "learning_rate": 0.0005626666167821521, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 8340143.0, "repeat_count": 0.0, "routers_loss": 0.0020231492817401886, "skip_count": 3.0, "step": 5170, "text_loss": 0.5478505492210388 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0162353515625, "learning_rate": 0.0005623595289073755, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 8343566.0, "repeat_count": 1.0, "routers_loss": 0.01070715207606554, "skip_count": 2.0, "step": 5172, "text_loss": 0.23213914036750793 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.0005620524171363099, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 8346836.0, "repeat_count": 0.0, "routers_loss": 0.003720001084730029, "skip_count": 3.0, "step": 5174, "text_loss": 0.5114789009094238 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.30055767537423, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0005617452815866409, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 8349726.0, "repeat_count": 1.0, "routers_loss": 0.003322509117424488, "skip_count": 1.0, "step": 5176, "text_loss": 0.4894506335258484 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0005614381223760635, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 8352478.0, "repeat_count": 0.0, "routers_loss": 0.00028752797516062856, "skip_count": 0.0, "step": 5178, "text_loss": 0.6418307423591614 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.0005611309396222817, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 8355766.0, "repeat_count": 0.0, "routers_loss": 0.0028724796138703823, "skip_count": 0.0, "step": 5180, "text_loss": 0.23635952174663544 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.328734957440563, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035400390625, "learning_rate": 0.0005608237334430085, "loss": 0.0068, "macro_f1": 0.6601307392120361, "num_tokens": 8358888.0, "repeat_count": 1.0, "routers_loss": 0.058520980179309845, "skip_count": 2.0, "step": 5182, "text_loss": 0.23434793949127197 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1015625, "learning_rate": 0.000560516503955966, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 8361761.0, "repeat_count": 0.0, "routers_loss": 0.0021356395445764065, "skip_count": 1.0, "step": 5184, "text_loss": 0.40855672955513 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.000560209251278885, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 8364376.0, "repeat_count": 0.0, "routers_loss": 0.0016185789136216044, "skip_count": 0.0, "step": 5186, "text_loss": 0.6265131831169128 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0005599019755295053, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8367769.0, "repeat_count": 0.0, "routers_loss": 0.0031490204855799675, "skip_count": 2.0, "step": 5188, "text_loss": 0.4716353118419647 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.36630466686234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03076171875, "learning_rate": 0.0005595946768255756, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 8370705.0, "repeat_count": 1.0, "routers_loss": 0.003500689286738634, "skip_count": 0.0, "step": 5190, "text_loss": 0.5467679500579834 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.375697094217788, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0005592873552848532, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 8374217.0, "repeat_count": 2.0, "routers_loss": 0.010764475911855698, "skip_count": 3.0, "step": 5192, "text_loss": 0.4345340132713318 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 24.38508952157323, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005589800110251045, "loss": 0.0087, "macro_f1": 1.0, "num_tokens": 8378182.0, "repeat_count": 2.0, "routers_loss": 0.0010365343187004328, "skip_count": 1.0, "step": 5194, "text_loss": 0.46722909808158875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.394481948928675, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028076171875, "learning_rate": 0.0005586726441641044, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8381227.0, "repeat_count": 0.0, "routers_loss": 0.006349093746393919, "skip_count": 2.0, "step": 5196, "text_loss": 0.35410359501838684 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.403874376284122, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0005583652548196362, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 8384886.0, "repeat_count": 0.0, "routers_loss": 0.00038166221929714084, "skip_count": 0.0, "step": 5198, "text_loss": 0.5950250625610352 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.413266803639566, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0005580578431094924, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 8388939.0, "repeat_count": 0.0, "routers_loss": 0.0023578559048473835, "skip_count": 2.0, "step": 5200, "text_loss": 0.6553771495819092 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0390625, "learning_rate": 0.0005577504091514735, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 8391629.0, "repeat_count": 0.0, "routers_loss": 0.0010771085508167744, "skip_count": 0.0, "step": 5202, "text_loss": 0.4441985785961151 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.000557442953063389, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8394440.0, "repeat_count": 0.0, "routers_loss": 0.005844325292855501, "skip_count": 3.0, "step": 5204, "text_loss": 0.5807011723518372 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0005571354749630564, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8397731.0, "repeat_count": 0.0, "routers_loss": 0.006837233901023865, "skip_count": 1.0, "step": 5206, "text_loss": 0.27780941128730774 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.000556827974968302, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 8400859.0, "repeat_count": 0.0, "routers_loss": 0.007656649220734835, "skip_count": 3.0, "step": 5208, "text_loss": 0.4746324121952057 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0005565204531969606, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8404164.0, "repeat_count": 0.0, "routers_loss": 0.0028129038400948048, "skip_count": 1.0, "step": 5210, "text_loss": 0.8513513803482056 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036376953125, "learning_rate": 0.0005562129097668746, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8407196.0, "repeat_count": 0.0, "routers_loss": 0.00492360582575202, "skip_count": 1.0, "step": 5212, "text_loss": 0.12255420535802841 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.479013795127678, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0005559053447958958, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8410633.0, "repeat_count": 0.0, "routers_loss": 0.0020713545382022858, "skip_count": 0.0, "step": 5214, "text_loss": 0.6878522634506226 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.488406222483125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02880859375, "learning_rate": 0.0005555977584018833, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8413414.0, "repeat_count": 0.0, "routers_loss": 0.0007216963567771018, "skip_count": 0.0, "step": 5216, "text_loss": 0.845878541469574 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.057861328125, "learning_rate": 0.0005552901507027048, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 8416817.0, "repeat_count": 0.0, "routers_loss": 0.002400130731984973, "skip_count": 1.0, "step": 5218, "text_loss": 0.16753672063350677 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019775390625, "learning_rate": 0.0005549825218162365, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 8419617.0, "repeat_count": 0.0, "routers_loss": 0.004563181661069393, "skip_count": 0.0, "step": 5220, "text_loss": 0.26107168197631836 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.516583504549455, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.000554674871860362, "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 8422686.0, "repeat_count": 1.0, "routers_loss": 0.006413881666958332, "skip_count": 1.0, "step": 5222, "text_loss": 0.6333847045898438 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005543672009529734, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 8425571.0, "repeat_count": 0.0, "routers_loss": 0.0057656955905258656, "skip_count": 3.0, "step": 5224, "text_loss": 0.4552212357521057 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 24.535368359260346, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 0.0005540595092119709, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 8429038.0, "repeat_count": 2.0, "routers_loss": 0.011755156330764294, "skip_count": 2.0, "step": 5226, "text_loss": 0.16597330570220947 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0005537517967552626, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8432117.0, "repeat_count": 0.0, "routers_loss": 0.0007519085193052888, "skip_count": 0.0, "step": 5228, "text_loss": 0.6283590197563171 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.554153213971237, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.064453125, "learning_rate": 0.000553444063700764, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 8435176.0, "repeat_count": 0.0, "routers_loss": 0.003066456411033869, "skip_count": 0.0, "step": 5230, "text_loss": 0.2360922247171402 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0198974609375, "learning_rate": 0.0005531363101663998, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 8438515.0, "repeat_count": 0.0, "routers_loss": 0.002865589689463377, "skip_count": 0.0, "step": 5232, "text_loss": 0.8075396418571472 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.040283203125, "learning_rate": 0.0005528285362701011, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 8441731.0, "repeat_count": 0.0, "routers_loss": 0.0012521179160103202, "skip_count": 0.0, "step": 5234, "text_loss": 0.584335446357727 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.58233049603757, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.034423828125, "learning_rate": 0.0005525207421298077, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 8444535.0, "repeat_count": 0.0, "routers_loss": 0.005398475099354982, "skip_count": 3.0, "step": 5236, "text_loss": 0.22711622714996338 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0966796875, "learning_rate": 0.0005522129278634669, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 8448337.0, "repeat_count": 0.0, "routers_loss": 0.002957914723083377, "skip_count": 1.0, "step": 5238, "text_loss": 0.3157515823841095 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 24.601115350748458, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.019287109375, "learning_rate": 0.0005519050935890335, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 8451530.0, "repeat_count": 0.0, "routers_loss": 0.007757039275020361, "skip_count": 3.0, "step": 5240, "text_loss": 0.2815830111503601 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.610507778103905, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.0005515972394244704, "loss": 0.0063, "macro_f1": 0.6603773832321167, "num_tokens": 8454171.0, "repeat_count": 1.0, "routers_loss": 0.021602008491754532, "skip_count": 1.0, "step": 5242, "text_loss": 0.6024490594863892 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.61990020545935, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033935546875, "learning_rate": 0.0005512893654877478, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 8457544.0, "repeat_count": 0.0, "routers_loss": 0.006062488537281752, "skip_count": 0.0, "step": 5244, "text_loss": 0.550110936164856 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.629292632814792, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0005509814718968435, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 8460135.0, "repeat_count": 0.0, "routers_loss": 0.002793943975120783, "skip_count": 0.0, "step": 5246, "text_loss": 0.4361286163330078 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.63868506017024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.0005506735587697433, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8463516.0, "repeat_count": 0.0, "routers_loss": 0.0016669550677761436, "skip_count": 0.0, "step": 5248, "text_loss": 0.4642958641052246 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.648077487525683, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0005503656262244395, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 8466406.0, "repeat_count": 0.0, "routers_loss": 0.0006051387754268944, "skip_count": 0.0, "step": 5250, "text_loss": 0.3445641100406647 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 24.657469914881126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02783203125, "learning_rate": 0.0005500576743789329, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 8468838.0, "repeat_count": 2.0, "routers_loss": 0.00654293829575181, "skip_count": 1.0, "step": 5252, "text_loss": 0.2842808663845062 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.666862342236573, "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0005497497033512309, "loss": 0.0077, "macro_f1": 0.8817967176437378, "num_tokens": 8471815.0, "repeat_count": 2.0, "routers_loss": 0.03845973685383797, "skip_count": 3.0, "step": 5254, "text_loss": 0.2597215175628662 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 26.0, "epoch": 24.676254769592017, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0274658203125, "learning_rate": 0.0005494417132593487, "loss": 0.0047, "macro_f1": 0.9452888369560242, "num_tokens": 8475202.0, "repeat_count": 1.0, "routers_loss": 0.02252381667494774, "skip_count": 4.0, "step": 5256, "text_loss": 0.32269927859306335 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.68564719694746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.055419921875, "learning_rate": 0.0005491337042213088, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 8478650.0, "repeat_count": 0.0, "routers_loss": 0.01232751365751028, "skip_count": 2.0, "step": 5258, "text_loss": 0.6523372530937195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0005488256763551408, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8481724.0, "repeat_count": 0.0, "routers_loss": 0.0028322834987193346, "skip_count": 0.0, "step": 5260, "text_loss": 0.4212580621242523 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0177001953125, "learning_rate": 0.0005485176297788814, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 8485833.0, "repeat_count": 0.0, "routers_loss": 0.002623105887323618, "skip_count": 2.0, "step": 5262, "text_loss": 0.16906329989433289 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.713824479013795, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0005482095646105748, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 8489089.0, "repeat_count": 1.0, "routers_loss": 0.0007179114618338645, "skip_count": 0.0, "step": 5264, "text_loss": 0.4523872137069702 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0005479014809682721, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 8492905.0, "repeat_count": 0.0, "routers_loss": 0.005234059412032366, "skip_count": 0.0, "step": 5266, "text_loss": 0.207139790058136 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.732609333724685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.0005475933789700314, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 8495480.0, "repeat_count": 0.0, "routers_loss": 0.0023258263245224953, "skip_count": 0.0, "step": 5268, "text_loss": 0.18060965836048126 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005472852587339183, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8499070.0, "repeat_count": 0.0, "routers_loss": 0.0013497259933501482, "skip_count": 0.0, "step": 5270, "text_loss": 0.7460769414901733 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.751394188435572, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056640625, "learning_rate": 0.0005469771203780048, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 8502886.0, "repeat_count": 0.0, "routers_loss": 0.0003589815751183778, "skip_count": 0.0, "step": 5272, "text_loss": 0.48119160532951355 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044677734375, "learning_rate": 0.0005466689640203701, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8506646.0, "repeat_count": 0.0, "routers_loss": 0.006619705818593502, "skip_count": 1.0, "step": 5274, "text_loss": 0.15656520426273346 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0005463607897791005, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 8509450.0, "repeat_count": 0.0, "routers_loss": 0.002992175053805113, "skip_count": 1.0, "step": 5276, "text_loss": 0.486930251121521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0005460525977722886, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 8512851.0, "repeat_count": 0.0, "routers_loss": 0.0027784097474068403, "skip_count": 0.0, "step": 5278, "text_loss": 0.19654682278633118 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.788963897857354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0005457443881180345, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8516858.0, "repeat_count": 0.0, "routers_loss": 0.0017648129723966122, "skip_count": 0.0, "step": 5280, "text_loss": 0.580982506275177 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.798356325212797, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0005454361609344444, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 8519912.0, "repeat_count": 2.0, "routers_loss": 0.010817649774253368, "skip_count": 3.0, "step": 5282, "text_loss": 0.2644204795360565 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.000545127916339632, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8522396.0, "repeat_count": 0.0, "routers_loss": 0.001453282660804689, "skip_count": 0.0, "step": 5284, "text_loss": 0.5014839172363281 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.817141179923688, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0005448196544517168, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8525326.0, "repeat_count": 0.0, "routers_loss": 0.006645771209150553, "skip_count": 2.0, "step": 5286, "text_loss": 0.2983154058456421 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.82653360727913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.043212890625, "learning_rate": 0.0005445113753888254, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 8528611.0, "repeat_count": 0.0, "routers_loss": 0.0005447337171062827, "skip_count": 0.0, "step": 5288, "text_loss": 0.43598243594169617 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0257568359375, "learning_rate": 0.000544203079269091, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 8531571.0, "repeat_count": 0.0, "routers_loss": 0.0026976624503731728, "skip_count": 0.0, "step": 5290, "text_loss": 0.6454944610595703 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.0005438947662106533, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 8534565.0, "repeat_count": 0.0, "routers_loss": 0.002217630622908473, "skip_count": 0.0, "step": 5292, "text_loss": 0.742935836315155 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 24.854710889345466, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.029052734375, "learning_rate": 0.0005435864363316584, "loss": 0.0073, "macro_f1": 0.8820862174034119, "num_tokens": 8537581.0, "repeat_count": 2.0, "routers_loss": 0.030740609392523766, "skip_count": 2.0, "step": 5294, "text_loss": 0.48913639783859253 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0005432780897502588, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 8541271.0, "repeat_count": 0.0, "routers_loss": 0.005306888837367296, "skip_count": 1.0, "step": 5296, "text_loss": 0.5820846557617188 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 24.873495744056356, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.0005429697265846137, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 8545052.0, "repeat_count": 1.0, "routers_loss": 0.002255369909107685, "skip_count": 0.0, "step": 5298, "text_loss": 0.565483808517456 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0005426613469528881, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 8548605.0, "repeat_count": 0.0, "routers_loss": 0.0010787079809233546, "skip_count": 0.0, "step": 5300, "text_loss": 0.40154510736465454 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036376953125, "learning_rate": 0.000542352950973254, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 8552581.0, "repeat_count": 0.0, "routers_loss": 0.0017972089117392898, "skip_count": 0.0, "step": 5302, "text_loss": 0.5430748462677002 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04736328125, "learning_rate": 0.0005420445387638891, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 8556360.0, "repeat_count": 0.0, "routers_loss": 0.0016180560924112797, "skip_count": 2.0, "step": 5304, "text_loss": 0.544040322303772 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.911065453478134, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0242919921875, "learning_rate": 0.0005417361104429777, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 8559264.0, "repeat_count": 1.0, "routers_loss": 0.012688961811363697, "skip_count": 2.0, "step": 5306, "text_loss": 0.2018517404794693 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0005414276661287101, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 8562169.0, "repeat_count": 0.0, "routers_loss": 0.0012141643092036247, "skip_count": 0.0, "step": 5308, "text_loss": 0.5685747265815735 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.059326171875, "learning_rate": 0.0005411192059392826, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 8565231.0, "repeat_count": 0.0, "routers_loss": 0.0015626107342541218, "skip_count": 0.0, "step": 5310, "text_loss": 0.8073471784591675 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03857421875, "learning_rate": 0.0005408107299928979, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8568122.0, "repeat_count": 0.0, "routers_loss": 0.004773529712110758, "skip_count": 0.0, "step": 5312, "text_loss": 0.22583355009555817 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.94863516289991, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0260009765625, "learning_rate": 0.0005405022384077644, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 8571056.0, "repeat_count": 0.0, "routers_loss": 0.0025621228851377964, "skip_count": 1.0, "step": 5314, "text_loss": 0.25274428725242615 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 24.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.0005401937313020967, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 8574300.0, "repeat_count": 0.0, "routers_loss": 0.009726752527058125, "skip_count": 2.0, "step": 5316, "text_loss": 0.3283393979072571 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 24.967420017610802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0005398852087941155, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 8577424.0, "repeat_count": 0.0, "routers_loss": 0.012483839876949787, "skip_count": 4.0, "step": 5318, "text_loss": 0.1876130849123001 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.976812444966246, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.000539576671002047, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 8580309.0, "repeat_count": 0.0, "routers_loss": 0.0009830677881836891, "skip_count": 0.0, "step": 5320, "text_loss": 0.6955490708351135 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.046875, "learning_rate": 0.0005392681180441235, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 8583399.0, "repeat_count": 0.0, "routers_loss": 0.0010819481685757637, "skip_count": 0.0, "step": 5322, "text_loss": 0.4708341956138611 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 24.995597299677137, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.000538959550038583, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 8586259.0, "repeat_count": 0.0, "routers_loss": 0.005763369146734476, "skip_count": 0.0, "step": 5324, "text_loss": 0.20463642477989197 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0005386509671036695, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 8589067.0, "repeat_count": 0.0, "routers_loss": 0.0006229027640074492, "skip_count": 0.0, "step": 5326, "text_loss": 0.6819888353347778 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 24.0, "epoch": 25.014088641033165, "f1_execute": 0.9767441749572754, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.03466796875, "learning_rate": 0.0005383423693576325, "loss": 0.0087, "macro_f1": 0.9619450569152832, "num_tokens": 8592837.0, "repeat_count": 1.0, "routers_loss": 0.030066559091210365, "skip_count": 6.0, "step": 5328, "text_loss": 0.24606549739837646 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.023481068388612, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.038330078125, "learning_rate": 0.0005380337569187272, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 8596293.0, "repeat_count": 1.0, "routers_loss": 0.007445990107953548, "skip_count": 0.0, "step": 5330, "text_loss": 0.16730253398418427 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 25.032873495744056, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0230712890625, "learning_rate": 0.0005377251299052145, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 8599360.0, "repeat_count": 1.0, "routers_loss": 0.004563331138342619, "skip_count": 1.0, "step": 5332, "text_loss": 0.6856988668441772 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0005374164884353608, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8602376.0, "repeat_count": 0.0, "routers_loss": 0.0015491938684135675, "skip_count": 0.0, "step": 5334, "text_loss": 1.3248854875564575 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005371078326274382, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 8605400.0, "repeat_count": 0.0, "routers_loss": 0.0016098044579848647, "skip_count": 0.0, "step": 5336, "text_loss": 0.747150182723999 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 25.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0005367991625997243, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 8608100.0, "repeat_count": 0.0, "routers_loss": 0.0034471298567950726, "skip_count": 3.0, "step": 5338, "text_loss": 0.6443291902542114 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.070443205165834, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.0005364904784705015, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 8611768.0, "repeat_count": 0.0, "routers_loss": 0.007947597652673721, "skip_count": 1.0, "step": 5340, "text_loss": 0.7768037915229797 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 25.07983563252128, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.049072265625, "learning_rate": 0.0005361817803580588, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 8614424.0, "repeat_count": 2.0, "routers_loss": 0.009964234195649624, "skip_count": 2.0, "step": 5342, "text_loss": 0.22826914489269257 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0005358730683806896, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 8617826.0, "repeat_count": 0.0, "routers_loss": 0.0014116480015218258, "skip_count": 0.0, "step": 5344, "text_loss": 0.49022090435028076 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 25.098620487232168, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03564453125, "learning_rate": 0.0005355643426566929, "loss": 0.0061, "macro_f1": 0.8823530077934265, "num_tokens": 8621220.0, "repeat_count": 1.0, "routers_loss": 0.013940622098743916, "skip_count": 2.0, "step": 5346, "text_loss": 0.26819515228271484 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.108012914587615, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.000535255603304373, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 8623957.0, "repeat_count": 0.0, "routers_loss": 0.0032230091746896505, "skip_count": 2.0, "step": 5348, "text_loss": 0.46905452013015747 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005349468504420395, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 8626760.0, "repeat_count": 0.0, "routers_loss": 0.002631337149068713, "skip_count": 1.0, "step": 5350, "text_loss": 0.5312309861183167 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.0005346380841880068, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 8630207.0, "repeat_count": 0.0, "routers_loss": 0.004526057746261358, "skip_count": 2.0, "step": 5352, "text_loss": 0.5810666084289551 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0260009765625, "learning_rate": 0.0005343293046605949, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8633241.0, "repeat_count": 0.0, "routers_loss": 0.0023941127583384514, "skip_count": 0.0, "step": 5354, "text_loss": 0.18468725681304932 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.145582624009393, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0005340205119781288, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 8636215.0, "repeat_count": 1.0, "routers_loss": 0.0017020340310409665, "skip_count": 0.0, "step": 5356, "text_loss": 0.6665788888931274 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005337117062589383, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 8639326.0, "repeat_count": 0.0, "routers_loss": 0.004964717663824558, "skip_count": 2.0, "step": 5358, "text_loss": 0.19770404696464539 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023681640625, "learning_rate": 0.0005334028876213585, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8642157.0, "repeat_count": 0.0, "routers_loss": 0.006587155628949404, "skip_count": 0.0, "step": 5360, "text_loss": 0.2295130044221878 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0005330940561837291, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8645355.0, "repeat_count": 0.0, "routers_loss": 0.0006586945964954793, "skip_count": 0.0, "step": 5362, "text_loss": 0.2701159417629242 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.18315233343117, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0005327852120643947, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8648911.0, "repeat_count": 1.0, "routers_loss": 0.0014281768817454576, "skip_count": 0.0, "step": 5364, "text_loss": 0.8957229852676392 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02685546875, "learning_rate": 0.0005324763553817053, "loss": 0.0027, "macro_f1": 0.3333333432674408, "num_tokens": 8652037.0, "repeat_count": 0.0, "routers_loss": 0.0005899337120354176, "skip_count": 0.0, "step": 5366, "text_loss": 0.38642236590385437 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 25.20193718814206, "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0005321674862540154, "loss": 0.0058, "macro_f1": 0.9265305995941162, "num_tokens": 8655381.0, "repeat_count": 3.0, "routers_loss": 0.024511313065886497, "skip_count": 1.0, "step": 5368, "text_loss": 0.6439879536628723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0284423828125, "learning_rate": 0.000531858604799684, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 8658476.0, "repeat_count": 0.0, "routers_loss": 0.0012558114249259233, "skip_count": 0.0, "step": 5370, "text_loss": 0.3227672874927521 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06005859375, "learning_rate": 0.0005315497111370752, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 8661982.0, "repeat_count": 0.0, "routers_loss": 0.0013541636290028691, "skip_count": 0.0, "step": 5372, "text_loss": 0.6375321745872498 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 25.230114470208395, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.051513671875, "learning_rate": 0.0005312408053845575, "loss": 0.0052, "macro_f1": 0.5492662787437439, "num_tokens": 8665071.0, "repeat_count": 0.0, "routers_loss": 0.010432626120746136, "skip_count": 2.0, "step": 5374, "text_loss": 0.536924421787262 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.023681640625, "learning_rate": 0.0005309318876605042, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 8668411.0, "repeat_count": 0.0, "routers_loss": 0.004450209904462099, "skip_count": 1.0, "step": 5376, "text_loss": 0.2643466889858246 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.248899324919282, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005306229580832933, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 8672088.0, "repeat_count": 1.0, "routers_loss": 0.011189920827746391, "skip_count": 3.0, "step": 5378, "text_loss": 0.8259533047676086 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0289306640625, "learning_rate": 0.000530314016771307, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8675206.0, "repeat_count": 0.0, "routers_loss": 0.0020095291547477245, "skip_count": 0.0, "step": 5380, "text_loss": 0.31364113092422485 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.267684179630173, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0005300050638429324, "loss": 0.0078, "macro_f1": 0.3272727429866791, "num_tokens": 8678289.0, "repeat_count": 0.0, "routers_loss": 0.010738557204604149, "skip_count": 1.0, "step": 5382, "text_loss": 0.19013966619968414 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0294189453125, "learning_rate": 0.0005296960994165607, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 8681555.0, "repeat_count": 0.0, "routers_loss": 0.0018534278497099876, "skip_count": 1.0, "step": 5384, "text_loss": 0.762248694896698 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.286469034341064, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0005293871236105877, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 8684413.0, "repeat_count": 0.0, "routers_loss": 0.009143726900219917, "skip_count": 2.0, "step": 5386, "text_loss": 0.19994212687015533 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 25.295861461696507, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0005290781365434134, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 8687450.0, "repeat_count": 2.0, "routers_loss": 0.002034468576312065, "skip_count": 0.0, "step": 5388, "text_loss": 0.5519160628318787 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.30525388905195, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03955078125, "learning_rate": 0.0005287691383334425, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 8690651.0, "repeat_count": 1.0, "routers_loss": 0.006834167055785656, "skip_count": 0.0, "step": 5390, "text_loss": 0.5439304709434509 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.314646316407398, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.060791015625, "learning_rate": 0.0005284601290990832, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8693929.0, "repeat_count": 1.0, "routers_loss": 0.0022327799815684557, "skip_count": 0.0, "step": 5392, "text_loss": 0.24108269810676575 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.32403874376284, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.027099609375, "learning_rate": 0.0005281511089587491, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 8696727.0, "repeat_count": 0.0, "routers_loss": 0.002669565612450242, "skip_count": 0.0, "step": 5394, "text_loss": 0.8659077286720276 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0005278420780308568, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8700934.0, "repeat_count": 0.0, "routers_loss": 0.007252473384141922, "skip_count": 0.0, "step": 5396, "text_loss": 0.5592793226242065 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.045166015625, "learning_rate": 0.0005275330364338276, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 8704449.0, "repeat_count": 0.0, "routers_loss": 0.001793015981093049, "skip_count": 0.0, "step": 5398, "text_loss": 0.5211784243583679 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 25.352216025829176, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 0.0005272239842860868, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 8707384.0, "repeat_count": 5.0, "routers_loss": 0.00963665172457695, "skip_count": 4.0, "step": 5400, "text_loss": 0.6092788577079773 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 25.36160845318462, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.03515625, "learning_rate": 0.0005269149217060642, "loss": 0.0059, "macro_f1": 0.5492662787437439, "num_tokens": 8710453.0, "repeat_count": 0.0, "routers_loss": 0.01758105307817459, "skip_count": 2.0, "step": 5402, "text_loss": 0.3423936069011688 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.371000880540066, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0005266058488121926, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 8713514.0, "repeat_count": 0.0, "routers_loss": 0.0025636721402406693, "skip_count": 1.0, "step": 5404, "text_loss": 0.484171986579895 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.38039330789551, "f1_execute": 0.9767441749572754, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0005262967657229095, "loss": 0.0064, "macro_f1": 0.9255813956260681, "num_tokens": 8717051.0, "repeat_count": 3.0, "routers_loss": 0.022406045347452164, "skip_count": 4.0, "step": 5406, "text_loss": 0.23368191719055176 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 0.0005259876725566563, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8719987.0, "repeat_count": 0.0, "routers_loss": 0.004114408977329731, "skip_count": 2.0, "step": 5408, "text_loss": 0.20237496495246887 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.399178162606397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.000525678569431878, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 8723258.0, "repeat_count": 0.0, "routers_loss": 0.006741158664226532, "skip_count": 2.0, "step": 5410, "text_loss": 0.7969435453414917 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021240234375, "learning_rate": 0.0005253694564670233, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 8726294.0, "repeat_count": 0.0, "routers_loss": 0.0034468702506273985, "skip_count": 0.0, "step": 5412, "text_loss": 0.5533816814422607 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.417963017317287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.000525060333780545, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 8729603.0, "repeat_count": 0.0, "routers_loss": 0.01086533535271883, "skip_count": 2.0, "step": 5414, "text_loss": 0.31856611371040344 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 25.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0005247512014908998, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 8733423.0, "repeat_count": 0.0, "routers_loss": 0.00512756546959281, "skip_count": 6.0, "step": 5416, "text_loss": 0.6710903644561768 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06103515625, "learning_rate": 0.0005244420597165472, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 8736457.0, "repeat_count": 0.0, "routers_loss": 0.0026201079599559307, "skip_count": 0.0, "step": 5418, "text_loss": 0.6469964981079102 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04248046875, "learning_rate": 0.0005241329085759514, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 8739617.0, "repeat_count": 0.0, "routers_loss": 0.004130818881094456, "skip_count": 0.0, "step": 5420, "text_loss": 0.4868837296962738 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.455532726739065, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0361328125, "learning_rate": 0.0005238237481875795, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 8742653.0, "repeat_count": 0.0, "routers_loss": 0.003171122632920742, "skip_count": 0.0, "step": 5422, "text_loss": 0.12026242166757584 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0005235145786699021, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 8745835.0, "repeat_count": 0.0, "routers_loss": 0.0008553664083592594, "skip_count": 0.0, "step": 5424, "text_loss": 0.601640522480011 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.028076171875, "learning_rate": 0.0005232054001413941, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 8749006.0, "repeat_count": 0.0, "routers_loss": 0.0006958908052183688, "skip_count": 0.0, "step": 5426, "text_loss": 0.7083519101142883 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0673828125, "learning_rate": 0.0005228962127205329, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 8752493.0, "repeat_count": 0.0, "routers_loss": 0.0012221037177368999, "skip_count": 1.0, "step": 5428, "text_loss": 0.3949109613895416 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.493102436160846, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.039794921875, "learning_rate": 0.0005225870165257997, "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 8755294.0, "repeat_count": 1.0, "routers_loss": 0.003924673888832331, "skip_count": 2.0, "step": 5430, "text_loss": 0.7487186789512634 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005222778116756793, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 8758043.0, "repeat_count": 0.0, "routers_loss": 0.002388258930295706, "skip_count": 0.0, "step": 5432, "text_loss": 0.4092858135700226 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.511887290871734, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0005219685982886594, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 8760618.0, "repeat_count": 1.0, "routers_loss": 0.0045886957086622715, "skip_count": 0.0, "step": 5434, "text_loss": 0.5889580249786377 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.52127971822718, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.052978515625, "learning_rate": 0.0005216593764832311, "loss": 0.0074, "macro_f1": 1.0, "num_tokens": 8764269.0, "repeat_count": 1.0, "routers_loss": 0.00704155582934618, "skip_count": 2.0, "step": 5436, "text_loss": 0.2634117007255554 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.530672145582624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.040283203125, "learning_rate": 0.0005213501463778889, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 8767142.0, "repeat_count": 0.0, "routers_loss": 0.00368728069588542, "skip_count": 2.0, "step": 5438, "text_loss": 0.3512301445007324 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05322265625, "learning_rate": 0.0005210409080911304, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 8770239.0, "repeat_count": 0.0, "routers_loss": 0.0012925115879625082, "skip_count": 0.0, "step": 5440, "text_loss": 0.9330073595046997 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0005207316617414561, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8772927.0, "repeat_count": 0.0, "routers_loss": 0.005604506935924292, "skip_count": 0.0, "step": 5442, "text_loss": 0.23477613925933838 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.55884942764896, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0196533203125, "learning_rate": 0.0005204224074473701, "loss": 0.0049, "macro_f1": 0.6601307392120361, "num_tokens": 8776451.0, "repeat_count": 1.0, "routers_loss": 0.010945434682071209, "skip_count": 2.0, "step": 5444, "text_loss": 0.6184295415878296 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0005201131453273789, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 8779481.0, "repeat_count": 0.0, "routers_loss": 0.0024414353538304567, "skip_count": 0.0, "step": 5446, "text_loss": 0.16186967492103577 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.57763428235985, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.0005198038754999926, "loss": 0.0052, "macro_f1": 0.3272727429866791, "num_tokens": 8782425.0, "repeat_count": 1.0, "routers_loss": 0.013872416689991951, "skip_count": 0.0, "step": 5448, "text_loss": 0.42294546961784363 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03759765625, "learning_rate": 0.0005194945980837237, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 8785466.0, "repeat_count": 0.0, "routers_loss": 0.0006147907115519047, "skip_count": 0.0, "step": 5450, "text_loss": 0.6285432577133179 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0005191853131970881, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 8788461.0, "repeat_count": 0.0, "routers_loss": 0.0010585964191704988, "skip_count": 0.0, "step": 5452, "text_loss": 0.6032317876815796 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.60581156442618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.042724609375, "learning_rate": 0.0005188760209586044, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8791572.0, "repeat_count": 0.0, "routers_loss": 0.005267909727990627, "skip_count": 1.0, "step": 5454, "text_loss": 0.3015609681606293 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03369140625, "learning_rate": 0.0005185667214867937, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 8794697.0, "repeat_count": 0.0, "routers_loss": 0.000532392121385783, "skip_count": 0.0, "step": 5456, "text_loss": 0.9596265554428101 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.038818359375, "learning_rate": 0.0005182574149001805, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 8797880.0, "repeat_count": 0.0, "routers_loss": 0.0007176774088293314, "skip_count": 0.0, "step": 5458, "text_loss": 0.5599364638328552 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.0005179481013172912, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8801995.0, "repeat_count": 0.0, "routers_loss": 0.0022756673861294985, "skip_count": 0.0, "step": 5460, "text_loss": 0.47327280044555664 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005176387808566558, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 8805138.0, "repeat_count": 0.0, "routers_loss": 0.0025084633380174637, "skip_count": 0.0, "step": 5462, "text_loss": 0.26674970984458923 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.652773701203404, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05078125, "learning_rate": 0.0005173294536368061, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 8808102.0, "repeat_count": 0.0, "routers_loss": 0.0008814680040813982, "skip_count": 0.0, "step": 5464, "text_loss": 0.5981299877166748 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.662166128558848, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.036865234375, "learning_rate": 0.0005170201197762773, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8811431.0, "repeat_count": 0.0, "routers_loss": 0.0005443177651613951, "skip_count": 0.0, "step": 5466, "text_loss": 1.037438988685608 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0005167107793936065, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 8814256.0, "repeat_count": 0.0, "routers_loss": 0.000494555220939219, "skip_count": 0.0, "step": 5468, "text_loss": 0.5005733966827393 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0005164014326073333, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 8817024.0, "repeat_count": 0.0, "routers_loss": 0.004793747793883085, "skip_count": 2.0, "step": 5470, "text_loss": 0.6999614834785461 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.690343410625182, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005160920795360002, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 8819892.0, "repeat_count": 0.0, "routers_loss": 0.0020966180600225925, "skip_count": 0.0, "step": 5472, "text_loss": 0.5536707043647766 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0279541015625, "learning_rate": 0.0005157827202981521, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8822928.0, "repeat_count": 0.0, "routers_loss": 0.0020367507822811604, "skip_count": 0.0, "step": 5474, "text_loss": 0.43655988574028015 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0005154733550123356, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8825842.0, "repeat_count": 0.0, "routers_loss": 0.0020070383325219154, "skip_count": 0.0, "step": 5476, "text_loss": 0.48149657249450684 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.0005151639837971004, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 8829534.0, "repeat_count": 0.0, "routers_loss": 0.0016327418852597475, "skip_count": 0.0, "step": 5478, "text_loss": 0.6693689227104187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.000514854606770998, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 8833177.0, "repeat_count": 0.0, "routers_loss": 0.0012691980227828026, "skip_count": 0.0, "step": 5480, "text_loss": 0.44926801323890686 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.737305547402407, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.0005145452240525822, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 8836933.0, "repeat_count": 1.0, "routers_loss": 0.0007724820752628148, "skip_count": 0.0, "step": 5482, "text_loss": 0.5759884119033813 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 25.74669797475785, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0005142358357604092, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 8840093.0, "repeat_count": 1.0, "routers_loss": 0.008331702090799809, "skip_count": 7.0, "step": 5484, "text_loss": 0.47393685579299927 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0224609375, "learning_rate": 0.0005139264420130368, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 8843918.0, "repeat_count": 0.0, "routers_loss": 0.003124477108940482, "skip_count": 2.0, "step": 5486, "text_loss": 0.5298711061477661 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.76548282946874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08447265625, "learning_rate": 0.0005136170429290259, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 8846558.0, "repeat_count": 0.0, "routers_loss": 0.0034127775579690933, "skip_count": 2.0, "step": 5488, "text_loss": 0.43582668900489807 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.774875256824185, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.0005133076386269383, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 8849724.0, "repeat_count": 1.0, "routers_loss": 0.0018056259723380208, "skip_count": 0.0, "step": 5490, "text_loss": 0.8116800785064697 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 22.0, "epoch": 25.784267684179632, "f1_execute": 0.9767441749572754, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03271484375, "learning_rate": 0.0005129982292253384, "loss": 0.0063, "macro_f1": 0.6589147448539734, "num_tokens": 8852447.0, "repeat_count": 1.0, "routers_loss": 0.021452350541949272, "skip_count": 6.0, "step": 5492, "text_loss": 0.31878748536109924 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.793660111535075, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0005126888148427927, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 8855886.0, "repeat_count": 0.0, "routers_loss": 0.0026911941822618246, "skip_count": 0.0, "step": 5494, "text_loss": 0.4021807909011841 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 25.80305253889052, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.025634765625, "learning_rate": 0.0005123793955978693, "loss": 0.007, "macro_f1": 0.5492662787437439, "num_tokens": 8859378.0, "repeat_count": 0.0, "routers_loss": 0.019764510914683342, "skip_count": 2.0, "step": 5496, "text_loss": 0.21608132123947144 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.812444966245963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0005120699716091379, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 8862310.0, "repeat_count": 0.0, "routers_loss": 0.0008988190093077719, "skip_count": 0.0, "step": 5498, "text_loss": 0.34666743874549866 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.82183739360141, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0235595703125, "learning_rate": 0.0005117605429951707, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 8865166.0, "repeat_count": 0.0, "routers_loss": 0.011137975379824638, "skip_count": 2.0, "step": 5500, "text_loss": 0.25385144352912903 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 25.831229820956853, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.031494140625, "learning_rate": 0.0005114511098745412, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 8869923.0, "repeat_count": 1.0, "routers_loss": 0.006476947572082281, "skip_count": 4.0, "step": 5502, "text_loss": 0.4503856301307678 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.840622248312297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.000511141672365825, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8872451.0, "repeat_count": 0.0, "routers_loss": 0.0022727579344063997, "skip_count": 0.0, "step": 5504, "text_loss": 0.7522464990615845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.0005108322305875987, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8875968.0, "repeat_count": 0.0, "routers_loss": 0.0020014268811792135, "skip_count": 0.0, "step": 5506, "text_loss": 0.30184176564216614 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04833984375, "learning_rate": 0.0005105227846584414, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8879705.0, "repeat_count": 0.0, "routers_loss": 0.001179999322630465, "skip_count": 0.0, "step": 5508, "text_loss": 0.6187804937362671 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.86879953037863, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0005102133346969329, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 8883535.0, "repeat_count": 1.0, "routers_loss": 0.002946492750197649, "skip_count": 0.0, "step": 5510, "text_loss": 0.5961501002311707 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.878191957734078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0223388671875, "learning_rate": 0.0005099038808216555, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 8886683.0, "repeat_count": 1.0, "routers_loss": 0.004532935563474894, "skip_count": 3.0, "step": 5512, "text_loss": 0.38462957739830017 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0005095944231511922, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 8891049.0, "repeat_count": 0.0, "routers_loss": 0.00917842984199524, "skip_count": 2.0, "step": 5514, "text_loss": 0.27541956305503845 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0005092849618041279, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 8893604.0, "repeat_count": 0.0, "routers_loss": 0.0008756510796956718, "skip_count": 0.0, "step": 5516, "text_loss": 0.681315541267395 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.906369239800412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.042236328125, "learning_rate": 0.0005089754968990487, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 8898072.0, "repeat_count": 0.0, "routers_loss": 0.0008704439387656748, "skip_count": 1.0, "step": 5518, "text_loss": 0.5060005187988281 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.915761667155856, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0005086660285545422, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 8901539.0, "repeat_count": 0.0, "routers_loss": 0.004750201944261789, "skip_count": 1.0, "step": 5520, "text_loss": 0.6008047461509705 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 25.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.039794921875, "learning_rate": 0.000508356556889197, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 8904525.0, "repeat_count": 0.0, "routers_loss": 0.0026552649214863777, "skip_count": 0.0, "step": 5522, "text_loss": 0.4539012908935547 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 25.934546521866746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0005080470820216037, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 8907624.0, "repeat_count": 0.0, "routers_loss": 0.002621029270812869, "skip_count": 1.0, "step": 5524, "text_loss": 0.20088370144367218 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 25.94393894922219, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0005077376040703533, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 8910515.0, "repeat_count": 3.0, "routers_loss": 0.0028921898920089006, "skip_count": 0.0, "step": 5526, "text_loss": 0.6575983166694641 }, { "acc_repeat": 1.0, "acc_skip": 0.8888888955116272, "avg_layers": 21.0, "epoch": 25.953331376577633, "f1_execute": 0.9729729890823364, "f1_repeat": 1.0, "f1_skip": 0.9411765336990356, "grad_norm": 0.02734375, "learning_rate": 0.0005074281231540384, "loss": 0.0076, "macro_f1": 0.9713832139968872, "num_tokens": 8914419.0, "repeat_count": 1.0, "routers_loss": 0.024232301861047745, "skip_count": 9.0, "step": 5528, "text_loss": 0.5435594916343689 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 25.96272380393308, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 0.0005071186393912527, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 8917543.0, "repeat_count": 0.0, "routers_loss": 0.003731841454282403, "skip_count": 2.0, "step": 5530, "text_loss": 0.5152071118354797 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.972116231288524, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0005068091529005909, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 8920728.0, "repeat_count": 1.0, "routers_loss": 0.005905418191105127, "skip_count": 0.0, "step": 5532, "text_loss": 0.29741042852401733 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 25.981508658643968, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.044189453125, "learning_rate": 0.000506499663800649, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 8924112.0, "repeat_count": 1.0, "routers_loss": 0.0021933517418801785, "skip_count": 0.0, "step": 5534, "text_loss": 0.45704230666160583 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 25.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.0005061901722100235, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 8927323.0, "repeat_count": 0.0, "routers_loss": 0.009227502159774303, "skip_count": 4.0, "step": 5536, "text_loss": 0.1968434453010559 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.0, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.038330078125, "learning_rate": 0.0005058806782473125, "loss": 0.0053, "macro_f1": 0.6601307392120361, "num_tokens": 8931052.0, "repeat_count": 1.0, "routers_loss": 0.02054760232567787, "skip_count": 2.0, "step": 5538, "text_loss": 0.23851273953914642 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0194091796875, "learning_rate": 0.0005055711820311144, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8934215.0, "repeat_count": 0.0, "routers_loss": 0.0008434011251665652, "skip_count": 0.0, "step": 5540, "text_loss": 0.85942542552948 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 26.01878485471089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026123046875, "learning_rate": 0.0005052616836800288, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 8937173.0, "repeat_count": 0.0, "routers_loss": 0.011105241253972054, "skip_count": 4.0, "step": 5542, "text_loss": 0.2614556849002838 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0302734375, "learning_rate": 0.0005049521833126561, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8940553.0, "repeat_count": 0.0, "routers_loss": 0.0006273435428738594, "skip_count": 0.0, "step": 5544, "text_loss": 0.6430498957633972 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0341796875, "learning_rate": 0.0005046426810475976, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 8943753.0, "repeat_count": 0.0, "routers_loss": 0.0023464353289455175, "skip_count": 1.0, "step": 5546, "text_loss": 0.7015808820724487 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06689453125, "learning_rate": 0.0005043331770034547, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 8947149.0, "repeat_count": 0.0, "routers_loss": 0.0016024730866774917, "skip_count": 1.0, "step": 5548, "text_loss": 0.5875257253646851 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.044677734375, "learning_rate": 0.0005040236712988304, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 8950374.0, "repeat_count": 0.0, "routers_loss": 0.004096277989447117, "skip_count": 0.0, "step": 5550, "text_loss": 0.1712338626384735 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.065746991488112, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.0005037141640523275, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 8953256.0, "repeat_count": 1.0, "routers_loss": 0.00441550649702549, "skip_count": 0.0, "step": 5552, "text_loss": 0.16560404002666473 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.07513941884356, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0005034046553825501, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 8956845.0, "repeat_count": 4.0, "routers_loss": 0.011712636798620224, "skip_count": 6.0, "step": 5554, "text_loss": 0.24278216063976288 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 0.0005030951454081023, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 8961165.0, "repeat_count": 0.0, "routers_loss": 0.00235542468726635, "skip_count": 1.0, "step": 5556, "text_loss": 0.17214511334896088 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.093924273554446, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0005027856342475888, "loss": 0.0037, "macro_f1": 0.3272727429866791, "num_tokens": 8965262.0, "repeat_count": 0.0, "routers_loss": 0.0160827673971653, "skip_count": 1.0, "step": 5558, "text_loss": 0.40229740738868713 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.10331670090989, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.052001953125, "learning_rate": 0.0005024761220196151, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 8968278.0, "repeat_count": 1.0, "routers_loss": 0.004786997567862272, "skip_count": 0.0, "step": 5560, "text_loss": 0.24828575551509857 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.112709128265337, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.049072265625, "learning_rate": 0.0005021666088427868, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 8971443.0, "repeat_count": 1.0, "routers_loss": 0.0015378865646198392, "skip_count": 0.0, "step": 5562, "text_loss": 0.7269657254219055 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.01904296875, "learning_rate": 0.0005018570948357099, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8975312.0, "repeat_count": 0.0, "routers_loss": 0.0015218508196994662, "skip_count": 0.0, "step": 5564, "text_loss": 0.5198811292648315 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.131493982976224, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.0005015475801169908, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 8977951.0, "repeat_count": 0.0, "routers_loss": 0.008865317329764366, "skip_count": 1.0, "step": 5566, "text_loss": 0.1541406810283661 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.14088641033167, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.045654296875, "learning_rate": 0.0005012380648052359, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 8981325.0, "repeat_count": 1.0, "routers_loss": 0.0055318837985396385, "skip_count": 0.0, "step": 5568, "text_loss": 0.510314404964447 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0005009285490190523, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 8984661.0, "repeat_count": 0.0, "routers_loss": 0.0035060355439782143, "skip_count": 0.0, "step": 5570, "text_loss": 0.29421761631965637 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.000500619032877047, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 8987573.0, "repeat_count": 0.0, "routers_loss": 0.0050126477144658566, "skip_count": 2.0, "step": 5572, "text_loss": 0.1984361708164215 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.169063692398005, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0244140625, "learning_rate": 0.0005003095164978271, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 8991136.0, "repeat_count": 0.0, "routers_loss": 0.0019407360814511776, "skip_count": 0.0, "step": 5574, "text_loss": 0.42751404643058777 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.17845611975345, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02197265625, "learning_rate": 0.0005, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 8994198.0, "repeat_count": 0.0, "routers_loss": 0.0029819176997989416, "skip_count": 2.0, "step": 5576, "text_loss": 0.20589640736579895 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.187848547108892, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0234375, "learning_rate": 0.0004996904835021729, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 8997907.0, "repeat_count": 0.0, "routers_loss": 0.000878945691511035, "skip_count": 1.0, "step": 5578, "text_loss": 0.2801406979560852 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.19724097446434, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.000499380967122953, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 9001141.0, "repeat_count": 0.0, "routers_loss": 0.005223734769970179, "skip_count": 1.0, "step": 5580, "text_loss": 0.20542480051517487 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.206633401819783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0004990714509809478, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 9004794.0, "repeat_count": 0.0, "routers_loss": 0.0015868612099438906, "skip_count": 0.0, "step": 5582, "text_loss": 0.32094934582710266 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 26.216025829175226, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.041259765625, "learning_rate": 0.0004987619351947643, "loss": 0.0064, "macro_f1": 0.6122449040412903, "num_tokens": 9009250.0, "repeat_count": 0.0, "routers_loss": 0.031923454254865646, "skip_count": 4.0, "step": 5584, "text_loss": 0.609201967716217 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.225418256530673, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0004984524198830095, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 9013254.0, "repeat_count": 0.0, "routers_loss": 0.0033124545589089394, "skip_count": 0.0, "step": 5586, "text_loss": 0.3698650300502777 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.234810683886117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.0004981429051642903, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 9016598.0, "repeat_count": 0.0, "routers_loss": 0.0017190382350236177, "skip_count": 1.0, "step": 5588, "text_loss": 0.5306026935577393 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.24420311124156, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.0004978333911572132, "loss": 0.0059, "macro_f1": 0.3272727429866791, "num_tokens": 9019558.0, "repeat_count": 0.0, "routers_loss": 0.02051064372062683, "skip_count": 1.0, "step": 5590, "text_loss": 0.23494470119476318 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.253595538597008, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.0004975238779803849, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 9023024.0, "repeat_count": 0.0, "routers_loss": 0.0010489600244909525, "skip_count": 0.0, "step": 5592, "text_loss": 0.579275906085968 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.26298796595245, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025634765625, "learning_rate": 0.0004972143657524112, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 9026161.0, "repeat_count": 0.0, "routers_loss": 0.0012039231369271874, "skip_count": 0.0, "step": 5594, "text_loss": 0.5776295065879822 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.272380393307895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02685546875, "learning_rate": 0.0004969048545918978, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 9028814.0, "repeat_count": 0.0, "routers_loss": 0.0010212450288236141, "skip_count": 1.0, "step": 5596, "text_loss": 0.6816855669021606 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 26.281772820663342, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00049659534461745, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 9032243.0, "repeat_count": 2.0, "routers_loss": 0.0024297661148011684, "skip_count": 0.0, "step": 5598, "text_loss": 0.743188202381134 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.291165248018785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026611328125, "learning_rate": 0.0004962858359476726, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 9035493.0, "repeat_count": 0.0, "routers_loss": 0.002151754219084978, "skip_count": 0.0, "step": 5600, "text_loss": 0.5213983654975891 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.30055767537423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.0004959763287011698, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 9038213.0, "repeat_count": 0.0, "routers_loss": 0.0028108188416808844, "skip_count": 2.0, "step": 5602, "text_loss": 0.5128397345542908 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.309950102729672, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.031982421875, "learning_rate": 0.0004956668229965454, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 9041152.0, "repeat_count": 0.0, "routers_loss": 0.004022551700472832, "skip_count": 2.0, "step": 5604, "text_loss": 0.15361636877059937 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.31934253008512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.032470703125, "learning_rate": 0.0004953573189524026, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 9044503.0, "repeat_count": 0.0, "routers_loss": 0.0010689410846680403, "skip_count": 1.0, "step": 5606, "text_loss": 0.6454885005950928 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.328734957440563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0267333984375, "learning_rate": 0.0004950478166873439, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 9047742.0, "repeat_count": 0.0, "routers_loss": 0.0025760293938219547, "skip_count": 0.0, "step": 5608, "text_loss": 0.7654000520706177 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.338127384796007, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03564453125, "learning_rate": 0.0004947383163199713, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 9050349.0, "repeat_count": 0.0, "routers_loss": 0.0009846165776252747, "skip_count": 0.0, "step": 5610, "text_loss": 0.41533342003822327 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.347519812151454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01953125, "learning_rate": 0.0004944288179688858, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 9053667.0, "repeat_count": 0.0, "routers_loss": 0.0017193946987390518, "skip_count": 1.0, "step": 5612, "text_loss": 1.0172475576400757 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.356912239506897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0004941193217526875, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 9056777.0, "repeat_count": 0.0, "routers_loss": 0.0026750199031084776, "skip_count": 0.0, "step": 5614, "text_loss": 0.17584927380084991 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 26.36630466686234, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.034912109375, "learning_rate": 0.0004938098277899765, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 9060609.0, "repeat_count": 1.0, "routers_loss": 0.005259076599031687, "skip_count": 1.0, "step": 5616, "text_loss": 0.5522297024726868 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.375697094217788, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0004935003361993511, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 9063633.0, "repeat_count": 0.0, "routers_loss": 0.0006837095716036856, "skip_count": 0.0, "step": 5618, "text_loss": 0.5212588310241699 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.38508952157323, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0004931908470994091, "loss": 0.0059, "macro_f1": 0.6603773832321167, "num_tokens": 9067777.0, "repeat_count": 1.0, "routers_loss": 0.01067375484853983, "skip_count": 1.0, "step": 5620, "text_loss": 0.5515062808990479 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 26.394481948928675, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.019775390625, "learning_rate": 0.0004928813606087474, "loss": 0.0043, "macro_f1": 0.5934640765190125, "num_tokens": 9070938.0, "repeat_count": 0.0, "routers_loss": 0.016635602340102196, "skip_count": 3.0, "step": 5622, "text_loss": 0.3225076198577881 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.403874376284122, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0004925718768459617, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9074050.0, "repeat_count": 0.0, "routers_loss": 0.002216119086369872, "skip_count": 0.0, "step": 5624, "text_loss": 0.32438889145851135 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 26.413266803639566, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.040771484375, "learning_rate": 0.0004922623959296469, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 9076785.0, "repeat_count": 1.0, "routers_loss": 0.012125075794756413, "skip_count": 5.0, "step": 5626, "text_loss": 0.39563658833503723 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.42265923099501, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.050048828125, "learning_rate": 0.0004919529179783965, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9080239.0, "repeat_count": 0.0, "routers_loss": 0.0026486809365451336, "skip_count": 0.0, "step": 5628, "text_loss": 0.5401569604873657 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.432051658350456, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.0004916434431108031, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9083935.0, "repeat_count": 0.0, "routers_loss": 0.0011849761940538883, "skip_count": 0.0, "step": 5630, "text_loss": 0.4798774719238281 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.4414440857059, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0191650390625, "learning_rate": 0.000491333971445458, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 9087174.0, "repeat_count": 0.0, "routers_loss": 0.002799210138618946, "skip_count": 0.0, "step": 5632, "text_loss": 0.22488386929035187 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.450836513061343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0004910245031009515, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 9089803.0, "repeat_count": 0.0, "routers_loss": 0.00139117450453341, "skip_count": 0.0, "step": 5634, "text_loss": 0.6237335205078125 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.46022894041679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.0004907150381958723, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 9093075.0, "repeat_count": 0.0, "routers_loss": 0.006503603886812925, "skip_count": 1.0, "step": 5636, "text_loss": 0.18781614303588867 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.469621367772234, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021484375, "learning_rate": 0.0004904055768488077, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9096355.0, "repeat_count": 0.0, "routers_loss": 0.0009764843271113932, "skip_count": 0.0, "step": 5638, "text_loss": 0.6821450591087341 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.479013795127678, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0004900961191783445, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 9098994.0, "repeat_count": 1.0, "routers_loss": 0.00693159457296133, "skip_count": 3.0, "step": 5640, "text_loss": 0.214790940284729 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.488406222483125, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.0004897866653030671, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 9102048.0, "repeat_count": 0.0, "routers_loss": 0.002469591563567519, "skip_count": 0.0, "step": 5642, "text_loss": 0.1556607335805893 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.49779864983857, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0004894772153415588, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9105379.0, "repeat_count": 0.0, "routers_loss": 0.0004824921488761902, "skip_count": 0.0, "step": 5644, "text_loss": 0.499972403049469 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.507191077194012, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0004891677694124013, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 9108240.0, "repeat_count": 0.0, "routers_loss": 0.0029356612358242273, "skip_count": 1.0, "step": 5646, "text_loss": 0.5169754028320312 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.516583504549455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0174560546875, "learning_rate": 0.0004888583276341751, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 9111381.0, "repeat_count": 0.0, "routers_loss": 0.009489183314144611, "skip_count": 1.0, "step": 5648, "text_loss": 0.23630797863006592 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.525975931904902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.017822265625, "learning_rate": 0.0004885488901254588, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 9114015.0, "repeat_count": 0.0, "routers_loss": 0.004154495894908905, "skip_count": 1.0, "step": 5650, "text_loss": 0.3345947563648224 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.535368359260346, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0546875, "learning_rate": 0.0004882394570048294, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 9117044.0, "repeat_count": 0.0, "routers_loss": 0.0018865863094106317, "skip_count": 0.0, "step": 5652, "text_loss": 0.32814112305641174 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.54476078661579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02587890625, "learning_rate": 0.0004879300283908623, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 9120035.0, "repeat_count": 0.0, "routers_loss": 0.0035278978757560253, "skip_count": 1.0, "step": 5654, "text_loss": 0.4081386625766754 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.554153213971237, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0216064453125, "learning_rate": 0.00048762060440213096, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 9122955.0, "repeat_count": 1.0, "routers_loss": 0.0053498269990086555, "skip_count": 0.0, "step": 5656, "text_loss": 0.31027838587760925 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.56354564132668, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0004873111851572075, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9125635.0, "repeat_count": 0.0, "routers_loss": 0.004556098487228155, "skip_count": 0.0, "step": 5658, "text_loss": 0.25703540444374084 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.572938068682124, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024658203125, "learning_rate": 0.0004870017707746617, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 9128906.0, "repeat_count": 0.0, "routers_loss": 0.0031165245454758406, "skip_count": 2.0, "step": 5660, "text_loss": 0.20663656294345856 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.58233049603757, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.0004866923613730617, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 9132030.0, "repeat_count": 1.0, "routers_loss": 0.004887583665549755, "skip_count": 2.0, "step": 5662, "text_loss": 0.6062649488449097 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.591722923393014, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035888671875, "learning_rate": 0.0004863829570709741, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 9135274.0, "repeat_count": 0.0, "routers_loss": 0.0021857863757759333, "skip_count": 0.0, "step": 5664, "text_loss": 0.49644309282302856 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 26.601115350748458, "f1_execute": 0.9756097793579102, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0004860735579869631, "loss": 0.0088, "macro_f1": 0.925203263759613, "num_tokens": 9139735.0, "repeat_count": 3.0, "routers_loss": 0.05413912236690521, "skip_count": 5.0, "step": 5666, "text_loss": 0.25161290168762207 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.610507778103905, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00048576416423959097, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 9142419.0, "repeat_count": 0.0, "routers_loss": 0.002229376696050167, "skip_count": 0.0, "step": 5668, "text_loss": 0.5332949161529541 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 26.61990020545935, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.0004854547759474179, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 9145443.0, "repeat_count": 1.0, "routers_loss": 0.005968933925032616, "skip_count": 4.0, "step": 5670, "text_loss": 0.5282154083251953 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.629292632814792, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.0004851453932290021, "loss": 0.0085, "macro_f1": 0.3272727429866791, "num_tokens": 9147754.0, "repeat_count": 0.0, "routers_loss": 0.04015754163265228, "skip_count": 1.0, "step": 5672, "text_loss": 0.8564629554748535 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.63868506017024, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.00048483601620289974, "loss": 0.0058, "macro_f1": 0.8820862174034119, "num_tokens": 9151714.0, "repeat_count": 2.0, "routers_loss": 0.019172413274645805, "skip_count": 2.0, "step": 5674, "text_loss": 0.4149441123008728 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 26.648077487525683, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0004845266449876645, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9154524.0, "repeat_count": 1.0, "routers_loss": 0.005025535821914673, "skip_count": 0.0, "step": 5676, "text_loss": 0.26525792479515076 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.657469914881126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031005859375, "learning_rate": 0.000484217279701848, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 9158546.0, "repeat_count": 0.0, "routers_loss": 0.0012200147612020373, "skip_count": 0.0, "step": 5678, "text_loss": 0.5532271862030029 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.666862342236573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03662109375, "learning_rate": 0.0004839079204639998, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 9161003.0, "repeat_count": 0.0, "routers_loss": 0.0013485675444826484, "skip_count": 1.0, "step": 5680, "text_loss": 0.36826151609420776 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.676254769592017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02001953125, "learning_rate": 0.0004835985673926668, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 9164741.0, "repeat_count": 0.0, "routers_loss": 0.00532014574855566, "skip_count": 2.0, "step": 5682, "text_loss": 0.16154609620571136 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.68564719694746, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0004832892206063938, "loss": 0.0075, "macro_f1": 1.0, "num_tokens": 9168079.0, "repeat_count": 2.0, "routers_loss": 0.007782323285937309, "skip_count": 3.0, "step": 5684, "text_loss": 0.4323575496673584 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.695039624302908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.021240234375, "learning_rate": 0.0004829798802237228, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9171352.0, "repeat_count": 0.0, "routers_loss": 0.0024159469176083803, "skip_count": 2.0, "step": 5686, "text_loss": 0.3163119852542877 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.70443205165835, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 0.000482670546363194, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 9175197.0, "repeat_count": 0.0, "routers_loss": 0.002455134643241763, "skip_count": 0.0, "step": 5688, "text_loss": 0.59735506772995 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.713824479013795, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.03759765625, "learning_rate": 0.0004823612191433443, "loss": 0.0042, "macro_f1": 0.8820862174034119, "num_tokens": 9177648.0, "repeat_count": 2.0, "routers_loss": 0.015524548478424549, "skip_count": 2.0, "step": 5690, "text_loss": 0.759812593460083 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.723216906369238, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.00048205189868270887, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 9180694.0, "repeat_count": 0.0, "routers_loss": 0.002112736226990819, "skip_count": 2.0, "step": 5692, "text_loss": 0.3516882061958313 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 26.732609333724685, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.025146484375, "learning_rate": 0.00048174258509981973, "loss": 0.0063, "macro_f1": 0.9262410998344421, "num_tokens": 9183502.0, "repeat_count": 2.0, "routers_loss": 0.03100527822971344, "skip_count": 3.0, "step": 5694, "text_loss": 0.3722715973854065 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.74200176108013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.0004814332785132064, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 9186417.0, "repeat_count": 0.0, "routers_loss": 0.009176591411232948, "skip_count": 2.0, "step": 5696, "text_loss": 0.33363673090934753 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.751394188435572, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.0004811239790413958, "loss": 0.0076, "macro_f1": 0.3272727429866791, "num_tokens": 9189478.0, "repeat_count": 0.0, "routers_loss": 0.023586507886648178, "skip_count": 1.0, "step": 5698, "text_loss": 0.19698107242584229 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.76078661579102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0228271484375, "learning_rate": 0.00048081468680291194, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9192115.0, "repeat_count": 0.0, "routers_loss": 0.005083440337330103, "skip_count": 1.0, "step": 5700, "text_loss": 0.3476336896419525 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.770179043146463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0004805054019162764, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 9195176.0, "repeat_count": 0.0, "routers_loss": 0.007766073569655418, "skip_count": 1.0, "step": 5702, "text_loss": 0.27114811539649963 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.779571470501907, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02783203125, "learning_rate": 0.0004801961245000076, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9199091.0, "repeat_count": 0.0, "routers_loss": 0.0009058842551894486, "skip_count": 0.0, "step": 5704, "text_loss": 0.6249846816062927 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.788963897857354, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0218505859375, "learning_rate": 0.0004798868546726212, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9202003.0, "repeat_count": 0.0, "routers_loss": 0.005479823332279921, "skip_count": 0.0, "step": 5706, "text_loss": 0.47223609685897827 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.798356325212797, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0166015625, "learning_rate": 0.00047957759255263014, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9205277.0, "repeat_count": 0.0, "routers_loss": 0.001055705244652927, "skip_count": 0.0, "step": 5708, "text_loss": 0.677215576171875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.80774875256824, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.00047926833825854377, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 9208844.0, "repeat_count": 0.0, "routers_loss": 0.003291431115940213, "skip_count": 2.0, "step": 5710, "text_loss": 0.12439999729394913 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.817141179923688, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06591796875, "learning_rate": 0.0004789590919088696, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 9211619.0, "repeat_count": 0.0, "routers_loss": 0.005120242480188608, "skip_count": 2.0, "step": 5712, "text_loss": 0.5771954655647278 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.82653360727913, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0004786498536221111, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 9214914.0, "repeat_count": 1.0, "routers_loss": 0.004877795465290546, "skip_count": 2.0, "step": 5714, "text_loss": 0.6432198882102966 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.835926034634575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0400390625, "learning_rate": 0.00047834062351676893, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 9218186.0, "repeat_count": 0.0, "routers_loss": 0.0026507999282330275, "skip_count": 0.0, "step": 5716, "text_loss": 0.23814935982227325 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.845318461990022, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.00047803140171134075, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 9221754.0, "repeat_count": 0.0, "routers_loss": 0.002605629386380315, "skip_count": 1.0, "step": 5718, "text_loss": 0.2910388708114624 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 26.854710889345466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03125, "learning_rate": 0.0004777221883243208, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 9224502.0, "repeat_count": 0.0, "routers_loss": 0.0048494706861674786, "skip_count": 3.0, "step": 5720, "text_loss": 0.6195104122161865 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.86410331670091, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0004774129834742004, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 9227350.0, "repeat_count": 0.0, "routers_loss": 0.003092368133366108, "skip_count": 0.0, "step": 5722, "text_loss": 0.35447990894317627 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.873495744056356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.00047710378727946725, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 9230166.0, "repeat_count": 0.0, "routers_loss": 0.012780336663126945, "skip_count": 2.0, "step": 5724, "text_loss": 0.27581867575645447 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.8828881714118, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.00047679459985860604, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 9233029.0, "repeat_count": 0.0, "routers_loss": 0.005429140292108059, "skip_count": 1.0, "step": 5726, "text_loss": 0.2636827826499939 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.892280598767243, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024658203125, "learning_rate": 0.00047648542133009794, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9236317.0, "repeat_count": 0.0, "routers_loss": 0.0023909916635602713, "skip_count": 0.0, "step": 5728, "text_loss": 0.4801979064941406 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.901673026122687, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.023193359375, "learning_rate": 0.00047617625181242077, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 9239796.0, "repeat_count": 0.0, "routers_loss": 0.003603481687605381, "skip_count": 0.0, "step": 5730, "text_loss": 0.8374754786491394 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.911065453478134, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02294921875, "learning_rate": 0.0004758670914240488, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 9243489.0, "repeat_count": 0.0, "routers_loss": 0.004478964954614639, "skip_count": 2.0, "step": 5732, "text_loss": 0.3870154917240143 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.920457880833577, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0302734375, "learning_rate": 0.000475557940283453, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9246758.0, "repeat_count": 0.0, "routers_loss": 0.00312575395219028, "skip_count": 1.0, "step": 5734, "text_loss": 0.42341071367263794 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 26.92985030818902, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03369140625, "learning_rate": 0.00047524879850910026, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 9250053.0, "repeat_count": 0.0, "routers_loss": 0.010855631902813911, "skip_count": 4.0, "step": 5736, "text_loss": 0.25729796290397644 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 26.939242735544468, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04638671875, "learning_rate": 0.0004749396662194549, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 9253691.0, "repeat_count": 0.0, "routers_loss": 0.0009250419097952545, "skip_count": 0.0, "step": 5738, "text_loss": 0.6151770949363708 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.94863516289991, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0225830078125, "learning_rate": 0.0004746305435329767, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 9256866.0, "repeat_count": 1.0, "routers_loss": 0.007521102204918861, "skip_count": 3.0, "step": 5740, "text_loss": 0.3094986379146576 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.958027590255355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0004743214305681221, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9259790.0, "repeat_count": 0.0, "routers_loss": 0.0022241887636482716, "skip_count": 1.0, "step": 5742, "text_loss": 0.5418204069137573 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 26.967420017610802, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.00047401232744334376, "loss": 0.0071, "macro_f1": 1.0, "num_tokens": 9263205.0, "repeat_count": 1.0, "routers_loss": 0.008611299097537994, "skip_count": 2.0, "step": 5744, "text_loss": 0.35824623703956604 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 26.976812444966246, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0380859375, "learning_rate": 0.0004737032342770906, "loss": 0.0062, "macro_f1": 0.5492662787437439, "num_tokens": 9266126.0, "repeat_count": 0.0, "routers_loss": 0.010788857005536556, "skip_count": 2.0, "step": 5746, "text_loss": 0.2172674983739853 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.98620487232169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.03955078125, "learning_rate": 0.0004733941511878074, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 9269308.0, "repeat_count": 0.0, "routers_loss": 0.005309196189045906, "skip_count": 2.0, "step": 5748, "text_loss": 0.1696814000606537 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 26.995597299677137, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04248046875, "learning_rate": 0.00047308507829393594, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 9272801.0, "repeat_count": 0.0, "routers_loss": 0.009940510615706444, "skip_count": 2.0, "step": 5750, "text_loss": 0.24295592308044434 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.00469621367772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.00047277601571391314, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 9276197.0, "repeat_count": 0.0, "routers_loss": 0.000687236781232059, "skip_count": 0.0, "step": 5752, "text_loss": 0.8511804342269897 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.014088641033165, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.036865234375, "learning_rate": 0.00047246696356617254, "loss": 0.0059, "macro_f1": 0.6603773832321167, "num_tokens": 9278965.0, "repeat_count": 1.0, "routers_loss": 0.009816894307732582, "skip_count": 1.0, "step": 5754, "text_loss": 0.45420053601264954 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.023481068388612, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.019287109375, "learning_rate": 0.0004721579219691434, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9282076.0, "repeat_count": 0.0, "routers_loss": 0.0015747188590466976, "skip_count": 0.0, "step": 5756, "text_loss": 0.21671754121780396 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.032873495744056, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.0004718488910412511, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9285465.0, "repeat_count": 0.0, "routers_loss": 0.008654040284454823, "skip_count": 2.0, "step": 5758, "text_loss": 0.25920194387435913 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.0422659230995, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0211181640625, "learning_rate": 0.00047153987090091674, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 9288156.0, "repeat_count": 0.0, "routers_loss": 0.0011430777376517653, "skip_count": 0.0, "step": 5760, "text_loss": 0.7655444741249084 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.051658350454947, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0004712308616665576, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 9291529.0, "repeat_count": 0.0, "routers_loss": 0.003674200503155589, "skip_count": 2.0, "step": 5762, "text_loss": 0.269486665725708 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.06105077781039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0269775390625, "learning_rate": 0.0004709218634565866, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 9294699.0, "repeat_count": 0.0, "routers_loss": 0.003249827306717634, "skip_count": 1.0, "step": 5764, "text_loss": 0.5073734521865845 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.070443205165834, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.027587890625, "learning_rate": 0.00047061287638941235, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 9297863.0, "repeat_count": 1.0, "routers_loss": 0.002763139782473445, "skip_count": 2.0, "step": 5766, "text_loss": 0.2572014033794403 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 27.07983563252128, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.026611328125, "learning_rate": 0.00047030390058343935, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 9301124.0, "repeat_count": 0.0, "routers_loss": 0.007100266870111227, "skip_count": 3.0, "step": 5768, "text_loss": 0.4147387742996216 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.089228059876724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 0.0004699949361570676, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 9304330.0, "repeat_count": 0.0, "routers_loss": 0.005467240232974291, "skip_count": 1.0, "step": 5770, "text_loss": 0.21510964632034302 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.098620487232168, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02197265625, "learning_rate": 0.000469685983228693, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9306882.0, "repeat_count": 0.0, "routers_loss": 0.003167890477925539, "skip_count": 0.0, "step": 5772, "text_loss": 0.45717427134513855 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.108012914587615, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.00046937704191670675, "loss": 0.0057, "macro_f1": 0.6601307392120361, "num_tokens": 9309767.0, "repeat_count": 1.0, "routers_loss": 0.014881107024848461, "skip_count": 2.0, "step": 5774, "text_loss": 0.3464985191822052 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.11740534194306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.029296875, "learning_rate": 0.0004690681123394959, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 9313045.0, "repeat_count": 0.0, "routers_loss": 0.00379011663608253, "skip_count": 2.0, "step": 5776, "text_loss": 0.33194616436958313 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.126797769298502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00046875919461544265, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 9315736.0, "repeat_count": 0.0, "routers_loss": 0.0016733441734686494, "skip_count": 0.0, "step": 5778, "text_loss": 0.5009998679161072 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.13619019665395, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.00046845028886292493, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9318456.0, "repeat_count": 0.0, "routers_loss": 0.005318894516676664, "skip_count": 1.0, "step": 5780, "text_loss": 0.17702752351760864 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.145582624009393, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.044921875, "learning_rate": 0.00046814139520031615, "loss": 0.006, "macro_f1": 0.8820862174034119, "num_tokens": 9323152.0, "repeat_count": 2.0, "routers_loss": 0.01133672520518303, "skip_count": 2.0, "step": 5782, "text_loss": 0.2886650860309601 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.154975051364836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0004678325137459845, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 9326318.0, "repeat_count": 0.0, "routers_loss": 0.002458433620631695, "skip_count": 0.0, "step": 5784, "text_loss": 0.5832745432853699 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.164367478720283, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0004675236446182946, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 9329779.0, "repeat_count": 0.0, "routers_loss": 0.0005402310052886605, "skip_count": 0.0, "step": 5786, "text_loss": 0.5699237585067749 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0283203125, "learning_rate": 0.00046721478793560525, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 9333360.0, "repeat_count": 0.0, "routers_loss": 0.0002638917067088187, "skip_count": 0.0, "step": 5788, "text_loss": 0.6555714011192322 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.18315233343117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0244140625, "learning_rate": 0.00046690594381627106, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 9336498.0, "repeat_count": 0.0, "routers_loss": 0.003998351749032736, "skip_count": 2.0, "step": 5790, "text_loss": 0.2076750248670578 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.192544760786618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0238037109375, "learning_rate": 0.00046659711237864157, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 9339724.0, "repeat_count": 0.0, "routers_loss": 0.0045847659930586815, "skip_count": 1.0, "step": 5792, "text_loss": 0.22027169167995453 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.20193718814206, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0186767578125, "learning_rate": 0.00046628829374106167, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 9342835.0, "repeat_count": 0.0, "routers_loss": 0.0014064523857086897, "skip_count": 1.0, "step": 5794, "text_loss": 0.5120179057121277 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.211329615497505, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 0.0004659794880218712, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 9346757.0, "repeat_count": 0.0, "routers_loss": 0.0011155207175761461, "skip_count": 1.0, "step": 5796, "text_loss": 0.6415372490882874 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.220722042852948, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0004656706953394051, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 9349652.0, "repeat_count": 0.0, "routers_loss": 0.0020385095849633217, "skip_count": 0.0, "step": 5798, "text_loss": 0.5410398840904236 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.230114470208395, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.0004653619158119933, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 9354286.0, "repeat_count": 1.0, "routers_loss": 0.0012847178149968386, "skip_count": 0.0, "step": 5800, "text_loss": 0.4386860728263855 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.23950689756384, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0194091796875, "learning_rate": 0.00046505314955796074, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 9357682.0, "repeat_count": 0.0, "routers_loss": 0.0035008061677217484, "skip_count": 2.0, "step": 5802, "text_loss": 0.13655950129032135 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.248899324919282, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0283203125, "learning_rate": 0.00046474439669562715, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 9361058.0, "repeat_count": 0.0, "routers_loss": 0.0020033426117151976, "skip_count": 1.0, "step": 5804, "text_loss": 0.6293444037437439 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.25829175227473, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.00046443565734330714, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 9364173.0, "repeat_count": 0.0, "routers_loss": 0.0004935986362397671, "skip_count": 0.0, "step": 5806, "text_loss": 0.2923166751861572 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.267684179630173, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0004641269316193104, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9366980.0, "repeat_count": 0.0, "routers_loss": 0.001654456602409482, "skip_count": 0.0, "step": 5808, "text_loss": 0.7273373007774353 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.277076606985617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0264892578125, "learning_rate": 0.0004638182196419411, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 9370581.0, "repeat_count": 0.0, "routers_loss": 0.0017011919990181923, "skip_count": 0.0, "step": 5810, "text_loss": 0.6029995083808899 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 27.286469034341064, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.038330078125, "learning_rate": 0.0004635095215294984, "loss": 0.0072, "macro_f1": 0.9265305995941162, "num_tokens": 9374233.0, "repeat_count": 1.0, "routers_loss": 0.01361197978258133, "skip_count": 3.0, "step": 5812, "text_loss": 0.14051523804664612 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.295861461696507, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02392578125, "learning_rate": 0.00046320083740027584, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 9377217.0, "repeat_count": 0.0, "routers_loss": 0.004597014281898737, "skip_count": 0.0, "step": 5814, "text_loss": 0.2766880691051483 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 27.30525388905195, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.021240234375, "learning_rate": 0.00046289216737256184, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 9380336.0, "repeat_count": 3.0, "routers_loss": 0.006628422066569328, "skip_count": 1.0, "step": 5816, "text_loss": 0.8092381954193115 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.314646316407398, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.021728515625, "learning_rate": 0.0004625835115646393, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9382968.0, "repeat_count": 0.0, "routers_loss": 0.002737772185355425, "skip_count": 0.0, "step": 5818, "text_loss": 0.22090643644332886 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 27.32403874376284, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.027099609375, "learning_rate": 0.0004622748700947856, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 9386203.0, "repeat_count": 1.0, "routers_loss": 0.004552177153527737, "skip_count": 1.0, "step": 5820, "text_loss": 0.42869850993156433 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.333431171118285, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0274658203125, "learning_rate": 0.0004619662430812729, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 9388968.0, "repeat_count": 0.0, "routers_loss": 0.003149240743368864, "skip_count": 2.0, "step": 5822, "text_loss": 0.45137661695480347 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.342823598473732, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0004616576306423677, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 9392487.0, "repeat_count": 0.0, "routers_loss": 0.0008133690571412444, "skip_count": 0.0, "step": 5824, "text_loss": 0.638685941696167 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.352216025829176, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.032958984375, "learning_rate": 0.0004613490328963307, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 9395665.0, "repeat_count": 0.0, "routers_loss": 0.00042717234464362264, "skip_count": 0.0, "step": 5826, "text_loss": 0.8134317398071289 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.36160845318462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0218505859375, "learning_rate": 0.00046104044996141716, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 9398831.0, "repeat_count": 0.0, "routers_loss": 0.0084775285795331, "skip_count": 2.0, "step": 5828, "text_loss": 0.19263958930969238 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.371000880540066, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.042236328125, "learning_rate": 0.0004607318819558768, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 9403118.0, "repeat_count": 1.0, "routers_loss": 0.0030239911284297705, "skip_count": 0.0, "step": 5830, "text_loss": 0.45556432008743286 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 27.38039330789551, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0458984375, "learning_rate": 0.00046042332899795313, "loss": 0.0075, "macro_f1": 0.5492662787437439, "num_tokens": 9406206.0, "repeat_count": 0.0, "routers_loss": 0.026389889419078827, "skip_count": 2.0, "step": 5832, "text_loss": 0.26458361744880676 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.389785735250953, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0252685546875, "learning_rate": 0.0004601147912058845, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 9409806.0, "repeat_count": 0.0, "routers_loss": 0.0013476534513756633, "skip_count": 0.0, "step": 5834, "text_loss": 0.7443689107894897 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.399178162606397, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0281982421875, "learning_rate": 0.0004598062686979033, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 9412737.0, "repeat_count": 0.0, "routers_loss": 0.004275512881577015, "skip_count": 1.0, "step": 5836, "text_loss": 0.2808683514595032 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.408570589961844, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0206298828125, "learning_rate": 0.00045949776159223563, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 9415818.0, "repeat_count": 0.0, "routers_loss": 0.0027225434314459562, "skip_count": 0.0, "step": 5838, "text_loss": 0.6283587217330933 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.417963017317287, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.055419921875, "learning_rate": 0.0004591892700071022, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 9419119.0, "repeat_count": 1.0, "routers_loss": 0.01574302278459072, "skip_count": 2.0, "step": 5840, "text_loss": 0.33239027857780457 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.42735544467273, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.00045888079406071746, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 9422257.0, "repeat_count": 0.0, "routers_loss": 0.0007227854221127927, "skip_count": 0.0, "step": 5842, "text_loss": 0.6658740043640137 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.436747872028178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.00045857233387129, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 9425071.0, "repeat_count": 0.0, "routers_loss": 0.0020696306601166725, "skip_count": 2.0, "step": 5844, "text_loss": 0.5773820877075195 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.44614029938362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0205078125, "learning_rate": 0.0004582638895570224, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 9427980.0, "repeat_count": 0.0, "routers_loss": 0.0019764541648328304, "skip_count": 0.0, "step": 5846, "text_loss": 0.3388919532299042 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.455532726739065, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.033447265625, "learning_rate": 0.000457955461236111, "loss": 0.0058, "macro_f1": 0.3272727429866791, "num_tokens": 9430733.0, "repeat_count": 1.0, "routers_loss": 0.04235004261136055, "skip_count": 0.0, "step": 5848, "text_loss": 0.44346582889556885 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.464925154094512, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0004576470490267462, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 9433347.0, "repeat_count": 0.0, "routers_loss": 0.000801609072368592, "skip_count": 0.0, "step": 5850, "text_loss": 0.5825944542884827 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.474317581449956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.0004573386530471121, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 9436172.0, "repeat_count": 0.0, "routers_loss": 0.0018224078230559826, "skip_count": 2.0, "step": 5852, "text_loss": 0.8111652135848999 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.4837100088054, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0220947265625, "learning_rate": 0.0004570302734153866, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 9439040.0, "repeat_count": 0.0, "routers_loss": 0.006614950485527515, "skip_count": 2.0, "step": 5854, "text_loss": 0.31270334124565125 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.493102436160846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05859375, "learning_rate": 0.0004567219102497412, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 9442138.0, "repeat_count": 0.0, "routers_loss": 0.0012984242057427764, "skip_count": 0.0, "step": 5856, "text_loss": 0.6126856803894043 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.50249486351629, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0234375, "learning_rate": 0.0004564135636683416, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 9445600.0, "repeat_count": 0.0, "routers_loss": 0.0008388847345486283, "skip_count": 0.0, "step": 5858, "text_loss": 0.8526380658149719 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.511887290871734, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046875, "learning_rate": 0.0004561052337893467, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 9449609.0, "repeat_count": 0.0, "routers_loss": 0.008125773631036282, "skip_count": 2.0, "step": 5860, "text_loss": 0.2843833863735199 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.52127971822718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0242919921875, "learning_rate": 0.000455796920730909, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 9452756.0, "repeat_count": 0.0, "routers_loss": 0.0019371749367564917, "skip_count": 0.0, "step": 5862, "text_loss": 0.5293750166893005 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.530672145582624, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.028564453125, "learning_rate": 0.0004554886246111746, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 9455467.0, "repeat_count": 1.0, "routers_loss": 0.005594742484390736, "skip_count": 2.0, "step": 5864, "text_loss": 0.572329044342041 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 27.540064572938068, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0004551803455482833, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 9458953.0, "repeat_count": 0.0, "routers_loss": 0.005960086826235056, "skip_count": 3.0, "step": 5866, "text_loss": 0.19459208846092224 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.549457000293515, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0301513671875, "learning_rate": 0.00045487208366036807, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 9462130.0, "repeat_count": 0.0, "routers_loss": 0.0034781871363520622, "skip_count": 1.0, "step": 5868, "text_loss": 0.20467053353786469 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.55884942764896, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.00045456383906555554, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 9465590.0, "repeat_count": 0.0, "routers_loss": 0.0012246103724464774, "skip_count": 0.0, "step": 5870, "text_loss": 0.6086251735687256 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.568241855004402, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0262451171875, "learning_rate": 0.00045425561188196565, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 9468092.0, "repeat_count": 0.0, "routers_loss": 0.002874316181987524, "skip_count": 1.0, "step": 5872, "text_loss": 0.3430633544921875 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.57763428235985, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041259765625, "learning_rate": 0.0004539474022277115, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 9471433.0, "repeat_count": 0.0, "routers_loss": 0.004340244457125664, "skip_count": 2.0, "step": 5874, "text_loss": 0.28219133615493774 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.587026709715293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0004536392102208997, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 9474363.0, "repeat_count": 0.0, "routers_loss": 0.0007322742021642625, "skip_count": 0.0, "step": 5876, "text_loss": 0.7305856943130493 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.596419137070736, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.025146484375, "learning_rate": 0.0004533310359796299, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 9478469.0, "repeat_count": 0.0, "routers_loss": 0.0018631393322721124, "skip_count": 0.0, "step": 5878, "text_loss": 0.5821442604064941 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 27.60581156442618, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0299072265625, "learning_rate": 0.0004530228796219952, "loss": 0.0088, "macro_f1": 0.9262410998344421, "num_tokens": 9481200.0, "repeat_count": 2.0, "routers_loss": 0.026109615340828896, "skip_count": 3.0, "step": 5880, "text_loss": 0.3962891101837158 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.615203991781627, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02587890625, "learning_rate": 0.00045271474126608167, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 9484200.0, "repeat_count": 0.0, "routers_loss": 0.0004716445691883564, "skip_count": 0.0, "step": 5882, "text_loss": 0.31901776790618896 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.62459641913707, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.031494140625, "learning_rate": 0.0004524066210299685, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 9488939.0, "repeat_count": 0.0, "routers_loss": 0.0003797562967520207, "skip_count": 0.0, "step": 5884, "text_loss": 0.3992912471294403 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.633988846492514, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0220947265625, "learning_rate": 0.0004520985190317279, "loss": 0.0032, "macro_f1": 0.6666666865348816, "num_tokens": 9492010.0, "repeat_count": 0.0, "routers_loss": 0.005681614391505718, "skip_count": 1.0, "step": 5886, "text_loss": 0.5318995118141174 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.64338127384796, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.026123046875, "learning_rate": 0.0004517904353894253, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 9494770.0, "repeat_count": 0.0, "routers_loss": 0.0021422000136226416, "skip_count": 0.0, "step": 5888, "text_loss": 0.435088187456131 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.652773701203404, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.032958984375, "learning_rate": 0.0004514823702211187, "loss": 0.0052, "macro_f1": 0.8820862174034119, "num_tokens": 9497327.0, "repeat_count": 2.0, "routers_loss": 0.01593884639441967, "skip_count": 2.0, "step": 5890, "text_loss": 0.5068450570106506 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.662166128558848, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0257568359375, "learning_rate": 0.00045117432364485927, "loss": 0.0075, "macro_f1": 0.6601307392120361, "num_tokens": 9500488.0, "repeat_count": 1.0, "routers_loss": 0.0729660913348198, "skip_count": 2.0, "step": 5892, "text_loss": 0.42718732357025146 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.671558555914295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.02978515625, "learning_rate": 0.00045086629577869127, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 9503593.0, "repeat_count": 0.0, "routers_loss": 0.007092897780239582, "skip_count": 2.0, "step": 5894, "text_loss": 0.4264345169067383 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.68095098326974, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.043212890625, "learning_rate": 0.00045055828674065134, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 9507188.0, "repeat_count": 0.0, "routers_loss": 0.004088073968887329, "skip_count": 2.0, "step": 5896, "text_loss": 0.20932413637638092 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.690343410625182, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0238037109375, "learning_rate": 0.00045025029664876926, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 9510126.0, "repeat_count": 1.0, "routers_loss": 0.0026970503386110067, "skip_count": 0.0, "step": 5898, "text_loss": 0.47661110758781433 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.69973583798063, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0164794921875, "learning_rate": 0.0004499423256210673, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 9513891.0, "repeat_count": 0.0, "routers_loss": 0.003428407246246934, "skip_count": 0.0, "step": 5900, "text_loss": 0.18232668936252594 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.709128265336073, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.024169921875, "learning_rate": 0.00044963437377556066, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 9516718.0, "repeat_count": 0.0, "routers_loss": 0.0020270352251827717, "skip_count": 0.0, "step": 5902, "text_loss": 0.16833586990833282 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.718520692691516, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0294189453125, "learning_rate": 0.000449326441230257, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 9520248.0, "repeat_count": 0.0, "routers_loss": 0.0019144838443025947, "skip_count": 0.0, "step": 5904, "text_loss": 0.44434574246406555 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.727913120046964, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.01904296875, "learning_rate": 0.00044901852810315634, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 9523651.0, "repeat_count": 0.0, "routers_loss": 0.0044578867964446545, "skip_count": 2.0, "step": 5906, "text_loss": 0.1248839721083641 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.737305547402407, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.022705078125, "learning_rate": 0.0004487106345122522, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 9527235.0, "repeat_count": 0.0, "routers_loss": 0.000827222247608006, "skip_count": 0.0, "step": 5908, "text_loss": 0.6052893996238708 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.74669797475785, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.031982421875, "learning_rate": 0.0004484027605755296, "loss": 0.0065, "macro_f1": 0.5492662787437439, "num_tokens": 9530407.0, "repeat_count": 2.0, "routers_loss": 0.029739778488874435, "skip_count": 0.0, "step": 5910, "text_loss": 0.7625715732574463 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.756090402113298, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0213623046875, "learning_rate": 0.00044809490641096653, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 9533229.0, "repeat_count": 0.0, "routers_loss": 0.0025658784434199333, "skip_count": 0.0, "step": 5912, "text_loss": 0.27842655777931213 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 27.76548282946874, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.042724609375, "learning_rate": 0.00044778707213653324, "loss": 0.0069, "macro_f1": 0.9265305995941162, "num_tokens": 9537397.0, "repeat_count": 1.0, "routers_loss": 0.010157953947782516, "skip_count": 3.0, "step": 5914, "text_loss": 0.45196083188056946 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.774875256824185, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0306396484375, "learning_rate": 0.0004474792578701924, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 9540564.0, "repeat_count": 3.0, "routers_loss": 0.011994685977697372, "skip_count": 5.0, "step": 5916, "text_loss": 0.22617442905902863 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.784267684179632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0263671875, "learning_rate": 0.000447171463729899, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 9543602.0, "repeat_count": 0.0, "routers_loss": 0.0022214490454643965, "skip_count": 0.0, "step": 5918, "text_loss": 0.5089073777198792 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.793660111535075, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.03173828125, "learning_rate": 0.0004468636898336003, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 9546829.0, "repeat_count": 1.0, "routers_loss": 0.009353389963507652, "skip_count": 2.0, "step": 5920, "text_loss": 0.7560386657714844 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.80305253889052, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.057373046875, "learning_rate": 0.00044655593629923596, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 9550259.0, "repeat_count": 0.0, "routers_loss": 0.005637963302433491, "skip_count": 0.0, "step": 5922, "text_loss": 0.17084793746471405 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.812444966245963, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0286865234375, "learning_rate": 0.00044624820324473766, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 9554376.0, "repeat_count": 1.0, "routers_loss": 0.008556432090699673, "skip_count": 2.0, "step": 5924, "text_loss": 0.5906872749328613 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.82183739360141, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.034912109375, "learning_rate": 0.0004459404907880292, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9558348.0, "repeat_count": 1.0, "routers_loss": 0.0016659445827826858, "skip_count": 0.0, "step": 5926, "text_loss": 0.8197194933891296 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 27.831229820956853, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.00044563279904702674, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 9561139.0, "repeat_count": 0.0, "routers_loss": 0.01341368816792965, "skip_count": 3.0, "step": 5928, "text_loss": 0.3264874815940857 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.840622248312297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.020751953125, "learning_rate": 0.000445325128139638, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9564387.0, "repeat_count": 0.0, "routers_loss": 0.005023977253586054, "skip_count": 2.0, "step": 5930, "text_loss": 0.9055862426757812 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.850014675667744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.02734375, "learning_rate": 0.0004450174781837635, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 9567053.0, "repeat_count": 0.0, "routers_loss": 0.0006051476229913533, "skip_count": 0.0, "step": 5932, "text_loss": 0.6908539533615112 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.859407103023187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0240478515625, "learning_rate": 0.0004447098492972951, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 9570036.0, "repeat_count": 0.0, "routers_loss": 0.003152312943711877, "skip_count": 0.0, "step": 5934, "text_loss": 0.6321061849594116 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.86879953037863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.047119140625, "learning_rate": 0.0004444022415981167, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 9574146.0, "repeat_count": 0.0, "routers_loss": 0.004859412554651499, "skip_count": 1.0, "step": 5936, "text_loss": 0.5905604958534241 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 27.878191957734078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0361328125, "learning_rate": 0.00044409465520410426, "loss": 0.0071, "macro_f1": 1.0, "num_tokens": 9577071.0, "repeat_count": 1.0, "routers_loss": 0.004376287572085857, "skip_count": 1.0, "step": 5938, "text_loss": 0.6928377747535706 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.88758438508952, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.024169921875, "learning_rate": 0.00044378709023312535, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 9580537.0, "repeat_count": 0.0, "routers_loss": 0.004038849379867315, "skip_count": 1.0, "step": 5940, "text_loss": 0.2686770558357239 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.896976812444965, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0277099609375, "learning_rate": 0.0004434795468030396, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 9583225.0, "repeat_count": 0.0, "routers_loss": 0.005459951236844063, "skip_count": 2.0, "step": 5942, "text_loss": 0.16855180263519287 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 27.906369239800412, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.025146484375, "learning_rate": 0.000443172025031698, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 9586018.0, "repeat_count": 0.0, "routers_loss": 0.0032985717989504337, "skip_count": 2.0, "step": 5944, "text_loss": 0.20335732400417328 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.915761667155856, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.027587890625, "learning_rate": 0.0004428645250369437, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 9589321.0, "repeat_count": 1.0, "routers_loss": 0.003573323367163539, "skip_count": 0.0, "step": 5946, "text_loss": 0.6318653225898743 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.9251540945113, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.00044255704693661117, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 9592518.0, "repeat_count": 0.0, "routers_loss": 0.002226749900728464, "skip_count": 0.0, "step": 5948, "text_loss": 0.5320658683776855 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.934546521866746, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0311279296875, "learning_rate": 0.0004422495908485265, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 9595664.0, "repeat_count": 0.0, "routers_loss": 0.0007805621717125177, "skip_count": 0.0, "step": 5950, "text_loss": 0.6330106258392334 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.94393894922219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0004419421568905077, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 9598885.0, "repeat_count": 0.0, "routers_loss": 0.0017050127498805523, "skip_count": 0.0, "step": 5952, "text_loss": 0.6098045706748962 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.953331376577633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03271484375, "learning_rate": 0.00044163474518036375, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 9603021.0, "repeat_count": 0.0, "routers_loss": 0.0025974081363528967, "skip_count": 0.0, "step": 5954, "text_loss": 0.2655932903289795 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 27.96272380393308, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04931640625, "learning_rate": 0.00044132735583589567, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 9605841.0, "repeat_count": 1.0, "routers_loss": 0.010364850051701069, "skip_count": 2.0, "step": 5956, "text_loss": 0.3028552532196045 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 27.972116231288524, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.015869140625, "learning_rate": 0.00044101998897489553, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 9608810.0, "repeat_count": 1.0, "routers_loss": 0.0015063622267916799, "skip_count": 0.0, "step": 5958, "text_loss": 0.5602094531059265 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 27.981508658643968, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.02880859375, "learning_rate": 0.00044071264471514683, "loss": 0.0051, "macro_f1": 0.5934640765190125, "num_tokens": 9611995.0, "repeat_count": 0.0, "routers_loss": 0.011538165621459484, "skip_count": 3.0, "step": 5960, "text_loss": 0.14332173764705658 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 27.99090108599941, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.029052734375, "learning_rate": 0.00044040532317442455, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 9615434.0, "repeat_count": 0.0, "routers_loss": 0.004693889059126377, "skip_count": 0.0, "step": 5962, "text_loss": 0.334369033575058 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.021728515625, "learning_rate": 0.00044009802447049474, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 9618056.0, "repeat_count": 1.0, "routers_loss": 0.0045085870660841465, "skip_count": 1.0, "step": 5964, "text_loss": 0.8163170218467712 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.009392427355444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.035400390625, "learning_rate": 0.00043979074872111507, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 9621428.0, "repeat_count": 0.0, "routers_loss": 0.0018220023484900594, "skip_count": 0.0, "step": 5966, "text_loss": 0.2513850927352905 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.01878485471089, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.033935546875, "learning_rate": 0.0004394834960440341, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 9625433.0, "repeat_count": 4.0, "routers_loss": 0.007051277905702591, "skip_count": 5.0, "step": 5968, "text_loss": 0.6263421177864075 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.028177282066334, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03125, "learning_rate": 0.00043917626655699154, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 9629508.0, "repeat_count": 0.0, "routers_loss": 0.0006454752874560654, "skip_count": 0.0, "step": 5970, "text_loss": 0.645618736743927 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.037569709421778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0272216796875, "learning_rate": 0.0004388690603777184, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 9632504.0, "repeat_count": 0.0, "routers_loss": 0.004847112577408552, "skip_count": 1.0, "step": 5972, "text_loss": 0.47306978702545166 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.046962136777225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.03173828125, "learning_rate": 0.00043856187762393665, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 9636685.0, "repeat_count": 0.0, "routers_loss": 0.0006580828921869397, "skip_count": 0.0, "step": 5974, "text_loss": 0.42226532101631165 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.05635456413267, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0341796875, "learning_rate": 0.0004382547184133593, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 9639958.0, "repeat_count": 0.0, "routers_loss": 0.002188180573284626, "skip_count": 0.0, "step": 5976, "text_loss": 0.4456600248813629 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.065746991488112, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.035888671875, "learning_rate": 0.0004379475828636901, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 9643228.0, "repeat_count": 1.0, "routers_loss": 0.0017135308589786291, "skip_count": 2.0, "step": 5978, "text_loss": 0.6295822262763977 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.07513941884356, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.0004376404710926244, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 9646746.0, "repeat_count": 0.0, "routers_loss": 0.0008841048111207783, "skip_count": 0.0, "step": 5980, "text_loss": 0.5102712512016296 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.084531846199003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0299072265625, "learning_rate": 0.00043733338321784784, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 9649452.0, "repeat_count": 0.0, "routers_loss": 0.0006229099817574024, "skip_count": 0.0, "step": 5982, "text_loss": 0.6944046020507812 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.093924273554446, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0252685546875, "learning_rate": 0.000437026319357037, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 9652700.0, "repeat_count": 0.0, "routers_loss": 0.005293759983032942, "skip_count": 2.0, "step": 5984, "text_loss": 0.6748214960098267 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.10331670090989, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0322265625, "learning_rate": 0.00043671927962785946, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 9655825.0, "repeat_count": 0.0, "routers_loss": 0.0013537590857595205, "skip_count": 0.0, "step": 5986, "text_loss": 1.000306248664856 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 28.112709128265337, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0380859375, "learning_rate": 0.0004364122641479733, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 9658713.0, "repeat_count": 0.0, "routers_loss": 0.004548195283859968, "skip_count": 0.0, "step": 5988, "text_loss": 0.24580086767673492 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 28.12210155562078, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0233154296875, "learning_rate": 0.0004361052730350275, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 9661535.0, "repeat_count": 0.0, "routers_loss": 0.011149964295327663, "skip_count": 4.0, "step": 5990, "text_loss": 0.5737863779067993 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.131493982976224, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0263671875, "learning_rate": 0.00043579830640666154, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 9664406.0, "repeat_count": 1.0, "routers_loss": 0.003783488878980279, "skip_count": 1.0, "step": 5992, "text_loss": 0.7836558222770691 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.14088641033167, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0439453125, "learning_rate": 0.00043549136438050573, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 9669050.0, "repeat_count": 0.0, "routers_loss": 0.0050374288111925125, "skip_count": 1.0, "step": 5994, "text_loss": 0.13072487711906433 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.150278837687114, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.046142578125, "learning_rate": 0.00043518444707418076, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 9672698.0, "repeat_count": 0.0, "routers_loss": 0.004047670867294073, "skip_count": 2.0, "step": 5996, "text_loss": 0.4748993217945099 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 28.159671265042558, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0211181640625, "learning_rate": 0.00043487755460529796, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 9676159.0, "repeat_count": 0.0, "routers_loss": 0.008628991432487965, "skip_count": 2.0, "step": 5998, "text_loss": 0.1921990066766739 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.169063692398005, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0322265625, "learning_rate": 0.00043457068709145904, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 9679528.0, "repeat_count": 3.0, "routers_loss": 0.01094671618193388, "skip_count": 3.0, "step": 6000, "text_loss": 0.3651769459247589 } ], "logging_steps": 2, "max_steps": 10650, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.64345135714893e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }