{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998757609640949, "eval_steps": 100, "global_step": 4527, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 384.625, "epoch": 0.0011043469858229456, "grad_norm": 1.912142623070508, "kl": 0.0005407754331827163, "learning_rate": 2.2075055187637973e-07, "loss": 0.0, "reward": 0.59375, "reward_std": 0.30935921147465706, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.45625, "step": 5 }, { "completion_length": 461.3875, "epoch": 0.002208693971645891, "grad_norm": 1.2625025592593675, "kl": 0.00020947456359863282, "learning_rate": 4.4150110375275946e-07, "loss": 0.0, "reward": 0.575, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.41875, "step": 10 }, { "completion_length": 385.9875, "epoch": 0.0033130409574688366, "grad_norm": 1.4824872365395827, "kl": 0.0002246655523777008, "learning_rate": 6.622516556291392e-07, "loss": 0.0, "reward": 0.61875, "reward_std": 0.3623922191560268, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.46875, "step": 15 }, { "completion_length": 429.06875, "epoch": 0.004417387943291782, "grad_norm": 1.1157617082303197, "kl": 0.000436440110206604, "learning_rate": 8.830022075055189e-07, "loss": 0.0, "reward": 0.6, "reward_std": 0.3712310537695885, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.46875, "step": 20 }, { "completion_length": 352.0125, "epoch": 0.005521734929114728, "grad_norm": 0.6004950274417132, "kl": 0.002642902731895447, "learning_rate": 1.1037527593818985e-06, "loss": 0.0001, "reward": 0.725, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.60625, "step": 25 }, { "completion_length": 294.3125, "epoch": 0.006626081914937673, "grad_norm": 3.738595553790936, "kl": 0.015436601638793946, "learning_rate": 1.3245033112582784e-06, "loss": 0.0006, "reward": 0.73125, "reward_std": 0.32703688070178033, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.61875, "step": 30 }, { "completion_length": 302.69375, "epoch": 0.007730428900760619, "grad_norm": 1.655779879795067, "kl": 0.019573783874511717, "learning_rate": 1.545253863134658e-06, "loss": 0.0008, "reward": 0.79375, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.73125, "step": 35 }, { "completion_length": 375.9125, "epoch": 0.008834775886583565, "grad_norm": 2.572886866328185, "kl": 0.018162012100219727, "learning_rate": 1.7660044150110378e-06, "loss": 0.0007, "reward": 0.8375, "reward_std": 0.30052037686109545, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.7, "step": 40 }, { "completion_length": 363.75625, "epoch": 0.00993912287240651, "grad_norm": 0.7289483033436792, "kl": 0.017768669128417968, "learning_rate": 1.9867549668874175e-06, "loss": 0.0007, "reward": 0.81875, "reward_std": 0.2916815422475338, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.73125, "step": 45 }, { "completion_length": 317.81875, "epoch": 0.011043469858229456, "grad_norm": 0.608013151594999, "kl": 0.014789676666259766, "learning_rate": 2.207505518763797e-06, "loss": 0.0006, "reward": 0.85625, "reward_std": 0.2916815422475338, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.75625, "step": 50 }, { "completion_length": 288.8625, "epoch": 0.012147816844052401, "grad_norm": 1.087832023876081, "kl": 0.014481544494628906, "learning_rate": 2.4282560706401767e-06, "loss": 0.0006, "reward": 0.8, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.71875, "step": 55 }, { "completion_length": 266.53125, "epoch": 0.013252163829875346, "grad_norm": 1.453031787000738, "kl": 0.013717460632324218, "learning_rate": 2.6490066225165567e-06, "loss": 0.0005, "reward": 0.8625, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.8, "step": 60 }, { "completion_length": 285.14375, "epoch": 0.014356510815698293, "grad_norm": 1.139133573143394, "kl": 0.01951141357421875, "learning_rate": 2.8697571743929364e-06, "loss": 0.0008, "reward": 0.90625, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.81875, "step": 65 }, { "completion_length": 265.5875, "epoch": 0.015460857801521238, "grad_norm": 0.8819487715358617, "kl": 0.01654224395751953, "learning_rate": 3.090507726269316e-06, "loss": 0.0007, "reward": 0.89375, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.79375, "step": 70 }, { "completion_length": 275.11875, "epoch": 0.016565204787344183, "grad_norm": 0.6456181700347499, "kl": 0.026264095306396486, "learning_rate": 3.311258278145696e-06, "loss": 0.0011, "reward": 0.91875, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.8, "step": 75 }, { "completion_length": 260.90625, "epoch": 0.01766955177316713, "grad_norm": 1.1108485585200154, "kl": 0.0263336181640625, "learning_rate": 3.5320088300220757e-06, "loss": 0.0011, "reward": 0.91875, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.83125, "step": 80 }, { "completion_length": 210.76875, "epoch": 0.018773898758990076, "grad_norm": 0.6452807102957274, "kl": 0.03692817687988281, "learning_rate": 3.752759381898455e-06, "loss": 0.0015, "reward": 0.98125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.90625, "step": 85 }, { "completion_length": 264.475, "epoch": 0.01987824574481302, "grad_norm": 0.7880782851098767, "kl": 0.035125350952148436, "learning_rate": 3.973509933774835e-06, "loss": 0.0014, "reward": 0.88125, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.8125, "step": 90 }, { "completion_length": 284.78125, "epoch": 0.020982592730635966, "grad_norm": 0.7085853946380432, "kl": 0.023376846313476564, "learning_rate": 4.1942604856512145e-06, "loss": 0.0009, "reward": 0.95625, "reward_std": 0.2563262037932873, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.8375, "step": 95 }, { "completion_length": 278.6875, "epoch": 0.022086939716458913, "grad_norm": 0.8560416069975489, "kl": 0.04554176330566406, "learning_rate": 4.415011037527594e-06, "loss": 0.0018, "reward": 0.9625, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.86875, "step": 100 }, { "epoch": 0.022086939716458913, "eval_completion_length": 248.37, "eval_kl": 0.046234130859375, "eval_loss": 0.0018549839733168483, "eval_reward": 0.985, "eval_reward_std": 0.1767766922712326, "eval_rewards/accuracy_reward": 0.07, "eval_rewards/format_reward": 0.915, "eval_runtime": 127.1942, "eval_samples_per_second": 0.778, "eval_steps_per_second": 0.197, "step": 100 }, { "completion_length": 252.65625, "epoch": 0.023191286702281856, "grad_norm": 0.8513924054195807, "kl": 0.07780532836914063, "learning_rate": 4.635761589403974e-06, "loss": 0.0031, "reward": 0.99375, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.89375, "step": 105 }, { "completion_length": 212.9375, "epoch": 0.024295633688104803, "grad_norm": 1.1979997527722244, "kl": 0.03602142333984375, "learning_rate": 4.856512141280353e-06, "loss": 0.0014, "reward": 0.9375, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.8875, "step": 110 }, { "completion_length": 195.11875, "epoch": 0.02539998067392775, "grad_norm": 0.5507213314932029, "kl": 0.03738479614257813, "learning_rate": 5.077262693156734e-06, "loss": 0.0015, "reward": 0.95625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.90625, "step": 115 }, { "completion_length": 160.11875, "epoch": 0.026504327659750693, "grad_norm": 1.1186714555390784, "kl": 0.036508941650390626, "learning_rate": 5.2980132450331135e-06, "loss": 0.0015, "reward": 1.0, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.93125, "step": 120 }, { "completion_length": 183.3, "epoch": 0.02760867464557364, "grad_norm": 1.378767133063548, "kl": 0.044263458251953124, "learning_rate": 5.518763796909493e-06, "loss": 0.0018, "reward": 0.99375, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.8875, "step": 125 }, { "completion_length": 164.45, "epoch": 0.028713021631396586, "grad_norm": 0.7101264478347288, "kl": 0.052339935302734376, "learning_rate": 5.739514348785873e-06, "loss": 0.0021, "reward": 1.0625, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.925, "step": 130 }, { "completion_length": 180.11875, "epoch": 0.02981736861721953, "grad_norm": 0.49907818395454545, "kl": 0.05917434692382813, "learning_rate": 5.960264900662252e-06, "loss": 0.0024, "reward": 1.025, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.90625, "step": 135 }, { "completion_length": 198.775, "epoch": 0.030921715603042476, "grad_norm": 0.8587681927090215, "kl": 0.06558990478515625, "learning_rate": 6.181015452538632e-06, "loss": 0.0026, "reward": 0.98125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.86875, "step": 140 }, { "completion_length": 177.4625, "epoch": 0.03202606258886542, "grad_norm": 1.5915533844114518, "kl": 0.08502044677734374, "learning_rate": 6.4017660044150125e-06, "loss": 0.0034, "reward": 0.99375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9125, "step": 145 }, { "completion_length": 204.075, "epoch": 0.033130409574688366, "grad_norm": 0.7038907751646122, "kl": 0.06302261352539062, "learning_rate": 6.622516556291392e-06, "loss": 0.0025, "reward": 1.03125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9375, "step": 150 }, { "completion_length": 160.18125, "epoch": 0.034234756560511316, "grad_norm": 1.0970259017722608, "kl": 0.09358978271484375, "learning_rate": 6.843267108167772e-06, "loss": 0.0037, "reward": 1.08125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.96875, "step": 155 }, { "completion_length": 180.5875, "epoch": 0.03533910354633426, "grad_norm": 0.5682863259937629, "kl": 0.091546630859375, "learning_rate": 7.064017660044151e-06, "loss": 0.0037, "reward": 1.0375, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.975, "step": 160 }, { "completion_length": 209.2625, "epoch": 0.0364434505321572, "grad_norm": 0.9429119760227406, "kl": 0.08664703369140625, "learning_rate": 7.28476821192053e-06, "loss": 0.0035, "reward": 1.08125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.96875, "step": 165 }, { "completion_length": 208.41875, "epoch": 0.03754779751798015, "grad_norm": 1.2790088963784818, "kl": 0.101275634765625, "learning_rate": 7.50551876379691e-06, "loss": 0.0041, "reward": 1.0625, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.16875, "rewards/format_reward": 0.89375, "step": 170 }, { "completion_length": 259.24375, "epoch": 0.038652144503803096, "grad_norm": 0.46075949216263296, "kl": 0.0850189208984375, "learning_rate": 7.726269315673288e-06, "loss": 0.0034, "reward": 0.94375, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.89375, "step": 175 }, { "completion_length": 248.26875, "epoch": 0.03975649148962604, "grad_norm": 0.6242098331069889, "kl": 0.086053466796875, "learning_rate": 7.94701986754967e-06, "loss": 0.0034, "reward": 1.025, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.8875, "step": 180 }, { "completion_length": 238.375, "epoch": 0.04086083847544899, "grad_norm": 1.150282122681421, "kl": 0.1117950439453125, "learning_rate": 8.16777041942605e-06, "loss": 0.0045, "reward": 1.025, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.9125, "step": 185 }, { "completion_length": 219.4, "epoch": 0.04196518546127193, "grad_norm": 0.7542415145174329, "kl": 0.126629638671875, "learning_rate": 8.388520971302429e-06, "loss": 0.0051, "reward": 0.975, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.89375, "step": 190 }, { "completion_length": 198.54375, "epoch": 0.043069532447094876, "grad_norm": 0.6615977009838082, "kl": 0.1339691162109375, "learning_rate": 8.609271523178809e-06, "loss": 0.0054, "reward": 0.975, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.89375, "step": 195 }, { "completion_length": 230.6, "epoch": 0.044173879432917826, "grad_norm": 1.2480834347897671, "kl": 0.1072998046875, "learning_rate": 8.830022075055188e-06, "loss": 0.0043, "reward": 0.95625, "reward_std": 0.2563262037932873, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.85, "step": 200 }, { "epoch": 0.044173879432917826, "eval_completion_length": 205.55, "eval_kl": 0.0896044921875, "eval_loss": 0.003579025389626622, "eval_reward": 1.085, "eval_reward_std": 0.1484924215078354, "eval_rewards/accuracy_reward": 0.13, "eval_rewards/format_reward": 0.955, "eval_runtime": 103.514, "eval_samples_per_second": 0.956, "eval_steps_per_second": 0.242, "step": 200 }, { "completion_length": 224.75, "epoch": 0.04527822641874077, "grad_norm": 0.5677247970494854, "kl": 0.1135223388671875, "learning_rate": 9.050772626931568e-06, "loss": 0.0045, "reward": 1.025, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.90625, "step": 205 }, { "completion_length": 276.25625, "epoch": 0.04638257340456371, "grad_norm": 0.6313414085910611, "kl": 0.12886962890625, "learning_rate": 9.271523178807948e-06, "loss": 0.0052, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.20625, "rewards/format_reward": 0.91875, "step": 210 }, { "completion_length": 337.5125, "epoch": 0.04748692039038666, "grad_norm": 0.6076982934519585, "kl": 0.1175445556640625, "learning_rate": 9.492273730684327e-06, "loss": 0.0047, "reward": 0.9125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.81875, "step": 215 }, { "completion_length": 331.2875, "epoch": 0.048591267376209606, "grad_norm": 0.887758372335282, "kl": 0.11523284912109374, "learning_rate": 9.713024282560707e-06, "loss": 0.0046, "reward": 0.94375, "reward_std": 0.23864853456616403, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.85625, "step": 220 }, { "completion_length": 269.38125, "epoch": 0.04969561436203255, "grad_norm": 0.6864886594586257, "kl": 0.13491058349609375, "learning_rate": 9.933774834437086e-06, "loss": 0.0054, "reward": 0.95, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.875, "step": 225 }, { "completion_length": 246.025, "epoch": 0.0507999613478555, "grad_norm": 1.2699307354463993, "kl": 0.166473388671875, "learning_rate": 1.0154525386313468e-05, "loss": 0.0067, "reward": 1.025, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.88125, "step": 230 }, { "completion_length": 206.09375, "epoch": 0.05190430833367844, "grad_norm": 0.6720713057618479, "kl": 0.1639678955078125, "learning_rate": 1.0375275938189846e-05, "loss": 0.0066, "reward": 1.0375, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.925, "step": 235 }, { "completion_length": 312.0875, "epoch": 0.053008655319501385, "grad_norm": 0.8771338863302535, "kl": 0.25369873046875, "learning_rate": 1.0596026490066227e-05, "loss": 0.0101, "reward": 0.9, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.8125, "step": 240 }, { "completion_length": 188.65, "epoch": 0.054113002305324336, "grad_norm": 0.6127023135715081, "kl": 0.209857177734375, "learning_rate": 1.0816777041942605e-05, "loss": 0.0084, "reward": 1.0125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.91875, "step": 245 }, { "completion_length": 183.35, "epoch": 0.05521734929114728, "grad_norm": 0.9178124946472914, "kl": 0.2119171142578125, "learning_rate": 1.1037527593818986e-05, "loss": 0.0085, "reward": 0.95625, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.90625, "step": 250 }, { "completion_length": 199.725, "epoch": 0.05632169627697022, "grad_norm": 0.9841255331094259, "kl": 0.188714599609375, "learning_rate": 1.1258278145695364e-05, "loss": 0.0075, "reward": 0.9875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.91875, "step": 255 }, { "completion_length": 222.5875, "epoch": 0.05742604326279317, "grad_norm": 0.6571161565702436, "kl": 1504.1844940185547, "learning_rate": 1.1479028697571745e-05, "loss": 60.0766, "reward": 0.975, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9, "step": 260 }, { "completion_length": 238.375, "epoch": 0.058530390248616115, "grad_norm": 0.7793687563873339, "kl": 0.16044921875, "learning_rate": 1.1699779249448125e-05, "loss": 0.0064, "reward": 1.04375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.925, "step": 265 }, { "completion_length": 308.975, "epoch": 0.05963473723443906, "grad_norm": 0.6524174398492628, "kl": 0.344976806640625, "learning_rate": 1.1920529801324505e-05, "loss": 0.0138, "reward": 1.03125, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.89375, "step": 270 }, { "completion_length": 298.2875, "epoch": 0.06073908422026201, "grad_norm": 0.515875299778908, "kl": 0.228948974609375, "learning_rate": 1.2141280353200884e-05, "loss": 0.0092, "reward": 0.98125, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.88125, "step": 275 }, { "completion_length": 328.84375, "epoch": 0.06184343120608495, "grad_norm": 31.625220010701653, "kl": 0.2593719482421875, "learning_rate": 1.2362030905077264e-05, "loss": 0.0104, "reward": 1.08125, "reward_std": 0.30935921147465706, "rewards/accuracy_reward": 0.225, "rewards/format_reward": 0.85625, "step": 280 }, { "completion_length": 338.28125, "epoch": 0.0629477781919079, "grad_norm": 0.8132166957068776, "kl": 0.26093597412109376, "learning_rate": 1.2582781456953644e-05, "loss": 0.0104, "reward": 0.925, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.83125, "step": 285 }, { "completion_length": 276.55625, "epoch": 0.06405212517773085, "grad_norm": 0.8732592923475726, "kl": 0.2025054931640625, "learning_rate": 1.2803532008830025e-05, "loss": 0.0081, "reward": 1.0125, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8875, "step": 290 }, { "completion_length": 222.75, "epoch": 0.06515647216355379, "grad_norm": 0.5307249225741028, "kl": 0.1828094482421875, "learning_rate": 1.3024282560706403e-05, "loss": 0.0073, "reward": 1.0625, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "step": 295 }, { "completion_length": 176.43125, "epoch": 0.06626081914937673, "grad_norm": 0.7065408921391656, "kl": 0.169866943359375, "learning_rate": 1.3245033112582784e-05, "loss": 0.0068, "reward": 1.11875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.95625, "step": 300 }, { "epoch": 0.06626081914937673, "eval_completion_length": 205.125, "eval_kl": 0.178349609375, "eval_loss": 0.007154763210564852, "eval_reward": 1.085, "eval_reward_std": 0.162634556889534, "eval_rewards/accuracy_reward": 0.13, "eval_rewards/format_reward": 0.955, "eval_runtime": 111.3878, "eval_samples_per_second": 0.889, "eval_steps_per_second": 0.224, "step": 300 }, { "completion_length": 243.425, "epoch": 0.06736516613519968, "grad_norm": 0.7324178851810815, "kl": 0.2018463134765625, "learning_rate": 1.3465783664459162e-05, "loss": 0.0081, "reward": 1.0375, "reward_std": 0.2828427076339722, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.9, "step": 305 }, { "completion_length": 246.875, "epoch": 0.06846951312102263, "grad_norm": 0.5430407102590828, "kl": 0.199591064453125, "learning_rate": 1.3686534216335543e-05, "loss": 0.008, "reward": 1.0, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.9125, "step": 310 }, { "completion_length": 231.8625, "epoch": 0.06957386010684558, "grad_norm": 2.525719433106217, "kl": 0.256878662109375, "learning_rate": 1.3907284768211921e-05, "loss": 0.0103, "reward": 1.08125, "reward_std": 0.23864853456616403, "rewards/accuracy_reward": 0.18125, "rewards/format_reward": 0.9, "step": 315 }, { "completion_length": 269.71875, "epoch": 0.07067820709266852, "grad_norm": 2.2679595474497583, "kl": 0.81484375, "learning_rate": 1.4128035320088303e-05, "loss": 0.0326, "reward": 1.0, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.875, "step": 320 }, { "completion_length": 208.94375, "epoch": 0.07178255407849146, "grad_norm": 1.1848713457261022, "kl": 0.43358154296875, "learning_rate": 1.434878587196468e-05, "loss": 0.0174, "reward": 0.99375, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.88125, "step": 325 }, { "completion_length": 186.3375, "epoch": 0.0728869010643144, "grad_norm": 1.434354885038631, "kl": 0.292218017578125, "learning_rate": 1.456953642384106e-05, "loss": 0.0117, "reward": 0.74375, "reward_std": 0.30935921147465706, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.66875, "step": 330 }, { "completion_length": 616.03125, "epoch": 0.07399124805013735, "grad_norm": 0.46617566479772593, "kl": 0.228448486328125, "learning_rate": 1.479028697571744e-05, "loss": 0.0091, "reward": 0.58125, "reward_std": 0.2563262037932873, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.5, "step": 335 }, { "completion_length": 781.5875, "epoch": 0.0750955950359603, "grad_norm": 0.4999142123724588, "kl": 0.203704833984375, "learning_rate": 1.501103752759382e-05, "loss": 0.0081, "reward": 0.5125, "reward_std": 0.30052037686109545, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.48125, "step": 340 }, { "completion_length": 438.83125, "epoch": 0.07619994202178325, "grad_norm": 1.1259807453052872, "kl": 0.229962158203125, "learning_rate": 1.52317880794702e-05, "loss": 0.0092, "reward": 0.99375, "reward_std": 0.2916815422475338, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.8625, "step": 345 }, { "completion_length": 308.03125, "epoch": 0.07730428900760619, "grad_norm": 1.1381907828574633, "kl": 0.246209716796875, "learning_rate": 1.5452538631346577e-05, "loss": 0.0098, "reward": 1.01875, "reward_std": 0.2740038730204105, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.8625, "step": 350 }, { "completion_length": 466.78125, "epoch": 0.07840863599342913, "grad_norm": 145.06654530948717, "kl": 1.06409912109375, "learning_rate": 1.567328918322296e-05, "loss": 0.0425, "reward": 0.83125, "reward_std": 0.3623922191560268, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.725, "step": 355 }, { "completion_length": 601.19375, "epoch": 0.07951298297925208, "grad_norm": 4.173181885857791, "kl": 0.5132080078125, "learning_rate": 1.589403973509934e-05, "loss": 0.0205, "reward": 0.5625, "reward_std": 0.38890872299671175, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.5125, "step": 360 }, { "completion_length": 552.525, "epoch": 0.08061732996507502, "grad_norm": 2.488203792347465, "kl": 0.5505126953125, "learning_rate": 1.6114790286975718e-05, "loss": 0.022, "reward": 0.675, "reward_std": 0.38890872299671175, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.61875, "step": 365 }, { "completion_length": 323.6625, "epoch": 0.08172167695089798, "grad_norm": 0.4380180481079113, "kl": 0.425970458984375, "learning_rate": 1.63355408388521e-05, "loss": 0.017, "reward": 1.0375, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.90625, "step": 370 }, { "completion_length": 284.35, "epoch": 0.08282602393672092, "grad_norm": 1.2931698264509297, "kl": 0.374395751953125, "learning_rate": 1.6556291390728477e-05, "loss": 0.015, "reward": 1.01875, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9, "step": 375 }, { "completion_length": 271.5875, "epoch": 0.08393037092254386, "grad_norm": 2.19321510252618, "kl": 0.78577880859375, "learning_rate": 1.6777041942604858e-05, "loss": 0.0315, "reward": 1.0625, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.175, "rewards/format_reward": 0.8875, "step": 380 }, { "completion_length": 247.65, "epoch": 0.08503471790836681, "grad_norm": 0.8492367847141669, "kl": 0.65609130859375, "learning_rate": 1.699779249448124e-05, "loss": 0.0263, "reward": 1.04375, "reward_std": 0.23864853456616403, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.89375, "step": 385 }, { "completion_length": 279.06875, "epoch": 0.08613906489418975, "grad_norm": 1.1176439301337926, "kl": 0.440093994140625, "learning_rate": 1.7218543046357617e-05, "loss": 0.0176, "reward": 1.0125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.90625, "step": 390 }, { "completion_length": 381.325, "epoch": 0.0872434118800127, "grad_norm": 1.4438637661323328, "kl": 1.2351318359375, "learning_rate": 1.7439293598234e-05, "loss": 0.0494, "reward": 0.81875, "reward_std": 0.30935921147465706, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.73125, "step": 395 }, { "completion_length": 393.53125, "epoch": 0.08834775886583565, "grad_norm": 0.8840798582686505, "kl": 0.66854248046875, "learning_rate": 1.7660044150110377e-05, "loss": 0.0267, "reward": 0.9625, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8375, "step": 400 }, { "epoch": 0.08834775886583565, "eval_completion_length": 357.46, "eval_kl": 0.44, "eval_loss": 0.017609253525733948, "eval_reward": 0.99, "eval_reward_std": 0.24041630148887635, "eval_rewards/accuracy_reward": 0.155, "eval_rewards/format_reward": 0.835, "eval_runtime": 153.4656, "eval_samples_per_second": 0.645, "eval_steps_per_second": 0.163, "step": 400 }, { "completion_length": 289.31875, "epoch": 0.0894521058516586, "grad_norm": 5.233032104359776, "kl": 0.511737060546875, "learning_rate": 1.7880794701986758e-05, "loss": 0.0205, "reward": 1.00625, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9125, "step": 405 }, { "completion_length": 255.6, "epoch": 0.09055645283748154, "grad_norm": 0.5288317975266756, "kl": 0.2679931640625, "learning_rate": 1.8101545253863136e-05, "loss": 0.0107, "reward": 1.05, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.95, "step": 410 }, { "completion_length": 283.04375, "epoch": 0.09166079982330448, "grad_norm": 0.44523434013578594, "kl": 0.219281005859375, "learning_rate": 1.8322295805739517e-05, "loss": 0.0088, "reward": 1.0625, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95, "step": 415 }, { "completion_length": 308.90625, "epoch": 0.09276514680912742, "grad_norm": 0.8815212129055245, "kl": 0.264593505859375, "learning_rate": 1.8543046357615895e-05, "loss": 0.0106, "reward": 1.0125, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9125, "step": 420 }, { "completion_length": 314.7375, "epoch": 0.09386949379495037, "grad_norm": 1.3789377076942395, "kl": 0.3002197265625, "learning_rate": 1.8763796909492276e-05, "loss": 0.012, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.90625, "step": 425 }, { "completion_length": 339.475, "epoch": 0.09497384078077332, "grad_norm": 0.6184446711073517, "kl": 0.441192626953125, "learning_rate": 1.8984547461368654e-05, "loss": 0.0177, "reward": 0.95625, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.90625, "step": 430 }, { "completion_length": 317.0125, "epoch": 0.09607818776659627, "grad_norm": 1.5357040975928846, "kl": 0.2957275390625, "learning_rate": 1.9205298013245036e-05, "loss": 0.0118, "reward": 1.0, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.94375, "step": 435 }, { "completion_length": 326.9, "epoch": 0.09718253475241921, "grad_norm": 0.9509356833694793, "kl": 0.24755859375, "learning_rate": 1.9426048565121414e-05, "loss": 0.0099, "reward": 0.95, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.89375, "step": 440 }, { "completion_length": 247.94375, "epoch": 0.09828688173824215, "grad_norm": 7.331260846492792, "kl": 0.458038330078125, "learning_rate": 1.9646799116997795e-05, "loss": 0.0183, "reward": 0.9875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.93125, "step": 445 }, { "completion_length": 244.2, "epoch": 0.0993912287240651, "grad_norm": 2.2209695163797973, "kl": 0.579638671875, "learning_rate": 1.9867549668874173e-05, "loss": 0.0232, "reward": 0.98125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.93125, "step": 450 }, { "completion_length": 247.9875, "epoch": 0.10049557570988804, "grad_norm": 0.6146792415154715, "kl": 0.3593017578125, "learning_rate": 1.9999988107104428e-05, "loss": 0.0144, "reward": 1.025, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.95625, "step": 455 }, { "completion_length": 204.425, "epoch": 0.101599922695711, "grad_norm": 1.2131689914986956, "kl": 0.246929931640625, "learning_rate": 1.9999854312354064e-05, "loss": 0.0099, "reward": 1.08125, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.95625, "step": 460 }, { "completion_length": 215.4625, "epoch": 0.10270426968153394, "grad_norm": 22.462295342430743, "kl": 0.606536865234375, "learning_rate": 1.999957185872951e-05, "loss": 0.0243, "reward": 0.94375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.90625, "step": 465 }, { "completion_length": 180.38125, "epoch": 0.10380861666735688, "grad_norm": 0.22049287292981123, "kl": 1.026416015625, "learning_rate": 1.999914075042975e-05, "loss": 0.0411, "reward": 0.99375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.93125, "step": 470 }, { "completion_length": 227.85625, "epoch": 0.10491296365317983, "grad_norm": 0.5306656385299113, "kl": 0.257354736328125, "learning_rate": 1.9998560993863682e-05, "loss": 0.0103, "reward": 0.925, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.8875, "step": 475 }, { "completion_length": 173.825, "epoch": 0.10601731063900277, "grad_norm": 0.7697195391306778, "kl": 0.243804931640625, "learning_rate": 1.999783259765003e-05, "loss": 0.0098, "reward": 0.9875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.95, "step": 480 }, { "completion_length": 245.43125, "epoch": 0.10712165762482571, "grad_norm": 0.8895584020157263, "kl": 0.344189453125, "learning_rate": 1.9996955572617202e-05, "loss": 0.0138, "reward": 0.94375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.925, "step": 485 }, { "completion_length": 228.575, "epoch": 0.10822600461064867, "grad_norm": 0.4019967230109815, "kl": 0.6487060546875, "learning_rate": 1.999592993180315e-05, "loss": 0.026, "reward": 0.95, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9, "step": 490 }, { "completion_length": 191.1125, "epoch": 0.10933035159647161, "grad_norm": 2.2236898903555584, "kl": 0.3437255859375, "learning_rate": 1.9994755690455154e-05, "loss": 0.0137, "reward": 1.025, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.95625, "step": 495 }, { "completion_length": 227.65625, "epoch": 0.11043469858229456, "grad_norm": 0.31534477505615033, "kl": 0.254638671875, "learning_rate": 1.9993432866029604e-05, "loss": 0.0102, "reward": 1.0375, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.96875, "step": 500 }, { "epoch": 0.11043469858229456, "eval_completion_length": 305.16, "eval_kl": 0.25392578125, "eval_loss": 0.010133117437362671, "eval_reward": 1.01, "eval_reward_std": 0.09899494767189027, "eval_rewards/accuracy_reward": 0.065, "eval_rewards/format_reward": 0.945, "eval_runtime": 127.2591, "eval_samples_per_second": 0.778, "eval_steps_per_second": 0.196, "step": 500 }, { "completion_length": 348.06875, "epoch": 0.1115390455681175, "grad_norm": 0.47067061501157526, "kl": 0.3302001953125, "learning_rate": 1.9991961478191753e-05, "loss": 0.0132, "reward": 0.95625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.9125, "step": 505 }, { "completion_length": 404.30625, "epoch": 0.11264339255394044, "grad_norm": 0.3909987106694373, "kl": 0.290771484375, "learning_rate": 1.99903415488154e-05, "loss": 0.0116, "reward": 0.975, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.93125, "step": 510 }, { "completion_length": 331.25, "epoch": 0.11374773953976339, "grad_norm": 0.13154079900966428, "kl": 0.2630859375, "learning_rate": 1.998857310198259e-05, "loss": 0.0105, "reward": 0.90625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0125, "rewards/format_reward": 0.89375, "step": 515 }, { "completion_length": 294.86875, "epoch": 0.11485208652558634, "grad_norm": 0.600369457847733, "kl": 0.209710693359375, "learning_rate": 1.998665616398323e-05, "loss": 0.0084, "reward": 0.9875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.95, "step": 520 }, { "completion_length": 194.80625, "epoch": 0.11595643351140929, "grad_norm": 0.6528037119159925, "kl": 0.23944091796875, "learning_rate": 1.9984590763314722e-05, "loss": 0.0096, "reward": 1.01875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.975, "step": 525 }, { "completion_length": 150.6, "epoch": 0.11706078049723223, "grad_norm": 0.2685863872987803, "kl": 0.19815673828125, "learning_rate": 1.998237693068153e-05, "loss": 0.0079, "reward": 1.00625, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.975, "step": 530 }, { "completion_length": 190.7, "epoch": 0.11816512748305517, "grad_norm": 0.6345565715762368, "kl": 0.385015869140625, "learning_rate": 1.9980014698994722e-05, "loss": 0.0154, "reward": 1.0, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.95625, "step": 535 }, { "completion_length": 263.48125, "epoch": 0.11926947446887812, "grad_norm": 0.7032479277266585, "kl": 0.430792236328125, "learning_rate": 1.997750410337147e-05, "loss": 0.0172, "reward": 1.0125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.96875, "step": 540 }, { "completion_length": 322.3125, "epoch": 0.12037382145470106, "grad_norm": 0.7900459644547021, "kl": 0.45318603515625, "learning_rate": 1.997484518113456e-05, "loss": 0.0181, "reward": 0.9625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.925, "step": 545 }, { "completion_length": 282.39375, "epoch": 0.12147816844052402, "grad_norm": 1.7131720462446698, "kl": 0.667669677734375, "learning_rate": 1.9972037971811802e-05, "loss": 0.0267, "reward": 0.94375, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0125, "rewards/format_reward": 0.93125, "step": 550 }, { "completion_length": 351.20625, "epoch": 0.12258251542634696, "grad_norm": 2.3429062472049664, "kl": 0.4404296875, "learning_rate": 1.9969082517135463e-05, "loss": 0.0176, "reward": 0.9875, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.93125, "step": 555 }, { "completion_length": 302.83125, "epoch": 0.1236868624121699, "grad_norm": 0.42331075217778713, "kl": 0.244976806640625, "learning_rate": 1.9965978861041637e-05, "loss": 0.0098, "reward": 1.04375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9625, "step": 560 }, { "completion_length": 227.3625, "epoch": 0.12479120939799285, "grad_norm": 6.878210574805294, "kl": 0.490325927734375, "learning_rate": 1.99627270496696e-05, "loss": 0.0196, "reward": 1.0125, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.95625, "step": 565 }, { "completion_length": 259.70625, "epoch": 0.1258955563838158, "grad_norm": 0.29828625265818887, "kl": 0.570684814453125, "learning_rate": 1.995932713136112e-05, "loss": 0.0228, "reward": 0.9625, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.925, "step": 570 }, { "completion_length": 256.55625, "epoch": 0.12699990336963873, "grad_norm": 0.5399903589695695, "kl": 0.375726318359375, "learning_rate": 1.9955779156659735e-05, "loss": 0.015, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.975, "step": 575 }, { "completion_length": 242.5, "epoch": 0.1281042503554617, "grad_norm": 9.42984310059387, "kl": 0.512103271484375, "learning_rate": 1.9952083178310002e-05, "loss": 0.0205, "reward": 1.0375, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9625, "step": 580 }, { "completion_length": 217.3375, "epoch": 0.12920859734128462, "grad_norm": 4.883219507705031, "kl": 0.71790771484375, "learning_rate": 1.994823925125672e-05, "loss": 0.0287, "reward": 1.03125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.95, "step": 585 }, { "completion_length": 213.125, "epoch": 0.13031294432710758, "grad_norm": 0.656048665938222, "kl": 0.331298828125, "learning_rate": 1.994424743264412e-05, "loss": 0.0132, "reward": 1.10625, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.95625, "step": 590 }, { "completion_length": 175.19375, "epoch": 0.13141729131293053, "grad_norm": 0.4384345369089057, "kl": 0.21328125, "learning_rate": 1.9940107781814976e-05, "loss": 0.0085, "reward": 1.08125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.99375, "step": 595 }, { "completion_length": 232.13125, "epoch": 0.13252163829875346, "grad_norm": 9.864648628359978, "kl": 0.3187255859375, "learning_rate": 1.993582036030978e-05, "loss": 0.0127, "reward": 1.00625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.95625, "step": 600 }, { "epoch": 0.13252163829875346, "eval_completion_length": 205.38, "eval_kl": 16.67103515625, "eval_loss": 0.669373095035553, "eval_reward": 1.06, "eval_reward_std": 0.22627416610717774, "eval_rewards/accuracy_reward": 0.135, "eval_rewards/format_reward": 0.925, "eval_runtime": 95.7714, "eval_samples_per_second": 1.034, "eval_steps_per_second": 0.261, "step": 600 }, { "completion_length": 220.1375, "epoch": 0.13362598528457642, "grad_norm": 0.9515331014381163, "kl": 0.321160888671875, "learning_rate": 1.993138523186578e-05, "loss": 0.0129, "reward": 1.06875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.98125, "step": 605 }, { "completion_length": 219.99375, "epoch": 0.13473033227039935, "grad_norm": 3.2832305875346033, "kl": 0.55125732421875, "learning_rate": 1.9926802462416054e-05, "loss": 0.0221, "reward": 1.04375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9625, "step": 610 }, { "completion_length": 218.5875, "epoch": 0.1358346792562223, "grad_norm": 1.0232056540109529, "kl": 0.942523193359375, "learning_rate": 1.9922072120088537e-05, "loss": 0.0377, "reward": 0.99375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.93125, "step": 615 }, { "completion_length": 268.2125, "epoch": 0.13693902624204526, "grad_norm": 1.7246486491338628, "kl": 0.6009521484375, "learning_rate": 1.991719427520499e-05, "loss": 0.024, "reward": 1.0125, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.9, "step": 620 }, { "completion_length": 240.51875, "epoch": 0.1380433732278682, "grad_norm": 7.217525782475475, "kl": 0.579345703125, "learning_rate": 1.9912169000279952e-05, "loss": 0.0231, "reward": 1.05, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.975, "step": 625 }, { "completion_length": 307.725, "epoch": 0.13914772021369115, "grad_norm": 0.32686422446894037, "kl": 0.320556640625, "learning_rate": 1.9906996370019692e-05, "loss": 0.0128, "reward": 1.05625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9375, "step": 630 }, { "completion_length": 448.2375, "epoch": 0.14025206719951408, "grad_norm": 4.724746821053579, "kl": 0.55283203125, "learning_rate": 1.990167646132107e-05, "loss": 0.0221, "reward": 0.8125, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.74375, "step": 635 }, { "completion_length": 466.6125, "epoch": 0.14135641418533704, "grad_norm": 0.3626333681552602, "kl": 3.0367431640625, "learning_rate": 1.9896209353270394e-05, "loss": 0.1216, "reward": 0.7625, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.725, "step": 640 }, { "completion_length": 351.9125, "epoch": 0.14246076117115997, "grad_norm": 0.3534359394092258, "kl": 0.2459228515625, "learning_rate": 1.989059512714227e-05, "loss": 0.0098, "reward": 1.01875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.925, "step": 645 }, { "completion_length": 309.93125, "epoch": 0.14356510815698292, "grad_norm": 0.6269748799278583, "kl": 0.239697265625, "learning_rate": 1.988483386639836e-05, "loss": 0.0096, "reward": 1.1125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.9625, "step": 650 }, { "completion_length": 304.49375, "epoch": 0.14466945514280588, "grad_norm": 2.4513786768690364, "kl": 0.2604248046875, "learning_rate": 1.9878925656686167e-05, "loss": 0.0104, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.98125, "step": 655 }, { "completion_length": 307.99375, "epoch": 0.1457738021286288, "grad_norm": 0.5055349332276848, "kl": 0.27725830078125, "learning_rate": 1.9872870585837757e-05, "loss": 0.0111, "reward": 1.00625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.95, "step": 660 }, { "completion_length": 281.29375, "epoch": 0.14687814911445177, "grad_norm": 42388.24147147825, "kl": 170.7101318359375, "learning_rate": 1.9866668743868437e-05, "loss": 6.8324, "reward": 0.9375, "reward_std": 0.03535533845424652, "rewards/accuracy_reward": 0.0125, "rewards/format_reward": 0.925, "step": 665 }, { "completion_length": 331.09375, "epoch": 0.1479824961002747, "grad_norm": 3.1708940830460186, "kl": 1.48009033203125, "learning_rate": 1.9860320222975435e-05, "loss": 0.0594, "reward": 0.8875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.84375, "step": 670 }, { "completion_length": 329.43125, "epoch": 0.14908684308609765, "grad_norm": 9.028073103998265, "kl": 0.427667236328125, "learning_rate": 1.9853825117536522e-05, "loss": 0.0171, "reward": 0.95, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.9125, "step": 675 }, { "completion_length": 295.99375, "epoch": 0.1501911900719206, "grad_norm": 1.3497959756065208, "kl": 0.8688232421875, "learning_rate": 1.9847183524108614e-05, "loss": 0.0348, "reward": 0.98125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.93125, "step": 680 }, { "completion_length": 256.76875, "epoch": 0.15129553705774354, "grad_norm": 0.5127846687930365, "kl": 0.305950927734375, "learning_rate": 1.9840395541426333e-05, "loss": 0.0122, "reward": 1.06875, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.96875, "step": 685 }, { "completion_length": 248.68125, "epoch": 0.1523998840435665, "grad_norm": 0.9607285175875181, "kl": 0.33861083984375, "learning_rate": 1.983346127040053e-05, "loss": 0.0135, "reward": 1.01875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.95, "step": 690 }, { "completion_length": 209.85, "epoch": 0.15350423102938943, "grad_norm": 2.0950203003912313, "kl": 0.3568115234375, "learning_rate": 1.9826380814116795e-05, "loss": 0.0143, "reward": 1.0, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.925, "step": 695 }, { "completion_length": 618.08125, "epoch": 0.15460857801521238, "grad_norm": 2.8704953934506423, "kl": 1.026025390625, "learning_rate": 1.9819154277833938e-05, "loss": 0.041, "reward": 0.71875, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.6875, "step": 700 }, { "epoch": 0.15460857801521238, "eval_completion_length": 959.67, "eval_kl": 1.478125, "eval_loss": 0.05926254764199257, "eval_reward": 0.6, "eval_reward_std": 0.38183765530586244, "eval_rewards/accuracy_reward": 0.045, "eval_rewards/format_reward": 0.555, "eval_runtime": 261.2361, "eval_samples_per_second": 0.379, "eval_steps_per_second": 0.096, "step": 700 }, { "completion_length": 1003.5125, "epoch": 0.1557129250010353, "grad_norm": 0.28454444589509803, "kl": 1.074609375, "learning_rate": 1.9811781768982392e-05, "loss": 0.043, "reward": 0.7375, "reward_std": 0.2828427076339722, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.71875, "step": 705 }, { "completion_length": 1024.0, "epoch": 0.15681727198685827, "grad_norm": 0.20551909954742464, "kl": 648601.9237182618, "learning_rate": 1.980426339716264e-05, "loss": 25944.5938, "reward": 0.8125, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.00625, "rewards/format_reward": 0.80625, "step": 710 }, { "completion_length": 1024.0, "epoch": 0.15792161897268123, "grad_norm": 0.1216302069380067, "kl": 0.264453125, "learning_rate": 1.9796599274143586e-05, "loss": 0.0106, "reward": 0.99375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.0125, "rewards/format_reward": 0.98125, "step": 715 }, { "completion_length": 1024.0, "epoch": 0.15902596595850416, "grad_norm": 0.13579361297724876, "kl": 0.23675537109375, "learning_rate": 1.9788789513860875e-05, "loss": 0.0095, "reward": 1.0, "reward_std": 0.03535533845424652, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.98125, "step": 720 }, { "completion_length": 1024.0, "epoch": 0.1601303129443271, "grad_norm": 0.1484233873762049, "kl": 0.26705322265625, "learning_rate": 1.9780834232415214e-05, "loss": 0.0107, "reward": 0.975, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.95625, "step": 725 }, { "completion_length": 1024.0, "epoch": 0.16123465993015004, "grad_norm": 0.2382323266501869, "kl": 0.26546630859375, "learning_rate": 1.9772733548070647e-05, "loss": 0.0106, "reward": 0.88125, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.85625, "step": 730 }, { "completion_length": 1024.0, "epoch": 0.162339006915973, "grad_norm": 0.22617291442101026, "kl": 0.29144287109375, "learning_rate": 1.9764487581252787e-05, "loss": 0.0117, "reward": 0.90625, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.88125, "step": 735 }, { "completion_length": 1024.0, "epoch": 0.16344335390179596, "grad_norm": 0.27522740788676725, "kl": 0.272119140625, "learning_rate": 1.975609645454704e-05, "loss": 0.0109, "reward": 0.95625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.91875, "step": 740 }, { "completion_length": 1024.0, "epoch": 0.1645477008876189, "grad_norm": 0.26380621267798754, "kl": 0.3930419921875, "learning_rate": 1.9747560292696763e-05, "loss": 0.0157, "reward": 0.9625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.93125, "step": 745 }, { "completion_length": 1024.0, "epoch": 0.16565204787344184, "grad_norm": 0.28940243726369763, "kl": 0.33485107421875, "learning_rate": 1.9738879222601425e-05, "loss": 0.0134, "reward": 0.8875, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.85625, "step": 750 }, { "completion_length": 1024.0, "epoch": 0.16675639485926477, "grad_norm": 0.28544947271914256, "kl": 0.5123046875, "learning_rate": 1.9730053373314722e-05, "loss": 0.0205, "reward": 0.9125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.875, "step": 755 }, { "completion_length": 1020.45, "epoch": 0.16786074184508773, "grad_norm": 0.25645868139823796, "kl": 0.56268310546875, "learning_rate": 1.9721082876042644e-05, "loss": 0.0225, "reward": 0.95625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.925, "step": 760 }, { "completion_length": 1024.0, "epoch": 0.16896508883091066, "grad_norm": 0.12040072575635787, "kl": 0.48553466796875, "learning_rate": 1.9711967864141542e-05, "loss": 0.0194, "reward": 0.98125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.9375, "step": 765 }, { "completion_length": 1024.0, "epoch": 0.17006943581673362, "grad_norm": 0.17007869061264408, "kl": 0.50712890625, "learning_rate": 1.970270847311612e-05, "loss": 0.0203, "reward": 1.05625, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.975, "step": 770 }, { "completion_length": 1020.9375, "epoch": 0.17117378280255657, "grad_norm": 0.17581235812523752, "kl": 0.49864501953125, "learning_rate": 1.9693304840617456e-05, "loss": 0.0199, "reward": 1.0375, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.94375, "step": 775 }, { "completion_length": 1024.0, "epoch": 0.1722781297883795, "grad_norm": 0.18169044119511507, "kl": 0.66134033203125, "learning_rate": 1.968375710644093e-05, "loss": 0.0265, "reward": 0.96875, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.8875, "step": 780 }, { "completion_length": 1024.0, "epoch": 0.17338247677420246, "grad_norm": 0.44893330655977454, "kl": 0.538287353515625, "learning_rate": 1.9674065412524147e-05, "loss": 0.0215, "reward": 0.75, "reward_std": 0.30052037686109545, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.7, "step": 785 }, { "completion_length": 1024.0, "epoch": 0.1744868237600254, "grad_norm": 1.1859702994440748, "kl": 0.7820556640625, "learning_rate": 1.9664229902944833e-05, "loss": 0.0313, "reward": 0.88125, "reward_std": 0.2563262037932873, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.81875, "step": 790 }, { "completion_length": 1016.3125, "epoch": 0.17559117074584835, "grad_norm": 0.22143586568637066, "kl": 0.54296875, "learning_rate": 1.9654250723918706e-05, "loss": 0.0217, "reward": 0.99375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.925, "step": 795 }, { "completion_length": 1018.775, "epoch": 0.1766955177316713, "grad_norm": 22.572401118778046, "kl": 1.007623291015625, "learning_rate": 1.9644128023797273e-05, "loss": 0.0403, "reward": 1.025, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.9875, "step": 800 }, { "epoch": 0.1766955177316713, "eval_completion_length": 999.645, "eval_kl": 0.35353515625, "eval_loss": 0.0141792893409729, "eval_reward": 1.045, "eval_reward_std": 0.13435028612613678, "eval_rewards/accuracy_reward": 0.085, "eval_rewards/format_reward": 0.96, "eval_runtime": 263.5974, "eval_samples_per_second": 0.376, "eval_steps_per_second": 0.095, "step": 800 }, { "completion_length": 856.90625, "epoch": 0.17779986471749423, "grad_norm": 0.4670617314622781, "kl": 0.29603271484375, "learning_rate": 1.9633861953065648e-05, "loss": 0.0118, "reward": 0.9125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0125, "rewards/format_reward": 0.9, "step": 805 }, { "completion_length": 235.19375, "epoch": 0.1789042117033172, "grad_norm": 0.8767573552781158, "kl": 0.281640625, "learning_rate": 1.9623452664340305e-05, "loss": 0.0113, "reward": 0.95625, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.88125, "step": 810 }, { "completion_length": 194.6875, "epoch": 0.18000855868914012, "grad_norm": 0.8245609924387703, "kl": 0.235968017578125, "learning_rate": 1.9612900312366815e-05, "loss": 0.0094, "reward": 0.98125, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.89375, "step": 815 }, { "completion_length": 177.96875, "epoch": 0.18111290567496308, "grad_norm": 0.5059362794610306, "kl": 0.2334716796875, "learning_rate": 1.9602205054017534e-05, "loss": 0.0093, "reward": 1.04375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9625, "step": 820 }, { "completion_length": 217.3625, "epoch": 0.182217252660786, "grad_norm": 0.4835214879479257, "kl": 0.26324462890625, "learning_rate": 1.9591367048289297e-05, "loss": 0.0105, "reward": 1.01875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.975, "step": 825 }, { "completion_length": 221.90625, "epoch": 0.18332159964660896, "grad_norm": 0.6626328778695273, "kl": 0.28028564453125, "learning_rate": 1.9580386456301014e-05, "loss": 0.0112, "reward": 1.05, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.975, "step": 830 }, { "completion_length": 205.31875, "epoch": 0.18442594663243192, "grad_norm": 0.26541083822971634, "kl": 0.27823486328125, "learning_rate": 1.9569263441291312e-05, "loss": 0.0111, "reward": 1.00625, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.98125, "step": 835 }, { "completion_length": 206.03125, "epoch": 0.18553029361825485, "grad_norm": 0.12829507495547837, "kl": 0.29173583984375, "learning_rate": 1.9557998168616087e-05, "loss": 0.0117, "reward": 1.025, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.98125, "step": 840 }, { "completion_length": 197.2375, "epoch": 0.1866346406040778, "grad_norm": 0.632523566350679, "kl": 0.3269287109375, "learning_rate": 1.9546590805746054e-05, "loss": 0.0131, "reward": 1.0125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.96875, "step": 845 }, { "completion_length": 201.3125, "epoch": 0.18773898758990074, "grad_norm": 0.4218691032301411, "kl": 0.254644775390625, "learning_rate": 1.9535041522264256e-05, "loss": 0.0102, "reward": 1.03125, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.9875, "step": 850 }, { "completion_length": 215.075, "epoch": 0.1888433345757237, "grad_norm": 0.5263796297221296, "kl": 0.24737548828125, "learning_rate": 1.9523350489863545e-05, "loss": 0.0099, "reward": 1.01875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.98125, "step": 855 }, { "completion_length": 293.66875, "epoch": 0.18994768156154665, "grad_norm": 0.5106483317493103, "kl": 0.24151611328125, "learning_rate": 1.951151788234402e-05, "loss": 0.0097, "reward": 1.03125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "step": 860 }, { "completion_length": 341.4625, "epoch": 0.19105202854736958, "grad_norm": 0.3446610557866059, "kl": 0.21561279296875, "learning_rate": 1.949954387561046e-05, "loss": 0.0086, "reward": 1.05625, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 1.0, "step": 865 }, { "completion_length": 290.48125, "epoch": 0.19215637553319254, "grad_norm": 0.5188322122434208, "kl": 0.22645263671875, "learning_rate": 1.9487428647669688e-05, "loss": 0.0091, "reward": 1.05625, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9875, "step": 870 }, { "completion_length": 265.8, "epoch": 0.19326072251901547, "grad_norm": 0.36026950031545973, "kl": 0.2487060546875, "learning_rate": 1.947517237862795e-05, "loss": 0.0099, "reward": 1.05625, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 1.0, "step": 875 }, { "completion_length": 229.44375, "epoch": 0.19436506950483842, "grad_norm": 0.27419984769066713, "kl": 0.247833251953125, "learning_rate": 1.9462775250688208e-05, "loss": 0.0099, "reward": 1.05, "reward_std": 0.03535533845424652, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 1.0, "step": 880 }, { "completion_length": 242.05625, "epoch": 0.19546941649066138, "grad_norm": 0.22671689499182268, "kl": 0.2520751953125, "learning_rate": 1.9450237448147463e-05, "loss": 0.0101, "reward": 1.0625, "reward_std": 0.03535533845424652, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9875, "step": 885 }, { "completion_length": 250.46875, "epoch": 0.1965737634764843, "grad_norm": 0.6022152556248251, "kl": 0.26463623046875, "learning_rate": 1.943755915739399e-05, "loss": 0.0106, "reward": 1.03125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.95625, "step": 890 }, { "completion_length": 202.0625, "epoch": 0.19767811046230727, "grad_norm": 0.3986065730178507, "kl": 0.2731201171875, "learning_rate": 1.9424740566904572e-05, "loss": 0.0109, "reward": 1.01875, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.99375, "step": 895 }, { "completion_length": 199.31875, "epoch": 0.1987824574481302, "grad_norm": 0.3484175794006397, "kl": 0.2666259765625, "learning_rate": 1.9411781867241718e-05, "loss": 0.0107, "reward": 1.03125, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.9875, "step": 900 }, { "epoch": 0.1987824574481302, "eval_completion_length": 219.105, "eval_kl": 0.2623046875, "eval_loss": 0.010502400808036327, "eval_reward": 1.085, "eval_reward_std": 0.13435028612613678, "eval_rewards/accuracy_reward": 0.1, "eval_rewards/format_reward": 0.985, "eval_runtime": 94.3452, "eval_samples_per_second": 1.049, "eval_steps_per_second": 0.265, "step": 900 }, { "completion_length": 249.375, "epoch": 0.19988680443395315, "grad_norm": 0.11204663969136651, "kl": 0.28580322265625, "learning_rate": 1.9398683251050796e-05, "loss": 0.0114, "reward": 1.04375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.9875, "step": 905 }, { "completion_length": 258.64375, "epoch": 0.20099115141977608, "grad_norm": 0.38371933952741333, "kl": 0.28076171875, "learning_rate": 1.93854449130572e-05, "loss": 0.0112, "reward": 1.0625, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 910 }, { "completion_length": 246.2125, "epoch": 0.20209549840559904, "grad_norm": 0.596227033187015, "kl": 0.26915283203125, "learning_rate": 1.937206705006344e-05, "loss": 0.0108, "reward": 1.04375, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.9875, "step": 915 }, { "completion_length": 229.68125, "epoch": 0.203199845391422, "grad_norm": 1.0512533587389215, "kl": 0.27860107421875, "learning_rate": 1.9358549860946217e-05, "loss": 0.0111, "reward": 0.9875, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.925, "step": 920 }, { "completion_length": 193.76875, "epoch": 0.20430419237724493, "grad_norm": 0.3503850314920464, "kl": 0.26494140625, "learning_rate": 1.934489354665347e-05, "loss": 0.0106, "reward": 0.8375, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.79375, "step": 925 }, { "completion_length": 307.98125, "epoch": 0.20540853936306788, "grad_norm": 0.49849280178686406, "kl": 0.31822509765625, "learning_rate": 1.9331098310201392e-05, "loss": 0.0127, "reward": 0.99375, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.88125, "step": 930 }, { "completion_length": 149.59375, "epoch": 0.2065128863488908, "grad_norm": 0.7917539737118271, "kl": 0.3322509765625, "learning_rate": 1.9317164356671395e-05, "loss": 0.0133, "reward": 1.0375, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9625, "step": 935 }, { "completion_length": 168.63125, "epoch": 0.20761723333471377, "grad_norm": 0.6201527306472769, "kl": 0.41641845703125, "learning_rate": 1.930309189320709e-05, "loss": 0.0167, "reward": 1.06875, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.9625, "step": 940 }, { "completion_length": 200.69375, "epoch": 0.20872158032053673, "grad_norm": 0.48233327830154826, "kl": 0.333837890625, "learning_rate": 1.9288881129011177e-05, "loss": 0.0134, "reward": 1.1, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.975, "step": 945 }, { "completion_length": 202.0, "epoch": 0.20982592730635966, "grad_norm": 0.8281740363940345, "kl": 0.35965576171875, "learning_rate": 1.9274532275342355e-05, "loss": 0.0144, "reward": 1.075, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.96875, "step": 950 }, { "completion_length": 237.83125, "epoch": 0.2109302742921826, "grad_norm": 0.28772905337093585, "kl": 0.32879638671875, "learning_rate": 1.9260045545512174e-05, "loss": 0.0131, "reward": 1.05625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9875, "step": 955 }, { "completion_length": 260.9375, "epoch": 0.21203462127800554, "grad_norm": 0.5484628404079348, "kl": 0.340283203125, "learning_rate": 1.9245421154881873e-05, "loss": 0.0136, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.975, "step": 960 }, { "completion_length": 325.5625, "epoch": 0.2131389682638285, "grad_norm": 0.37202208226132655, "kl": 0.40030517578125, "learning_rate": 1.9230659320859157e-05, "loss": 0.016, "reward": 1.01875, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9875, "step": 965 }, { "completion_length": 249.5125, "epoch": 0.21424331524965143, "grad_norm": 0.37717614370685104, "kl": 0.35120849609375, "learning_rate": 1.9215760262894982e-05, "loss": 0.014, "reward": 1.00625, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.975, "step": 970 }, { "completion_length": 230.45625, "epoch": 0.21534766223547439, "grad_norm": 0.5086005905523352, "kl": 0.38538818359375, "learning_rate": 1.9200724202480305e-05, "loss": 0.0154, "reward": 1.025, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.96875, "step": 975 }, { "completion_length": 203.0125, "epoch": 0.21645200922129734, "grad_norm": 0.3935754886467958, "kl": 0.32562255859375, "learning_rate": 1.9185551363142754e-05, "loss": 0.013, "reward": 1.0, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "step": 980 }, { "completion_length": 177.90625, "epoch": 0.21755635620712027, "grad_norm": 0.4203555792398308, "kl": 0.29302978515625, "learning_rate": 1.9170241970443344e-05, "loss": 0.0117, "reward": 1.01875, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.99375, "step": 985 }, { "completion_length": 149.94375, "epoch": 0.21866070319294323, "grad_norm": 0.26776293393774087, "kl": 0.3010009765625, "learning_rate": 1.9154796251973092e-05, "loss": 0.012, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.98125, "step": 990 }, { "completion_length": 171.49375, "epoch": 0.21976505017876616, "grad_norm": 0.5930942674995531, "kl": 0.30377197265625, "learning_rate": 1.9139214437349663e-05, "loss": 0.0121, "reward": 1.025, "reward_std": 0.03535533845424652, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.99375, "step": 995 }, { "completion_length": 182.9125, "epoch": 0.22086939716458912, "grad_norm": 0.5327664873628459, "kl": 0.2912841796875, "learning_rate": 1.9123496758213926e-05, "loss": 0.0117, "reward": 1.06875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9875, "step": 1000 }, { "epoch": 0.22086939716458912, "eval_completion_length": 214.23, "eval_kl": 0.269765625, "eval_loss": 0.01077475119382143, "eval_reward": 1.075, "eval_reward_std": 0.1767766922712326, "eval_rewards/accuracy_reward": 0.105, "eval_rewards/format_reward": 0.97, "eval_runtime": 90.4327, "eval_samples_per_second": 1.095, "eval_steps_per_second": 0.276, "step": 1000 }, { "completion_length": 192.26875, "epoch": 0.22197374415041207, "grad_norm": 0.607671116808556, "kl": 0.29041748046875, "learning_rate": 1.9107643448226536e-05, "loss": 0.0116, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.975, "step": 1005 }, { "completion_length": 233.1875, "epoch": 0.223078091136235, "grad_norm": 0.42966284071874145, "kl": 4.75654296875, "learning_rate": 1.909165474306445e-05, "loss": 0.1909, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.975, "step": 1010 }, { "completion_length": 451.7875, "epoch": 0.22418243812205796, "grad_norm": 0.2531114981656375, "kl": 0.275, "learning_rate": 1.9075530880417422e-05, "loss": 0.011, "reward": 0.925, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.9, "step": 1015 }, { "completion_length": 535.35625, "epoch": 0.2252867851078809, "grad_norm": 0.4277427642056076, "kl": 0.28232421875, "learning_rate": 1.905927209998447e-05, "loss": 0.0113, "reward": 0.89375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.83125, "step": 1020 }, { "completion_length": 319.4625, "epoch": 0.22639113209370385, "grad_norm": 0.44601881214583666, "kl": 0.30135498046875, "learning_rate": 1.9042878643470313e-05, "loss": 0.0121, "reward": 1.00625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.98125, "step": 1025 }, { "completion_length": 219.5, "epoch": 0.22749547907952677, "grad_norm": 0.42697703684794175, "kl": 0.3317626953125, "learning_rate": 1.9026350754581782e-05, "loss": 0.0133, "reward": 1.0125, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.95, "step": 1030 }, { "completion_length": 203.14375, "epoch": 0.22859982606534973, "grad_norm": 0.2565664818941694, "kl": 0.315625, "learning_rate": 1.900968867902419e-05, "loss": 0.0126, "reward": 0.93125, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.88125, "step": 1035 }, { "completion_length": 202.94375, "epoch": 0.2297041730511727, "grad_norm": 0.42814411154309545, "kl": 0.31634521484375, "learning_rate": 1.8992892664497693e-05, "loss": 0.0127, "reward": 1.06875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.99375, "step": 1040 }, { "completion_length": 205.91875, "epoch": 0.23080852003699562, "grad_norm": 0.43764023473742697, "kl": 0.2983154296875, "learning_rate": 1.897596296069358e-05, "loss": 0.0119, "reward": 1.05, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9875, "step": 1045 }, { "completion_length": 196.5375, "epoch": 0.23191286702281858, "grad_norm": 0.5131717281451017, "kl": 0.27991943359375, "learning_rate": 1.8958899819290592e-05, "loss": 0.0112, "reward": 1.025, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.98125, "step": 1050 }, { "completion_length": 205.26875, "epoch": 0.2330172140086415, "grad_norm": 0.32785012231096045, "kl": 0.290869140625, "learning_rate": 1.8941703493951163e-05, "loss": 0.0116, "reward": 1.075, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.99375, "step": 1055 }, { "completion_length": 231.63125, "epoch": 0.23412156099446446, "grad_norm": 0.5254694645789388, "kl": 0.28807373046875, "learning_rate": 1.892437424031766e-05, "loss": 0.0115, "reward": 1.05625, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.99375, "step": 1060 }, { "completion_length": 242.0375, "epoch": 0.23522590798028742, "grad_norm": 0.5026068127158217, "kl": 0.29200439453125, "learning_rate": 1.890691231600856e-05, "loss": 0.0117, "reward": 1.06875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.9625, "step": 1065 }, { "completion_length": 246.08125, "epoch": 0.23633025496611035, "grad_norm": 0.13203150585306936, "kl": 0.322235107421875, "learning_rate": 1.8889317980614653e-05, "loss": 0.0129, "reward": 1.075, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.96875, "step": 1070 }, { "completion_length": 224.4125, "epoch": 0.2374346019519333, "grad_norm": 0.41058165320035295, "kl": 0.354974365234375, "learning_rate": 1.8871591495695156e-05, "loss": 0.0142, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.975, "step": 1075 }, { "completion_length": 202.43125, "epoch": 0.23853894893775623, "grad_norm": 0.5981781477284894, "kl": 0.3003173828125, "learning_rate": 1.8853733124773837e-05, "loss": 0.012, "reward": 1.025, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.98125, "step": 1080 }, { "completion_length": 170.225, "epoch": 0.2396432959235792, "grad_norm": 0.4268507688185011, "kl": 0.301025390625, "learning_rate": 1.8835743133335096e-05, "loss": 0.012, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9875, "step": 1085 }, { "completion_length": 175.80625, "epoch": 0.24074764290940212, "grad_norm": 0.5747423342289615, "kl": 0.3157562255859375, "learning_rate": 1.8817621788820017e-05, "loss": 0.0126, "reward": 1.0125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9625, "step": 1090 }, { "completion_length": 167.35, "epoch": 0.24185198989522508, "grad_norm": 0.519815561291836, "kl": 0.32388916015625, "learning_rate": 1.8799369360622394e-05, "loss": 0.013, "reward": 1.08125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.99375, "step": 1095 }, { "completion_length": 218.21875, "epoch": 0.24295633688104804, "grad_norm": 0.5333785220488585, "kl": 0.36580810546875, "learning_rate": 1.8780986120084715e-05, "loss": 0.0146, "reward": 1.05, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.975, "step": 1100 }, { "epoch": 0.24295633688104804, "eval_completion_length": 262.785, "eval_kl": 0.3383984375, "eval_loss": 0.013547366484999657, "eval_reward": 1.03, "eval_reward_std": 0.15556348919868468, "eval_rewards/accuracy_reward": 0.075, "eval_rewards/format_reward": 0.955, "eval_runtime": 118.9008, "eval_samples_per_second": 0.833, "eval_steps_per_second": 0.21, "step": 1100 }, { "completion_length": 226.5375, "epoch": 0.24406068386687096, "grad_norm": 0.6802013609013249, "kl": 0.3069580078125, "learning_rate": 1.876247234049416e-05, "loss": 0.0123, "reward": 1.05, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.975, "step": 1105 }, { "completion_length": 264.125, "epoch": 0.24516503085269392, "grad_norm": 0.9555057313081333, "kl": 0.42034912109375, "learning_rate": 1.8743828297078485e-05, "loss": 0.0168, "reward": 0.95625, "reward_std": 0.23864853456616403, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.8875, "step": 1110 }, { "completion_length": 204.44375, "epoch": 0.24626937783851685, "grad_norm": 0.7866089087734848, "kl": 0.45830078125, "learning_rate": 1.8725054267001992e-05, "loss": 0.0183, "reward": 0.7625, "reward_std": 0.30052037686109545, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.725, "step": 1115 }, { "completion_length": 170.48125, "epoch": 0.2473737248243398, "grad_norm": 0.4917046420126656, "kl": 0.535284423828125, "learning_rate": 1.8706150529361355e-05, "loss": 0.0214, "reward": 0.825, "reward_std": 0.30052037686109545, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.74375, "step": 1120 }, { "completion_length": 170.68125, "epoch": 0.24847807181016277, "grad_norm": 0.4598219209937343, "kl": 0.50391845703125, "learning_rate": 1.8687117365181514e-05, "loss": 0.0202, "reward": 0.90625, "reward_std": 0.23864853456616403, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.8625, "step": 1125 }, { "completion_length": 226.35, "epoch": 0.2495824187959857, "grad_norm": 0.729450452143476, "kl": 0.4862548828125, "learning_rate": 1.8667955057411454e-05, "loss": 0.0195, "reward": 0.9375, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.85625, "step": 1130 }, { "completion_length": 199.03125, "epoch": 0.25068676578180865, "grad_norm": 0.9720760882388615, "kl": 0.467034912109375, "learning_rate": 1.864866389092005e-05, "loss": 0.0187, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.9125, "step": 1135 }, { "completion_length": 213.5875, "epoch": 0.2517911127676316, "grad_norm": 0.6415965551575933, "kl": 0.8406494140625, "learning_rate": 1.8629244152491773e-05, "loss": 0.0336, "reward": 0.96875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.9125, "step": 1140 }, { "completion_length": 264.75, "epoch": 0.2528954597534545, "grad_norm": 3.4377172313710402, "kl": 1.02249755859375, "learning_rate": 1.860969613082249e-05, "loss": 0.0409, "reward": 0.95, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9, "step": 1145 }, { "completion_length": 262.26875, "epoch": 0.25399980673927747, "grad_norm": 0.5339383124761619, "kl": 1.24478759765625, "learning_rate": 1.8590020116515116e-05, "loss": 0.0496, "reward": 1.01875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.94375, "step": 1150 }, { "completion_length": 271.375, "epoch": 0.2551041537251004, "grad_norm": 1.6266177185437845, "kl": 0.5312255859375, "learning_rate": 1.8570216402075326e-05, "loss": 0.0213, "reward": 1.0125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.925, "step": 1155 }, { "completion_length": 274.64375, "epoch": 0.2562085007109234, "grad_norm": 1.0477658800658394, "kl": 0.89627685546875, "learning_rate": 1.8550285281907198e-05, "loss": 0.0358, "reward": 1.0, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.91875, "step": 1160 }, { "completion_length": 262.7, "epoch": 0.25731284769674634, "grad_norm": 0.4264557263560298, "kl": 0.498876953125, "learning_rate": 1.8530227052308843e-05, "loss": 0.0199, "reward": 1.0125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9375, "step": 1165 }, { "completion_length": 199.46875, "epoch": 0.25841719468256924, "grad_norm": 0.6299395922753566, "kl": 0.44912109375, "learning_rate": 1.8510042011467978e-05, "loss": 0.018, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.96875, "step": 1170 }, { "completion_length": 222.3625, "epoch": 0.2595215416683922, "grad_norm": 0.529042632954111, "kl": 0.43572998046875, "learning_rate": 1.848973045945753e-05, "loss": 0.0174, "reward": 1.00625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.94375, "step": 1175 }, { "completion_length": 209.43125, "epoch": 0.26062588865421515, "grad_norm": 0.7827952040899112, "kl": 0.83447265625, "learning_rate": 1.8469292698231137e-05, "loss": 0.0335, "reward": 1.05, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.95625, "step": 1180 }, { "completion_length": 153.6875, "epoch": 0.2617302356400381, "grad_norm": 0.42454488448610755, "kl": 0.35516357421875, "learning_rate": 1.8448729031618687e-05, "loss": 0.0142, "reward": 1.04375, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.96875, "step": 1185 }, { "completion_length": 174.7125, "epoch": 0.26283458262586107, "grad_norm": 0.37149930187993413, "kl": 0.365673828125, "learning_rate": 1.8428039765321783e-05, "loss": 0.0146, "reward": 1.0375, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.96875, "step": 1190 }, { "completion_length": 265.21875, "epoch": 0.26393892961168397, "grad_norm": 0.66806143396866, "kl": 0.3682373046875, "learning_rate": 1.840722520690921e-05, "loss": 0.0147, "reward": 0.96875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.90625, "step": 1195 }, { "completion_length": 270.75, "epoch": 0.2650432765975069, "grad_norm": 0.62491488989135, "kl": 0.37822265625, "learning_rate": 1.838628566581236e-05, "loss": 0.0151, "reward": 0.91875, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.8375, "step": 1200 }, { "epoch": 0.2650432765975069, "eval_completion_length": 337.06, "eval_kl": 0.4419140625, "eval_loss": 0.0176791213452816, "eval_reward": 0.905, "eval_reward_std": 0.3181980448961258, "eval_rewards/accuracy_reward": 0.085, "eval_rewards/format_reward": 0.82, "eval_runtime": 175.1338, "eval_samples_per_second": 0.565, "eval_steps_per_second": 0.143, "step": 1200 }, { "completion_length": 257.49375, "epoch": 0.2661476235833299, "grad_norm": 0.83147670526204, "kl": 0.35118408203125, "learning_rate": 1.8365221453320625e-05, "loss": 0.014, "reward": 0.9125, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.84375, "step": 1205 }, { "completion_length": 223.31875, "epoch": 0.26725197056915284, "grad_norm": 0.24158167072477596, "kl": 0.3875, "learning_rate": 1.8344032882576784e-05, "loss": 0.0155, "reward": 0.95625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.90625, "step": 1210 }, { "completion_length": 163.0, "epoch": 0.2683563175549758, "grad_norm": 0.6479529739729771, "kl": 0.405072021484375, "learning_rate": 1.8322720268572333e-05, "loss": 0.0162, "reward": 0.99375, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.9375, "step": 1215 }, { "completion_length": 292.16875, "epoch": 0.2694606645407987, "grad_norm": 1.7491765816716196, "kl": 0.540771484375, "learning_rate": 1.83012839281428e-05, "loss": 0.0216, "reward": 0.95, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.8875, "step": 1220 }, { "completion_length": 358.31875, "epoch": 0.27056501152662166, "grad_norm": 0.5769509244571842, "kl": 0.5750244140625, "learning_rate": 1.827972417996306e-05, "loss": 0.023, "reward": 0.8375, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.7875, "step": 1225 }, { "completion_length": 263.51875, "epoch": 0.2716693585124446, "grad_norm": 0.47084507783132956, "kl": 0.51199951171875, "learning_rate": 1.8258041344542567e-05, "loss": 0.0205, "reward": 0.89375, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.875, "step": 1230 }, { "completion_length": 145.9875, "epoch": 0.27277370549826757, "grad_norm": 0.6657898991831224, "kl": 0.52864990234375, "learning_rate": 1.823623574422061e-05, "loss": 0.0212, "reward": 1.025, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9625, "step": 1235 }, { "completion_length": 151.34375, "epoch": 0.27387805248409053, "grad_norm": 0.3173979928590889, "kl": 0.4265869140625, "learning_rate": 1.821430770316151e-05, "loss": 0.0171, "reward": 1.0125, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.96875, "step": 1240 }, { "completion_length": 583.2, "epoch": 0.27498239946991343, "grad_norm": 0.3100947155151386, "kl": 0.40513916015625, "learning_rate": 1.8192257547349805e-05, "loss": 0.0162, "reward": 0.65625, "reward_std": 0.2916815422475338, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.625, "step": 1245 }, { "completion_length": 315.43125, "epoch": 0.2760867464557364, "grad_norm": 0.1782330419853091, "kl": 0.51036376953125, "learning_rate": 1.817008560458541e-05, "loss": 0.0204, "reward": 0.84375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8125, "step": 1250 }, { "completion_length": 87.08125, "epoch": 0.27719109344155934, "grad_norm": 0.2736695226581386, "kl": 0.59683837890625, "learning_rate": 1.814779220447872e-05, "loss": 0.0239, "reward": 0.9625, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.9375, "step": 1255 }, { "completion_length": 124.04375, "epoch": 0.2782954404273823, "grad_norm": 0.4315634020248374, "kl": 0.48404541015625, "learning_rate": 1.8125377678445755e-05, "loss": 0.0194, "reward": 1.01875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9375, "step": 1260 }, { "completion_length": 242.1875, "epoch": 0.2793997874132052, "grad_norm": 0.6754023429831525, "kl": 0.5489013671875, "learning_rate": 1.8102842359703177e-05, "loss": 0.022, "reward": 0.9375, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.8875, "step": 1265 }, { "completion_length": 154.88125, "epoch": 0.28050413439902816, "grad_norm": 0.7500021389013647, "kl": 0.44857177734375, "learning_rate": 1.8080186583263386e-05, "loss": 0.018, "reward": 0.9875, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9375, "step": 1270 }, { "completion_length": 138.74375, "epoch": 0.2816084813848511, "grad_norm": 0.6511896035985808, "kl": 0.660400390625, "learning_rate": 1.8057410685929505e-05, "loss": 0.0264, "reward": 1.01875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.96875, "step": 1275 }, { "completion_length": 278.21875, "epoch": 0.2827128283706741, "grad_norm": 0.37768724134381376, "kl": 0.4253173828125, "learning_rate": 1.8034515006290398e-05, "loss": 0.017, "reward": 0.9625, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.9375, "step": 1280 }, { "completion_length": 306.81875, "epoch": 0.28381717535649703, "grad_norm": 0.29452808394209296, "kl": 0.38458251953125, "learning_rate": 1.8011499884715616e-05, "loss": 0.0154, "reward": 1.0375, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.95625, "step": 1285 }, { "completion_length": 331.20625, "epoch": 0.28492152234231993, "grad_norm": 0.40009340612327654, "kl": 0.40555419921875, "learning_rate": 1.7988365663350352e-05, "loss": 0.0162, "reward": 1.00625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9375, "step": 1290 }, { "completion_length": 291.39375, "epoch": 0.2860258693281429, "grad_norm": 0.2398401379002744, "kl": 0.3625, "learning_rate": 1.7965112686110346e-05, "loss": 0.0145, "reward": 0.96875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.93125, "step": 1295 }, { "completion_length": 210.84375, "epoch": 0.28713021631396585, "grad_norm": 0.16393982679750796, "kl": 0.36951904296875, "learning_rate": 1.7941741298676777e-05, "loss": 0.0148, "reward": 1.0125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.96875, "step": 1300 }, { "epoch": 0.28713021631396585, "eval_completion_length": 168.92, "eval_kl": 0.39515625, "eval_loss": 0.01582903414964676, "eval_reward": 1.05, "eval_reward_std": 0.11313708305358887, "eval_rewards/accuracy_reward": 0.07, "eval_rewards/format_reward": 0.98, "eval_runtime": 82.4657, "eval_samples_per_second": 1.2, "eval_steps_per_second": 0.303, "step": 1300 }, { "completion_length": 157.2875, "epoch": 0.2882345632997888, "grad_norm": 0.5544518957919585, "kl": 0.35565185546875, "learning_rate": 1.7918251848491118e-05, "loss": 0.0142, "reward": 1.00625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.9875, "step": 1305 }, { "completion_length": 143.9625, "epoch": 0.28933891028561176, "grad_norm": 1.0694156042827718, "kl": 0.36668701171875, "learning_rate": 1.7894644684749983e-05, "loss": 0.0147, "reward": 1.05, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.99375, "step": 1310 }, { "completion_length": 129.1875, "epoch": 0.29044325727143466, "grad_norm": 0.7797356796661203, "kl": 0.363818359375, "learning_rate": 1.7870920158399918e-05, "loss": 0.0146, "reward": 1.075, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.96875, "step": 1315 }, { "completion_length": 107.58125, "epoch": 0.2915476042572576, "grad_norm": 0.42231572420920943, "kl": 0.415740966796875, "learning_rate": 1.7847078622132202e-05, "loss": 0.0166, "reward": 1.0, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.95, "step": 1320 }, { "completion_length": 137.425, "epoch": 0.2926519512430806, "grad_norm": 0.31138070158067094, "kl": 0.4673095703125, "learning_rate": 1.7823120430377593e-05, "loss": 0.0187, "reward": 1.0, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.975, "step": 1325 }, { "completion_length": 169.975, "epoch": 0.29375629822890353, "grad_norm": 0.7054962096277919, "kl": 0.4247802734375, "learning_rate": 1.7799045939301063e-05, "loss": 0.017, "reward": 1.04375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.98125, "step": 1330 }, { "completion_length": 199.325, "epoch": 0.2948606452147265, "grad_norm": 0.5371054293908584, "kl": 0.4145263671875, "learning_rate": 1.7774855506796497e-05, "loss": 0.0166, "reward": 1.06875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.98125, "step": 1335 }, { "completion_length": 285.3625, "epoch": 0.2959649922005494, "grad_norm": 0.49107133614596143, "kl": 0.38699951171875, "learning_rate": 1.775054949248138e-05, "loss": 0.0155, "reward": 1.0625, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95, "step": 1340 }, { "completion_length": 195.01875, "epoch": 0.29706933918637235, "grad_norm": 0.18854873920164167, "kl": 0.401611328125, "learning_rate": 1.7726128257691447e-05, "loss": 0.0161, "reward": 1.08125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9875, "step": 1345 }, { "completion_length": 144.2625, "epoch": 0.2981736861721953, "grad_norm": 0.194977538009497, "kl": 0.3897705078125, "learning_rate": 1.770159216547532e-05, "loss": 0.0156, "reward": 1.06875, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.99375, "step": 1350 }, { "completion_length": 142.68125, "epoch": 0.29927803315801826, "grad_norm": 0.2721232391131242, "kl": 0.4391845703125, "learning_rate": 1.7676941580589097e-05, "loss": 0.0176, "reward": 1.01875, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9875, "step": 1355 }, { "completion_length": 141.6625, "epoch": 0.3003823801438412, "grad_norm": 0.12367929505701036, "kl": 0.4060302734375, "learning_rate": 1.7652176869490933e-05, "loss": 0.0162, "reward": 1.0375, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.98125, "step": 1360 }, { "completion_length": 147.54375, "epoch": 0.3014867271296641, "grad_norm": 0.1722353049345247, "kl": 0.43369140625, "learning_rate": 1.76272984003356e-05, "loss": 0.0173, "reward": 1.025, "reward_std": 0.03535533845424652, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 1.0, "step": 1365 }, { "completion_length": 161.99375, "epoch": 0.3025910741154871, "grad_norm": 0.6029005445006613, "kl": 0.47828369140625, "learning_rate": 1.7602306542969006e-05, "loss": 0.0191, "reward": 0.975, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.95625, "step": 1370 }, { "completion_length": 300.3375, "epoch": 0.30369542110131004, "grad_norm": 0.6594210074645137, "kl": 0.5243896484375, "learning_rate": 1.7577201668922702e-05, "loss": 0.021, "reward": 0.9, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.875, "step": 1375 }, { "completion_length": 235.9625, "epoch": 0.304799768087133, "grad_norm": 0.42920157132224784, "kl": 0.418603515625, "learning_rate": 1.7551984151408363e-05, "loss": 0.0167, "reward": 0.9125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.8875, "step": 1380 }, { "completion_length": 148.1375, "epoch": 0.3059041150729559, "grad_norm": 0.6394391510717589, "kl": 0.38023681640625, "learning_rate": 1.7526654365312222e-05, "loss": 0.0152, "reward": 1.06875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.99375, "step": 1385 }, { "completion_length": 130.94375, "epoch": 0.30700846205877885, "grad_norm": 0.38449234338266886, "kl": 0.381109619140625, "learning_rate": 1.750121268718951e-05, "loss": 0.0152, "reward": 1.025, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.975, "step": 1390 }, { "completion_length": 134.8125, "epoch": 0.3081128090446018, "grad_norm": 0.32691206341821716, "kl": 0.37451171875, "learning_rate": 1.7475659495258864e-05, "loss": 0.015, "reward": 1.00625, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.9625, "step": 1395 }, { "completion_length": 152.13125, "epoch": 0.30921715603042477, "grad_norm": 0.5233437303073457, "kl": 0.39224853515625, "learning_rate": 1.7449995169396693e-05, "loss": 0.0157, "reward": 0.99375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.93125, "step": 1400 }, { "epoch": 0.30921715603042477, "eval_completion_length": 140.44, "eval_kl": 0.42287109375, "eval_loss": 0.016920818015933037, "eval_reward": 1.07, "eval_reward_std": 0.18384775936603545, "eval_rewards/accuracy_reward": 0.115, "eval_rewards/format_reward": 0.955, "eval_runtime": 81.6042, "eval_samples_per_second": 1.213, "eval_steps_per_second": 0.306, "step": 1400 }, { "completion_length": 130.34375, "epoch": 0.3103215030162477, "grad_norm": 0.28398981693339564, "kl": 0.38331298828125, "learning_rate": 1.7424220091131536e-05, "loss": 0.0153, "reward": 1.01875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.95625, "step": 1405 }, { "completion_length": 180.325, "epoch": 0.3114258500020706, "grad_norm": 0.6590820065232371, "kl": 0.417919921875, "learning_rate": 1.739833464363838e-05, "loss": 0.0167, "reward": 0.96875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.91875, "step": 1410 }, { "completion_length": 174.08125, "epoch": 0.3125301969878936, "grad_norm": 0.19303045686565795, "kl": 0.37972412109375, "learning_rate": 1.7372339211732988e-05, "loss": 0.0152, "reward": 1.0125, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.95, "step": 1415 }, { "completion_length": 208.6375, "epoch": 0.31363454397371654, "grad_norm": 1.2704281522495193, "kl": 0.3710205078125, "learning_rate": 1.734623418186615e-05, "loss": 0.0148, "reward": 0.99375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9125, "step": 1420 }, { "completion_length": 208.275, "epoch": 0.3147388909595395, "grad_norm": 0.4017752883432233, "kl": 0.42412109375, "learning_rate": 1.7320019942117954e-05, "loss": 0.017, "reward": 0.99375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.94375, "step": 1425 }, { "completion_length": 263.475, "epoch": 0.31584323794536245, "grad_norm": 0.6704877034578468, "kl": 0.443798828125, "learning_rate": 1.729369688219202e-05, "loss": 0.0178, "reward": 0.98125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9, "step": 1430 }, { "completion_length": 268.4625, "epoch": 0.31694758493118536, "grad_norm": 1.1806288895312564, "kl": 0.475146484375, "learning_rate": 1.7267265393409684e-05, "loss": 0.019, "reward": 0.9875, "reward_std": 0.24748736917972564, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.89375, "step": 1435 }, { "completion_length": 239.40625, "epoch": 0.3180519319170083, "grad_norm": 0.7720008549372368, "kl": 0.47855224609375, "learning_rate": 1.7240725868704218e-05, "loss": 0.0192, "reward": 0.98125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9125, "step": 1440 }, { "completion_length": 209.23125, "epoch": 0.31915627890283127, "grad_norm": 0.6559969920402181, "kl": 0.36781005859375, "learning_rate": 1.7214078702614946e-05, "loss": 0.0147, "reward": 1.0, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.93125, "step": 1445 }, { "completion_length": 181.79375, "epoch": 0.3202606258886542, "grad_norm": 1.033869102520755, "kl": 0.44471435546875, "learning_rate": 1.7187324291281423e-05, "loss": 0.0178, "reward": 0.95625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.925, "step": 1450 }, { "completion_length": 144.8375, "epoch": 0.3213649728744772, "grad_norm": 0.23531024361520275, "kl": 0.55146484375, "learning_rate": 1.71604630324375e-05, "loss": 0.0221, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.975, "step": 1455 }, { "completion_length": 152.3375, "epoch": 0.3224693198603001, "grad_norm": 0.521389309576663, "kl": 0.3604736328125, "learning_rate": 1.7133495325405448e-05, "loss": 0.0144, "reward": 1.05, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9875, "step": 1460 }, { "completion_length": 147.61875, "epoch": 0.32357366684612304, "grad_norm": 0.5218385357295671, "kl": 0.3684814453125, "learning_rate": 1.7106421571090003e-05, "loss": 0.0147, "reward": 1.03125, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.99375, "step": 1465 }, { "completion_length": 146.7625, "epoch": 0.324678013831946, "grad_norm": 0.40670196743586623, "kl": 0.383203125, "learning_rate": 1.7079242171972417e-05, "loss": 0.0153, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.98125, "step": 1470 }, { "completion_length": 183.4125, "epoch": 0.32578236081776896, "grad_norm": 0.17326012835635307, "kl": 0.40433349609375, "learning_rate": 1.705195753210446e-05, "loss": 0.0162, "reward": 1.05, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.99375, "step": 1475 }, { "completion_length": 252.45, "epoch": 0.3268867078035919, "grad_norm": 0.33618781399155934, "kl": 0.43258056640625, "learning_rate": 1.7024568057102423e-05, "loss": 0.0173, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.975, "step": 1480 }, { "completion_length": 316.8125, "epoch": 0.3279910547894148, "grad_norm": 0.6985966336266197, "kl": 0.4880126953125, "learning_rate": 1.6997074154141097e-05, "loss": 0.0195, "reward": 1.0625, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9875, "step": 1485 }, { "completion_length": 227.04375, "epoch": 0.3290954017752378, "grad_norm": 0.2716042884976899, "kl": 0.42822265625, "learning_rate": 1.69694762319477e-05, "loss": 0.0171, "reward": 1.06875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.98125, "step": 1490 }, { "completion_length": 246.30625, "epoch": 0.33019974876106073, "grad_norm": 0.34216236944018125, "kl": 0.432666015625, "learning_rate": 1.694177470079581e-05, "loss": 0.0173, "reward": 0.9875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.95, "step": 1495 }, { "completion_length": 319.29375, "epoch": 0.3313040957468837, "grad_norm": 0.3073497949162371, "kl": 0.4350341796875, "learning_rate": 1.6913969972499272e-05, "loss": 0.0174, "reward": 1.025, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9625, "step": 1500 }, { "epoch": 0.3313040957468837, "eval_completion_length": 508.6, "eval_kl": 0.5653125, "eval_loss": 0.022641615942120552, "eval_reward": 0.97, "eval_reward_std": 0.15556348919868468, "eval_rewards/accuracy_reward": 0.06, "eval_rewards/format_reward": 0.91, "eval_runtime": 242.1008, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.103, "step": 1500 }, { "completion_length": 219.5875, "epoch": 0.33240844273270664, "grad_norm": 0.5738378890078689, "kl": 0.447314453125, "learning_rate": 1.688606246040607e-05, "loss": 0.0179, "reward": 1.01875, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.90625, "step": 1505 }, { "completion_length": 175.0, "epoch": 0.33351278971852955, "grad_norm": 0.4329938576388711, "kl": 0.36485595703125, "learning_rate": 1.6858052579392182e-05, "loss": 0.0146, "reward": 1.06875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 1.0, "step": 1510 }, { "completion_length": 212.8375, "epoch": 0.3346171367043525, "grad_norm": 0.3974460378419368, "kl": 0.3696533203125, "learning_rate": 1.682994074585541e-05, "loss": 0.0148, "reward": 0.95625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.925, "step": 1515 }, { "completion_length": 198.4125, "epoch": 0.33572148369017546, "grad_norm": 0.6339047206651848, "kl": 0.3900634765625, "learning_rate": 1.6801727377709195e-05, "loss": 0.0156, "reward": 0.96875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.925, "step": 1520 }, { "completion_length": 175.4, "epoch": 0.3368258306759984, "grad_norm": 0.31233984339595194, "kl": 0.36982421875, "learning_rate": 1.6773412894376404e-05, "loss": 0.0148, "reward": 0.98125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.95625, "step": 1525 }, { "completion_length": 162.075, "epoch": 0.3379301776618213, "grad_norm": 0.41593882245992403, "kl": 0.3514892578125, "learning_rate": 1.674499771678309e-05, "loss": 0.0141, "reward": 1.01875, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.98125, "step": 1530 }, { "completion_length": 146.81875, "epoch": 0.3390345246476443, "grad_norm": 0.6916723408968213, "kl": 0.4715576171875, "learning_rate": 1.6716482267352234e-05, "loss": 0.0189, "reward": 1.04375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.9375, "step": 1535 }, { "completion_length": 146.05625, "epoch": 0.34013887163346723, "grad_norm": 0.16801257159790053, "kl": 0.4378662109375, "learning_rate": 1.6687866969997483e-05, "loss": 0.0175, "reward": 1.0, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.95, "step": 1540 }, { "completion_length": 158.125, "epoch": 0.3412432186192902, "grad_norm": 0.3924153520384322, "kl": 0.3984375, "learning_rate": 1.665915225011681e-05, "loss": 0.0159, "reward": 1.00625, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.96875, "step": 1545 }, { "completion_length": 152.525, "epoch": 0.34234756560511315, "grad_norm": 0.2188171820439607, "kl": 0.3915771484375, "learning_rate": 1.663033853458624e-05, "loss": 0.0157, "reward": 1.0, "reward_std": 0.03535533845424652, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.98125, "step": 1550 }, { "completion_length": 185.0, "epoch": 0.34345191259093605, "grad_norm": 0.2492866797409777, "kl": 0.446630859375, "learning_rate": 1.660142625175346e-05, "loss": 0.0179, "reward": 1.0375, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.98125, "step": 1555 }, { "completion_length": 197.925, "epoch": 0.344556259576759, "grad_norm": 0.43125503310433044, "kl": 0.417333984375, "learning_rate": 1.6572415831431466e-05, "loss": 0.0167, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.975, "step": 1560 }, { "completion_length": 231.45, "epoch": 0.34566060656258196, "grad_norm": 0.547580901229839, "kl": 0.4208251953125, "learning_rate": 1.6543307704892196e-05, "loss": 0.0168, "reward": 1.0125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.95, "step": 1565 }, { "completion_length": 210.36875, "epoch": 0.3467649535484049, "grad_norm": 0.30578684489167307, "kl": 0.40220947265625, "learning_rate": 1.6514102304860077e-05, "loss": 0.0161, "reward": 1.01875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.98125, "step": 1570 }, { "completion_length": 169.575, "epoch": 0.3478693005342279, "grad_norm": 0.3714599755051663, "kl": 0.4043701171875, "learning_rate": 1.6484800065505627e-05, "loss": 0.0162, "reward": 1.01875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.98125, "step": 1575 }, { "completion_length": 157.6625, "epoch": 0.3489736475200508, "grad_norm": 1.1408284138746587, "kl": 0.51844482421875, "learning_rate": 1.6455401422438984e-05, "loss": 0.0207, "reward": 1.0375, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.96875, "step": 1580 }, { "completion_length": 131.2875, "epoch": 0.35007799450587374, "grad_norm": 0.47193321313326936, "kl": 0.4167236328125, "learning_rate": 1.6425906812703435e-05, "loss": 0.0167, "reward": 1.08125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.975, "step": 1585 }, { "completion_length": 194.43125, "epoch": 0.3511823414916967, "grad_norm": 0.723120589080064, "kl": 0.4700439453125, "learning_rate": 1.6396316674768914e-05, "loss": 0.0188, "reward": 0.99375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.925, "step": 1590 }, { "completion_length": 216.175, "epoch": 0.35228668847751965, "grad_norm": 0.4975629560332776, "kl": 0.42794189453125, "learning_rate": 1.6366631448525486e-05, "loss": 0.0171, "reward": 1.075, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.95625, "step": 1595 }, { "completion_length": 195.9375, "epoch": 0.3533910354633426, "grad_norm": 0.33985255338891107, "kl": 0.3559814453125, "learning_rate": 1.6336851575276814e-05, "loss": 0.0142, "reward": 1.05, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9875, "step": 1600 }, { "epoch": 0.3533910354633426, "eval_completion_length": 231.0, "eval_kl": 0.735625, "eval_loss": 0.02945670112967491, "eval_reward": 1.05, "eval_reward_std": 0.15556348919868468, "eval_rewards/accuracy_reward": 0.1, "eval_rewards/format_reward": 0.95, "eval_runtime": 111.0663, "eval_samples_per_second": 0.891, "eval_steps_per_second": 0.225, "step": 1600 }, { "completion_length": 231.275, "epoch": 0.3544953824491655, "grad_norm": 0.595242649626797, "kl": 0.4072265625, "learning_rate": 1.630697749773359e-05, "loss": 0.0163, "reward": 1.05, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.96875, "step": 1605 }, { "completion_length": 278.25, "epoch": 0.35559972943498847, "grad_norm": 0.4801274687526583, "kl": 0.40982666015625, "learning_rate": 1.627700966000696e-05, "loss": 0.0164, "reward": 1.025, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.95, "step": 1610 }, { "completion_length": 260.71875, "epoch": 0.3567040764208114, "grad_norm": 0.29704464145114623, "kl": 0.3713134765625, "learning_rate": 1.6246948507601915e-05, "loss": 0.0149, "reward": 1.025, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.94375, "step": 1615 }, { "completion_length": 220.7375, "epoch": 0.3578084234066344, "grad_norm": 0.16551151233488073, "kl": 0.33929443359375, "learning_rate": 1.621679448741067e-05, "loss": 0.0136, "reward": 1.05, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.98125, "step": 1620 }, { "completion_length": 203.19375, "epoch": 0.35891277039245734, "grad_norm": 0.44232175696554416, "kl": 0.3436279296875, "learning_rate": 1.618654804770603e-05, "loss": 0.0137, "reward": 1.0875, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9875, "step": 1625 }, { "completion_length": 196.01875, "epoch": 0.36001711737828024, "grad_norm": 0.3595404694857126, "kl": 0.33565673828125, "learning_rate": 1.615620963813471e-05, "loss": 0.0134, "reward": 1.03125, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.9875, "step": 1630 }, { "completion_length": 220.28125, "epoch": 0.3611214643641032, "grad_norm": 0.09068347346699927, "kl": 0.334228515625, "learning_rate": 1.6125779709710668e-05, "loss": 0.0134, "reward": 1.04375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.99375, "step": 1635 }, { "completion_length": 217.84375, "epoch": 0.36222581134992615, "grad_norm": 0.24326484641045593, "kl": 0.323681640625, "learning_rate": 1.6095258714808373e-05, "loss": 0.0129, "reward": 1.09375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.99375, "step": 1640 }, { "completion_length": 190.775, "epoch": 0.3633301583357491, "grad_norm": 0.32151529940248824, "kl": 0.3042724609375, "learning_rate": 1.606464710715612e-05, "loss": 0.0122, "reward": 1.04375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.975, "step": 1645 }, { "completion_length": 223.5375, "epoch": 0.364434505321572, "grad_norm": 0.4066387353346626, "kl": 0.35045166015625, "learning_rate": 1.603394534182925e-05, "loss": 0.014, "reward": 1.04375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.975, "step": 1650 }, { "completion_length": 200.40625, "epoch": 0.36553885230739497, "grad_norm": 0.6150107546663145, "kl": 0.42801513671875, "learning_rate": 1.600315387524339e-05, "loss": 0.0171, "reward": 1.05625, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.96875, "step": 1655 }, { "completion_length": 206.625, "epoch": 0.3666431992932179, "grad_norm": 0.3881248346947634, "kl": 0.36854248046875, "learning_rate": 1.5972273165147697e-05, "loss": 0.0147, "reward": 1.05, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.98125, "step": 1660 }, { "completion_length": 223.15625, "epoch": 0.3677475462790409, "grad_norm": 0.45585685791218283, "kl": 0.35394287109375, "learning_rate": 1.5941303670618018e-05, "loss": 0.0141, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.96875, "step": 1665 }, { "completion_length": 205.44375, "epoch": 0.36885189326486384, "grad_norm": 0.24229473958778308, "kl": 0.32738037109375, "learning_rate": 1.591024585205007e-05, "loss": 0.0131, "reward": 1.05, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.98125, "step": 1670 }, { "completion_length": 237.66875, "epoch": 0.36995624025068674, "grad_norm": 0.9451634093337382, "kl": 0.37305908203125, "learning_rate": 1.587910017115262e-05, "loss": 0.0149, "reward": 1.0125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.925, "step": 1675 }, { "completion_length": 234.78125, "epoch": 0.3710605872365097, "grad_norm": 0.4259015577951971, "kl": 0.3545654296875, "learning_rate": 1.5847867090940602e-05, "loss": 0.0142, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.96875, "step": 1680 }, { "completion_length": 259.5, "epoch": 0.37216493422233266, "grad_norm": 0.3894739125660781, "kl": 0.33232421875, "learning_rate": 1.5816547075728227e-05, "loss": 0.0133, "reward": 1.0125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.94375, "step": 1685 }, { "completion_length": 218.225, "epoch": 0.3732692812081556, "grad_norm": 0.5751023644328291, "kl": 0.3769775390625, "learning_rate": 1.5785140591122107e-05, "loss": 0.0151, "reward": 1.075, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.96875, "step": 1690 }, { "completion_length": 198.24375, "epoch": 0.37437362819397857, "grad_norm": 0.6070740663767715, "kl": 0.39683837890625, "learning_rate": 1.57536481040143e-05, "loss": 0.0159, "reward": 1.05, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.94375, "step": 1695 }, { "completion_length": 171.06875, "epoch": 0.37547797517980147, "grad_norm": 0.5506629078773986, "kl": 0.37344970703125, "learning_rate": 1.57220700825754e-05, "loss": 0.0149, "reward": 1.09375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.98125, "step": 1700 }, { "epoch": 0.37547797517980147, "eval_completion_length": 164.62, "eval_kl": 0.39626953125, "eval_loss": 0.015575483441352844, "eval_reward": 1.06, "eval_reward_std": 0.08485281229019165, "eval_rewards/accuracy_reward": 0.085, "eval_rewards/format_reward": 0.975, "eval_runtime": 82.4164, "eval_samples_per_second": 1.201, "eval_steps_per_second": 0.303, "step": 1700 }, { "completion_length": 148.7125, "epoch": 0.37658232216562443, "grad_norm": 0.346175215998511, "kl": 0.34801025390625, "learning_rate": 1.5690406996247557e-05, "loss": 0.0139, "reward": 1.10625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.975, "step": 1705 }, { "completion_length": 157.10625, "epoch": 0.3776866691514474, "grad_norm": 0.41957587709124056, "kl": 0.35477294921875, "learning_rate": 1.5658659315737505e-05, "loss": 0.0142, "reward": 1.075, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.9625, "step": 1710 }, { "completion_length": 172.70625, "epoch": 0.37879101613727034, "grad_norm": 0.2876351231003489, "kl": 0.35120849609375, "learning_rate": 1.5626827513009565e-05, "loss": 0.014, "reward": 1.00625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.95625, "step": 1715 }, { "completion_length": 158.39375, "epoch": 0.3798953631230933, "grad_norm": 0.5026511321796595, "kl": 0.3473388671875, "learning_rate": 1.5594912061278627e-05, "loss": 0.0139, "reward": 1.04375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.975, "step": 1720 }, { "completion_length": 173.2875, "epoch": 0.3809997101089162, "grad_norm": 0.43958113689444284, "kl": 0.3528564453125, "learning_rate": 1.5562913435003113e-05, "loss": 0.0141, "reward": 1.025, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.96875, "step": 1725 }, { "completion_length": 195.36875, "epoch": 0.38210405709473916, "grad_norm": 0.7155259543987148, "kl": 0.343896484375, "learning_rate": 1.5530832109877932e-05, "loss": 0.0138, "reward": 1.0375, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9625, "step": 1730 }, { "completion_length": 157.21875, "epoch": 0.3832084040805621, "grad_norm": 0.3870602492613469, "kl": 0.343524169921875, "learning_rate": 1.5498668562827397e-05, "loss": 0.0137, "reward": 1.04375, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.975, "step": 1735 }, { "completion_length": 182.0625, "epoch": 0.38431275106638507, "grad_norm": 0.5306169810415612, "kl": 0.35052490234375, "learning_rate": 1.5466423271998144e-05, "loss": 0.014, "reward": 1.01875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.94375, "step": 1740 }, { "completion_length": 166.49375, "epoch": 0.38541709805220803, "grad_norm": 0.4946006693969201, "kl": 0.3321044921875, "learning_rate": 1.5434096716752023e-05, "loss": 0.0133, "reward": 1.05, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.96875, "step": 1745 }, { "completion_length": 186.98125, "epoch": 0.38652144503803093, "grad_norm": 0.31165636186354284, "kl": 0.35806884765625, "learning_rate": 1.5401689377658962e-05, "loss": 0.0143, "reward": 1.0625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.975, "step": 1750 }, { "completion_length": 181.3, "epoch": 0.3876257920238539, "grad_norm": 0.5512017419947034, "kl": 0.43681640625, "learning_rate": 1.536920173648984e-05, "loss": 0.0175, "reward": 1.0125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.95, "step": 1755 }, { "completion_length": 220.86875, "epoch": 0.38873013900967684, "grad_norm": 0.7898520406737889, "kl": 0.3798583984375, "learning_rate": 1.53366342762093e-05, "loss": 0.0152, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.9125, "step": 1760 }, { "completion_length": 225.61875, "epoch": 0.3898344859954998, "grad_norm": 0.7541459820766353, "kl": 0.3935302734375, "learning_rate": 1.5303987480968607e-05, "loss": 0.0157, "reward": 0.9625, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.89375, "step": 1765 }, { "completion_length": 189.53125, "epoch": 0.39093883298132276, "grad_norm": 0.33298536402698525, "kl": 0.322216796875, "learning_rate": 1.5271261836098403e-05, "loss": 0.0129, "reward": 0.99375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.9375, "step": 1770 }, { "completion_length": 168.725, "epoch": 0.39204317996714566, "grad_norm": 0.7771633455821945, "kl": 0.3632568359375, "learning_rate": 1.5238457828101531e-05, "loss": 0.0145, "reward": 1.0, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.95625, "step": 1775 }, { "completion_length": 160.28125, "epoch": 0.3931475269529686, "grad_norm": 0.478390100746179, "kl": 0.372265625, "learning_rate": 1.520557594464579e-05, "loss": 0.0149, "reward": 0.9875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.94375, "step": 1780 }, { "completion_length": 188.59375, "epoch": 0.3942518739387916, "grad_norm": 0.4217931355042731, "kl": 0.3974609375, "learning_rate": 1.5172616674556673e-05, "loss": 0.0159, "reward": 0.95, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.89375, "step": 1785 }, { "completion_length": 149.525, "epoch": 0.39535622092461453, "grad_norm": 0.7501382613974432, "kl": 0.4054931640625, "learning_rate": 1.5139580507810118e-05, "loss": 0.0162, "reward": 0.9875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.93125, "step": 1790 }, { "completion_length": 145.725, "epoch": 0.39646056791043743, "grad_norm": 0.5349731933801097, "kl": 0.35238037109375, "learning_rate": 1.510646793552522e-05, "loss": 0.0141, "reward": 1.01875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.975, "step": 1795 }, { "completion_length": 142.375, "epoch": 0.3975649148962604, "grad_norm": 0.6112477717509665, "kl": 0.4273193359375, "learning_rate": 1.5073279449956916e-05, "loss": 0.0171, "reward": 1.05, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.99375, "step": 1800 }, { "epoch": 0.3975649148962604, "eval_completion_length": 169.21, "eval_kl": 0.37419921875, "eval_loss": 0.014996632933616638, "eval_reward": 1.055, "eval_reward_std": 0.07778174459934234, "eval_rewards/accuracy_reward": 0.08, "eval_rewards/format_reward": 0.975, "eval_runtime": 85.5248, "eval_samples_per_second": 1.158, "eval_steps_per_second": 0.292, "step": 1800 }, { "completion_length": 183.75625, "epoch": 0.39866926188208335, "grad_norm": 0.42239616616022757, "kl": 0.35390625, "learning_rate": 1.5040015544488689e-05, "loss": 0.0142, "reward": 1.0, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "step": 1805 }, { "completion_length": 231.51875, "epoch": 0.3997736088679063, "grad_norm": 0.38357330680196106, "kl": 0.34571533203125, "learning_rate": 1.5006676713625217e-05, "loss": 0.0138, "reward": 1.0125, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9625, "step": 1810 }, { "completion_length": 249.51875, "epoch": 0.40087795585372926, "grad_norm": 0.36278847988909035, "kl": 0.34613037109375, "learning_rate": 1.4973263452985023e-05, "loss": 0.0138, "reward": 1.01875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.96875, "step": 1815 }, { "completion_length": 254.9, "epoch": 0.40198230283955216, "grad_norm": 0.13063316008666095, "kl": 0.3719482421875, "learning_rate": 1.493977625929312e-05, "loss": 0.0149, "reward": 0.975, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.0125, "rewards/format_reward": 0.9625, "step": 1820 }, { "completion_length": 222.58125, "epoch": 0.4030866498253751, "grad_norm": 0.4311131808891421, "kl": 0.33587646484375, "learning_rate": 1.4906215630373606e-05, "loss": 0.0134, "reward": 1.025, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.975, "step": 1825 }, { "completion_length": 221.6125, "epoch": 0.4041909968111981, "grad_norm": 0.6819109826661146, "kl": 0.38231201171875, "learning_rate": 1.4872582065142285e-05, "loss": 0.0153, "reward": 1.0125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.95, "step": 1830 }, { "completion_length": 238.88125, "epoch": 0.40529534379702103, "grad_norm": 0.43973316451031436, "kl": 0.35328369140625, "learning_rate": 1.4838876063599234e-05, "loss": 0.0141, "reward": 0.9625, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.91875, "step": 1835 }, { "completion_length": 309.99375, "epoch": 0.406399690782844, "grad_norm": 0.8444306019322414, "kl": 0.44168701171875, "learning_rate": 1.480509812682138e-05, "loss": 0.0177, "reward": 0.86875, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.84375, "step": 1840 }, { "completion_length": 126.275, "epoch": 0.4075040377686669, "grad_norm": 0.35468253817906953, "kl": 0.38702392578125, "learning_rate": 1.4771248756955042e-05, "loss": 0.0155, "reward": 1.05625, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 1.0, "step": 1845 }, { "completion_length": 123.95625, "epoch": 0.40860838475448985, "grad_norm": 0.5763156946426605, "kl": 0.36905517578125, "learning_rate": 1.4737328457208471e-05, "loss": 0.0148, "reward": 1.125, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.99375, "step": 1850 }, { "completion_length": 113.06875, "epoch": 0.4097127317403128, "grad_norm": 0.47557294439345416, "kl": 0.379296875, "learning_rate": 1.4703337731844374e-05, "loss": 0.0152, "reward": 1.0875, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 1.0, "step": 1855 }, { "completion_length": 106.64375, "epoch": 0.41081707872613576, "grad_norm": 0.787870748692926, "kl": 0.402252197265625, "learning_rate": 1.4669277086172406e-05, "loss": 0.0161, "reward": 1.08125, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.99375, "step": 1860 }, { "completion_length": 113.35, "epoch": 0.4119214257119587, "grad_norm": 0.5993634901343988, "kl": 0.395849609375, "learning_rate": 1.4635147026541674e-05, "loss": 0.0158, "reward": 1.08125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 1.0, "step": 1865 }, { "completion_length": 142.475, "epoch": 0.4130257726977816, "grad_norm": 0.14466786983492855, "kl": 0.3989501953125, "learning_rate": 1.4600948060333187e-05, "loss": 0.016, "reward": 1.1, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.99375, "step": 1870 }, { "completion_length": 180.33125, "epoch": 0.4141301196836046, "grad_norm": 0.2913646742496878, "kl": 0.3666259765625, "learning_rate": 1.4566680695952333e-05, "loss": 0.0147, "reward": 1.0375, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9875, "step": 1875 }, { "completion_length": 246.5625, "epoch": 0.41523446666942754, "grad_norm": 0.10584068251739959, "kl": 0.3494873046875, "learning_rate": 1.4532345442821323e-05, "loss": 0.014, "reward": 1.0375, "reward_std": 0.03535533845424652, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.99375, "step": 1880 }, { "completion_length": 349.275, "epoch": 0.4163388136552505, "grad_norm": 0.42138126797069886, "kl": 0.40732421875, "learning_rate": 1.4497942811371592e-05, "loss": 0.0163, "reward": 0.95625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.925, "step": 1885 }, { "completion_length": 282.8125, "epoch": 0.41744316064107345, "grad_norm": 0.5708867419498027, "kl": 0.4172607421875, "learning_rate": 1.4463473313036241e-05, "loss": 0.0167, "reward": 0.93125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.90625, "step": 1890 }, { "completion_length": 245.1, "epoch": 0.41854750762689635, "grad_norm": 0.5020193212271058, "kl": 0.431884765625, "learning_rate": 1.4428937460242417e-05, "loss": 0.0173, "reward": 0.99375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.91875, "step": 1895 }, { "completion_length": 197.70625, "epoch": 0.4196518546127193, "grad_norm": 0.590350066389971, "kl": 0.3642578125, "learning_rate": 1.4394335766403703e-05, "loss": 0.0146, "reward": 1.0, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.95625, "step": 1900 }, { "epoch": 0.4196518546127193, "eval_completion_length": 210.86, "eval_kl": 0.4005078125, "eval_loss": 0.016043836250901222, "eval_reward": 1.025, "eval_reward_std": 0.12020815074443818, "eval_rewards/accuracy_reward": 0.09, "eval_rewards/format_reward": 0.935, "eval_runtime": 109.1917, "eval_samples_per_second": 0.907, "eval_steps_per_second": 0.229, "step": 1900 }, { "completion_length": 210.5125, "epoch": 0.42075620159854227, "grad_norm": 0.5480387591743037, "kl": 0.4215087890625, "learning_rate": 1.4359668745912472e-05, "loss": 0.0169, "reward": 0.98125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.93125, "step": 1905 }, { "completion_length": 187.575, "epoch": 0.4218605485843652, "grad_norm": 0.37297156781214846, "kl": 0.318310546875, "learning_rate": 1.4324936914132255e-05, "loss": 0.0127, "reward": 1.0125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.95625, "step": 1910 }, { "completion_length": 194.59375, "epoch": 0.4229648955701881, "grad_norm": 0.4621580856810021, "kl": 0.3155029296875, "learning_rate": 1.4290140787390083e-05, "loss": 0.0126, "reward": 1.0125, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.9875, "step": 1915 }, { "completion_length": 215.26875, "epoch": 0.4240692425560111, "grad_norm": 0.23236440356104268, "kl": 0.31630859375, "learning_rate": 1.4255280882968787e-05, "loss": 0.0126, "reward": 1.0375, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.98125, "step": 1920 }, { "completion_length": 257.96875, "epoch": 0.42517358954183404, "grad_norm": 0.1821862648072376, "kl": 0.36019287109375, "learning_rate": 1.4220357719099338e-05, "loss": 0.0144, "reward": 1.00625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.95, "step": 1925 }, { "completion_length": 278.40625, "epoch": 0.426277936527657, "grad_norm": 0.5171028800770205, "kl": 0.341015625, "learning_rate": 1.4185371814953116e-05, "loss": 0.0136, "reward": 0.96875, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.9125, "step": 1930 }, { "completion_length": 203.1, "epoch": 0.42738228351347995, "grad_norm": 0.3194976531325425, "kl": 0.33564453125, "learning_rate": 1.415032369063422e-05, "loss": 0.0134, "reward": 1.05, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.98125, "step": 1935 }, { "completion_length": 248.76875, "epoch": 0.42848663049930286, "grad_norm": 0.32014297487636434, "kl": 0.34617919921875, "learning_rate": 1.41152138671717e-05, "loss": 0.0138, "reward": 0.96875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "step": 1940 }, { "completion_length": 224.6875, "epoch": 0.4295909774851258, "grad_norm": 0.7093467850610166, "kl": 0.32181396484375, "learning_rate": 1.408004286651185e-05, "loss": 0.0129, "reward": 1.025, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.99375, "step": 1945 }, { "completion_length": 219.30625, "epoch": 0.43069532447094877, "grad_norm": 0.12714345360829074, "kl": 0.3352294921875, "learning_rate": 1.4044811211510419e-05, "loss": 0.0134, "reward": 1.04375, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.98125, "step": 1950 }, { "completion_length": 253.2875, "epoch": 0.4317996714567717, "grad_norm": 0.180488927556792, "kl": 0.3677734375, "learning_rate": 1.4009519425924858e-05, "loss": 0.0147, "reward": 1.04375, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.9875, "step": 1955 }, { "completion_length": 270.91875, "epoch": 0.4329040184425947, "grad_norm": 0.30372496615737005, "kl": 0.301806640625, "learning_rate": 1.3974168034406524e-05, "loss": 0.0121, "reward": 0.99375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9625, "step": 1960 }, { "completion_length": 248.61875, "epoch": 0.4340083654284176, "grad_norm": 0.4701815288270493, "kl": 0.33768310546875, "learning_rate": 1.3938757562492873e-05, "loss": 0.0135, "reward": 1.04375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.98125, "step": 1965 }, { "completion_length": 225.59375, "epoch": 0.43511271241424054, "grad_norm": 0.36757084788553285, "kl": 0.3396484375, "learning_rate": 1.3903288536599668e-05, "loss": 0.0136, "reward": 1.0625, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.98125, "step": 1970 }, { "completion_length": 312.15, "epoch": 0.4362170594000635, "grad_norm": 0.48977082806979283, "kl": 0.358837890625, "learning_rate": 1.3867761484013135e-05, "loss": 0.0144, "reward": 1.01875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.95625, "step": 1975 }, { "completion_length": 307.16875, "epoch": 0.43732140638588646, "grad_norm": 0.49526262796042186, "kl": 0.340966796875, "learning_rate": 1.3832176932882136e-05, "loss": 0.0136, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9875, "step": 1980 }, { "completion_length": 252.50625, "epoch": 0.4384257533717094, "grad_norm": 0.6091872240850794, "kl": 0.3243408203125, "learning_rate": 1.3796535412210301e-05, "loss": 0.013, "reward": 1.025, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.96875, "step": 1985 }, { "completion_length": 230.20625, "epoch": 0.4395301003575323, "grad_norm": 0.29579068631634226, "kl": 0.34698486328125, "learning_rate": 1.3760837451848193e-05, "loss": 0.0139, "reward": 1.075, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.975, "step": 1990 }, { "completion_length": 246.5375, "epoch": 0.4406344473433553, "grad_norm": 0.3033284715610845, "kl": 0.34864501953125, "learning_rate": 1.3725083582485397e-05, "loss": 0.0139, "reward": 1.01875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.94375, "step": 1995 }, { "completion_length": 213.94375, "epoch": 0.44173879432917823, "grad_norm": 0.4124635672949258, "kl": 0.33160400390625, "learning_rate": 1.3689274335642653e-05, "loss": 0.0133, "reward": 1.01875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.96875, "step": 2000 }, { "epoch": 0.44173879432917823, "eval_completion_length": 175.91, "eval_kl": 0.3898828125, "eval_loss": 0.015537865459918976, "eval_reward": 1.1, "eval_reward_std": 0.11313708305358887, "eval_rewards/accuracy_reward": 0.11, "eval_rewards/format_reward": 0.99, "eval_runtime": 88.9833, "eval_samples_per_second": 1.113, "eval_steps_per_second": 0.281, "step": 2000 }, { "completion_length": 198.1125, "epoch": 0.4428431413150012, "grad_norm": 0.41251225473854053, "kl": 0.32073974609375, "learning_rate": 1.3653410243663953e-05, "loss": 0.0128, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.98125, "step": 2005 }, { "completion_length": 174.21875, "epoch": 0.44394748830082414, "grad_norm": 0.6755273210792271, "kl": 0.318115234375, "learning_rate": 1.3617491839708614e-05, "loss": 0.0127, "reward": 1.05, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9875, "step": 2010 }, { "completion_length": 208.9875, "epoch": 0.44505183528664705, "grad_norm": 0.37867244007672246, "kl": 0.32615966796875, "learning_rate": 1.3581519657743365e-05, "loss": 0.013, "reward": 1.05625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9625, "step": 2015 }, { "completion_length": 255.75, "epoch": 0.44615618227247, "grad_norm": 0.6185289191665273, "kl": 0.3501220703125, "learning_rate": 1.3545494232534406e-05, "loss": 0.014, "reward": 1.06875, "reward_std": 0.23864853456616403, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.9375, "step": 2020 }, { "completion_length": 327.4125, "epoch": 0.44726052925829296, "grad_norm": 0.8766696054374737, "kl": 0.4114990234375, "learning_rate": 1.3509416099639456e-05, "loss": 0.0165, "reward": 1.03125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.95, "step": 2025 }, { "completion_length": 336.04375, "epoch": 0.4483648762441159, "grad_norm": 0.26246836662809214, "kl": 0.33997802734375, "learning_rate": 1.3473285795399792e-05, "loss": 0.0136, "reward": 1.09375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.9875, "step": 2030 }, { "completion_length": 242.35625, "epoch": 0.4494692232299388, "grad_norm": 0.4471292709567679, "kl": 0.3488037109375, "learning_rate": 1.3437103856932266e-05, "loss": 0.014, "reward": 1.1, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.975, "step": 2035 }, { "completion_length": 209.25625, "epoch": 0.4505735702157618, "grad_norm": 0.73758038746274, "kl": 0.3877685546875, "learning_rate": 1.3400870822121348e-05, "loss": 0.0155, "reward": 0.9375, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.85, "step": 2040 }, { "completion_length": 196.275, "epoch": 0.45167791720158473, "grad_norm": 0.5384037353444987, "kl": 0.373583984375, "learning_rate": 1.3364587229611095e-05, "loss": 0.0149, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.975, "step": 2045 }, { "completion_length": 242.125, "epoch": 0.4527822641874077, "grad_norm": 0.35105762795134465, "kl": 0.43712158203125, "learning_rate": 1.332825361879717e-05, "loss": 0.0175, "reward": 1.1, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.975, "step": 2050 }, { "completion_length": 226.36875, "epoch": 0.45388661117323065, "grad_norm": 0.5975228945667029, "kl": 0.51041259765625, "learning_rate": 1.3291870529818809e-05, "loss": 0.0204, "reward": 1.0875, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.975, "step": 2055 }, { "completion_length": 243.575, "epoch": 0.45499095815905355, "grad_norm": 0.4387220366275729, "kl": 0.458203125, "learning_rate": 1.3255438503550796e-05, "loss": 0.0183, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.975, "step": 2060 }, { "completion_length": 250.0875, "epoch": 0.4560953051448765, "grad_norm": 0.5275971912489743, "kl": 0.39151611328125, "learning_rate": 1.3218958081595426e-05, "loss": 0.0157, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.98125, "step": 2065 }, { "completion_length": 282.99375, "epoch": 0.45719965213069946, "grad_norm": 0.4169345983090638, "kl": 0.4114990234375, "learning_rate": 1.3182429806274442e-05, "loss": 0.0165, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.975, "step": 2070 }, { "completion_length": 242.4, "epoch": 0.4583039991165224, "grad_norm": 0.5265515421182827, "kl": 0.48302001953125, "learning_rate": 1.3145854220620981e-05, "loss": 0.0193, "reward": 1.0, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.94375, "step": 2075 }, { "completion_length": 241.05, "epoch": 0.4594083461023454, "grad_norm": 0.5585942695958915, "kl": 0.4310302734375, "learning_rate": 1.3109231868371511e-05, "loss": 0.0172, "reward": 0.9625, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.94375, "step": 2080 }, { "completion_length": 176.9125, "epoch": 0.4605126930881683, "grad_norm": 0.4414724029442064, "kl": 0.35958251953125, "learning_rate": 1.3072563293957725e-05, "loss": 0.0144, "reward": 1.05625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.96875, "step": 2085 }, { "completion_length": 192.8, "epoch": 0.46161704007399124, "grad_norm": 0.586018890860745, "kl": 0.322509765625, "learning_rate": 1.3035849042498462e-05, "loss": 0.0129, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9625, "step": 2090 }, { "completion_length": 208.95625, "epoch": 0.4627213870598142, "grad_norm": 0.42268112037658245, "kl": 0.319384765625, "learning_rate": 1.299908965979161e-05, "loss": 0.0128, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9625, "step": 2095 }, { "completion_length": 222.16875, "epoch": 0.46382573404563715, "grad_norm": 0.31473777919390844, "kl": 0.30546875, "learning_rate": 1.2962285692305964e-05, "loss": 0.0122, "reward": 1.0125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9625, "step": 2100 }, { "epoch": 0.46382573404563715, "eval_completion_length": 192.71, "eval_kl": 0.31083984375, "eval_loss": 0.012439416721463203, "eval_reward": 1.1, "eval_reward_std": 0.21213203072547912, "eval_rewards/accuracy_reward": 0.145, "eval_rewards/format_reward": 0.955, "eval_runtime": 96.8968, "eval_samples_per_second": 1.022, "eval_steps_per_second": 0.258, "step": 2100 }, { "completion_length": 216.8125, "epoch": 0.4649300810314601, "grad_norm": 0.35648646516795124, "kl": 0.32301025390625, "learning_rate": 1.2925437687173144e-05, "loss": 0.0129, "reward": 1.04375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9625, "step": 2105 }, { "completion_length": 241.8125, "epoch": 0.466034428017283, "grad_norm": 0.601644494723294, "kl": 0.3125732421875, "learning_rate": 1.2888546192179417e-05, "loss": 0.0125, "reward": 0.96875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.94375, "step": 2110 }, { "completion_length": 217.7625, "epoch": 0.46713877500310597, "grad_norm": 0.39671636795145077, "kl": 0.326318359375, "learning_rate": 1.2851611755757587e-05, "loss": 0.013, "reward": 1.025, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.9375, "step": 2115 }, { "completion_length": 210.575, "epoch": 0.4682431219889289, "grad_norm": 0.5031259322905296, "kl": 0.35986328125, "learning_rate": 1.2814634926978831e-05, "loss": 0.0144, "reward": 1.025, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.93125, "step": 2120 }, { "completion_length": 187.1, "epoch": 0.4693474689747519, "grad_norm": 0.48486411865791645, "kl": 0.35367431640625, "learning_rate": 1.2777616255544527e-05, "loss": 0.0141, "reward": 1.075, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.95, "step": 2125 }, { "completion_length": 151.975, "epoch": 0.47045181596057484, "grad_norm": 0.7338227984314649, "kl": 0.3826904296875, "learning_rate": 1.2740556291778096e-05, "loss": 0.0153, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.975, "step": 2130 }, { "completion_length": 133.78125, "epoch": 0.47155616294639774, "grad_norm": 0.11684943721597078, "kl": 0.33577880859375, "learning_rate": 1.2703455586616811e-05, "loss": 0.0134, "reward": 1.0875, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.99375, "step": 2135 }, { "completion_length": 145.9875, "epoch": 0.4726605099322207, "grad_norm": 0.32149812408314604, "kl": 0.38963623046875, "learning_rate": 1.2666314691603615e-05, "loss": 0.0156, "reward": 1.13125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.9875, "step": 2140 }, { "completion_length": 232.575, "epoch": 0.47376485691804365, "grad_norm": 0.6480932091195085, "kl": 0.3406494140625, "learning_rate": 1.2629134158878919e-05, "loss": 0.0136, "reward": 1.05625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.96875, "step": 2145 }, { "completion_length": 253.8875, "epoch": 0.4748692039038666, "grad_norm": 0.36437117134621355, "kl": 0.3377685546875, "learning_rate": 1.259191454117239e-05, "loss": 0.0135, "reward": 1.04375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.975, "step": 2150 }, { "completion_length": 231.3125, "epoch": 0.47597355088968957, "grad_norm": 0.4391123760933655, "kl": 0.3203125, "learning_rate": 1.255465639179473e-05, "loss": 0.0128, "reward": 1.05, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.975, "step": 2155 }, { "completion_length": 260.05625, "epoch": 0.47707789787551247, "grad_norm": 0.34571139879091517, "kl": 0.35738525390625, "learning_rate": 1.2517360264629463e-05, "loss": 0.0143, "reward": 1.01875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.95, "step": 2160 }, { "completion_length": 221.43125, "epoch": 0.4781822448613354, "grad_norm": 0.45867628713278896, "kl": 0.38974609375, "learning_rate": 1.24800267141247e-05, "loss": 0.0156, "reward": 1.0375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.93125, "step": 2165 }, { "completion_length": 170.575, "epoch": 0.4792865918471584, "grad_norm": 0.19943826053198088, "kl": 0.37861328125, "learning_rate": 1.2442656295284879e-05, "loss": 0.0151, "reward": 1.05, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.98125, "step": 2170 }, { "completion_length": 197.7875, "epoch": 0.48039093883298134, "grad_norm": 0.37120994010979813, "kl": 0.342919921875, "learning_rate": 1.2405249563662539e-05, "loss": 0.0137, "reward": 0.99375, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.95625, "step": 2175 }, { "completion_length": 175.96875, "epoch": 0.48149528581880424, "grad_norm": 0.6013419839896456, "kl": 0.3757080078125, "learning_rate": 1.2367807075350036e-05, "loss": 0.015, "reward": 1.08125, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.95625, "step": 2180 }, { "completion_length": 184.83125, "epoch": 0.4825996328046272, "grad_norm": 0.5134474475685822, "kl": 0.36424560546875, "learning_rate": 1.23303293869713e-05, "loss": 0.0146, "reward": 1.0125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9625, "step": 2185 }, { "completion_length": 234.29375, "epoch": 0.48370397979045016, "grad_norm": 0.5230059460040423, "kl": 0.34986572265625, "learning_rate": 1.2292817055673543e-05, "loss": 0.014, "reward": 1.0125, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9375, "step": 2190 }, { "completion_length": 308.00625, "epoch": 0.4848083267762731, "grad_norm": 0.7592675553160979, "kl": 0.3602783203125, "learning_rate": 1.2255270639118984e-05, "loss": 0.0144, "reward": 1.0125, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9125, "step": 2195 }, { "completion_length": 273.43125, "epoch": 0.48591267376209607, "grad_norm": 0.2856132458649576, "kl": 0.37276611328125, "learning_rate": 1.2217690695476551e-05, "loss": 0.0149, "reward": 1.00625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.90625, "step": 2200 }, { "epoch": 0.48591267376209607, "eval_completion_length": 253.775, "eval_kl": 0.5310546875, "eval_loss": 0.02128330059349537, "eval_reward": 1.045, "eval_reward_std": 0.162634556889534, "eval_rewards/accuracy_reward": 0.105, "eval_rewards/format_reward": 0.94, "eval_runtime": 127.3028, "eval_samples_per_second": 0.778, "eval_steps_per_second": 0.196, "step": 2200 }, { "completion_length": 259.58125, "epoch": 0.48701702074791897, "grad_norm": 0.3284138468757768, "kl": 0.3307373046875, "learning_rate": 1.2180077783413601e-05, "loss": 0.0132, "reward": 1.05625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.96875, "step": 2205 }, { "completion_length": 252.30625, "epoch": 0.48812136773374193, "grad_norm": 0.4057901132375836, "kl": 0.4347412109375, "learning_rate": 1.21424324620876e-05, "loss": 0.0174, "reward": 0.9875, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.91875, "step": 2210 }, { "completion_length": 249.60625, "epoch": 0.4892257147195649, "grad_norm": 0.9156591934586986, "kl": 0.3991943359375, "learning_rate": 1.2104755291137797e-05, "loss": 0.016, "reward": 0.9875, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9375, "step": 2215 }, { "completion_length": 229.95625, "epoch": 0.49033006170538784, "grad_norm": 0.42300199898124896, "kl": 0.374468994140625, "learning_rate": 1.2067046830676947e-05, "loss": 0.015, "reward": 1.025, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.95, "step": 2220 }, { "completion_length": 206.26875, "epoch": 0.4914344086912108, "grad_norm": 0.3982306025965269, "kl": 0.3041748046875, "learning_rate": 1.2029307641282935e-05, "loss": 0.0122, "reward": 1.05, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.96875, "step": 2225 }, { "completion_length": 225.10625, "epoch": 0.4925387556770337, "grad_norm": 0.3221497073306949, "kl": 0.30826416015625, "learning_rate": 1.1991538283990483e-05, "loss": 0.0123, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.95625, "step": 2230 }, { "completion_length": 193.3125, "epoch": 0.49364310266285666, "grad_norm": 0.13887134423717792, "kl": 0.32666015625, "learning_rate": 1.1953739320282778e-05, "loss": 0.0131, "reward": 1.06875, "reward_std": 0.02651650384068489, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.99375, "step": 2235 }, { "completion_length": 208.83125, "epoch": 0.4947474496486796, "grad_norm": 0.3671257080599345, "kl": 0.30867919921875, "learning_rate": 1.191591131208315e-05, "loss": 0.0123, "reward": 1.0875, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.99375, "step": 2240 }, { "completion_length": 196.06875, "epoch": 0.4958517966345026, "grad_norm": 0.4192081679963359, "kl": 0.3347412109375, "learning_rate": 1.1878054821746703e-05, "loss": 0.0134, "reward": 1.11875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.96875, "step": 2245 }, { "completion_length": 218.25625, "epoch": 0.49695614362032553, "grad_norm": 0.2809177260646367, "kl": 0.31444091796875, "learning_rate": 1.1840170412051957e-05, "loss": 0.0126, "reward": 1.075, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.975, "step": 2250 }, { "completion_length": 246.13125, "epoch": 0.49806049060614843, "grad_norm": 0.3266723103626801, "kl": 0.290673828125, "learning_rate": 1.1802258646192486e-05, "loss": 0.0116, "reward": 1.025, "reward_std": 0.03535533845424652, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 1.0, "step": 2255 }, { "completion_length": 263.225, "epoch": 0.4991648375919714, "grad_norm": 0.19489451125924528, "kl": 0.29127197265625, "learning_rate": 1.1764320087768546e-05, "loss": 0.0116, "reward": 1.0875, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.975, "step": 2260 }, { "completion_length": 265.95, "epoch": 0.5002691845777943, "grad_norm": 0.5199830849997911, "kl": 0.34532470703125, "learning_rate": 1.1726355300778693e-05, "loss": 0.0138, "reward": 1.04375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9625, "step": 2265 }, { "completion_length": 230.08125, "epoch": 0.5013735315636173, "grad_norm": 0.5370502961123099, "kl": 0.31375732421875, "learning_rate": 1.1688364849611395e-05, "loss": 0.0125, "reward": 1.075, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.98125, "step": 2270 }, { "completion_length": 268.4375, "epoch": 0.5024778785494403, "grad_norm": 0.4955767601038962, "kl": 0.28502197265625, "learning_rate": 1.1650349299036656e-05, "loss": 0.0114, "reward": 1.0625, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.98125, "step": 2275 }, { "completion_length": 204.94375, "epoch": 0.5035822255352632, "grad_norm": 0.5349423661973638, "kl": 0.3089111328125, "learning_rate": 1.1612309214197599e-05, "loss": 0.0124, "reward": 1.0625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.975, "step": 2280 }, { "completion_length": 191.61875, "epoch": 0.5046865725210862, "grad_norm": 0.7009751890329485, "kl": 0.32718505859375, "learning_rate": 1.1574245160602085e-05, "loss": 0.0131, "reward": 1.0375, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9875, "step": 2285 }, { "completion_length": 174.96875, "epoch": 0.505790919506909, "grad_norm": 0.15634356767517202, "kl": 0.32626953125, "learning_rate": 1.153615770411429e-05, "loss": 0.013, "reward": 1.1125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.99375, "step": 2290 }, { "completion_length": 213.85625, "epoch": 0.506895266492732, "grad_norm": 0.47252723075105413, "kl": 0.311212158203125, "learning_rate": 1.1498047410946307e-05, "loss": 0.0124, "reward": 1.06875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9875, "step": 2295 }, { "completion_length": 215.2875, "epoch": 0.5079996134785549, "grad_norm": 0.4948372981089919, "kl": 0.33463134765625, "learning_rate": 1.1459914847649716e-05, "loss": 0.0134, "reward": 1.05, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.98125, "step": 2300 }, { "epoch": 0.5079996134785549, "eval_completion_length": 230.115, "eval_kl": 0.37994140625, "eval_loss": 0.015226633287966251, "eval_reward": 1.105, "eval_reward_std": 0.13435028612613678, "eval_rewards/accuracy_reward": 0.135, "eval_rewards/format_reward": 0.97, "eval_runtime": 115.6989, "eval_samples_per_second": 0.856, "eval_steps_per_second": 0.216, "step": 2300 }, { "completion_length": 215.875, "epoch": 0.5091039604643779, "grad_norm": 0.3666409705462901, "kl": 0.35133056640625, "learning_rate": 1.1421760581107164e-05, "loss": 0.0141, "reward": 1.04375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9625, "step": 2305 }, { "completion_length": 254.09375, "epoch": 0.5102083074502008, "grad_norm": 0.2975662859403391, "kl": 0.34254150390625, "learning_rate": 1.1383585178523955e-05, "loss": 0.0137, "reward": 0.98125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.94375, "step": 2310 }, { "completion_length": 256.36875, "epoch": 0.5113126544360238, "grad_norm": 0.719346374442343, "kl": 0.35948486328125, "learning_rate": 1.1345389207419588e-05, "loss": 0.0144, "reward": 1.01875, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.925, "step": 2315 }, { "completion_length": 209.3, "epoch": 0.5124170014218468, "grad_norm": 0.7163698011097263, "kl": 0.32767333984375, "learning_rate": 1.1307173235619342e-05, "loss": 0.0131, "reward": 1.05625, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.96875, "step": 2320 }, { "completion_length": 229.61875, "epoch": 0.5135213484076697, "grad_norm": 0.16271285683956263, "kl": 0.32386474609375, "learning_rate": 1.126893783124583e-05, "loss": 0.013, "reward": 1.04375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.96875, "step": 2325 }, { "completion_length": 241.45625, "epoch": 0.5146256953934927, "grad_norm": 0.29867489688674986, "kl": 0.34649658203125, "learning_rate": 1.1230683562710549e-05, "loss": 0.0139, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.975, "step": 2330 }, { "completion_length": 241.88125, "epoch": 0.5157300423793156, "grad_norm": 0.2511157393264926, "kl": 0.33067626953125, "learning_rate": 1.1192410998705432e-05, "loss": 0.0132, "reward": 1.08125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.98125, "step": 2335 }, { "completion_length": 259.81875, "epoch": 0.5168343893651385, "grad_norm": 0.1754130257017029, "kl": 0.29140625, "learning_rate": 1.1154120708194398e-05, "loss": 0.0117, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9875, "step": 2340 }, { "completion_length": 238.56875, "epoch": 0.5179387363509614, "grad_norm": 0.2633940153329421, "kl": 0.32506103515625, "learning_rate": 1.1115813260404889e-05, "loss": 0.013, "reward": 1.05625, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.98125, "step": 2345 }, { "completion_length": 194.5625, "epoch": 0.5190430833367844, "grad_norm": 0.37301932919862296, "kl": 0.3505615234375, "learning_rate": 1.1077489224819402e-05, "loss": 0.014, "reward": 1.075, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.975, "step": 2350 }, { "completion_length": 202.15, "epoch": 0.5201474303226074, "grad_norm": 0.5495626073196226, "kl": 0.377880859375, "learning_rate": 1.1039149171167046e-05, "loss": 0.0151, "reward": 0.975, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.93125, "step": 2355 }, { "completion_length": 220.68125, "epoch": 0.5212517773084303, "grad_norm": 0.38846325239139723, "kl": 0.3607421875, "learning_rate": 1.1000793669415035e-05, "loss": 0.0144, "reward": 1.025, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.91875, "step": 2360 }, { "completion_length": 211.93125, "epoch": 0.5223561242942533, "grad_norm": 0.5335204386556052, "kl": 0.3825927734375, "learning_rate": 1.0962423289760254e-05, "loss": 0.0153, "reward": 1.0125, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.925, "step": 2365 }, { "completion_length": 195.3375, "epoch": 0.5234604712800762, "grad_norm": 0.5401729852880928, "kl": 0.35989990234375, "learning_rate": 1.0924038602620757e-05, "loss": 0.0144, "reward": 1.025, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.95625, "step": 2370 }, { "completion_length": 186.30625, "epoch": 0.5245648182658992, "grad_norm": 0.6847630132207224, "kl": 0.32152099609375, "learning_rate": 1.0885640178627291e-05, "loss": 0.0129, "reward": 1.05, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.96875, "step": 2375 }, { "completion_length": 187.09375, "epoch": 0.5256691652517221, "grad_norm": 0.5295522607198633, "kl": 0.288751220703125, "learning_rate": 1.0847228588614821e-05, "loss": 0.0115, "reward": 1.05, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.99375, "step": 2380 }, { "completion_length": 207.14375, "epoch": 0.526773512237545, "grad_norm": 0.39564621021267, "kl": 0.310736083984375, "learning_rate": 1.0808804403614044e-05, "loss": 0.0124, "reward": 1.025, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.98125, "step": 2385 }, { "completion_length": 217.94375, "epoch": 0.5278778592233679, "grad_norm": 0.32699751280459455, "kl": 0.30042724609375, "learning_rate": 1.0770368194842886e-05, "loss": 0.012, "reward": 1.03125, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.99375, "step": 2390 }, { "completion_length": 214.59375, "epoch": 0.5289822062091909, "grad_norm": 0.3390808446949196, "kl": 0.321435546875, "learning_rate": 1.073192053369802e-05, "loss": 0.0129, "reward": 1.025, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.96875, "step": 2395 }, { "completion_length": 227.125, "epoch": 0.5300865531950139, "grad_norm": 0.3714729764465815, "kl": 0.36365966796875, "learning_rate": 1.0693461991746389e-05, "loss": 0.0146, "reward": 1.05, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.95, "step": 2400 }, { "epoch": 0.5300865531950139, "eval_completion_length": 213.925, "eval_kl": 0.4396484375, "eval_loss": 0.017622916027903557, "eval_reward": 1.03, "eval_reward_std": 0.1414213538169861, "eval_rewards/accuracy_reward": 0.08, "eval_rewards/format_reward": 0.95, "eval_runtime": 105.7779, "eval_samples_per_second": 0.936, "eval_steps_per_second": 0.236, "step": 2400 }, { "completion_length": 225.29375, "epoch": 0.5311909001808368, "grad_norm": 0.2664948280585707, "kl": 0.40863037109375, "learning_rate": 1.0654993140716665e-05, "loss": 0.0164, "reward": 1.0, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.94375, "step": 2405 }, { "completion_length": 208.39375, "epoch": 0.5322952471666598, "grad_norm": 0.634943961085242, "kl": 0.33333740234375, "learning_rate": 1.0616514552490791e-05, "loss": 0.0133, "reward": 1.05625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.96875, "step": 2410 }, { "completion_length": 186.975, "epoch": 0.5333995941524827, "grad_norm": 0.5915967584567521, "kl": 0.294097900390625, "learning_rate": 1.0578026799095464e-05, "loss": 0.0118, "reward": 1.0125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.975, "step": 2415 }, { "completion_length": 202.1625, "epoch": 0.5345039411383057, "grad_norm": 0.45619021814548455, "kl": 0.321533203125, "learning_rate": 1.0539530452693625e-05, "loss": 0.0129, "reward": 1.06875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.975, "step": 2420 }, { "completion_length": 243.76875, "epoch": 0.5356082881241286, "grad_norm": 0.3048810072947555, "kl": 0.37376708984375, "learning_rate": 1.0501026085575967e-05, "loss": 0.0149, "reward": 1.0, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.94375, "step": 2425 }, { "completion_length": 252.825, "epoch": 0.5367126351099516, "grad_norm": 0.40580599059859296, "kl": 0.36983642578125, "learning_rate": 1.046251427015241e-05, "loss": 0.0148, "reward": 1.01875, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.94375, "step": 2430 }, { "completion_length": 265.5875, "epoch": 0.5378169820957744, "grad_norm": 0.5522550075448642, "kl": 0.384912109375, "learning_rate": 1.0423995578943615e-05, "loss": 0.0154, "reward": 1.025, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.93125, "step": 2435 }, { "completion_length": 274.49375, "epoch": 0.5389213290815974, "grad_norm": 0.7214328168716406, "kl": 0.4843017578125, "learning_rate": 1.0385470584572449e-05, "loss": 0.0194, "reward": 1.04375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.925, "step": 2440 }, { "completion_length": 336.85625, "epoch": 0.5400256760674204, "grad_norm": 0.3475290976205475, "kl": 0.52220458984375, "learning_rate": 1.0346939859755481e-05, "loss": 0.0209, "reward": 0.9625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9, "step": 2445 }, { "completion_length": 298.64375, "epoch": 0.5411300230532433, "grad_norm": 0.37045136738406037, "kl": 0.3713623046875, "learning_rate": 1.0308403977294476e-05, "loss": 0.0149, "reward": 1.025, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.95, "step": 2450 }, { "completion_length": 247.96875, "epoch": 0.5422343700390663, "grad_norm": 1.2798359500362633, "kl": 0.4294677734375, "learning_rate": 1.0269863510067872e-05, "loss": 0.0172, "reward": 1.05625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.95, "step": 2455 }, { "completion_length": 220.65625, "epoch": 0.5433387170248892, "grad_norm": 0.3900864517809055, "kl": 0.405615234375, "learning_rate": 1.023131903102226e-05, "loss": 0.0162, "reward": 1.0125, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.95625, "step": 2460 }, { "completion_length": 229.73125, "epoch": 0.5444430640107122, "grad_norm": 0.4673057649728135, "kl": 0.3134033203125, "learning_rate": 1.0192771113163875e-05, "loss": 0.0125, "reward": 1.04375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.98125, "step": 2465 }, { "completion_length": 197.48125, "epoch": 0.5455474109965351, "grad_norm": 0.5576580736483916, "kl": 0.34364013671875, "learning_rate": 1.0154220329550076e-05, "loss": 0.0137, "reward": 1.06875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9875, "step": 2470 }, { "completion_length": 227.26875, "epoch": 0.5466517579823581, "grad_norm": 0.21804579013846342, "kl": 0.40257568359375, "learning_rate": 1.0115667253280817e-05, "loss": 0.0161, "reward": 1.05, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.9625, "step": 2475 }, { "completion_length": 209.29375, "epoch": 0.5477561049681811, "grad_norm": 0.3304966161490469, "kl": 0.35350341796875, "learning_rate": 1.0077112457490143e-05, "loss": 0.0141, "reward": 1.03125, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.975, "step": 2480 }, { "completion_length": 190.29375, "epoch": 0.5488604519540039, "grad_norm": 0.47912529430555245, "kl": 0.34261474609375, "learning_rate": 1.0038556515337654e-05, "loss": 0.0137, "reward": 1.0625, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.98125, "step": 2485 }, { "completion_length": 210.5625, "epoch": 0.5499647989398269, "grad_norm": 0.6442806902182178, "kl": 0.3751953125, "learning_rate": 1e-05, "loss": 0.015, "reward": 0.99375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.95625, "step": 2490 }, { "completion_length": 205.20625, "epoch": 0.5510691459256498, "grad_norm": 0.6416478639095549, "kl": 0.412200927734375, "learning_rate": 9.961443484662349e-06, "loss": 0.0165, "reward": 1.03125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.95625, "step": 2495 }, { "completion_length": 222.10625, "epoch": 0.5521734929114728, "grad_norm": 0.38828171969188574, "kl": 0.35648193359375, "learning_rate": 9.92288754250986e-06, "loss": 0.0143, "reward": 1.0125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9625, "step": 2500 }, { "epoch": 0.5521734929114728, "eval_completion_length": 193.71, "eval_kl": 0.44693359375, "eval_loss": 0.01760600134730339, "eval_reward": 1.055, "eval_reward_std": 0.13435028612613678, "eval_rewards/accuracy_reward": 0.085, "eval_rewards/format_reward": 0.97, "eval_runtime": 90.4642, "eval_samples_per_second": 1.094, "eval_steps_per_second": 0.276, "step": 2500 }, { "completion_length": 221.71875, "epoch": 0.5532778398972957, "grad_norm": 0.5618731254639551, "kl": 0.420947265625, "learning_rate": 9.884332746719186e-06, "loss": 0.0168, "reward": 1.0, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.94375, "step": 2505 }, { "completion_length": 180.55625, "epoch": 0.5543821868831187, "grad_norm": 0.4034995486933272, "kl": 0.32989501953125, "learning_rate": 9.845779670449926e-06, "loss": 0.0132, "reward": 1.05, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.99375, "step": 2510 }, { "completion_length": 211.625, "epoch": 0.5554865338689416, "grad_norm": 0.291793288012953, "kl": 0.4625244140625, "learning_rate": 9.807228886836128e-06, "loss": 0.0185, "reward": 1.05, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.975, "step": 2515 }, { "completion_length": 173.81875, "epoch": 0.5565908808547646, "grad_norm": 0.4350329917552652, "kl": 0.319976806640625, "learning_rate": 9.768680968977743e-06, "loss": 0.0128, "reward": 1.0875, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 1.0, "step": 2520 }, { "completion_length": 209.65, "epoch": 0.5576952278405876, "grad_norm": 0.32148315759606677, "kl": 0.31556396484375, "learning_rate": 9.730136489932133e-06, "loss": 0.0126, "reward": 1.1, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.975, "step": 2525 }, { "completion_length": 181.725, "epoch": 0.5587995748264104, "grad_norm": 0.4796485021571092, "kl": 0.355078125, "learning_rate": 9.691596022705527e-06, "loss": 0.0142, "reward": 1.0625, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9875, "step": 2530 }, { "completion_length": 217.01875, "epoch": 0.5599039218122334, "grad_norm": 0.6531621153503112, "kl": 0.52369384765625, "learning_rate": 9.653060140244524e-06, "loss": 0.0209, "reward": 1.0, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9375, "step": 2535 }, { "completion_length": 222.725, "epoch": 0.5610082687980563, "grad_norm": 0.48376942186846866, "kl": 0.5350341796875, "learning_rate": 9.614529415427556e-06, "loss": 0.0214, "reward": 1.0125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9375, "step": 2540 }, { "completion_length": 169.14375, "epoch": 0.5621126157838793, "grad_norm": 0.6248488648541215, "kl": 0.3317138671875, "learning_rate": 9.576004421056389e-06, "loss": 0.0133, "reward": 1.08125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.98125, "step": 2545 }, { "completion_length": 173.6875, "epoch": 0.5632169627697022, "grad_norm": 0.3043924981907395, "kl": 0.31737060546875, "learning_rate": 9.537485729847594e-06, "loss": 0.0127, "reward": 1.05625, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.99375, "step": 2550 }, { "completion_length": 163.68125, "epoch": 0.5643213097555252, "grad_norm": 0.42066464481441374, "kl": 0.300604248046875, "learning_rate": 9.498973914424035e-06, "loss": 0.012, "reward": 1.0875, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.99375, "step": 2555 }, { "completion_length": 182.5625, "epoch": 0.5654256567413481, "grad_norm": 0.3979137026903759, "kl": 0.3016357421875, "learning_rate": 9.460469547306375e-06, "loss": 0.0121, "reward": 1.0625, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 2560 }, { "completion_length": 186.04375, "epoch": 0.5665300037271711, "grad_norm": 0.21641364510353625, "kl": 0.29324951171875, "learning_rate": 9.421973200904538e-06, "loss": 0.0117, "reward": 1.04375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.99375, "step": 2565 }, { "completion_length": 161.68125, "epoch": 0.5676343507129941, "grad_norm": 0.24964356866992105, "kl": 0.3321533203125, "learning_rate": 9.38348544750921e-06, "loss": 0.0133, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9875, "step": 2570 }, { "completion_length": 187.28125, "epoch": 0.568738697698817, "grad_norm": 0.1836091653753328, "kl": 0.31024169921875, "learning_rate": 9.345006859283338e-06, "loss": 0.0124, "reward": 1.10625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.99375, "step": 2575 }, { "completion_length": 212.85, "epoch": 0.5698430446846399, "grad_norm": 0.6849145983537044, "kl": 0.339111328125, "learning_rate": 9.306538008253611e-06, "loss": 0.0136, "reward": 1.08125, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 1.0, "step": 2580 }, { "completion_length": 204.25625, "epoch": 0.5709473916704628, "grad_norm": 0.8512773917093649, "kl": 0.32901611328125, "learning_rate": 9.268079466301978e-06, "loss": 0.0132, "reward": 1.11875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.96875, "step": 2585 }, { "completion_length": 223.1375, "epoch": 0.5720517386562858, "grad_norm": 0.18764654024348235, "kl": 0.29111328125, "learning_rate": 9.229631805157116e-06, "loss": 0.0116, "reward": 1.05, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.99375, "step": 2590 }, { "completion_length": 232.8625, "epoch": 0.5731560856421087, "grad_norm": 0.26470565817220254, "kl": 0.30040283203125, "learning_rate": 9.19119559638596e-06, "loss": 0.012, "reward": 1.0875, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.975, "step": 2595 }, { "completion_length": 224.3875, "epoch": 0.5742604326279317, "grad_norm": 0.28812544109822696, "kl": 0.30372314453125, "learning_rate": 9.15277141138518e-06, "loss": 0.0121, "reward": 1.1125, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.99375, "step": 2600 }, { "epoch": 0.5742604326279317, "eval_completion_length": 200.555, "eval_kl": 0.3039453125, "eval_loss": 0.012129716575145721, "eval_reward": 1.095, "eval_reward_std": 0.12020815074443818, "eval_rewards/accuracy_reward": 0.105, "eval_rewards/format_reward": 0.99, "eval_runtime": 84.0641, "eval_samples_per_second": 1.178, "eval_steps_per_second": 0.297, "step": 2600 }, { "completion_length": 206.1, "epoch": 0.5753647796137547, "grad_norm": 0.647401912936629, "kl": 0.309521484375, "learning_rate": 9.114359821372714e-06, "loss": 0.0124, "reward": 1.06875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9875, "step": 2605 }, { "completion_length": 232.76875, "epoch": 0.5764691265995776, "grad_norm": 0.4001821517172711, "kl": 0.28963623046875, "learning_rate": 9.075961397379247e-06, "loss": 0.0116, "reward": 1.06875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9875, "step": 2610 }, { "completion_length": 210.9, "epoch": 0.5775734735854006, "grad_norm": 0.46275481774122, "kl": 0.29266357421875, "learning_rate": 9.037576710239748e-06, "loss": 0.0117, "reward": 1.075, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 1.0, "step": 2615 }, { "completion_length": 232.54375, "epoch": 0.5786778205712235, "grad_norm": 0.06132598498237229, "kl": 0.2859619140625, "learning_rate": 8.999206330584969e-06, "loss": 0.0114, "reward": 1.03125, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.99375, "step": 2620 }, { "completion_length": 215.91875, "epoch": 0.5797821675570465, "grad_norm": 0.3911529231832668, "kl": 0.29134521484375, "learning_rate": 8.960850828832958e-06, "loss": 0.0116, "reward": 1.10625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.99375, "step": 2625 }, { "completion_length": 220.8625, "epoch": 0.5808865145428693, "grad_norm": 0.2442930603266259, "kl": 0.31162109375, "learning_rate": 8.9225107751806e-06, "loss": 0.0125, "reward": 1.06875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9875, "step": 2630 }, { "completion_length": 224.05625, "epoch": 0.5819908615286923, "grad_norm": 0.09213249286006794, "kl": 0.3035400390625, "learning_rate": 8.884186739595114e-06, "loss": 0.0121, "reward": 1.04375, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.99375, "step": 2635 }, { "completion_length": 207.7, "epoch": 0.5830952085145152, "grad_norm": 0.4516706074689143, "kl": 0.300616455078125, "learning_rate": 8.845879291805605e-06, "loss": 0.012, "reward": 1.05, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9875, "step": 2640 }, { "completion_length": 200.91875, "epoch": 0.5841995555003382, "grad_norm": 0.5391545499409273, "kl": 0.29783935546875, "learning_rate": 8.807589001294571e-06, "loss": 0.0119, "reward": 1.05, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.98125, "step": 2645 }, { "completion_length": 174.175, "epoch": 0.5853039024861612, "grad_norm": 0.7085634992406773, "kl": 0.31053466796875, "learning_rate": 8.769316437289456e-06, "loss": 0.0124, "reward": 1.0625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.96875, "step": 2650 }, { "completion_length": 181.50625, "epoch": 0.5864082494719841, "grad_norm": 0.4855936312310574, "kl": 0.33160400390625, "learning_rate": 8.731062168754174e-06, "loss": 0.0133, "reward": 1.075, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.9875, "step": 2655 }, { "completion_length": 195.2625, "epoch": 0.5875125964578071, "grad_norm": 0.5682639526974919, "kl": 0.359893798828125, "learning_rate": 8.692826764380662e-06, "loss": 0.0144, "reward": 1.05625, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.98125, "step": 2660 }, { "completion_length": 208.2875, "epoch": 0.58861694344363, "grad_norm": 0.4588795280012722, "kl": 0.34942626953125, "learning_rate": 8.654610792580415e-06, "loss": 0.014, "reward": 1.0125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.96875, "step": 2665 }, { "completion_length": 168.025, "epoch": 0.589721290429453, "grad_norm": 0.500593183120369, "kl": 0.36273193359375, "learning_rate": 8.616414821476048e-06, "loss": 0.0145, "reward": 1.025, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9625, "step": 2670 }, { "completion_length": 183.0375, "epoch": 0.5908256374152758, "grad_norm": 0.5184938308817854, "kl": 0.3274658203125, "learning_rate": 8.57823941889284e-06, "loss": 0.0131, "reward": 1.04375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.98125, "step": 2675 }, { "completion_length": 158.6, "epoch": 0.5919299844010988, "grad_norm": 0.2914033499028057, "kl": 0.289453125, "learning_rate": 8.54008515235029e-06, "loss": 0.0116, "reward": 1.14375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.99375, "step": 2680 }, { "completion_length": 166.40625, "epoch": 0.5930343313869217, "grad_norm": 0.32731329886172095, "kl": 0.30450439453125, "learning_rate": 8.501952589053694e-06, "loss": 0.0122, "reward": 1.05, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 1.0, "step": 2685 }, { "completion_length": 156.31875, "epoch": 0.5941386783727447, "grad_norm": 0.09256103441005661, "kl": 0.3406005859375, "learning_rate": 8.463842295885712e-06, "loss": 0.0136, "reward": 1.01875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.98125, "step": 2690 }, { "completion_length": 183.61875, "epoch": 0.5952430253585677, "grad_norm": 0.5104668071449497, "kl": 0.309075927734375, "learning_rate": 8.425754839397917e-06, "loss": 0.0124, "reward": 1.0875, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.96875, "step": 2695 }, { "completion_length": 191.0875, "epoch": 0.5963473723443906, "grad_norm": 0.47594505089710887, "kl": 0.30067138671875, "learning_rate": 8.387690785802403e-06, "loss": 0.012, "reward": 1.05625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.96875, "step": 2700 }, { "epoch": 0.5963473723443906, "eval_completion_length": 200.535, "eval_kl": 0.33861328125, "eval_loss": 0.013564695604145527, "eval_reward": 1.105, "eval_reward_std": 0.20506096243858338, "eval_rewards/accuracy_reward": 0.15, "eval_rewards/format_reward": 0.955, "eval_runtime": 104.0245, "eval_samples_per_second": 0.952, "eval_steps_per_second": 0.24, "step": 2700 }, { "completion_length": 181.2, "epoch": 0.5974517193302136, "grad_norm": 0.38636194379006417, "kl": 0.30714111328125, "learning_rate": 8.349650700963346e-06, "loss": 0.0123, "reward": 1.14375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.99375, "step": 2705 }, { "completion_length": 216.9, "epoch": 0.5985560663160365, "grad_norm": 0.21083257411179623, "kl": 0.30963134765625, "learning_rate": 8.311635150388607e-06, "loss": 0.0124, "reward": 1.025, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9625, "step": 2710 }, { "completion_length": 209.09375, "epoch": 0.5996604133018595, "grad_norm": 0.254195239435785, "kl": 0.34473876953125, "learning_rate": 8.273644699221309e-06, "loss": 0.0138, "reward": 1.0375, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.975, "step": 2715 }, { "completion_length": 210.60625, "epoch": 0.6007647602876824, "grad_norm": 0.2751894182054688, "kl": 0.315087890625, "learning_rate": 8.235679912231456e-06, "loss": 0.0126, "reward": 1.01875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.96875, "step": 2720 }, { "completion_length": 184.66875, "epoch": 0.6018691072735053, "grad_norm": 0.5013893987512199, "kl": 0.300457763671875, "learning_rate": 8.197741353807515e-06, "loss": 0.012, "reward": 1.1, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.98125, "step": 2725 }, { "completion_length": 209.50625, "epoch": 0.6029734542593282, "grad_norm": 0.2668494553767842, "kl": 0.343133544921875, "learning_rate": 8.159829587948048e-06, "loss": 0.0137, "reward": 1.075, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.95625, "step": 2730 }, { "completion_length": 217.9875, "epoch": 0.6040778012451512, "grad_norm": 1.1790834992221495, "kl": 0.34041748046875, "learning_rate": 8.1219451782533e-06, "loss": 0.0136, "reward": 1.01875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.95625, "step": 2735 }, { "completion_length": 199.21875, "epoch": 0.6051821482309742, "grad_norm": 0.38803314249531284, "kl": 0.31158447265625, "learning_rate": 8.084088687916853e-06, "loss": 0.0125, "reward": 1.04375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.975, "step": 2740 }, { "completion_length": 185.13125, "epoch": 0.6062864952167971, "grad_norm": 0.5365344661762009, "kl": 0.3183349609375, "learning_rate": 8.046260679717225e-06, "loss": 0.0127, "reward": 1.1, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.99375, "step": 2745 }, { "completion_length": 195.78125, "epoch": 0.6073908422026201, "grad_norm": 0.5602246756166963, "kl": 0.361181640625, "learning_rate": 8.00846171600952e-06, "loss": 0.0144, "reward": 1.05625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.96875, "step": 2750 }, { "completion_length": 174.98125, "epoch": 0.608495189188443, "grad_norm": 0.45096759701250927, "kl": 0.326898193359375, "learning_rate": 7.970692358717067e-06, "loss": 0.0131, "reward": 1.0625, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.99375, "step": 2755 }, { "completion_length": 219.3125, "epoch": 0.609599536174266, "grad_norm": 0.52626187470055, "kl": 0.349560546875, "learning_rate": 7.932953169323057e-06, "loss": 0.014, "reward": 1.06875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9875, "step": 2760 }, { "completion_length": 220.175, "epoch": 0.610703883160089, "grad_norm": 0.3604233546767503, "kl": 0.31280517578125, "learning_rate": 7.895244708862204e-06, "loss": 0.0125, "reward": 1.0625, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.99375, "step": 2765 }, { "completion_length": 207.06875, "epoch": 0.6118082301459118, "grad_norm": 0.5288120469694233, "kl": 0.33223876953125, "learning_rate": 7.857567537912404e-06, "loss": 0.0133, "reward": 1.05625, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9875, "step": 2770 }, { "completion_length": 236.71875, "epoch": 0.6129125771317347, "grad_norm": 0.10425504560684959, "kl": 0.32779541015625, "learning_rate": 7.8199222165864e-06, "loss": 0.0131, "reward": 1.05625, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9875, "step": 2775 }, { "completion_length": 207.39375, "epoch": 0.6140169241175577, "grad_norm": 0.2589149844532793, "kl": 0.283251953125, "learning_rate": 7.78230930452345e-06, "loss": 0.0113, "reward": 1.05, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 1.0, "step": 2780 }, { "completion_length": 242.43125, "epoch": 0.6151212711033807, "grad_norm": 0.22583571089563595, "kl": 0.27052001953125, "learning_rate": 7.744729360881023e-06, "loss": 0.0108, "reward": 1.11875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.99375, "step": 2785 }, { "completion_length": 231.89375, "epoch": 0.6162256180892036, "grad_norm": 0.4749669282378495, "kl": 0.278765869140625, "learning_rate": 7.70718294432646e-06, "loss": 0.0111, "reward": 1.08125, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 1.0, "step": 2790 }, { "completion_length": 245.79375, "epoch": 0.6173299650750266, "grad_norm": 0.3617583523975815, "kl": 0.27476806640625, "learning_rate": 7.669670613028705e-06, "loss": 0.011, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.98125, "step": 2795 }, { "completion_length": 227.44375, "epoch": 0.6184343120608495, "grad_norm": 0.2965537312612653, "kl": 0.28612060546875, "learning_rate": 7.632192924649969e-06, "loss": 0.0114, "reward": 1.08125, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 1.0, "step": 2800 }, { "epoch": 0.6184343120608495, "eval_completion_length": 230.735, "eval_kl": 0.3071484375, "eval_loss": 0.01229775045067072, "eval_reward": 1.095, "eval_reward_std": 0.13435028612613678, "eval_rewards/accuracy_reward": 0.11, "eval_rewards/format_reward": 0.985, "eval_runtime": 108.6588, "eval_samples_per_second": 0.911, "eval_steps_per_second": 0.23, "step": 2800 }, { "completion_length": 239.04375, "epoch": 0.6195386590466725, "grad_norm": 0.5214893247053585, "kl": 0.2989501953125, "learning_rate": 7.594750436337467e-06, "loss": 0.012, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 2805 }, { "completion_length": 233.93125, "epoch": 0.6206430060324954, "grad_norm": 0.4598794915978221, "kl": 0.29241943359375, "learning_rate": 7.557343704715121e-06, "loss": 0.0117, "reward": 1.05625, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.99375, "step": 2810 }, { "completion_length": 242.64375, "epoch": 0.6217473530183184, "grad_norm": 3.1244852402266345, "kl": 0.31943359375, "learning_rate": 7.519973285875303e-06, "loss": 0.0128, "reward": 1.0625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.975, "step": 2815 }, { "completion_length": 236.81875, "epoch": 0.6228517000041413, "grad_norm": 0.5414981338856611, "kl": 0.3600830078125, "learning_rate": 7.482639735370536e-06, "loss": 0.0144, "reward": 1.03125, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.98125, "step": 2820 }, { "completion_length": 279.33125, "epoch": 0.6239560469899642, "grad_norm": 0.8596187860646796, "kl": 0.40133056640625, "learning_rate": 7.445343608205273e-06, "loss": 0.0161, "reward": 1.01875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9375, "step": 2825 }, { "completion_length": 247.0875, "epoch": 0.6250603939757872, "grad_norm": 0.5780188447264637, "kl": 0.3927734375, "learning_rate": 7.408085458827612e-06, "loss": 0.0157, "reward": 1.05, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.95625, "step": 2830 }, { "completion_length": 249.1, "epoch": 0.6261647409616101, "grad_norm": 0.316789753877662, "kl": 0.38126220703125, "learning_rate": 7.37086584112108e-06, "loss": 0.0153, "reward": 1.05, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.95625, "step": 2835 }, { "completion_length": 252.71875, "epoch": 0.6272690879474331, "grad_norm": 0.4453525406044762, "kl": 0.37978515625, "learning_rate": 7.333685308396383e-06, "loss": 0.0152, "reward": 1.0125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9375, "step": 2840 }, { "completion_length": 195.29375, "epoch": 0.628373434933256, "grad_norm": 0.6694404625243146, "kl": 0.33199462890625, "learning_rate": 7.2965444133831905e-06, "loss": 0.0133, "reward": 1.04375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.975, "step": 2845 }, { "completion_length": 203.05, "epoch": 0.629477781919079, "grad_norm": 0.5384625756146423, "kl": 0.3214599609375, "learning_rate": 7.2594437082219074e-06, "loss": 0.0129, "reward": 1.05, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.975, "step": 2850 }, { "completion_length": 181.36875, "epoch": 0.630582128904902, "grad_norm": 0.5857866433127917, "kl": 0.31610107421875, "learning_rate": 7.222383744455477e-06, "loss": 0.0126, "reward": 1.10625, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 1.0, "step": 2855 }, { "completion_length": 192.7625, "epoch": 0.6316864758907249, "grad_norm": 0.5306577984285455, "kl": 0.2734375, "learning_rate": 7.185365073021171e-06, "loss": 0.0109, "reward": 1.10625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9875, "step": 2860 }, { "completion_length": 186.95625, "epoch": 0.6327908228765479, "grad_norm": 0.25146113191688085, "kl": 0.2825439453125, "learning_rate": 7.148388244242414e-06, "loss": 0.0113, "reward": 1.05625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 1.0, "step": 2865 }, { "completion_length": 188.88125, "epoch": 0.6338951698623707, "grad_norm": 0.29446615942878507, "kl": 0.28685302734375, "learning_rate": 7.111453807820587e-06, "loss": 0.0115, "reward": 1.0875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9875, "step": 2870 }, { "completion_length": 215.34375, "epoch": 0.6349995168481937, "grad_norm": 0.11271213250290812, "kl": 0.24910888671875, "learning_rate": 7.0745623128268605e-06, "loss": 0.01, "reward": 1.05, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9875, "step": 2875 }, { "completion_length": 220.21875, "epoch": 0.6361038638340166, "grad_norm": 0.13827817934352812, "kl": 0.265997314453125, "learning_rate": 7.037714307694038e-06, "loss": 0.0106, "reward": 1.05, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9875, "step": 2880 }, { "completion_length": 224.525, "epoch": 0.6372082108198396, "grad_norm": 0.3760142764874246, "kl": 0.30487060546875, "learning_rate": 7.000910340208393e-06, "loss": 0.0122, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.98125, "step": 2885 }, { "completion_length": 229.125, "epoch": 0.6383125578056625, "grad_norm": 0.4800060583908338, "kl": 0.28165283203125, "learning_rate": 6.964150957501538e-06, "loss": 0.0113, "reward": 1.0875, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.975, "step": 2890 }, { "completion_length": 259.29375, "epoch": 0.6394169047914855, "grad_norm": 0.46816183628445984, "kl": 0.2957763671875, "learning_rate": 6.927436706042276e-06, "loss": 0.0118, "reward": 1.11875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.9875, "step": 2895 }, { "completion_length": 290.36875, "epoch": 0.6405212517773085, "grad_norm": 0.4620742885523695, "kl": 0.291473388671875, "learning_rate": 6.890768131628492e-06, "loss": 0.0117, "reward": 1.04375, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.95625, "step": 2900 }, { "epoch": 0.6405212517773085, "eval_completion_length": 255.545, "eval_kl": 0.29716796875, "eval_loss": 0.01189742237329483, "eval_reward": 1.085, "eval_reward_std": 0.13435028612613678, "eval_rewards/accuracy_reward": 0.1, "eval_rewards/format_reward": 0.985, "eval_runtime": 124.9105, "eval_samples_per_second": 0.793, "eval_steps_per_second": 0.2, "step": 2900 }, { "completion_length": 284.0625, "epoch": 0.6416255987631314, "grad_norm": 0.3834368146178332, "kl": 0.2900390625, "learning_rate": 6.8541457793790204e-06, "loss": 0.0116, "reward": 1.0375, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9625, "step": 2905 }, { "completion_length": 251.69375, "epoch": 0.6427299457489544, "grad_norm": 0.2517910604330981, "kl": 0.2760498046875, "learning_rate": 6.8175701937255645e-06, "loss": 0.011, "reward": 1.03125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.975, "step": 2910 }, { "completion_length": 264.5, "epoch": 0.6438342927347772, "grad_norm": 0.2750608123545671, "kl": 0.2783935546875, "learning_rate": 6.781041918404578e-06, "loss": 0.0111, "reward": 1.08125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.975, "step": 2915 }, { "completion_length": 258.33125, "epoch": 0.6449386397206002, "grad_norm": 0.3654656487363651, "kl": 0.2917236328125, "learning_rate": 6.744561496449208e-06, "loss": 0.0117, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9875, "step": 2920 }, { "completion_length": 245.49375, "epoch": 0.6460429867064231, "grad_norm": 0.6036458859995204, "kl": 0.267669677734375, "learning_rate": 6.708129470181197e-06, "loss": 0.0107, "reward": 1.15, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 1.0, "step": 2925 }, { "completion_length": 265.6875, "epoch": 0.6471473336922461, "grad_norm": 0.7309833094166883, "kl": 0.26925048828125, "learning_rate": 6.671746381202835e-06, "loss": 0.0108, "reward": 1.08125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9875, "step": 2930 }, { "completion_length": 252.36875, "epoch": 0.648251680678069, "grad_norm": 0.42288290479080043, "kl": 0.2704833984375, "learning_rate": 6.635412770388911e-06, "loss": 0.0108, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "step": 2935 }, { "completion_length": 247.08125, "epoch": 0.649356027663892, "grad_norm": 0.49374444365360365, "kl": 0.27628173828125, "learning_rate": 6.5991291778786556e-06, "loss": 0.0111, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.98125, "step": 2940 }, { "completion_length": 233.7375, "epoch": 0.650460374649715, "grad_norm": 0.5239819846390583, "kl": 0.28341064453125, "learning_rate": 6.562896143067734e-06, "loss": 0.0113, "reward": 1.1125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9875, "step": 2945 }, { "completion_length": 255.7, "epoch": 0.6515647216355379, "grad_norm": 0.43425079053320653, "kl": 0.2722412109375, "learning_rate": 6.526714204600212e-06, "loss": 0.0109, "reward": 1.10625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.99375, "step": 2950 }, { "completion_length": 249.475, "epoch": 0.6526690686213609, "grad_norm": 0.25987542033307665, "kl": 0.27745361328125, "learning_rate": 6.490583900360543e-06, "loss": 0.0111, "reward": 1.06875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.98125, "step": 2955 }, { "completion_length": 273.275, "epoch": 0.6537734156071838, "grad_norm": 0.45996917419712435, "kl": 0.292822265625, "learning_rate": 6.4545057674655954e-06, "loss": 0.0117, "reward": 1.1, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.95625, "step": 2960 }, { "completion_length": 268.3625, "epoch": 0.6548777625930067, "grad_norm": 0.5624671363615396, "kl": 0.32144775390625, "learning_rate": 6.418480342256635e-06, "loss": 0.0129, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.95625, "step": 2965 }, { "completion_length": 269.64375, "epoch": 0.6559821095788296, "grad_norm": 0.32479361186684463, "kl": 0.27386474609375, "learning_rate": 6.38250816029139e-06, "loss": 0.011, "reward": 1.0375, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.94375, "step": 2970 }, { "completion_length": 232.7375, "epoch": 0.6570864565646526, "grad_norm": 0.4550467921271227, "kl": 0.25955810546875, "learning_rate": 6.34658975633605e-06, "loss": 0.0104, "reward": 1.09375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.9875, "step": 2975 }, { "completion_length": 237.9875, "epoch": 0.6581908035504755, "grad_norm": 0.5807970338558914, "kl": 0.282135009765625, "learning_rate": 6.310725664357349e-06, "loss": 0.0113, "reward": 1.05, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.95, "step": 2980 }, { "completion_length": 246.35625, "epoch": 0.6592951505362985, "grad_norm": 0.1538823043299478, "kl": 0.321832275390625, "learning_rate": 6.274916417514605e-06, "loss": 0.0129, "reward": 1.08125, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.98125, "step": 2985 }, { "completion_length": 265.53125, "epoch": 0.6603994975221215, "grad_norm": 0.41231875233030124, "kl": 0.287841796875, "learning_rate": 6.239162548151809e-06, "loss": 0.0115, "reward": 1.14375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.99375, "step": 2990 }, { "completion_length": 282.34375, "epoch": 0.6615038445079444, "grad_norm": 0.4537223091285934, "kl": 0.29276123046875, "learning_rate": 6.2034645877897e-06, "loss": 0.0117, "reward": 1.1125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.95625, "step": 2995 }, { "completion_length": 251.75, "epoch": 0.6626081914937674, "grad_norm": 0.5005714464427982, "kl": 0.30860595703125, "learning_rate": 6.167823067117868e-06, "loss": 0.0123, "reward": 1.1125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.975, "step": 3000 }, { "epoch": 0.6626081914937674, "eval_completion_length": 255.26, "eval_kl": 0.29595703125, "eval_loss": 0.011857852339744568, "eval_reward": 1.14, "eval_reward_std": 0.12727921843528747, "eval_rewards/accuracy_reward": 0.16, "eval_rewards/format_reward": 0.98, "eval_runtime": 109.8303, "eval_samples_per_second": 0.901, "eval_steps_per_second": 0.228, "step": 3000 }, { "completion_length": 300.43125, "epoch": 0.6637125384795903, "grad_norm": 0.32746973812290947, "kl": 0.308380126953125, "learning_rate": 6.132238515986868e-06, "loss": 0.0123, "reward": 1.04375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.94375, "step": 3005 }, { "completion_length": 243.6375, "epoch": 0.6648168854654133, "grad_norm": 0.4677149628391746, "kl": 0.294287109375, "learning_rate": 6.096711463400333e-06, "loss": 0.0118, "reward": 1.1125, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.98125, "step": 3010 }, { "completion_length": 247.98125, "epoch": 0.6659212324512361, "grad_norm": 0.5439611294896475, "kl": 0.281396484375, "learning_rate": 6.061242437507131e-06, "loss": 0.0113, "reward": 1.14375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.98125, "step": 3015 }, { "completion_length": 275.20625, "epoch": 0.6670255794370591, "grad_norm": 0.4749719977776336, "kl": 0.31934814453125, "learning_rate": 6.025831965593479e-06, "loss": 0.0128, "reward": 1.1125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.95625, "step": 3020 }, { "completion_length": 256.54375, "epoch": 0.668129926422882, "grad_norm": 0.3744944357004974, "kl": 0.32305908203125, "learning_rate": 5.990480574075143e-06, "loss": 0.0129, "reward": 1.00625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.94375, "step": 3025 }, { "completion_length": 245.075, "epoch": 0.669234273408705, "grad_norm": 0.36682653313436625, "kl": 0.2878173828125, "learning_rate": 5.955188788489583e-06, "loss": 0.0115, "reward": 1.0375, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.95, "step": 3030 }, { "completion_length": 228.79375, "epoch": 0.670338620394528, "grad_norm": 0.5471531880195812, "kl": 0.2614013671875, "learning_rate": 5.919957133488155e-06, "loss": 0.0105, "reward": 1.0625, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.975, "step": 3035 }, { "completion_length": 281.49375, "epoch": 0.6714429673803509, "grad_norm": 0.44721939894789436, "kl": 0.313287353515625, "learning_rate": 5.884786132828304e-06, "loss": 0.0125, "reward": 1.05625, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9375, "step": 3040 }, { "completion_length": 220.5375, "epoch": 0.6725473143661739, "grad_norm": 0.4903267420807382, "kl": 0.280303955078125, "learning_rate": 5.849676309365786e-06, "loss": 0.0112, "reward": 1.0625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.95625, "step": 3045 }, { "completion_length": 213.80625, "epoch": 0.6736516613519968, "grad_norm": 0.6637137632075046, "kl": 0.3021484375, "learning_rate": 5.814628185046884e-06, "loss": 0.0121, "reward": 1.075, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.94375, "step": 3050 }, { "completion_length": 193.11875, "epoch": 0.6747560083378198, "grad_norm": 0.48238872145583594, "kl": 0.3324951171875, "learning_rate": 5.779642280900668e-06, "loss": 0.0133, "reward": 1.05625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.96875, "step": 3055 }, { "completion_length": 157.91875, "epoch": 0.6758603553236426, "grad_norm": 0.3447631853492008, "kl": 0.324847412109375, "learning_rate": 5.744719117031217e-06, "loss": 0.013, "reward": 1.08125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.975, "step": 3060 }, { "completion_length": 142.34375, "epoch": 0.6769647023094656, "grad_norm": 0.2144181948765553, "kl": 0.333251953125, "learning_rate": 5.709859212609919e-06, "loss": 0.0133, "reward": 1.075, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.98125, "step": 3065 }, { "completion_length": 142.88125, "epoch": 0.6780690492952886, "grad_norm": 2.1794877397302637, "kl": 0.31337890625, "learning_rate": 5.675063085867747e-06, "loss": 0.0125, "reward": 1.0375, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.9875, "step": 3070 }, { "completion_length": 167.0625, "epoch": 0.6791733962811115, "grad_norm": 0.5227859895364816, "kl": 0.31148681640625, "learning_rate": 5.6403312540875325e-06, "loss": 0.0125, "reward": 1.075, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.98125, "step": 3075 }, { "completion_length": 183.3, "epoch": 0.6802777432669345, "grad_norm": 0.5037852453688297, "kl": 0.3081787109375, "learning_rate": 5.6056642335963e-06, "loss": 0.0123, "reward": 1.08125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.96875, "step": 3080 }, { "completion_length": 170.60625, "epoch": 0.6813820902527574, "grad_norm": 0.39645655269998004, "kl": 0.38388671875, "learning_rate": 5.571062539757582e-06, "loss": 0.0154, "reward": 1.0875, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9625, "step": 3085 }, { "completion_length": 178.66875, "epoch": 0.6824864372385804, "grad_norm": 0.3386051871333338, "kl": 0.353631591796875, "learning_rate": 5.536526686963762e-06, "loss": 0.0141, "reward": 1.08125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9625, "step": 3090 }, { "completion_length": 179.43125, "epoch": 0.6835907842244033, "grad_norm": 0.1674625268129377, "kl": 0.311474609375, "learning_rate": 5.50205718862841e-06, "loss": 0.0125, "reward": 1.09375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.9875, "step": 3095 }, { "completion_length": 198.5, "epoch": 0.6846951312102263, "grad_norm": 896.8738993114102, "kl": 5.80845947265625, "learning_rate": 5.467654557178679e-06, "loss": 0.2331, "reward": 1.0375, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9625, "step": 3100 }, { "epoch": 0.6846951312102263, "eval_completion_length": 212.23, "eval_kl": 0.383203125, "eval_loss": 0.015370451845228672, "eval_reward": 1.08, "eval_reward_std": 0.15556348919868468, "eval_rewards/accuracy_reward": 0.12, "eval_rewards/format_reward": 0.96, "eval_runtime": 115.3303, "eval_samples_per_second": 0.858, "eval_steps_per_second": 0.217, "step": 3100 }, { "completion_length": 191.40625, "epoch": 0.6857994781960493, "grad_norm": 0.23939982985583666, "kl": 0.347125244140625, "learning_rate": 5.433319304047666e-06, "loss": 0.0139, "reward": 1.0375, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.96875, "step": 3105 }, { "completion_length": 196.55, "epoch": 0.6869038251818721, "grad_norm": 0.35370609706150546, "kl": 0.33798828125, "learning_rate": 5.399051939666817e-06, "loss": 0.0135, "reward": 1.0875, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.975, "step": 3110 }, { "completion_length": 184.975, "epoch": 0.688008172167695, "grad_norm": 0.35760764478630763, "kl": 0.30341796875, "learning_rate": 5.36485297345833e-06, "loss": 0.0121, "reward": 1.1125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9875, "step": 3115 }, { "completion_length": 203.2, "epoch": 0.689112519153518, "grad_norm": 0.644558524459656, "kl": 0.292742919921875, "learning_rate": 5.330722913827594e-06, "loss": 0.0117, "reward": 1.0625, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.96875, "step": 3120 }, { "completion_length": 230.50625, "epoch": 0.690216866139341, "grad_norm": 0.3464456804053971, "kl": 0.27896728515625, "learning_rate": 5.29666226815563e-06, "loss": 0.0112, "reward": 1.1125, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.98125, "step": 3125 }, { "completion_length": 221.98125, "epoch": 0.6913212131251639, "grad_norm": 0.20402330430464163, "kl": 0.286383056640625, "learning_rate": 5.262671542791531e-06, "loss": 0.0115, "reward": 1.0, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.98125, "step": 3130 }, { "completion_length": 218.54375, "epoch": 0.6924255601109869, "grad_norm": 0.5547640484775744, "kl": 0.2820068359375, "learning_rate": 5.228751243044961e-06, "loss": 0.0113, "reward": 1.04375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9625, "step": 3135 }, { "completion_length": 207.03125, "epoch": 0.6935299070968098, "grad_norm": 0.5409399988975135, "kl": 0.27630615234375, "learning_rate": 5.194901873178622e-06, "loss": 0.0111, "reward": 1.075, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 1.0, "step": 3140 }, { "completion_length": 188.34375, "epoch": 0.6946342540826328, "grad_norm": 0.15055443349583902, "kl": 0.267608642578125, "learning_rate": 5.1611239364007694e-06, "loss": 0.0107, "reward": 1.075, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.98125, "step": 3145 }, { "completion_length": 183.2125, "epoch": 0.6957386010684558, "grad_norm": 0.4919745608219073, "kl": 0.27034912109375, "learning_rate": 5.127417934857718e-06, "loss": 0.0108, "reward": 1.08125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.975, "step": 3150 }, { "completion_length": 193.4875, "epoch": 0.6968429480542786, "grad_norm": 0.08730250985788413, "kl": 0.28955078125, "learning_rate": 5.093784369626397e-06, "loss": 0.0116, "reward": 1.1125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9875, "step": 3155 }, { "completion_length": 209.58125, "epoch": 0.6979472950401016, "grad_norm": 0.32846616068602047, "kl": 0.298028564453125, "learning_rate": 5.060223740706883e-06, "loss": 0.0119, "reward": 1.06875, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9875, "step": 3160 }, { "completion_length": 219.4625, "epoch": 0.6990516420259245, "grad_norm": 1.1444441188491123, "kl": 0.36575927734375, "learning_rate": 5.026736547014981e-06, "loss": 0.0146, "reward": 1.01875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.9625, "step": 3165 }, { "completion_length": 239.91875, "epoch": 0.7001559890117475, "grad_norm": 0.8099775678937355, "kl": 0.308984375, "learning_rate": 4.993323286374787e-06, "loss": 0.0124, "reward": 1.025, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.95, "step": 3170 }, { "completion_length": 219.4125, "epoch": 0.7012603359975704, "grad_norm": 0.4265583411464448, "kl": 0.29287109375, "learning_rate": 4.959984455511313e-06, "loss": 0.0117, "reward": 1.08125, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.99375, "step": 3175 }, { "completion_length": 231.3125, "epoch": 0.7023646829833934, "grad_norm": 0.454103943617819, "kl": 0.29195556640625, "learning_rate": 4.926720550043089e-06, "loss": 0.0117, "reward": 1.06875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.98125, "step": 3180 }, { "completion_length": 197.88125, "epoch": 0.7034690299692163, "grad_norm": 0.4885755590367494, "kl": 0.25499267578125, "learning_rate": 4.893532064474787e-06, "loss": 0.0102, "reward": 1.08125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 1.0, "step": 3185 }, { "completion_length": 227.78125, "epoch": 0.7045733769550393, "grad_norm": 0.2719638893519356, "kl": 0.270703125, "learning_rate": 4.860419492189886e-06, "loss": 0.0108, "reward": 1.06875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.975, "step": 3190 }, { "completion_length": 216.89375, "epoch": 0.7056777239408623, "grad_norm": 0.6224163001095495, "kl": 0.29207763671875, "learning_rate": 4.827383325443331e-06, "loss": 0.0117, "reward": 1.05, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.9625, "step": 3195 }, { "completion_length": 260.0, "epoch": 0.7067820709266852, "grad_norm": 0.4086697490241937, "kl": 0.273291015625, "learning_rate": 4.794424055354213e-06, "loss": 0.0109, "reward": 1.08125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9625, "step": 3200 }, { "epoch": 0.7067820709266852, "eval_completion_length": 267.11, "eval_kl": 0.30763671875, "eval_loss": 0.012311533093452454, "eval_reward": 1.09, "eval_reward_std": 0.21213203012943269, "eval_rewards/accuracy_reward": 0.14, "eval_rewards/format_reward": 0.95, "eval_runtime": 127.8367, "eval_samples_per_second": 0.774, "eval_steps_per_second": 0.196, "step": 3200 }, { "completion_length": 246.5875, "epoch": 0.7078864179125081, "grad_norm": 1.0355398852634203, "kl": 0.26212158203125, "learning_rate": 4.761542171898469e-06, "loss": 0.0105, "reward": 1.0625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.975, "step": 3205 }, { "completion_length": 282.29375, "epoch": 0.708990764898331, "grad_norm": 0.5137021254058943, "kl": 0.3453125, "learning_rate": 4.728738163901597e-06, "loss": 0.0138, "reward": 1.03125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.94375, "step": 3210 }, { "completion_length": 272.9, "epoch": 0.710095111884154, "grad_norm": 0.4557603593431217, "kl": 0.3007080078125, "learning_rate": 4.696012519031397e-06, "loss": 0.012, "reward": 1.06875, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95625, "step": 3215 }, { "completion_length": 300.46875, "epoch": 0.7111994588699769, "grad_norm": 0.9407435380966918, "kl": 0.3644775390625, "learning_rate": 4.663365723790698e-06, "loss": 0.0146, "reward": 1.0, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.90625, "step": 3220 }, { "completion_length": 282.19375, "epoch": 0.7123038058557999, "grad_norm": 0.48300892660454786, "kl": 0.3069580078125, "learning_rate": 4.630798263510162e-06, "loss": 0.0123, "reward": 1.05625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.95, "step": 3225 }, { "completion_length": 234.26875, "epoch": 0.7134081528416228, "grad_norm": 0.5153762702568343, "kl": 0.33831787109375, "learning_rate": 4.598310622341037e-06, "loss": 0.0135, "reward": 1.08125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9625, "step": 3230 }, { "completion_length": 283.26875, "epoch": 0.7145124998274458, "grad_norm": 0.2278004212721197, "kl": 0.2323486328125, "learning_rate": 4.565903283247981e-06, "loss": 0.0093, "reward": 1.13125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.975, "step": 3235 }, { "completion_length": 254.85, "epoch": 0.7156168468132688, "grad_norm": 0.4057412022574356, "kl": 0.224237060546875, "learning_rate": 4.533576728001858e-06, "loss": 0.009, "reward": 1.11875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.98125, "step": 3240 }, { "completion_length": 243.23125, "epoch": 0.7167211937990917, "grad_norm": 0.3936093803196274, "kl": 0.2511962890625, "learning_rate": 4.501331437172606e-06, "loss": 0.01, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9625, "step": 3245 }, { "completion_length": 261.04375, "epoch": 0.7178255407849147, "grad_norm": 0.15034549538860667, "kl": 0.28284912109375, "learning_rate": 4.469167890122073e-06, "loss": 0.0113, "reward": 1.03125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.94375, "step": 3250 }, { "completion_length": 270.925, "epoch": 0.7189298877707375, "grad_norm": 0.5028341851811142, "kl": 0.24959716796875, "learning_rate": 4.437086564996891e-06, "loss": 0.01, "reward": 1.0375, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.95625, "step": 3255 }, { "completion_length": 264.66875, "epoch": 0.7200342347565605, "grad_norm": 0.5870825869850653, "kl": 0.26156005859375, "learning_rate": 4.405087938721376e-06, "loss": 0.0105, "reward": 1.0375, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.93125, "step": 3260 }, { "completion_length": 333.51875, "epoch": 0.7211385817423834, "grad_norm": 0.5363985927856229, "kl": 0.268310546875, "learning_rate": 4.373172486990436e-06, "loss": 0.0107, "reward": 1.03125, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9125, "step": 3265 }, { "completion_length": 259.7625, "epoch": 0.7222429287282064, "grad_norm": 0.34294137834570276, "kl": 0.25391845703125, "learning_rate": 4.341340684262498e-06, "loss": 0.0102, "reward": 1.05625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9625, "step": 3270 }, { "completion_length": 279.15625, "epoch": 0.7233472757140293, "grad_norm": 0.45234678819267615, "kl": 0.2611572265625, "learning_rate": 4.309593003752446e-06, "loss": 0.0104, "reward": 1.0875, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.9375, "step": 3275 }, { "completion_length": 297.00625, "epoch": 0.7244516226998523, "grad_norm": 0.40479995606264946, "kl": 0.280712890625, "learning_rate": 4.277929917424602e-06, "loss": 0.0112, "reward": 1.0125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.925, "step": 3280 }, { "completion_length": 241.7875, "epoch": 0.7255559696856753, "grad_norm": 0.31716356544020063, "kl": 0.2287841796875, "learning_rate": 4.246351895985702e-06, "loss": 0.0091, "reward": 1.04375, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.9375, "step": 3285 }, { "completion_length": 254.69375, "epoch": 0.7266603166714982, "grad_norm": 0.3098099830382794, "kl": 0.24783935546875, "learning_rate": 4.214859408877899e-06, "loss": 0.0099, "reward": 1.05625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9625, "step": 3290 }, { "completion_length": 236.5875, "epoch": 0.7277646636573212, "grad_norm": 0.2207504226236474, "kl": 0.2484619140625, "learning_rate": 4.183452924271776e-06, "loss": 0.0099, "reward": 1.0375, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.96875, "step": 3295 }, { "completion_length": 224.6625, "epoch": 0.728869010643144, "grad_norm": 0.5845311907558509, "kl": 0.25625, "learning_rate": 4.152132909059402e-06, "loss": 0.0103, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.975, "step": 3300 }, { "epoch": 0.728869010643144, "eval_completion_length": 241.69, "eval_kl": 0.31572265625, "eval_loss": 0.012639479711651802, "eval_reward": 1.09, "eval_reward_std": 0.1414213538169861, "eval_rewards/accuracy_reward": 0.135, "eval_rewards/format_reward": 0.955, "eval_runtime": 118.6805, "eval_samples_per_second": 0.834, "eval_steps_per_second": 0.211, "step": 3300 }, { "completion_length": 215.5125, "epoch": 0.729973357628967, "grad_norm": 0.30909422545672033, "kl": 0.245867919921875, "learning_rate": 4.120899828847385e-06, "loss": 0.0098, "reward": 1.0875, "reward_std": 0.05303300768136978, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9875, "step": 3305 }, { "completion_length": 230.26875, "epoch": 0.7310777046147899, "grad_norm": 0.6453873653199322, "kl": 0.260516357421875, "learning_rate": 4.089754147949935e-06, "loss": 0.0104, "reward": 1.08125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.95625, "step": 3310 }, { "completion_length": 210.56875, "epoch": 0.7321820516006129, "grad_norm": 0.40594249764413265, "kl": 0.229119873046875, "learning_rate": 4.058696329381987e-06, "loss": 0.0092, "reward": 1.1125, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.98125, "step": 3315 }, { "completion_length": 212.29375, "epoch": 0.7332863985864359, "grad_norm": 0.38422267389292253, "kl": 0.2646240234375, "learning_rate": 4.027726834852303e-06, "loss": 0.0106, "reward": 1.0875, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9625, "step": 3320 }, { "completion_length": 232.43125, "epoch": 0.7343907455722588, "grad_norm": 0.5042182184524241, "kl": 0.2716796875, "learning_rate": 3.996846124756609e-06, "loss": 0.0109, "reward": 1.05, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.95, "step": 3325 }, { "completion_length": 219.50625, "epoch": 0.7354950925580818, "grad_norm": 0.5264628768885443, "kl": 0.272119140625, "learning_rate": 3.966054658170754e-06, "loss": 0.0109, "reward": 1.0875, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.95625, "step": 3330 }, { "completion_length": 223.0125, "epoch": 0.7365994395439047, "grad_norm": 0.2967573269006475, "kl": 0.258892822265625, "learning_rate": 3.93535289284388e-06, "loss": 0.0104, "reward": 1.075, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.96875, "step": 3335 }, { "completion_length": 256.3625, "epoch": 0.7377037865297277, "grad_norm": 0.35416855035423694, "kl": 0.2759521484375, "learning_rate": 3.904741285191629e-06, "loss": 0.011, "reward": 1.08125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.94375, "step": 3340 }, { "completion_length": 237.9875, "epoch": 0.7388081335155506, "grad_norm": 0.5938232640376352, "kl": 0.283270263671875, "learning_rate": 3.874220290289337e-06, "loss": 0.0113, "reward": 1.15625, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.19375, "rewards/format_reward": 0.9625, "step": 3345 }, { "completion_length": 249.93125, "epoch": 0.7399124805013735, "grad_norm": 0.25454486548911043, "kl": 0.254449462890625, "learning_rate": 3.8437903618652895e-06, "loss": 0.0102, "reward": 1.0625, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9625, "step": 3350 }, { "completion_length": 221.375, "epoch": 0.7410168274871964, "grad_norm": 0.2326519300763832, "kl": 0.24263916015625, "learning_rate": 3.8134519522939693e-06, "loss": 0.0097, "reward": 1.0875, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9875, "step": 3355 }, { "completion_length": 224.9375, "epoch": 0.7421211744730194, "grad_norm": 0.4822164383039262, "kl": 0.2813232421875, "learning_rate": 3.7832055125893318e-06, "loss": 0.0113, "reward": 1.10625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.96875, "step": 3360 }, { "completion_length": 230.09375, "epoch": 0.7432255214588424, "grad_norm": 0.6372609601101804, "kl": 0.298992919921875, "learning_rate": 3.753051492398089e-06, "loss": 0.012, "reward": 1.14375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.19375, "rewards/format_reward": 0.95, "step": 3365 }, { "completion_length": 219.25, "epoch": 0.7443298684446653, "grad_norm": 0.3081391116247598, "kl": 0.2989990234375, "learning_rate": 3.7229903399930423e-06, "loss": 0.012, "reward": 1.1, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.9625, "step": 3370 }, { "completion_length": 155.59375, "epoch": 0.7454342154304883, "grad_norm": 0.5909119308745682, "kl": 0.31363525390625, "learning_rate": 3.6930225022664136e-06, "loss": 0.0125, "reward": 1.11875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.98125, "step": 3375 }, { "completion_length": 187.8375, "epoch": 0.7465385624163112, "grad_norm": 0.34179198645052977, "kl": 0.3404541015625, "learning_rate": 3.6631484247231896e-06, "loss": 0.0136, "reward": 1.04375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.95, "step": 3380 }, { "completion_length": 212.6, "epoch": 0.7476429094021342, "grad_norm": 0.75092071011766, "kl": 0.36864013671875, "learning_rate": 3.6333685514745165e-06, "loss": 0.0147, "reward": 1.075, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.925, "step": 3385 }, { "completion_length": 193.43125, "epoch": 0.7487472563879571, "grad_norm": 0.329990173152014, "kl": 0.365625, "learning_rate": 3.6036833252310887e-06, "loss": 0.0146, "reward": 1.0625, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95, "step": 3390 }, { "completion_length": 201.61875, "epoch": 0.7498516033737801, "grad_norm": 0.47149722693689095, "kl": 0.37044677734375, "learning_rate": 3.574093187296568e-06, "loss": 0.0148, "reward": 1.075, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.95, "step": 3395 }, { "completion_length": 210.40625, "epoch": 0.7509559503596029, "grad_norm": 0.6186398549798994, "kl": 0.31754150390625, "learning_rate": 3.544598577561016e-06, "loss": 0.0127, "reward": 1.06875, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.96875, "step": 3400 }, { "epoch": 0.7509559503596029, "eval_completion_length": 217.785, "eval_kl": 0.5054296875, "eval_loss": 0.020201342180371284, "eval_reward": 1.095, "eval_reward_std": 0.162634556889534, "eval_rewards/accuracy_reward": 0.13, "eval_rewards/format_reward": 0.965, "eval_runtime": 115.0896, "eval_samples_per_second": 0.86, "eval_steps_per_second": 0.217, "step": 3400 }, { "completion_length": 220.7875, "epoch": 0.7520602973454259, "grad_norm": 0.502156340630982, "kl": 0.31522216796875, "learning_rate": 3.515199934494373e-06, "loss": 0.0126, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "step": 3405 }, { "completion_length": 235.6625, "epoch": 0.7531646443312489, "grad_norm": 0.405309226466982, "kl": 0.34342041015625, "learning_rate": 3.4858976951399237e-06, "loss": 0.0137, "reward": 1.075, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.9625, "step": 3410 }, { "completion_length": 207.125, "epoch": 0.7542689913170718, "grad_norm": 0.42471895189637104, "kl": 0.37327880859375, "learning_rate": 3.4566922951078086e-06, "loss": 0.0149, "reward": 1.10625, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.9625, "step": 3415 }, { "completion_length": 204.0625, "epoch": 0.7553733383028948, "grad_norm": 0.328073526920033, "kl": 0.277392578125, "learning_rate": 3.427584168568535e-06, "loss": 0.0111, "reward": 1.10625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.975, "step": 3420 }, { "completion_length": 205.0875, "epoch": 0.7564776852887177, "grad_norm": 0.5369831637398775, "kl": 0.2722412109375, "learning_rate": 3.398573748246544e-06, "loss": 0.0109, "reward": 1.175, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9875, "step": 3425 }, { "completion_length": 271.4875, "epoch": 0.7575820322745407, "grad_norm": 0.5616908933906252, "kl": 0.249615478515625, "learning_rate": 3.3696614654137637e-06, "loss": 0.01, "reward": 0.9625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.93125, "step": 3430 }, { "completion_length": 246.775, "epoch": 0.7586863792603636, "grad_norm": 0.602386946053219, "kl": 0.25421142578125, "learning_rate": 3.3408477498831917e-06, "loss": 0.0102, "reward": 1.1375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.18125, "rewards/format_reward": 0.95625, "step": 3435 }, { "completion_length": 227.56875, "epoch": 0.7597907262461866, "grad_norm": 0.5462886025152357, "kl": 0.259625244140625, "learning_rate": 3.3121330300025222e-06, "loss": 0.0104, "reward": 1.1, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.975, "step": 3440 }, { "completion_length": 217.3875, "epoch": 0.7608950732320094, "grad_norm": 0.513835490839363, "kl": 0.26868896484375, "learning_rate": 3.2835177326477675e-06, "loss": 0.0108, "reward": 1.1125, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9875, "step": 3445 }, { "completion_length": 221.81875, "epoch": 0.7619994202178324, "grad_norm": 0.44817623889299235, "kl": 0.251348876953125, "learning_rate": 3.2550022832169125e-06, "loss": 0.0101, "reward": 1.05, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.96875, "step": 3450 }, { "completion_length": 201.45625, "epoch": 0.7631037672036554, "grad_norm": 0.3221696210681099, "kl": 0.258599853515625, "learning_rate": 3.2265871056235974e-06, "loss": 0.0103, "reward": 1.0875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.99375, "step": 3455 }, { "completion_length": 200.58125, "epoch": 0.7642081141894783, "grad_norm": 0.748323660705002, "kl": 0.27359619140625, "learning_rate": 3.1982726222908046e-06, "loss": 0.0109, "reward": 1.11875, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.975, "step": 3460 }, { "completion_length": 239.84375, "epoch": 0.7653124611753013, "grad_norm": 0.40897691141192144, "kl": 0.24927978515625, "learning_rate": 3.170059254144593e-06, "loss": 0.01, "reward": 1.06875, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.98125, "step": 3465 }, { "completion_length": 220.5125, "epoch": 0.7664168081611242, "grad_norm": 0.32005616347994614, "kl": 0.26856689453125, "learning_rate": 3.1419474206078203e-06, "loss": 0.0107, "reward": 1.1625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.18125, "rewards/format_reward": 0.98125, "step": 3470 }, { "completion_length": 235.375, "epoch": 0.7675211551469472, "grad_norm": 0.38318241182760876, "kl": 0.2571533203125, "learning_rate": 3.113937539593931e-06, "loss": 0.0103, "reward": 1.09375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.9875, "step": 3475 }, { "completion_length": 265.41875, "epoch": 0.7686255021327701, "grad_norm": 0.5005202602287694, "kl": 0.2830078125, "learning_rate": 3.086030027500728e-06, "loss": 0.0113, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.98125, "step": 3480 }, { "completion_length": 263.2875, "epoch": 0.7697298491185931, "grad_norm": 0.07189820608786429, "kl": 0.28231201171875, "learning_rate": 3.058225299204195e-06, "loss": 0.0113, "reward": 1.05625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.98125, "step": 3485 }, { "completion_length": 255.85, "epoch": 0.7708341961044161, "grad_norm": 0.5124546790054572, "kl": 0.28914794921875, "learning_rate": 3.0305237680523046e-06, "loss": 0.0116, "reward": 1.1, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.98125, "step": 3490 }, { "completion_length": 243.6625, "epoch": 0.7719385430902389, "grad_norm": 0.2684526887471308, "kl": 0.257568359375, "learning_rate": 3.002925845858905e-06, "loss": 0.0103, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.98125, "step": 3495 }, { "completion_length": 279.725, "epoch": 0.7730428900760619, "grad_norm": 0.45994470057081904, "kl": 0.265185546875, "learning_rate": 2.9754319428975796e-06, "loss": 0.0106, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.9625, "step": 3500 }, { "epoch": 0.7730428900760619, "eval_completion_length": 227.09, "eval_kl": 0.26388671875, "eval_loss": 0.010543497279286385, "eval_reward": 1.095, "eval_reward_std": 0.1484924215078354, "eval_rewards/accuracy_reward": 0.115, "eval_rewards/format_reward": 0.98, "eval_runtime": 102.2929, "eval_samples_per_second": 0.968, "eval_steps_per_second": 0.244, "step": 3500 }, { "completion_length": 277.59375, "epoch": 0.7741472370618848, "grad_norm": 0.4448459387591611, "kl": 0.287054443359375, "learning_rate": 2.948042467895544e-06, "loss": 0.0115, "reward": 1.05625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9625, "step": 3505 }, { "completion_length": 259.24375, "epoch": 0.7752515840477078, "grad_norm": 0.4966119961041164, "kl": 0.29287109375, "learning_rate": 2.920757828027586e-06, "loss": 0.0117, "reward": 1.03125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9625, "step": 3510 }, { "completion_length": 286.8, "epoch": 0.7763559310335307, "grad_norm": 0.484519618170077, "kl": 0.2783935546875, "learning_rate": 2.893578428909998e-06, "loss": 0.0111, "reward": 1.10625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.9625, "step": 3515 }, { "completion_length": 236.9875, "epoch": 0.7774602780193537, "grad_norm": 0.5777828810239061, "kl": 0.27120361328125, "learning_rate": 2.8665046745945555e-06, "loss": 0.0109, "reward": 1.1, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.95625, "step": 3520 }, { "completion_length": 269.5625, "epoch": 0.7785646250051766, "grad_norm": 0.21783452027900907, "kl": 0.25440673828125, "learning_rate": 2.839536967562504e-06, "loss": 0.0102, "reward": 1.10625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.98125, "step": 3525 }, { "completion_length": 254.6625, "epoch": 0.7796689719909996, "grad_norm": 0.5865883286348121, "kl": 0.233740234375, "learning_rate": 2.8126757087185797e-06, "loss": 0.0093, "reward": 1.1, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.975, "step": 3530 }, { "completion_length": 280.525, "epoch": 0.7807733189768226, "grad_norm": 0.43883058743962794, "kl": 0.271136474609375, "learning_rate": 2.7859212973850535e-06, "loss": 0.0108, "reward": 1.075, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.9375, "step": 3535 }, { "completion_length": 205.775, "epoch": 0.7818776659626455, "grad_norm": 0.606176189949368, "kl": 0.27373046875, "learning_rate": 2.759274131295787e-06, "loss": 0.0109, "reward": 1.11875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.975, "step": 3540 }, { "completion_length": 260.25, "epoch": 0.7829820129484684, "grad_norm": 0.3955866589392802, "kl": 0.270849609375, "learning_rate": 2.732734606590318e-06, "loss": 0.0108, "reward": 1.03125, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.93125, "step": 3545 }, { "completion_length": 262.69375, "epoch": 0.7840863599342913, "grad_norm": 0.3782847315356218, "kl": 0.275299072265625, "learning_rate": 2.7063031178079847e-06, "loss": 0.011, "reward": 1.0625, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.94375, "step": 3550 }, { "completion_length": 250.175, "epoch": 0.7851907069201143, "grad_norm": 0.27828806764961916, "kl": 0.2863525390625, "learning_rate": 2.679980057882049e-06, "loss": 0.0115, "reward": 1.00625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9375, "step": 3555 }, { "completion_length": 235.0625, "epoch": 0.7862950539059372, "grad_norm": 0.7313042767403699, "kl": 0.284637451171875, "learning_rate": 2.6537658181338534e-06, "loss": 0.0114, "reward": 1.08125, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.9375, "step": 3560 }, { "completion_length": 222.3625, "epoch": 0.7873994008917602, "grad_norm": 0.40644071357218936, "kl": 0.286981201171875, "learning_rate": 2.6276607882670135e-06, "loss": 0.0115, "reward": 1.075, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.9625, "step": 3565 }, { "completion_length": 231.09375, "epoch": 0.7885037478775831, "grad_norm": 0.6111130783275136, "kl": 0.306976318359375, "learning_rate": 2.60166535636162e-06, "loss": 0.0123, "reward": 1.1375, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.18125, "rewards/format_reward": 0.95625, "step": 3570 }, { "completion_length": 216.48125, "epoch": 0.7896080948634061, "grad_norm": 0.49711551639970475, "kl": 0.25562744140625, "learning_rate": 2.5757799088684654e-06, "loss": 0.0102, "reward": 1.1875, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.19375, "rewards/format_reward": 0.99375, "step": 3575 }, { "completion_length": 218.78125, "epoch": 0.7907124418492291, "grad_norm": 0.31764284101121393, "kl": 0.30023193359375, "learning_rate": 2.5500048306033065e-06, "loss": 0.012, "reward": 1.06875, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95625, "step": 3580 }, { "completion_length": 216.31875, "epoch": 0.791816788835052, "grad_norm": 0.4787122183538524, "kl": 0.2999267578125, "learning_rate": 2.5243405047411353e-06, "loss": 0.012, "reward": 1.09375, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.975, "step": 3585 }, { "completion_length": 249.625, "epoch": 0.7929211358208749, "grad_norm": 0.3111958011260876, "kl": 0.28712158203125, "learning_rate": 2.498787312810492e-06, "loss": 0.0115, "reward": 1.05, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.94375, "step": 3590 }, { "completion_length": 231.98125, "epoch": 0.7940254828066978, "grad_norm": 0.39194633386336397, "kl": 0.28707275390625, "learning_rate": 2.4733456346877817e-06, "loss": 0.0115, "reward": 1.05, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.95625, "step": 3595 }, { "completion_length": 175.5375, "epoch": 0.7951298297925208, "grad_norm": 0.42822142675950153, "kl": 0.3134765625, "learning_rate": 2.448015848591638e-06, "loss": 0.0125, "reward": 1.10625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.98125, "step": 3600 }, { "epoch": 0.7951298297925208, "eval_completion_length": 171.885, "eval_kl": 0.3194921875, "eval_loss": 0.012775387614965439, "eval_reward": 1.14, "eval_reward_std": 0.15556348919868468, "eval_rewards/accuracy_reward": 0.16, "eval_rewards/format_reward": 0.98, "eval_runtime": 92.7145, "eval_samples_per_second": 1.068, "eval_steps_per_second": 0.27, "step": 3600 }, { "completion_length": 201.60625, "epoch": 0.7962341767783437, "grad_norm": 0.5464532193593336, "kl": 0.325775146484375, "learning_rate": 2.4227983310772963e-06, "loss": 0.013, "reward": 1.075, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.95625, "step": 3605 }, { "completion_length": 176.4625, "epoch": 0.7973385237641667, "grad_norm": 0.5021094659088707, "kl": 0.36162109375, "learning_rate": 2.3976934570309974e-06, "loss": 0.0145, "reward": 1.1125, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.96875, "step": 3610 }, { "completion_length": 167.06875, "epoch": 0.7984428707499897, "grad_norm": 0.3403630562580807, "kl": 0.325738525390625, "learning_rate": 2.3727015996644043e-06, "loss": 0.013, "reward": 1.125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.98125, "step": 3615 }, { "completion_length": 138.4625, "epoch": 0.7995472177358126, "grad_norm": 0.6077367973458568, "kl": 0.333203125, "learning_rate": 2.3478231305090694e-06, "loss": 0.0133, "reward": 1.13125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.99375, "step": 3620 }, { "completion_length": 177.075, "epoch": 0.8006515647216356, "grad_norm": 0.4466360553445801, "kl": 0.325604248046875, "learning_rate": 2.3230584194109074e-06, "loss": 0.013, "reward": 1.11875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.98125, "step": 3625 }, { "completion_length": 161.88125, "epoch": 0.8017559117074585, "grad_norm": 0.3897741643622985, "kl": 0.345849609375, "learning_rate": 2.298407834524682e-06, "loss": 0.0138, "reward": 1.0875, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.975, "step": 3630 }, { "completion_length": 166.9, "epoch": 0.8028602586932815, "grad_norm": 2.30034414615901, "kl": 0.372119140625, "learning_rate": 2.2738717423085543e-06, "loss": 0.0149, "reward": 1.10625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.98125, "step": 3635 }, { "completion_length": 238.34375, "epoch": 0.8039646056791043, "grad_norm": 0.6076643483832027, "kl": 0.309075927734375, "learning_rate": 2.2494505075186234e-06, "loss": 0.0124, "reward": 1.0875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.16875, "rewards/format_reward": 0.91875, "step": 3640 }, { "completion_length": 181.86875, "epoch": 0.8050689526649273, "grad_norm": 0.2992763139298062, "kl": 0.269927978515625, "learning_rate": 2.2251444932035094e-06, "loss": 0.0108, "reward": 1.125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.98125, "step": 3645 }, { "completion_length": 164.825, "epoch": 0.8061732996507502, "grad_norm": 0.6026739836434083, "kl": 0.284381103515625, "learning_rate": 2.200954060698941e-06, "loss": 0.0114, "reward": 1.11875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.96875, "step": 3650 }, { "completion_length": 230.73125, "epoch": 0.8072776466365732, "grad_norm": 0.48565298064687734, "kl": 0.30531005859375, "learning_rate": 2.176879569622409e-06, "loss": 0.0122, "reward": 1.075, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.95625, "step": 3655 }, { "completion_length": 242.775, "epoch": 0.8083819936223962, "grad_norm": 0.21896055218236896, "kl": 0.2802001953125, "learning_rate": 2.1529213778677993e-06, "loss": 0.0112, "reward": 1.025, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.94375, "step": 3660 }, { "completion_length": 229.15, "epoch": 0.8094863406082191, "grad_norm": 0.14196401938191486, "kl": 0.259161376953125, "learning_rate": 2.1290798416000857e-06, "loss": 0.0104, "reward": 1.1, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.95625, "step": 3665 }, { "completion_length": 267.93125, "epoch": 0.8105906875940421, "grad_norm": 0.5415824445762728, "kl": 0.263916015625, "learning_rate": 2.1053553152500204e-06, "loss": 0.0106, "reward": 1.0125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.91875, "step": 3670 }, { "completion_length": 234.425, "epoch": 0.811695034579865, "grad_norm": 0.44188434661367404, "kl": 0.27322998046875, "learning_rate": 2.081748151508883e-06, "loss": 0.0109, "reward": 1.075, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.95625, "step": 3675 }, { "completion_length": 223.39375, "epoch": 0.812799381565688, "grad_norm": 0.29953298263136474, "kl": 0.2898193359375, "learning_rate": 2.0582587013232268e-06, "loss": 0.0116, "reward": 1.05625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.95625, "step": 3680 }, { "completion_length": 242.9125, "epoch": 0.8139037285515108, "grad_norm": 0.5105270540146248, "kl": 0.28282470703125, "learning_rate": 2.0348873138896563e-06, "loss": 0.0113, "reward": 1.0, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.05, "rewards/format_reward": 0.95, "step": 3685 }, { "completion_length": 225.31875, "epoch": 0.8150080755373338, "grad_norm": 0.3698502677044578, "kl": 0.252008056640625, "learning_rate": 2.0116343366496493e-06, "loss": 0.0101, "reward": 1.0625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9625, "step": 3690 }, { "completion_length": 236.0875, "epoch": 0.8161124225231567, "grad_norm": 1.4290601982893592, "kl": 0.321392822265625, "learning_rate": 1.988500115284385e-06, "loss": 0.0129, "reward": 1.0375, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.95625, "step": 3695 }, { "completion_length": 211.28125, "epoch": 0.8172167695089797, "grad_norm": 0.3911358009799874, "kl": 0.278375244140625, "learning_rate": 1.9654849937096033e-06, "loss": 0.0111, "reward": 1.0625, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.975, "step": 3700 }, { "epoch": 0.8172167695089797, "eval_completion_length": 202.245, "eval_kl": 0.299296875, "eval_loss": 0.011967692524194717, "eval_reward": 1.13, "eval_reward_std": 0.11313708305358887, "eval_rewards/accuracy_reward": 0.14, "eval_rewards/format_reward": 0.99, "eval_runtime": 97.9771, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.255, "step": 3700 }, { "completion_length": 245.05, "epoch": 0.8183211164948027, "grad_norm": 0.6857212516430605, "kl": 0.283929443359375, "learning_rate": 1.942589314070494e-06, "loss": 0.0114, "reward": 1.05, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.94375, "step": 3705 }, { "completion_length": 207.8, "epoch": 0.8194254634806256, "grad_norm": 0.8931067063405094, "kl": 0.33228759765625, "learning_rate": 1.9198134167366156e-06, "loss": 0.0133, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.96875, "step": 3710 }, { "completion_length": 231.15625, "epoch": 0.8205298104664486, "grad_norm": 4.939225391817678, "kl": 0.328973388671875, "learning_rate": 1.897157640296825e-06, "loss": 0.0131, "reward": 1.06875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95625, "step": 3715 }, { "completion_length": 213.325, "epoch": 0.8216341574522715, "grad_norm": 0.5141296496399171, "kl": 0.280364990234375, "learning_rate": 1.8746223215542482e-06, "loss": 0.0112, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "step": 3720 }, { "completion_length": 216.3625, "epoch": 0.8227385044380945, "grad_norm": 0.49407058755769534, "kl": 0.245660400390625, "learning_rate": 1.8522077955212791e-06, "loss": 0.0098, "reward": 1.1375, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.98125, "step": 3725 }, { "completion_length": 206.43125, "epoch": 0.8238428514239174, "grad_norm": 0.2188098942709737, "kl": 0.278680419921875, "learning_rate": 1.8299143954145926e-06, "loss": 0.0111, "reward": 1.1, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.95, "step": 3730 }, { "completion_length": 211.05, "epoch": 0.8249471984097403, "grad_norm": 0.8180293925174863, "kl": 0.28306884765625, "learning_rate": 1.8077424526501964e-06, "loss": 0.0113, "reward": 1.0875, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.96875, "step": 3735 }, { "completion_length": 216.4625, "epoch": 0.8260515453955632, "grad_norm": 0.6158285951662569, "kl": 0.28001708984375, "learning_rate": 1.7856922968384926e-06, "loss": 0.0112, "reward": 1.0875, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.95625, "step": 3740 }, { "completion_length": 238.0125, "epoch": 0.8271558923813862, "grad_norm": 0.615093259382316, "kl": 0.301104736328125, "learning_rate": 1.763764255779392e-06, "loss": 0.012, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.94375, "step": 3745 }, { "completion_length": 212.09375, "epoch": 0.8282602393672092, "grad_norm": 0.5625866842898283, "kl": 0.2462158203125, "learning_rate": 1.7419586554574364e-06, "loss": 0.0098, "reward": 1.14375, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.175, "rewards/format_reward": 0.96875, "step": 3750 }, { "completion_length": 244.36875, "epoch": 0.8293645863530321, "grad_norm": 0.6830147990367013, "kl": 0.36268310546875, "learning_rate": 1.720275820036944e-06, "loss": 0.0145, "reward": 1.05, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.925, "step": 3755 }, { "completion_length": 185.325, "epoch": 0.8304689333388551, "grad_norm": 0.09647577383094562, "kl": 0.28330078125, "learning_rate": 1.6987160718572027e-06, "loss": 0.0113, "reward": 1.10625, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.975, "step": 3760 }, { "completion_length": 169.075, "epoch": 0.831573280324678, "grad_norm": 0.42190186308598165, "kl": 0.2713623046875, "learning_rate": 1.6772797314276712e-06, "loss": 0.0109, "reward": 1.11875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.98125, "step": 3765 }, { "completion_length": 209.425, "epoch": 0.832677627310501, "grad_norm": 0.4838527676338876, "kl": 0.3133056640625, "learning_rate": 1.6559671174232195e-06, "loss": 0.0125, "reward": 1.03125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.95, "step": 3770 }, { "completion_length": 196.15625, "epoch": 0.833781974296324, "grad_norm": 0.541789987335856, "kl": 0.284918212890625, "learning_rate": 1.6347785466793764e-06, "loss": 0.0114, "reward": 1.13125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.98125, "step": 3775 }, { "completion_length": 189.86875, "epoch": 0.8348863212821469, "grad_norm": 0.37944243070397565, "kl": 0.305487060546875, "learning_rate": 1.6137143341876439e-06, "loss": 0.0122, "reward": 1.09375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.99375, "step": 3780 }, { "completion_length": 192.91875, "epoch": 0.8359906682679697, "grad_norm": 0.4016733182249456, "kl": 0.266162109375, "learning_rate": 1.5927747930907921e-06, "loss": 0.0106, "reward": 1.08125, "reward_std": 0.06187184229493141, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9875, "step": 3785 }, { "completion_length": 199.01875, "epoch": 0.8370950152537927, "grad_norm": 0.555504541233714, "kl": 0.31944580078125, "learning_rate": 1.5719602346782215e-06, "loss": 0.0128, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9625, "step": 3790 }, { "completion_length": 191.89375, "epoch": 0.8381993622396157, "grad_norm": 0.6714397875203784, "kl": 0.400775146484375, "learning_rate": 1.5512709683813165e-06, "loss": 0.016, "reward": 1.15, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9625, "step": 3795 }, { "completion_length": 211.68125, "epoch": 0.8393037092254386, "grad_norm": 0.3384649202970216, "kl": 0.278900146484375, "learning_rate": 1.5307073017688644e-06, "loss": 0.0112, "reward": 1.03125, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.9625, "step": 3800 }, { "epoch": 0.8393037092254386, "eval_completion_length": 195.9, "eval_kl": 0.30609375, "eval_loss": 0.012239097617566586, "eval_reward": 1.095, "eval_reward_std": 0.14849242091178894, "eval_rewards/accuracy_reward": 0.13, "eval_rewards/format_reward": 0.965, "eval_runtime": 100.9941, "eval_samples_per_second": 0.98, "eval_steps_per_second": 0.248, "step": 3800 }, { "completion_length": 217.4375, "epoch": 0.8404080562112616, "grad_norm": 0.6534309102123147, "kl": 0.330548095703125, "learning_rate": 1.5102695405424738e-06, "loss": 0.0132, "reward": 1.08125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.95625, "step": 3805 }, { "completion_length": 207.18125, "epoch": 0.8415124031970845, "grad_norm": 0.35957581300760044, "kl": 0.358197021484375, "learning_rate": 1.4899579885320237e-06, "loss": 0.0143, "reward": 1.0875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.94375, "step": 3810 }, { "completion_length": 222.85625, "epoch": 0.8426167501829075, "grad_norm": 0.5604436470793626, "kl": 0.305682373046875, "learning_rate": 1.4697729476911614e-06, "loss": 0.0122, "reward": 1.13125, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.18125, "rewards/format_reward": 0.95, "step": 3815 }, { "completion_length": 202.29375, "epoch": 0.8437210971687304, "grad_norm": 0.4206816591710824, "kl": 0.2802001953125, "learning_rate": 1.449714718092803e-06, "loss": 0.0112, "reward": 1.08125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.98125, "step": 3820 }, { "completion_length": 205.2375, "epoch": 0.8448254441545534, "grad_norm": 0.16670952436146919, "kl": 0.273931884765625, "learning_rate": 1.4297835979246777e-06, "loss": 0.011, "reward": 1.075, "reward_std": 0.07071067690849304, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.9875, "step": 3825 }, { "completion_length": 238.1875, "epoch": 0.8459297911403763, "grad_norm": 0.613398892634411, "kl": 0.276251220703125, "learning_rate": 1.4099798834848855e-06, "loss": 0.0111, "reward": 1.10625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.95, "step": 3830 }, { "completion_length": 261.0375, "epoch": 0.8470341381261992, "grad_norm": 0.47116863545999577, "kl": 0.307647705078125, "learning_rate": 1.3903038691775095e-06, "loss": 0.0123, "reward": 1.09375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.94375, "step": 3835 }, { "completion_length": 211.49375, "epoch": 0.8481384851120222, "grad_norm": 0.3456577853406588, "kl": 0.296746826171875, "learning_rate": 1.370755847508226e-06, "loss": 0.0119, "reward": 1.1125, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.975, "step": 3840 }, { "completion_length": 210.94375, "epoch": 0.8492428320978451, "grad_norm": 0.3986816328493071, "kl": 0.29635009765625, "learning_rate": 1.3513361090799537e-06, "loss": 0.0119, "reward": 1.1, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.96875, "step": 3845 }, { "completion_length": 216.41875, "epoch": 0.8503471790836681, "grad_norm": 0.36582880270044166, "kl": 0.265789794921875, "learning_rate": 1.332044942588545e-06, "loss": 0.0106, "reward": 1.14375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.18125, "rewards/format_reward": 0.9625, "step": 3850 }, { "completion_length": 196.1875, "epoch": 0.851451526069491, "grad_norm": 0.43079415258986453, "kl": 0.3136474609375, "learning_rate": 1.3128826348184886e-06, "loss": 0.0125, "reward": 1.1625, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.19375, "rewards/format_reward": 0.96875, "step": 3855 }, { "completion_length": 205.025, "epoch": 0.852555873055314, "grad_norm": 0.26604232127523036, "kl": 0.30478515625, "learning_rate": 1.2938494706386462e-06, "loss": 0.0122, "reward": 1.0875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.96875, "step": 3860 }, { "completion_length": 218.6, "epoch": 0.853660220041137, "grad_norm": 0.4745459719079689, "kl": 0.239349365234375, "learning_rate": 1.2749457329980108e-06, "loss": 0.0096, "reward": 1.13125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.975, "step": 3865 }, { "completion_length": 221.925, "epoch": 0.8547645670269599, "grad_norm": 1.0389906784554162, "kl": 0.282757568359375, "learning_rate": 1.256171702921516e-06, "loss": 0.0113, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.9625, "step": 3870 }, { "completion_length": 227.575, "epoch": 0.8558689140127829, "grad_norm": 0.5325421261443589, "kl": 0.273663330078125, "learning_rate": 1.237527659505846e-06, "loss": 0.0109, "reward": 1.0625, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95, "step": 3875 }, { "completion_length": 253.325, "epoch": 0.8569732609986057, "grad_norm": 0.19847592400408953, "kl": 0.26175537109375, "learning_rate": 1.2190138799152851e-06, "loss": 0.0105, "reward": 1.05, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.94375, "step": 3880 }, { "completion_length": 252.20625, "epoch": 0.8580776079844287, "grad_norm": 0.6511065620949663, "kl": 0.270318603515625, "learning_rate": 1.200630639377609e-06, "loss": 0.0108, "reward": 1.0625, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.94375, "step": 3885 }, { "completion_length": 222.19375, "epoch": 0.8591819549702516, "grad_norm": 0.510836488773378, "kl": 0.246490478515625, "learning_rate": 1.1823782111799843e-06, "loss": 0.0099, "reward": 1.1375, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.18125, "rewards/format_reward": 0.95625, "step": 3890 }, { "completion_length": 252.8125, "epoch": 0.8602863019560746, "grad_norm": 0.4973937206692706, "kl": 0.240765380859375, "learning_rate": 1.1642568666649067e-06, "loss": 0.0096, "reward": 1.08125, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.94375, "step": 3895 }, { "completion_length": 235.05625, "epoch": 0.8613906489418975, "grad_norm": 0.896473119654828, "kl": 0.25704345703125, "learning_rate": 1.1462668752261652e-06, "loss": 0.0103, "reward": 1.11875, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9625, "step": 3900 }, { "epoch": 0.8613906489418975, "eval_completion_length": 235.22, "eval_kl": 0.29470703125, "eval_loss": 0.011815370991826057, "eval_reward": 1.07, "eval_reward_std": 0.12727921783924104, "eval_rewards/accuracy_reward": 0.11, "eval_rewards/format_reward": 0.96, "eval_runtime": 109.2786, "eval_samples_per_second": 0.906, "eval_steps_per_second": 0.229, "step": 3900 }, { "completion_length": 233.93125, "epoch": 0.8624949959277205, "grad_norm": 0.791697614971569, "kl": 0.280670166015625, "learning_rate": 1.1284085043048465e-06, "loss": 0.0112, "reward": 1.05625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.95625, "step": 3905 }, { "completion_length": 262.68125, "epoch": 0.8635993429135435, "grad_norm": 0.5051810763575918, "kl": 0.27977294921875, "learning_rate": 1.1106820193853484e-06, "loss": 0.0112, "reward": 1.0125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.9375, "step": 3910 }, { "completion_length": 251.48125, "epoch": 0.8647036898993664, "grad_norm": 0.5025881602992487, "kl": 0.273834228515625, "learning_rate": 1.0930876839914418e-06, "loss": 0.011, "reward": 1.06875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95625, "step": 3915 }, { "completion_length": 236.35625, "epoch": 0.8658080368851894, "grad_norm": 0.4129347471678857, "kl": 0.2613372802734375, "learning_rate": 1.0756257596823427e-06, "loss": 0.0105, "reward": 1.075, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.95625, "step": 3920 }, { "completion_length": 265.86875, "epoch": 0.8669123838710123, "grad_norm": 0.4235003049667533, "kl": 0.253765869140625, "learning_rate": 1.058296506048836e-06, "loss": 0.0101, "reward": 1.1, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.95, "step": 3925 }, { "completion_length": 232.975, "epoch": 0.8680167308568352, "grad_norm": 0.37693409083366114, "kl": 0.2826416015625, "learning_rate": 1.04110018070941e-06, "loss": 0.0113, "reward": 1.14375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.16875, "rewards/format_reward": 0.975, "step": 3930 }, { "completion_length": 256.3375, "epoch": 0.8691210778426581, "grad_norm": 0.47005147974118267, "kl": 0.28515625, "learning_rate": 1.0240370393064235e-06, "loss": 0.0114, "reward": 1.125, "reward_std": 0.2298096999526024, "rewards/accuracy_reward": 0.175, "rewards/format_reward": 0.95, "step": 3935 }, { "completion_length": 258.5625, "epoch": 0.8702254248284811, "grad_norm": 0.27709333139181463, "kl": 0.31121826171875, "learning_rate": 1.0071073355023097e-06, "loss": 0.0124, "reward": 1.0875, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.93125, "step": 3940 }, { "completion_length": 227.4875, "epoch": 0.871329771814304, "grad_norm": 0.2761772502885486, "kl": 0.301312255859375, "learning_rate": 9.903113209758098e-07, "loss": 0.012, "reward": 1.11875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.96875, "step": 3945 }, { "completion_length": 225.5875, "epoch": 0.872434118800127, "grad_norm": 0.26781461171540255, "kl": 0.31710205078125, "learning_rate": 9.736492454182211e-07, "loss": 0.0127, "reward": 1.1, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.9625, "step": 3950 }, { "completion_length": 241.3375, "epoch": 0.87353846578595, "grad_norm": 1.1383280325532497, "kl": 0.262933349609375, "learning_rate": 9.571213565296877e-07, "loss": 0.0105, "reward": 1.075, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.9625, "step": 3955 }, { "completion_length": 233.5, "epoch": 0.8746428127717729, "grad_norm": 0.29444920945103936, "kl": 0.333721923828125, "learning_rate": 9.407279000155311e-07, "loss": 0.0133, "reward": 1.075, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.93125, "step": 3960 }, { "completion_length": 219.39375, "epoch": 0.8757471597575959, "grad_norm": 0.42276681745389866, "kl": 0.26292724609375, "learning_rate": 9.244691195825794e-07, "loss": 0.0105, "reward": 1.1375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.975, "step": 3965 }, { "completion_length": 269.35625, "epoch": 0.8768515067434188, "grad_norm": 0.5714466190012454, "kl": 0.2780029296875, "learning_rate": 9.0834525693555e-07, "loss": 0.0111, "reward": 1.0625, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "step": 3970 }, { "completion_length": 221.74375, "epoch": 0.8779558537292417, "grad_norm": 0.5132142260680984, "kl": 0.23480224609375, "learning_rate": 8.923565517734633e-07, "loss": 0.0094, "reward": 1.09375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.98125, "step": 3975 }, { "completion_length": 239.19375, "epoch": 0.8790602007150646, "grad_norm": 0.6209622546123578, "kl": 0.246160888671875, "learning_rate": 8.765032417860753e-07, "loss": 0.0099, "reward": 1.15625, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.19375, "rewards/format_reward": 0.9625, "step": 3980 }, { "completion_length": 230.28125, "epoch": 0.8801645477008876, "grad_norm": 0.4959744957429339, "kl": 0.328594970703125, "learning_rate": 8.607855626503403e-07, "loss": 0.0132, "reward": 1.1125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.95, "step": 3985 }, { "completion_length": 242.23125, "epoch": 0.8812688946867105, "grad_norm": 0.6000894011738015, "kl": 0.264337158203125, "learning_rate": 8.452037480269082e-07, "loss": 0.0106, "reward": 1.09375, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.95, "step": 3990 }, { "completion_length": 241.79375, "epoch": 0.8823732416725335, "grad_norm": 0.5856288163785148, "kl": 0.274072265625, "learning_rate": 8.297580295566576e-07, "loss": 0.011, "reward": 1.0375, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.95, "step": 3995 }, { "completion_length": 228.125, "epoch": 0.8834775886583565, "grad_norm": 0.7727469678433277, "kl": 0.239056396484375, "learning_rate": 8.144486368572468e-07, "loss": 0.0096, "reward": 1.1875, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.2125, "rewards/format_reward": 0.975, "step": 4000 }, { "epoch": 0.8834775886583565, "eval_completion_length": 240.635, "eval_kl": 0.28673828125, "eval_loss": 0.011466315016150475, "eval_reward": 1.09, "eval_reward_std": 0.16970562398433686, "eval_rewards/accuracy_reward": 0.14, "eval_rewards/format_reward": 0.95, "eval_runtime": 124.8847, "eval_samples_per_second": 0.793, "eval_steps_per_second": 0.2, "step": 4000 }, { "completion_length": 222.24375, "epoch": 0.8845819356441794, "grad_norm": 0.13203636774859265, "kl": 0.271759033203125, "learning_rate": 7.992757975196974e-07, "loss": 0.0109, "reward": 1.10625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.975, "step": 4005 }, { "completion_length": 217.46875, "epoch": 0.8856862826300024, "grad_norm": 0.3836137257482129, "kl": 0.25550537109375, "learning_rate": 7.842397371050181e-07, "loss": 0.0102, "reward": 1.075, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.975, "step": 4010 }, { "completion_length": 264.93125, "epoch": 0.8867906296158253, "grad_norm": 0.35191809685250214, "kl": 0.23974609375, "learning_rate": 7.693406791408476e-07, "loss": 0.0096, "reward": 1.09375, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.95625, "step": 4015 }, { "completion_length": 241.44375, "epoch": 0.8878949766016483, "grad_norm": 0.4672837627346682, "kl": 0.26492919921875, "learning_rate": 7.545788451181313e-07, "loss": 0.0106, "reward": 1.0625, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9625, "step": 4020 }, { "completion_length": 277.9875, "epoch": 0.8889993235874711, "grad_norm": 0.7088610794073225, "kl": 0.29766845703125, "learning_rate": 7.399544544878268e-07, "loss": 0.0119, "reward": 1.06875, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.94375, "step": 4025 }, { "completion_length": 226.96875, "epoch": 0.8901036705732941, "grad_norm": 0.29863896309113996, "kl": 0.246087646484375, "learning_rate": 7.25467724657647e-07, "loss": 0.0098, "reward": 1.09375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.98125, "step": 4030 }, { "completion_length": 261.29375, "epoch": 0.891208017559117, "grad_norm": 0.5220634238395366, "kl": 0.238995361328125, "learning_rate": 7.11118870988825e-07, "loss": 0.0096, "reward": 1.1125, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.96875, "step": 4035 }, { "completion_length": 227.1875, "epoch": 0.89231236454494, "grad_norm": 0.468574862123508, "kl": 0.254180908203125, "learning_rate": 6.969081067929129e-07, "loss": 0.0102, "reward": 1.09375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.98125, "step": 4040 }, { "completion_length": 223.36875, "epoch": 0.893416711530763, "grad_norm": 0.7105477246802777, "kl": 0.23565673828125, "learning_rate": 6.828356433286065e-07, "loss": 0.0094, "reward": 1.15625, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.16875, "rewards/format_reward": 0.9875, "step": 4045 }, { "completion_length": 242.2875, "epoch": 0.8945210585165859, "grad_norm": 0.3124111487407041, "kl": 0.272393798828125, "learning_rate": 6.689016897986123e-07, "loss": 0.0109, "reward": 1.09375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "step": 4050 }, { "completion_length": 235.10625, "epoch": 0.8956254055024089, "grad_norm": 0.5287755229051584, "kl": 0.263592529296875, "learning_rate": 6.551064533465335e-07, "loss": 0.0105, "reward": 1.16875, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.2, "rewards/format_reward": 0.96875, "step": 4055 }, { "completion_length": 210.3, "epoch": 0.8967297524882318, "grad_norm": 0.3865388105745784, "kl": 0.243426513671875, "learning_rate": 6.414501390537875e-07, "loss": 0.0097, "reward": 1.0875, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9875, "step": 4060 }, { "completion_length": 257.4125, "epoch": 0.8978340994740548, "grad_norm": 0.5827223262448566, "kl": 0.28209228515625, "learning_rate": 6.279329499365649e-07, "loss": 0.0113, "reward": 1.01875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.06875, "rewards/format_reward": 0.95, "step": 4065 }, { "completion_length": 245.89375, "epoch": 0.8989384464598776, "grad_norm": 0.5568021946499023, "kl": 0.329107666015625, "learning_rate": 6.14555086942804e-07, "loss": 0.0132, "reward": 1.05625, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.94375, "step": 4070 }, { "completion_length": 270.7, "epoch": 0.9000427934457006, "grad_norm": 0.8265886341669334, "kl": 0.343658447265625, "learning_rate": 6.013167489492089e-07, "loss": 0.0137, "reward": 1.0375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9375, "step": 4075 }, { "completion_length": 235.875, "epoch": 0.9011471404315236, "grad_norm": 0.15521466379213147, "kl": 0.21239013671875, "learning_rate": 5.88218132758287e-07, "loss": 0.0085, "reward": 1.09375, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.9875, "step": 4080 }, { "completion_length": 263.94375, "epoch": 0.9022514874173465, "grad_norm": 0.3565020657376661, "kl": 0.248944091796875, "learning_rate": 5.752594330954275e-07, "loss": 0.01, "reward": 1.0875, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9625, "step": 4085 }, { "completion_length": 217.0875, "epoch": 0.9033558344031695, "grad_norm": 0.7599338431132417, "kl": 0.256341552734375, "learning_rate": 5.624408426060124e-07, "loss": 0.0103, "reward": 1.09375, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.99375, "step": 4090 }, { "completion_length": 240.49375, "epoch": 0.9044601813889924, "grad_norm": 0.3404631065084141, "kl": 0.26585693359375, "learning_rate": 5.497625518525374e-07, "loss": 0.0106, "reward": 1.1, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.96875, "step": 4095 }, { "completion_length": 240.90625, "epoch": 0.9055645283748154, "grad_norm": 0.4830591822507376, "kl": 0.2419189453125, "learning_rate": 5.372247493117921e-07, "loss": 0.0097, "reward": 1.0375, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.95625, "step": 4100 }, { "epoch": 0.9055645283748154, "eval_completion_length": 236.99, "eval_kl": 0.26658203125, "eval_loss": 0.01068319845944643, "eval_reward": 1.13, "eval_reward_std": 0.1838477599620819, "eval_rewards/accuracy_reward": 0.16, "eval_rewards/format_reward": 0.97, "eval_runtime": 112.8111, "eval_samples_per_second": 0.878, "eval_steps_per_second": 0.222, "step": 4100 }, { "completion_length": 213.69375, "epoch": 0.9066688753606383, "grad_norm": 0.0963839062331533, "kl": 0.2247802734375, "learning_rate": 5.248276213720526e-07, "loss": 0.009, "reward": 1.11875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.98125, "step": 4105 }, { "completion_length": 235.7625, "epoch": 0.9077732223464613, "grad_norm": 0.3586489435080358, "kl": 113.0680419921875, "learning_rate": 5.125713523303133e-07, "loss": 4.5501, "reward": 1.08125, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.9625, "step": 4110 }, { "completion_length": 245.6875, "epoch": 0.9088775693322843, "grad_norm": 0.5993938735102521, "kl": 0.2639892578125, "learning_rate": 5.004561243895433e-07, "loss": 0.0106, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.95625, "step": 4115 }, { "completion_length": 238.9875, "epoch": 0.9099819163181071, "grad_norm": 0.7059681339718733, "kl": 0.25078125, "learning_rate": 4.884821176559817e-07, "loss": 0.01, "reward": 1.09375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.95625, "step": 4120 }, { "completion_length": 214.73125, "epoch": 0.91108626330393, "grad_norm": 0.5836610939153032, "kl": 0.248724365234375, "learning_rate": 4.7664951013645875e-07, "loss": 0.01, "reward": 1.1, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.98125, "step": 4125 }, { "completion_length": 287.74375, "epoch": 0.912190610289753, "grad_norm": 0.5029836450413667, "kl": 0.338372802734375, "learning_rate": 4.649584777357452e-07, "loss": 0.0135, "reward": 1.0375, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9375, "step": 4130 }, { "completion_length": 238.50625, "epoch": 0.913294957275576, "grad_norm": 0.3679025891536969, "kl": 0.2593505859375, "learning_rate": 4.534091942539476e-07, "loss": 0.0104, "reward": 1.06875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.96875, "step": 4135 }, { "completion_length": 243.9, "epoch": 0.9143993042613989, "grad_norm": 0.36542863601047937, "kl": 0.2465576171875, "learning_rate": 4.420018313839147e-07, "loss": 0.0099, "reward": 1.1625, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.2, "rewards/format_reward": 0.9625, "step": 4140 }, { "completion_length": 244.55625, "epoch": 0.9155036512472219, "grad_norm": 0.6654427169718511, "kl": 0.266680908203125, "learning_rate": 4.3073655870869093e-07, "loss": 0.0107, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.95625, "step": 4145 }, { "completion_length": 261.3625, "epoch": 0.9166079982330448, "grad_norm": 0.4755302093000005, "kl": 0.246258544921875, "learning_rate": 4.1961354369898675e-07, "loss": 0.0099, "reward": 1.1, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.96875, "step": 4150 }, { "completion_length": 238.525, "epoch": 0.9177123452188678, "grad_norm": 0.3919488350765311, "kl": 0.269342041015625, "learning_rate": 4.086329517107046e-07, "loss": 0.0108, "reward": 1.175, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.2125, "rewards/format_reward": 0.9625, "step": 4155 }, { "completion_length": 259.00625, "epoch": 0.9188166922046908, "grad_norm": 0.4320891208444687, "kl": 0.283648681640625, "learning_rate": 3.9779494598246484e-07, "loss": 0.0113, "reward": 1.1125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.95625, "step": 4160 }, { "completion_length": 261.85625, "epoch": 0.9199210391905137, "grad_norm": 0.4840697345614265, "kl": 0.2813232421875, "learning_rate": 3.8709968763318894e-07, "loss": 0.0113, "reward": 1.0875, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.95, "step": 4165 }, { "completion_length": 244.525, "epoch": 0.9210253861763366, "grad_norm": 0.29181391006128693, "kl": 0.273553466796875, "learning_rate": 3.7654733565969826e-07, "loss": 0.0109, "reward": 1.1125, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.9625, "step": 4170 }, { "completion_length": 263.0, "epoch": 0.9221297331621595, "grad_norm": 0.6600185394674087, "kl": 0.275067138671875, "learning_rate": 3.661380469343556e-07, "loss": 0.011, "reward": 1.01875, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.91875, "step": 4175 }, { "completion_length": 257.275, "epoch": 0.9232340801479825, "grad_norm": 0.3026772238703725, "kl": 0.2882568359375, "learning_rate": 3.558719762027307e-07, "loss": 0.0115, "reward": 1.075, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.9625, "step": 4180 }, { "completion_length": 238.7, "epoch": 0.9243384271338054, "grad_norm": 0.5128635762934807, "kl": 0.256884765625, "learning_rate": 3.457492760812975e-07, "loss": 0.0103, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.94375, "step": 4185 }, { "completion_length": 271.00625, "epoch": 0.9254427741196284, "grad_norm": 0.41061824834878685, "kl": 0.3115478515625, "learning_rate": 3.357700970551681e-07, "loss": 0.0125, "reward": 1.0875, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9625, "step": 4190 }, { "completion_length": 236.125, "epoch": 0.9265471211054513, "grad_norm": 0.6143144747308046, "kl": 0.2658935546875, "learning_rate": 3.2593458747585683e-07, "loss": 0.0106, "reward": 1.0625, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9625, "step": 4195 }, { "completion_length": 263.36875, "epoch": 0.9276514680912743, "grad_norm": 0.31447425015692126, "kl": 0.238623046875, "learning_rate": 3.1624289355907334e-07, "loss": 0.0095, "reward": 1.0875, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.975, "step": 4200 }, { "epoch": 0.9276514680912743, "eval_completion_length": 263.69, "eval_kl": 0.3008984375, "eval_loss": 0.012053935788571835, "eval_reward": 1.095, "eval_reward_std": 0.1767766922712326, "eval_rewards/accuracy_reward": 0.15, "eval_rewards/format_reward": 0.945, "eval_runtime": 126.8567, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.197, "step": 4200 }, { "completion_length": 227.49375, "epoch": 0.9287558150770973, "grad_norm": 0.3848576143966496, "kl": 0.260791015625, "learning_rate": 3.0669515938254404e-07, "loss": 0.0104, "reward": 1.08125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.96875, "step": 4205 }, { "completion_length": 242.55625, "epoch": 0.9298601620629202, "grad_norm": 0.660183404776511, "kl": 0.3072021484375, "learning_rate": 2.972915268838794e-07, "loss": 0.0123, "reward": 1.11875, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9625, "step": 4210 }, { "completion_length": 237.59375, "epoch": 0.9309645090487431, "grad_norm": 0.38237841247452214, "kl": 0.236932373046875, "learning_rate": 2.8803213585846036e-07, "loss": 0.0095, "reward": 1.09375, "reward_std": 0.07954951152205467, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.98125, "step": 4215 }, { "completion_length": 254.3125, "epoch": 0.932068856034566, "grad_norm": 0.938722788620333, "kl": 0.304193115234375, "learning_rate": 2.7891712395735513e-07, "loss": 0.0122, "reward": 1.03125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.94375, "step": 4220 }, { "completion_length": 270.375, "epoch": 0.933173203020389, "grad_norm": 0.6446127146773467, "kl": 0.32706298828125, "learning_rate": 2.699466266852779e-07, "loss": 0.0131, "reward": 1.05625, "reward_std": 0.23864853456616403, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.925, "step": 4225 }, { "completion_length": 225.74375, "epoch": 0.9342775500062119, "grad_norm": 0.514239970716185, "kl": 0.2426910400390625, "learning_rate": 2.6112077739857465e-07, "loss": 0.0097, "reward": 1.0875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9625, "step": 4230 }, { "completion_length": 249.1625, "epoch": 0.9353818969920349, "grad_norm": 0.4120891261838415, "kl": 0.245001220703125, "learning_rate": 2.524397073032403e-07, "loss": 0.0098, "reward": 1.05, "reward_std": 0.10606601536273956, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.96875, "step": 4235 }, { "completion_length": 240.0125, "epoch": 0.9364862439778578, "grad_norm": 0.4432984558834709, "kl": 0.27457275390625, "learning_rate": 2.4390354545296257e-07, "loss": 0.011, "reward": 1.09375, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.9625, "step": 4240 }, { "completion_length": 216.80625, "epoch": 0.9375905909636808, "grad_norm": 0.2646324742458765, "kl": 0.221075439453125, "learning_rate": 2.3551241874721353e-07, "loss": 0.0088, "reward": 1.10625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.99375, "step": 4245 }, { "completion_length": 247.775, "epoch": 0.9386949379495038, "grad_norm": 0.183761058738616, "kl": 0.28399658203125, "learning_rate": 2.272664519293566e-07, "loss": 0.0114, "reward": 1.1, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.96875, "step": 4250 }, { "completion_length": 245.94375, "epoch": 0.9397992849353267, "grad_norm": 0.49084283791698335, "kl": 0.25836181640625, "learning_rate": 2.1916576758478913e-07, "loss": 0.0103, "reward": 1.06875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.98125, "step": 4255 }, { "completion_length": 255.21875, "epoch": 0.9409036319211497, "grad_norm": 0.5752245613234309, "kl": 0.304534912109375, "learning_rate": 2.1121048613912843e-07, "loss": 0.0122, "reward": 1.08125, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.94375, "step": 4260 }, { "completion_length": 248.875, "epoch": 0.9420079789069725, "grad_norm": 0.4954935572252636, "kl": 0.280999755859375, "learning_rate": 2.0340072585641523e-07, "loss": 0.0112, "reward": 1.11875, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9625, "step": 4265 }, { "completion_length": 274.95, "epoch": 0.9431123258927955, "grad_norm": 0.6291854725110926, "kl": 0.3195556640625, "learning_rate": 1.9573660283735974e-07, "loss": 0.0128, "reward": 1.09375, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9375, "step": 4270 }, { "completion_length": 227.425, "epoch": 0.9442166728786184, "grad_norm": 0.4759307581588166, "kl": 0.221356201171875, "learning_rate": 1.8821823101760949e-07, "loss": 0.0089, "reward": 1.10625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.975, "step": 4275 }, { "completion_length": 222.7625, "epoch": 0.9453210198644414, "grad_norm": 0.5007795680830986, "kl": 0.27132568359375, "learning_rate": 1.8084572216606422e-07, "loss": 0.0109, "reward": 1.1875, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.19375, "rewards/format_reward": 0.99375, "step": 4280 }, { "completion_length": 243.20625, "epoch": 0.9464253668502643, "grad_norm": 0.43659783572888766, "kl": 0.2771484375, "learning_rate": 1.736191858832048e-07, "loss": 0.0111, "reward": 1.1125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.95, "step": 4285 }, { "completion_length": 230.93125, "epoch": 0.9475297138360873, "grad_norm": 0.5036319473035266, "kl": 0.315594482421875, "learning_rate": 1.665387295994747e-07, "loss": 0.0126, "reward": 1.05, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.95, "step": 4290 }, { "completion_length": 257.44375, "epoch": 0.9486340608219103, "grad_norm": 0.21837455215402343, "kl": 0.318695068359375, "learning_rate": 1.5960445857367003e-07, "loss": 0.0128, "reward": 1.08125, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.96875, "step": 4295 }, { "completion_length": 282.2, "epoch": 0.9497384078077332, "grad_norm": 0.8008170036383372, "kl": 0.27298583984375, "learning_rate": 1.5281647589138527e-07, "loss": 0.0109, "reward": 1.01875, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.9375, "step": 4300 }, { "epoch": 0.9497384078077332, "eval_completion_length": 243.03, "eval_kl": 0.26423828125, "eval_loss": 0.010587015189230442, "eval_reward": 1.15, "eval_reward_std": 0.1697056245803833, "eval_rewards/accuracy_reward": 0.17, "eval_rewards/format_reward": 0.98, "eval_runtime": 110.5252, "eval_samples_per_second": 0.896, "eval_steps_per_second": 0.226, "step": 4300 }, { "completion_length": 233.6625, "epoch": 0.9508427547935562, "grad_norm": 0.7236286239926621, "kl": 0.329180908203125, "learning_rate": 1.4617488246348012e-07, "loss": 0.0132, "reward": 1.06875, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95625, "step": 4305 }, { "completion_length": 236.18125, "epoch": 0.9519471017793791, "grad_norm": 0.5259274208633408, "kl": 0.270782470703125, "learning_rate": 1.3967977702456946e-07, "loss": 0.0108, "reward": 1.08125, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.95625, "step": 4310 }, { "completion_length": 222.0125, "epoch": 0.953051448765202, "grad_norm": 0.4716816022245921, "kl": 0.269232177734375, "learning_rate": 1.3333125613156695e-07, "loss": 0.0108, "reward": 1.11875, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.975, "step": 4315 }, { "completion_length": 254.275, "epoch": 0.9541557957510249, "grad_norm": 0.3798073604709502, "kl": 0.29171142578125, "learning_rate": 1.271294141622459e-07, "loss": 0.0117, "reward": 1.08125, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.95625, "step": 4320 }, { "completion_length": 263.34375, "epoch": 0.9552601427368479, "grad_norm": 0.39932023814829193, "kl": 0.31627197265625, "learning_rate": 1.2107434331383504e-07, "loss": 0.0126, "reward": 1.075, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.93125, "step": 4325 }, { "completion_length": 233.03125, "epoch": 0.9563644897226709, "grad_norm": 0.915012168079641, "kl": 0.30823974609375, "learning_rate": 1.1516613360164408e-07, "loss": 0.0123, "reward": 1.1125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.96875, "step": 4330 }, { "completion_length": 235.96875, "epoch": 0.9574688367084938, "grad_norm": 0.6431925020300033, "kl": 0.26207275390625, "learning_rate": 1.094048728577346e-07, "loss": 0.0105, "reward": 1.0625, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.9625, "step": 4335 }, { "completion_length": 227.89375, "epoch": 0.9585731836943168, "grad_norm": 0.2197375620895538, "kl": 0.22823486328125, "learning_rate": 1.0379064672960793e-07, "loss": 0.0091, "reward": 1.1375, "reward_std": 0.12374368458986282, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.99375, "step": 4340 }, { "completion_length": 236.425, "epoch": 0.9596775306801397, "grad_norm": 0.24464788267487614, "kl": 0.237347412109375, "learning_rate": 9.832353867893385e-08, "loss": 0.0095, "reward": 1.05625, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.95625, "step": 4345 }, { "completion_length": 222.8, "epoch": 0.9607818776659627, "grad_norm": 0.6551456803073707, "kl": 0.2460235595703125, "learning_rate": 9.300362998030832e-08, "loss": 0.0098, "reward": 1.175, "reward_std": 0.21213203072547912, "rewards/accuracy_reward": 0.2125, "rewards/format_reward": 0.9625, "step": 4350 }, { "completion_length": 268.76875, "epoch": 0.9618862246517856, "grad_norm": 0.592425806579998, "kl": 0.27703857421875, "learning_rate": 8.783099972004882e-08, "loss": 0.0111, "reward": 1.0125, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.91875, "step": 4355 }, { "completion_length": 212.39375, "epoch": 0.9629905716376085, "grad_norm": 0.46261395016364687, "kl": 0.278302001953125, "learning_rate": 8.280572479501426e-08, "loss": 0.0111, "reward": 1.125, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "step": 4360 }, { "completion_length": 242.06875, "epoch": 0.9640949186234314, "grad_norm": 0.28486230034584065, "kl": 0.22513427734375, "learning_rate": 7.792787991146356e-08, "loss": 0.009, "reward": 1.09375, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.98125, "step": 4365 }, { "completion_length": 240.1875, "epoch": 0.9651992656092544, "grad_norm": 0.43886555296427354, "kl": 0.30863037109375, "learning_rate": 7.319753758394665e-08, "loss": 0.0123, "reward": 1.0375, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.94375, "step": 4370 }, { "completion_length": 223.30625, "epoch": 0.9663036125950774, "grad_norm": 0.542732303860562, "kl": 0.27510986328125, "learning_rate": 6.861476813422419e-08, "loss": 0.011, "reward": 1.06875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.98125, "step": 4375 }, { "completion_length": 269.04375, "epoch": 0.9674079595809003, "grad_norm": 0.35561046298902815, "kl": 0.350921630859375, "learning_rate": 6.417963969022389e-08, "loss": 0.014, "reward": 1.04375, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.93125, "step": 4380 }, { "completion_length": 294.5625, "epoch": 0.9685123065667233, "grad_norm": 0.6237843740200061, "kl": 0.262664794921875, "learning_rate": 5.989221818502478e-08, "loss": 0.0105, "reward": 1.10625, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.95, "step": 4385 }, { "completion_length": 257.58125, "epoch": 0.9696166535525462, "grad_norm": 0.6942753855581536, "kl": 0.319189453125, "learning_rate": 5.5752567355883415e-08, "loss": 0.0128, "reward": 1.06875, "reward_std": 0.15026018843054773, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.95, "step": 4390 }, { "completion_length": 259.525, "epoch": 0.9707210005383692, "grad_norm": 0.47467571553923327, "kl": 0.288873291015625, "learning_rate": 5.176074874327919e-08, "loss": 0.0116, "reward": 1.1, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.9625, "step": 4395 }, { "completion_length": 247.05, "epoch": 0.9718253475241921, "grad_norm": 0.47878465160417794, "kl": 0.256573486328125, "learning_rate": 4.791682169000056e-08, "loss": 0.0103, "reward": 1.09375, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.975, "step": 4400 }, { "epoch": 0.9718253475241921, "eval_completion_length": 254.0, "eval_kl": 0.27548828125, "eval_loss": 0.011038653552532196, "eval_reward": 1.11, "eval_reward_std": 0.1838477599620819, "eval_rewards/accuracy_reward": 0.16, "eval_rewards/format_reward": 0.95, "eval_runtime": 128.6135, "eval_samples_per_second": 0.77, "eval_steps_per_second": 0.194, "step": 4400 }, { "completion_length": 288.8375, "epoch": 0.9729296945100151, "grad_norm": 0.6464757294619069, "kl": 0.3235595703125, "learning_rate": 4.4220843340269105e-08, "loss": 0.0129, "reward": 1.025, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.9125, "step": 4405 }, { "completion_length": 245.01875, "epoch": 0.9740340414958379, "grad_norm": 0.6385791474694716, "kl": 0.33304443359375, "learning_rate": 4.067286863888131e-08, "loss": 0.0133, "reward": 1.10625, "reward_std": 0.2563262037932873, "rewards/accuracy_reward": 0.16875, "rewards/format_reward": 0.9375, "step": 4410 }, { "completion_length": 219.36875, "epoch": 0.9751383884816609, "grad_norm": 0.6252687285926767, "kl": 0.28475341796875, "learning_rate": 3.727295033040035e-08, "loss": 0.0114, "reward": 1.11875, "reward_std": 0.09722718074917794, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.99375, "step": 4415 }, { "completion_length": 266.64375, "epoch": 0.9762427354674839, "grad_norm": 0.4059885139351556, "kl": 0.2508056640625, "learning_rate": 3.402113895836445e-08, "loss": 0.01, "reward": 1.05, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.1, "rewards/format_reward": 0.95, "step": 4420 }, { "completion_length": 234.075, "epoch": 0.9773470824533068, "grad_norm": 0.5571412291452437, "kl": 0.258990478515625, "learning_rate": 3.091748286453866e-08, "loss": 0.0104, "reward": 1.125, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "step": 4425 }, { "completion_length": 245.19375, "epoch": 0.9784514294391298, "grad_norm": 0.48222048728786027, "kl": 0.279296875, "learning_rate": 2.796202818819871e-08, "loss": 0.0112, "reward": 1.06875, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.1125, "rewards/format_reward": 0.95625, "step": 4430 }, { "completion_length": 266.23125, "epoch": 0.9795557764249527, "grad_norm": 0.33073986574713815, "kl": 0.2812255859375, "learning_rate": 2.5154818865440466e-08, "loss": 0.0113, "reward": 1.09375, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.94375, "step": 4435 }, { "completion_length": 243.86875, "epoch": 0.9806601234107757, "grad_norm": 0.5979836605535603, "kl": 0.3093017578125, "learning_rate": 2.2495896628529355e-08, "loss": 0.0124, "reward": 1.10625, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.95625, "step": 4440 }, { "completion_length": 234.6375, "epoch": 0.9817644703965986, "grad_norm": 0.71956275463976, "kl": 0.267864990234375, "learning_rate": 1.9985301005280843e-08, "loss": 0.0107, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.11875, "rewards/format_reward": 0.975, "step": 4445 }, { "completion_length": 250.375, "epoch": 0.9828688173824216, "grad_norm": 0.4915100209116087, "kl": 0.26923828125, "learning_rate": 1.7623069318469797e-08, "loss": 0.0108, "reward": 1.025, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.9375, "step": 4450 }, { "completion_length": 238.46875, "epoch": 0.9839731643682446, "grad_norm": 0.4934697459512418, "kl": 0.263037109375, "learning_rate": 1.5409236685277608e-08, "loss": 0.0105, "reward": 1.09375, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.9625, "step": 4455 }, { "completion_length": 242.4625, "epoch": 0.9850775113540674, "grad_norm": 0.7032342242514465, "kl": 0.2882568359375, "learning_rate": 1.3343836016772582e-08, "loss": 0.0115, "reward": 1.0625, "reward_std": 0.1414213538169861, "rewards/accuracy_reward": 0.10625, "rewards/format_reward": 0.95625, "step": 4460 }, { "completion_length": 253.40625, "epoch": 0.9861818583398904, "grad_norm": 0.40012002703514815, "kl": 0.28798828125, "learning_rate": 1.1426898017412591e-08, "loss": 0.0115, "reward": 1.10625, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.175, "rewards/format_reward": 0.93125, "step": 4465 }, { "completion_length": 231.40625, "epoch": 0.9872862053257133, "grad_norm": 0.5527803201803545, "kl": 0.273876953125, "learning_rate": 9.658451184600959e-09, "loss": 0.0109, "reward": 1.08125, "reward_std": 0.16793785765767097, "rewards/accuracy_reward": 0.13125, "rewards/format_reward": 0.95, "step": 4470 }, { "completion_length": 276.0125, "epoch": 0.9883905523115363, "grad_norm": 0.5553612160206977, "kl": 0.271929931640625, "learning_rate": 8.038521808249045e-09, "loss": 0.0109, "reward": 1.05625, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.93125, "step": 4475 }, { "completion_length": 269.39375, "epoch": 0.9894948992973592, "grad_norm": 0.38046769308040707, "kl": 0.31407470703125, "learning_rate": 6.567133970397654e-09, "loss": 0.0126, "reward": 1.03125, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.0875, "rewards/format_reward": 0.94375, "step": 4480 }, { "completion_length": 253.03125, "epoch": 0.9905992462831822, "grad_norm": 0.25828108036325964, "kl": 0.36680908203125, "learning_rate": 5.2443095448506674e-09, "loss": 0.0147, "reward": 1.025, "reward_std": 0.15909902304410933, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.93125, "step": 4485 }, { "completion_length": 242.60625, "epoch": 0.9917035932690051, "grad_norm": 0.3615879163519988, "kl": 0.23807373046875, "learning_rate": 4.070068196853072e-09, "loss": 0.0095, "reward": 1.10625, "reward_std": 0.13258251920342445, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.96875, "step": 4490 }, { "completion_length": 271.5, "epoch": 0.9928079402548281, "grad_norm": 0.31635821168080147, "kl": 0.35159912109375, "learning_rate": 3.0444273828000857e-09, "loss": 0.0141, "reward": 1.075, "reward_std": 0.19445436149835588, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.93125, "step": 4495 }, { "completion_length": 233.03125, "epoch": 0.9939122872406511, "grad_norm": 0.30099890850035027, "kl": 0.325830078125, "learning_rate": 2.167402349972925e-09, "loss": 0.013, "reward": 1.0375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.95625, "step": 4500 }, { "epoch": 0.9939122872406511, "eval_completion_length": 251.83, "eval_kl": 0.32880859375, "eval_loss": 0.013173764571547508, "eval_reward": 1.115, "eval_reward_std": 0.19091882765293122, "eval_rewards/accuracy_reward": 0.16, "eval_rewards/format_reward": 0.955, "eval_runtime": 141.5462, "eval_samples_per_second": 0.699, "eval_steps_per_second": 0.177, "step": 4500 }, { "completion_length": 207.225, "epoch": 0.9950166342264739, "grad_norm": 0.7679754782825252, "kl": 0.2902587890625, "learning_rate": 1.4390061363189767e-09, "loss": 0.0116, "reward": 1.11875, "reward_std": 0.18561552688479424, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.96875, "step": 4505 }, { "completion_length": 278.71875, "epoch": 0.9961209812122969, "grad_norm": 0.44631107231254136, "kl": 0.279425048828125, "learning_rate": 8.592495702497427e-10, "loss": 0.0112, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.90625, "step": 4510 }, { "completion_length": 246.39375, "epoch": 0.9972253281981198, "grad_norm": 0.36414142285577833, "kl": 0.30335693359375, "learning_rate": 4.2814127048873553e-10, "loss": 0.0121, "reward": 1.05625, "reward_std": 0.1149048499763012, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9625, "step": 4515 }, { "completion_length": 258.74375, "epoch": 0.9983296751839428, "grad_norm": 0.5750843977179474, "kl": 0.405145263671875, "learning_rate": 1.4568764593603235e-10, "loss": 0.0162, "reward": 1.05, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.95625, "step": 4520 }, { "completion_length": 261.95, "epoch": 0.9994340221697657, "grad_norm": 0.5025685502837802, "kl": 0.259100341796875, "learning_rate": 1.1892895576126606e-11, "loss": 0.0104, "reward": 1.10625, "reward_std": 0.20329319611191748, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.94375, "step": 4525 }, { "completion_length": 248.6875, "epoch": 0.9998757609640949, "kl": 0.22357177734375, "reward": 1.171875, "reward_std": 0.19887377880513668, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.96875, "step": 4527, "total_flos": 0.0, "train_loss": 28.747999461705767, "train_runtime": 163973.669, "train_samples_per_second": 0.442, "train_steps_per_second": 0.028 } ], "logging_steps": 5, "max_steps": 4527, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }