Qwen2.5-1.5B-Open-R1-GRPO / trainer_state.json
ztt0821's picture
Model save
ffae261 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998757609640949,
"eval_steps": 100,
"global_step": 4527,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 384.625,
"epoch": 0.0011043469858229456,
"grad_norm": 1.912142623070508,
"kl": 0.0005407754331827163,
"learning_rate": 2.2075055187637973e-07,
"loss": 0.0,
"reward": 0.59375,
"reward_std": 0.30935921147465706,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.45625,
"step": 5
},
{
"completion_length": 461.3875,
"epoch": 0.002208693971645891,
"grad_norm": 1.2625025592593675,
"kl": 0.00020947456359863282,
"learning_rate": 4.4150110375275946e-07,
"loss": 0.0,
"reward": 0.575,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.41875,
"step": 10
},
{
"completion_length": 385.9875,
"epoch": 0.0033130409574688366,
"grad_norm": 1.4824872365395827,
"kl": 0.0002246655523777008,
"learning_rate": 6.622516556291392e-07,
"loss": 0.0,
"reward": 0.61875,
"reward_std": 0.3623922191560268,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.46875,
"step": 15
},
{
"completion_length": 429.06875,
"epoch": 0.004417387943291782,
"grad_norm": 1.1157617082303197,
"kl": 0.000436440110206604,
"learning_rate": 8.830022075055189e-07,
"loss": 0.0,
"reward": 0.6,
"reward_std": 0.3712310537695885,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.46875,
"step": 20
},
{
"completion_length": 352.0125,
"epoch": 0.005521734929114728,
"grad_norm": 0.6004950274417132,
"kl": 0.002642902731895447,
"learning_rate": 1.1037527593818985e-06,
"loss": 0.0001,
"reward": 0.725,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.60625,
"step": 25
},
{
"completion_length": 294.3125,
"epoch": 0.006626081914937673,
"grad_norm": 3.738595553790936,
"kl": 0.015436601638793946,
"learning_rate": 1.3245033112582784e-06,
"loss": 0.0006,
"reward": 0.73125,
"reward_std": 0.32703688070178033,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.61875,
"step": 30
},
{
"completion_length": 302.69375,
"epoch": 0.007730428900760619,
"grad_norm": 1.655779879795067,
"kl": 0.019573783874511717,
"learning_rate": 1.545253863134658e-06,
"loss": 0.0008,
"reward": 0.79375,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.73125,
"step": 35
},
{
"completion_length": 375.9125,
"epoch": 0.008834775886583565,
"grad_norm": 2.572886866328185,
"kl": 0.018162012100219727,
"learning_rate": 1.7660044150110378e-06,
"loss": 0.0007,
"reward": 0.8375,
"reward_std": 0.30052037686109545,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.7,
"step": 40
},
{
"completion_length": 363.75625,
"epoch": 0.00993912287240651,
"grad_norm": 0.7289483033436792,
"kl": 0.017768669128417968,
"learning_rate": 1.9867549668874175e-06,
"loss": 0.0007,
"reward": 0.81875,
"reward_std": 0.2916815422475338,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.73125,
"step": 45
},
{
"completion_length": 317.81875,
"epoch": 0.011043469858229456,
"grad_norm": 0.608013151594999,
"kl": 0.014789676666259766,
"learning_rate": 2.207505518763797e-06,
"loss": 0.0006,
"reward": 0.85625,
"reward_std": 0.2916815422475338,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.75625,
"step": 50
},
{
"completion_length": 288.8625,
"epoch": 0.012147816844052401,
"grad_norm": 1.087832023876081,
"kl": 0.014481544494628906,
"learning_rate": 2.4282560706401767e-06,
"loss": 0.0006,
"reward": 0.8,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.71875,
"step": 55
},
{
"completion_length": 266.53125,
"epoch": 0.013252163829875346,
"grad_norm": 1.453031787000738,
"kl": 0.013717460632324218,
"learning_rate": 2.6490066225165567e-06,
"loss": 0.0005,
"reward": 0.8625,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.8,
"step": 60
},
{
"completion_length": 285.14375,
"epoch": 0.014356510815698293,
"grad_norm": 1.139133573143394,
"kl": 0.01951141357421875,
"learning_rate": 2.8697571743929364e-06,
"loss": 0.0008,
"reward": 0.90625,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.81875,
"step": 65
},
{
"completion_length": 265.5875,
"epoch": 0.015460857801521238,
"grad_norm": 0.8819487715358617,
"kl": 0.01654224395751953,
"learning_rate": 3.090507726269316e-06,
"loss": 0.0007,
"reward": 0.89375,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.79375,
"step": 70
},
{
"completion_length": 275.11875,
"epoch": 0.016565204787344183,
"grad_norm": 0.6456181700347499,
"kl": 0.026264095306396486,
"learning_rate": 3.311258278145696e-06,
"loss": 0.0011,
"reward": 0.91875,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.8,
"step": 75
},
{
"completion_length": 260.90625,
"epoch": 0.01766955177316713,
"grad_norm": 1.1108485585200154,
"kl": 0.0263336181640625,
"learning_rate": 3.5320088300220757e-06,
"loss": 0.0011,
"reward": 0.91875,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.83125,
"step": 80
},
{
"completion_length": 210.76875,
"epoch": 0.018773898758990076,
"grad_norm": 0.6452807102957274,
"kl": 0.03692817687988281,
"learning_rate": 3.752759381898455e-06,
"loss": 0.0015,
"reward": 0.98125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.90625,
"step": 85
},
{
"completion_length": 264.475,
"epoch": 0.01987824574481302,
"grad_norm": 0.7880782851098767,
"kl": 0.035125350952148436,
"learning_rate": 3.973509933774835e-06,
"loss": 0.0014,
"reward": 0.88125,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.8125,
"step": 90
},
{
"completion_length": 284.78125,
"epoch": 0.020982592730635966,
"grad_norm": 0.7085853946380432,
"kl": 0.023376846313476564,
"learning_rate": 4.1942604856512145e-06,
"loss": 0.0009,
"reward": 0.95625,
"reward_std": 0.2563262037932873,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.8375,
"step": 95
},
{
"completion_length": 278.6875,
"epoch": 0.022086939716458913,
"grad_norm": 0.8560416069975489,
"kl": 0.04554176330566406,
"learning_rate": 4.415011037527594e-06,
"loss": 0.0018,
"reward": 0.9625,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.86875,
"step": 100
},
{
"epoch": 0.022086939716458913,
"eval_completion_length": 248.37,
"eval_kl": 0.046234130859375,
"eval_loss": 0.0018549839733168483,
"eval_reward": 0.985,
"eval_reward_std": 0.1767766922712326,
"eval_rewards/accuracy_reward": 0.07,
"eval_rewards/format_reward": 0.915,
"eval_runtime": 127.1942,
"eval_samples_per_second": 0.778,
"eval_steps_per_second": 0.197,
"step": 100
},
{
"completion_length": 252.65625,
"epoch": 0.023191286702281856,
"grad_norm": 0.8513924054195807,
"kl": 0.07780532836914063,
"learning_rate": 4.635761589403974e-06,
"loss": 0.0031,
"reward": 0.99375,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.89375,
"step": 105
},
{
"completion_length": 212.9375,
"epoch": 0.024295633688104803,
"grad_norm": 1.1979997527722244,
"kl": 0.03602142333984375,
"learning_rate": 4.856512141280353e-06,
"loss": 0.0014,
"reward": 0.9375,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.8875,
"step": 110
},
{
"completion_length": 195.11875,
"epoch": 0.02539998067392775,
"grad_norm": 0.5507213314932029,
"kl": 0.03738479614257813,
"learning_rate": 5.077262693156734e-06,
"loss": 0.0015,
"reward": 0.95625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.90625,
"step": 115
},
{
"completion_length": 160.11875,
"epoch": 0.026504327659750693,
"grad_norm": 1.1186714555390784,
"kl": 0.036508941650390626,
"learning_rate": 5.2980132450331135e-06,
"loss": 0.0015,
"reward": 1.0,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.93125,
"step": 120
},
{
"completion_length": 183.3,
"epoch": 0.02760867464557364,
"grad_norm": 1.378767133063548,
"kl": 0.044263458251953124,
"learning_rate": 5.518763796909493e-06,
"loss": 0.0018,
"reward": 0.99375,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.8875,
"step": 125
},
{
"completion_length": 164.45,
"epoch": 0.028713021631396586,
"grad_norm": 0.7101264478347288,
"kl": 0.052339935302734376,
"learning_rate": 5.739514348785873e-06,
"loss": 0.0021,
"reward": 1.0625,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.925,
"step": 130
},
{
"completion_length": 180.11875,
"epoch": 0.02981736861721953,
"grad_norm": 0.49907818395454545,
"kl": 0.05917434692382813,
"learning_rate": 5.960264900662252e-06,
"loss": 0.0024,
"reward": 1.025,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.90625,
"step": 135
},
{
"completion_length": 198.775,
"epoch": 0.030921715603042476,
"grad_norm": 0.8587681927090215,
"kl": 0.06558990478515625,
"learning_rate": 6.181015452538632e-06,
"loss": 0.0026,
"reward": 0.98125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.86875,
"step": 140
},
{
"completion_length": 177.4625,
"epoch": 0.03202606258886542,
"grad_norm": 1.5915533844114518,
"kl": 0.08502044677734374,
"learning_rate": 6.4017660044150125e-06,
"loss": 0.0034,
"reward": 0.99375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9125,
"step": 145
},
{
"completion_length": 204.075,
"epoch": 0.033130409574688366,
"grad_norm": 0.7038907751646122,
"kl": 0.06302261352539062,
"learning_rate": 6.622516556291392e-06,
"loss": 0.0025,
"reward": 1.03125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9375,
"step": 150
},
{
"completion_length": 160.18125,
"epoch": 0.034234756560511316,
"grad_norm": 1.0970259017722608,
"kl": 0.09358978271484375,
"learning_rate": 6.843267108167772e-06,
"loss": 0.0037,
"reward": 1.08125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.96875,
"step": 155
},
{
"completion_length": 180.5875,
"epoch": 0.03533910354633426,
"grad_norm": 0.5682863259937629,
"kl": 0.091546630859375,
"learning_rate": 7.064017660044151e-06,
"loss": 0.0037,
"reward": 1.0375,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.975,
"step": 160
},
{
"completion_length": 209.2625,
"epoch": 0.0364434505321572,
"grad_norm": 0.9429119760227406,
"kl": 0.08664703369140625,
"learning_rate": 7.28476821192053e-06,
"loss": 0.0035,
"reward": 1.08125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.96875,
"step": 165
},
{
"completion_length": 208.41875,
"epoch": 0.03754779751798015,
"grad_norm": 1.2790088963784818,
"kl": 0.101275634765625,
"learning_rate": 7.50551876379691e-06,
"loss": 0.0041,
"reward": 1.0625,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.16875,
"rewards/format_reward": 0.89375,
"step": 170
},
{
"completion_length": 259.24375,
"epoch": 0.038652144503803096,
"grad_norm": 0.46075949216263296,
"kl": 0.0850189208984375,
"learning_rate": 7.726269315673288e-06,
"loss": 0.0034,
"reward": 0.94375,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.89375,
"step": 175
},
{
"completion_length": 248.26875,
"epoch": 0.03975649148962604,
"grad_norm": 0.6242098331069889,
"kl": 0.086053466796875,
"learning_rate": 7.94701986754967e-06,
"loss": 0.0034,
"reward": 1.025,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.8875,
"step": 180
},
{
"completion_length": 238.375,
"epoch": 0.04086083847544899,
"grad_norm": 1.150282122681421,
"kl": 0.1117950439453125,
"learning_rate": 8.16777041942605e-06,
"loss": 0.0045,
"reward": 1.025,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.9125,
"step": 185
},
{
"completion_length": 219.4,
"epoch": 0.04196518546127193,
"grad_norm": 0.7542415145174329,
"kl": 0.126629638671875,
"learning_rate": 8.388520971302429e-06,
"loss": 0.0051,
"reward": 0.975,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.89375,
"step": 190
},
{
"completion_length": 198.54375,
"epoch": 0.043069532447094876,
"grad_norm": 0.6615977009838082,
"kl": 0.1339691162109375,
"learning_rate": 8.609271523178809e-06,
"loss": 0.0054,
"reward": 0.975,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.89375,
"step": 195
},
{
"completion_length": 230.6,
"epoch": 0.044173879432917826,
"grad_norm": 1.2480834347897671,
"kl": 0.1072998046875,
"learning_rate": 8.830022075055188e-06,
"loss": 0.0043,
"reward": 0.95625,
"reward_std": 0.2563262037932873,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.85,
"step": 200
},
{
"epoch": 0.044173879432917826,
"eval_completion_length": 205.55,
"eval_kl": 0.0896044921875,
"eval_loss": 0.003579025389626622,
"eval_reward": 1.085,
"eval_reward_std": 0.1484924215078354,
"eval_rewards/accuracy_reward": 0.13,
"eval_rewards/format_reward": 0.955,
"eval_runtime": 103.514,
"eval_samples_per_second": 0.956,
"eval_steps_per_second": 0.242,
"step": 200
},
{
"completion_length": 224.75,
"epoch": 0.04527822641874077,
"grad_norm": 0.5677247970494854,
"kl": 0.1135223388671875,
"learning_rate": 9.050772626931568e-06,
"loss": 0.0045,
"reward": 1.025,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.90625,
"step": 205
},
{
"completion_length": 276.25625,
"epoch": 0.04638257340456371,
"grad_norm": 0.6313414085910611,
"kl": 0.12886962890625,
"learning_rate": 9.271523178807948e-06,
"loss": 0.0052,
"reward": 1.125,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.20625,
"rewards/format_reward": 0.91875,
"step": 210
},
{
"completion_length": 337.5125,
"epoch": 0.04748692039038666,
"grad_norm": 0.6076982934519585,
"kl": 0.1175445556640625,
"learning_rate": 9.492273730684327e-06,
"loss": 0.0047,
"reward": 0.9125,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.81875,
"step": 215
},
{
"completion_length": 331.2875,
"epoch": 0.048591267376209606,
"grad_norm": 0.887758372335282,
"kl": 0.11523284912109374,
"learning_rate": 9.713024282560707e-06,
"loss": 0.0046,
"reward": 0.94375,
"reward_std": 0.23864853456616403,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.85625,
"step": 220
},
{
"completion_length": 269.38125,
"epoch": 0.04969561436203255,
"grad_norm": 0.6864886594586257,
"kl": 0.13491058349609375,
"learning_rate": 9.933774834437086e-06,
"loss": 0.0054,
"reward": 0.95,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.875,
"step": 225
},
{
"completion_length": 246.025,
"epoch": 0.0507999613478555,
"grad_norm": 1.2699307354463993,
"kl": 0.166473388671875,
"learning_rate": 1.0154525386313468e-05,
"loss": 0.0067,
"reward": 1.025,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.88125,
"step": 230
},
{
"completion_length": 206.09375,
"epoch": 0.05190430833367844,
"grad_norm": 0.6720713057618479,
"kl": 0.1639678955078125,
"learning_rate": 1.0375275938189846e-05,
"loss": 0.0066,
"reward": 1.0375,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.925,
"step": 235
},
{
"completion_length": 312.0875,
"epoch": 0.053008655319501385,
"grad_norm": 0.8771338863302535,
"kl": 0.25369873046875,
"learning_rate": 1.0596026490066227e-05,
"loss": 0.0101,
"reward": 0.9,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.8125,
"step": 240
},
{
"completion_length": 188.65,
"epoch": 0.054113002305324336,
"grad_norm": 0.6127023135715081,
"kl": 0.209857177734375,
"learning_rate": 1.0816777041942605e-05,
"loss": 0.0084,
"reward": 1.0125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.91875,
"step": 245
},
{
"completion_length": 183.35,
"epoch": 0.05521734929114728,
"grad_norm": 0.9178124946472914,
"kl": 0.2119171142578125,
"learning_rate": 1.1037527593818986e-05,
"loss": 0.0085,
"reward": 0.95625,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.90625,
"step": 250
},
{
"completion_length": 199.725,
"epoch": 0.05632169627697022,
"grad_norm": 0.9841255331094259,
"kl": 0.188714599609375,
"learning_rate": 1.1258278145695364e-05,
"loss": 0.0075,
"reward": 0.9875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.91875,
"step": 255
},
{
"completion_length": 222.5875,
"epoch": 0.05742604326279317,
"grad_norm": 0.6571161565702436,
"kl": 1504.1844940185547,
"learning_rate": 1.1479028697571745e-05,
"loss": 60.0766,
"reward": 0.975,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9,
"step": 260
},
{
"completion_length": 238.375,
"epoch": 0.058530390248616115,
"grad_norm": 0.7793687563873339,
"kl": 0.16044921875,
"learning_rate": 1.1699779249448125e-05,
"loss": 0.0064,
"reward": 1.04375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.925,
"step": 265
},
{
"completion_length": 308.975,
"epoch": 0.05963473723443906,
"grad_norm": 0.6524174398492628,
"kl": 0.344976806640625,
"learning_rate": 1.1920529801324505e-05,
"loss": 0.0138,
"reward": 1.03125,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.89375,
"step": 270
},
{
"completion_length": 298.2875,
"epoch": 0.06073908422026201,
"grad_norm": 0.515875299778908,
"kl": 0.228948974609375,
"learning_rate": 1.2141280353200884e-05,
"loss": 0.0092,
"reward": 0.98125,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.88125,
"step": 275
},
{
"completion_length": 328.84375,
"epoch": 0.06184343120608495,
"grad_norm": 31.625220010701653,
"kl": 0.2593719482421875,
"learning_rate": 1.2362030905077264e-05,
"loss": 0.0104,
"reward": 1.08125,
"reward_std": 0.30935921147465706,
"rewards/accuracy_reward": 0.225,
"rewards/format_reward": 0.85625,
"step": 280
},
{
"completion_length": 338.28125,
"epoch": 0.0629477781919079,
"grad_norm": 0.8132166957068776,
"kl": 0.26093597412109376,
"learning_rate": 1.2582781456953644e-05,
"loss": 0.0104,
"reward": 0.925,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.83125,
"step": 285
},
{
"completion_length": 276.55625,
"epoch": 0.06405212517773085,
"grad_norm": 0.8732592923475726,
"kl": 0.2025054931640625,
"learning_rate": 1.2803532008830025e-05,
"loss": 0.0081,
"reward": 1.0125,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.8875,
"step": 290
},
{
"completion_length": 222.75,
"epoch": 0.06515647216355379,
"grad_norm": 0.5307249225741028,
"kl": 0.1828094482421875,
"learning_rate": 1.3024282560706403e-05,
"loss": 0.0073,
"reward": 1.0625,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9375,
"step": 295
},
{
"completion_length": 176.43125,
"epoch": 0.06626081914937673,
"grad_norm": 0.7065408921391656,
"kl": 0.169866943359375,
"learning_rate": 1.3245033112582784e-05,
"loss": 0.0068,
"reward": 1.11875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1625,
"rewards/format_reward": 0.95625,
"step": 300
},
{
"epoch": 0.06626081914937673,
"eval_completion_length": 205.125,
"eval_kl": 0.178349609375,
"eval_loss": 0.007154763210564852,
"eval_reward": 1.085,
"eval_reward_std": 0.162634556889534,
"eval_rewards/accuracy_reward": 0.13,
"eval_rewards/format_reward": 0.955,
"eval_runtime": 111.3878,
"eval_samples_per_second": 0.889,
"eval_steps_per_second": 0.224,
"step": 300
},
{
"completion_length": 243.425,
"epoch": 0.06736516613519968,
"grad_norm": 0.7324178851810815,
"kl": 0.2018463134765625,
"learning_rate": 1.3465783664459162e-05,
"loss": 0.0081,
"reward": 1.0375,
"reward_std": 0.2828427076339722,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.9,
"step": 305
},
{
"completion_length": 246.875,
"epoch": 0.06846951312102263,
"grad_norm": 0.5430407102590828,
"kl": 0.199591064453125,
"learning_rate": 1.3686534216335543e-05,
"loss": 0.008,
"reward": 1.0,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.9125,
"step": 310
},
{
"completion_length": 231.8625,
"epoch": 0.06957386010684558,
"grad_norm": 2.525719433106217,
"kl": 0.256878662109375,
"learning_rate": 1.3907284768211921e-05,
"loss": 0.0103,
"reward": 1.08125,
"reward_std": 0.23864853456616403,
"rewards/accuracy_reward": 0.18125,
"rewards/format_reward": 0.9,
"step": 315
},
{
"completion_length": 269.71875,
"epoch": 0.07067820709266852,
"grad_norm": 2.2679595474497583,
"kl": 0.81484375,
"learning_rate": 1.4128035320088303e-05,
"loss": 0.0326,
"reward": 1.0,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.875,
"step": 320
},
{
"completion_length": 208.94375,
"epoch": 0.07178255407849146,
"grad_norm": 1.1848713457261022,
"kl": 0.43358154296875,
"learning_rate": 1.434878587196468e-05,
"loss": 0.0174,
"reward": 0.99375,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.88125,
"step": 325
},
{
"completion_length": 186.3375,
"epoch": 0.0728869010643144,
"grad_norm": 1.434354885038631,
"kl": 0.292218017578125,
"learning_rate": 1.456953642384106e-05,
"loss": 0.0117,
"reward": 0.74375,
"reward_std": 0.30935921147465706,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.66875,
"step": 330
},
{
"completion_length": 616.03125,
"epoch": 0.07399124805013735,
"grad_norm": 0.46617566479772593,
"kl": 0.228448486328125,
"learning_rate": 1.479028697571744e-05,
"loss": 0.0091,
"reward": 0.58125,
"reward_std": 0.2563262037932873,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.5,
"step": 335
},
{
"completion_length": 781.5875,
"epoch": 0.0750955950359603,
"grad_norm": 0.4999142123724588,
"kl": 0.203704833984375,
"learning_rate": 1.501103752759382e-05,
"loss": 0.0081,
"reward": 0.5125,
"reward_std": 0.30052037686109545,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.48125,
"step": 340
},
{
"completion_length": 438.83125,
"epoch": 0.07619994202178325,
"grad_norm": 1.1259807453052872,
"kl": 0.229962158203125,
"learning_rate": 1.52317880794702e-05,
"loss": 0.0092,
"reward": 0.99375,
"reward_std": 0.2916815422475338,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.8625,
"step": 345
},
{
"completion_length": 308.03125,
"epoch": 0.07730428900760619,
"grad_norm": 1.1381907828574633,
"kl": 0.246209716796875,
"learning_rate": 1.5452538631346577e-05,
"loss": 0.0098,
"reward": 1.01875,
"reward_std": 0.2740038730204105,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.8625,
"step": 350
},
{
"completion_length": 466.78125,
"epoch": 0.07840863599342913,
"grad_norm": 145.06654530948717,
"kl": 1.06409912109375,
"learning_rate": 1.567328918322296e-05,
"loss": 0.0425,
"reward": 0.83125,
"reward_std": 0.3623922191560268,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.725,
"step": 355
},
{
"completion_length": 601.19375,
"epoch": 0.07951298297925208,
"grad_norm": 4.173181885857791,
"kl": 0.5132080078125,
"learning_rate": 1.589403973509934e-05,
"loss": 0.0205,
"reward": 0.5625,
"reward_std": 0.38890872299671175,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.5125,
"step": 360
},
{
"completion_length": 552.525,
"epoch": 0.08061732996507502,
"grad_norm": 2.488203792347465,
"kl": 0.5505126953125,
"learning_rate": 1.6114790286975718e-05,
"loss": 0.022,
"reward": 0.675,
"reward_std": 0.38890872299671175,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.61875,
"step": 365
},
{
"completion_length": 323.6625,
"epoch": 0.08172167695089798,
"grad_norm": 0.4380180481079113,
"kl": 0.425970458984375,
"learning_rate": 1.63355408388521e-05,
"loss": 0.017,
"reward": 1.0375,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.90625,
"step": 370
},
{
"completion_length": 284.35,
"epoch": 0.08282602393672092,
"grad_norm": 1.2931698264509297,
"kl": 0.374395751953125,
"learning_rate": 1.6556291390728477e-05,
"loss": 0.015,
"reward": 1.01875,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9,
"step": 375
},
{
"completion_length": 271.5875,
"epoch": 0.08393037092254386,
"grad_norm": 2.19321510252618,
"kl": 0.78577880859375,
"learning_rate": 1.6777041942604858e-05,
"loss": 0.0315,
"reward": 1.0625,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.175,
"rewards/format_reward": 0.8875,
"step": 380
},
{
"completion_length": 247.65,
"epoch": 0.08503471790836681,
"grad_norm": 0.8492367847141669,
"kl": 0.65609130859375,
"learning_rate": 1.699779249448124e-05,
"loss": 0.0263,
"reward": 1.04375,
"reward_std": 0.23864853456616403,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.89375,
"step": 385
},
{
"completion_length": 279.06875,
"epoch": 0.08613906489418975,
"grad_norm": 1.1176439301337926,
"kl": 0.440093994140625,
"learning_rate": 1.7218543046357617e-05,
"loss": 0.0176,
"reward": 1.0125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.90625,
"step": 390
},
{
"completion_length": 381.325,
"epoch": 0.0872434118800127,
"grad_norm": 1.4438637661323328,
"kl": 1.2351318359375,
"learning_rate": 1.7439293598234e-05,
"loss": 0.0494,
"reward": 0.81875,
"reward_std": 0.30935921147465706,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.73125,
"step": 395
},
{
"completion_length": 393.53125,
"epoch": 0.08834775886583565,
"grad_norm": 0.8840798582686505,
"kl": 0.66854248046875,
"learning_rate": 1.7660044150110377e-05,
"loss": 0.0267,
"reward": 0.9625,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.8375,
"step": 400
},
{
"epoch": 0.08834775886583565,
"eval_completion_length": 357.46,
"eval_kl": 0.44,
"eval_loss": 0.017609253525733948,
"eval_reward": 0.99,
"eval_reward_std": 0.24041630148887635,
"eval_rewards/accuracy_reward": 0.155,
"eval_rewards/format_reward": 0.835,
"eval_runtime": 153.4656,
"eval_samples_per_second": 0.645,
"eval_steps_per_second": 0.163,
"step": 400
},
{
"completion_length": 289.31875,
"epoch": 0.0894521058516586,
"grad_norm": 5.233032104359776,
"kl": 0.511737060546875,
"learning_rate": 1.7880794701986758e-05,
"loss": 0.0205,
"reward": 1.00625,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9125,
"step": 405
},
{
"completion_length": 255.6,
"epoch": 0.09055645283748154,
"grad_norm": 0.5288317975266756,
"kl": 0.2679931640625,
"learning_rate": 1.8101545253863136e-05,
"loss": 0.0107,
"reward": 1.05,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.95,
"step": 410
},
{
"completion_length": 283.04375,
"epoch": 0.09166079982330448,
"grad_norm": 0.44523434013578594,
"kl": 0.219281005859375,
"learning_rate": 1.8322295805739517e-05,
"loss": 0.0088,
"reward": 1.0625,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95,
"step": 415
},
{
"completion_length": 308.90625,
"epoch": 0.09276514680912742,
"grad_norm": 0.8815212129055245,
"kl": 0.264593505859375,
"learning_rate": 1.8543046357615895e-05,
"loss": 0.0106,
"reward": 1.0125,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9125,
"step": 420
},
{
"completion_length": 314.7375,
"epoch": 0.09386949379495037,
"grad_norm": 1.3789377076942395,
"kl": 0.3002197265625,
"learning_rate": 1.8763796909492276e-05,
"loss": 0.012,
"reward": 1.0,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.90625,
"step": 425
},
{
"completion_length": 339.475,
"epoch": 0.09497384078077332,
"grad_norm": 0.6184446711073517,
"kl": 0.441192626953125,
"learning_rate": 1.8984547461368654e-05,
"loss": 0.0177,
"reward": 0.95625,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.90625,
"step": 430
},
{
"completion_length": 317.0125,
"epoch": 0.09607818776659627,
"grad_norm": 1.5357040975928846,
"kl": 0.2957275390625,
"learning_rate": 1.9205298013245036e-05,
"loss": 0.0118,
"reward": 1.0,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.94375,
"step": 435
},
{
"completion_length": 326.9,
"epoch": 0.09718253475241921,
"grad_norm": 0.9509356833694793,
"kl": 0.24755859375,
"learning_rate": 1.9426048565121414e-05,
"loss": 0.0099,
"reward": 0.95,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.89375,
"step": 440
},
{
"completion_length": 247.94375,
"epoch": 0.09828688173824215,
"grad_norm": 7.331260846492792,
"kl": 0.458038330078125,
"learning_rate": 1.9646799116997795e-05,
"loss": 0.0183,
"reward": 0.9875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.93125,
"step": 445
},
{
"completion_length": 244.2,
"epoch": 0.0993912287240651,
"grad_norm": 2.2209695163797973,
"kl": 0.579638671875,
"learning_rate": 1.9867549668874173e-05,
"loss": 0.0232,
"reward": 0.98125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.93125,
"step": 450
},
{
"completion_length": 247.9875,
"epoch": 0.10049557570988804,
"grad_norm": 0.6146792415154715,
"kl": 0.3593017578125,
"learning_rate": 1.9999988107104428e-05,
"loss": 0.0144,
"reward": 1.025,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.95625,
"step": 455
},
{
"completion_length": 204.425,
"epoch": 0.101599922695711,
"grad_norm": 1.2131689914986956,
"kl": 0.246929931640625,
"learning_rate": 1.9999854312354064e-05,
"loss": 0.0099,
"reward": 1.08125,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.95625,
"step": 460
},
{
"completion_length": 215.4625,
"epoch": 0.10270426968153394,
"grad_norm": 22.462295342430743,
"kl": 0.606536865234375,
"learning_rate": 1.999957185872951e-05,
"loss": 0.0243,
"reward": 0.94375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.90625,
"step": 465
},
{
"completion_length": 180.38125,
"epoch": 0.10380861666735688,
"grad_norm": 0.22049287292981123,
"kl": 1.026416015625,
"learning_rate": 1.999914075042975e-05,
"loss": 0.0411,
"reward": 0.99375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.93125,
"step": 470
},
{
"completion_length": 227.85625,
"epoch": 0.10491296365317983,
"grad_norm": 0.5306656385299113,
"kl": 0.257354736328125,
"learning_rate": 1.9998560993863682e-05,
"loss": 0.0103,
"reward": 0.925,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.8875,
"step": 475
},
{
"completion_length": 173.825,
"epoch": 0.10601731063900277,
"grad_norm": 0.7697195391306778,
"kl": 0.243804931640625,
"learning_rate": 1.999783259765003e-05,
"loss": 0.0098,
"reward": 0.9875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.95,
"step": 480
},
{
"completion_length": 245.43125,
"epoch": 0.10712165762482571,
"grad_norm": 0.8895584020157263,
"kl": 0.344189453125,
"learning_rate": 1.9996955572617202e-05,
"loss": 0.0138,
"reward": 0.94375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.925,
"step": 485
},
{
"completion_length": 228.575,
"epoch": 0.10822600461064867,
"grad_norm": 0.4019967230109815,
"kl": 0.6487060546875,
"learning_rate": 1.999592993180315e-05,
"loss": 0.026,
"reward": 0.95,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9,
"step": 490
},
{
"completion_length": 191.1125,
"epoch": 0.10933035159647161,
"grad_norm": 2.2236898903555584,
"kl": 0.3437255859375,
"learning_rate": 1.9994755690455154e-05,
"loss": 0.0137,
"reward": 1.025,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.95625,
"step": 495
},
{
"completion_length": 227.65625,
"epoch": 0.11043469858229456,
"grad_norm": 0.31534477505615033,
"kl": 0.254638671875,
"learning_rate": 1.9993432866029604e-05,
"loss": 0.0102,
"reward": 1.0375,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.96875,
"step": 500
},
{
"epoch": 0.11043469858229456,
"eval_completion_length": 305.16,
"eval_kl": 0.25392578125,
"eval_loss": 0.010133117437362671,
"eval_reward": 1.01,
"eval_reward_std": 0.09899494767189027,
"eval_rewards/accuracy_reward": 0.065,
"eval_rewards/format_reward": 0.945,
"eval_runtime": 127.2591,
"eval_samples_per_second": 0.778,
"eval_steps_per_second": 0.196,
"step": 500
},
{
"completion_length": 348.06875,
"epoch": 0.1115390455681175,
"grad_norm": 0.47067061501157526,
"kl": 0.3302001953125,
"learning_rate": 1.9991961478191753e-05,
"loss": 0.0132,
"reward": 0.95625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.9125,
"step": 505
},
{
"completion_length": 404.30625,
"epoch": 0.11264339255394044,
"grad_norm": 0.3909987106694373,
"kl": 0.290771484375,
"learning_rate": 1.99903415488154e-05,
"loss": 0.0116,
"reward": 0.975,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.93125,
"step": 510
},
{
"completion_length": 331.25,
"epoch": 0.11374773953976339,
"grad_norm": 0.13154079900966428,
"kl": 0.2630859375,
"learning_rate": 1.998857310198259e-05,
"loss": 0.0105,
"reward": 0.90625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0125,
"rewards/format_reward": 0.89375,
"step": 515
},
{
"completion_length": 294.86875,
"epoch": 0.11485208652558634,
"grad_norm": 0.600369457847733,
"kl": 0.209710693359375,
"learning_rate": 1.998665616398323e-05,
"loss": 0.0084,
"reward": 0.9875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.95,
"step": 520
},
{
"completion_length": 194.80625,
"epoch": 0.11595643351140929,
"grad_norm": 0.6528037119159925,
"kl": 0.23944091796875,
"learning_rate": 1.9984590763314722e-05,
"loss": 0.0096,
"reward": 1.01875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.975,
"step": 525
},
{
"completion_length": 150.6,
"epoch": 0.11706078049723223,
"grad_norm": 0.2685863872987803,
"kl": 0.19815673828125,
"learning_rate": 1.998237693068153e-05,
"loss": 0.0079,
"reward": 1.00625,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.975,
"step": 530
},
{
"completion_length": 190.7,
"epoch": 0.11816512748305517,
"grad_norm": 0.6345565715762368,
"kl": 0.385015869140625,
"learning_rate": 1.9980014698994722e-05,
"loss": 0.0154,
"reward": 1.0,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.95625,
"step": 535
},
{
"completion_length": 263.48125,
"epoch": 0.11926947446887812,
"grad_norm": 0.7032479277266585,
"kl": 0.430792236328125,
"learning_rate": 1.997750410337147e-05,
"loss": 0.0172,
"reward": 1.0125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.96875,
"step": 540
},
{
"completion_length": 322.3125,
"epoch": 0.12037382145470106,
"grad_norm": 0.7900459644547021,
"kl": 0.45318603515625,
"learning_rate": 1.997484518113456e-05,
"loss": 0.0181,
"reward": 0.9625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.925,
"step": 545
},
{
"completion_length": 282.39375,
"epoch": 0.12147816844052402,
"grad_norm": 1.7131720462446698,
"kl": 0.667669677734375,
"learning_rate": 1.9972037971811802e-05,
"loss": 0.0267,
"reward": 0.94375,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0125,
"rewards/format_reward": 0.93125,
"step": 550
},
{
"completion_length": 351.20625,
"epoch": 0.12258251542634696,
"grad_norm": 2.3429062472049664,
"kl": 0.4404296875,
"learning_rate": 1.9969082517135463e-05,
"loss": 0.0176,
"reward": 0.9875,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.93125,
"step": 555
},
{
"completion_length": 302.83125,
"epoch": 0.1236868624121699,
"grad_norm": 0.42331075217778713,
"kl": 0.244976806640625,
"learning_rate": 1.9965978861041637e-05,
"loss": 0.0098,
"reward": 1.04375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9625,
"step": 560
},
{
"completion_length": 227.3625,
"epoch": 0.12479120939799285,
"grad_norm": 6.878210574805294,
"kl": 0.490325927734375,
"learning_rate": 1.99627270496696e-05,
"loss": 0.0196,
"reward": 1.0125,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.95625,
"step": 565
},
{
"completion_length": 259.70625,
"epoch": 0.1258955563838158,
"grad_norm": 0.29828625265818887,
"kl": 0.570684814453125,
"learning_rate": 1.995932713136112e-05,
"loss": 0.0228,
"reward": 0.9625,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.925,
"step": 570
},
{
"completion_length": 256.55625,
"epoch": 0.12699990336963873,
"grad_norm": 0.5399903589695695,
"kl": 0.375726318359375,
"learning_rate": 1.9955779156659735e-05,
"loss": 0.015,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.975,
"step": 575
},
{
"completion_length": 242.5,
"epoch": 0.1281042503554617,
"grad_norm": 9.42984310059387,
"kl": 0.512103271484375,
"learning_rate": 1.9952083178310002e-05,
"loss": 0.0205,
"reward": 1.0375,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9625,
"step": 580
},
{
"completion_length": 217.3375,
"epoch": 0.12920859734128462,
"grad_norm": 4.883219507705031,
"kl": 0.71790771484375,
"learning_rate": 1.994823925125672e-05,
"loss": 0.0287,
"reward": 1.03125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.95,
"step": 585
},
{
"completion_length": 213.125,
"epoch": 0.13031294432710758,
"grad_norm": 0.656048665938222,
"kl": 0.331298828125,
"learning_rate": 1.994424743264412e-05,
"loss": 0.0132,
"reward": 1.10625,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.95625,
"step": 590
},
{
"completion_length": 175.19375,
"epoch": 0.13141729131293053,
"grad_norm": 0.4384345369089057,
"kl": 0.21328125,
"learning_rate": 1.9940107781814976e-05,
"loss": 0.0085,
"reward": 1.08125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.99375,
"step": 595
},
{
"completion_length": 232.13125,
"epoch": 0.13252163829875346,
"grad_norm": 9.864648628359978,
"kl": 0.3187255859375,
"learning_rate": 1.993582036030978e-05,
"loss": 0.0127,
"reward": 1.00625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.95625,
"step": 600
},
{
"epoch": 0.13252163829875346,
"eval_completion_length": 205.38,
"eval_kl": 16.67103515625,
"eval_loss": 0.669373095035553,
"eval_reward": 1.06,
"eval_reward_std": 0.22627416610717774,
"eval_rewards/accuracy_reward": 0.135,
"eval_rewards/format_reward": 0.925,
"eval_runtime": 95.7714,
"eval_samples_per_second": 1.034,
"eval_steps_per_second": 0.261,
"step": 600
},
{
"completion_length": 220.1375,
"epoch": 0.13362598528457642,
"grad_norm": 0.9515331014381163,
"kl": 0.321160888671875,
"learning_rate": 1.993138523186578e-05,
"loss": 0.0129,
"reward": 1.06875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.98125,
"step": 605
},
{
"completion_length": 219.99375,
"epoch": 0.13473033227039935,
"grad_norm": 3.2832305875346033,
"kl": 0.55125732421875,
"learning_rate": 1.9926802462416054e-05,
"loss": 0.0221,
"reward": 1.04375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9625,
"step": 610
},
{
"completion_length": 218.5875,
"epoch": 0.1358346792562223,
"grad_norm": 1.0232056540109529,
"kl": 0.942523193359375,
"learning_rate": 1.9922072120088537e-05,
"loss": 0.0377,
"reward": 0.99375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.93125,
"step": 615
},
{
"completion_length": 268.2125,
"epoch": 0.13693902624204526,
"grad_norm": 1.7246486491338628,
"kl": 0.6009521484375,
"learning_rate": 1.991719427520499e-05,
"loss": 0.024,
"reward": 1.0125,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.9,
"step": 620
},
{
"completion_length": 240.51875,
"epoch": 0.1380433732278682,
"grad_norm": 7.217525782475475,
"kl": 0.579345703125,
"learning_rate": 1.9912169000279952e-05,
"loss": 0.0231,
"reward": 1.05,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.975,
"step": 625
},
{
"completion_length": 307.725,
"epoch": 0.13914772021369115,
"grad_norm": 0.32686422446894037,
"kl": 0.320556640625,
"learning_rate": 1.9906996370019692e-05,
"loss": 0.0128,
"reward": 1.05625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9375,
"step": 630
},
{
"completion_length": 448.2375,
"epoch": 0.14025206719951408,
"grad_norm": 4.724746821053579,
"kl": 0.55283203125,
"learning_rate": 1.990167646132107e-05,
"loss": 0.0221,
"reward": 0.8125,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.74375,
"step": 635
},
{
"completion_length": 466.6125,
"epoch": 0.14135641418533704,
"grad_norm": 0.3626333681552602,
"kl": 3.0367431640625,
"learning_rate": 1.9896209353270394e-05,
"loss": 0.1216,
"reward": 0.7625,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.725,
"step": 640
},
{
"completion_length": 351.9125,
"epoch": 0.14246076117115997,
"grad_norm": 0.3534359394092258,
"kl": 0.2459228515625,
"learning_rate": 1.989059512714227e-05,
"loss": 0.0098,
"reward": 1.01875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.925,
"step": 645
},
{
"completion_length": 309.93125,
"epoch": 0.14356510815698292,
"grad_norm": 0.6269748799278583,
"kl": 0.239697265625,
"learning_rate": 1.988483386639836e-05,
"loss": 0.0096,
"reward": 1.1125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.9625,
"step": 650
},
{
"completion_length": 304.49375,
"epoch": 0.14466945514280588,
"grad_norm": 2.4513786768690364,
"kl": 0.2604248046875,
"learning_rate": 1.9878925656686167e-05,
"loss": 0.0104,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.98125,
"step": 655
},
{
"completion_length": 307.99375,
"epoch": 0.1457738021286288,
"grad_norm": 0.5055349332276848,
"kl": 0.27725830078125,
"learning_rate": 1.9872870585837757e-05,
"loss": 0.0111,
"reward": 1.00625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.95,
"step": 660
},
{
"completion_length": 281.29375,
"epoch": 0.14687814911445177,
"grad_norm": 42388.24147147825,
"kl": 170.7101318359375,
"learning_rate": 1.9866668743868437e-05,
"loss": 6.8324,
"reward": 0.9375,
"reward_std": 0.03535533845424652,
"rewards/accuracy_reward": 0.0125,
"rewards/format_reward": 0.925,
"step": 665
},
{
"completion_length": 331.09375,
"epoch": 0.1479824961002747,
"grad_norm": 3.1708940830460186,
"kl": 1.48009033203125,
"learning_rate": 1.9860320222975435e-05,
"loss": 0.0594,
"reward": 0.8875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.84375,
"step": 670
},
{
"completion_length": 329.43125,
"epoch": 0.14908684308609765,
"grad_norm": 9.028073103998265,
"kl": 0.427667236328125,
"learning_rate": 1.9853825117536522e-05,
"loss": 0.0171,
"reward": 0.95,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.9125,
"step": 675
},
{
"completion_length": 295.99375,
"epoch": 0.1501911900719206,
"grad_norm": 1.3497959756065208,
"kl": 0.8688232421875,
"learning_rate": 1.9847183524108614e-05,
"loss": 0.0348,
"reward": 0.98125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.93125,
"step": 680
},
{
"completion_length": 256.76875,
"epoch": 0.15129553705774354,
"grad_norm": 0.5127846687930365,
"kl": 0.305950927734375,
"learning_rate": 1.9840395541426333e-05,
"loss": 0.0122,
"reward": 1.06875,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.96875,
"step": 685
},
{
"completion_length": 248.68125,
"epoch": 0.1523998840435665,
"grad_norm": 0.9607285175875181,
"kl": 0.33861083984375,
"learning_rate": 1.983346127040053e-05,
"loss": 0.0135,
"reward": 1.01875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.95,
"step": 690
},
{
"completion_length": 209.85,
"epoch": 0.15350423102938943,
"grad_norm": 2.0950203003912313,
"kl": 0.3568115234375,
"learning_rate": 1.9826380814116795e-05,
"loss": 0.0143,
"reward": 1.0,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.925,
"step": 695
},
{
"completion_length": 618.08125,
"epoch": 0.15460857801521238,
"grad_norm": 2.8704953934506423,
"kl": 1.026025390625,
"learning_rate": 1.9819154277833938e-05,
"loss": 0.041,
"reward": 0.71875,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.6875,
"step": 700
},
{
"epoch": 0.15460857801521238,
"eval_completion_length": 959.67,
"eval_kl": 1.478125,
"eval_loss": 0.05926254764199257,
"eval_reward": 0.6,
"eval_reward_std": 0.38183765530586244,
"eval_rewards/accuracy_reward": 0.045,
"eval_rewards/format_reward": 0.555,
"eval_runtime": 261.2361,
"eval_samples_per_second": 0.379,
"eval_steps_per_second": 0.096,
"step": 700
},
{
"completion_length": 1003.5125,
"epoch": 0.1557129250010353,
"grad_norm": 0.28454444589509803,
"kl": 1.074609375,
"learning_rate": 1.9811781768982392e-05,
"loss": 0.043,
"reward": 0.7375,
"reward_std": 0.2828427076339722,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.71875,
"step": 705
},
{
"completion_length": 1024.0,
"epoch": 0.15681727198685827,
"grad_norm": 0.20551909954742464,
"kl": 648601.9237182618,
"learning_rate": 1.980426339716264e-05,
"loss": 25944.5938,
"reward": 0.8125,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.00625,
"rewards/format_reward": 0.80625,
"step": 710
},
{
"completion_length": 1024.0,
"epoch": 0.15792161897268123,
"grad_norm": 0.1216302069380067,
"kl": 0.264453125,
"learning_rate": 1.9796599274143586e-05,
"loss": 0.0106,
"reward": 0.99375,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.0125,
"rewards/format_reward": 0.98125,
"step": 715
},
{
"completion_length": 1024.0,
"epoch": 0.15902596595850416,
"grad_norm": 0.13579361297724876,
"kl": 0.23675537109375,
"learning_rate": 1.9788789513860875e-05,
"loss": 0.0095,
"reward": 1.0,
"reward_std": 0.03535533845424652,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.98125,
"step": 720
},
{
"completion_length": 1024.0,
"epoch": 0.1601303129443271,
"grad_norm": 0.1484233873762049,
"kl": 0.26705322265625,
"learning_rate": 1.9780834232415214e-05,
"loss": 0.0107,
"reward": 0.975,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.95625,
"step": 725
},
{
"completion_length": 1024.0,
"epoch": 0.16123465993015004,
"grad_norm": 0.2382323266501869,
"kl": 0.26546630859375,
"learning_rate": 1.9772733548070647e-05,
"loss": 0.0106,
"reward": 0.88125,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.85625,
"step": 730
},
{
"completion_length": 1024.0,
"epoch": 0.162339006915973,
"grad_norm": 0.22617291442101026,
"kl": 0.29144287109375,
"learning_rate": 1.9764487581252787e-05,
"loss": 0.0117,
"reward": 0.90625,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.88125,
"step": 735
},
{
"completion_length": 1024.0,
"epoch": 0.16344335390179596,
"grad_norm": 0.27522740788676725,
"kl": 0.272119140625,
"learning_rate": 1.975609645454704e-05,
"loss": 0.0109,
"reward": 0.95625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.91875,
"step": 740
},
{
"completion_length": 1024.0,
"epoch": 0.1645477008876189,
"grad_norm": 0.26380621267798754,
"kl": 0.3930419921875,
"learning_rate": 1.9747560292696763e-05,
"loss": 0.0157,
"reward": 0.9625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.93125,
"step": 745
},
{
"completion_length": 1024.0,
"epoch": 0.16565204787344184,
"grad_norm": 0.28940243726369763,
"kl": 0.33485107421875,
"learning_rate": 1.9738879222601425e-05,
"loss": 0.0134,
"reward": 0.8875,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.85625,
"step": 750
},
{
"completion_length": 1024.0,
"epoch": 0.16675639485926477,
"grad_norm": 0.28544947271914256,
"kl": 0.5123046875,
"learning_rate": 1.9730053373314722e-05,
"loss": 0.0205,
"reward": 0.9125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.875,
"step": 755
},
{
"completion_length": 1020.45,
"epoch": 0.16786074184508773,
"grad_norm": 0.25645868139823796,
"kl": 0.56268310546875,
"learning_rate": 1.9721082876042644e-05,
"loss": 0.0225,
"reward": 0.95625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.925,
"step": 760
},
{
"completion_length": 1024.0,
"epoch": 0.16896508883091066,
"grad_norm": 0.12040072575635787,
"kl": 0.48553466796875,
"learning_rate": 1.9711967864141542e-05,
"loss": 0.0194,
"reward": 0.98125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.9375,
"step": 765
},
{
"completion_length": 1024.0,
"epoch": 0.17006943581673362,
"grad_norm": 0.17007869061264408,
"kl": 0.50712890625,
"learning_rate": 1.970270847311612e-05,
"loss": 0.0203,
"reward": 1.05625,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.975,
"step": 770
},
{
"completion_length": 1020.9375,
"epoch": 0.17117378280255657,
"grad_norm": 0.17581235812523752,
"kl": 0.49864501953125,
"learning_rate": 1.9693304840617456e-05,
"loss": 0.0199,
"reward": 1.0375,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.94375,
"step": 775
},
{
"completion_length": 1024.0,
"epoch": 0.1722781297883795,
"grad_norm": 0.18169044119511507,
"kl": 0.66134033203125,
"learning_rate": 1.968375710644093e-05,
"loss": 0.0265,
"reward": 0.96875,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.8875,
"step": 780
},
{
"completion_length": 1024.0,
"epoch": 0.17338247677420246,
"grad_norm": 0.44893330655977454,
"kl": 0.538287353515625,
"learning_rate": 1.9674065412524147e-05,
"loss": 0.0215,
"reward": 0.75,
"reward_std": 0.30052037686109545,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.7,
"step": 785
},
{
"completion_length": 1024.0,
"epoch": 0.1744868237600254,
"grad_norm": 1.1859702994440748,
"kl": 0.7820556640625,
"learning_rate": 1.9664229902944833e-05,
"loss": 0.0313,
"reward": 0.88125,
"reward_std": 0.2563262037932873,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.81875,
"step": 790
},
{
"completion_length": 1016.3125,
"epoch": 0.17559117074584835,
"grad_norm": 0.22143586568637066,
"kl": 0.54296875,
"learning_rate": 1.9654250723918706e-05,
"loss": 0.0217,
"reward": 0.99375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.925,
"step": 795
},
{
"completion_length": 1018.775,
"epoch": 0.1766955177316713,
"grad_norm": 22.572401118778046,
"kl": 1.007623291015625,
"learning_rate": 1.9644128023797273e-05,
"loss": 0.0403,
"reward": 1.025,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.9875,
"step": 800
},
{
"epoch": 0.1766955177316713,
"eval_completion_length": 999.645,
"eval_kl": 0.35353515625,
"eval_loss": 0.0141792893409729,
"eval_reward": 1.045,
"eval_reward_std": 0.13435028612613678,
"eval_rewards/accuracy_reward": 0.085,
"eval_rewards/format_reward": 0.96,
"eval_runtime": 263.5974,
"eval_samples_per_second": 0.376,
"eval_steps_per_second": 0.095,
"step": 800
},
{
"completion_length": 856.90625,
"epoch": 0.17779986471749423,
"grad_norm": 0.4670617314622781,
"kl": 0.29603271484375,
"learning_rate": 1.9633861953065648e-05,
"loss": 0.0118,
"reward": 0.9125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0125,
"rewards/format_reward": 0.9,
"step": 805
},
{
"completion_length": 235.19375,
"epoch": 0.1789042117033172,
"grad_norm": 0.8767573552781158,
"kl": 0.281640625,
"learning_rate": 1.9623452664340305e-05,
"loss": 0.0113,
"reward": 0.95625,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.88125,
"step": 810
},
{
"completion_length": 194.6875,
"epoch": 0.18000855868914012,
"grad_norm": 0.8245609924387703,
"kl": 0.235968017578125,
"learning_rate": 1.9612900312366815e-05,
"loss": 0.0094,
"reward": 0.98125,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.89375,
"step": 815
},
{
"completion_length": 177.96875,
"epoch": 0.18111290567496308,
"grad_norm": 0.5059362794610306,
"kl": 0.2334716796875,
"learning_rate": 1.9602205054017534e-05,
"loss": 0.0093,
"reward": 1.04375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9625,
"step": 820
},
{
"completion_length": 217.3625,
"epoch": 0.182217252660786,
"grad_norm": 0.4835214879479257,
"kl": 0.26324462890625,
"learning_rate": 1.9591367048289297e-05,
"loss": 0.0105,
"reward": 1.01875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.975,
"step": 825
},
{
"completion_length": 221.90625,
"epoch": 0.18332159964660896,
"grad_norm": 0.6626328778695273,
"kl": 0.28028564453125,
"learning_rate": 1.9580386456301014e-05,
"loss": 0.0112,
"reward": 1.05,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.975,
"step": 830
},
{
"completion_length": 205.31875,
"epoch": 0.18442594663243192,
"grad_norm": 0.26541083822971634,
"kl": 0.27823486328125,
"learning_rate": 1.9569263441291312e-05,
"loss": 0.0111,
"reward": 1.00625,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.98125,
"step": 835
},
{
"completion_length": 206.03125,
"epoch": 0.18553029361825485,
"grad_norm": 0.12829507495547837,
"kl": 0.29173583984375,
"learning_rate": 1.9557998168616087e-05,
"loss": 0.0117,
"reward": 1.025,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.98125,
"step": 840
},
{
"completion_length": 197.2375,
"epoch": 0.1866346406040778,
"grad_norm": 0.632523566350679,
"kl": 0.3269287109375,
"learning_rate": 1.9546590805746054e-05,
"loss": 0.0131,
"reward": 1.0125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.96875,
"step": 845
},
{
"completion_length": 201.3125,
"epoch": 0.18773898758990074,
"grad_norm": 0.4218691032301411,
"kl": 0.254644775390625,
"learning_rate": 1.9535041522264256e-05,
"loss": 0.0102,
"reward": 1.03125,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.9875,
"step": 850
},
{
"completion_length": 215.075,
"epoch": 0.1888433345757237,
"grad_norm": 0.5263796297221296,
"kl": 0.24737548828125,
"learning_rate": 1.9523350489863545e-05,
"loss": 0.0099,
"reward": 1.01875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.98125,
"step": 855
},
{
"completion_length": 293.66875,
"epoch": 0.18994768156154665,
"grad_norm": 0.5106483317493103,
"kl": 0.24151611328125,
"learning_rate": 1.951151788234402e-05,
"loss": 0.0097,
"reward": 1.03125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.96875,
"step": 860
},
{
"completion_length": 341.4625,
"epoch": 0.19105202854736958,
"grad_norm": 0.3446610557866059,
"kl": 0.21561279296875,
"learning_rate": 1.949954387561046e-05,
"loss": 0.0086,
"reward": 1.05625,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 1.0,
"step": 865
},
{
"completion_length": 290.48125,
"epoch": 0.19215637553319254,
"grad_norm": 0.5188322122434208,
"kl": 0.22645263671875,
"learning_rate": 1.9487428647669688e-05,
"loss": 0.0091,
"reward": 1.05625,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9875,
"step": 870
},
{
"completion_length": 265.8,
"epoch": 0.19326072251901547,
"grad_norm": 0.36026950031545973,
"kl": 0.2487060546875,
"learning_rate": 1.947517237862795e-05,
"loss": 0.0099,
"reward": 1.05625,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 1.0,
"step": 875
},
{
"completion_length": 229.44375,
"epoch": 0.19436506950483842,
"grad_norm": 0.27419984769066713,
"kl": 0.247833251953125,
"learning_rate": 1.9462775250688208e-05,
"loss": 0.0099,
"reward": 1.05,
"reward_std": 0.03535533845424652,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 1.0,
"step": 880
},
{
"completion_length": 242.05625,
"epoch": 0.19546941649066138,
"grad_norm": 0.22671689499182268,
"kl": 0.2520751953125,
"learning_rate": 1.9450237448147463e-05,
"loss": 0.0101,
"reward": 1.0625,
"reward_std": 0.03535533845424652,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9875,
"step": 885
},
{
"completion_length": 250.46875,
"epoch": 0.1965737634764843,
"grad_norm": 0.6022152556248251,
"kl": 0.26463623046875,
"learning_rate": 1.943755915739399e-05,
"loss": 0.0106,
"reward": 1.03125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.95625,
"step": 890
},
{
"completion_length": 202.0625,
"epoch": 0.19767811046230727,
"grad_norm": 0.3986065730178507,
"kl": 0.2731201171875,
"learning_rate": 1.9424740566904572e-05,
"loss": 0.0109,
"reward": 1.01875,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.99375,
"step": 895
},
{
"completion_length": 199.31875,
"epoch": 0.1987824574481302,
"grad_norm": 0.3484175794006397,
"kl": 0.2666259765625,
"learning_rate": 1.9411781867241718e-05,
"loss": 0.0107,
"reward": 1.03125,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.9875,
"step": 900
},
{
"epoch": 0.1987824574481302,
"eval_completion_length": 219.105,
"eval_kl": 0.2623046875,
"eval_loss": 0.010502400808036327,
"eval_reward": 1.085,
"eval_reward_std": 0.13435028612613678,
"eval_rewards/accuracy_reward": 0.1,
"eval_rewards/format_reward": 0.985,
"eval_runtime": 94.3452,
"eval_samples_per_second": 1.049,
"eval_steps_per_second": 0.265,
"step": 900
},
{
"completion_length": 249.375,
"epoch": 0.19988680443395315,
"grad_norm": 0.11204663969136651,
"kl": 0.28580322265625,
"learning_rate": 1.9398683251050796e-05,
"loss": 0.0114,
"reward": 1.04375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.9875,
"step": 905
},
{
"completion_length": 258.64375,
"epoch": 0.20099115141977608,
"grad_norm": 0.38371933952741333,
"kl": 0.28076171875,
"learning_rate": 1.93854449130572e-05,
"loss": 0.0112,
"reward": 1.0625,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 1.0,
"step": 910
},
{
"completion_length": 246.2125,
"epoch": 0.20209549840559904,
"grad_norm": 0.596227033187015,
"kl": 0.26915283203125,
"learning_rate": 1.937206705006344e-05,
"loss": 0.0108,
"reward": 1.04375,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.9875,
"step": 915
},
{
"completion_length": 229.68125,
"epoch": 0.203199845391422,
"grad_norm": 1.0512533587389215,
"kl": 0.27860107421875,
"learning_rate": 1.9358549860946217e-05,
"loss": 0.0111,
"reward": 0.9875,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.925,
"step": 920
},
{
"completion_length": 193.76875,
"epoch": 0.20430419237724493,
"grad_norm": 0.3503850314920464,
"kl": 0.26494140625,
"learning_rate": 1.934489354665347e-05,
"loss": 0.0106,
"reward": 0.8375,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.79375,
"step": 925
},
{
"completion_length": 307.98125,
"epoch": 0.20540853936306788,
"grad_norm": 0.49849280178686406,
"kl": 0.31822509765625,
"learning_rate": 1.9331098310201392e-05,
"loss": 0.0127,
"reward": 0.99375,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.88125,
"step": 930
},
{
"completion_length": 149.59375,
"epoch": 0.2065128863488908,
"grad_norm": 0.7917539737118271,
"kl": 0.3322509765625,
"learning_rate": 1.9317164356671395e-05,
"loss": 0.0133,
"reward": 1.0375,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9625,
"step": 935
},
{
"completion_length": 168.63125,
"epoch": 0.20761723333471377,
"grad_norm": 0.6201527306472769,
"kl": 0.41641845703125,
"learning_rate": 1.930309189320709e-05,
"loss": 0.0167,
"reward": 1.06875,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.9625,
"step": 940
},
{
"completion_length": 200.69375,
"epoch": 0.20872158032053673,
"grad_norm": 0.48233327830154826,
"kl": 0.333837890625,
"learning_rate": 1.9288881129011177e-05,
"loss": 0.0134,
"reward": 1.1,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.975,
"step": 945
},
{
"completion_length": 202.0,
"epoch": 0.20982592730635966,
"grad_norm": 0.8281740363940345,
"kl": 0.35965576171875,
"learning_rate": 1.9274532275342355e-05,
"loss": 0.0144,
"reward": 1.075,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.96875,
"step": 950
},
{
"completion_length": 237.83125,
"epoch": 0.2109302742921826,
"grad_norm": 0.28772905337093585,
"kl": 0.32879638671875,
"learning_rate": 1.9260045545512174e-05,
"loss": 0.0131,
"reward": 1.05625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9875,
"step": 955
},
{
"completion_length": 260.9375,
"epoch": 0.21203462127800554,
"grad_norm": 0.5484628404079348,
"kl": 0.340283203125,
"learning_rate": 1.9245421154881873e-05,
"loss": 0.0136,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.975,
"step": 960
},
{
"completion_length": 325.5625,
"epoch": 0.2131389682638285,
"grad_norm": 0.37202208226132655,
"kl": 0.40030517578125,
"learning_rate": 1.9230659320859157e-05,
"loss": 0.016,
"reward": 1.01875,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.9875,
"step": 965
},
{
"completion_length": 249.5125,
"epoch": 0.21424331524965143,
"grad_norm": 0.37717614370685104,
"kl": 0.35120849609375,
"learning_rate": 1.9215760262894982e-05,
"loss": 0.014,
"reward": 1.00625,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.975,
"step": 970
},
{
"completion_length": 230.45625,
"epoch": 0.21534766223547439,
"grad_norm": 0.5086005905523352,
"kl": 0.38538818359375,
"learning_rate": 1.9200724202480305e-05,
"loss": 0.0154,
"reward": 1.025,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.96875,
"step": 975
},
{
"completion_length": 203.0125,
"epoch": 0.21645200922129734,
"grad_norm": 0.3935754886467958,
"kl": 0.32562255859375,
"learning_rate": 1.9185551363142754e-05,
"loss": 0.013,
"reward": 1.0,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.96875,
"step": 980
},
{
"completion_length": 177.90625,
"epoch": 0.21755635620712027,
"grad_norm": 0.4203555792398308,
"kl": 0.29302978515625,
"learning_rate": 1.9170241970443344e-05,
"loss": 0.0117,
"reward": 1.01875,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.99375,
"step": 985
},
{
"completion_length": 149.94375,
"epoch": 0.21866070319294323,
"grad_norm": 0.26776293393774087,
"kl": 0.3010009765625,
"learning_rate": 1.9154796251973092e-05,
"loss": 0.012,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.98125,
"step": 990
},
{
"completion_length": 171.49375,
"epoch": 0.21976505017876616,
"grad_norm": 0.5930942674995531,
"kl": 0.30377197265625,
"learning_rate": 1.9139214437349663e-05,
"loss": 0.0121,
"reward": 1.025,
"reward_std": 0.03535533845424652,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.99375,
"step": 995
},
{
"completion_length": 182.9125,
"epoch": 0.22086939716458912,
"grad_norm": 0.5327664873628459,
"kl": 0.2912841796875,
"learning_rate": 1.9123496758213926e-05,
"loss": 0.0117,
"reward": 1.06875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9875,
"step": 1000
},
{
"epoch": 0.22086939716458912,
"eval_completion_length": 214.23,
"eval_kl": 0.269765625,
"eval_loss": 0.01077475119382143,
"eval_reward": 1.075,
"eval_reward_std": 0.1767766922712326,
"eval_rewards/accuracy_reward": 0.105,
"eval_rewards/format_reward": 0.97,
"eval_runtime": 90.4327,
"eval_samples_per_second": 1.095,
"eval_steps_per_second": 0.276,
"step": 1000
},
{
"completion_length": 192.26875,
"epoch": 0.22197374415041207,
"grad_norm": 0.607671116808556,
"kl": 0.29041748046875,
"learning_rate": 1.9107643448226536e-05,
"loss": 0.0116,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.975,
"step": 1005
},
{
"completion_length": 233.1875,
"epoch": 0.223078091136235,
"grad_norm": 0.42966284071874145,
"kl": 4.75654296875,
"learning_rate": 1.909165474306445e-05,
"loss": 0.1909,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.975,
"step": 1010
},
{
"completion_length": 451.7875,
"epoch": 0.22418243812205796,
"grad_norm": 0.2531114981656375,
"kl": 0.275,
"learning_rate": 1.9075530880417422e-05,
"loss": 0.011,
"reward": 0.925,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.9,
"step": 1015
},
{
"completion_length": 535.35625,
"epoch": 0.2252867851078809,
"grad_norm": 0.4277427642056076,
"kl": 0.28232421875,
"learning_rate": 1.905927209998447e-05,
"loss": 0.0113,
"reward": 0.89375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.83125,
"step": 1020
},
{
"completion_length": 319.4625,
"epoch": 0.22639113209370385,
"grad_norm": 0.44601881214583666,
"kl": 0.30135498046875,
"learning_rate": 1.9042878643470313e-05,
"loss": 0.0121,
"reward": 1.00625,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.98125,
"step": 1025
},
{
"completion_length": 219.5,
"epoch": 0.22749547907952677,
"grad_norm": 0.42697703684794175,
"kl": 0.3317626953125,
"learning_rate": 1.9026350754581782e-05,
"loss": 0.0133,
"reward": 1.0125,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.95,
"step": 1030
},
{
"completion_length": 203.14375,
"epoch": 0.22859982606534973,
"grad_norm": 0.2565664818941694,
"kl": 0.315625,
"learning_rate": 1.900968867902419e-05,
"loss": 0.0126,
"reward": 0.93125,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.88125,
"step": 1035
},
{
"completion_length": 202.94375,
"epoch": 0.2297041730511727,
"grad_norm": 0.42814411154309545,
"kl": 0.31634521484375,
"learning_rate": 1.8992892664497693e-05,
"loss": 0.0127,
"reward": 1.06875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.99375,
"step": 1040
},
{
"completion_length": 205.91875,
"epoch": 0.23080852003699562,
"grad_norm": 0.43764023473742697,
"kl": 0.2983154296875,
"learning_rate": 1.897596296069358e-05,
"loss": 0.0119,
"reward": 1.05,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9875,
"step": 1045
},
{
"completion_length": 196.5375,
"epoch": 0.23191286702281858,
"grad_norm": 0.5131717281451017,
"kl": 0.27991943359375,
"learning_rate": 1.8958899819290592e-05,
"loss": 0.0112,
"reward": 1.025,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.98125,
"step": 1050
},
{
"completion_length": 205.26875,
"epoch": 0.2330172140086415,
"grad_norm": 0.32785012231096045,
"kl": 0.290869140625,
"learning_rate": 1.8941703493951163e-05,
"loss": 0.0116,
"reward": 1.075,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.99375,
"step": 1055
},
{
"completion_length": 231.63125,
"epoch": 0.23412156099446446,
"grad_norm": 0.5254694645789388,
"kl": 0.28807373046875,
"learning_rate": 1.892437424031766e-05,
"loss": 0.0115,
"reward": 1.05625,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.99375,
"step": 1060
},
{
"completion_length": 242.0375,
"epoch": 0.23522590798028742,
"grad_norm": 0.5026068127158217,
"kl": 0.29200439453125,
"learning_rate": 1.890691231600856e-05,
"loss": 0.0117,
"reward": 1.06875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.9625,
"step": 1065
},
{
"completion_length": 246.08125,
"epoch": 0.23633025496611035,
"grad_norm": 0.13203150585306936,
"kl": 0.322235107421875,
"learning_rate": 1.8889317980614653e-05,
"loss": 0.0129,
"reward": 1.075,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.96875,
"step": 1070
},
{
"completion_length": 224.4125,
"epoch": 0.2374346019519333,
"grad_norm": 0.41058165320035295,
"kl": 0.354974365234375,
"learning_rate": 1.8871591495695156e-05,
"loss": 0.0142,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.975,
"step": 1075
},
{
"completion_length": 202.43125,
"epoch": 0.23853894893775623,
"grad_norm": 0.5981781477284894,
"kl": 0.3003173828125,
"learning_rate": 1.8853733124773837e-05,
"loss": 0.012,
"reward": 1.025,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.98125,
"step": 1080
},
{
"completion_length": 170.225,
"epoch": 0.2396432959235792,
"grad_norm": 0.4268507688185011,
"kl": 0.301025390625,
"learning_rate": 1.8835743133335096e-05,
"loss": 0.012,
"reward": 1.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9875,
"step": 1085
},
{
"completion_length": 175.80625,
"epoch": 0.24074764290940212,
"grad_norm": 0.5747423342289615,
"kl": 0.3157562255859375,
"learning_rate": 1.8817621788820017e-05,
"loss": 0.0126,
"reward": 1.0125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9625,
"step": 1090
},
{
"completion_length": 167.35,
"epoch": 0.24185198989522508,
"grad_norm": 0.519815561291836,
"kl": 0.32388916015625,
"learning_rate": 1.8799369360622394e-05,
"loss": 0.013,
"reward": 1.08125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.99375,
"step": 1095
},
{
"completion_length": 218.21875,
"epoch": 0.24295633688104804,
"grad_norm": 0.5333785220488585,
"kl": 0.36580810546875,
"learning_rate": 1.8780986120084715e-05,
"loss": 0.0146,
"reward": 1.05,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.975,
"step": 1100
},
{
"epoch": 0.24295633688104804,
"eval_completion_length": 262.785,
"eval_kl": 0.3383984375,
"eval_loss": 0.013547366484999657,
"eval_reward": 1.03,
"eval_reward_std": 0.15556348919868468,
"eval_rewards/accuracy_reward": 0.075,
"eval_rewards/format_reward": 0.955,
"eval_runtime": 118.9008,
"eval_samples_per_second": 0.833,
"eval_steps_per_second": 0.21,
"step": 1100
},
{
"completion_length": 226.5375,
"epoch": 0.24406068386687096,
"grad_norm": 0.6802013609013249,
"kl": 0.3069580078125,
"learning_rate": 1.876247234049416e-05,
"loss": 0.0123,
"reward": 1.05,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.975,
"step": 1105
},
{
"completion_length": 264.125,
"epoch": 0.24516503085269392,
"grad_norm": 0.9555057313081333,
"kl": 0.42034912109375,
"learning_rate": 1.8743828297078485e-05,
"loss": 0.0168,
"reward": 0.95625,
"reward_std": 0.23864853456616403,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.8875,
"step": 1110
},
{
"completion_length": 204.44375,
"epoch": 0.24626937783851685,
"grad_norm": 0.7866089087734848,
"kl": 0.45830078125,
"learning_rate": 1.8725054267001992e-05,
"loss": 0.0183,
"reward": 0.7625,
"reward_std": 0.30052037686109545,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.725,
"step": 1115
},
{
"completion_length": 170.48125,
"epoch": 0.2473737248243398,
"grad_norm": 0.4917046420126656,
"kl": 0.535284423828125,
"learning_rate": 1.8706150529361355e-05,
"loss": 0.0214,
"reward": 0.825,
"reward_std": 0.30052037686109545,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.74375,
"step": 1120
},
{
"completion_length": 170.68125,
"epoch": 0.24847807181016277,
"grad_norm": 0.4598219209937343,
"kl": 0.50391845703125,
"learning_rate": 1.8687117365181514e-05,
"loss": 0.0202,
"reward": 0.90625,
"reward_std": 0.23864853456616403,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.8625,
"step": 1125
},
{
"completion_length": 226.35,
"epoch": 0.2495824187959857,
"grad_norm": 0.729450452143476,
"kl": 0.4862548828125,
"learning_rate": 1.8667955057411454e-05,
"loss": 0.0195,
"reward": 0.9375,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.85625,
"step": 1130
},
{
"completion_length": 199.03125,
"epoch": 0.25068676578180865,
"grad_norm": 0.9720760882388615,
"kl": 0.467034912109375,
"learning_rate": 1.864866389092005e-05,
"loss": 0.0187,
"reward": 1.0,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.9125,
"step": 1135
},
{
"completion_length": 213.5875,
"epoch": 0.2517911127676316,
"grad_norm": 0.6415965551575933,
"kl": 0.8406494140625,
"learning_rate": 1.8629244152491773e-05,
"loss": 0.0336,
"reward": 0.96875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.9125,
"step": 1140
},
{
"completion_length": 264.75,
"epoch": 0.2528954597534545,
"grad_norm": 3.4377172313710402,
"kl": 1.02249755859375,
"learning_rate": 1.860969613082249e-05,
"loss": 0.0409,
"reward": 0.95,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9,
"step": 1145
},
{
"completion_length": 262.26875,
"epoch": 0.25399980673927747,
"grad_norm": 0.5339383124761619,
"kl": 1.24478759765625,
"learning_rate": 1.8590020116515116e-05,
"loss": 0.0496,
"reward": 1.01875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.94375,
"step": 1150
},
{
"completion_length": 271.375,
"epoch": 0.2551041537251004,
"grad_norm": 1.6266177185437845,
"kl": 0.5312255859375,
"learning_rate": 1.8570216402075326e-05,
"loss": 0.0213,
"reward": 1.0125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.925,
"step": 1155
},
{
"completion_length": 274.64375,
"epoch": 0.2562085007109234,
"grad_norm": 1.0477658800658394,
"kl": 0.89627685546875,
"learning_rate": 1.8550285281907198e-05,
"loss": 0.0358,
"reward": 1.0,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.91875,
"step": 1160
},
{
"completion_length": 262.7,
"epoch": 0.25731284769674634,
"grad_norm": 0.4264557263560298,
"kl": 0.498876953125,
"learning_rate": 1.8530227052308843e-05,
"loss": 0.0199,
"reward": 1.0125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9375,
"step": 1165
},
{
"completion_length": 199.46875,
"epoch": 0.25841719468256924,
"grad_norm": 0.6299395922753566,
"kl": 0.44912109375,
"learning_rate": 1.8510042011467978e-05,
"loss": 0.018,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.96875,
"step": 1170
},
{
"completion_length": 222.3625,
"epoch": 0.2595215416683922,
"grad_norm": 0.529042632954111,
"kl": 0.43572998046875,
"learning_rate": 1.848973045945753e-05,
"loss": 0.0174,
"reward": 1.00625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.94375,
"step": 1175
},
{
"completion_length": 209.43125,
"epoch": 0.26062588865421515,
"grad_norm": 0.7827952040899112,
"kl": 0.83447265625,
"learning_rate": 1.8469292698231137e-05,
"loss": 0.0335,
"reward": 1.05,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.95625,
"step": 1180
},
{
"completion_length": 153.6875,
"epoch": 0.2617302356400381,
"grad_norm": 0.42454488448610755,
"kl": 0.35516357421875,
"learning_rate": 1.8448729031618687e-05,
"loss": 0.0142,
"reward": 1.04375,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.96875,
"step": 1185
},
{
"completion_length": 174.7125,
"epoch": 0.26283458262586107,
"grad_norm": 0.37149930187993413,
"kl": 0.365673828125,
"learning_rate": 1.8428039765321783e-05,
"loss": 0.0146,
"reward": 1.0375,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.96875,
"step": 1190
},
{
"completion_length": 265.21875,
"epoch": 0.26393892961168397,
"grad_norm": 0.66806143396866,
"kl": 0.3682373046875,
"learning_rate": 1.840722520690921e-05,
"loss": 0.0147,
"reward": 0.96875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.90625,
"step": 1195
},
{
"completion_length": 270.75,
"epoch": 0.2650432765975069,
"grad_norm": 0.62491488989135,
"kl": 0.37822265625,
"learning_rate": 1.838628566581236e-05,
"loss": 0.0151,
"reward": 0.91875,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.8375,
"step": 1200
},
{
"epoch": 0.2650432765975069,
"eval_completion_length": 337.06,
"eval_kl": 0.4419140625,
"eval_loss": 0.0176791213452816,
"eval_reward": 0.905,
"eval_reward_std": 0.3181980448961258,
"eval_rewards/accuracy_reward": 0.085,
"eval_rewards/format_reward": 0.82,
"eval_runtime": 175.1338,
"eval_samples_per_second": 0.565,
"eval_steps_per_second": 0.143,
"step": 1200
},
{
"completion_length": 257.49375,
"epoch": 0.2661476235833299,
"grad_norm": 0.83147670526204,
"kl": 0.35118408203125,
"learning_rate": 1.8365221453320625e-05,
"loss": 0.014,
"reward": 0.9125,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.84375,
"step": 1205
},
{
"completion_length": 223.31875,
"epoch": 0.26725197056915284,
"grad_norm": 0.24158167072477596,
"kl": 0.3875,
"learning_rate": 1.8344032882576784e-05,
"loss": 0.0155,
"reward": 0.95625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.90625,
"step": 1210
},
{
"completion_length": 163.0,
"epoch": 0.2683563175549758,
"grad_norm": 0.6479529739729771,
"kl": 0.405072021484375,
"learning_rate": 1.8322720268572333e-05,
"loss": 0.0162,
"reward": 0.99375,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.9375,
"step": 1215
},
{
"completion_length": 292.16875,
"epoch": 0.2694606645407987,
"grad_norm": 1.7491765816716196,
"kl": 0.540771484375,
"learning_rate": 1.83012839281428e-05,
"loss": 0.0216,
"reward": 0.95,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.8875,
"step": 1220
},
{
"completion_length": 358.31875,
"epoch": 0.27056501152662166,
"grad_norm": 0.5769509244571842,
"kl": 0.5750244140625,
"learning_rate": 1.827972417996306e-05,
"loss": 0.023,
"reward": 0.8375,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.7875,
"step": 1225
},
{
"completion_length": 263.51875,
"epoch": 0.2716693585124446,
"grad_norm": 0.47084507783132956,
"kl": 0.51199951171875,
"learning_rate": 1.8258041344542567e-05,
"loss": 0.0205,
"reward": 0.89375,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.875,
"step": 1230
},
{
"completion_length": 145.9875,
"epoch": 0.27277370549826757,
"grad_norm": 0.6657898991831224,
"kl": 0.52864990234375,
"learning_rate": 1.823623574422061e-05,
"loss": 0.0212,
"reward": 1.025,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9625,
"step": 1235
},
{
"completion_length": 151.34375,
"epoch": 0.27387805248409053,
"grad_norm": 0.3173979928590889,
"kl": 0.4265869140625,
"learning_rate": 1.821430770316151e-05,
"loss": 0.0171,
"reward": 1.0125,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.96875,
"step": 1240
},
{
"completion_length": 583.2,
"epoch": 0.27498239946991343,
"grad_norm": 0.3100947155151386,
"kl": 0.40513916015625,
"learning_rate": 1.8192257547349805e-05,
"loss": 0.0162,
"reward": 0.65625,
"reward_std": 0.2916815422475338,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.625,
"step": 1245
},
{
"completion_length": 315.43125,
"epoch": 0.2760867464557364,
"grad_norm": 0.1782330419853091,
"kl": 0.51036376953125,
"learning_rate": 1.817008560458541e-05,
"loss": 0.0204,
"reward": 0.84375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.8125,
"step": 1250
},
{
"completion_length": 87.08125,
"epoch": 0.27719109344155934,
"grad_norm": 0.2736695226581386,
"kl": 0.59683837890625,
"learning_rate": 1.814779220447872e-05,
"loss": 0.0239,
"reward": 0.9625,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.9375,
"step": 1255
},
{
"completion_length": 124.04375,
"epoch": 0.2782954404273823,
"grad_norm": 0.4315634020248374,
"kl": 0.48404541015625,
"learning_rate": 1.8125377678445755e-05,
"loss": 0.0194,
"reward": 1.01875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9375,
"step": 1260
},
{
"completion_length": 242.1875,
"epoch": 0.2793997874132052,
"grad_norm": 0.6754023429831525,
"kl": 0.5489013671875,
"learning_rate": 1.8102842359703177e-05,
"loss": 0.022,
"reward": 0.9375,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.8875,
"step": 1265
},
{
"completion_length": 154.88125,
"epoch": 0.28050413439902816,
"grad_norm": 0.7500021389013647,
"kl": 0.44857177734375,
"learning_rate": 1.8080186583263386e-05,
"loss": 0.018,
"reward": 0.9875,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9375,
"step": 1270
},
{
"completion_length": 138.74375,
"epoch": 0.2816084813848511,
"grad_norm": 0.6511896035985808,
"kl": 0.660400390625,
"learning_rate": 1.8057410685929505e-05,
"loss": 0.0264,
"reward": 1.01875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.96875,
"step": 1275
},
{
"completion_length": 278.21875,
"epoch": 0.2827128283706741,
"grad_norm": 0.37768724134381376,
"kl": 0.4253173828125,
"learning_rate": 1.8034515006290398e-05,
"loss": 0.017,
"reward": 0.9625,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.9375,
"step": 1280
},
{
"completion_length": 306.81875,
"epoch": 0.28381717535649703,
"grad_norm": 0.29452808394209296,
"kl": 0.38458251953125,
"learning_rate": 1.8011499884715616e-05,
"loss": 0.0154,
"reward": 1.0375,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.95625,
"step": 1285
},
{
"completion_length": 331.20625,
"epoch": 0.28492152234231993,
"grad_norm": 0.40009340612327654,
"kl": 0.40555419921875,
"learning_rate": 1.7988365663350352e-05,
"loss": 0.0162,
"reward": 1.00625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9375,
"step": 1290
},
{
"completion_length": 291.39375,
"epoch": 0.2860258693281429,
"grad_norm": 0.2398401379002744,
"kl": 0.3625,
"learning_rate": 1.7965112686110346e-05,
"loss": 0.0145,
"reward": 0.96875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.93125,
"step": 1295
},
{
"completion_length": 210.84375,
"epoch": 0.28713021631396585,
"grad_norm": 0.16393982679750796,
"kl": 0.36951904296875,
"learning_rate": 1.7941741298676777e-05,
"loss": 0.0148,
"reward": 1.0125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.96875,
"step": 1300
},
{
"epoch": 0.28713021631396585,
"eval_completion_length": 168.92,
"eval_kl": 0.39515625,
"eval_loss": 0.01582903414964676,
"eval_reward": 1.05,
"eval_reward_std": 0.11313708305358887,
"eval_rewards/accuracy_reward": 0.07,
"eval_rewards/format_reward": 0.98,
"eval_runtime": 82.4657,
"eval_samples_per_second": 1.2,
"eval_steps_per_second": 0.303,
"step": 1300
},
{
"completion_length": 157.2875,
"epoch": 0.2882345632997888,
"grad_norm": 0.5544518957919585,
"kl": 0.35565185546875,
"learning_rate": 1.7918251848491118e-05,
"loss": 0.0142,
"reward": 1.00625,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.9875,
"step": 1305
},
{
"completion_length": 143.9625,
"epoch": 0.28933891028561176,
"grad_norm": 1.0694156042827718,
"kl": 0.36668701171875,
"learning_rate": 1.7894644684749983e-05,
"loss": 0.0147,
"reward": 1.05,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.99375,
"step": 1310
},
{
"completion_length": 129.1875,
"epoch": 0.29044325727143466,
"grad_norm": 0.7797356796661203,
"kl": 0.363818359375,
"learning_rate": 1.7870920158399918e-05,
"loss": 0.0146,
"reward": 1.075,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.96875,
"step": 1315
},
{
"completion_length": 107.58125,
"epoch": 0.2915476042572576,
"grad_norm": 0.42231572420920943,
"kl": 0.415740966796875,
"learning_rate": 1.7847078622132202e-05,
"loss": 0.0166,
"reward": 1.0,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.95,
"step": 1320
},
{
"completion_length": 137.425,
"epoch": 0.2926519512430806,
"grad_norm": 0.31138070158067094,
"kl": 0.4673095703125,
"learning_rate": 1.7823120430377593e-05,
"loss": 0.0187,
"reward": 1.0,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.975,
"step": 1325
},
{
"completion_length": 169.975,
"epoch": 0.29375629822890353,
"grad_norm": 0.7054962096277919,
"kl": 0.4247802734375,
"learning_rate": 1.7799045939301063e-05,
"loss": 0.017,
"reward": 1.04375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.98125,
"step": 1330
},
{
"completion_length": 199.325,
"epoch": 0.2948606452147265,
"grad_norm": 0.5371054293908584,
"kl": 0.4145263671875,
"learning_rate": 1.7774855506796497e-05,
"loss": 0.0166,
"reward": 1.06875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.98125,
"step": 1335
},
{
"completion_length": 285.3625,
"epoch": 0.2959649922005494,
"grad_norm": 0.49107133614596143,
"kl": 0.38699951171875,
"learning_rate": 1.775054949248138e-05,
"loss": 0.0155,
"reward": 1.0625,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95,
"step": 1340
},
{
"completion_length": 195.01875,
"epoch": 0.29706933918637235,
"grad_norm": 0.18854873920164167,
"kl": 0.401611328125,
"learning_rate": 1.7726128257691447e-05,
"loss": 0.0161,
"reward": 1.08125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9875,
"step": 1345
},
{
"completion_length": 144.2625,
"epoch": 0.2981736861721953,
"grad_norm": 0.194977538009497,
"kl": 0.3897705078125,
"learning_rate": 1.770159216547532e-05,
"loss": 0.0156,
"reward": 1.06875,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.99375,
"step": 1350
},
{
"completion_length": 142.68125,
"epoch": 0.29927803315801826,
"grad_norm": 0.2721232391131242,
"kl": 0.4391845703125,
"learning_rate": 1.7676941580589097e-05,
"loss": 0.0176,
"reward": 1.01875,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.9875,
"step": 1355
},
{
"completion_length": 141.6625,
"epoch": 0.3003823801438412,
"grad_norm": 0.12367929505701036,
"kl": 0.4060302734375,
"learning_rate": 1.7652176869490933e-05,
"loss": 0.0162,
"reward": 1.0375,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.98125,
"step": 1360
},
{
"completion_length": 147.54375,
"epoch": 0.3014867271296641,
"grad_norm": 0.1722353049345247,
"kl": 0.43369140625,
"learning_rate": 1.76272984003356e-05,
"loss": 0.0173,
"reward": 1.025,
"reward_std": 0.03535533845424652,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 1.0,
"step": 1365
},
{
"completion_length": 161.99375,
"epoch": 0.3025910741154871,
"grad_norm": 0.6029005445006613,
"kl": 0.47828369140625,
"learning_rate": 1.7602306542969006e-05,
"loss": 0.0191,
"reward": 0.975,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.95625,
"step": 1370
},
{
"completion_length": 300.3375,
"epoch": 0.30369542110131004,
"grad_norm": 0.6594210074645137,
"kl": 0.5243896484375,
"learning_rate": 1.7577201668922702e-05,
"loss": 0.021,
"reward": 0.9,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.875,
"step": 1375
},
{
"completion_length": 235.9625,
"epoch": 0.304799768087133,
"grad_norm": 0.42920157132224784,
"kl": 0.418603515625,
"learning_rate": 1.7551984151408363e-05,
"loss": 0.0167,
"reward": 0.9125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.8875,
"step": 1380
},
{
"completion_length": 148.1375,
"epoch": 0.3059041150729559,
"grad_norm": 0.6394391510717589,
"kl": 0.38023681640625,
"learning_rate": 1.7526654365312222e-05,
"loss": 0.0152,
"reward": 1.06875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.99375,
"step": 1385
},
{
"completion_length": 130.94375,
"epoch": 0.30700846205877885,
"grad_norm": 0.38449234338266886,
"kl": 0.381109619140625,
"learning_rate": 1.750121268718951e-05,
"loss": 0.0152,
"reward": 1.025,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.975,
"step": 1390
},
{
"completion_length": 134.8125,
"epoch": 0.3081128090446018,
"grad_norm": 0.32691206341821716,
"kl": 0.37451171875,
"learning_rate": 1.7475659495258864e-05,
"loss": 0.015,
"reward": 1.00625,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.9625,
"step": 1395
},
{
"completion_length": 152.13125,
"epoch": 0.30921715603042477,
"grad_norm": 0.5233437303073457,
"kl": 0.39224853515625,
"learning_rate": 1.7449995169396693e-05,
"loss": 0.0157,
"reward": 0.99375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.93125,
"step": 1400
},
{
"epoch": 0.30921715603042477,
"eval_completion_length": 140.44,
"eval_kl": 0.42287109375,
"eval_loss": 0.016920818015933037,
"eval_reward": 1.07,
"eval_reward_std": 0.18384775936603545,
"eval_rewards/accuracy_reward": 0.115,
"eval_rewards/format_reward": 0.955,
"eval_runtime": 81.6042,
"eval_samples_per_second": 1.213,
"eval_steps_per_second": 0.306,
"step": 1400
},
{
"completion_length": 130.34375,
"epoch": 0.3103215030162477,
"grad_norm": 0.28398981693339564,
"kl": 0.38331298828125,
"learning_rate": 1.7424220091131536e-05,
"loss": 0.0153,
"reward": 1.01875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.95625,
"step": 1405
},
{
"completion_length": 180.325,
"epoch": 0.3114258500020706,
"grad_norm": 0.6590820065232371,
"kl": 0.417919921875,
"learning_rate": 1.739833464363838e-05,
"loss": 0.0167,
"reward": 0.96875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.91875,
"step": 1410
},
{
"completion_length": 174.08125,
"epoch": 0.3125301969878936,
"grad_norm": 0.19303045686565795,
"kl": 0.37972412109375,
"learning_rate": 1.7372339211732988e-05,
"loss": 0.0152,
"reward": 1.0125,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.95,
"step": 1415
},
{
"completion_length": 208.6375,
"epoch": 0.31363454397371654,
"grad_norm": 1.2704281522495193,
"kl": 0.3710205078125,
"learning_rate": 1.734623418186615e-05,
"loss": 0.0148,
"reward": 0.99375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9125,
"step": 1420
},
{
"completion_length": 208.275,
"epoch": 0.3147388909595395,
"grad_norm": 0.4017752883432233,
"kl": 0.42412109375,
"learning_rate": 1.7320019942117954e-05,
"loss": 0.017,
"reward": 0.99375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.94375,
"step": 1425
},
{
"completion_length": 263.475,
"epoch": 0.31584323794536245,
"grad_norm": 0.6704877034578468,
"kl": 0.443798828125,
"learning_rate": 1.729369688219202e-05,
"loss": 0.0178,
"reward": 0.98125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9,
"step": 1430
},
{
"completion_length": 268.4625,
"epoch": 0.31694758493118536,
"grad_norm": 1.1806288895312564,
"kl": 0.475146484375,
"learning_rate": 1.7267265393409684e-05,
"loss": 0.019,
"reward": 0.9875,
"reward_std": 0.24748736917972564,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.89375,
"step": 1435
},
{
"completion_length": 239.40625,
"epoch": 0.3180519319170083,
"grad_norm": 0.7720008549372368,
"kl": 0.47855224609375,
"learning_rate": 1.7240725868704218e-05,
"loss": 0.0192,
"reward": 0.98125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9125,
"step": 1440
},
{
"completion_length": 209.23125,
"epoch": 0.31915627890283127,
"grad_norm": 0.6559969920402181,
"kl": 0.36781005859375,
"learning_rate": 1.7214078702614946e-05,
"loss": 0.0147,
"reward": 1.0,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.93125,
"step": 1445
},
{
"completion_length": 181.79375,
"epoch": 0.3202606258886542,
"grad_norm": 1.033869102520755,
"kl": 0.44471435546875,
"learning_rate": 1.7187324291281423e-05,
"loss": 0.0178,
"reward": 0.95625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.925,
"step": 1450
},
{
"completion_length": 144.8375,
"epoch": 0.3213649728744772,
"grad_norm": 0.23531024361520275,
"kl": 0.55146484375,
"learning_rate": 1.71604630324375e-05,
"loss": 0.0221,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.975,
"step": 1455
},
{
"completion_length": 152.3375,
"epoch": 0.3224693198603001,
"grad_norm": 0.521389309576663,
"kl": 0.3604736328125,
"learning_rate": 1.7133495325405448e-05,
"loss": 0.0144,
"reward": 1.05,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9875,
"step": 1460
},
{
"completion_length": 147.61875,
"epoch": 0.32357366684612304,
"grad_norm": 0.5218385357295671,
"kl": 0.3684814453125,
"learning_rate": 1.7106421571090003e-05,
"loss": 0.0147,
"reward": 1.03125,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.99375,
"step": 1465
},
{
"completion_length": 146.7625,
"epoch": 0.324678013831946,
"grad_norm": 0.40670196743586623,
"kl": 0.383203125,
"learning_rate": 1.7079242171972417e-05,
"loss": 0.0153,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.98125,
"step": 1470
},
{
"completion_length": 183.4125,
"epoch": 0.32578236081776896,
"grad_norm": 0.17326012835635307,
"kl": 0.40433349609375,
"learning_rate": 1.705195753210446e-05,
"loss": 0.0162,
"reward": 1.05,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.99375,
"step": 1475
},
{
"completion_length": 252.45,
"epoch": 0.3268867078035919,
"grad_norm": 0.33618781399155934,
"kl": 0.43258056640625,
"learning_rate": 1.7024568057102423e-05,
"loss": 0.0173,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.975,
"step": 1480
},
{
"completion_length": 316.8125,
"epoch": 0.3279910547894148,
"grad_norm": 0.6985966336266197,
"kl": 0.4880126953125,
"learning_rate": 1.6997074154141097e-05,
"loss": 0.0195,
"reward": 1.0625,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9875,
"step": 1485
},
{
"completion_length": 227.04375,
"epoch": 0.3290954017752378,
"grad_norm": 0.2716042884976899,
"kl": 0.42822265625,
"learning_rate": 1.69694762319477e-05,
"loss": 0.0171,
"reward": 1.06875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.98125,
"step": 1490
},
{
"completion_length": 246.30625,
"epoch": 0.33019974876106073,
"grad_norm": 0.34216236944018125,
"kl": 0.432666015625,
"learning_rate": 1.694177470079581e-05,
"loss": 0.0173,
"reward": 0.9875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.95,
"step": 1495
},
{
"completion_length": 319.29375,
"epoch": 0.3313040957468837,
"grad_norm": 0.3073497949162371,
"kl": 0.4350341796875,
"learning_rate": 1.6913969972499272e-05,
"loss": 0.0174,
"reward": 1.025,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9625,
"step": 1500
},
{
"epoch": 0.3313040957468837,
"eval_completion_length": 508.6,
"eval_kl": 0.5653125,
"eval_loss": 0.022641615942120552,
"eval_reward": 0.97,
"eval_reward_std": 0.15556348919868468,
"eval_rewards/accuracy_reward": 0.06,
"eval_rewards/format_reward": 0.91,
"eval_runtime": 242.1008,
"eval_samples_per_second": 0.409,
"eval_steps_per_second": 0.103,
"step": 1500
},
{
"completion_length": 219.5875,
"epoch": 0.33240844273270664,
"grad_norm": 0.5738378890078689,
"kl": 0.447314453125,
"learning_rate": 1.688606246040607e-05,
"loss": 0.0179,
"reward": 1.01875,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.90625,
"step": 1505
},
{
"completion_length": 175.0,
"epoch": 0.33351278971852955,
"grad_norm": 0.4329938576388711,
"kl": 0.36485595703125,
"learning_rate": 1.6858052579392182e-05,
"loss": 0.0146,
"reward": 1.06875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 1.0,
"step": 1510
},
{
"completion_length": 212.8375,
"epoch": 0.3346171367043525,
"grad_norm": 0.3974460378419368,
"kl": 0.3696533203125,
"learning_rate": 1.682994074585541e-05,
"loss": 0.0148,
"reward": 0.95625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.925,
"step": 1515
},
{
"completion_length": 198.4125,
"epoch": 0.33572148369017546,
"grad_norm": 0.6339047206651848,
"kl": 0.3900634765625,
"learning_rate": 1.6801727377709195e-05,
"loss": 0.0156,
"reward": 0.96875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.925,
"step": 1520
},
{
"completion_length": 175.4,
"epoch": 0.3368258306759984,
"grad_norm": 0.31233984339595194,
"kl": 0.36982421875,
"learning_rate": 1.6773412894376404e-05,
"loss": 0.0148,
"reward": 0.98125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.95625,
"step": 1525
},
{
"completion_length": 162.075,
"epoch": 0.3379301776618213,
"grad_norm": 0.41593882245992403,
"kl": 0.3514892578125,
"learning_rate": 1.674499771678309e-05,
"loss": 0.0141,
"reward": 1.01875,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.98125,
"step": 1530
},
{
"completion_length": 146.81875,
"epoch": 0.3390345246476443,
"grad_norm": 0.6916723408968213,
"kl": 0.4715576171875,
"learning_rate": 1.6716482267352234e-05,
"loss": 0.0189,
"reward": 1.04375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.9375,
"step": 1535
},
{
"completion_length": 146.05625,
"epoch": 0.34013887163346723,
"grad_norm": 0.16801257159790053,
"kl": 0.4378662109375,
"learning_rate": 1.6687866969997483e-05,
"loss": 0.0175,
"reward": 1.0,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.95,
"step": 1540
},
{
"completion_length": 158.125,
"epoch": 0.3412432186192902,
"grad_norm": 0.3924153520384322,
"kl": 0.3984375,
"learning_rate": 1.665915225011681e-05,
"loss": 0.0159,
"reward": 1.00625,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.96875,
"step": 1545
},
{
"completion_length": 152.525,
"epoch": 0.34234756560511315,
"grad_norm": 0.2188171820439607,
"kl": 0.3915771484375,
"learning_rate": 1.663033853458624e-05,
"loss": 0.0157,
"reward": 1.0,
"reward_std": 0.03535533845424652,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.98125,
"step": 1550
},
{
"completion_length": 185.0,
"epoch": 0.34345191259093605,
"grad_norm": 0.2492866797409777,
"kl": 0.446630859375,
"learning_rate": 1.660142625175346e-05,
"loss": 0.0179,
"reward": 1.0375,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.98125,
"step": 1555
},
{
"completion_length": 197.925,
"epoch": 0.344556259576759,
"grad_norm": 0.43125503310433044,
"kl": 0.417333984375,
"learning_rate": 1.6572415831431466e-05,
"loss": 0.0167,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.975,
"step": 1560
},
{
"completion_length": 231.45,
"epoch": 0.34566060656258196,
"grad_norm": 0.547580901229839,
"kl": 0.4208251953125,
"learning_rate": 1.6543307704892196e-05,
"loss": 0.0168,
"reward": 1.0125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.95,
"step": 1565
},
{
"completion_length": 210.36875,
"epoch": 0.3467649535484049,
"grad_norm": 0.30578684489167307,
"kl": 0.40220947265625,
"learning_rate": 1.6514102304860077e-05,
"loss": 0.0161,
"reward": 1.01875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.98125,
"step": 1570
},
{
"completion_length": 169.575,
"epoch": 0.3478693005342279,
"grad_norm": 0.3714599755051663,
"kl": 0.4043701171875,
"learning_rate": 1.6484800065505627e-05,
"loss": 0.0162,
"reward": 1.01875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.98125,
"step": 1575
},
{
"completion_length": 157.6625,
"epoch": 0.3489736475200508,
"grad_norm": 1.1408284138746587,
"kl": 0.51844482421875,
"learning_rate": 1.6455401422438984e-05,
"loss": 0.0207,
"reward": 1.0375,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.96875,
"step": 1580
},
{
"completion_length": 131.2875,
"epoch": 0.35007799450587374,
"grad_norm": 0.47193321313326936,
"kl": 0.4167236328125,
"learning_rate": 1.6425906812703435e-05,
"loss": 0.0167,
"reward": 1.08125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.975,
"step": 1585
},
{
"completion_length": 194.43125,
"epoch": 0.3511823414916967,
"grad_norm": 0.723120589080064,
"kl": 0.4700439453125,
"learning_rate": 1.6396316674768914e-05,
"loss": 0.0188,
"reward": 0.99375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.925,
"step": 1590
},
{
"completion_length": 216.175,
"epoch": 0.35228668847751965,
"grad_norm": 0.4975629560332776,
"kl": 0.42794189453125,
"learning_rate": 1.6366631448525486e-05,
"loss": 0.0171,
"reward": 1.075,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.95625,
"step": 1595
},
{
"completion_length": 195.9375,
"epoch": 0.3533910354633426,
"grad_norm": 0.33985255338891107,
"kl": 0.3559814453125,
"learning_rate": 1.6336851575276814e-05,
"loss": 0.0142,
"reward": 1.05,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9875,
"step": 1600
},
{
"epoch": 0.3533910354633426,
"eval_completion_length": 231.0,
"eval_kl": 0.735625,
"eval_loss": 0.02945670112967491,
"eval_reward": 1.05,
"eval_reward_std": 0.15556348919868468,
"eval_rewards/accuracy_reward": 0.1,
"eval_rewards/format_reward": 0.95,
"eval_runtime": 111.0663,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.225,
"step": 1600
},
{
"completion_length": 231.275,
"epoch": 0.3544953824491655,
"grad_norm": 0.595242649626797,
"kl": 0.4072265625,
"learning_rate": 1.630697749773359e-05,
"loss": 0.0163,
"reward": 1.05,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.96875,
"step": 1605
},
{
"completion_length": 278.25,
"epoch": 0.35559972943498847,
"grad_norm": 0.4801274687526583,
"kl": 0.40982666015625,
"learning_rate": 1.627700966000696e-05,
"loss": 0.0164,
"reward": 1.025,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.95,
"step": 1610
},
{
"completion_length": 260.71875,
"epoch": 0.3567040764208114,
"grad_norm": 0.29704464145114623,
"kl": 0.3713134765625,
"learning_rate": 1.6246948507601915e-05,
"loss": 0.0149,
"reward": 1.025,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.94375,
"step": 1615
},
{
"completion_length": 220.7375,
"epoch": 0.3578084234066344,
"grad_norm": 0.16551151233488073,
"kl": 0.33929443359375,
"learning_rate": 1.621679448741067e-05,
"loss": 0.0136,
"reward": 1.05,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.98125,
"step": 1620
},
{
"completion_length": 203.19375,
"epoch": 0.35891277039245734,
"grad_norm": 0.44232175696554416,
"kl": 0.3436279296875,
"learning_rate": 1.618654804770603e-05,
"loss": 0.0137,
"reward": 1.0875,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9875,
"step": 1625
},
{
"completion_length": 196.01875,
"epoch": 0.36001711737828024,
"grad_norm": 0.3595404694857126,
"kl": 0.33565673828125,
"learning_rate": 1.615620963813471e-05,
"loss": 0.0134,
"reward": 1.03125,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.9875,
"step": 1630
},
{
"completion_length": 220.28125,
"epoch": 0.3611214643641032,
"grad_norm": 0.09068347346699927,
"kl": 0.334228515625,
"learning_rate": 1.6125779709710668e-05,
"loss": 0.0134,
"reward": 1.04375,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.99375,
"step": 1635
},
{
"completion_length": 217.84375,
"epoch": 0.36222581134992615,
"grad_norm": 0.24326484641045593,
"kl": 0.323681640625,
"learning_rate": 1.6095258714808373e-05,
"loss": 0.0129,
"reward": 1.09375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.99375,
"step": 1640
},
{
"completion_length": 190.775,
"epoch": 0.3633301583357491,
"grad_norm": 0.32151529940248824,
"kl": 0.3042724609375,
"learning_rate": 1.606464710715612e-05,
"loss": 0.0122,
"reward": 1.04375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.975,
"step": 1645
},
{
"completion_length": 223.5375,
"epoch": 0.364434505321572,
"grad_norm": 0.4066387353346626,
"kl": 0.35045166015625,
"learning_rate": 1.603394534182925e-05,
"loss": 0.014,
"reward": 1.04375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.975,
"step": 1650
},
{
"completion_length": 200.40625,
"epoch": 0.36553885230739497,
"grad_norm": 0.6150107546663145,
"kl": 0.42801513671875,
"learning_rate": 1.600315387524339e-05,
"loss": 0.0171,
"reward": 1.05625,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.96875,
"step": 1655
},
{
"completion_length": 206.625,
"epoch": 0.3666431992932179,
"grad_norm": 0.3881248346947634,
"kl": 0.36854248046875,
"learning_rate": 1.5972273165147697e-05,
"loss": 0.0147,
"reward": 1.05,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.98125,
"step": 1660
},
{
"completion_length": 223.15625,
"epoch": 0.3677475462790409,
"grad_norm": 0.45585685791218283,
"kl": 0.35394287109375,
"learning_rate": 1.5941303670618018e-05,
"loss": 0.0141,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.96875,
"step": 1665
},
{
"completion_length": 205.44375,
"epoch": 0.36885189326486384,
"grad_norm": 0.24229473958778308,
"kl": 0.32738037109375,
"learning_rate": 1.591024585205007e-05,
"loss": 0.0131,
"reward": 1.05,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.98125,
"step": 1670
},
{
"completion_length": 237.66875,
"epoch": 0.36995624025068674,
"grad_norm": 0.9451634093337382,
"kl": 0.37305908203125,
"learning_rate": 1.587910017115262e-05,
"loss": 0.0149,
"reward": 1.0125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.925,
"step": 1675
},
{
"completion_length": 234.78125,
"epoch": 0.3710605872365097,
"grad_norm": 0.4259015577951971,
"kl": 0.3545654296875,
"learning_rate": 1.5847867090940602e-05,
"loss": 0.0142,
"reward": 1.0625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.96875,
"step": 1680
},
{
"completion_length": 259.5,
"epoch": 0.37216493422233266,
"grad_norm": 0.3894739125660781,
"kl": 0.33232421875,
"learning_rate": 1.5816547075728227e-05,
"loss": 0.0133,
"reward": 1.0125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.94375,
"step": 1685
},
{
"completion_length": 218.225,
"epoch": 0.3732692812081556,
"grad_norm": 0.5751023644328291,
"kl": 0.3769775390625,
"learning_rate": 1.5785140591122107e-05,
"loss": 0.0151,
"reward": 1.075,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.96875,
"step": 1690
},
{
"completion_length": 198.24375,
"epoch": 0.37437362819397857,
"grad_norm": 0.6070740663767715,
"kl": 0.39683837890625,
"learning_rate": 1.57536481040143e-05,
"loss": 0.0159,
"reward": 1.05,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.94375,
"step": 1695
},
{
"completion_length": 171.06875,
"epoch": 0.37547797517980147,
"grad_norm": 0.5506629078773986,
"kl": 0.37344970703125,
"learning_rate": 1.57220700825754e-05,
"loss": 0.0149,
"reward": 1.09375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.98125,
"step": 1700
},
{
"epoch": 0.37547797517980147,
"eval_completion_length": 164.62,
"eval_kl": 0.39626953125,
"eval_loss": 0.015575483441352844,
"eval_reward": 1.06,
"eval_reward_std": 0.08485281229019165,
"eval_rewards/accuracy_reward": 0.085,
"eval_rewards/format_reward": 0.975,
"eval_runtime": 82.4164,
"eval_samples_per_second": 1.201,
"eval_steps_per_second": 0.303,
"step": 1700
},
{
"completion_length": 148.7125,
"epoch": 0.37658232216562443,
"grad_norm": 0.346175215998511,
"kl": 0.34801025390625,
"learning_rate": 1.5690406996247557e-05,
"loss": 0.0139,
"reward": 1.10625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.975,
"step": 1705
},
{
"completion_length": 157.10625,
"epoch": 0.3776866691514474,
"grad_norm": 0.41957587709124056,
"kl": 0.35477294921875,
"learning_rate": 1.5658659315737505e-05,
"loss": 0.0142,
"reward": 1.075,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.9625,
"step": 1710
},
{
"completion_length": 172.70625,
"epoch": 0.37879101613727034,
"grad_norm": 0.2876351231003489,
"kl": 0.35120849609375,
"learning_rate": 1.5626827513009565e-05,
"loss": 0.014,
"reward": 1.00625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.95625,
"step": 1715
},
{
"completion_length": 158.39375,
"epoch": 0.3798953631230933,
"grad_norm": 0.5026511321796595,
"kl": 0.3473388671875,
"learning_rate": 1.5594912061278627e-05,
"loss": 0.0139,
"reward": 1.04375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.975,
"step": 1720
},
{
"completion_length": 173.2875,
"epoch": 0.3809997101089162,
"grad_norm": 0.43958113689444284,
"kl": 0.3528564453125,
"learning_rate": 1.5562913435003113e-05,
"loss": 0.0141,
"reward": 1.025,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.96875,
"step": 1725
},
{
"completion_length": 195.36875,
"epoch": 0.38210405709473916,
"grad_norm": 0.7155259543987148,
"kl": 0.343896484375,
"learning_rate": 1.5530832109877932e-05,
"loss": 0.0138,
"reward": 1.0375,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9625,
"step": 1730
},
{
"completion_length": 157.21875,
"epoch": 0.3832084040805621,
"grad_norm": 0.3870602492613469,
"kl": 0.343524169921875,
"learning_rate": 1.5498668562827397e-05,
"loss": 0.0137,
"reward": 1.04375,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.975,
"step": 1735
},
{
"completion_length": 182.0625,
"epoch": 0.38431275106638507,
"grad_norm": 0.5306169810415612,
"kl": 0.35052490234375,
"learning_rate": 1.5466423271998144e-05,
"loss": 0.014,
"reward": 1.01875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.94375,
"step": 1740
},
{
"completion_length": 166.49375,
"epoch": 0.38541709805220803,
"grad_norm": 0.4946006693969201,
"kl": 0.3321044921875,
"learning_rate": 1.5434096716752023e-05,
"loss": 0.0133,
"reward": 1.05,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.96875,
"step": 1745
},
{
"completion_length": 186.98125,
"epoch": 0.38652144503803093,
"grad_norm": 0.31165636186354284,
"kl": 0.35806884765625,
"learning_rate": 1.5401689377658962e-05,
"loss": 0.0143,
"reward": 1.0625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.975,
"step": 1750
},
{
"completion_length": 181.3,
"epoch": 0.3876257920238539,
"grad_norm": 0.5512017419947034,
"kl": 0.43681640625,
"learning_rate": 1.536920173648984e-05,
"loss": 0.0175,
"reward": 1.0125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.95,
"step": 1755
},
{
"completion_length": 220.86875,
"epoch": 0.38873013900967684,
"grad_norm": 0.7898520406737889,
"kl": 0.3798583984375,
"learning_rate": 1.53366342762093e-05,
"loss": 0.0152,
"reward": 1.0,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.9125,
"step": 1760
},
{
"completion_length": 225.61875,
"epoch": 0.3898344859954998,
"grad_norm": 0.7541459820766353,
"kl": 0.3935302734375,
"learning_rate": 1.5303987480968607e-05,
"loss": 0.0157,
"reward": 0.9625,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.89375,
"step": 1765
},
{
"completion_length": 189.53125,
"epoch": 0.39093883298132276,
"grad_norm": 0.33298536402698525,
"kl": 0.322216796875,
"learning_rate": 1.5271261836098403e-05,
"loss": 0.0129,
"reward": 0.99375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.9375,
"step": 1770
},
{
"completion_length": 168.725,
"epoch": 0.39204317996714566,
"grad_norm": 0.7771633455821945,
"kl": 0.3632568359375,
"learning_rate": 1.5238457828101531e-05,
"loss": 0.0145,
"reward": 1.0,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.95625,
"step": 1775
},
{
"completion_length": 160.28125,
"epoch": 0.3931475269529686,
"grad_norm": 0.478390100746179,
"kl": 0.372265625,
"learning_rate": 1.520557594464579e-05,
"loss": 0.0149,
"reward": 0.9875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.94375,
"step": 1780
},
{
"completion_length": 188.59375,
"epoch": 0.3942518739387916,
"grad_norm": 0.4217931355042731,
"kl": 0.3974609375,
"learning_rate": 1.5172616674556673e-05,
"loss": 0.0159,
"reward": 0.95,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.89375,
"step": 1785
},
{
"completion_length": 149.525,
"epoch": 0.39535622092461453,
"grad_norm": 0.7501382613974432,
"kl": 0.4054931640625,
"learning_rate": 1.5139580507810118e-05,
"loss": 0.0162,
"reward": 0.9875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.93125,
"step": 1790
},
{
"completion_length": 145.725,
"epoch": 0.39646056791043743,
"grad_norm": 0.5349731933801097,
"kl": 0.35238037109375,
"learning_rate": 1.510646793552522e-05,
"loss": 0.0141,
"reward": 1.01875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.975,
"step": 1795
},
{
"completion_length": 142.375,
"epoch": 0.3975649148962604,
"grad_norm": 0.6112477717509665,
"kl": 0.4273193359375,
"learning_rate": 1.5073279449956916e-05,
"loss": 0.0171,
"reward": 1.05,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.99375,
"step": 1800
},
{
"epoch": 0.3975649148962604,
"eval_completion_length": 169.21,
"eval_kl": 0.37419921875,
"eval_loss": 0.014996632933616638,
"eval_reward": 1.055,
"eval_reward_std": 0.07778174459934234,
"eval_rewards/accuracy_reward": 0.08,
"eval_rewards/format_reward": 0.975,
"eval_runtime": 85.5248,
"eval_samples_per_second": 1.158,
"eval_steps_per_second": 0.292,
"step": 1800
},
{
"completion_length": 183.75625,
"epoch": 0.39866926188208335,
"grad_norm": 0.42239616616022757,
"kl": 0.35390625,
"learning_rate": 1.5040015544488689e-05,
"loss": 0.0142,
"reward": 1.0,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.96875,
"step": 1805
},
{
"completion_length": 231.51875,
"epoch": 0.3997736088679063,
"grad_norm": 0.38357330680196106,
"kl": 0.34571533203125,
"learning_rate": 1.5006676713625217e-05,
"loss": 0.0138,
"reward": 1.0125,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9625,
"step": 1810
},
{
"completion_length": 249.51875,
"epoch": 0.40087795585372926,
"grad_norm": 0.36278847988909035,
"kl": 0.34613037109375,
"learning_rate": 1.4973263452985023e-05,
"loss": 0.0138,
"reward": 1.01875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.96875,
"step": 1815
},
{
"completion_length": 254.9,
"epoch": 0.40198230283955216,
"grad_norm": 0.13063316008666095,
"kl": 0.3719482421875,
"learning_rate": 1.493977625929312e-05,
"loss": 0.0149,
"reward": 0.975,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.0125,
"rewards/format_reward": 0.9625,
"step": 1820
},
{
"completion_length": 222.58125,
"epoch": 0.4030866498253751,
"grad_norm": 0.4311131808891421,
"kl": 0.33587646484375,
"learning_rate": 1.4906215630373606e-05,
"loss": 0.0134,
"reward": 1.025,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.975,
"step": 1825
},
{
"completion_length": 221.6125,
"epoch": 0.4041909968111981,
"grad_norm": 0.6819109826661146,
"kl": 0.38231201171875,
"learning_rate": 1.4872582065142285e-05,
"loss": 0.0153,
"reward": 1.0125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.95,
"step": 1830
},
{
"completion_length": 238.88125,
"epoch": 0.40529534379702103,
"grad_norm": 0.43973316451031436,
"kl": 0.35328369140625,
"learning_rate": 1.4838876063599234e-05,
"loss": 0.0141,
"reward": 0.9625,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.91875,
"step": 1835
},
{
"completion_length": 309.99375,
"epoch": 0.406399690782844,
"grad_norm": 0.8444306019322414,
"kl": 0.44168701171875,
"learning_rate": 1.480509812682138e-05,
"loss": 0.0177,
"reward": 0.86875,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.84375,
"step": 1840
},
{
"completion_length": 126.275,
"epoch": 0.4075040377686669,
"grad_norm": 0.35468253817906953,
"kl": 0.38702392578125,
"learning_rate": 1.4771248756955042e-05,
"loss": 0.0155,
"reward": 1.05625,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 1.0,
"step": 1845
},
{
"completion_length": 123.95625,
"epoch": 0.40860838475448985,
"grad_norm": 0.5763156946426605,
"kl": 0.36905517578125,
"learning_rate": 1.4737328457208471e-05,
"loss": 0.0148,
"reward": 1.125,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.99375,
"step": 1850
},
{
"completion_length": 113.06875,
"epoch": 0.4097127317403128,
"grad_norm": 0.47557294439345416,
"kl": 0.379296875,
"learning_rate": 1.4703337731844374e-05,
"loss": 0.0152,
"reward": 1.0875,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 1.0,
"step": 1855
},
{
"completion_length": 106.64375,
"epoch": 0.41081707872613576,
"grad_norm": 0.787870748692926,
"kl": 0.402252197265625,
"learning_rate": 1.4669277086172406e-05,
"loss": 0.0161,
"reward": 1.08125,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.99375,
"step": 1860
},
{
"completion_length": 113.35,
"epoch": 0.4119214257119587,
"grad_norm": 0.5993634901343988,
"kl": 0.395849609375,
"learning_rate": 1.4635147026541674e-05,
"loss": 0.0158,
"reward": 1.08125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 1.0,
"step": 1865
},
{
"completion_length": 142.475,
"epoch": 0.4130257726977816,
"grad_norm": 0.14466786983492855,
"kl": 0.3989501953125,
"learning_rate": 1.4600948060333187e-05,
"loss": 0.016,
"reward": 1.1,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.99375,
"step": 1870
},
{
"completion_length": 180.33125,
"epoch": 0.4141301196836046,
"grad_norm": 0.2913646742496878,
"kl": 0.3666259765625,
"learning_rate": 1.4566680695952333e-05,
"loss": 0.0147,
"reward": 1.0375,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9875,
"step": 1875
},
{
"completion_length": 246.5625,
"epoch": 0.41523446666942754,
"grad_norm": 0.10584068251739959,
"kl": 0.3494873046875,
"learning_rate": 1.4532345442821323e-05,
"loss": 0.014,
"reward": 1.0375,
"reward_std": 0.03535533845424652,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.99375,
"step": 1880
},
{
"completion_length": 349.275,
"epoch": 0.4163388136552505,
"grad_norm": 0.42138126797069886,
"kl": 0.40732421875,
"learning_rate": 1.4497942811371592e-05,
"loss": 0.0163,
"reward": 0.95625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.925,
"step": 1885
},
{
"completion_length": 282.8125,
"epoch": 0.41744316064107345,
"grad_norm": 0.5708867419498027,
"kl": 0.4172607421875,
"learning_rate": 1.4463473313036241e-05,
"loss": 0.0167,
"reward": 0.93125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.90625,
"step": 1890
},
{
"completion_length": 245.1,
"epoch": 0.41854750762689635,
"grad_norm": 0.5020193212271058,
"kl": 0.431884765625,
"learning_rate": 1.4428937460242417e-05,
"loss": 0.0173,
"reward": 0.99375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.91875,
"step": 1895
},
{
"completion_length": 197.70625,
"epoch": 0.4196518546127193,
"grad_norm": 0.590350066389971,
"kl": 0.3642578125,
"learning_rate": 1.4394335766403703e-05,
"loss": 0.0146,
"reward": 1.0,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.95625,
"step": 1900
},
{
"epoch": 0.4196518546127193,
"eval_completion_length": 210.86,
"eval_kl": 0.4005078125,
"eval_loss": 0.016043836250901222,
"eval_reward": 1.025,
"eval_reward_std": 0.12020815074443818,
"eval_rewards/accuracy_reward": 0.09,
"eval_rewards/format_reward": 0.935,
"eval_runtime": 109.1917,
"eval_samples_per_second": 0.907,
"eval_steps_per_second": 0.229,
"step": 1900
},
{
"completion_length": 210.5125,
"epoch": 0.42075620159854227,
"grad_norm": 0.5480387591743037,
"kl": 0.4215087890625,
"learning_rate": 1.4359668745912472e-05,
"loss": 0.0169,
"reward": 0.98125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.93125,
"step": 1905
},
{
"completion_length": 187.575,
"epoch": 0.4218605485843652,
"grad_norm": 0.37297156781214846,
"kl": 0.318310546875,
"learning_rate": 1.4324936914132255e-05,
"loss": 0.0127,
"reward": 1.0125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.95625,
"step": 1910
},
{
"completion_length": 194.59375,
"epoch": 0.4229648955701881,
"grad_norm": 0.4621580856810021,
"kl": 0.3155029296875,
"learning_rate": 1.4290140787390083e-05,
"loss": 0.0126,
"reward": 1.0125,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.9875,
"step": 1915
},
{
"completion_length": 215.26875,
"epoch": 0.4240692425560111,
"grad_norm": 0.23236440356104268,
"kl": 0.31630859375,
"learning_rate": 1.4255280882968787e-05,
"loss": 0.0126,
"reward": 1.0375,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.98125,
"step": 1920
},
{
"completion_length": 257.96875,
"epoch": 0.42517358954183404,
"grad_norm": 0.1821862648072376,
"kl": 0.36019287109375,
"learning_rate": 1.4220357719099338e-05,
"loss": 0.0144,
"reward": 1.00625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.95,
"step": 1925
},
{
"completion_length": 278.40625,
"epoch": 0.426277936527657,
"grad_norm": 0.5171028800770205,
"kl": 0.341015625,
"learning_rate": 1.4185371814953116e-05,
"loss": 0.0136,
"reward": 0.96875,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.9125,
"step": 1930
},
{
"completion_length": 203.1,
"epoch": 0.42738228351347995,
"grad_norm": 0.3194976531325425,
"kl": 0.33564453125,
"learning_rate": 1.415032369063422e-05,
"loss": 0.0134,
"reward": 1.05,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.98125,
"step": 1935
},
{
"completion_length": 248.76875,
"epoch": 0.42848663049930286,
"grad_norm": 0.32014297487636434,
"kl": 0.34617919921875,
"learning_rate": 1.41152138671717e-05,
"loss": 0.0138,
"reward": 0.96875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.9375,
"step": 1940
},
{
"completion_length": 224.6875,
"epoch": 0.4295909774851258,
"grad_norm": 0.7093467850610166,
"kl": 0.32181396484375,
"learning_rate": 1.408004286651185e-05,
"loss": 0.0129,
"reward": 1.025,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.99375,
"step": 1945
},
{
"completion_length": 219.30625,
"epoch": 0.43069532447094877,
"grad_norm": 0.12714345360829074,
"kl": 0.3352294921875,
"learning_rate": 1.4044811211510419e-05,
"loss": 0.0134,
"reward": 1.04375,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.98125,
"step": 1950
},
{
"completion_length": 253.2875,
"epoch": 0.4317996714567717,
"grad_norm": 0.180488927556792,
"kl": 0.3677734375,
"learning_rate": 1.4009519425924858e-05,
"loss": 0.0147,
"reward": 1.04375,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.9875,
"step": 1955
},
{
"completion_length": 270.91875,
"epoch": 0.4329040184425947,
"grad_norm": 0.30372496615737005,
"kl": 0.301806640625,
"learning_rate": 1.3974168034406524e-05,
"loss": 0.0121,
"reward": 0.99375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.9625,
"step": 1960
},
{
"completion_length": 248.61875,
"epoch": 0.4340083654284176,
"grad_norm": 0.4701815288270493,
"kl": 0.33768310546875,
"learning_rate": 1.3938757562492873e-05,
"loss": 0.0135,
"reward": 1.04375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.98125,
"step": 1965
},
{
"completion_length": 225.59375,
"epoch": 0.43511271241424054,
"grad_norm": 0.36757084788553285,
"kl": 0.3396484375,
"learning_rate": 1.3903288536599668e-05,
"loss": 0.0136,
"reward": 1.0625,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.98125,
"step": 1970
},
{
"completion_length": 312.15,
"epoch": 0.4362170594000635,
"grad_norm": 0.48977082806979283,
"kl": 0.358837890625,
"learning_rate": 1.3867761484013135e-05,
"loss": 0.0144,
"reward": 1.01875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.95625,
"step": 1975
},
{
"completion_length": 307.16875,
"epoch": 0.43732140638588646,
"grad_norm": 0.49526262796042186,
"kl": 0.340966796875,
"learning_rate": 1.3832176932882136e-05,
"loss": 0.0136,
"reward": 1.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9875,
"step": 1980
},
{
"completion_length": 252.50625,
"epoch": 0.4384257533717094,
"grad_norm": 0.6091872240850794,
"kl": 0.3243408203125,
"learning_rate": 1.3796535412210301e-05,
"loss": 0.013,
"reward": 1.025,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.96875,
"step": 1985
},
{
"completion_length": 230.20625,
"epoch": 0.4395301003575323,
"grad_norm": 0.29579068631634226,
"kl": 0.34698486328125,
"learning_rate": 1.3760837451848193e-05,
"loss": 0.0139,
"reward": 1.075,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.975,
"step": 1990
},
{
"completion_length": 246.5375,
"epoch": 0.4406344473433553,
"grad_norm": 0.3033284715610845,
"kl": 0.34864501953125,
"learning_rate": 1.3725083582485397e-05,
"loss": 0.0139,
"reward": 1.01875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.94375,
"step": 1995
},
{
"completion_length": 213.94375,
"epoch": 0.44173879432917823,
"grad_norm": 0.4124635672949258,
"kl": 0.33160400390625,
"learning_rate": 1.3689274335642653e-05,
"loss": 0.0133,
"reward": 1.01875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.96875,
"step": 2000
},
{
"epoch": 0.44173879432917823,
"eval_completion_length": 175.91,
"eval_kl": 0.3898828125,
"eval_loss": 0.015537865459918976,
"eval_reward": 1.1,
"eval_reward_std": 0.11313708305358887,
"eval_rewards/accuracy_reward": 0.11,
"eval_rewards/format_reward": 0.99,
"eval_runtime": 88.9833,
"eval_samples_per_second": 1.113,
"eval_steps_per_second": 0.281,
"step": 2000
},
{
"completion_length": 198.1125,
"epoch": 0.4428431413150012,
"grad_norm": 0.41251225473854053,
"kl": 0.32073974609375,
"learning_rate": 1.3653410243663953e-05,
"loss": 0.0128,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.98125,
"step": 2005
},
{
"completion_length": 174.21875,
"epoch": 0.44394748830082414,
"grad_norm": 0.6755273210792271,
"kl": 0.318115234375,
"learning_rate": 1.3617491839708614e-05,
"loss": 0.0127,
"reward": 1.05,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9875,
"step": 2010
},
{
"completion_length": 208.9875,
"epoch": 0.44505183528664705,
"grad_norm": 0.37867244007672246,
"kl": 0.32615966796875,
"learning_rate": 1.3581519657743365e-05,
"loss": 0.013,
"reward": 1.05625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9625,
"step": 2015
},
{
"completion_length": 255.75,
"epoch": 0.44615618227247,
"grad_norm": 0.6185289191665273,
"kl": 0.3501220703125,
"learning_rate": 1.3545494232534406e-05,
"loss": 0.014,
"reward": 1.06875,
"reward_std": 0.23864853456616403,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.9375,
"step": 2020
},
{
"completion_length": 327.4125,
"epoch": 0.44726052925829296,
"grad_norm": 0.8766696054374737,
"kl": 0.4114990234375,
"learning_rate": 1.3509416099639456e-05,
"loss": 0.0165,
"reward": 1.03125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.95,
"step": 2025
},
{
"completion_length": 336.04375,
"epoch": 0.4483648762441159,
"grad_norm": 0.26246836662809214,
"kl": 0.33997802734375,
"learning_rate": 1.3473285795399792e-05,
"loss": 0.0136,
"reward": 1.09375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.9875,
"step": 2030
},
{
"completion_length": 242.35625,
"epoch": 0.4494692232299388,
"grad_norm": 0.4471292709567679,
"kl": 0.3488037109375,
"learning_rate": 1.3437103856932266e-05,
"loss": 0.014,
"reward": 1.1,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.975,
"step": 2035
},
{
"completion_length": 209.25625,
"epoch": 0.4505735702157618,
"grad_norm": 0.73758038746274,
"kl": 0.3877685546875,
"learning_rate": 1.3400870822121348e-05,
"loss": 0.0155,
"reward": 0.9375,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.85,
"step": 2040
},
{
"completion_length": 196.275,
"epoch": 0.45167791720158473,
"grad_norm": 0.5384037353444987,
"kl": 0.373583984375,
"learning_rate": 1.3364587229611095e-05,
"loss": 0.0149,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.975,
"step": 2045
},
{
"completion_length": 242.125,
"epoch": 0.4527822641874077,
"grad_norm": 0.35105762795134465,
"kl": 0.43712158203125,
"learning_rate": 1.332825361879717e-05,
"loss": 0.0175,
"reward": 1.1,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.975,
"step": 2050
},
{
"completion_length": 226.36875,
"epoch": 0.45388661117323065,
"grad_norm": 0.5975228945667029,
"kl": 0.51041259765625,
"learning_rate": 1.3291870529818809e-05,
"loss": 0.0204,
"reward": 1.0875,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.975,
"step": 2055
},
{
"completion_length": 243.575,
"epoch": 0.45499095815905355,
"grad_norm": 0.4387220366275729,
"kl": 0.458203125,
"learning_rate": 1.3255438503550796e-05,
"loss": 0.0183,
"reward": 1.09375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.975,
"step": 2060
},
{
"completion_length": 250.0875,
"epoch": 0.4560953051448765,
"grad_norm": 0.5275971912489743,
"kl": 0.39151611328125,
"learning_rate": 1.3218958081595426e-05,
"loss": 0.0157,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.98125,
"step": 2065
},
{
"completion_length": 282.99375,
"epoch": 0.45719965213069946,
"grad_norm": 0.4169345983090638,
"kl": 0.4114990234375,
"learning_rate": 1.3182429806274442e-05,
"loss": 0.0165,
"reward": 1.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.975,
"step": 2070
},
{
"completion_length": 242.4,
"epoch": 0.4583039991165224,
"grad_norm": 0.5265515421182827,
"kl": 0.48302001953125,
"learning_rate": 1.3145854220620981e-05,
"loss": 0.0193,
"reward": 1.0,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.94375,
"step": 2075
},
{
"completion_length": 241.05,
"epoch": 0.4594083461023454,
"grad_norm": 0.5585942695958915,
"kl": 0.4310302734375,
"learning_rate": 1.3109231868371511e-05,
"loss": 0.0172,
"reward": 0.9625,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.94375,
"step": 2080
},
{
"completion_length": 176.9125,
"epoch": 0.4605126930881683,
"grad_norm": 0.4414724029442064,
"kl": 0.35958251953125,
"learning_rate": 1.3072563293957725e-05,
"loss": 0.0144,
"reward": 1.05625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.96875,
"step": 2085
},
{
"completion_length": 192.8,
"epoch": 0.46161704007399124,
"grad_norm": 0.586018890860745,
"kl": 0.322509765625,
"learning_rate": 1.3035849042498462e-05,
"loss": 0.0129,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9625,
"step": 2090
},
{
"completion_length": 208.95625,
"epoch": 0.4627213870598142,
"grad_norm": 0.42268112037658245,
"kl": 0.319384765625,
"learning_rate": 1.299908965979161e-05,
"loss": 0.0128,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9625,
"step": 2095
},
{
"completion_length": 222.16875,
"epoch": 0.46382573404563715,
"grad_norm": 0.31473777919390844,
"kl": 0.30546875,
"learning_rate": 1.2962285692305964e-05,
"loss": 0.0122,
"reward": 1.0125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9625,
"step": 2100
},
{
"epoch": 0.46382573404563715,
"eval_completion_length": 192.71,
"eval_kl": 0.31083984375,
"eval_loss": 0.012439416721463203,
"eval_reward": 1.1,
"eval_reward_std": 0.21213203072547912,
"eval_rewards/accuracy_reward": 0.145,
"eval_rewards/format_reward": 0.955,
"eval_runtime": 96.8968,
"eval_samples_per_second": 1.022,
"eval_steps_per_second": 0.258,
"step": 2100
},
{
"completion_length": 216.8125,
"epoch": 0.4649300810314601,
"grad_norm": 0.35648646516795124,
"kl": 0.32301025390625,
"learning_rate": 1.2925437687173144e-05,
"loss": 0.0129,
"reward": 1.04375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9625,
"step": 2105
},
{
"completion_length": 241.8125,
"epoch": 0.466034428017283,
"grad_norm": 0.601644494723294,
"kl": 0.3125732421875,
"learning_rate": 1.2888546192179417e-05,
"loss": 0.0125,
"reward": 0.96875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 0.94375,
"step": 2110
},
{
"completion_length": 217.7625,
"epoch": 0.46713877500310597,
"grad_norm": 0.39671636795145077,
"kl": 0.326318359375,
"learning_rate": 1.2851611755757587e-05,
"loss": 0.013,
"reward": 1.025,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.9375,
"step": 2115
},
{
"completion_length": 210.575,
"epoch": 0.4682431219889289,
"grad_norm": 0.5031259322905296,
"kl": 0.35986328125,
"learning_rate": 1.2814634926978831e-05,
"loss": 0.0144,
"reward": 1.025,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.93125,
"step": 2120
},
{
"completion_length": 187.1,
"epoch": 0.4693474689747519,
"grad_norm": 0.48486411865791645,
"kl": 0.35367431640625,
"learning_rate": 1.2777616255544527e-05,
"loss": 0.0141,
"reward": 1.075,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.95,
"step": 2125
},
{
"completion_length": 151.975,
"epoch": 0.47045181596057484,
"grad_norm": 0.7338227984314649,
"kl": 0.3826904296875,
"learning_rate": 1.2740556291778096e-05,
"loss": 0.0153,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.975,
"step": 2130
},
{
"completion_length": 133.78125,
"epoch": 0.47155616294639774,
"grad_norm": 0.11684943721597078,
"kl": 0.33577880859375,
"learning_rate": 1.2703455586616811e-05,
"loss": 0.0134,
"reward": 1.0875,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.99375,
"step": 2135
},
{
"completion_length": 145.9875,
"epoch": 0.4726605099322207,
"grad_norm": 0.32149812408314604,
"kl": 0.38963623046875,
"learning_rate": 1.2666314691603615e-05,
"loss": 0.0156,
"reward": 1.13125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.9875,
"step": 2140
},
{
"completion_length": 232.575,
"epoch": 0.47376485691804365,
"grad_norm": 0.6480932091195085,
"kl": 0.3406494140625,
"learning_rate": 1.2629134158878919e-05,
"loss": 0.0136,
"reward": 1.05625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.96875,
"step": 2145
},
{
"completion_length": 253.8875,
"epoch": 0.4748692039038666,
"grad_norm": 0.36437117134621355,
"kl": 0.3377685546875,
"learning_rate": 1.259191454117239e-05,
"loss": 0.0135,
"reward": 1.04375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.975,
"step": 2150
},
{
"completion_length": 231.3125,
"epoch": 0.47597355088968957,
"grad_norm": 0.4391123760933655,
"kl": 0.3203125,
"learning_rate": 1.255465639179473e-05,
"loss": 0.0128,
"reward": 1.05,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.975,
"step": 2155
},
{
"completion_length": 260.05625,
"epoch": 0.47707789787551247,
"grad_norm": 0.34571139879091517,
"kl": 0.35738525390625,
"learning_rate": 1.2517360264629463e-05,
"loss": 0.0143,
"reward": 1.01875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.95,
"step": 2160
},
{
"completion_length": 221.43125,
"epoch": 0.4781822448613354,
"grad_norm": 0.45867628713278896,
"kl": 0.38974609375,
"learning_rate": 1.24800267141247e-05,
"loss": 0.0156,
"reward": 1.0375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.93125,
"step": 2165
},
{
"completion_length": 170.575,
"epoch": 0.4792865918471584,
"grad_norm": 0.19943826053198088,
"kl": 0.37861328125,
"learning_rate": 1.2442656295284879e-05,
"loss": 0.0151,
"reward": 1.05,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.98125,
"step": 2170
},
{
"completion_length": 197.7875,
"epoch": 0.48039093883298134,
"grad_norm": 0.37120994010979813,
"kl": 0.342919921875,
"learning_rate": 1.2405249563662539e-05,
"loss": 0.0137,
"reward": 0.99375,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.95625,
"step": 2175
},
{
"completion_length": 175.96875,
"epoch": 0.48149528581880424,
"grad_norm": 0.6013419839896456,
"kl": 0.3757080078125,
"learning_rate": 1.2367807075350036e-05,
"loss": 0.015,
"reward": 1.08125,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.95625,
"step": 2180
},
{
"completion_length": 184.83125,
"epoch": 0.4825996328046272,
"grad_norm": 0.5134474475685822,
"kl": 0.36424560546875,
"learning_rate": 1.23303293869713e-05,
"loss": 0.0146,
"reward": 1.0125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9625,
"step": 2185
},
{
"completion_length": 234.29375,
"epoch": 0.48370397979045016,
"grad_norm": 0.5230059460040423,
"kl": 0.34986572265625,
"learning_rate": 1.2292817055673543e-05,
"loss": 0.014,
"reward": 1.0125,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9375,
"step": 2190
},
{
"completion_length": 308.00625,
"epoch": 0.4848083267762731,
"grad_norm": 0.7592675553160979,
"kl": 0.3602783203125,
"learning_rate": 1.2255270639118984e-05,
"loss": 0.0144,
"reward": 1.0125,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9125,
"step": 2195
},
{
"completion_length": 273.43125,
"epoch": 0.48591267376209607,
"grad_norm": 0.2856132458649576,
"kl": 0.37276611328125,
"learning_rate": 1.2217690695476551e-05,
"loss": 0.0149,
"reward": 1.00625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.90625,
"step": 2200
},
{
"epoch": 0.48591267376209607,
"eval_completion_length": 253.775,
"eval_kl": 0.5310546875,
"eval_loss": 0.02128330059349537,
"eval_reward": 1.045,
"eval_reward_std": 0.162634556889534,
"eval_rewards/accuracy_reward": 0.105,
"eval_rewards/format_reward": 0.94,
"eval_runtime": 127.3028,
"eval_samples_per_second": 0.778,
"eval_steps_per_second": 0.196,
"step": 2200
},
{
"completion_length": 259.58125,
"epoch": 0.48701702074791897,
"grad_norm": 0.3284138468757768,
"kl": 0.3307373046875,
"learning_rate": 1.2180077783413601e-05,
"loss": 0.0132,
"reward": 1.05625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.96875,
"step": 2205
},
{
"completion_length": 252.30625,
"epoch": 0.48812136773374193,
"grad_norm": 0.4057901132375836,
"kl": 0.4347412109375,
"learning_rate": 1.21424324620876e-05,
"loss": 0.0174,
"reward": 0.9875,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.91875,
"step": 2210
},
{
"completion_length": 249.60625,
"epoch": 0.4892257147195649,
"grad_norm": 0.9156591934586986,
"kl": 0.3991943359375,
"learning_rate": 1.2104755291137797e-05,
"loss": 0.016,
"reward": 0.9875,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9375,
"step": 2215
},
{
"completion_length": 229.95625,
"epoch": 0.49033006170538784,
"grad_norm": 0.42300199898124896,
"kl": 0.374468994140625,
"learning_rate": 1.2067046830676947e-05,
"loss": 0.015,
"reward": 1.025,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.95,
"step": 2220
},
{
"completion_length": 206.26875,
"epoch": 0.4914344086912108,
"grad_norm": 0.3982306025965269,
"kl": 0.3041748046875,
"learning_rate": 1.2029307641282935e-05,
"loss": 0.0122,
"reward": 1.05,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.96875,
"step": 2225
},
{
"completion_length": 225.10625,
"epoch": 0.4925387556770337,
"grad_norm": 0.3221497073306949,
"kl": 0.30826416015625,
"learning_rate": 1.1991538283990483e-05,
"loss": 0.0123,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.95625,
"step": 2230
},
{
"completion_length": 193.3125,
"epoch": 0.49364310266285666,
"grad_norm": 0.13887134423717792,
"kl": 0.32666015625,
"learning_rate": 1.1953739320282778e-05,
"loss": 0.0131,
"reward": 1.06875,
"reward_std": 0.02651650384068489,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.99375,
"step": 2235
},
{
"completion_length": 208.83125,
"epoch": 0.4947474496486796,
"grad_norm": 0.3671257080599345,
"kl": 0.30867919921875,
"learning_rate": 1.191591131208315e-05,
"loss": 0.0123,
"reward": 1.0875,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.99375,
"step": 2240
},
{
"completion_length": 196.06875,
"epoch": 0.4958517966345026,
"grad_norm": 0.4192081679963359,
"kl": 0.3347412109375,
"learning_rate": 1.1878054821746703e-05,
"loss": 0.0134,
"reward": 1.11875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.96875,
"step": 2245
},
{
"completion_length": 218.25625,
"epoch": 0.49695614362032553,
"grad_norm": 0.2809177260646367,
"kl": 0.31444091796875,
"learning_rate": 1.1840170412051957e-05,
"loss": 0.0126,
"reward": 1.075,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.975,
"step": 2250
},
{
"completion_length": 246.13125,
"epoch": 0.49806049060614843,
"grad_norm": 0.3266723103626801,
"kl": 0.290673828125,
"learning_rate": 1.1802258646192486e-05,
"loss": 0.0116,
"reward": 1.025,
"reward_std": 0.03535533845424652,
"rewards/accuracy_reward": 0.025,
"rewards/format_reward": 1.0,
"step": 2255
},
{
"completion_length": 263.225,
"epoch": 0.4991648375919714,
"grad_norm": 0.19489451125924528,
"kl": 0.29127197265625,
"learning_rate": 1.1764320087768546e-05,
"loss": 0.0116,
"reward": 1.0875,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.975,
"step": 2260
},
{
"completion_length": 265.95,
"epoch": 0.5002691845777943,
"grad_norm": 0.5199830849997911,
"kl": 0.34532470703125,
"learning_rate": 1.1726355300778693e-05,
"loss": 0.0138,
"reward": 1.04375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9625,
"step": 2265
},
{
"completion_length": 230.08125,
"epoch": 0.5013735315636173,
"grad_norm": 0.5370502961123099,
"kl": 0.31375732421875,
"learning_rate": 1.1688364849611395e-05,
"loss": 0.0125,
"reward": 1.075,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.98125,
"step": 2270
},
{
"completion_length": 268.4375,
"epoch": 0.5024778785494403,
"grad_norm": 0.4955767601038962,
"kl": 0.28502197265625,
"learning_rate": 1.1650349299036656e-05,
"loss": 0.0114,
"reward": 1.0625,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.98125,
"step": 2275
},
{
"completion_length": 204.94375,
"epoch": 0.5035822255352632,
"grad_norm": 0.5349423661973638,
"kl": 0.3089111328125,
"learning_rate": 1.1612309214197599e-05,
"loss": 0.0124,
"reward": 1.0625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.975,
"step": 2280
},
{
"completion_length": 191.61875,
"epoch": 0.5046865725210862,
"grad_norm": 0.7009751890329485,
"kl": 0.32718505859375,
"learning_rate": 1.1574245160602085e-05,
"loss": 0.0131,
"reward": 1.0375,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9875,
"step": 2285
},
{
"completion_length": 174.96875,
"epoch": 0.505790919506909,
"grad_norm": 0.15634356767517202,
"kl": 0.32626953125,
"learning_rate": 1.153615770411429e-05,
"loss": 0.013,
"reward": 1.1125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.99375,
"step": 2290
},
{
"completion_length": 213.85625,
"epoch": 0.506895266492732,
"grad_norm": 0.47252723075105413,
"kl": 0.311212158203125,
"learning_rate": 1.1498047410946307e-05,
"loss": 0.0124,
"reward": 1.06875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9875,
"step": 2295
},
{
"completion_length": 215.2875,
"epoch": 0.5079996134785549,
"grad_norm": 0.4948372981089919,
"kl": 0.33463134765625,
"learning_rate": 1.1459914847649716e-05,
"loss": 0.0134,
"reward": 1.05,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.98125,
"step": 2300
},
{
"epoch": 0.5079996134785549,
"eval_completion_length": 230.115,
"eval_kl": 0.37994140625,
"eval_loss": 0.015226633287966251,
"eval_reward": 1.105,
"eval_reward_std": 0.13435028612613678,
"eval_rewards/accuracy_reward": 0.135,
"eval_rewards/format_reward": 0.97,
"eval_runtime": 115.6989,
"eval_samples_per_second": 0.856,
"eval_steps_per_second": 0.216,
"step": 2300
},
{
"completion_length": 215.875,
"epoch": 0.5091039604643779,
"grad_norm": 0.3666409705462901,
"kl": 0.35133056640625,
"learning_rate": 1.1421760581107164e-05,
"loss": 0.0141,
"reward": 1.04375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9625,
"step": 2305
},
{
"completion_length": 254.09375,
"epoch": 0.5102083074502008,
"grad_norm": 0.2975662859403391,
"kl": 0.34254150390625,
"learning_rate": 1.1383585178523955e-05,
"loss": 0.0137,
"reward": 0.98125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.94375,
"step": 2310
},
{
"completion_length": 256.36875,
"epoch": 0.5113126544360238,
"grad_norm": 0.719346374442343,
"kl": 0.35948486328125,
"learning_rate": 1.1345389207419588e-05,
"loss": 0.0144,
"reward": 1.01875,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.925,
"step": 2315
},
{
"completion_length": 209.3,
"epoch": 0.5124170014218468,
"grad_norm": 0.7163698011097263,
"kl": 0.32767333984375,
"learning_rate": 1.1307173235619342e-05,
"loss": 0.0131,
"reward": 1.05625,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.96875,
"step": 2320
},
{
"completion_length": 229.61875,
"epoch": 0.5135213484076697,
"grad_norm": 0.16271285683956263,
"kl": 0.32386474609375,
"learning_rate": 1.126893783124583e-05,
"loss": 0.013,
"reward": 1.04375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.96875,
"step": 2325
},
{
"completion_length": 241.45625,
"epoch": 0.5146256953934927,
"grad_norm": 0.29867489688674986,
"kl": 0.34649658203125,
"learning_rate": 1.1230683562710549e-05,
"loss": 0.0139,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.975,
"step": 2330
},
{
"completion_length": 241.88125,
"epoch": 0.5157300423793156,
"grad_norm": 0.2511157393264926,
"kl": 0.33067626953125,
"learning_rate": 1.1192410998705432e-05,
"loss": 0.0132,
"reward": 1.08125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.98125,
"step": 2335
},
{
"completion_length": 259.81875,
"epoch": 0.5168343893651385,
"grad_norm": 0.1754130257017029,
"kl": 0.29140625,
"learning_rate": 1.1154120708194398e-05,
"loss": 0.0117,
"reward": 1.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9875,
"step": 2340
},
{
"completion_length": 238.56875,
"epoch": 0.5179387363509614,
"grad_norm": 0.2633940153329421,
"kl": 0.32506103515625,
"learning_rate": 1.1115813260404889e-05,
"loss": 0.013,
"reward": 1.05625,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.98125,
"step": 2345
},
{
"completion_length": 194.5625,
"epoch": 0.5190430833367844,
"grad_norm": 0.37301932919862296,
"kl": 0.3505615234375,
"learning_rate": 1.1077489224819402e-05,
"loss": 0.014,
"reward": 1.075,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.975,
"step": 2350
},
{
"completion_length": 202.15,
"epoch": 0.5201474303226074,
"grad_norm": 0.5495626073196226,
"kl": 0.377880859375,
"learning_rate": 1.1039149171167046e-05,
"loss": 0.0151,
"reward": 0.975,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.93125,
"step": 2355
},
{
"completion_length": 220.68125,
"epoch": 0.5212517773084303,
"grad_norm": 0.38846325239139723,
"kl": 0.3607421875,
"learning_rate": 1.1000793669415035e-05,
"loss": 0.0144,
"reward": 1.025,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.91875,
"step": 2360
},
{
"completion_length": 211.93125,
"epoch": 0.5223561242942533,
"grad_norm": 0.5335204386556052,
"kl": 0.3825927734375,
"learning_rate": 1.0962423289760254e-05,
"loss": 0.0153,
"reward": 1.0125,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.925,
"step": 2365
},
{
"completion_length": 195.3375,
"epoch": 0.5234604712800762,
"grad_norm": 0.5401729852880928,
"kl": 0.35989990234375,
"learning_rate": 1.0924038602620757e-05,
"loss": 0.0144,
"reward": 1.025,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.95625,
"step": 2370
},
{
"completion_length": 186.30625,
"epoch": 0.5245648182658992,
"grad_norm": 0.6847630132207224,
"kl": 0.32152099609375,
"learning_rate": 1.0885640178627291e-05,
"loss": 0.0129,
"reward": 1.05,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.96875,
"step": 2375
},
{
"completion_length": 187.09375,
"epoch": 0.5256691652517221,
"grad_norm": 0.5295522607198633,
"kl": 0.288751220703125,
"learning_rate": 1.0847228588614821e-05,
"loss": 0.0115,
"reward": 1.05,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.99375,
"step": 2380
},
{
"completion_length": 207.14375,
"epoch": 0.526773512237545,
"grad_norm": 0.39564621021267,
"kl": 0.310736083984375,
"learning_rate": 1.0808804403614044e-05,
"loss": 0.0124,
"reward": 1.025,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.98125,
"step": 2385
},
{
"completion_length": 217.94375,
"epoch": 0.5278778592233679,
"grad_norm": 0.32699751280459455,
"kl": 0.30042724609375,
"learning_rate": 1.0770368194842886e-05,
"loss": 0.012,
"reward": 1.03125,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.99375,
"step": 2390
},
{
"completion_length": 214.59375,
"epoch": 0.5289822062091909,
"grad_norm": 0.3390808446949196,
"kl": 0.321435546875,
"learning_rate": 1.073192053369802e-05,
"loss": 0.0129,
"reward": 1.025,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.96875,
"step": 2395
},
{
"completion_length": 227.125,
"epoch": 0.5300865531950139,
"grad_norm": 0.3714729764465815,
"kl": 0.36365966796875,
"learning_rate": 1.0693461991746389e-05,
"loss": 0.0146,
"reward": 1.05,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.95,
"step": 2400
},
{
"epoch": 0.5300865531950139,
"eval_completion_length": 213.925,
"eval_kl": 0.4396484375,
"eval_loss": 0.017622916027903557,
"eval_reward": 1.03,
"eval_reward_std": 0.1414213538169861,
"eval_rewards/accuracy_reward": 0.08,
"eval_rewards/format_reward": 0.95,
"eval_runtime": 105.7779,
"eval_samples_per_second": 0.936,
"eval_steps_per_second": 0.236,
"step": 2400
},
{
"completion_length": 225.29375,
"epoch": 0.5311909001808368,
"grad_norm": 0.2664948280585707,
"kl": 0.40863037109375,
"learning_rate": 1.0654993140716665e-05,
"loss": 0.0164,
"reward": 1.0,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.94375,
"step": 2405
},
{
"completion_length": 208.39375,
"epoch": 0.5322952471666598,
"grad_norm": 0.634943961085242,
"kl": 0.33333740234375,
"learning_rate": 1.0616514552490791e-05,
"loss": 0.0133,
"reward": 1.05625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.96875,
"step": 2410
},
{
"completion_length": 186.975,
"epoch": 0.5333995941524827,
"grad_norm": 0.5915967584567521,
"kl": 0.294097900390625,
"learning_rate": 1.0578026799095464e-05,
"loss": 0.0118,
"reward": 1.0125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.975,
"step": 2415
},
{
"completion_length": 202.1625,
"epoch": 0.5345039411383057,
"grad_norm": 0.45619021814548455,
"kl": 0.321533203125,
"learning_rate": 1.0539530452693625e-05,
"loss": 0.0129,
"reward": 1.06875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.975,
"step": 2420
},
{
"completion_length": 243.76875,
"epoch": 0.5356082881241286,
"grad_norm": 0.3048810072947555,
"kl": 0.37376708984375,
"learning_rate": 1.0501026085575967e-05,
"loss": 0.0149,
"reward": 1.0,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.94375,
"step": 2425
},
{
"completion_length": 252.825,
"epoch": 0.5367126351099516,
"grad_norm": 0.40580599059859296,
"kl": 0.36983642578125,
"learning_rate": 1.046251427015241e-05,
"loss": 0.0148,
"reward": 1.01875,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.94375,
"step": 2430
},
{
"completion_length": 265.5875,
"epoch": 0.5378169820957744,
"grad_norm": 0.5522550075448642,
"kl": 0.384912109375,
"learning_rate": 1.0423995578943615e-05,
"loss": 0.0154,
"reward": 1.025,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.93125,
"step": 2435
},
{
"completion_length": 274.49375,
"epoch": 0.5389213290815974,
"grad_norm": 0.7214328168716406,
"kl": 0.4843017578125,
"learning_rate": 1.0385470584572449e-05,
"loss": 0.0194,
"reward": 1.04375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.925,
"step": 2440
},
{
"completion_length": 336.85625,
"epoch": 0.5400256760674204,
"grad_norm": 0.3475290976205475,
"kl": 0.52220458984375,
"learning_rate": 1.0346939859755481e-05,
"loss": 0.0209,
"reward": 0.9625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9,
"step": 2445
},
{
"completion_length": 298.64375,
"epoch": 0.5411300230532433,
"grad_norm": 0.37045136738406037,
"kl": 0.3713623046875,
"learning_rate": 1.0308403977294476e-05,
"loss": 0.0149,
"reward": 1.025,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.95,
"step": 2450
},
{
"completion_length": 247.96875,
"epoch": 0.5422343700390663,
"grad_norm": 1.2798359500362633,
"kl": 0.4294677734375,
"learning_rate": 1.0269863510067872e-05,
"loss": 0.0172,
"reward": 1.05625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.95,
"step": 2455
},
{
"completion_length": 220.65625,
"epoch": 0.5433387170248892,
"grad_norm": 0.3900864517809055,
"kl": 0.405615234375,
"learning_rate": 1.023131903102226e-05,
"loss": 0.0162,
"reward": 1.0125,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.95625,
"step": 2460
},
{
"completion_length": 229.73125,
"epoch": 0.5444430640107122,
"grad_norm": 0.4673057649728135,
"kl": 0.3134033203125,
"learning_rate": 1.0192771113163875e-05,
"loss": 0.0125,
"reward": 1.04375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.98125,
"step": 2465
},
{
"completion_length": 197.48125,
"epoch": 0.5455474109965351,
"grad_norm": 0.5576580736483916,
"kl": 0.34364013671875,
"learning_rate": 1.0154220329550076e-05,
"loss": 0.0137,
"reward": 1.06875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9875,
"step": 2470
},
{
"completion_length": 227.26875,
"epoch": 0.5466517579823581,
"grad_norm": 0.21804579013846342,
"kl": 0.40257568359375,
"learning_rate": 1.0115667253280817e-05,
"loss": 0.0161,
"reward": 1.05,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.9625,
"step": 2475
},
{
"completion_length": 209.29375,
"epoch": 0.5477561049681811,
"grad_norm": 0.3304966161490469,
"kl": 0.35350341796875,
"learning_rate": 1.0077112457490143e-05,
"loss": 0.0141,
"reward": 1.03125,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.975,
"step": 2480
},
{
"completion_length": 190.29375,
"epoch": 0.5488604519540039,
"grad_norm": 0.47912529430555245,
"kl": 0.34261474609375,
"learning_rate": 1.0038556515337654e-05,
"loss": 0.0137,
"reward": 1.0625,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.98125,
"step": 2485
},
{
"completion_length": 210.5625,
"epoch": 0.5499647989398269,
"grad_norm": 0.6442806902182178,
"kl": 0.3751953125,
"learning_rate": 1e-05,
"loss": 0.015,
"reward": 0.99375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.95625,
"step": 2490
},
{
"completion_length": 205.20625,
"epoch": 0.5510691459256498,
"grad_norm": 0.6416478639095549,
"kl": 0.412200927734375,
"learning_rate": 9.961443484662349e-06,
"loss": 0.0165,
"reward": 1.03125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.95625,
"step": 2495
},
{
"completion_length": 222.10625,
"epoch": 0.5521734929114728,
"grad_norm": 0.38828171969188574,
"kl": 0.35648193359375,
"learning_rate": 9.92288754250986e-06,
"loss": 0.0143,
"reward": 1.0125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9625,
"step": 2500
},
{
"epoch": 0.5521734929114728,
"eval_completion_length": 193.71,
"eval_kl": 0.44693359375,
"eval_loss": 0.01760600134730339,
"eval_reward": 1.055,
"eval_reward_std": 0.13435028612613678,
"eval_rewards/accuracy_reward": 0.085,
"eval_rewards/format_reward": 0.97,
"eval_runtime": 90.4642,
"eval_samples_per_second": 1.094,
"eval_steps_per_second": 0.276,
"step": 2500
},
{
"completion_length": 221.71875,
"epoch": 0.5532778398972957,
"grad_norm": 0.5618731254639551,
"kl": 0.420947265625,
"learning_rate": 9.884332746719186e-06,
"loss": 0.0168,
"reward": 1.0,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.94375,
"step": 2505
},
{
"completion_length": 180.55625,
"epoch": 0.5543821868831187,
"grad_norm": 0.4034995486933272,
"kl": 0.32989501953125,
"learning_rate": 9.845779670449926e-06,
"loss": 0.0132,
"reward": 1.05,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.99375,
"step": 2510
},
{
"completion_length": 211.625,
"epoch": 0.5554865338689416,
"grad_norm": 0.291793288012953,
"kl": 0.4625244140625,
"learning_rate": 9.807228886836128e-06,
"loss": 0.0185,
"reward": 1.05,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.975,
"step": 2515
},
{
"completion_length": 173.81875,
"epoch": 0.5565908808547646,
"grad_norm": 0.4350329917552652,
"kl": 0.319976806640625,
"learning_rate": 9.768680968977743e-06,
"loss": 0.0128,
"reward": 1.0875,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 1.0,
"step": 2520
},
{
"completion_length": 209.65,
"epoch": 0.5576952278405876,
"grad_norm": 0.32148315759606677,
"kl": 0.31556396484375,
"learning_rate": 9.730136489932133e-06,
"loss": 0.0126,
"reward": 1.1,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.975,
"step": 2525
},
{
"completion_length": 181.725,
"epoch": 0.5587995748264104,
"grad_norm": 0.4796485021571092,
"kl": 0.355078125,
"learning_rate": 9.691596022705527e-06,
"loss": 0.0142,
"reward": 1.0625,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9875,
"step": 2530
},
{
"completion_length": 217.01875,
"epoch": 0.5599039218122334,
"grad_norm": 0.6531621153503112,
"kl": 0.52369384765625,
"learning_rate": 9.653060140244524e-06,
"loss": 0.0209,
"reward": 1.0,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9375,
"step": 2535
},
{
"completion_length": 222.725,
"epoch": 0.5610082687980563,
"grad_norm": 0.48376942186846866,
"kl": 0.5350341796875,
"learning_rate": 9.614529415427556e-06,
"loss": 0.0214,
"reward": 1.0125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9375,
"step": 2540
},
{
"completion_length": 169.14375,
"epoch": 0.5621126157838793,
"grad_norm": 0.6248488648541215,
"kl": 0.3317138671875,
"learning_rate": 9.576004421056389e-06,
"loss": 0.0133,
"reward": 1.08125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.98125,
"step": 2545
},
{
"completion_length": 173.6875,
"epoch": 0.5632169627697022,
"grad_norm": 0.3043924981907395,
"kl": 0.31737060546875,
"learning_rate": 9.537485729847594e-06,
"loss": 0.0127,
"reward": 1.05625,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.99375,
"step": 2550
},
{
"completion_length": 163.68125,
"epoch": 0.5643213097555252,
"grad_norm": 0.42066464481441374,
"kl": 0.300604248046875,
"learning_rate": 9.498973914424035e-06,
"loss": 0.012,
"reward": 1.0875,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.99375,
"step": 2555
},
{
"completion_length": 182.5625,
"epoch": 0.5654256567413481,
"grad_norm": 0.3979137026903759,
"kl": 0.3016357421875,
"learning_rate": 9.460469547306375e-06,
"loss": 0.0121,
"reward": 1.0625,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 1.0,
"step": 2560
},
{
"completion_length": 186.04375,
"epoch": 0.5665300037271711,
"grad_norm": 0.21641364510353625,
"kl": 0.29324951171875,
"learning_rate": 9.421973200904538e-06,
"loss": 0.0117,
"reward": 1.04375,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.99375,
"step": 2565
},
{
"completion_length": 161.68125,
"epoch": 0.5676343507129941,
"grad_norm": 0.24964356866992105,
"kl": 0.3321533203125,
"learning_rate": 9.38348544750921e-06,
"loss": 0.0133,
"reward": 1.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9875,
"step": 2570
},
{
"completion_length": 187.28125,
"epoch": 0.568738697698817,
"grad_norm": 0.1836091653753328,
"kl": 0.31024169921875,
"learning_rate": 9.345006859283338e-06,
"loss": 0.0124,
"reward": 1.10625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.99375,
"step": 2575
},
{
"completion_length": 212.85,
"epoch": 0.5698430446846399,
"grad_norm": 0.6849145983537044,
"kl": 0.339111328125,
"learning_rate": 9.306538008253611e-06,
"loss": 0.0136,
"reward": 1.08125,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 1.0,
"step": 2580
},
{
"completion_length": 204.25625,
"epoch": 0.5709473916704628,
"grad_norm": 0.8512773917093649,
"kl": 0.32901611328125,
"learning_rate": 9.268079466301978e-06,
"loss": 0.0132,
"reward": 1.11875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.96875,
"step": 2585
},
{
"completion_length": 223.1375,
"epoch": 0.5720517386562858,
"grad_norm": 0.18764654024348235,
"kl": 0.29111328125,
"learning_rate": 9.229631805157116e-06,
"loss": 0.0116,
"reward": 1.05,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.99375,
"step": 2590
},
{
"completion_length": 232.8625,
"epoch": 0.5731560856421087,
"grad_norm": 0.26470565817220254,
"kl": 0.30040283203125,
"learning_rate": 9.19119559638596e-06,
"loss": 0.012,
"reward": 1.0875,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.975,
"step": 2595
},
{
"completion_length": 224.3875,
"epoch": 0.5742604326279317,
"grad_norm": 0.28812544109822696,
"kl": 0.30372314453125,
"learning_rate": 9.15277141138518e-06,
"loss": 0.0121,
"reward": 1.1125,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.99375,
"step": 2600
},
{
"epoch": 0.5742604326279317,
"eval_completion_length": 200.555,
"eval_kl": 0.3039453125,
"eval_loss": 0.012129716575145721,
"eval_reward": 1.095,
"eval_reward_std": 0.12020815074443818,
"eval_rewards/accuracy_reward": 0.105,
"eval_rewards/format_reward": 0.99,
"eval_runtime": 84.0641,
"eval_samples_per_second": 1.178,
"eval_steps_per_second": 0.297,
"step": 2600
},
{
"completion_length": 206.1,
"epoch": 0.5753647796137547,
"grad_norm": 0.647401912936629,
"kl": 0.309521484375,
"learning_rate": 9.114359821372714e-06,
"loss": 0.0124,
"reward": 1.06875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9875,
"step": 2605
},
{
"completion_length": 232.76875,
"epoch": 0.5764691265995776,
"grad_norm": 0.4001821517172711,
"kl": 0.28963623046875,
"learning_rate": 9.075961397379247e-06,
"loss": 0.0116,
"reward": 1.06875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9875,
"step": 2610
},
{
"completion_length": 210.9,
"epoch": 0.5775734735854006,
"grad_norm": 0.46275481774122,
"kl": 0.29266357421875,
"learning_rate": 9.037576710239748e-06,
"loss": 0.0117,
"reward": 1.075,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 1.0,
"step": 2615
},
{
"completion_length": 232.54375,
"epoch": 0.5786778205712235,
"grad_norm": 0.06132598498237229,
"kl": 0.2859619140625,
"learning_rate": 8.999206330584969e-06,
"loss": 0.0114,
"reward": 1.03125,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.99375,
"step": 2620
},
{
"completion_length": 215.91875,
"epoch": 0.5797821675570465,
"grad_norm": 0.3911529231832668,
"kl": 0.29134521484375,
"learning_rate": 8.960850828832958e-06,
"loss": 0.0116,
"reward": 1.10625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.99375,
"step": 2625
},
{
"completion_length": 220.8625,
"epoch": 0.5808865145428693,
"grad_norm": 0.2442930603266259,
"kl": 0.31162109375,
"learning_rate": 8.9225107751806e-06,
"loss": 0.0125,
"reward": 1.06875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9875,
"step": 2630
},
{
"completion_length": 224.05625,
"epoch": 0.5819908615286923,
"grad_norm": 0.09213249286006794,
"kl": 0.3035400390625,
"learning_rate": 8.884186739595114e-06,
"loss": 0.0121,
"reward": 1.04375,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.99375,
"step": 2635
},
{
"completion_length": 207.7,
"epoch": 0.5830952085145152,
"grad_norm": 0.4516706074689143,
"kl": 0.300616455078125,
"learning_rate": 8.845879291805605e-06,
"loss": 0.012,
"reward": 1.05,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9875,
"step": 2640
},
{
"completion_length": 200.91875,
"epoch": 0.5841995555003382,
"grad_norm": 0.5391545499409273,
"kl": 0.29783935546875,
"learning_rate": 8.807589001294571e-06,
"loss": 0.0119,
"reward": 1.05,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.98125,
"step": 2645
},
{
"completion_length": 174.175,
"epoch": 0.5853039024861612,
"grad_norm": 0.7085634992406773,
"kl": 0.31053466796875,
"learning_rate": 8.769316437289456e-06,
"loss": 0.0124,
"reward": 1.0625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.96875,
"step": 2650
},
{
"completion_length": 181.50625,
"epoch": 0.5864082494719841,
"grad_norm": 0.4855936312310574,
"kl": 0.33160400390625,
"learning_rate": 8.731062168754174e-06,
"loss": 0.0133,
"reward": 1.075,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.9875,
"step": 2655
},
{
"completion_length": 195.2625,
"epoch": 0.5875125964578071,
"grad_norm": 0.5682639526974919,
"kl": 0.359893798828125,
"learning_rate": 8.692826764380662e-06,
"loss": 0.0144,
"reward": 1.05625,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.98125,
"step": 2660
},
{
"completion_length": 208.2875,
"epoch": 0.58861694344363,
"grad_norm": 0.4588795280012722,
"kl": 0.34942626953125,
"learning_rate": 8.654610792580415e-06,
"loss": 0.014,
"reward": 1.0125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.04375,
"rewards/format_reward": 0.96875,
"step": 2665
},
{
"completion_length": 168.025,
"epoch": 0.589721290429453,
"grad_norm": 0.500593183120369,
"kl": 0.36273193359375,
"learning_rate": 8.616414821476048e-06,
"loss": 0.0145,
"reward": 1.025,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9625,
"step": 2670
},
{
"completion_length": 183.0375,
"epoch": 0.5908256374152758,
"grad_norm": 0.5184938308817854,
"kl": 0.3274658203125,
"learning_rate": 8.57823941889284e-06,
"loss": 0.0131,
"reward": 1.04375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.98125,
"step": 2675
},
{
"completion_length": 158.6,
"epoch": 0.5919299844010988,
"grad_norm": 0.2914033499028057,
"kl": 0.289453125,
"learning_rate": 8.54008515235029e-06,
"loss": 0.0116,
"reward": 1.14375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.99375,
"step": 2680
},
{
"completion_length": 166.40625,
"epoch": 0.5930343313869217,
"grad_norm": 0.32731329886172095,
"kl": 0.30450439453125,
"learning_rate": 8.501952589053694e-06,
"loss": 0.0122,
"reward": 1.05,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 1.0,
"step": 2685
},
{
"completion_length": 156.31875,
"epoch": 0.5941386783727447,
"grad_norm": 0.09256103441005661,
"kl": 0.3406005859375,
"learning_rate": 8.463842295885712e-06,
"loss": 0.0136,
"reward": 1.01875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0375,
"rewards/format_reward": 0.98125,
"step": 2690
},
{
"completion_length": 183.61875,
"epoch": 0.5952430253585677,
"grad_norm": 0.5104668071449497,
"kl": 0.309075927734375,
"learning_rate": 8.425754839397917e-06,
"loss": 0.0124,
"reward": 1.0875,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.96875,
"step": 2695
},
{
"completion_length": 191.0875,
"epoch": 0.5963473723443906,
"grad_norm": 0.47594505089710887,
"kl": 0.30067138671875,
"learning_rate": 8.387690785802403e-06,
"loss": 0.012,
"reward": 1.05625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.96875,
"step": 2700
},
{
"epoch": 0.5963473723443906,
"eval_completion_length": 200.535,
"eval_kl": 0.33861328125,
"eval_loss": 0.013564695604145527,
"eval_reward": 1.105,
"eval_reward_std": 0.20506096243858338,
"eval_rewards/accuracy_reward": 0.15,
"eval_rewards/format_reward": 0.955,
"eval_runtime": 104.0245,
"eval_samples_per_second": 0.952,
"eval_steps_per_second": 0.24,
"step": 2700
},
{
"completion_length": 181.2,
"epoch": 0.5974517193302136,
"grad_norm": 0.38636194379006417,
"kl": 0.30714111328125,
"learning_rate": 8.349650700963346e-06,
"loss": 0.0123,
"reward": 1.14375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.99375,
"step": 2705
},
{
"completion_length": 216.9,
"epoch": 0.5985560663160365,
"grad_norm": 0.21083257411179623,
"kl": 0.30963134765625,
"learning_rate": 8.311635150388607e-06,
"loss": 0.0124,
"reward": 1.025,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9625,
"step": 2710
},
{
"completion_length": 209.09375,
"epoch": 0.5996604133018595,
"grad_norm": 0.254195239435785,
"kl": 0.34473876953125,
"learning_rate": 8.273644699221309e-06,
"loss": 0.0138,
"reward": 1.0375,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.975,
"step": 2715
},
{
"completion_length": 210.60625,
"epoch": 0.6007647602876824,
"grad_norm": 0.2751894182054688,
"kl": 0.315087890625,
"learning_rate": 8.235679912231456e-06,
"loss": 0.0126,
"reward": 1.01875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.96875,
"step": 2720
},
{
"completion_length": 184.66875,
"epoch": 0.6018691072735053,
"grad_norm": 0.5013893987512199,
"kl": 0.300457763671875,
"learning_rate": 8.197741353807515e-06,
"loss": 0.012,
"reward": 1.1,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.98125,
"step": 2725
},
{
"completion_length": 209.50625,
"epoch": 0.6029734542593282,
"grad_norm": 0.2668494553767842,
"kl": 0.343133544921875,
"learning_rate": 8.159829587948048e-06,
"loss": 0.0137,
"reward": 1.075,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.95625,
"step": 2730
},
{
"completion_length": 217.9875,
"epoch": 0.6040778012451512,
"grad_norm": 1.1790834992221495,
"kl": 0.34041748046875,
"learning_rate": 8.1219451782533e-06,
"loss": 0.0136,
"reward": 1.01875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.95625,
"step": 2735
},
{
"completion_length": 199.21875,
"epoch": 0.6051821482309742,
"grad_norm": 0.38803314249531284,
"kl": 0.31158447265625,
"learning_rate": 8.084088687916853e-06,
"loss": 0.0125,
"reward": 1.04375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.975,
"step": 2740
},
{
"completion_length": 185.13125,
"epoch": 0.6062864952167971,
"grad_norm": 0.5365344661762009,
"kl": 0.3183349609375,
"learning_rate": 8.046260679717225e-06,
"loss": 0.0127,
"reward": 1.1,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.99375,
"step": 2745
},
{
"completion_length": 195.78125,
"epoch": 0.6073908422026201,
"grad_norm": 0.5602246756166963,
"kl": 0.361181640625,
"learning_rate": 8.00846171600952e-06,
"loss": 0.0144,
"reward": 1.05625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.96875,
"step": 2750
},
{
"completion_length": 174.98125,
"epoch": 0.608495189188443,
"grad_norm": 0.45096759701250927,
"kl": 0.326898193359375,
"learning_rate": 7.970692358717067e-06,
"loss": 0.0131,
"reward": 1.0625,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.99375,
"step": 2755
},
{
"completion_length": 219.3125,
"epoch": 0.609599536174266,
"grad_norm": 0.52626187470055,
"kl": 0.349560546875,
"learning_rate": 7.932953169323057e-06,
"loss": 0.014,
"reward": 1.06875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9875,
"step": 2760
},
{
"completion_length": 220.175,
"epoch": 0.610703883160089,
"grad_norm": 0.3604233546767503,
"kl": 0.31280517578125,
"learning_rate": 7.895244708862204e-06,
"loss": 0.0125,
"reward": 1.0625,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.99375,
"step": 2765
},
{
"completion_length": 207.06875,
"epoch": 0.6118082301459118,
"grad_norm": 0.5288120469694233,
"kl": 0.33223876953125,
"learning_rate": 7.857567537912404e-06,
"loss": 0.0133,
"reward": 1.05625,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9875,
"step": 2770
},
{
"completion_length": 236.71875,
"epoch": 0.6129125771317347,
"grad_norm": 0.10425504560684959,
"kl": 0.32779541015625,
"learning_rate": 7.8199222165864e-06,
"loss": 0.0131,
"reward": 1.05625,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9875,
"step": 2775
},
{
"completion_length": 207.39375,
"epoch": 0.6140169241175577,
"grad_norm": 0.2589149844532793,
"kl": 0.283251953125,
"learning_rate": 7.78230930452345e-06,
"loss": 0.0113,
"reward": 1.05,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 1.0,
"step": 2780
},
{
"completion_length": 242.43125,
"epoch": 0.6151212711033807,
"grad_norm": 0.22583571089563595,
"kl": 0.27052001953125,
"learning_rate": 7.744729360881023e-06,
"loss": 0.0108,
"reward": 1.11875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.99375,
"step": 2785
},
{
"completion_length": 231.89375,
"epoch": 0.6162256180892036,
"grad_norm": 0.4749669282378495,
"kl": 0.278765869140625,
"learning_rate": 7.70718294432646e-06,
"loss": 0.0111,
"reward": 1.08125,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 1.0,
"step": 2790
},
{
"completion_length": 245.79375,
"epoch": 0.6173299650750266,
"grad_norm": 0.3617583523975815,
"kl": 0.27476806640625,
"learning_rate": 7.669670613028705e-06,
"loss": 0.011,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.98125,
"step": 2795
},
{
"completion_length": 227.44375,
"epoch": 0.6184343120608495,
"grad_norm": 0.2965537312612653,
"kl": 0.28612060546875,
"learning_rate": 7.632192924649969e-06,
"loss": 0.0114,
"reward": 1.08125,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 1.0,
"step": 2800
},
{
"epoch": 0.6184343120608495,
"eval_completion_length": 230.735,
"eval_kl": 0.3071484375,
"eval_loss": 0.01229775045067072,
"eval_reward": 1.095,
"eval_reward_std": 0.13435028612613678,
"eval_rewards/accuracy_reward": 0.11,
"eval_rewards/format_reward": 0.985,
"eval_runtime": 108.6588,
"eval_samples_per_second": 0.911,
"eval_steps_per_second": 0.23,
"step": 2800
},
{
"completion_length": 239.04375,
"epoch": 0.6195386590466725,
"grad_norm": 0.5214893247053585,
"kl": 0.2989501953125,
"learning_rate": 7.594750436337467e-06,
"loss": 0.012,
"reward": 1.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 1.0,
"step": 2805
},
{
"completion_length": 233.93125,
"epoch": 0.6206430060324954,
"grad_norm": 0.4598794915978221,
"kl": 0.29241943359375,
"learning_rate": 7.557343704715121e-06,
"loss": 0.0117,
"reward": 1.05625,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.99375,
"step": 2810
},
{
"completion_length": 242.64375,
"epoch": 0.6217473530183184,
"grad_norm": 3.1244852402266345,
"kl": 0.31943359375,
"learning_rate": 7.519973285875303e-06,
"loss": 0.0128,
"reward": 1.0625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.975,
"step": 2815
},
{
"completion_length": 236.81875,
"epoch": 0.6228517000041413,
"grad_norm": 0.5414981338856611,
"kl": 0.3600830078125,
"learning_rate": 7.482639735370536e-06,
"loss": 0.0144,
"reward": 1.03125,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.98125,
"step": 2820
},
{
"completion_length": 279.33125,
"epoch": 0.6239560469899642,
"grad_norm": 0.8596187860646796,
"kl": 0.40133056640625,
"learning_rate": 7.445343608205273e-06,
"loss": 0.0161,
"reward": 1.01875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9375,
"step": 2825
},
{
"completion_length": 247.0875,
"epoch": 0.6250603939757872,
"grad_norm": 0.5780188447264637,
"kl": 0.3927734375,
"learning_rate": 7.408085458827612e-06,
"loss": 0.0157,
"reward": 1.05,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.95625,
"step": 2830
},
{
"completion_length": 249.1,
"epoch": 0.6261647409616101,
"grad_norm": 0.316789753877662,
"kl": 0.38126220703125,
"learning_rate": 7.37086584112108e-06,
"loss": 0.0153,
"reward": 1.05,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.95625,
"step": 2835
},
{
"completion_length": 252.71875,
"epoch": 0.6272690879474331,
"grad_norm": 0.4453525406044762,
"kl": 0.37978515625,
"learning_rate": 7.333685308396383e-06,
"loss": 0.0152,
"reward": 1.0125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9375,
"step": 2840
},
{
"completion_length": 195.29375,
"epoch": 0.628373434933256,
"grad_norm": 0.6694404625243146,
"kl": 0.33199462890625,
"learning_rate": 7.2965444133831905e-06,
"loss": 0.0133,
"reward": 1.04375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.975,
"step": 2845
},
{
"completion_length": 203.05,
"epoch": 0.629477781919079,
"grad_norm": 0.5384625756146423,
"kl": 0.3214599609375,
"learning_rate": 7.2594437082219074e-06,
"loss": 0.0129,
"reward": 1.05,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.975,
"step": 2850
},
{
"completion_length": 181.36875,
"epoch": 0.630582128904902,
"grad_norm": 0.5857866433127917,
"kl": 0.31610107421875,
"learning_rate": 7.222383744455477e-06,
"loss": 0.0126,
"reward": 1.10625,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 1.0,
"step": 2855
},
{
"completion_length": 192.7625,
"epoch": 0.6316864758907249,
"grad_norm": 0.5306577984285455,
"kl": 0.2734375,
"learning_rate": 7.185365073021171e-06,
"loss": 0.0109,
"reward": 1.10625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9875,
"step": 2860
},
{
"completion_length": 186.95625,
"epoch": 0.6327908228765479,
"grad_norm": 0.25146113191688085,
"kl": 0.2825439453125,
"learning_rate": 7.148388244242414e-06,
"loss": 0.0113,
"reward": 1.05625,
"reward_std": 0.04419417306780815,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 1.0,
"step": 2865
},
{
"completion_length": 188.88125,
"epoch": 0.6338951698623707,
"grad_norm": 0.29446615942878507,
"kl": 0.28685302734375,
"learning_rate": 7.111453807820587e-06,
"loss": 0.0115,
"reward": 1.0875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9875,
"step": 2870
},
{
"completion_length": 215.34375,
"epoch": 0.6349995168481937,
"grad_norm": 0.11271213250290812,
"kl": 0.24910888671875,
"learning_rate": 7.0745623128268605e-06,
"loss": 0.01,
"reward": 1.05,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9875,
"step": 2875
},
{
"completion_length": 220.21875,
"epoch": 0.6361038638340166,
"grad_norm": 0.13827817934352812,
"kl": 0.265997314453125,
"learning_rate": 7.037714307694038e-06,
"loss": 0.0106,
"reward": 1.05,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.9875,
"step": 2880
},
{
"completion_length": 224.525,
"epoch": 0.6372082108198396,
"grad_norm": 0.3760142764874246,
"kl": 0.30487060546875,
"learning_rate": 7.000910340208393e-06,
"loss": 0.0122,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.98125,
"step": 2885
},
{
"completion_length": 229.125,
"epoch": 0.6383125578056625,
"grad_norm": 0.4800060583908338,
"kl": 0.28165283203125,
"learning_rate": 6.964150957501538e-06,
"loss": 0.0113,
"reward": 1.0875,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.975,
"step": 2890
},
{
"completion_length": 259.29375,
"epoch": 0.6394169047914855,
"grad_norm": 0.46816183628445984,
"kl": 0.2957763671875,
"learning_rate": 6.927436706042276e-06,
"loss": 0.0118,
"reward": 1.11875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.9875,
"step": 2895
},
{
"completion_length": 290.36875,
"epoch": 0.6405212517773085,
"grad_norm": 0.4620742885523695,
"kl": 0.291473388671875,
"learning_rate": 6.890768131628492e-06,
"loss": 0.0117,
"reward": 1.04375,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.95625,
"step": 2900
},
{
"epoch": 0.6405212517773085,
"eval_completion_length": 255.545,
"eval_kl": 0.29716796875,
"eval_loss": 0.01189742237329483,
"eval_reward": 1.085,
"eval_reward_std": 0.13435028612613678,
"eval_rewards/accuracy_reward": 0.1,
"eval_rewards/format_reward": 0.985,
"eval_runtime": 124.9105,
"eval_samples_per_second": 0.793,
"eval_steps_per_second": 0.2,
"step": 2900
},
{
"completion_length": 284.0625,
"epoch": 0.6416255987631314,
"grad_norm": 0.3834368146178332,
"kl": 0.2900390625,
"learning_rate": 6.8541457793790204e-06,
"loss": 0.0116,
"reward": 1.0375,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9625,
"step": 2905
},
{
"completion_length": 251.69375,
"epoch": 0.6427299457489544,
"grad_norm": 0.2517910604330981,
"kl": 0.2760498046875,
"learning_rate": 6.8175701937255645e-06,
"loss": 0.011,
"reward": 1.03125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.975,
"step": 2910
},
{
"completion_length": 264.5,
"epoch": 0.6438342927347772,
"grad_norm": 0.2750608123545671,
"kl": 0.2783935546875,
"learning_rate": 6.781041918404578e-06,
"loss": 0.0111,
"reward": 1.08125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.975,
"step": 2915
},
{
"completion_length": 258.33125,
"epoch": 0.6449386397206002,
"grad_norm": 0.3654656487363651,
"kl": 0.2917236328125,
"learning_rate": 6.744561496449208e-06,
"loss": 0.0117,
"reward": 1.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9875,
"step": 2920
},
{
"completion_length": 245.49375,
"epoch": 0.6460429867064231,
"grad_norm": 0.6036458859995204,
"kl": 0.267669677734375,
"learning_rate": 6.708129470181197e-06,
"loss": 0.0107,
"reward": 1.15,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 1.0,
"step": 2925
},
{
"completion_length": 265.6875,
"epoch": 0.6471473336922461,
"grad_norm": 0.7309833094166883,
"kl": 0.26925048828125,
"learning_rate": 6.671746381202835e-06,
"loss": 0.0108,
"reward": 1.08125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9875,
"step": 2930
},
{
"completion_length": 252.36875,
"epoch": 0.648251680678069,
"grad_norm": 0.42288290479080043,
"kl": 0.2704833984375,
"learning_rate": 6.635412770388911e-06,
"loss": 0.0108,
"reward": 1.125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 1.0,
"step": 2935
},
{
"completion_length": 247.08125,
"epoch": 0.649356027663892,
"grad_norm": 0.49374444365360365,
"kl": 0.27628173828125,
"learning_rate": 6.5991291778786556e-06,
"loss": 0.0111,
"reward": 1.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.98125,
"step": 2940
},
{
"completion_length": 233.7375,
"epoch": 0.650460374649715,
"grad_norm": 0.5239819846390583,
"kl": 0.28341064453125,
"learning_rate": 6.562896143067734e-06,
"loss": 0.0113,
"reward": 1.1125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9875,
"step": 2945
},
{
"completion_length": 255.7,
"epoch": 0.6515647216355379,
"grad_norm": 0.43425079053320653,
"kl": 0.2722412109375,
"learning_rate": 6.526714204600212e-06,
"loss": 0.0109,
"reward": 1.10625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.99375,
"step": 2950
},
{
"completion_length": 249.475,
"epoch": 0.6526690686213609,
"grad_norm": 0.25987542033307665,
"kl": 0.27745361328125,
"learning_rate": 6.490583900360543e-06,
"loss": 0.0111,
"reward": 1.06875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.98125,
"step": 2955
},
{
"completion_length": 273.275,
"epoch": 0.6537734156071838,
"grad_norm": 0.45996917419712435,
"kl": 0.292822265625,
"learning_rate": 6.4545057674655954e-06,
"loss": 0.0117,
"reward": 1.1,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.95625,
"step": 2960
},
{
"completion_length": 268.3625,
"epoch": 0.6548777625930067,
"grad_norm": 0.5624671363615396,
"kl": 0.32144775390625,
"learning_rate": 6.418480342256635e-06,
"loss": 0.0129,
"reward": 1.09375,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.95625,
"step": 2965
},
{
"completion_length": 269.64375,
"epoch": 0.6559821095788296,
"grad_norm": 0.32479361186684463,
"kl": 0.27386474609375,
"learning_rate": 6.38250816029139e-06,
"loss": 0.011,
"reward": 1.0375,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.94375,
"step": 2970
},
{
"completion_length": 232.7375,
"epoch": 0.6570864565646526,
"grad_norm": 0.4550467921271227,
"kl": 0.25955810546875,
"learning_rate": 6.34658975633605e-06,
"loss": 0.0104,
"reward": 1.09375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.9875,
"step": 2975
},
{
"completion_length": 237.9875,
"epoch": 0.6581908035504755,
"grad_norm": 0.5807970338558914,
"kl": 0.282135009765625,
"learning_rate": 6.310725664357349e-06,
"loss": 0.0113,
"reward": 1.05,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.95,
"step": 2980
},
{
"completion_length": 246.35625,
"epoch": 0.6592951505362985,
"grad_norm": 0.1538823043299478,
"kl": 0.321832275390625,
"learning_rate": 6.274916417514605e-06,
"loss": 0.0129,
"reward": 1.08125,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.98125,
"step": 2985
},
{
"completion_length": 265.53125,
"epoch": 0.6603994975221215,
"grad_norm": 0.41231875233030124,
"kl": 0.287841796875,
"learning_rate": 6.239162548151809e-06,
"loss": 0.0115,
"reward": 1.14375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.99375,
"step": 2990
},
{
"completion_length": 282.34375,
"epoch": 0.6615038445079444,
"grad_norm": 0.4537223091285934,
"kl": 0.29276123046875,
"learning_rate": 6.2034645877897e-06,
"loss": 0.0117,
"reward": 1.1125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.95625,
"step": 2995
},
{
"completion_length": 251.75,
"epoch": 0.6626081914937674,
"grad_norm": 0.5005714464427982,
"kl": 0.30860595703125,
"learning_rate": 6.167823067117868e-06,
"loss": 0.0123,
"reward": 1.1125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.975,
"step": 3000
},
{
"epoch": 0.6626081914937674,
"eval_completion_length": 255.26,
"eval_kl": 0.29595703125,
"eval_loss": 0.011857852339744568,
"eval_reward": 1.14,
"eval_reward_std": 0.12727921843528747,
"eval_rewards/accuracy_reward": 0.16,
"eval_rewards/format_reward": 0.98,
"eval_runtime": 109.8303,
"eval_samples_per_second": 0.901,
"eval_steps_per_second": 0.228,
"step": 3000
},
{
"completion_length": 300.43125,
"epoch": 0.6637125384795903,
"grad_norm": 0.32746973812290947,
"kl": 0.308380126953125,
"learning_rate": 6.132238515986868e-06,
"loss": 0.0123,
"reward": 1.04375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.94375,
"step": 3005
},
{
"completion_length": 243.6375,
"epoch": 0.6648168854654133,
"grad_norm": 0.4677149628391746,
"kl": 0.294287109375,
"learning_rate": 6.096711463400333e-06,
"loss": 0.0118,
"reward": 1.1125,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.98125,
"step": 3010
},
{
"completion_length": 247.98125,
"epoch": 0.6659212324512361,
"grad_norm": 0.5439611294896475,
"kl": 0.281396484375,
"learning_rate": 6.061242437507131e-06,
"loss": 0.0113,
"reward": 1.14375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1625,
"rewards/format_reward": 0.98125,
"step": 3015
},
{
"completion_length": 275.20625,
"epoch": 0.6670255794370591,
"grad_norm": 0.4749719977776336,
"kl": 0.31934814453125,
"learning_rate": 6.025831965593479e-06,
"loss": 0.0128,
"reward": 1.1125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.95625,
"step": 3020
},
{
"completion_length": 256.54375,
"epoch": 0.668129926422882,
"grad_norm": 0.3744944357004974,
"kl": 0.32305908203125,
"learning_rate": 5.990480574075143e-06,
"loss": 0.0129,
"reward": 1.00625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.94375,
"step": 3025
},
{
"completion_length": 245.075,
"epoch": 0.669234273408705,
"grad_norm": 0.36682653313436625,
"kl": 0.2878173828125,
"learning_rate": 5.955188788489583e-06,
"loss": 0.0115,
"reward": 1.0375,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.95,
"step": 3030
},
{
"completion_length": 228.79375,
"epoch": 0.670338620394528,
"grad_norm": 0.5471531880195812,
"kl": 0.2614013671875,
"learning_rate": 5.919957133488155e-06,
"loss": 0.0105,
"reward": 1.0625,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.975,
"step": 3035
},
{
"completion_length": 281.49375,
"epoch": 0.6714429673803509,
"grad_norm": 0.44721939894789436,
"kl": 0.313287353515625,
"learning_rate": 5.884786132828304e-06,
"loss": 0.0125,
"reward": 1.05625,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9375,
"step": 3040
},
{
"completion_length": 220.5375,
"epoch": 0.6725473143661739,
"grad_norm": 0.4903267420807382,
"kl": 0.280303955078125,
"learning_rate": 5.849676309365786e-06,
"loss": 0.0112,
"reward": 1.0625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.95625,
"step": 3045
},
{
"completion_length": 213.80625,
"epoch": 0.6736516613519968,
"grad_norm": 0.6637137632075046,
"kl": 0.3021484375,
"learning_rate": 5.814628185046884e-06,
"loss": 0.0121,
"reward": 1.075,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.94375,
"step": 3050
},
{
"completion_length": 193.11875,
"epoch": 0.6747560083378198,
"grad_norm": 0.48238872145583594,
"kl": 0.3324951171875,
"learning_rate": 5.779642280900668e-06,
"loss": 0.0133,
"reward": 1.05625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.96875,
"step": 3055
},
{
"completion_length": 157.91875,
"epoch": 0.6758603553236426,
"grad_norm": 0.3447631853492008,
"kl": 0.324847412109375,
"learning_rate": 5.744719117031217e-06,
"loss": 0.013,
"reward": 1.08125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.975,
"step": 3060
},
{
"completion_length": 142.34375,
"epoch": 0.6769647023094656,
"grad_norm": 0.2144181948765553,
"kl": 0.333251953125,
"learning_rate": 5.709859212609919e-06,
"loss": 0.0133,
"reward": 1.075,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.98125,
"step": 3065
},
{
"completion_length": 142.88125,
"epoch": 0.6780690492952886,
"grad_norm": 2.1794877397302637,
"kl": 0.31337890625,
"learning_rate": 5.675063085867747e-06,
"loss": 0.0125,
"reward": 1.0375,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.9875,
"step": 3070
},
{
"completion_length": 167.0625,
"epoch": 0.6791733962811115,
"grad_norm": 0.5227859895364816,
"kl": 0.31148681640625,
"learning_rate": 5.6403312540875325e-06,
"loss": 0.0125,
"reward": 1.075,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.98125,
"step": 3075
},
{
"completion_length": 183.3,
"epoch": 0.6802777432669345,
"grad_norm": 0.5037852453688297,
"kl": 0.3081787109375,
"learning_rate": 5.6056642335963e-06,
"loss": 0.0123,
"reward": 1.08125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.96875,
"step": 3080
},
{
"completion_length": 170.60625,
"epoch": 0.6813820902527574,
"grad_norm": 0.39645655269998004,
"kl": 0.38388671875,
"learning_rate": 5.571062539757582e-06,
"loss": 0.0154,
"reward": 1.0875,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9625,
"step": 3085
},
{
"completion_length": 178.66875,
"epoch": 0.6824864372385804,
"grad_norm": 0.3386051871333338,
"kl": 0.353631591796875,
"learning_rate": 5.536526686963762e-06,
"loss": 0.0141,
"reward": 1.08125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9625,
"step": 3090
},
{
"completion_length": 179.43125,
"epoch": 0.6835907842244033,
"grad_norm": 0.1674625268129377,
"kl": 0.311474609375,
"learning_rate": 5.50205718862841e-06,
"loss": 0.0125,
"reward": 1.09375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.9875,
"step": 3095
},
{
"completion_length": 198.5,
"epoch": 0.6846951312102263,
"grad_norm": 896.8738993114102,
"kl": 5.80845947265625,
"learning_rate": 5.467654557178679e-06,
"loss": 0.2331,
"reward": 1.0375,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9625,
"step": 3100
},
{
"epoch": 0.6846951312102263,
"eval_completion_length": 212.23,
"eval_kl": 0.383203125,
"eval_loss": 0.015370451845228672,
"eval_reward": 1.08,
"eval_reward_std": 0.15556348919868468,
"eval_rewards/accuracy_reward": 0.12,
"eval_rewards/format_reward": 0.96,
"eval_runtime": 115.3303,
"eval_samples_per_second": 0.858,
"eval_steps_per_second": 0.217,
"step": 3100
},
{
"completion_length": 191.40625,
"epoch": 0.6857994781960493,
"grad_norm": 0.23939982985583666,
"kl": 0.347125244140625,
"learning_rate": 5.433319304047666e-06,
"loss": 0.0139,
"reward": 1.0375,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.96875,
"step": 3105
},
{
"completion_length": 196.55,
"epoch": 0.6869038251818721,
"grad_norm": 0.35370609706150546,
"kl": 0.33798828125,
"learning_rate": 5.399051939666817e-06,
"loss": 0.0135,
"reward": 1.0875,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.975,
"step": 3110
},
{
"completion_length": 184.975,
"epoch": 0.688008172167695,
"grad_norm": 0.35760764478630763,
"kl": 0.30341796875,
"learning_rate": 5.36485297345833e-06,
"loss": 0.0121,
"reward": 1.1125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9875,
"step": 3115
},
{
"completion_length": 203.2,
"epoch": 0.689112519153518,
"grad_norm": 0.644558524459656,
"kl": 0.292742919921875,
"learning_rate": 5.330722913827594e-06,
"loss": 0.0117,
"reward": 1.0625,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.96875,
"step": 3120
},
{
"completion_length": 230.50625,
"epoch": 0.690216866139341,
"grad_norm": 0.3464456804053971,
"kl": 0.27896728515625,
"learning_rate": 5.29666226815563e-06,
"loss": 0.0112,
"reward": 1.1125,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.98125,
"step": 3125
},
{
"completion_length": 221.98125,
"epoch": 0.6913212131251639,
"grad_norm": 0.20402330430464163,
"kl": 0.286383056640625,
"learning_rate": 5.262671542791531e-06,
"loss": 0.0115,
"reward": 1.0,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.01875,
"rewards/format_reward": 0.98125,
"step": 3130
},
{
"completion_length": 218.54375,
"epoch": 0.6924255601109869,
"grad_norm": 0.5547640484775744,
"kl": 0.2820068359375,
"learning_rate": 5.228751243044961e-06,
"loss": 0.0113,
"reward": 1.04375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9625,
"step": 3135
},
{
"completion_length": 207.03125,
"epoch": 0.6935299070968098,
"grad_norm": 0.5409399988975135,
"kl": 0.27630615234375,
"learning_rate": 5.194901873178622e-06,
"loss": 0.0111,
"reward": 1.075,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 1.0,
"step": 3140
},
{
"completion_length": 188.34375,
"epoch": 0.6946342540826328,
"grad_norm": 0.15055443349583902,
"kl": 0.267608642578125,
"learning_rate": 5.1611239364007694e-06,
"loss": 0.0107,
"reward": 1.075,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.98125,
"step": 3145
},
{
"completion_length": 183.2125,
"epoch": 0.6957386010684558,
"grad_norm": 0.4919745608219073,
"kl": 0.27034912109375,
"learning_rate": 5.127417934857718e-06,
"loss": 0.0108,
"reward": 1.08125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.975,
"step": 3150
},
{
"completion_length": 193.4875,
"epoch": 0.6968429480542786,
"grad_norm": 0.08730250985788413,
"kl": 0.28955078125,
"learning_rate": 5.093784369626397e-06,
"loss": 0.0116,
"reward": 1.1125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9875,
"step": 3155
},
{
"completion_length": 209.58125,
"epoch": 0.6979472950401016,
"grad_norm": 0.32846616068602047,
"kl": 0.298028564453125,
"learning_rate": 5.060223740706883e-06,
"loss": 0.0119,
"reward": 1.06875,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9875,
"step": 3160
},
{
"completion_length": 219.4625,
"epoch": 0.6990516420259245,
"grad_norm": 1.1444441188491123,
"kl": 0.36575927734375,
"learning_rate": 5.026736547014981e-06,
"loss": 0.0146,
"reward": 1.01875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.9625,
"step": 3165
},
{
"completion_length": 239.91875,
"epoch": 0.7001559890117475,
"grad_norm": 0.8099775678937355,
"kl": 0.308984375,
"learning_rate": 4.993323286374787e-06,
"loss": 0.0124,
"reward": 1.025,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.95,
"step": 3170
},
{
"completion_length": 219.4125,
"epoch": 0.7012603359975704,
"grad_norm": 0.4265583411464448,
"kl": 0.29287109375,
"learning_rate": 4.959984455511313e-06,
"loss": 0.0117,
"reward": 1.08125,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.99375,
"step": 3175
},
{
"completion_length": 231.3125,
"epoch": 0.7023646829833934,
"grad_norm": 0.454103943617819,
"kl": 0.29195556640625,
"learning_rate": 4.926720550043089e-06,
"loss": 0.0117,
"reward": 1.06875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.98125,
"step": 3180
},
{
"completion_length": 197.88125,
"epoch": 0.7034690299692163,
"grad_norm": 0.4885755590367494,
"kl": 0.25499267578125,
"learning_rate": 4.893532064474787e-06,
"loss": 0.0102,
"reward": 1.08125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 1.0,
"step": 3185
},
{
"completion_length": 227.78125,
"epoch": 0.7045733769550393,
"grad_norm": 0.2719638893519356,
"kl": 0.270703125,
"learning_rate": 4.860419492189886e-06,
"loss": 0.0108,
"reward": 1.06875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.975,
"step": 3190
},
{
"completion_length": 216.89375,
"epoch": 0.7056777239408623,
"grad_norm": 0.6224163001095495,
"kl": 0.29207763671875,
"learning_rate": 4.827383325443331e-06,
"loss": 0.0117,
"reward": 1.05,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.9625,
"step": 3195
},
{
"completion_length": 260.0,
"epoch": 0.7067820709266852,
"grad_norm": 0.4086697490241937,
"kl": 0.273291015625,
"learning_rate": 4.794424055354213e-06,
"loss": 0.0109,
"reward": 1.08125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9625,
"step": 3200
},
{
"epoch": 0.7067820709266852,
"eval_completion_length": 267.11,
"eval_kl": 0.30763671875,
"eval_loss": 0.012311533093452454,
"eval_reward": 1.09,
"eval_reward_std": 0.21213203012943269,
"eval_rewards/accuracy_reward": 0.14,
"eval_rewards/format_reward": 0.95,
"eval_runtime": 127.8367,
"eval_samples_per_second": 0.774,
"eval_steps_per_second": 0.196,
"step": 3200
},
{
"completion_length": 246.5875,
"epoch": 0.7078864179125081,
"grad_norm": 1.0355398852634203,
"kl": 0.26212158203125,
"learning_rate": 4.761542171898469e-06,
"loss": 0.0105,
"reward": 1.0625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.975,
"step": 3205
},
{
"completion_length": 282.29375,
"epoch": 0.708990764898331,
"grad_norm": 0.5137021254058943,
"kl": 0.3453125,
"learning_rate": 4.728738163901597e-06,
"loss": 0.0138,
"reward": 1.03125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.94375,
"step": 3210
},
{
"completion_length": 272.9,
"epoch": 0.710095111884154,
"grad_norm": 0.4557603593431217,
"kl": 0.3007080078125,
"learning_rate": 4.696012519031397e-06,
"loss": 0.012,
"reward": 1.06875,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95625,
"step": 3215
},
{
"completion_length": 300.46875,
"epoch": 0.7111994588699769,
"grad_norm": 0.9407435380966918,
"kl": 0.3644775390625,
"learning_rate": 4.663365723790698e-06,
"loss": 0.0146,
"reward": 1.0,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.90625,
"step": 3220
},
{
"completion_length": 282.19375,
"epoch": 0.7123038058557999,
"grad_norm": 0.48300892660454786,
"kl": 0.3069580078125,
"learning_rate": 4.630798263510162e-06,
"loss": 0.0123,
"reward": 1.05625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.95,
"step": 3225
},
{
"completion_length": 234.26875,
"epoch": 0.7134081528416228,
"grad_norm": 0.5153762702568343,
"kl": 0.33831787109375,
"learning_rate": 4.598310622341037e-06,
"loss": 0.0135,
"reward": 1.08125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9625,
"step": 3230
},
{
"completion_length": 283.26875,
"epoch": 0.7145124998274458,
"grad_norm": 0.2278004212721197,
"kl": 0.2323486328125,
"learning_rate": 4.565903283247981e-06,
"loss": 0.0093,
"reward": 1.13125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.975,
"step": 3235
},
{
"completion_length": 254.85,
"epoch": 0.7156168468132688,
"grad_norm": 0.4057412022574356,
"kl": 0.224237060546875,
"learning_rate": 4.533576728001858e-06,
"loss": 0.009,
"reward": 1.11875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.98125,
"step": 3240
},
{
"completion_length": 243.23125,
"epoch": 0.7167211937990917,
"grad_norm": 0.3936093803196274,
"kl": 0.2511962890625,
"learning_rate": 4.501331437172606e-06,
"loss": 0.01,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9625,
"step": 3245
},
{
"completion_length": 261.04375,
"epoch": 0.7178255407849147,
"grad_norm": 0.15034549538860667,
"kl": 0.28284912109375,
"learning_rate": 4.469167890122073e-06,
"loss": 0.0113,
"reward": 1.03125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.94375,
"step": 3250
},
{
"completion_length": 270.925,
"epoch": 0.7189298877707375,
"grad_norm": 0.5028341851811142,
"kl": 0.24959716796875,
"learning_rate": 4.437086564996891e-06,
"loss": 0.01,
"reward": 1.0375,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.95625,
"step": 3255
},
{
"completion_length": 264.66875,
"epoch": 0.7200342347565605,
"grad_norm": 0.5870825869850653,
"kl": 0.26156005859375,
"learning_rate": 4.405087938721376e-06,
"loss": 0.0105,
"reward": 1.0375,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.93125,
"step": 3260
},
{
"completion_length": 333.51875,
"epoch": 0.7211385817423834,
"grad_norm": 0.5363985927856229,
"kl": 0.268310546875,
"learning_rate": 4.373172486990436e-06,
"loss": 0.0107,
"reward": 1.03125,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9125,
"step": 3265
},
{
"completion_length": 259.7625,
"epoch": 0.7222429287282064,
"grad_norm": 0.34294137834570276,
"kl": 0.25391845703125,
"learning_rate": 4.341340684262498e-06,
"loss": 0.0102,
"reward": 1.05625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9625,
"step": 3270
},
{
"completion_length": 279.15625,
"epoch": 0.7233472757140293,
"grad_norm": 0.45234678819267615,
"kl": 0.2611572265625,
"learning_rate": 4.309593003752446e-06,
"loss": 0.0104,
"reward": 1.0875,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.9375,
"step": 3275
},
{
"completion_length": 297.00625,
"epoch": 0.7244516226998523,
"grad_norm": 0.40479995606264946,
"kl": 0.280712890625,
"learning_rate": 4.277929917424602e-06,
"loss": 0.0112,
"reward": 1.0125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.925,
"step": 3280
},
{
"completion_length": 241.7875,
"epoch": 0.7255559696856753,
"grad_norm": 0.31716356544020063,
"kl": 0.2287841796875,
"learning_rate": 4.246351895985702e-06,
"loss": 0.0091,
"reward": 1.04375,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.9375,
"step": 3285
},
{
"completion_length": 254.69375,
"epoch": 0.7266603166714982,
"grad_norm": 0.3098099830382794,
"kl": 0.24783935546875,
"learning_rate": 4.214859408877899e-06,
"loss": 0.0099,
"reward": 1.05625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9625,
"step": 3290
},
{
"completion_length": 236.5875,
"epoch": 0.7277646636573212,
"grad_norm": 0.2207504226236474,
"kl": 0.2484619140625,
"learning_rate": 4.183452924271776e-06,
"loss": 0.0099,
"reward": 1.0375,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.96875,
"step": 3295
},
{
"completion_length": 224.6625,
"epoch": 0.728869010643144,
"grad_norm": 0.5845311907558509,
"kl": 0.25625,
"learning_rate": 4.152132909059402e-06,
"loss": 0.0103,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.975,
"step": 3300
},
{
"epoch": 0.728869010643144,
"eval_completion_length": 241.69,
"eval_kl": 0.31572265625,
"eval_loss": 0.012639479711651802,
"eval_reward": 1.09,
"eval_reward_std": 0.1414213538169861,
"eval_rewards/accuracy_reward": 0.135,
"eval_rewards/format_reward": 0.955,
"eval_runtime": 118.6805,
"eval_samples_per_second": 0.834,
"eval_steps_per_second": 0.211,
"step": 3300
},
{
"completion_length": 215.5125,
"epoch": 0.729973357628967,
"grad_norm": 0.30909422545672033,
"kl": 0.245867919921875,
"learning_rate": 4.120899828847385e-06,
"loss": 0.0098,
"reward": 1.0875,
"reward_std": 0.05303300768136978,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9875,
"step": 3305
},
{
"completion_length": 230.26875,
"epoch": 0.7310777046147899,
"grad_norm": 0.6453873653199322,
"kl": 0.260516357421875,
"learning_rate": 4.089754147949935e-06,
"loss": 0.0104,
"reward": 1.08125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.95625,
"step": 3310
},
{
"completion_length": 210.56875,
"epoch": 0.7321820516006129,
"grad_norm": 0.40594249764413265,
"kl": 0.229119873046875,
"learning_rate": 4.058696329381987e-06,
"loss": 0.0092,
"reward": 1.1125,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.98125,
"step": 3315
},
{
"completion_length": 212.29375,
"epoch": 0.7332863985864359,
"grad_norm": 0.38422267389292253,
"kl": 0.2646240234375,
"learning_rate": 4.027726834852303e-06,
"loss": 0.0106,
"reward": 1.0875,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9625,
"step": 3320
},
{
"completion_length": 232.43125,
"epoch": 0.7343907455722588,
"grad_norm": 0.5042182184524241,
"kl": 0.2716796875,
"learning_rate": 3.996846124756609e-06,
"loss": 0.0109,
"reward": 1.05,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.95,
"step": 3325
},
{
"completion_length": 219.50625,
"epoch": 0.7354950925580818,
"grad_norm": 0.5264628768885443,
"kl": 0.272119140625,
"learning_rate": 3.966054658170754e-06,
"loss": 0.0109,
"reward": 1.0875,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.95625,
"step": 3330
},
{
"completion_length": 223.0125,
"epoch": 0.7365994395439047,
"grad_norm": 0.2967573269006475,
"kl": 0.258892822265625,
"learning_rate": 3.93535289284388e-06,
"loss": 0.0104,
"reward": 1.075,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.96875,
"step": 3335
},
{
"completion_length": 256.3625,
"epoch": 0.7377037865297277,
"grad_norm": 0.35416855035423694,
"kl": 0.2759521484375,
"learning_rate": 3.904741285191629e-06,
"loss": 0.011,
"reward": 1.08125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.94375,
"step": 3340
},
{
"completion_length": 237.9875,
"epoch": 0.7388081335155506,
"grad_norm": 0.5938232640376352,
"kl": 0.283270263671875,
"learning_rate": 3.874220290289337e-06,
"loss": 0.0113,
"reward": 1.15625,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.19375,
"rewards/format_reward": 0.9625,
"step": 3345
},
{
"completion_length": 249.93125,
"epoch": 0.7399124805013735,
"grad_norm": 0.25454486548911043,
"kl": 0.254449462890625,
"learning_rate": 3.8437903618652895e-06,
"loss": 0.0102,
"reward": 1.0625,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9625,
"step": 3350
},
{
"completion_length": 221.375,
"epoch": 0.7410168274871964,
"grad_norm": 0.2326519300763832,
"kl": 0.24263916015625,
"learning_rate": 3.8134519522939693e-06,
"loss": 0.0097,
"reward": 1.0875,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9875,
"step": 3355
},
{
"completion_length": 224.9375,
"epoch": 0.7421211744730194,
"grad_norm": 0.4822164383039262,
"kl": 0.2813232421875,
"learning_rate": 3.7832055125893318e-06,
"loss": 0.0113,
"reward": 1.10625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.96875,
"step": 3360
},
{
"completion_length": 230.09375,
"epoch": 0.7432255214588424,
"grad_norm": 0.6372609601101804,
"kl": 0.298992919921875,
"learning_rate": 3.753051492398089e-06,
"loss": 0.012,
"reward": 1.14375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.19375,
"rewards/format_reward": 0.95,
"step": 3365
},
{
"completion_length": 219.25,
"epoch": 0.7443298684446653,
"grad_norm": 0.3081391116247598,
"kl": 0.2989990234375,
"learning_rate": 3.7229903399930423e-06,
"loss": 0.012,
"reward": 1.1,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.9625,
"step": 3370
},
{
"completion_length": 155.59375,
"epoch": 0.7454342154304883,
"grad_norm": 0.5909119308745682,
"kl": 0.31363525390625,
"learning_rate": 3.6930225022664136e-06,
"loss": 0.0125,
"reward": 1.11875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.98125,
"step": 3375
},
{
"completion_length": 187.8375,
"epoch": 0.7465385624163112,
"grad_norm": 0.34179198645052977,
"kl": 0.3404541015625,
"learning_rate": 3.6631484247231896e-06,
"loss": 0.0136,
"reward": 1.04375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.95,
"step": 3380
},
{
"completion_length": 212.6,
"epoch": 0.7476429094021342,
"grad_norm": 0.75092071011766,
"kl": 0.36864013671875,
"learning_rate": 3.6333685514745165e-06,
"loss": 0.0147,
"reward": 1.075,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.925,
"step": 3385
},
{
"completion_length": 193.43125,
"epoch": 0.7487472563879571,
"grad_norm": 0.329990173152014,
"kl": 0.365625,
"learning_rate": 3.6036833252310887e-06,
"loss": 0.0146,
"reward": 1.0625,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95,
"step": 3390
},
{
"completion_length": 201.61875,
"epoch": 0.7498516033737801,
"grad_norm": 0.47149722693689095,
"kl": 0.37044677734375,
"learning_rate": 3.574093187296568e-06,
"loss": 0.0148,
"reward": 1.075,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.95,
"step": 3395
},
{
"completion_length": 210.40625,
"epoch": 0.7509559503596029,
"grad_norm": 0.6186398549798994,
"kl": 0.31754150390625,
"learning_rate": 3.544598577561016e-06,
"loss": 0.0127,
"reward": 1.06875,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.96875,
"step": 3400
},
{
"epoch": 0.7509559503596029,
"eval_completion_length": 217.785,
"eval_kl": 0.5054296875,
"eval_loss": 0.020201342180371284,
"eval_reward": 1.095,
"eval_reward_std": 0.162634556889534,
"eval_rewards/accuracy_reward": 0.13,
"eval_rewards/format_reward": 0.965,
"eval_runtime": 115.0896,
"eval_samples_per_second": 0.86,
"eval_steps_per_second": 0.217,
"step": 3400
},
{
"completion_length": 220.7875,
"epoch": 0.7520602973454259,
"grad_norm": 0.502156340630982,
"kl": 0.31522216796875,
"learning_rate": 3.515199934494373e-06,
"loss": 0.0126,
"reward": 1.0625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9375,
"step": 3405
},
{
"completion_length": 235.6625,
"epoch": 0.7531646443312489,
"grad_norm": 0.405309226466982,
"kl": 0.34342041015625,
"learning_rate": 3.4858976951399237e-06,
"loss": 0.0137,
"reward": 1.075,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.9625,
"step": 3410
},
{
"completion_length": 207.125,
"epoch": 0.7542689913170718,
"grad_norm": 0.42471895189637104,
"kl": 0.37327880859375,
"learning_rate": 3.4566922951078086e-06,
"loss": 0.0149,
"reward": 1.10625,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.9625,
"step": 3415
},
{
"completion_length": 204.0625,
"epoch": 0.7553733383028948,
"grad_norm": 0.328073526920033,
"kl": 0.277392578125,
"learning_rate": 3.427584168568535e-06,
"loss": 0.0111,
"reward": 1.10625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.975,
"step": 3420
},
{
"completion_length": 205.0875,
"epoch": 0.7564776852887177,
"grad_norm": 0.5369831637398775,
"kl": 0.2722412109375,
"learning_rate": 3.398573748246544e-06,
"loss": 0.0109,
"reward": 1.175,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.9875,
"step": 3425
},
{
"completion_length": 271.4875,
"epoch": 0.7575820322745407,
"grad_norm": 0.5616908933906252,
"kl": 0.249615478515625,
"learning_rate": 3.3696614654137637e-06,
"loss": 0.01,
"reward": 0.9625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.93125,
"step": 3430
},
{
"completion_length": 246.775,
"epoch": 0.7586863792603636,
"grad_norm": 0.602386946053219,
"kl": 0.25421142578125,
"learning_rate": 3.3408477498831917e-06,
"loss": 0.0102,
"reward": 1.1375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.18125,
"rewards/format_reward": 0.95625,
"step": 3435
},
{
"completion_length": 227.56875,
"epoch": 0.7597907262461866,
"grad_norm": 0.5462886025152357,
"kl": 0.259625244140625,
"learning_rate": 3.3121330300025222e-06,
"loss": 0.0104,
"reward": 1.1,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.975,
"step": 3440
},
{
"completion_length": 217.3875,
"epoch": 0.7608950732320094,
"grad_norm": 0.513835490839363,
"kl": 0.26868896484375,
"learning_rate": 3.2835177326477675e-06,
"loss": 0.0108,
"reward": 1.1125,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9875,
"step": 3445
},
{
"completion_length": 221.81875,
"epoch": 0.7619994202178324,
"grad_norm": 0.44817623889299235,
"kl": 0.251348876953125,
"learning_rate": 3.2550022832169125e-06,
"loss": 0.0101,
"reward": 1.05,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.96875,
"step": 3450
},
{
"completion_length": 201.45625,
"epoch": 0.7631037672036554,
"grad_norm": 0.3221696210681099,
"kl": 0.258599853515625,
"learning_rate": 3.2265871056235974e-06,
"loss": 0.0103,
"reward": 1.0875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.99375,
"step": 3455
},
{
"completion_length": 200.58125,
"epoch": 0.7642081141894783,
"grad_norm": 0.748323660705002,
"kl": 0.27359619140625,
"learning_rate": 3.1982726222908046e-06,
"loss": 0.0109,
"reward": 1.11875,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.975,
"step": 3460
},
{
"completion_length": 239.84375,
"epoch": 0.7653124611753013,
"grad_norm": 0.40897691141192144,
"kl": 0.24927978515625,
"learning_rate": 3.170059254144593e-06,
"loss": 0.01,
"reward": 1.06875,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.98125,
"step": 3465
},
{
"completion_length": 220.5125,
"epoch": 0.7664168081611242,
"grad_norm": 0.32005616347994614,
"kl": 0.26856689453125,
"learning_rate": 3.1419474206078203e-06,
"loss": 0.0107,
"reward": 1.1625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.18125,
"rewards/format_reward": 0.98125,
"step": 3470
},
{
"completion_length": 235.375,
"epoch": 0.7675211551469472,
"grad_norm": 0.38318241182760876,
"kl": 0.2571533203125,
"learning_rate": 3.113937539593931e-06,
"loss": 0.0103,
"reward": 1.09375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.9875,
"step": 3475
},
{
"completion_length": 265.41875,
"epoch": 0.7686255021327701,
"grad_norm": 0.5005202602287694,
"kl": 0.2830078125,
"learning_rate": 3.086030027500728e-06,
"loss": 0.0113,
"reward": 1.09375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.98125,
"step": 3480
},
{
"completion_length": 263.2875,
"epoch": 0.7697298491185931,
"grad_norm": 0.07189820608786429,
"kl": 0.28231201171875,
"learning_rate": 3.058225299204195e-06,
"loss": 0.0113,
"reward": 1.05625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.98125,
"step": 3485
},
{
"completion_length": 255.85,
"epoch": 0.7708341961044161,
"grad_norm": 0.5124546790054572,
"kl": 0.28914794921875,
"learning_rate": 3.0305237680523046e-06,
"loss": 0.0116,
"reward": 1.1,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.98125,
"step": 3490
},
{
"completion_length": 243.6625,
"epoch": 0.7719385430902389,
"grad_norm": 0.2684526887471308,
"kl": 0.257568359375,
"learning_rate": 3.002925845858905e-06,
"loss": 0.0103,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.05625,
"rewards/format_reward": 0.98125,
"step": 3495
},
{
"completion_length": 279.725,
"epoch": 0.7730428900760619,
"grad_norm": 0.45994470057081904,
"kl": 0.265185546875,
"learning_rate": 2.9754319428975796e-06,
"loss": 0.0106,
"reward": 1.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.1625,
"rewards/format_reward": 0.9625,
"step": 3500
},
{
"epoch": 0.7730428900760619,
"eval_completion_length": 227.09,
"eval_kl": 0.26388671875,
"eval_loss": 0.010543497279286385,
"eval_reward": 1.095,
"eval_reward_std": 0.1484924215078354,
"eval_rewards/accuracy_reward": 0.115,
"eval_rewards/format_reward": 0.98,
"eval_runtime": 102.2929,
"eval_samples_per_second": 0.968,
"eval_steps_per_second": 0.244,
"step": 3500
},
{
"completion_length": 277.59375,
"epoch": 0.7741472370618848,
"grad_norm": 0.4448459387591611,
"kl": 0.287054443359375,
"learning_rate": 2.948042467895544e-06,
"loss": 0.0115,
"reward": 1.05625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9625,
"step": 3505
},
{
"completion_length": 259.24375,
"epoch": 0.7752515840477078,
"grad_norm": 0.4966119961041164,
"kl": 0.29287109375,
"learning_rate": 2.920757828027586e-06,
"loss": 0.0117,
"reward": 1.03125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9625,
"step": 3510
},
{
"completion_length": 286.8,
"epoch": 0.7763559310335307,
"grad_norm": 0.484519618170077,
"kl": 0.2783935546875,
"learning_rate": 2.893578428909998e-06,
"loss": 0.0111,
"reward": 1.10625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.9625,
"step": 3515
},
{
"completion_length": 236.9875,
"epoch": 0.7774602780193537,
"grad_norm": 0.5777828810239061,
"kl": 0.27120361328125,
"learning_rate": 2.8665046745945555e-06,
"loss": 0.0109,
"reward": 1.1,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.95625,
"step": 3520
},
{
"completion_length": 269.5625,
"epoch": 0.7785646250051766,
"grad_norm": 0.21783452027900907,
"kl": 0.25440673828125,
"learning_rate": 2.839536967562504e-06,
"loss": 0.0102,
"reward": 1.10625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.98125,
"step": 3525
},
{
"completion_length": 254.6625,
"epoch": 0.7796689719909996,
"grad_norm": 0.5865883286348121,
"kl": 0.233740234375,
"learning_rate": 2.8126757087185797e-06,
"loss": 0.0093,
"reward": 1.1,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.975,
"step": 3530
},
{
"completion_length": 280.525,
"epoch": 0.7807733189768226,
"grad_norm": 0.43883058743962794,
"kl": 0.271136474609375,
"learning_rate": 2.7859212973850535e-06,
"loss": 0.0108,
"reward": 1.075,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.9375,
"step": 3535
},
{
"completion_length": 205.775,
"epoch": 0.7818776659626455,
"grad_norm": 0.606176189949368,
"kl": 0.27373046875,
"learning_rate": 2.759274131295787e-06,
"loss": 0.0109,
"reward": 1.11875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.975,
"step": 3540
},
{
"completion_length": 260.25,
"epoch": 0.7829820129484684,
"grad_norm": 0.3955866589392802,
"kl": 0.270849609375,
"learning_rate": 2.732734606590318e-06,
"loss": 0.0108,
"reward": 1.03125,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.93125,
"step": 3545
},
{
"completion_length": 262.69375,
"epoch": 0.7840863599342913,
"grad_norm": 0.3782847315356218,
"kl": 0.275299072265625,
"learning_rate": 2.7063031178079847e-06,
"loss": 0.011,
"reward": 1.0625,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.94375,
"step": 3550
},
{
"completion_length": 250.175,
"epoch": 0.7851907069201143,
"grad_norm": 0.27828806764961916,
"kl": 0.2863525390625,
"learning_rate": 2.679980057882049e-06,
"loss": 0.0115,
"reward": 1.00625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9375,
"step": 3555
},
{
"completion_length": 235.0625,
"epoch": 0.7862950539059372,
"grad_norm": 0.7313042767403699,
"kl": 0.284637451171875,
"learning_rate": 2.6537658181338534e-06,
"loss": 0.0114,
"reward": 1.08125,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.9375,
"step": 3560
},
{
"completion_length": 222.3625,
"epoch": 0.7873994008917602,
"grad_norm": 0.40644071357218936,
"kl": 0.286981201171875,
"learning_rate": 2.6276607882670135e-06,
"loss": 0.0115,
"reward": 1.075,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.9625,
"step": 3565
},
{
"completion_length": 231.09375,
"epoch": 0.7885037478775831,
"grad_norm": 0.6111130783275136,
"kl": 0.306976318359375,
"learning_rate": 2.60166535636162e-06,
"loss": 0.0123,
"reward": 1.1375,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.18125,
"rewards/format_reward": 0.95625,
"step": 3570
},
{
"completion_length": 216.48125,
"epoch": 0.7896080948634061,
"grad_norm": 0.49711551639970475,
"kl": 0.25562744140625,
"learning_rate": 2.5757799088684654e-06,
"loss": 0.0102,
"reward": 1.1875,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.19375,
"rewards/format_reward": 0.99375,
"step": 3575
},
{
"completion_length": 218.78125,
"epoch": 0.7907124418492291,
"grad_norm": 0.31764284101121393,
"kl": 0.30023193359375,
"learning_rate": 2.5500048306033065e-06,
"loss": 0.012,
"reward": 1.06875,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95625,
"step": 3580
},
{
"completion_length": 216.31875,
"epoch": 0.791816788835052,
"grad_norm": 0.4787122183538524,
"kl": 0.2999267578125,
"learning_rate": 2.5243405047411353e-06,
"loss": 0.012,
"reward": 1.09375,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.975,
"step": 3585
},
{
"completion_length": 249.625,
"epoch": 0.7929211358208749,
"grad_norm": 0.3111958011260876,
"kl": 0.28712158203125,
"learning_rate": 2.498787312810492e-06,
"loss": 0.0115,
"reward": 1.05,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.94375,
"step": 3590
},
{
"completion_length": 231.98125,
"epoch": 0.7940254828066978,
"grad_norm": 0.39194633386336397,
"kl": 0.28707275390625,
"learning_rate": 2.4733456346877817e-06,
"loss": 0.0115,
"reward": 1.05,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.95625,
"step": 3595
},
{
"completion_length": 175.5375,
"epoch": 0.7951298297925208,
"grad_norm": 0.42822142675950153,
"kl": 0.3134765625,
"learning_rate": 2.448015848591638e-06,
"loss": 0.0125,
"reward": 1.10625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.98125,
"step": 3600
},
{
"epoch": 0.7951298297925208,
"eval_completion_length": 171.885,
"eval_kl": 0.3194921875,
"eval_loss": 0.012775387614965439,
"eval_reward": 1.14,
"eval_reward_std": 0.15556348919868468,
"eval_rewards/accuracy_reward": 0.16,
"eval_rewards/format_reward": 0.98,
"eval_runtime": 92.7145,
"eval_samples_per_second": 1.068,
"eval_steps_per_second": 0.27,
"step": 3600
},
{
"completion_length": 201.60625,
"epoch": 0.7962341767783437,
"grad_norm": 0.5464532193593336,
"kl": 0.325775146484375,
"learning_rate": 2.4227983310772963e-06,
"loss": 0.013,
"reward": 1.075,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.95625,
"step": 3605
},
{
"completion_length": 176.4625,
"epoch": 0.7973385237641667,
"grad_norm": 0.5021094659088707,
"kl": 0.36162109375,
"learning_rate": 2.3976934570309974e-06,
"loss": 0.0145,
"reward": 1.1125,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.96875,
"step": 3610
},
{
"completion_length": 167.06875,
"epoch": 0.7984428707499897,
"grad_norm": 0.3403630562580807,
"kl": 0.325738525390625,
"learning_rate": 2.3727015996644043e-06,
"loss": 0.013,
"reward": 1.125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.98125,
"step": 3615
},
{
"completion_length": 138.4625,
"epoch": 0.7995472177358126,
"grad_norm": 0.6077367973458568,
"kl": 0.333203125,
"learning_rate": 2.3478231305090694e-06,
"loss": 0.0133,
"reward": 1.13125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.99375,
"step": 3620
},
{
"completion_length": 177.075,
"epoch": 0.8006515647216356,
"grad_norm": 0.4466360553445801,
"kl": 0.325604248046875,
"learning_rate": 2.3230584194109074e-06,
"loss": 0.013,
"reward": 1.11875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.98125,
"step": 3625
},
{
"completion_length": 161.88125,
"epoch": 0.8017559117074585,
"grad_norm": 0.3897741643622985,
"kl": 0.345849609375,
"learning_rate": 2.298407834524682e-06,
"loss": 0.0138,
"reward": 1.0875,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.975,
"step": 3630
},
{
"completion_length": 166.9,
"epoch": 0.8028602586932815,
"grad_norm": 2.30034414615901,
"kl": 0.372119140625,
"learning_rate": 2.2738717423085543e-06,
"loss": 0.0149,
"reward": 1.10625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.98125,
"step": 3635
},
{
"completion_length": 238.34375,
"epoch": 0.8039646056791043,
"grad_norm": 0.6076643483832027,
"kl": 0.309075927734375,
"learning_rate": 2.2494505075186234e-06,
"loss": 0.0124,
"reward": 1.0875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.16875,
"rewards/format_reward": 0.91875,
"step": 3640
},
{
"completion_length": 181.86875,
"epoch": 0.8050689526649273,
"grad_norm": 0.2992763139298062,
"kl": 0.269927978515625,
"learning_rate": 2.2251444932035094e-06,
"loss": 0.0108,
"reward": 1.125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.98125,
"step": 3645
},
{
"completion_length": 164.825,
"epoch": 0.8061732996507502,
"grad_norm": 0.6026739836434083,
"kl": 0.284381103515625,
"learning_rate": 2.200954060698941e-06,
"loss": 0.0114,
"reward": 1.11875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.96875,
"step": 3650
},
{
"completion_length": 230.73125,
"epoch": 0.8072776466365732,
"grad_norm": 0.48565298064687734,
"kl": 0.30531005859375,
"learning_rate": 2.176879569622409e-06,
"loss": 0.0122,
"reward": 1.075,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.95625,
"step": 3655
},
{
"completion_length": 242.775,
"epoch": 0.8083819936223962,
"grad_norm": 0.21896055218236896,
"kl": 0.2802001953125,
"learning_rate": 2.1529213778677993e-06,
"loss": 0.0112,
"reward": 1.025,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.94375,
"step": 3660
},
{
"completion_length": 229.15,
"epoch": 0.8094863406082191,
"grad_norm": 0.14196401938191486,
"kl": 0.259161376953125,
"learning_rate": 2.1290798416000857e-06,
"loss": 0.0104,
"reward": 1.1,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.95625,
"step": 3665
},
{
"completion_length": 267.93125,
"epoch": 0.8105906875940421,
"grad_norm": 0.5415824445762728,
"kl": 0.263916015625,
"learning_rate": 2.1053553152500204e-06,
"loss": 0.0106,
"reward": 1.0125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.91875,
"step": 3670
},
{
"completion_length": 234.425,
"epoch": 0.811695034579865,
"grad_norm": 0.44188434661367404,
"kl": 0.27322998046875,
"learning_rate": 2.081748151508883e-06,
"loss": 0.0109,
"reward": 1.075,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.95625,
"step": 3675
},
{
"completion_length": 223.39375,
"epoch": 0.812799381565688,
"grad_norm": 0.29953298263136474,
"kl": 0.2898193359375,
"learning_rate": 2.0582587013232268e-06,
"loss": 0.0116,
"reward": 1.05625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.95625,
"step": 3680
},
{
"completion_length": 242.9125,
"epoch": 0.8139037285515108,
"grad_norm": 0.5105270540146248,
"kl": 0.28282470703125,
"learning_rate": 2.0348873138896563e-06,
"loss": 0.0113,
"reward": 1.0,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.05,
"rewards/format_reward": 0.95,
"step": 3685
},
{
"completion_length": 225.31875,
"epoch": 0.8150080755373338,
"grad_norm": 0.3698502677044578,
"kl": 0.252008056640625,
"learning_rate": 2.0116343366496493e-06,
"loss": 0.0101,
"reward": 1.0625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9625,
"step": 3690
},
{
"completion_length": 236.0875,
"epoch": 0.8161124225231567,
"grad_norm": 1.4290601982893592,
"kl": 0.321392822265625,
"learning_rate": 1.988500115284385e-06,
"loss": 0.0129,
"reward": 1.0375,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.95625,
"step": 3695
},
{
"completion_length": 211.28125,
"epoch": 0.8172167695089797,
"grad_norm": 0.3911358009799874,
"kl": 0.278375244140625,
"learning_rate": 1.9654849937096033e-06,
"loss": 0.0111,
"reward": 1.0625,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.975,
"step": 3700
},
{
"epoch": 0.8172167695089797,
"eval_completion_length": 202.245,
"eval_kl": 0.299296875,
"eval_loss": 0.011967692524194717,
"eval_reward": 1.13,
"eval_reward_std": 0.11313708305358887,
"eval_rewards/accuracy_reward": 0.14,
"eval_rewards/format_reward": 0.99,
"eval_runtime": 97.9771,
"eval_samples_per_second": 1.01,
"eval_steps_per_second": 0.255,
"step": 3700
},
{
"completion_length": 245.05,
"epoch": 0.8183211164948027,
"grad_norm": 0.6857212516430605,
"kl": 0.283929443359375,
"learning_rate": 1.942589314070494e-06,
"loss": 0.0114,
"reward": 1.05,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.94375,
"step": 3705
},
{
"completion_length": 207.8,
"epoch": 0.8194254634806256,
"grad_norm": 0.8931067063405094,
"kl": 0.33228759765625,
"learning_rate": 1.9198134167366156e-06,
"loss": 0.0133,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.96875,
"step": 3710
},
{
"completion_length": 231.15625,
"epoch": 0.8205298104664486,
"grad_norm": 4.939225391817678,
"kl": 0.328973388671875,
"learning_rate": 1.897157640296825e-06,
"loss": 0.0131,
"reward": 1.06875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95625,
"step": 3715
},
{
"completion_length": 213.325,
"epoch": 0.8216341574522715,
"grad_norm": 0.5141296496399171,
"kl": 0.280364990234375,
"learning_rate": 1.8746223215542482e-06,
"loss": 0.0112,
"reward": 1.09375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.96875,
"step": 3720
},
{
"completion_length": 216.3625,
"epoch": 0.8227385044380945,
"grad_norm": 0.49407058755769534,
"kl": 0.245660400390625,
"learning_rate": 1.8522077955212791e-06,
"loss": 0.0098,
"reward": 1.1375,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.98125,
"step": 3725
},
{
"completion_length": 206.43125,
"epoch": 0.8238428514239174,
"grad_norm": 0.2188098942709737,
"kl": 0.278680419921875,
"learning_rate": 1.8299143954145926e-06,
"loss": 0.0111,
"reward": 1.1,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.95,
"step": 3730
},
{
"completion_length": 211.05,
"epoch": 0.8249471984097403,
"grad_norm": 0.8180293925174863,
"kl": 0.28306884765625,
"learning_rate": 1.8077424526501964e-06,
"loss": 0.0113,
"reward": 1.0875,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.96875,
"step": 3735
},
{
"completion_length": 216.4625,
"epoch": 0.8260515453955632,
"grad_norm": 0.6158285951662569,
"kl": 0.28001708984375,
"learning_rate": 1.7856922968384926e-06,
"loss": 0.0112,
"reward": 1.0875,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.95625,
"step": 3740
},
{
"completion_length": 238.0125,
"epoch": 0.8271558923813862,
"grad_norm": 0.615093259382316,
"kl": 0.301104736328125,
"learning_rate": 1.763764255779392e-06,
"loss": 0.012,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.94375,
"step": 3745
},
{
"completion_length": 212.09375,
"epoch": 0.8282602393672092,
"grad_norm": 0.5625866842898283,
"kl": 0.2462158203125,
"learning_rate": 1.7419586554574364e-06,
"loss": 0.0098,
"reward": 1.14375,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.175,
"rewards/format_reward": 0.96875,
"step": 3750
},
{
"completion_length": 244.36875,
"epoch": 0.8293645863530321,
"grad_norm": 0.6830147990367013,
"kl": 0.36268310546875,
"learning_rate": 1.720275820036944e-06,
"loss": 0.0145,
"reward": 1.05,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.925,
"step": 3755
},
{
"completion_length": 185.325,
"epoch": 0.8304689333388551,
"grad_norm": 0.09647577383094562,
"kl": 0.28330078125,
"learning_rate": 1.6987160718572027e-06,
"loss": 0.0113,
"reward": 1.10625,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.975,
"step": 3760
},
{
"completion_length": 169.075,
"epoch": 0.831573280324678,
"grad_norm": 0.42190186308598165,
"kl": 0.2713623046875,
"learning_rate": 1.6772797314276712e-06,
"loss": 0.0109,
"reward": 1.11875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.98125,
"step": 3765
},
{
"completion_length": 209.425,
"epoch": 0.832677627310501,
"grad_norm": 0.4838527676338876,
"kl": 0.3133056640625,
"learning_rate": 1.6559671174232195e-06,
"loss": 0.0125,
"reward": 1.03125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.95,
"step": 3770
},
{
"completion_length": 196.15625,
"epoch": 0.833781974296324,
"grad_norm": 0.541789987335856,
"kl": 0.284918212890625,
"learning_rate": 1.6347785466793764e-06,
"loss": 0.0114,
"reward": 1.13125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.98125,
"step": 3775
},
{
"completion_length": 189.86875,
"epoch": 0.8348863212821469,
"grad_norm": 0.37944243070397565,
"kl": 0.305487060546875,
"learning_rate": 1.6137143341876439e-06,
"loss": 0.0122,
"reward": 1.09375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.99375,
"step": 3780
},
{
"completion_length": 192.91875,
"epoch": 0.8359906682679697,
"grad_norm": 0.4016733182249456,
"kl": 0.266162109375,
"learning_rate": 1.5927747930907921e-06,
"loss": 0.0106,
"reward": 1.08125,
"reward_std": 0.06187184229493141,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9875,
"step": 3785
},
{
"completion_length": 199.01875,
"epoch": 0.8370950152537927,
"grad_norm": 0.555504541233714,
"kl": 0.31944580078125,
"learning_rate": 1.5719602346782215e-06,
"loss": 0.0128,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9625,
"step": 3790
},
{
"completion_length": 191.89375,
"epoch": 0.8381993622396157,
"grad_norm": 0.6714397875203784,
"kl": 0.400775146484375,
"learning_rate": 1.5512709683813165e-06,
"loss": 0.016,
"reward": 1.15,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.9625,
"step": 3795
},
{
"completion_length": 211.68125,
"epoch": 0.8393037092254386,
"grad_norm": 0.3384649202970216,
"kl": 0.278900146484375,
"learning_rate": 1.5307073017688644e-06,
"loss": 0.0112,
"reward": 1.03125,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.9625,
"step": 3800
},
{
"epoch": 0.8393037092254386,
"eval_completion_length": 195.9,
"eval_kl": 0.30609375,
"eval_loss": 0.012239097617566586,
"eval_reward": 1.095,
"eval_reward_std": 0.14849242091178894,
"eval_rewards/accuracy_reward": 0.13,
"eval_rewards/format_reward": 0.965,
"eval_runtime": 100.9941,
"eval_samples_per_second": 0.98,
"eval_steps_per_second": 0.248,
"step": 3800
},
{
"completion_length": 217.4375,
"epoch": 0.8404080562112616,
"grad_norm": 0.6534309102123147,
"kl": 0.330548095703125,
"learning_rate": 1.5102695405424738e-06,
"loss": 0.0132,
"reward": 1.08125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.95625,
"step": 3805
},
{
"completion_length": 207.18125,
"epoch": 0.8415124031970845,
"grad_norm": 0.35957581300760044,
"kl": 0.358197021484375,
"learning_rate": 1.4899579885320237e-06,
"loss": 0.0143,
"reward": 1.0875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.94375,
"step": 3810
},
{
"completion_length": 222.85625,
"epoch": 0.8426167501829075,
"grad_norm": 0.5604436470793626,
"kl": 0.305682373046875,
"learning_rate": 1.4697729476911614e-06,
"loss": 0.0122,
"reward": 1.13125,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.18125,
"rewards/format_reward": 0.95,
"step": 3815
},
{
"completion_length": 202.29375,
"epoch": 0.8437210971687304,
"grad_norm": 0.4206816591710824,
"kl": 0.2802001953125,
"learning_rate": 1.449714718092803e-06,
"loss": 0.0112,
"reward": 1.08125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.98125,
"step": 3820
},
{
"completion_length": 205.2375,
"epoch": 0.8448254441545534,
"grad_norm": 0.16670952436146919,
"kl": 0.273931884765625,
"learning_rate": 1.4297835979246777e-06,
"loss": 0.011,
"reward": 1.075,
"reward_std": 0.07071067690849304,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.9875,
"step": 3825
},
{
"completion_length": 238.1875,
"epoch": 0.8459297911403763,
"grad_norm": 0.613398892634411,
"kl": 0.276251220703125,
"learning_rate": 1.4099798834848855e-06,
"loss": 0.0111,
"reward": 1.10625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.95,
"step": 3830
},
{
"completion_length": 261.0375,
"epoch": 0.8470341381261992,
"grad_norm": 0.47116863545999577,
"kl": 0.307647705078125,
"learning_rate": 1.3903038691775095e-06,
"loss": 0.0123,
"reward": 1.09375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.94375,
"step": 3835
},
{
"completion_length": 211.49375,
"epoch": 0.8481384851120222,
"grad_norm": 0.3456577853406588,
"kl": 0.296746826171875,
"learning_rate": 1.370755847508226e-06,
"loss": 0.0119,
"reward": 1.1125,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.975,
"step": 3840
},
{
"completion_length": 210.94375,
"epoch": 0.8492428320978451,
"grad_norm": 0.3986816328493071,
"kl": 0.29635009765625,
"learning_rate": 1.3513361090799537e-06,
"loss": 0.0119,
"reward": 1.1,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.96875,
"step": 3845
},
{
"completion_length": 216.41875,
"epoch": 0.8503471790836681,
"grad_norm": 0.36582880270044166,
"kl": 0.265789794921875,
"learning_rate": 1.332044942588545e-06,
"loss": 0.0106,
"reward": 1.14375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.18125,
"rewards/format_reward": 0.9625,
"step": 3850
},
{
"completion_length": 196.1875,
"epoch": 0.851451526069491,
"grad_norm": 0.43079415258986453,
"kl": 0.3136474609375,
"learning_rate": 1.3128826348184886e-06,
"loss": 0.0125,
"reward": 1.1625,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.19375,
"rewards/format_reward": 0.96875,
"step": 3855
},
{
"completion_length": 205.025,
"epoch": 0.852555873055314,
"grad_norm": 0.26604232127523036,
"kl": 0.30478515625,
"learning_rate": 1.2938494706386462e-06,
"loss": 0.0122,
"reward": 1.0875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.96875,
"step": 3860
},
{
"completion_length": 218.6,
"epoch": 0.853660220041137,
"grad_norm": 0.4745459719079689,
"kl": 0.239349365234375,
"learning_rate": 1.2749457329980108e-06,
"loss": 0.0096,
"reward": 1.13125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.975,
"step": 3865
},
{
"completion_length": 221.925,
"epoch": 0.8547645670269599,
"grad_norm": 1.0389906784554162,
"kl": 0.282757568359375,
"learning_rate": 1.256171702921516e-06,
"loss": 0.0113,
"reward": 1.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.1625,
"rewards/format_reward": 0.9625,
"step": 3870
},
{
"completion_length": 227.575,
"epoch": 0.8558689140127829,
"grad_norm": 0.5325421261443589,
"kl": 0.273663330078125,
"learning_rate": 1.237527659505846e-06,
"loss": 0.0109,
"reward": 1.0625,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95,
"step": 3875
},
{
"completion_length": 253.325,
"epoch": 0.8569732609986057,
"grad_norm": 0.19847592400408953,
"kl": 0.26175537109375,
"learning_rate": 1.2190138799152851e-06,
"loss": 0.0105,
"reward": 1.05,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.94375,
"step": 3880
},
{
"completion_length": 252.20625,
"epoch": 0.8580776079844287,
"grad_norm": 0.6511065620949663,
"kl": 0.270318603515625,
"learning_rate": 1.200630639377609e-06,
"loss": 0.0108,
"reward": 1.0625,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.94375,
"step": 3885
},
{
"completion_length": 222.19375,
"epoch": 0.8591819549702516,
"grad_norm": 0.510836488773378,
"kl": 0.246490478515625,
"learning_rate": 1.1823782111799843e-06,
"loss": 0.0099,
"reward": 1.1375,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.18125,
"rewards/format_reward": 0.95625,
"step": 3890
},
{
"completion_length": 252.8125,
"epoch": 0.8602863019560746,
"grad_norm": 0.4973937206692706,
"kl": 0.240765380859375,
"learning_rate": 1.1642568666649067e-06,
"loss": 0.0096,
"reward": 1.08125,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.94375,
"step": 3895
},
{
"completion_length": 235.05625,
"epoch": 0.8613906489418975,
"grad_norm": 0.896473119654828,
"kl": 0.25704345703125,
"learning_rate": 1.1462668752261652e-06,
"loss": 0.0103,
"reward": 1.11875,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.9625,
"step": 3900
},
{
"epoch": 0.8613906489418975,
"eval_completion_length": 235.22,
"eval_kl": 0.29470703125,
"eval_loss": 0.011815370991826057,
"eval_reward": 1.07,
"eval_reward_std": 0.12727921783924104,
"eval_rewards/accuracy_reward": 0.11,
"eval_rewards/format_reward": 0.96,
"eval_runtime": 109.2786,
"eval_samples_per_second": 0.906,
"eval_steps_per_second": 0.229,
"step": 3900
},
{
"completion_length": 233.93125,
"epoch": 0.8624949959277205,
"grad_norm": 0.791697614971569,
"kl": 0.280670166015625,
"learning_rate": 1.1284085043048465e-06,
"loss": 0.0112,
"reward": 1.05625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.95625,
"step": 3905
},
{
"completion_length": 262.68125,
"epoch": 0.8635993429135435,
"grad_norm": 0.5051810763575918,
"kl": 0.27977294921875,
"learning_rate": 1.1106820193853484e-06,
"loss": 0.0112,
"reward": 1.0125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.075,
"rewards/format_reward": 0.9375,
"step": 3910
},
{
"completion_length": 251.48125,
"epoch": 0.8647036898993664,
"grad_norm": 0.5025881602992487,
"kl": 0.273834228515625,
"learning_rate": 1.0930876839914418e-06,
"loss": 0.011,
"reward": 1.06875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95625,
"step": 3915
},
{
"completion_length": 236.35625,
"epoch": 0.8658080368851894,
"grad_norm": 0.4129347471678857,
"kl": 0.2613372802734375,
"learning_rate": 1.0756257596823427e-06,
"loss": 0.0105,
"reward": 1.075,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.95625,
"step": 3920
},
{
"completion_length": 265.86875,
"epoch": 0.8669123838710123,
"grad_norm": 0.4235003049667533,
"kl": 0.253765869140625,
"learning_rate": 1.058296506048836e-06,
"loss": 0.0101,
"reward": 1.1,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.95,
"step": 3925
},
{
"completion_length": 232.975,
"epoch": 0.8680167308568352,
"grad_norm": 0.37693409083366114,
"kl": 0.2826416015625,
"learning_rate": 1.04110018070941e-06,
"loss": 0.0113,
"reward": 1.14375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.16875,
"rewards/format_reward": 0.975,
"step": 3930
},
{
"completion_length": 256.3375,
"epoch": 0.8691210778426581,
"grad_norm": 0.47005147974118267,
"kl": 0.28515625,
"learning_rate": 1.0240370393064235e-06,
"loss": 0.0114,
"reward": 1.125,
"reward_std": 0.2298096999526024,
"rewards/accuracy_reward": 0.175,
"rewards/format_reward": 0.95,
"step": 3935
},
{
"completion_length": 258.5625,
"epoch": 0.8702254248284811,
"grad_norm": 0.27709333139181463,
"kl": 0.31121826171875,
"learning_rate": 1.0071073355023097e-06,
"loss": 0.0124,
"reward": 1.0875,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.93125,
"step": 3940
},
{
"completion_length": 227.4875,
"epoch": 0.871329771814304,
"grad_norm": 0.2761772502885486,
"kl": 0.301312255859375,
"learning_rate": 9.903113209758098e-07,
"loss": 0.012,
"reward": 1.11875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.96875,
"step": 3945
},
{
"completion_length": 225.5875,
"epoch": 0.872434118800127,
"grad_norm": 0.26781461171540255,
"kl": 0.31710205078125,
"learning_rate": 9.736492454182211e-07,
"loss": 0.0127,
"reward": 1.1,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.9625,
"step": 3950
},
{
"completion_length": 241.3375,
"epoch": 0.87353846578595,
"grad_norm": 1.1383280325532497,
"kl": 0.262933349609375,
"learning_rate": 9.571213565296877e-07,
"loss": 0.0105,
"reward": 1.075,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.9625,
"step": 3955
},
{
"completion_length": 233.5,
"epoch": 0.8746428127717729,
"grad_norm": 0.29444920945103936,
"kl": 0.333721923828125,
"learning_rate": 9.407279000155311e-07,
"loss": 0.0133,
"reward": 1.075,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.93125,
"step": 3960
},
{
"completion_length": 219.39375,
"epoch": 0.8757471597575959,
"grad_norm": 0.42276681745389866,
"kl": 0.26292724609375,
"learning_rate": 9.244691195825794e-07,
"loss": 0.0105,
"reward": 1.1375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.1625,
"rewards/format_reward": 0.975,
"step": 3965
},
{
"completion_length": 269.35625,
"epoch": 0.8768515067434188,
"grad_norm": 0.5714466190012454,
"kl": 0.2780029296875,
"learning_rate": 9.0834525693555e-07,
"loss": 0.0111,
"reward": 1.0625,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9375,
"step": 3970
},
{
"completion_length": 221.74375,
"epoch": 0.8779558537292417,
"grad_norm": 0.5132142260680984,
"kl": 0.23480224609375,
"learning_rate": 8.923565517734633e-07,
"loss": 0.0094,
"reward": 1.09375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.98125,
"step": 3975
},
{
"completion_length": 239.19375,
"epoch": 0.8790602007150646,
"grad_norm": 0.6209622546123578,
"kl": 0.246160888671875,
"learning_rate": 8.765032417860753e-07,
"loss": 0.0099,
"reward": 1.15625,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.19375,
"rewards/format_reward": 0.9625,
"step": 3980
},
{
"completion_length": 230.28125,
"epoch": 0.8801645477008876,
"grad_norm": 0.4959744957429339,
"kl": 0.328594970703125,
"learning_rate": 8.607855626503403e-07,
"loss": 0.0132,
"reward": 1.1125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1625,
"rewards/format_reward": 0.95,
"step": 3985
},
{
"completion_length": 242.23125,
"epoch": 0.8812688946867105,
"grad_norm": 0.6000894011738015,
"kl": 0.264337158203125,
"learning_rate": 8.452037480269082e-07,
"loss": 0.0106,
"reward": 1.09375,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.95,
"step": 3990
},
{
"completion_length": 241.79375,
"epoch": 0.8823732416725335,
"grad_norm": 0.5856288163785148,
"kl": 0.274072265625,
"learning_rate": 8.297580295566576e-07,
"loss": 0.011,
"reward": 1.0375,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.95,
"step": 3995
},
{
"completion_length": 228.125,
"epoch": 0.8834775886583565,
"grad_norm": 0.7727469678433277,
"kl": 0.239056396484375,
"learning_rate": 8.144486368572468e-07,
"loss": 0.0096,
"reward": 1.1875,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.2125,
"rewards/format_reward": 0.975,
"step": 4000
},
{
"epoch": 0.8834775886583565,
"eval_completion_length": 240.635,
"eval_kl": 0.28673828125,
"eval_loss": 0.011466315016150475,
"eval_reward": 1.09,
"eval_reward_std": 0.16970562398433686,
"eval_rewards/accuracy_reward": 0.14,
"eval_rewards/format_reward": 0.95,
"eval_runtime": 124.8847,
"eval_samples_per_second": 0.793,
"eval_steps_per_second": 0.2,
"step": 4000
},
{
"completion_length": 222.24375,
"epoch": 0.8845819356441794,
"grad_norm": 0.13203636774859265,
"kl": 0.271759033203125,
"learning_rate": 7.992757975196974e-07,
"loss": 0.0109,
"reward": 1.10625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.975,
"step": 4005
},
{
"completion_length": 217.46875,
"epoch": 0.8856862826300024,
"grad_norm": 0.3836137257482129,
"kl": 0.25550537109375,
"learning_rate": 7.842397371050181e-07,
"loss": 0.0102,
"reward": 1.075,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.975,
"step": 4010
},
{
"completion_length": 264.93125,
"epoch": 0.8867906296158253,
"grad_norm": 0.35191809685250214,
"kl": 0.23974609375,
"learning_rate": 7.693406791408476e-07,
"loss": 0.0096,
"reward": 1.09375,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.95625,
"step": 4015
},
{
"completion_length": 241.44375,
"epoch": 0.8878949766016483,
"grad_norm": 0.4672837627346682,
"kl": 0.26492919921875,
"learning_rate": 7.545788451181313e-07,
"loss": 0.0106,
"reward": 1.0625,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9625,
"step": 4020
},
{
"completion_length": 277.9875,
"epoch": 0.8889993235874711,
"grad_norm": 0.7088610794073225,
"kl": 0.29766845703125,
"learning_rate": 7.399544544878268e-07,
"loss": 0.0119,
"reward": 1.06875,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.94375,
"step": 4025
},
{
"completion_length": 226.96875,
"epoch": 0.8901036705732941,
"grad_norm": 0.29863896309113996,
"kl": 0.246087646484375,
"learning_rate": 7.25467724657647e-07,
"loss": 0.0098,
"reward": 1.09375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.98125,
"step": 4030
},
{
"completion_length": 261.29375,
"epoch": 0.891208017559117,
"grad_norm": 0.5220634238395366,
"kl": 0.238995361328125,
"learning_rate": 7.11118870988825e-07,
"loss": 0.0096,
"reward": 1.1125,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.96875,
"step": 4035
},
{
"completion_length": 227.1875,
"epoch": 0.89231236454494,
"grad_norm": 0.468574862123508,
"kl": 0.254180908203125,
"learning_rate": 6.969081067929129e-07,
"loss": 0.0102,
"reward": 1.09375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.98125,
"step": 4040
},
{
"completion_length": 223.36875,
"epoch": 0.893416711530763,
"grad_norm": 0.7105477246802777,
"kl": 0.23565673828125,
"learning_rate": 6.828356433286065e-07,
"loss": 0.0094,
"reward": 1.15625,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.16875,
"rewards/format_reward": 0.9875,
"step": 4045
},
{
"completion_length": 242.2875,
"epoch": 0.8945210585165859,
"grad_norm": 0.3124111487407041,
"kl": 0.272393798828125,
"learning_rate": 6.689016897986123e-07,
"loss": 0.0109,
"reward": 1.09375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.96875,
"step": 4050
},
{
"completion_length": 235.10625,
"epoch": 0.8956254055024089,
"grad_norm": 0.5287755229051584,
"kl": 0.263592529296875,
"learning_rate": 6.551064533465335e-07,
"loss": 0.0105,
"reward": 1.16875,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.2,
"rewards/format_reward": 0.96875,
"step": 4055
},
{
"completion_length": 210.3,
"epoch": 0.8967297524882318,
"grad_norm": 0.3865388105745784,
"kl": 0.243426513671875,
"learning_rate": 6.414501390537875e-07,
"loss": 0.0097,
"reward": 1.0875,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9875,
"step": 4060
},
{
"completion_length": 257.4125,
"epoch": 0.8978340994740548,
"grad_norm": 0.5827223262448566,
"kl": 0.28209228515625,
"learning_rate": 6.279329499365649e-07,
"loss": 0.0113,
"reward": 1.01875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.06875,
"rewards/format_reward": 0.95,
"step": 4065
},
{
"completion_length": 245.89375,
"epoch": 0.8989384464598776,
"grad_norm": 0.5568021946499023,
"kl": 0.329107666015625,
"learning_rate": 6.14555086942804e-07,
"loss": 0.0132,
"reward": 1.05625,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.94375,
"step": 4070
},
{
"completion_length": 270.7,
"epoch": 0.9000427934457006,
"grad_norm": 0.8265886341669334,
"kl": 0.343658447265625,
"learning_rate": 6.013167489492089e-07,
"loss": 0.0137,
"reward": 1.0375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9375,
"step": 4075
},
{
"completion_length": 235.875,
"epoch": 0.9011471404315236,
"grad_norm": 0.15521466379213147,
"kl": 0.21239013671875,
"learning_rate": 5.88218132758287e-07,
"loss": 0.0085,
"reward": 1.09375,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.9875,
"step": 4080
},
{
"completion_length": 263.94375,
"epoch": 0.9022514874173465,
"grad_norm": 0.3565020657376661,
"kl": 0.248944091796875,
"learning_rate": 5.752594330954275e-07,
"loss": 0.01,
"reward": 1.0875,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9625,
"step": 4085
},
{
"completion_length": 217.0875,
"epoch": 0.9033558344031695,
"grad_norm": 0.7599338431132417,
"kl": 0.256341552734375,
"learning_rate": 5.624408426060124e-07,
"loss": 0.0103,
"reward": 1.09375,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.99375,
"step": 4090
},
{
"completion_length": 240.49375,
"epoch": 0.9044601813889924,
"grad_norm": 0.3404631065084141,
"kl": 0.26585693359375,
"learning_rate": 5.497625518525374e-07,
"loss": 0.0106,
"reward": 1.1,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.96875,
"step": 4095
},
{
"completion_length": 240.90625,
"epoch": 0.9055645283748154,
"grad_norm": 0.4830591822507376,
"kl": 0.2419189453125,
"learning_rate": 5.372247493117921e-07,
"loss": 0.0097,
"reward": 1.0375,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.95625,
"step": 4100
},
{
"epoch": 0.9055645283748154,
"eval_completion_length": 236.99,
"eval_kl": 0.26658203125,
"eval_loss": 0.01068319845944643,
"eval_reward": 1.13,
"eval_reward_std": 0.1838477599620819,
"eval_rewards/accuracy_reward": 0.16,
"eval_rewards/format_reward": 0.97,
"eval_runtime": 112.8111,
"eval_samples_per_second": 0.878,
"eval_steps_per_second": 0.222,
"step": 4100
},
{
"completion_length": 213.69375,
"epoch": 0.9066688753606383,
"grad_norm": 0.0963839062331533,
"kl": 0.2247802734375,
"learning_rate": 5.248276213720526e-07,
"loss": 0.009,
"reward": 1.11875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.98125,
"step": 4105
},
{
"completion_length": 235.7625,
"epoch": 0.9077732223464613,
"grad_norm": 0.3586489435080358,
"kl": 113.0680419921875,
"learning_rate": 5.125713523303133e-07,
"loss": 4.5501,
"reward": 1.08125,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.9625,
"step": 4110
},
{
"completion_length": 245.6875,
"epoch": 0.9088775693322843,
"grad_norm": 0.5993938735102521,
"kl": 0.2639892578125,
"learning_rate": 5.004561243895433e-07,
"loss": 0.0106,
"reward": 1.09375,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.95625,
"step": 4115
},
{
"completion_length": 238.9875,
"epoch": 0.9099819163181071,
"grad_norm": 0.7059681339718733,
"kl": 0.25078125,
"learning_rate": 4.884821176559817e-07,
"loss": 0.01,
"reward": 1.09375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.95625,
"step": 4120
},
{
"completion_length": 214.73125,
"epoch": 0.91108626330393,
"grad_norm": 0.5836610939153032,
"kl": 0.248724365234375,
"learning_rate": 4.7664951013645875e-07,
"loss": 0.01,
"reward": 1.1,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.98125,
"step": 4125
},
{
"completion_length": 287.74375,
"epoch": 0.912190610289753,
"grad_norm": 0.5029836450413667,
"kl": 0.338372802734375,
"learning_rate": 4.649584777357452e-07,
"loss": 0.0135,
"reward": 1.0375,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9375,
"step": 4130
},
{
"completion_length": 238.50625,
"epoch": 0.913294957275576,
"grad_norm": 0.3679025891536969,
"kl": 0.2593505859375,
"learning_rate": 4.534091942539476e-07,
"loss": 0.0104,
"reward": 1.06875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.96875,
"step": 4135
},
{
"completion_length": 243.9,
"epoch": 0.9143993042613989,
"grad_norm": 0.36542863601047937,
"kl": 0.2465576171875,
"learning_rate": 4.420018313839147e-07,
"loss": 0.0099,
"reward": 1.1625,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.2,
"rewards/format_reward": 0.9625,
"step": 4140
},
{
"completion_length": 244.55625,
"epoch": 0.9155036512472219,
"grad_norm": 0.6654427169718511,
"kl": 0.266680908203125,
"learning_rate": 4.3073655870869093e-07,
"loss": 0.0107,
"reward": 1.09375,
"reward_std": 0.22097086533904076,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.95625,
"step": 4145
},
{
"completion_length": 261.3625,
"epoch": 0.9166079982330448,
"grad_norm": 0.4755302093000005,
"kl": 0.246258544921875,
"learning_rate": 4.1961354369898675e-07,
"loss": 0.0099,
"reward": 1.1,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.96875,
"step": 4150
},
{
"completion_length": 238.525,
"epoch": 0.9177123452188678,
"grad_norm": 0.3919488350765311,
"kl": 0.269342041015625,
"learning_rate": 4.086329517107046e-07,
"loss": 0.0108,
"reward": 1.175,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.2125,
"rewards/format_reward": 0.9625,
"step": 4155
},
{
"completion_length": 259.00625,
"epoch": 0.9188166922046908,
"grad_norm": 0.4320891208444687,
"kl": 0.283648681640625,
"learning_rate": 3.9779494598246484e-07,
"loss": 0.0113,
"reward": 1.1125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.95625,
"step": 4160
},
{
"completion_length": 261.85625,
"epoch": 0.9199210391905137,
"grad_norm": 0.4840697345614265,
"kl": 0.2813232421875,
"learning_rate": 3.8709968763318894e-07,
"loss": 0.0113,
"reward": 1.0875,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.95,
"step": 4165
},
{
"completion_length": 244.525,
"epoch": 0.9210253861763366,
"grad_norm": 0.29181391006128693,
"kl": 0.273553466796875,
"learning_rate": 3.7654733565969826e-07,
"loss": 0.0109,
"reward": 1.1125,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.9625,
"step": 4170
},
{
"completion_length": 263.0,
"epoch": 0.9221297331621595,
"grad_norm": 0.6600185394674087,
"kl": 0.275067138671875,
"learning_rate": 3.661380469343556e-07,
"loss": 0.011,
"reward": 1.01875,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.91875,
"step": 4175
},
{
"completion_length": 257.275,
"epoch": 0.9232340801479825,
"grad_norm": 0.3026772238703725,
"kl": 0.2882568359375,
"learning_rate": 3.558719762027307e-07,
"loss": 0.0115,
"reward": 1.075,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.9625,
"step": 4180
},
{
"completion_length": 238.7,
"epoch": 0.9243384271338054,
"grad_norm": 0.5128635762934807,
"kl": 0.256884765625,
"learning_rate": 3.457492760812975e-07,
"loss": 0.0103,
"reward": 1.0625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.94375,
"step": 4185
},
{
"completion_length": 271.00625,
"epoch": 0.9254427741196284,
"grad_norm": 0.41061824834878685,
"kl": 0.3115478515625,
"learning_rate": 3.357700970551681e-07,
"loss": 0.0125,
"reward": 1.0875,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9625,
"step": 4190
},
{
"completion_length": 236.125,
"epoch": 0.9265471211054513,
"grad_norm": 0.6143144747308046,
"kl": 0.2658935546875,
"learning_rate": 3.2593458747585683e-07,
"loss": 0.0106,
"reward": 1.0625,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9625,
"step": 4195
},
{
"completion_length": 263.36875,
"epoch": 0.9276514680912743,
"grad_norm": 0.31447425015692126,
"kl": 0.238623046875,
"learning_rate": 3.1624289355907334e-07,
"loss": 0.0095,
"reward": 1.0875,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.975,
"step": 4200
},
{
"epoch": 0.9276514680912743,
"eval_completion_length": 263.69,
"eval_kl": 0.3008984375,
"eval_loss": 0.012053935788571835,
"eval_reward": 1.095,
"eval_reward_std": 0.1767766922712326,
"eval_rewards/accuracy_reward": 0.15,
"eval_rewards/format_reward": 0.945,
"eval_runtime": 126.8567,
"eval_samples_per_second": 0.78,
"eval_steps_per_second": 0.197,
"step": 4200
},
{
"completion_length": 227.49375,
"epoch": 0.9287558150770973,
"grad_norm": 0.3848576143966496,
"kl": 0.260791015625,
"learning_rate": 3.0669515938254404e-07,
"loss": 0.0104,
"reward": 1.08125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.96875,
"step": 4205
},
{
"completion_length": 242.55625,
"epoch": 0.9298601620629202,
"grad_norm": 0.660183404776511,
"kl": 0.3072021484375,
"learning_rate": 2.972915268838794e-07,
"loss": 0.0123,
"reward": 1.11875,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.9625,
"step": 4210
},
{
"completion_length": 237.59375,
"epoch": 0.9309645090487431,
"grad_norm": 0.38237841247452214,
"kl": 0.236932373046875,
"learning_rate": 2.8803213585846036e-07,
"loss": 0.0095,
"reward": 1.09375,
"reward_std": 0.07954951152205467,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.98125,
"step": 4215
},
{
"completion_length": 254.3125,
"epoch": 0.932068856034566,
"grad_norm": 0.938722788620333,
"kl": 0.304193115234375,
"learning_rate": 2.7891712395735513e-07,
"loss": 0.0122,
"reward": 1.03125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.94375,
"step": 4220
},
{
"completion_length": 270.375,
"epoch": 0.933173203020389,
"grad_norm": 0.6446127146773467,
"kl": 0.32706298828125,
"learning_rate": 2.699466266852779e-07,
"loss": 0.0131,
"reward": 1.05625,
"reward_std": 0.23864853456616403,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.925,
"step": 4225
},
{
"completion_length": 225.74375,
"epoch": 0.9342775500062119,
"grad_norm": 0.514239970716185,
"kl": 0.2426910400390625,
"learning_rate": 2.6112077739857465e-07,
"loss": 0.0097,
"reward": 1.0875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9625,
"step": 4230
},
{
"completion_length": 249.1625,
"epoch": 0.9353818969920349,
"grad_norm": 0.4120891261838415,
"kl": 0.245001220703125,
"learning_rate": 2.524397073032403e-07,
"loss": 0.0098,
"reward": 1.05,
"reward_std": 0.10606601536273956,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.96875,
"step": 4235
},
{
"completion_length": 240.0125,
"epoch": 0.9364862439778578,
"grad_norm": 0.4432984558834709,
"kl": 0.27457275390625,
"learning_rate": 2.4390354545296257e-07,
"loss": 0.011,
"reward": 1.09375,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.9625,
"step": 4240
},
{
"completion_length": 216.80625,
"epoch": 0.9375905909636808,
"grad_norm": 0.2646324742458765,
"kl": 0.221075439453125,
"learning_rate": 2.3551241874721353e-07,
"loss": 0.0088,
"reward": 1.10625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.99375,
"step": 4245
},
{
"completion_length": 247.775,
"epoch": 0.9386949379495038,
"grad_norm": 0.183761058738616,
"kl": 0.28399658203125,
"learning_rate": 2.272664519293566e-07,
"loss": 0.0114,
"reward": 1.1,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.96875,
"step": 4250
},
{
"completion_length": 245.94375,
"epoch": 0.9397992849353267,
"grad_norm": 0.49084283791698335,
"kl": 0.25836181640625,
"learning_rate": 2.1916576758478913e-07,
"loss": 0.0103,
"reward": 1.06875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.98125,
"step": 4255
},
{
"completion_length": 255.21875,
"epoch": 0.9409036319211497,
"grad_norm": 0.5752245613234309,
"kl": 0.304534912109375,
"learning_rate": 2.1121048613912843e-07,
"loss": 0.0122,
"reward": 1.08125,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.94375,
"step": 4260
},
{
"completion_length": 248.875,
"epoch": 0.9420079789069725,
"grad_norm": 0.4954935572252636,
"kl": 0.280999755859375,
"learning_rate": 2.0340072585641523e-07,
"loss": 0.0112,
"reward": 1.11875,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.9625,
"step": 4265
},
{
"completion_length": 274.95,
"epoch": 0.9431123258927955,
"grad_norm": 0.6291854725110926,
"kl": 0.3195556640625,
"learning_rate": 1.9573660283735974e-07,
"loss": 0.0128,
"reward": 1.09375,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.9375,
"step": 4270
},
{
"completion_length": 227.425,
"epoch": 0.9442166728786184,
"grad_norm": 0.4759307581588166,
"kl": 0.221356201171875,
"learning_rate": 1.8821823101760949e-07,
"loss": 0.0089,
"reward": 1.10625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.975,
"step": 4275
},
{
"completion_length": 222.7625,
"epoch": 0.9453210198644414,
"grad_norm": 0.5007795680830986,
"kl": 0.27132568359375,
"learning_rate": 1.8084572216606422e-07,
"loss": 0.0109,
"reward": 1.1875,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.19375,
"rewards/format_reward": 0.99375,
"step": 4280
},
{
"completion_length": 243.20625,
"epoch": 0.9464253668502643,
"grad_norm": 0.43659783572888766,
"kl": 0.2771484375,
"learning_rate": 1.736191858832048e-07,
"loss": 0.0111,
"reward": 1.1125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1625,
"rewards/format_reward": 0.95,
"step": 4285
},
{
"completion_length": 230.93125,
"epoch": 0.9475297138360873,
"grad_norm": 0.5036319473035266,
"kl": 0.315594482421875,
"learning_rate": 1.665387295994747e-07,
"loss": 0.0126,
"reward": 1.05,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.95,
"step": 4290
},
{
"completion_length": 257.44375,
"epoch": 0.9486340608219103,
"grad_norm": 0.21837455215402343,
"kl": 0.318695068359375,
"learning_rate": 1.5960445857367003e-07,
"loss": 0.0128,
"reward": 1.08125,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.96875,
"step": 4295
},
{
"completion_length": 282.2,
"epoch": 0.9497384078077332,
"grad_norm": 0.8008170036383372,
"kl": 0.27298583984375,
"learning_rate": 1.5281647589138527e-07,
"loss": 0.0109,
"reward": 1.01875,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.9375,
"step": 4300
},
{
"epoch": 0.9497384078077332,
"eval_completion_length": 243.03,
"eval_kl": 0.26423828125,
"eval_loss": 0.010587015189230442,
"eval_reward": 1.15,
"eval_reward_std": 0.1697056245803833,
"eval_rewards/accuracy_reward": 0.17,
"eval_rewards/format_reward": 0.98,
"eval_runtime": 110.5252,
"eval_samples_per_second": 0.896,
"eval_steps_per_second": 0.226,
"step": 4300
},
{
"completion_length": 233.6625,
"epoch": 0.9508427547935562,
"grad_norm": 0.7236286239926621,
"kl": 0.329180908203125,
"learning_rate": 1.4617488246348012e-07,
"loss": 0.0132,
"reward": 1.06875,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95625,
"step": 4305
},
{
"completion_length": 236.18125,
"epoch": 0.9519471017793791,
"grad_norm": 0.5259274208633408,
"kl": 0.270782470703125,
"learning_rate": 1.3967977702456946e-07,
"loss": 0.0108,
"reward": 1.08125,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.95625,
"step": 4310
},
{
"completion_length": 222.0125,
"epoch": 0.953051448765202,
"grad_norm": 0.4716816022245921,
"kl": 0.269232177734375,
"learning_rate": 1.3333125613156695e-07,
"loss": 0.0108,
"reward": 1.11875,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.975,
"step": 4315
},
{
"completion_length": 254.275,
"epoch": 0.9541557957510249,
"grad_norm": 0.3798073604709502,
"kl": 0.29171142578125,
"learning_rate": 1.271294141622459e-07,
"loss": 0.0117,
"reward": 1.08125,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.95625,
"step": 4320
},
{
"completion_length": 263.34375,
"epoch": 0.9552601427368479,
"grad_norm": 0.39932023814829193,
"kl": 0.31627197265625,
"learning_rate": 1.2107434331383504e-07,
"loss": 0.0126,
"reward": 1.075,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.93125,
"step": 4325
},
{
"completion_length": 233.03125,
"epoch": 0.9563644897226709,
"grad_norm": 0.915012168079641,
"kl": 0.30823974609375,
"learning_rate": 1.1516613360164408e-07,
"loss": 0.0123,
"reward": 1.1125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.96875,
"step": 4330
},
{
"completion_length": 235.96875,
"epoch": 0.9574688367084938,
"grad_norm": 0.6431925020300033,
"kl": 0.26207275390625,
"learning_rate": 1.094048728577346e-07,
"loss": 0.0105,
"reward": 1.0625,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.9625,
"step": 4335
},
{
"completion_length": 227.89375,
"epoch": 0.9585731836943168,
"grad_norm": 0.2197375620895538,
"kl": 0.22823486328125,
"learning_rate": 1.0379064672960793e-07,
"loss": 0.0091,
"reward": 1.1375,
"reward_std": 0.12374368458986282,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.99375,
"step": 4340
},
{
"completion_length": 236.425,
"epoch": 0.9596775306801397,
"grad_norm": 0.24464788267487614,
"kl": 0.237347412109375,
"learning_rate": 9.832353867893385e-08,
"loss": 0.0095,
"reward": 1.05625,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.95625,
"step": 4345
},
{
"completion_length": 222.8,
"epoch": 0.9607818776659627,
"grad_norm": 0.6551456803073707,
"kl": 0.2460235595703125,
"learning_rate": 9.300362998030832e-08,
"loss": 0.0098,
"reward": 1.175,
"reward_std": 0.21213203072547912,
"rewards/accuracy_reward": 0.2125,
"rewards/format_reward": 0.9625,
"step": 4350
},
{
"completion_length": 268.76875,
"epoch": 0.9618862246517856,
"grad_norm": 0.592425806579998,
"kl": 0.27703857421875,
"learning_rate": 8.783099972004882e-08,
"loss": 0.0111,
"reward": 1.0125,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.91875,
"step": 4355
},
{
"completion_length": 212.39375,
"epoch": 0.9629905716376085,
"grad_norm": 0.46261395016364687,
"kl": 0.278302001953125,
"learning_rate": 8.280572479501426e-08,
"loss": 0.0111,
"reward": 1.125,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.96875,
"step": 4360
},
{
"completion_length": 242.06875,
"epoch": 0.9640949186234314,
"grad_norm": 0.28486230034584065,
"kl": 0.22513427734375,
"learning_rate": 7.792787991146356e-08,
"loss": 0.009,
"reward": 1.09375,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.98125,
"step": 4365
},
{
"completion_length": 240.1875,
"epoch": 0.9651992656092544,
"grad_norm": 0.43886555296427354,
"kl": 0.30863037109375,
"learning_rate": 7.319753758394665e-08,
"loss": 0.0123,
"reward": 1.0375,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.94375,
"step": 4370
},
{
"completion_length": 223.30625,
"epoch": 0.9663036125950774,
"grad_norm": 0.542732303860562,
"kl": 0.27510986328125,
"learning_rate": 6.861476813422419e-08,
"loss": 0.011,
"reward": 1.06875,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.98125,
"step": 4375
},
{
"completion_length": 269.04375,
"epoch": 0.9674079595809003,
"grad_norm": 0.35561046298902815,
"kl": 0.350921630859375,
"learning_rate": 6.417963969022389e-08,
"loss": 0.014,
"reward": 1.04375,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.93125,
"step": 4380
},
{
"completion_length": 294.5625,
"epoch": 0.9685123065667233,
"grad_norm": 0.6237843740200061,
"kl": 0.262664794921875,
"learning_rate": 5.989221818502478e-08,
"loss": 0.0105,
"reward": 1.10625,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.95,
"step": 4385
},
{
"completion_length": 257.58125,
"epoch": 0.9696166535525462,
"grad_norm": 0.6942753855581536,
"kl": 0.319189453125,
"learning_rate": 5.5752567355883415e-08,
"loss": 0.0128,
"reward": 1.06875,
"reward_std": 0.15026018843054773,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.95,
"step": 4390
},
{
"completion_length": 259.525,
"epoch": 0.9707210005383692,
"grad_norm": 0.47467571553923327,
"kl": 0.288873291015625,
"learning_rate": 5.176074874327919e-08,
"loss": 0.0116,
"reward": 1.1,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.9625,
"step": 4395
},
{
"completion_length": 247.05,
"epoch": 0.9718253475241921,
"grad_norm": 0.47878465160417794,
"kl": 0.256573486328125,
"learning_rate": 4.791682169000056e-08,
"loss": 0.0103,
"reward": 1.09375,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.975,
"step": 4400
},
{
"epoch": 0.9718253475241921,
"eval_completion_length": 254.0,
"eval_kl": 0.27548828125,
"eval_loss": 0.011038653552532196,
"eval_reward": 1.11,
"eval_reward_std": 0.1838477599620819,
"eval_rewards/accuracy_reward": 0.16,
"eval_rewards/format_reward": 0.95,
"eval_runtime": 128.6135,
"eval_samples_per_second": 0.77,
"eval_steps_per_second": 0.194,
"step": 4400
},
{
"completion_length": 288.8375,
"epoch": 0.9729296945100151,
"grad_norm": 0.6464757294619069,
"kl": 0.3235595703125,
"learning_rate": 4.4220843340269105e-08,
"loss": 0.0129,
"reward": 1.025,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.9125,
"step": 4405
},
{
"completion_length": 245.01875,
"epoch": 0.9740340414958379,
"grad_norm": 0.6385791474694716,
"kl": 0.33304443359375,
"learning_rate": 4.067286863888131e-08,
"loss": 0.0133,
"reward": 1.10625,
"reward_std": 0.2563262037932873,
"rewards/accuracy_reward": 0.16875,
"rewards/format_reward": 0.9375,
"step": 4410
},
{
"completion_length": 219.36875,
"epoch": 0.9751383884816609,
"grad_norm": 0.6252687285926767,
"kl": 0.28475341796875,
"learning_rate": 3.727295033040035e-08,
"loss": 0.0114,
"reward": 1.11875,
"reward_std": 0.09722718074917794,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.99375,
"step": 4415
},
{
"completion_length": 266.64375,
"epoch": 0.9762427354674839,
"grad_norm": 0.4059885139351556,
"kl": 0.2508056640625,
"learning_rate": 3.402113895836445e-08,
"loss": 0.01,
"reward": 1.05,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.1,
"rewards/format_reward": 0.95,
"step": 4420
},
{
"completion_length": 234.075,
"epoch": 0.9773470824533068,
"grad_norm": 0.5571412291452437,
"kl": 0.258990478515625,
"learning_rate": 3.091748286453866e-08,
"loss": 0.0104,
"reward": 1.125,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.96875,
"step": 4425
},
{
"completion_length": 245.19375,
"epoch": 0.9784514294391298,
"grad_norm": 0.48222048728786027,
"kl": 0.279296875,
"learning_rate": 2.796202818819871e-08,
"loss": 0.0112,
"reward": 1.06875,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.1125,
"rewards/format_reward": 0.95625,
"step": 4430
},
{
"completion_length": 266.23125,
"epoch": 0.9795557764249527,
"grad_norm": 0.33073986574713815,
"kl": 0.2812255859375,
"learning_rate": 2.5154818865440466e-08,
"loss": 0.0113,
"reward": 1.09375,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.94375,
"step": 4435
},
{
"completion_length": 243.86875,
"epoch": 0.9806601234107757,
"grad_norm": 0.5979836605535603,
"kl": 0.3093017578125,
"learning_rate": 2.2495896628529355e-08,
"loss": 0.0124,
"reward": 1.10625,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.95625,
"step": 4440
},
{
"completion_length": 234.6375,
"epoch": 0.9817644703965986,
"grad_norm": 0.71956275463976,
"kl": 0.267864990234375,
"learning_rate": 1.9985301005280843e-08,
"loss": 0.0107,
"reward": 1.09375,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.11875,
"rewards/format_reward": 0.975,
"step": 4445
},
{
"completion_length": 250.375,
"epoch": 0.9828688173824216,
"grad_norm": 0.4915100209116087,
"kl": 0.26923828125,
"learning_rate": 1.7623069318469797e-08,
"loss": 0.0108,
"reward": 1.025,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.9375,
"step": 4450
},
{
"completion_length": 238.46875,
"epoch": 0.9839731643682446,
"grad_norm": 0.4934697459512418,
"kl": 0.263037109375,
"learning_rate": 1.5409236685277608e-08,
"loss": 0.0105,
"reward": 1.09375,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.9625,
"step": 4455
},
{
"completion_length": 242.4625,
"epoch": 0.9850775113540674,
"grad_norm": 0.7032342242514465,
"kl": 0.2882568359375,
"learning_rate": 1.3343836016772582e-08,
"loss": 0.0115,
"reward": 1.0625,
"reward_std": 0.1414213538169861,
"rewards/accuracy_reward": 0.10625,
"rewards/format_reward": 0.95625,
"step": 4460
},
{
"completion_length": 253.40625,
"epoch": 0.9861818583398904,
"grad_norm": 0.40012002703514815,
"kl": 0.28798828125,
"learning_rate": 1.1426898017412591e-08,
"loss": 0.0115,
"reward": 1.10625,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.175,
"rewards/format_reward": 0.93125,
"step": 4465
},
{
"completion_length": 231.40625,
"epoch": 0.9872862053257133,
"grad_norm": 0.5527803201803545,
"kl": 0.273876953125,
"learning_rate": 9.658451184600959e-09,
"loss": 0.0109,
"reward": 1.08125,
"reward_std": 0.16793785765767097,
"rewards/accuracy_reward": 0.13125,
"rewards/format_reward": 0.95,
"step": 4470
},
{
"completion_length": 276.0125,
"epoch": 0.9883905523115363,
"grad_norm": 0.5553612160206977,
"kl": 0.271929931640625,
"learning_rate": 8.038521808249045e-09,
"loss": 0.0109,
"reward": 1.05625,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.93125,
"step": 4475
},
{
"completion_length": 269.39375,
"epoch": 0.9894948992973592,
"grad_norm": 0.38046769308040707,
"kl": 0.31407470703125,
"learning_rate": 6.567133970397654e-09,
"loss": 0.0126,
"reward": 1.03125,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.0875,
"rewards/format_reward": 0.94375,
"step": 4480
},
{
"completion_length": 253.03125,
"epoch": 0.9905992462831822,
"grad_norm": 0.25828108036325964,
"kl": 0.36680908203125,
"learning_rate": 5.2443095448506674e-09,
"loss": 0.0147,
"reward": 1.025,
"reward_std": 0.15909902304410933,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.93125,
"step": 4485
},
{
"completion_length": 242.60625,
"epoch": 0.9917035932690051,
"grad_norm": 0.3615879163519988,
"kl": 0.23807373046875,
"learning_rate": 4.070068196853072e-09,
"loss": 0.0095,
"reward": 1.10625,
"reward_std": 0.13258251920342445,
"rewards/accuracy_reward": 0.1375,
"rewards/format_reward": 0.96875,
"step": 4490
},
{
"completion_length": 271.5,
"epoch": 0.9928079402548281,
"grad_norm": 0.31635821168080147,
"kl": 0.35159912109375,
"learning_rate": 3.0444273828000857e-09,
"loss": 0.0141,
"reward": 1.075,
"reward_std": 0.19445436149835588,
"rewards/accuracy_reward": 0.14375,
"rewards/format_reward": 0.93125,
"step": 4495
},
{
"completion_length": 233.03125,
"epoch": 0.9939122872406511,
"grad_norm": 0.30099890850035027,
"kl": 0.325830078125,
"learning_rate": 2.167402349972925e-09,
"loss": 0.013,
"reward": 1.0375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.08125,
"rewards/format_reward": 0.95625,
"step": 4500
},
{
"epoch": 0.9939122872406511,
"eval_completion_length": 251.83,
"eval_kl": 0.32880859375,
"eval_loss": 0.013173764571547508,
"eval_reward": 1.115,
"eval_reward_std": 0.19091882765293122,
"eval_rewards/accuracy_reward": 0.16,
"eval_rewards/format_reward": 0.955,
"eval_runtime": 141.5462,
"eval_samples_per_second": 0.699,
"eval_steps_per_second": 0.177,
"step": 4500
},
{
"completion_length": 207.225,
"epoch": 0.9950166342264739,
"grad_norm": 0.7679754782825252,
"kl": 0.2902587890625,
"learning_rate": 1.4390061363189767e-09,
"loss": 0.0116,
"reward": 1.11875,
"reward_std": 0.18561552688479424,
"rewards/accuracy_reward": 0.15,
"rewards/format_reward": 0.96875,
"step": 4505
},
{
"completion_length": 278.71875,
"epoch": 0.9961209812122969,
"grad_norm": 0.44631107231254136,
"kl": 0.279425048828125,
"learning_rate": 8.592495702497427e-10,
"loss": 0.0112,
"reward": 1.0,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.90625,
"step": 4510
},
{
"completion_length": 246.39375,
"epoch": 0.9972253281981198,
"grad_norm": 0.36414142285577833,
"kl": 0.30335693359375,
"learning_rate": 4.2814127048873553e-10,
"loss": 0.0121,
"reward": 1.05625,
"reward_std": 0.1149048499763012,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.9625,
"step": 4515
},
{
"completion_length": 258.74375,
"epoch": 0.9983296751839428,
"grad_norm": 0.5750843977179474,
"kl": 0.405145263671875,
"learning_rate": 1.4568764593603235e-10,
"loss": 0.0162,
"reward": 1.05,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.95625,
"step": 4520
},
{
"completion_length": 261.95,
"epoch": 0.9994340221697657,
"grad_norm": 0.5025685502837802,
"kl": 0.259100341796875,
"learning_rate": 1.1892895576126606e-11,
"loss": 0.0104,
"reward": 1.10625,
"reward_std": 0.20329319611191748,
"rewards/accuracy_reward": 0.1625,
"rewards/format_reward": 0.94375,
"step": 4525
},
{
"completion_length": 248.6875,
"epoch": 0.9998757609640949,
"kl": 0.22357177734375,
"reward": 1.171875,
"reward_std": 0.19887377880513668,
"rewards/accuracy_reward": 0.203125,
"rewards/format_reward": 0.96875,
"step": 4527,
"total_flos": 0.0,
"train_loss": 28.747999461705767,
"train_runtime": 163973.669,
"train_samples_per_second": 0.442,
"train_steps_per_second": 0.028
}
],
"logging_steps": 5,
"max_steps": 4527,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}