{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.981333333333333, "eval_steps": 10, "global_step": 138, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 479.2916717529297, "epoch": 0.021333333333333333, "grad_norm": 0.08470216393470764, "learning_rate": 2.1428571428571428e-07, "loss": 0.0578, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 595.5416793823242, "epoch": 0.042666666666666665, "grad_norm": 0.09772875905036926, "learning_rate": 4.2857142857142857e-07, "loss": 0.0281, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 508.16667556762695, "epoch": 0.064, "grad_norm": 0.07093458622694016, "learning_rate": 6.428571428571428e-07, "loss": -0.0183, "reward": 0.0833333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 514.1875190734863, "epoch": 0.08533333333333333, "grad_norm": 0.04470205307006836, "learning_rate": 8.571428571428571e-07, "loss": -0.0086, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 576.4791793823242, "epoch": 0.10666666666666667, "grad_norm": 0.06627917289733887, "learning_rate": 1.0714285714285716e-06, "loss": 0.0005, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 497.37501525878906, "epoch": 0.128, "grad_norm": 0.03331312909722328, "learning_rate": 1.2857142857142856e-06, "loss": -0.0039, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 404.66667556762695, "epoch": 0.14933333333333335, "grad_norm": 0.0803137719631195, "learning_rate": 1.5e-06, "loss": -0.0083, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 575.4583511352539, "epoch": 0.17066666666666666, "grad_norm": 0.046666789799928665, "learning_rate": 1.7142857142857143e-06, "loss": -0.0055, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 535.0000076293945, "epoch": 0.192, "grad_norm": 0.06731698662042618, "learning_rate": 1.928571428571429e-06, "loss": 0.001, "reward": 0.12500000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.0, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 510.6666793823242, "epoch": 0.21333333333333335, "grad_norm": 0.1065572202205658, "learning_rate": 2.142857142857143e-06, "loss": -0.0209, "reward": 0.1458333358168602, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 516.1875228881836, "epoch": 0.23466666666666666, "grad_norm": 0.06277307868003845, "learning_rate": 2.357142857142857e-06, "loss": -0.0154, "reward": 0.18750000186264515, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.02083333395421505, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 479.93751525878906, "epoch": 0.256, "grad_norm": 0.06429535895586014, "learning_rate": 2.571428571428571e-06, "loss": -0.0103, "reward": 0.14583333767950535, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.14583333767950535, "rewards/format_reward": 0.0, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 532.8125190734863, "epoch": 0.2773333333333333, "grad_norm": 0.05405157431960106, "learning_rate": 2.785714285714286e-06, "loss": 0.0394, "reward": 0.10416666977107525, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 448.45834732055664, "epoch": 0.2986666666666667, "grad_norm": 0.08196503669023514, "learning_rate": 3e-06, "loss": -0.0514, "reward": 0.2083333395421505, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.02083333395421505, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 562.4791889190674, "epoch": 0.32, "grad_norm": 0.07421501725912094, "learning_rate": 2.999518612944646e-06, "loss": -0.049, "reward": 0.1666666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 489.729190826416, "epoch": 0.3413333333333333, "grad_norm": 0.07382796704769135, "learning_rate": 2.9980747607565792e-06, "loss": 0.0417, "reward": 0.2083333358168602, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 459.58334159851074, "epoch": 0.3626666666666667, "grad_norm": 0.09509492665529251, "learning_rate": 2.995669370171471e-06, "loss": 0.0036, "reward": 0.10416666977107525, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 507.5000171661377, "epoch": 0.384, "grad_norm": 0.07961481809616089, "learning_rate": 2.9923039850878425e-06, "loss": -0.032, "reward": 0.29166667349636555, "reward_std": 0.32475952059030533, "rewards/accuracy_reward": 0.27083333767950535, "rewards/format_reward": 0.02083333395421505, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 510.0625190734863, "epoch": 0.4053333333333333, "grad_norm": 0.08553481101989746, "learning_rate": 2.9879807655761146e-06, "loss": 0.0163, "reward": 0.20833333767950535, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.20833333767950535, "rewards/format_reward": 0.0, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 390.06251335144043, "epoch": 0.4266666666666667, "grad_norm": 0.0976102352142334, "learning_rate": 2.982702486492167e-06, "loss": -0.0533, "reward": 0.35416667722165585, "reward_std": 0.32475952059030533, "rewards/accuracy_reward": 0.35416667722165585, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 523.666675567627, "epoch": 0.448, "grad_norm": 0.06446705013513565, "learning_rate": 2.9764725356963015e-06, "loss": -0.0544, "reward": 0.1875000037252903, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.0, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 358.5833435058594, "epoch": 0.4693333333333333, "grad_norm": 0.10436850041151047, "learning_rate": 2.969294911878742e-06, "loss": -0.049, "reward": 0.5208333469927311, "reward_std": 0.3608439117670059, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.02083333395421505, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 430.6041793823242, "epoch": 0.49066666666666664, "grad_norm": 0.06835100054740906, "learning_rate": 2.9611742219930806e-06, "loss": -0.0144, "reward": 0.3125000037252903, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.3125000037252903, "rewards/format_reward": 0.0, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 427.1041831970215, "epoch": 0.512, "grad_norm": 0.08951476961374283, "learning_rate": 2.9521156782993067e-06, "loss": -0.0214, "reward": 0.416666679084301, "reward_std": 0.3608439117670059, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 500.52084732055664, "epoch": 0.5333333333333333, "grad_norm": 0.04994317516684532, "learning_rate": 2.942125095018319e-06, "loss": 0.0059, "reward": 0.5000000111758709, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 447.3125114440918, "epoch": 0.5546666666666666, "grad_norm": 0.08734725415706635, "learning_rate": 2.9312088846000733e-06, "loss": -0.0767, "reward": 0.5000000055879354, "reward_std": 0.3608439117670059, "rewards/accuracy_reward": 0.5000000055879354, "rewards/format_reward": 0.0, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 488.0833435058594, "epoch": 0.576, "grad_norm": 0.052988629788160324, "learning_rate": 2.9193740536077556e-06, "loss": -0.017, "reward": 0.645833345130086, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6250000093132257, "rewards/format_reward": 0.02083333395421505, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 502.14584732055664, "epoch": 0.5973333333333334, "grad_norm": 0.05281541496515274, "learning_rate": 2.906628198220621e-06, "loss": -0.0444, "reward": 0.479166679084301, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 409.04167556762695, "epoch": 0.6186666666666667, "grad_norm": 0.06433824449777603, "learning_rate": 2.8929794993583936e-06, "loss": 0.0073, "reward": 0.6250000186264515, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.6250000186264515, "rewards/format_reward": 0.0, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 564.7083473205566, "epoch": 0.64, "grad_norm": 0.05183090269565582, "learning_rate": 2.878436717430346e-06, "loss": -0.0141, "reward": 0.5625000074505806, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 530.3750190734863, "epoch": 0.6613333333333333, "grad_norm": 0.08775021880865097, "learning_rate": 2.8630091867124373e-06, "loss": -0.0283, "reward": 0.5000000074505806, "reward_std": 0.5051814764738083, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 477.12502670288086, "epoch": 0.6826666666666666, "grad_norm": 0.044543083757162094, "learning_rate": 2.846706809356113e-06, "loss": 0.0011, "reward": 0.8333333507180214, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.8333333507180214, "rewards/format_reward": 0.0, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 449.5000114440918, "epoch": 0.704, "grad_norm": 0.04804208129644394, "learning_rate": 2.8295400490326126e-06, "loss": 0.0198, "reward": 0.7916666828095913, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7916666828095913, "rewards/format_reward": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 408.1041736602783, "epoch": 0.7253333333333334, "grad_norm": 0.05096372961997986, "learning_rate": 2.811519924216873e-06, "loss": 0.0343, "reward": 0.6458333469927311, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6458333469927311, "rewards/format_reward": 0.0, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 429.2916717529297, "epoch": 0.7466666666666667, "grad_norm": 0.057229503989219666, "learning_rate": 2.7926580011153244e-06, "loss": 0.0138, "reward": 0.7500000223517418, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.7500000223517418, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 478.27084732055664, "epoch": 0.768, "grad_norm": 0.03784911707043648, "learning_rate": 2.7729663862421267e-06, "loss": 0.0228, "reward": 0.6458333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 435.93750762939453, "epoch": 0.7893333333333333, "grad_norm": 0.03586390241980553, "learning_rate": 2.7524577186486113e-06, "loss": 0.0256, "reward": 0.8750000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.8750000074505806, "rewards/format_reward": 0.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 541.3958511352539, "epoch": 0.8106666666666666, "grad_norm": 0.042199280112981796, "learning_rate": 2.731145161810915e-06, "loss": 0.0552, "reward": 0.645833345130086, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.645833345130086, "rewards/format_reward": 0.0, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 498.04167556762695, "epoch": 0.832, "grad_norm": 0.05263036862015724, "learning_rate": 2.709042395181008e-06, "loss": 0.0344, "reward": 0.7916666865348816, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 447.14584732055664, "epoch": 0.8533333333333334, "grad_norm": 0.06595506519079208, "learning_rate": 2.6861636054065477e-06, "loss": 0.0311, "reward": 0.7291666828095913, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7291666828095913, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 425.18751525878906, "epoch": 0.8746666666666667, "grad_norm": 0.03698687627911568, "learning_rate": 2.6625234772251882e-06, "loss": -0.0078, "reward": 0.6875000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 495.3125228881836, "epoch": 0.896, "grad_norm": 0.053743649274110794, "learning_rate": 2.6381371840391863e-06, "loss": 0.0458, "reward": 0.7083333469927311, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.7083333469927311, "rewards/format_reward": 0.0, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 533.1875152587891, "epoch": 0.9173333333333333, "grad_norm": 0.05754838138818741, "learning_rate": 2.6130203781763665e-06, "loss": 0.0311, "reward": 0.6041666809469461, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6041666809469461, "rewards/format_reward": 0.0, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 481.1041793823242, "epoch": 0.9386666666666666, "grad_norm": 0.038248177617788315, "learning_rate": 2.58718918084368e-06, "loss": 0.0152, "reward": 0.6875000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 543.2291717529297, "epoch": 0.96, "grad_norm": 0.043112609535455704, "learning_rate": 2.5606601717798212e-06, "loss": 0.0341, "reward": 0.6250000223517418, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 502.66668701171875, "epoch": 0.9813333333333333, "grad_norm": 0.05423992499709129, "learning_rate": 2.53345037861353e-06, "loss": 0.0196, "reward": 0.6250000204890966, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.6250000204890966, "rewards/format_reward": 0.0, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 464.9166831970215, "epoch": 1.0213333333333334, "grad_norm": 0.052206046879291534, "learning_rate": 2.5055772659344177e-06, "loss": 0.0226, "reward": 0.7291666902601719, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7291666902601719, "rewards/format_reward": 0.0, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 525.8958473205566, "epoch": 1.0426666666666666, "grad_norm": 0.06632654368877411, "learning_rate": 2.477058724083334e-06, "loss": 0.0391, "reward": 0.7708333544433117, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.7708333544433117, "rewards/format_reward": 0.0, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 529.2916870117188, "epoch": 1.064, "grad_norm": 0.06171787902712822, "learning_rate": 2.447913057669456e-06, "loss": 0.0185, "reward": 0.6875000186264515, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.6875000186264515, "rewards/format_reward": 0.0, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 425.08334159851074, "epoch": 1.0853333333333333, "grad_norm": 0.04282655939459801, "learning_rate": 2.4181589738214946e-06, "loss": 0.0057, "reward": 0.7708333507180214, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 515.3541870117188, "epoch": 1.1066666666666667, "grad_norm": 0.05828641727566719, "learning_rate": 2.3878155701805258e-06, "loss": -0.0319, "reward": 0.6250000223517418, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 401.5833435058594, "epoch": 1.1280000000000001, "grad_norm": 0.0363980270922184, "learning_rate": 2.3569023226421886e-06, "loss": -0.0209, "reward": 0.8750000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8750000074505806, "rewards/format_reward": 0.0, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 445.5208435058594, "epoch": 1.1493333333333333, "grad_norm": 0.07585529237985611, "learning_rate": 2.325439072856087e-06, "loss": 0.0295, "reward": 0.7291666865348816, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 457.5833549499512, "epoch": 1.1706666666666667, "grad_norm": 0.048082854598760605, "learning_rate": 2.2934460154904436e-06, "loss": 0.0327, "reward": 0.7500000186264515, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7500000186264515, "rewards/format_reward": 0.0, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 492.0000190734863, "epoch": 1.192, "grad_norm": 0.05505705624818802, "learning_rate": 2.2609436852701614e-06, "loss": -0.0311, "reward": 0.6250000167638063, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.6250000167638063, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 454.6250114440918, "epoch": 1.2133333333333334, "grad_norm": 0.04047110304236412, "learning_rate": 2.227952943796622e-06, "loss": -0.0065, "reward": 0.7708333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7708333395421505, "rewards/format_reward": 0.0, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 505.6250114440918, "epoch": 1.2346666666666666, "grad_norm": 0.0817456841468811, "learning_rate": 2.194494966157681e-06, "loss": 0.0409, "reward": 0.7083333414047956, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7083333414047956, "rewards/format_reward": 0.0, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 555.2708473205566, "epoch": 1.256, "grad_norm": 0.06136218458414078, "learning_rate": 2.160591227336452e-06, "loss": -0.0545, "reward": 0.666666679084301, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 497.60418128967285, "epoch": 1.2773333333333334, "grad_norm": 0.03282522037625313, "learning_rate": 2.126263488427595e-06, "loss": 0.0007, "reward": 0.7500000111758709, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7500000111758709, "rewards/format_reward": 0.0, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 435.3750114440918, "epoch": 1.2986666666666666, "grad_norm": 0.08471012115478516, "learning_rate": 2.091533782669978e-06, "loss": -0.0221, "reward": 0.7916666846722364, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7916666846722364, "rewards/format_reward": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 560.4583549499512, "epoch": 1.32, "grad_norm": 0.050180453807115555, "learning_rate": 2.0564244013046517e-06, "loss": 0.0034, "reward": 0.6250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.6250000037252903, "rewards/format_reward": 0.0, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 489.43751525878906, "epoch": 1.3413333333333333, "grad_norm": 0.07682310044765472, "learning_rate": 2.0209578792672304e-06, "loss": -0.0011, "reward": 0.6666666865348816, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 371.00000762939453, "epoch": 1.3626666666666667, "grad_norm": 0.045041777193546295, "learning_rate": 1.9851569807238573e-06, "loss": -0.0184, "reward": 0.6458333414047956, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.6458333414047956, "rewards/format_reward": 0.0, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 456.08334732055664, "epoch": 1.384, "grad_norm": 0.04616188630461693, "learning_rate": 1.9490446844600373e-06, "loss": 0.0165, "reward": 0.5416666772216558, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5416666772216558, "rewards/format_reward": 0.0, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 429.7083396911621, "epoch": 1.4053333333333333, "grad_norm": 0.06254471838474274, "learning_rate": 1.912644169131717e-06, "loss": 0.0056, "reward": 0.7500000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 440.5833549499512, "epoch": 1.4266666666666667, "grad_norm": 0.058094874024391174, "learning_rate": 1.875978798388081e-06, "loss": 0.0496, "reward": 0.8333333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 417.60417556762695, "epoch": 1.448, "grad_norm": 0.032374057918787, "learning_rate": 1.8390721058756023e-06, "loss": 0.0034, "reward": 0.7500000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7500000074505806, "rewards/format_reward": 0.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 492.6250190734863, "epoch": 1.4693333333333334, "grad_norm": 0.057136889547109604, "learning_rate": 1.8019477801329903e-06, "loss": 0.0007, "reward": 0.7500000223517418, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7500000223517418, "rewards/format_reward": 0.0, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 469.52084732055664, "epoch": 1.4906666666666666, "grad_norm": 0.06064446642994881, "learning_rate": 1.764629649386713e-06, "loss": 0.0015, "reward": 0.6875000260770321, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.6875000260770321, "rewards/format_reward": 0.0, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 512.8125190734863, "epoch": 1.512, "grad_norm": 0.08758383989334106, "learning_rate": 1.7271416662568652e-06, "loss": 0.0576, "reward": 0.6666666865348816, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 439.18750953674316, "epoch": 1.5333333333333332, "grad_norm": 0.06427132338285446, "learning_rate": 1.6895078923831942e-06, "loss": 0.0174, "reward": 0.770833358168602, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 487.6041831970215, "epoch": 1.5546666666666666, "grad_norm": 0.05160915106534958, "learning_rate": 1.6517524829811483e-06, "loss": -0.0068, "reward": 0.7083333544433117, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7083333544433117, "rewards/format_reward": 0.0, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 436.27084732055664, "epoch": 1.576, "grad_norm": 0.05624426528811455, "learning_rate": 1.6138996713378693e-06, "loss": 0.0278, "reward": 0.791666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.791666679084301, "rewards/format_reward": 0.0, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 467.6250114440918, "epoch": 1.5973333333333333, "grad_norm": 0.07060783356428146, "learning_rate": 1.5759737532580691e-06, "loss": 0.0241, "reward": 0.8541666865348816, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 492.8958549499512, "epoch": 1.6186666666666667, "grad_norm": 0.07051456719636917, "learning_rate": 1.5379990714697819e-06, "loss": 0.002, "reward": 0.6666666828095913, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6666666828095913, "rewards/format_reward": 0.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 474.81251335144043, "epoch": 1.6400000000000001, "grad_norm": 0.07857351005077362, "learning_rate": 1.5e-06, "loss": -0.0008, "reward": 0.645833345130086, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.645833345130086, "rewards/format_reward": 0.0, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 620.7708549499512, "epoch": 1.6613333333333333, "grad_norm": 0.03268231451511383, "learning_rate": 1.4620009285302184e-06, "loss": -0.0008, "reward": 0.4583333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.4583333358168602, "rewards/format_reward": 0.0, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 490.1250190734863, "epoch": 1.6826666666666665, "grad_norm": 0.06123334541916847, "learning_rate": 1.4240262467419312e-06, "loss": 0.0209, "reward": 0.6666666734963655, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6666666734963655, "rewards/format_reward": 0.0, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 414.6250114440918, "epoch": 1.704, "grad_norm": 0.051681999117136, "learning_rate": 1.386100328662131e-06, "loss": 0.0246, "reward": 0.8750000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 519.4583473205566, "epoch": 1.7253333333333334, "grad_norm": 0.053687114268541336, "learning_rate": 1.348247517018852e-06, "loss": 0.0154, "reward": 0.7500000111758709, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7500000111758709, "rewards/format_reward": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 476.6875114440918, "epoch": 1.7466666666666666, "grad_norm": 0.05704135075211525, "learning_rate": 1.3104921076168067e-06, "loss": 0.0145, "reward": 0.7708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 520.9791831970215, "epoch": 1.768, "grad_norm": 0.09103868156671524, "learning_rate": 1.2728583337431355e-06, "loss": 0.0513, "reward": 0.687500013038516, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.687500013038516, "rewards/format_reward": 0.0, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 512.7500114440918, "epoch": 1.7893333333333334, "grad_norm": 0.12537769973278046, "learning_rate": 1.2353703506132877e-06, "loss": 0.0442, "reward": 0.6666666865348816, "reward_std": 0.3608439117670059, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 424.41667556762695, "epoch": 1.8106666666666666, "grad_norm": 0.067196786403656, "learning_rate": 1.1980522198670096e-06, "loss": -0.018, "reward": 0.7708333507180214, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.0, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 470.0000114440918, "epoch": 1.8319999999999999, "grad_norm": 0.05638742074370384, "learning_rate": 1.1609278941243977e-06, "loss": -0.0033, "reward": 0.6041666734963655, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.6041666734963655, "rewards/format_reward": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 412.75000762939453, "epoch": 1.8533333333333335, "grad_norm": 0.07567507773637772, "learning_rate": 1.1240212016119191e-06, "loss": 0.0322, "reward": 0.791666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.791666679084301, "rewards/format_reward": 0.0, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 515.2500114440918, "epoch": 1.8746666666666667, "grad_norm": 0.10538745671510696, "learning_rate": 1.087355830868283e-06, "loss": 0.022, "reward": 0.520833345130086, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.520833345130086, "rewards/format_reward": 0.0, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 472.25001525878906, "epoch": 1.896, "grad_norm": 0.08544166386127472, "learning_rate": 1.050955315539963e-06, "loss": -0.0006, "reward": 0.8333333395421505, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.8333333395421505, "rewards/format_reward": 0.0, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 436.6250057220459, "epoch": 1.9173333333333333, "grad_norm": 0.2055334597826004, "learning_rate": 1.0148430192761428e-06, "loss": 0.0073, "reward": 0.7291666902601719, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.7291666902601719, "rewards/format_reward": 0.0, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 531.9791831970215, "epoch": 1.9386666666666668, "grad_norm": 0.274494469165802, "learning_rate": 9.790421207327699e-07, "loss": -0.0495, "reward": 0.5833333469927311, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5833333469927311, "rewards/format_reward": 0.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 416.54168128967285, "epoch": 1.96, "grad_norm": 0.22600480914115906, "learning_rate": 9.435755986953485e-07, "loss": -0.0098, "reward": 0.6666666846722364, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.6666666846722364, "rewards/format_reward": 0.0, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 455.68750762939453, "epoch": 1.9813333333333332, "grad_norm": 0.1384221911430359, "learning_rate": 9.084662173300225e-07, "loss": 0.0305, "reward": 0.6875000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 435.54167556762695, "epoch": 2.021333333333333, "grad_norm": 0.18297049403190613, "learning_rate": 8.737365115724057e-07, "loss": 0.0126, "reward": 0.7708333618938923, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.7708333618938923, "rewards/format_reward": 0.0, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 446.9166793823242, "epoch": 2.042666666666667, "grad_norm": 0.1745564043521881, "learning_rate": 8.394087726635485e-07, "loss": 0.0131, "reward": 0.7291666772216558, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.7291666772216558, "rewards/format_reward": 0.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 464.5416793823242, "epoch": 2.064, "grad_norm": 0.148799866437912, "learning_rate": 8.055050338423189e-07, "loss": -0.004, "reward": 0.5833333358168602, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5833333358168602, "rewards/format_reward": 0.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 507.0416736602783, "epoch": 2.0853333333333333, "grad_norm": 0.2118462324142456, "learning_rate": 7.720470562033787e-07, "loss": -0.0077, "reward": 0.6666666865348816, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 519.4166793823242, "epoch": 2.1066666666666665, "grad_norm": 0.39051565527915955, "learning_rate": 7.390563147298395e-07, "loss": 0.0254, "reward": 0.4791666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 559.4166793823242, "epoch": 2.128, "grad_norm": 0.785881519317627, "learning_rate": 7.065539845095568e-07, "loss": 0.0403, "reward": 0.6666666828095913, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.6666666828095913, "rewards/format_reward": 0.0, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 441.60417556762695, "epoch": 2.1493333333333333, "grad_norm": 0.40288758277893066, "learning_rate": 6.74560927143913e-07, "loss": 0.0043, "reward": 0.7083333488553762, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.7083333488553762, "rewards/format_reward": 0.0, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 396.6041793823242, "epoch": 2.1706666666666665, "grad_norm": 0.3716258704662323, "learning_rate": 6.430976773578113e-07, "loss": -0.0253, "reward": 0.8125000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 467.5416831970215, "epoch": 2.192, "grad_norm": 0.38535118103027344, "learning_rate": 6.12184429819474e-07, "loss": 0.0074, "reward": 0.8125000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.8125000074505806, "rewards/format_reward": 0.0, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 428.43750762939453, "epoch": 2.2133333333333334, "grad_norm": 0.6874639391899109, "learning_rate": 5.818410261785057e-07, "loss": 0.0067, "reward": 0.7083333469927311, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7083333469927311, "rewards/format_reward": 0.0, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 446.1458435058594, "epoch": 2.2346666666666666, "grad_norm": 0.8092372417449951, "learning_rate": 5.520869423305442e-07, "loss": 0.0204, "reward": 0.5000000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5000000037252903, "rewards/format_reward": 0.0, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 452.89584732055664, "epoch": 2.2560000000000002, "grad_norm": 0.8611705303192139, "learning_rate": 5.22941275916667e-07, "loss": 0.0223, "reward": 0.7708333414047956, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7708333414047956, "rewards/format_reward": 0.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 524.5833511352539, "epoch": 2.2773333333333334, "grad_norm": 0.9545259475708008, "learning_rate": 4.944227340655821e-07, "loss": 0.0134, "reward": 0.7083333469927311, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7083333469927311, "rewards/format_reward": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 575.4791793823242, "epoch": 2.2986666666666666, "grad_norm": 1.5368934869766235, "learning_rate": 4.6654962138647007e-07, "loss": 0.0175, "reward": 0.5833333395421505, "reward_std": 0.32475952059030533, "rewards/accuracy_reward": 0.5833333395421505, "rewards/format_reward": 0.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 479.0625114440918, "epoch": 2.32, "grad_norm": 1.5656981468200684, "learning_rate": 4.3933982822017883e-07, "loss": -0.001, "reward": 0.625000013038516, "reward_std": 0.3608439117670059, "rewards/accuracy_reward": 0.625000013038516, "rewards/format_reward": 0.0, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 512.9791793823242, "epoch": 2.3413333333333335, "grad_norm": 1.6603009700775146, "learning_rate": 4.1281081915632036e-07, "loss": 0.0282, "reward": 0.500000013038516, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.500000013038516, "rewards/format_reward": 0.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 518.1041831970215, "epoch": 2.3626666666666667, "grad_norm": 1.1787548065185547, "learning_rate": 3.869796218236342e-07, "loss": 0.0138, "reward": 0.5208333488553762, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.5208333488553762, "rewards/format_reward": 0.0, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 421.3541793823242, "epoch": 2.384, "grad_norm": 1.3255914449691772, "learning_rate": 3.618628159608137e-07, "loss": 0.0306, "reward": 0.7916666772216558, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7916666772216558, "rewards/format_reward": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 369.7083435058594, "epoch": 2.405333333333333, "grad_norm": 0.6214256286621094, "learning_rate": 3.374765227748119e-07, "loss": 0.0086, "reward": 0.8541666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.8541666716337204, "rewards/format_reward": 0.0, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 432.4166831970215, "epoch": 2.4266666666666667, "grad_norm": 2.1950066089630127, "learning_rate": 3.1383639459345236e-07, "loss": 0.0008, "reward": 0.6875000111758709, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.6875000111758709, "rewards/format_reward": 0.0, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 414.9375114440918, "epoch": 2.448, "grad_norm": 1.6744318008422852, "learning_rate": 2.909576048189928e-07, "loss": 0.0052, "reward": 0.7916666865348816, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 497.7083511352539, "epoch": 2.469333333333333, "grad_norm": 1.6432876586914062, "learning_rate": 2.688548381890859e-07, "loss": -0.0057, "reward": 0.5416666809469461, "reward_std": 0.32475952059030533, "rewards/accuracy_reward": 0.5416666809469461, "rewards/format_reward": 0.0, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 356.45834732055664, "epoch": 2.490666666666667, "grad_norm": 1.4244107007980347, "learning_rate": 2.475422813513891e-07, "loss": -0.019, "reward": 0.854166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.854166679084301, "rewards/format_reward": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 393.85417556762695, "epoch": 2.512, "grad_norm": 0.5893205404281616, "learning_rate": 2.2703361375787346e-07, "loss": 0.0103, "reward": 0.9791666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 474.7083511352539, "epoch": 2.533333333333333, "grad_norm": 1.3833318948745728, "learning_rate": 2.0734199888467554e-07, "loss": -0.0016, "reward": 0.7916666753590107, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7916666753590107, "rewards/format_reward": 0.0, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 526.9791870117188, "epoch": 2.554666666666667, "grad_norm": 2.314704656600952, "learning_rate": 1.8848007578312686e-07, "loss": 0.0038, "reward": 0.5416666753590107, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.5416666753590107, "rewards/format_reward": 0.0, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 500.9166946411133, "epoch": 2.576, "grad_norm": 1.5002299547195435, "learning_rate": 1.7045995096738782e-07, "loss": -0.0083, "reward": 0.687500013038516, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.687500013038516, "rewards/format_reward": 0.0, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 470.12500381469727, "epoch": 2.5973333333333333, "grad_norm": 1.2943968772888184, "learning_rate": 1.5329319064388763e-07, "loss": 0.0078, "reward": 0.770833345130086, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.770833345130086, "rewards/format_reward": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 458.37501335144043, "epoch": 2.618666666666667, "grad_norm": 1.0128077268600464, "learning_rate": 1.3699081328756263e-07, "loss": 0.0287, "reward": 0.7708333488553762, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7708333488553762, "rewards/format_reward": 0.0, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 520.0208511352539, "epoch": 2.64, "grad_norm": 1.820874571800232, "learning_rate": 1.215632825696541e-07, "loss": 0.0105, "reward": 0.6875000149011612, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 465.0625114440918, "epoch": 2.6613333333333333, "grad_norm": 2.199171781539917, "learning_rate": 1.0702050064160684e-07, "loss": 0.0138, "reward": 0.6041666772216558, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6041666772216558, "rewards/format_reward": 0.0, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 500.1458435058594, "epoch": 2.6826666666666665, "grad_norm": 1.681441307067871, "learning_rate": 9.337180177937954e-08, "loss": 0.0313, "reward": 0.5416666753590107, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5416666753590107, "rewards/format_reward": 0.0, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 665.0208549499512, "epoch": 2.7039999999999997, "grad_norm": 1.198270559310913, "learning_rate": 8.062594639224469e-08, "loss": -0.0049, "reward": 0.4375000111758709, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.4375000111758709, "rewards/format_reward": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 558.812520980835, "epoch": 2.7253333333333334, "grad_norm": 1.729133129119873, "learning_rate": 6.879111539992677e-08, "loss": 0.0174, "reward": 0.5625000111758709, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5625000111758709, "rewards/format_reward": 0.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 479.9166831970215, "epoch": 2.7466666666666666, "grad_norm": 1.7538045644760132, "learning_rate": 5.787490498168141e-08, "loss": -0.0194, "reward": 0.7291666865348816, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 506.7083511352539, "epoch": 2.768, "grad_norm": 1.361107349395752, "learning_rate": 4.788432170069373e-08, "loss": 0.0047, "reward": 0.6250000111758709, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.6250000111758709, "rewards/format_reward": 0.0, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 517.4375076293945, "epoch": 2.7893333333333334, "grad_norm": 1.7667996883392334, "learning_rate": 3.882577800691961e-08, "loss": 0.0192, "reward": 0.6250000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 532.6250114440918, "epoch": 2.8106666666666666, "grad_norm": 1.4511686563491821, "learning_rate": 3.0705088121258276e-08, "loss": 0.0061, "reward": 0.6250000111758709, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6250000111758709, "rewards/format_reward": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 475.79168701171875, "epoch": 2.832, "grad_norm": 1.0118310451507568, "learning_rate": 2.3527464303698676e-08, "loss": 0.0164, "reward": 0.729166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.0, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 477.2291793823242, "epoch": 2.8533333333333335, "grad_norm": 1.7040830850601196, "learning_rate": 1.729751350783293e-08, "loss": 0.0219, "reward": 0.7291666902601719, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.7291666902601719, "rewards/format_reward": 0.0, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 582.9166793823242, "epoch": 2.8746666666666667, "grad_norm": 1.8976448774337769, "learning_rate": 1.2019234423885472e-08, "loss": 0.0403, "reward": 0.5000000111758709, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 390.5416793823242, "epoch": 2.896, "grad_norm": 1.7262071371078491, "learning_rate": 7.696014912157268e-09, "loss": 0.0131, "reward": 0.7500000223517418, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.7500000223517418, "rewards/format_reward": 0.0, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 474.37501525878906, "epoch": 2.9173333333333336, "grad_norm": 2.062682867050171, "learning_rate": 4.330629828528887e-09, "loss": 0.0098, "reward": 0.6875000186264515, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6875000186264515, "rewards/format_reward": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 575.2083473205566, "epoch": 2.9386666666666668, "grad_norm": 2.000577926635742, "learning_rate": 1.9252392434208623e-09, "loss": 0.0269, "reward": 0.6041666809469461, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.6041666809469461, "rewards/format_reward": 0.0, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 487.1666793823242, "epoch": 2.96, "grad_norm": 1.6203181743621826, "learning_rate": 4.81387055354221e-10, "loss": 0.0127, "reward": 0.7291666828095913, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.7291666828095913, "rewards/format_reward": 0.0, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 521.4375095367432, "epoch": 2.981333333333333, "grad_norm": 2.0446486473083496, "learning_rate": 0.0, "loss": 0.0058, "reward": 0.5833333414047956, "reward_std": 0.3608439117670059, "rewards/accuracy_reward": 0.5833333414047956, "rewards/format_reward": 0.0, "step": 138 }, { "epoch": 2.981333333333333, "step": 138, "total_flos": 0.0, "train_loss": 0.00609967172773474, "train_runtime": 8086.2821, "train_samples_per_second": 0.278, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 138, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }