| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.981333333333333, |
| "eval_steps": 10, |
| "global_step": 138, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 479.2916717529297, |
| "epoch": 0.021333333333333333, |
| "grad_norm": 0.08470216393470764, |
| "learning_rate": 2.1428571428571428e-07, |
| "loss": 0.0578, |
| "reward": 0.1250000037252903, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.0, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.5416793823242, |
| "epoch": 0.042666666666666665, |
| "grad_norm": 0.09772875905036926, |
| "learning_rate": 4.2857142857142857e-07, |
| "loss": 0.0281, |
| "reward": 0.0833333358168602, |
| "reward_std": 0.10825317353010178, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/format_reward": 0.0, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.16667556762695, |
| "epoch": 0.064, |
| "grad_norm": 0.07093458622694016, |
| "learning_rate": 6.428571428571428e-07, |
| "loss": -0.0183, |
| "reward": 0.0833333358168602, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/format_reward": 0.0, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.1875190734863, |
| "epoch": 0.08533333333333333, |
| "grad_norm": 0.04470205307006836, |
| "learning_rate": 8.571428571428571e-07, |
| "loss": -0.0086, |
| "reward": 0.0416666679084301, |
| "reward_std": 0.07216878235340118, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/format_reward": 0.0, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 576.4791793823242, |
| "epoch": 0.10666666666666667, |
| "grad_norm": 0.06627917289733887, |
| "learning_rate": 1.0714285714285716e-06, |
| "loss": 0.0005, |
| "reward": 0.1250000037252903, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.0, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 497.37501525878906, |
| "epoch": 0.128, |
| "grad_norm": 0.03331312909722328, |
| "learning_rate": 1.2857142857142856e-06, |
| "loss": -0.0039, |
| "reward": 0.0416666679084301, |
| "reward_std": 0.07216878235340118, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/format_reward": 0.0, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.66667556762695, |
| "epoch": 0.14933333333333335, |
| "grad_norm": 0.0803137719631195, |
| "learning_rate": 1.5e-06, |
| "loss": -0.0083, |
| "reward": 0.1250000037252903, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.0, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.4583511352539, |
| "epoch": 0.17066666666666666, |
| "grad_norm": 0.046666789799928665, |
| "learning_rate": 1.7142857142857143e-06, |
| "loss": -0.0055, |
| "reward": 0.06250000186264515, |
| "reward_std": 0.07216878235340118, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/format_reward": 0.0, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.0000076293945, |
| "epoch": 0.192, |
| "grad_norm": 0.06731698662042618, |
| "learning_rate": 1.928571428571429e-06, |
| "loss": 0.001, |
| "reward": 0.12500000186264515, |
| "reward_std": 0.10825317353010178, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.0, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 510.6666793823242, |
| "epoch": 0.21333333333333335, |
| "grad_norm": 0.1065572202205658, |
| "learning_rate": 2.142857142857143e-06, |
| "loss": -0.0209, |
| "reward": 0.1458333358168602, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/format_reward": 0.0, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 516.1875228881836, |
| "epoch": 0.23466666666666666, |
| "grad_norm": 0.06277307868003845, |
| "learning_rate": 2.357142857142857e-06, |
| "loss": -0.0154, |
| "reward": 0.18750000186264515, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.02083333395421505, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 479.93751525878906, |
| "epoch": 0.256, |
| "grad_norm": 0.06429535895586014, |
| "learning_rate": 2.571428571428571e-06, |
| "loss": -0.0103, |
| "reward": 0.14583333767950535, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/format_reward": 0.0, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 532.8125190734863, |
| "epoch": 0.2773333333333333, |
| "grad_norm": 0.05405157431960106, |
| "learning_rate": 2.785714285714286e-06, |
| "loss": 0.0394, |
| "reward": 0.10416666977107525, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/format_reward": 0.0, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.45834732055664, |
| "epoch": 0.2986666666666667, |
| "grad_norm": 0.08196503669023514, |
| "learning_rate": 3e-06, |
| "loss": -0.0514, |
| "reward": 0.2083333395421505, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/format_reward": 0.02083333395421505, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 562.4791889190674, |
| "epoch": 0.32, |
| "grad_norm": 0.07421501725912094, |
| "learning_rate": 2.999518612944646e-06, |
| "loss": -0.049, |
| "reward": 0.1666666716337204, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/format_reward": 0.0, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.729190826416, |
| "epoch": 0.3413333333333333, |
| "grad_norm": 0.07382796704769135, |
| "learning_rate": 2.9980747607565792e-06, |
| "loss": 0.0417, |
| "reward": 0.2083333358168602, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/format_reward": 0.0, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 459.58334159851074, |
| "epoch": 0.3626666666666667, |
| "grad_norm": 0.09509492665529251, |
| "learning_rate": 2.995669370171471e-06, |
| "loss": 0.0036, |
| "reward": 0.10416666977107525, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/format_reward": 0.0, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 507.5000171661377, |
| "epoch": 0.384, |
| "grad_norm": 0.07961481809616089, |
| "learning_rate": 2.9923039850878425e-06, |
| "loss": -0.032, |
| "reward": 0.29166667349636555, |
| "reward_std": 0.32475952059030533, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/format_reward": 0.02083333395421505, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 510.0625190734863, |
| "epoch": 0.4053333333333333, |
| "grad_norm": 0.08553481101989746, |
| "learning_rate": 2.9879807655761146e-06, |
| "loss": 0.0163, |
| "reward": 0.20833333767950535, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/format_reward": 0.0, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 390.06251335144043, |
| "epoch": 0.4266666666666667, |
| "grad_norm": 0.0976102352142334, |
| "learning_rate": 2.982702486492167e-06, |
| "loss": -0.0533, |
| "reward": 0.35416667722165585, |
| "reward_std": 0.32475952059030533, |
| "rewards/accuracy_reward": 0.35416667722165585, |
| "rewards/format_reward": 0.0, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.666675567627, |
| "epoch": 0.448, |
| "grad_norm": 0.06446705013513565, |
| "learning_rate": 2.9764725356963015e-06, |
| "loss": -0.0544, |
| "reward": 0.1875000037252903, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/format_reward": 0.0, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 358.5833435058594, |
| "epoch": 0.4693333333333333, |
| "grad_norm": 0.10436850041151047, |
| "learning_rate": 2.969294911878742e-06, |
| "loss": -0.049, |
| "reward": 0.5208333469927311, |
| "reward_std": 0.3608439117670059, |
| "rewards/accuracy_reward": 0.5000000111758709, |
| "rewards/format_reward": 0.02083333395421505, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.6041793823242, |
| "epoch": 0.49066666666666664, |
| "grad_norm": 0.06835100054740906, |
| "learning_rate": 2.9611742219930806e-06, |
| "loss": -0.0144, |
| "reward": 0.3125000037252903, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/format_reward": 0.0, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.1041831970215, |
| "epoch": 0.512, |
| "grad_norm": 0.08951476961374283, |
| "learning_rate": 2.9521156782993067e-06, |
| "loss": -0.0214, |
| "reward": 0.416666679084301, |
| "reward_std": 0.3608439117670059, |
| "rewards/accuracy_reward": 0.416666679084301, |
| "rewards/format_reward": 0.0, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.52084732055664, |
| "epoch": 0.5333333333333333, |
| "grad_norm": 0.04994317516684532, |
| "learning_rate": 2.942125095018319e-06, |
| "loss": 0.0059, |
| "reward": 0.5000000111758709, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.5000000111758709, |
| "rewards/format_reward": 0.0, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.3125114440918, |
| "epoch": 0.5546666666666666, |
| "grad_norm": 0.08734725415706635, |
| "learning_rate": 2.9312088846000733e-06, |
| "loss": -0.0767, |
| "reward": 0.5000000055879354, |
| "reward_std": 0.3608439117670059, |
| "rewards/accuracy_reward": 0.5000000055879354, |
| "rewards/format_reward": 0.0, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.0833435058594, |
| "epoch": 0.576, |
| "grad_norm": 0.052988629788160324, |
| "learning_rate": 2.9193740536077556e-06, |
| "loss": -0.017, |
| "reward": 0.645833345130086, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6250000093132257, |
| "rewards/format_reward": 0.02083333395421505, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.14584732055664, |
| "epoch": 0.5973333333333334, |
| "grad_norm": 0.05281541496515274, |
| "learning_rate": 2.906628198220621e-06, |
| "loss": -0.0444, |
| "reward": 0.479166679084301, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.479166679084301, |
| "rewards/format_reward": 0.0, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 409.04167556762695, |
| "epoch": 0.6186666666666667, |
| "grad_norm": 0.06433824449777603, |
| "learning_rate": 2.8929794993583936e-06, |
| "loss": 0.0073, |
| "reward": 0.6250000186264515, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.6250000186264515, |
| "rewards/format_reward": 0.0, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 564.7083473205566, |
| "epoch": 0.64, |
| "grad_norm": 0.05183090269565582, |
| "learning_rate": 2.878436717430346e-06, |
| "loss": -0.0141, |
| "reward": 0.5625000074505806, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.5625000074505806, |
| "rewards/format_reward": 0.0, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 530.3750190734863, |
| "epoch": 0.6613333333333333, |
| "grad_norm": 0.08775021880865097, |
| "learning_rate": 2.8630091867124373e-06, |
| "loss": -0.0283, |
| "reward": 0.5000000074505806, |
| "reward_std": 0.5051814764738083, |
| "rewards/accuracy_reward": 0.5000000074505806, |
| "rewards/format_reward": 0.0, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.12502670288086, |
| "epoch": 0.6826666666666666, |
| "grad_norm": 0.044543083757162094, |
| "learning_rate": 2.846706809356113e-06, |
| "loss": 0.0011, |
| "reward": 0.8333333507180214, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.8333333507180214, |
| "rewards/format_reward": 0.0, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.5000114440918, |
| "epoch": 0.704, |
| "grad_norm": 0.04804208129644394, |
| "learning_rate": 2.8295400490326126e-06, |
| "loss": 0.0198, |
| "reward": 0.7916666828095913, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.7916666828095913, |
| "rewards/format_reward": 0.0, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 408.1041736602783, |
| "epoch": 0.7253333333333334, |
| "grad_norm": 0.05096372961997986, |
| "learning_rate": 2.811519924216873e-06, |
| "loss": 0.0343, |
| "reward": 0.6458333469927311, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6458333469927311, |
| "rewards/format_reward": 0.0, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 429.2916717529297, |
| "epoch": 0.7466666666666667, |
| "grad_norm": 0.057229503989219666, |
| "learning_rate": 2.7926580011153244e-06, |
| "loss": 0.0138, |
| "reward": 0.7500000223517418, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.7500000223517418, |
| "rewards/format_reward": 0.0, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.27084732055664, |
| "epoch": 0.768, |
| "grad_norm": 0.03784911707043648, |
| "learning_rate": 2.7729663862421267e-06, |
| "loss": 0.0228, |
| "reward": 0.6458333432674408, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.6458333432674408, |
| "rewards/format_reward": 0.0, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.93750762939453, |
| "epoch": 0.7893333333333333, |
| "grad_norm": 0.03586390241980553, |
| "learning_rate": 2.7524577186486113e-06, |
| "loss": 0.0256, |
| "reward": 0.8750000074505806, |
| "reward_std": 0.10825317353010178, |
| "rewards/accuracy_reward": 0.8750000074505806, |
| "rewards/format_reward": 0.0, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.3958511352539, |
| "epoch": 0.8106666666666666, |
| "grad_norm": 0.042199280112981796, |
| "learning_rate": 2.731145161810915e-06, |
| "loss": 0.0552, |
| "reward": 0.645833345130086, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.645833345130086, |
| "rewards/format_reward": 0.0, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 498.04167556762695, |
| "epoch": 0.832, |
| "grad_norm": 0.05263036862015724, |
| "learning_rate": 2.709042395181008e-06, |
| "loss": 0.0344, |
| "reward": 0.7916666865348816, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.7916666865348816, |
| "rewards/format_reward": 0.0, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.14584732055664, |
| "epoch": 0.8533333333333334, |
| "grad_norm": 0.06595506519079208, |
| "learning_rate": 2.6861636054065477e-06, |
| "loss": 0.0311, |
| "reward": 0.7291666828095913, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.7291666828095913, |
| "rewards/format_reward": 0.0, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 425.18751525878906, |
| "epoch": 0.8746666666666667, |
| "grad_norm": 0.03698687627911568, |
| "learning_rate": 2.6625234772251882e-06, |
| "loss": -0.0078, |
| "reward": 0.6875000149011612, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.6875000149011612, |
| "rewards/format_reward": 0.0, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 495.3125228881836, |
| "epoch": 0.896, |
| "grad_norm": 0.053743649274110794, |
| "learning_rate": 2.6381371840391863e-06, |
| "loss": 0.0458, |
| "reward": 0.7083333469927311, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.7083333469927311, |
| "rewards/format_reward": 0.0, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.1875152587891, |
| "epoch": 0.9173333333333333, |
| "grad_norm": 0.05754838138818741, |
| "learning_rate": 2.6130203781763665e-06, |
| "loss": 0.0311, |
| "reward": 0.6041666809469461, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6041666809469461, |
| "rewards/format_reward": 0.0, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 481.1041793823242, |
| "epoch": 0.9386666666666666, |
| "grad_norm": 0.038248177617788315, |
| "learning_rate": 2.58718918084368e-06, |
| "loss": 0.0152, |
| "reward": 0.6875000149011612, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6875000149011612, |
| "rewards/format_reward": 0.0, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 543.2291717529297, |
| "epoch": 0.96, |
| "grad_norm": 0.043112609535455704, |
| "learning_rate": 2.5606601717798212e-06, |
| "loss": 0.0341, |
| "reward": 0.6250000223517418, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.6250000223517418, |
| "rewards/format_reward": 0.0, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.66668701171875, |
| "epoch": 0.9813333333333333, |
| "grad_norm": 0.05423992499709129, |
| "learning_rate": 2.53345037861353e-06, |
| "loss": 0.0196, |
| "reward": 0.6250000204890966, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.6250000204890966, |
| "rewards/format_reward": 0.0, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.9166831970215, |
| "epoch": 1.0213333333333334, |
| "grad_norm": 0.052206046879291534, |
| "learning_rate": 2.5055772659344177e-06, |
| "loss": 0.0226, |
| "reward": 0.7291666902601719, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.7291666902601719, |
| "rewards/format_reward": 0.0, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 525.8958473205566, |
| "epoch": 1.0426666666666666, |
| "grad_norm": 0.06632654368877411, |
| "learning_rate": 2.477058724083334e-06, |
| "loss": 0.0391, |
| "reward": 0.7708333544433117, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.7708333544433117, |
| "rewards/format_reward": 0.0, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 529.2916870117188, |
| "epoch": 1.064, |
| "grad_norm": 0.06171787902712822, |
| "learning_rate": 2.447913057669456e-06, |
| "loss": 0.0185, |
| "reward": 0.6875000186264515, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.6875000186264515, |
| "rewards/format_reward": 0.0, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 425.08334159851074, |
| "epoch": 1.0853333333333333, |
| "grad_norm": 0.04282655939459801, |
| "learning_rate": 2.4181589738214946e-06, |
| "loss": 0.0057, |
| "reward": 0.7708333507180214, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7708333507180214, |
| "rewards/format_reward": 0.0, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.3541870117188, |
| "epoch": 1.1066666666666667, |
| "grad_norm": 0.05828641727566719, |
| "learning_rate": 2.3878155701805258e-06, |
| "loss": -0.0319, |
| "reward": 0.6250000223517418, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.6250000223517418, |
| "rewards/format_reward": 0.0, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 401.5833435058594, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.0363980270922184, |
| "learning_rate": 2.3569023226421886e-06, |
| "loss": -0.0209, |
| "reward": 0.8750000074505806, |
| "reward_std": 0.07216878235340118, |
| "rewards/accuracy_reward": 0.8750000074505806, |
| "rewards/format_reward": 0.0, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.5208435058594, |
| "epoch": 1.1493333333333333, |
| "grad_norm": 0.07585529237985611, |
| "learning_rate": 2.325439072856087e-06, |
| "loss": 0.0295, |
| "reward": 0.7291666865348816, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.7291666865348816, |
| "rewards/format_reward": 0.0, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 457.5833549499512, |
| "epoch": 1.1706666666666667, |
| "grad_norm": 0.048082854598760605, |
| "learning_rate": 2.2934460154904436e-06, |
| "loss": 0.0327, |
| "reward": 0.7500000186264515, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7500000186264515, |
| "rewards/format_reward": 0.0, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.0000190734863, |
| "epoch": 1.192, |
| "grad_norm": 0.05505705624818802, |
| "learning_rate": 2.2609436852701614e-06, |
| "loss": -0.0311, |
| "reward": 0.6250000167638063, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.6250000167638063, |
| "rewards/format_reward": 0.0, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 454.6250114440918, |
| "epoch": 1.2133333333333334, |
| "grad_norm": 0.04047110304236412, |
| "learning_rate": 2.227952943796622e-06, |
| "loss": -0.0065, |
| "reward": 0.7708333395421505, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.7708333395421505, |
| "rewards/format_reward": 0.0, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.6250114440918, |
| "epoch": 1.2346666666666666, |
| "grad_norm": 0.0817456841468811, |
| "learning_rate": 2.194494966157681e-06, |
| "loss": 0.0409, |
| "reward": 0.7083333414047956, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.7083333414047956, |
| "rewards/format_reward": 0.0, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 555.2708473205566, |
| "epoch": 1.256, |
| "grad_norm": 0.06136218458414078, |
| "learning_rate": 2.160591227336452e-06, |
| "loss": -0.0545, |
| "reward": 0.666666679084301, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.666666679084301, |
| "rewards/format_reward": 0.0, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 497.60418128967285, |
| "epoch": 1.2773333333333334, |
| "grad_norm": 0.03282522037625313, |
| "learning_rate": 2.126263488427595e-06, |
| "loss": 0.0007, |
| "reward": 0.7500000111758709, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7500000111758709, |
| "rewards/format_reward": 0.0, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.3750114440918, |
| "epoch": 1.2986666666666666, |
| "grad_norm": 0.08471012115478516, |
| "learning_rate": 2.091533782669978e-06, |
| "loss": -0.0221, |
| "reward": 0.7916666846722364, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7916666846722364, |
| "rewards/format_reward": 0.0, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 560.4583549499512, |
| "epoch": 1.32, |
| "grad_norm": 0.050180453807115555, |
| "learning_rate": 2.0564244013046517e-06, |
| "loss": 0.0034, |
| "reward": 0.6250000037252903, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.6250000037252903, |
| "rewards/format_reward": 0.0, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.43751525878906, |
| "epoch": 1.3413333333333333, |
| "grad_norm": 0.07682310044765472, |
| "learning_rate": 2.0209578792672304e-06, |
| "loss": -0.0011, |
| "reward": 0.6666666865348816, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.6666666865348816, |
| "rewards/format_reward": 0.0, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 371.00000762939453, |
| "epoch": 1.3626666666666667, |
| "grad_norm": 0.045041777193546295, |
| "learning_rate": 1.9851569807238573e-06, |
| "loss": -0.0184, |
| "reward": 0.6458333414047956, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.6458333414047956, |
| "rewards/format_reward": 0.0, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.08334732055664, |
| "epoch": 1.384, |
| "grad_norm": 0.04616188630461693, |
| "learning_rate": 1.9490446844600373e-06, |
| "loss": 0.0165, |
| "reward": 0.5416666772216558, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.5416666772216558, |
| "rewards/format_reward": 0.0, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 429.7083396911621, |
| "epoch": 1.4053333333333333, |
| "grad_norm": 0.06254471838474274, |
| "learning_rate": 1.912644169131717e-06, |
| "loss": 0.0056, |
| "reward": 0.7500000149011612, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.7500000149011612, |
| "rewards/format_reward": 0.0, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 440.5833549499512, |
| "epoch": 1.4266666666666667, |
| "grad_norm": 0.058094874024391174, |
| "learning_rate": 1.875978798388081e-06, |
| "loss": 0.0496, |
| "reward": 0.8333333432674408, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.8333333432674408, |
| "rewards/format_reward": 0.0, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 417.60417556762695, |
| "epoch": 1.448, |
| "grad_norm": 0.032374057918787, |
| "learning_rate": 1.8390721058756023e-06, |
| "loss": 0.0034, |
| "reward": 0.7500000074505806, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.7500000074505806, |
| "rewards/format_reward": 0.0, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.6250190734863, |
| "epoch": 1.4693333333333334, |
| "grad_norm": 0.057136889547109604, |
| "learning_rate": 1.8019477801329903e-06, |
| "loss": 0.0007, |
| "reward": 0.7500000223517418, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7500000223517418, |
| "rewards/format_reward": 0.0, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 469.52084732055664, |
| "epoch": 1.4906666666666666, |
| "grad_norm": 0.06064446642994881, |
| "learning_rate": 1.764629649386713e-06, |
| "loss": 0.0015, |
| "reward": 0.6875000260770321, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.6875000260770321, |
| "rewards/format_reward": 0.0, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.8125190734863, |
| "epoch": 1.512, |
| "grad_norm": 0.08758383989334106, |
| "learning_rate": 1.7271416662568652e-06, |
| "loss": 0.0576, |
| "reward": 0.6666666865348816, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.6666666865348816, |
| "rewards/format_reward": 0.0, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.18750953674316, |
| "epoch": 1.5333333333333332, |
| "grad_norm": 0.06427132338285446, |
| "learning_rate": 1.6895078923831942e-06, |
| "loss": 0.0174, |
| "reward": 0.770833358168602, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.770833358168602, |
| "rewards/format_reward": 0.0, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.6041831970215, |
| "epoch": 1.5546666666666666, |
| "grad_norm": 0.05160915106534958, |
| "learning_rate": 1.6517524829811483e-06, |
| "loss": -0.0068, |
| "reward": 0.7083333544433117, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7083333544433117, |
| "rewards/format_reward": 0.0, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.27084732055664, |
| "epoch": 1.576, |
| "grad_norm": 0.05624426528811455, |
| "learning_rate": 1.6138996713378693e-06, |
| "loss": 0.0278, |
| "reward": 0.791666679084301, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.791666679084301, |
| "rewards/format_reward": 0.0, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.6250114440918, |
| "epoch": 1.5973333333333333, |
| "grad_norm": 0.07060783356428146, |
| "learning_rate": 1.5759737532580691e-06, |
| "loss": 0.0241, |
| "reward": 0.8541666865348816, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.8541666865348816, |
| "rewards/format_reward": 0.0, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.8958549499512, |
| "epoch": 1.6186666666666667, |
| "grad_norm": 0.07051456719636917, |
| "learning_rate": 1.5379990714697819e-06, |
| "loss": 0.002, |
| "reward": 0.6666666828095913, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6666666828095913, |
| "rewards/format_reward": 0.0, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.81251335144043, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 0.07857351005077362, |
| "learning_rate": 1.5e-06, |
| "loss": -0.0008, |
| "reward": 0.645833345130086, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.645833345130086, |
| "rewards/format_reward": 0.0, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.7708549499512, |
| "epoch": 1.6613333333333333, |
| "grad_norm": 0.03268231451511383, |
| "learning_rate": 1.4620009285302184e-06, |
| "loss": -0.0008, |
| "reward": 0.4583333358168602, |
| "reward_std": 0.10825317353010178, |
| "rewards/accuracy_reward": 0.4583333358168602, |
| "rewards/format_reward": 0.0, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 490.1250190734863, |
| "epoch": 1.6826666666666665, |
| "grad_norm": 0.06123334541916847, |
| "learning_rate": 1.4240262467419312e-06, |
| "loss": 0.0209, |
| "reward": 0.6666666734963655, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6666666734963655, |
| "rewards/format_reward": 0.0, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 414.6250114440918, |
| "epoch": 1.704, |
| "grad_norm": 0.051681999117136, |
| "learning_rate": 1.386100328662131e-06, |
| "loss": 0.0246, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.8750000149011612, |
| "rewards/format_reward": 0.0, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 519.4583473205566, |
| "epoch": 1.7253333333333334, |
| "grad_norm": 0.053687114268541336, |
| "learning_rate": 1.348247517018852e-06, |
| "loss": 0.0154, |
| "reward": 0.7500000111758709, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.7500000111758709, |
| "rewards/format_reward": 0.0, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.6875114440918, |
| "epoch": 1.7466666666666666, |
| "grad_norm": 0.05704135075211525, |
| "learning_rate": 1.3104921076168067e-06, |
| "loss": 0.0145, |
| "reward": 0.7708333432674408, |
| "reward_std": 0.10825317353010178, |
| "rewards/accuracy_reward": 0.7708333432674408, |
| "rewards/format_reward": 0.0, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.9791831970215, |
| "epoch": 1.768, |
| "grad_norm": 0.09103868156671524, |
| "learning_rate": 1.2728583337431355e-06, |
| "loss": 0.0513, |
| "reward": 0.687500013038516, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.687500013038516, |
| "rewards/format_reward": 0.0, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.7500114440918, |
| "epoch": 1.7893333333333334, |
| "grad_norm": 0.12537769973278046, |
| "learning_rate": 1.2353703506132877e-06, |
| "loss": 0.0442, |
| "reward": 0.6666666865348816, |
| "reward_std": 0.3608439117670059, |
| "rewards/accuracy_reward": 0.6666666865348816, |
| "rewards/format_reward": 0.0, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 424.41667556762695, |
| "epoch": 1.8106666666666666, |
| "grad_norm": 0.067196786403656, |
| "learning_rate": 1.1980522198670096e-06, |
| "loss": -0.018, |
| "reward": 0.7708333507180214, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7708333507180214, |
| "rewards/format_reward": 0.0, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 470.0000114440918, |
| "epoch": 1.8319999999999999, |
| "grad_norm": 0.05638742074370384, |
| "learning_rate": 1.1609278941243977e-06, |
| "loss": -0.0033, |
| "reward": 0.6041666734963655, |
| "reward_std": 0.10825317353010178, |
| "rewards/accuracy_reward": 0.6041666734963655, |
| "rewards/format_reward": 0.0, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 412.75000762939453, |
| "epoch": 1.8533333333333335, |
| "grad_norm": 0.07567507773637772, |
| "learning_rate": 1.1240212016119191e-06, |
| "loss": 0.0322, |
| "reward": 0.791666679084301, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.791666679084301, |
| "rewards/format_reward": 0.0, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.2500114440918, |
| "epoch": 1.8746666666666667, |
| "grad_norm": 0.10538745671510696, |
| "learning_rate": 1.087355830868283e-06, |
| "loss": 0.022, |
| "reward": 0.520833345130086, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.520833345130086, |
| "rewards/format_reward": 0.0, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 472.25001525878906, |
| "epoch": 1.896, |
| "grad_norm": 0.08544166386127472, |
| "learning_rate": 1.050955315539963e-06, |
| "loss": -0.0006, |
| "reward": 0.8333333395421505, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.8333333395421505, |
| "rewards/format_reward": 0.0, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.6250057220459, |
| "epoch": 1.9173333333333333, |
| "grad_norm": 0.2055334597826004, |
| "learning_rate": 1.0148430192761428e-06, |
| "loss": 0.0073, |
| "reward": 0.7291666902601719, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.7291666902601719, |
| "rewards/format_reward": 0.0, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.9791831970215, |
| "epoch": 1.9386666666666668, |
| "grad_norm": 0.274494469165802, |
| "learning_rate": 9.790421207327699e-07, |
| "loss": -0.0495, |
| "reward": 0.5833333469927311, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.5833333469927311, |
| "rewards/format_reward": 0.0, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 416.54168128967285, |
| "epoch": 1.96, |
| "grad_norm": 0.22600480914115906, |
| "learning_rate": 9.435755986953485e-07, |
| "loss": -0.0098, |
| "reward": 0.6666666846722364, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.6666666846722364, |
| "rewards/format_reward": 0.0, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.68750762939453, |
| "epoch": 1.9813333333333332, |
| "grad_norm": 0.1384221911430359, |
| "learning_rate": 9.084662173300225e-07, |
| "loss": 0.0305, |
| "reward": 0.6875000149011612, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.6875000149011612, |
| "rewards/format_reward": 0.0, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.54167556762695, |
| "epoch": 2.021333333333333, |
| "grad_norm": 0.18297049403190613, |
| "learning_rate": 8.737365115724057e-07, |
| "loss": 0.0126, |
| "reward": 0.7708333618938923, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.7708333618938923, |
| "rewards/format_reward": 0.0, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.9166793823242, |
| "epoch": 2.042666666666667, |
| "grad_norm": 0.1745564043521881, |
| "learning_rate": 8.394087726635485e-07, |
| "loss": 0.0131, |
| "reward": 0.7291666772216558, |
| "reward_std": 0.10825317353010178, |
| "rewards/accuracy_reward": 0.7291666772216558, |
| "rewards/format_reward": 0.0, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.5416793823242, |
| "epoch": 2.064, |
| "grad_norm": 0.148799866437912, |
| "learning_rate": 8.055050338423189e-07, |
| "loss": -0.004, |
| "reward": 0.5833333358168602, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.5833333358168602, |
| "rewards/format_reward": 0.0, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 507.0416736602783, |
| "epoch": 2.0853333333333333, |
| "grad_norm": 0.2118462324142456, |
| "learning_rate": 7.720470562033787e-07, |
| "loss": -0.0077, |
| "reward": 0.6666666865348816, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.6666666865348816, |
| "rewards/format_reward": 0.0, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 519.4166793823242, |
| "epoch": 2.1066666666666665, |
| "grad_norm": 0.39051565527915955, |
| "learning_rate": 7.390563147298395e-07, |
| "loss": 0.0254, |
| "reward": 0.4791666716337204, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.4791666716337204, |
| "rewards/format_reward": 0.0, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 559.4166793823242, |
| "epoch": 2.128, |
| "grad_norm": 0.785881519317627, |
| "learning_rate": 7.065539845095568e-07, |
| "loss": 0.0403, |
| "reward": 0.6666666828095913, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.6666666828095913, |
| "rewards/format_reward": 0.0, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 441.60417556762695, |
| "epoch": 2.1493333333333333, |
| "grad_norm": 0.40288758277893066, |
| "learning_rate": 6.74560927143913e-07, |
| "loss": 0.0043, |
| "reward": 0.7083333488553762, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.7083333488553762, |
| "rewards/format_reward": 0.0, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.6041793823242, |
| "epoch": 2.1706666666666665, |
| "grad_norm": 0.3716258704662323, |
| "learning_rate": 6.430976773578113e-07, |
| "loss": -0.0253, |
| "reward": 0.8125000149011612, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.8125000149011612, |
| "rewards/format_reward": 0.0, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.5416831970215, |
| "epoch": 2.192, |
| "grad_norm": 0.38535118103027344, |
| "learning_rate": 6.12184429819474e-07, |
| "loss": 0.0074, |
| "reward": 0.8125000074505806, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.8125000074505806, |
| "rewards/format_reward": 0.0, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.43750762939453, |
| "epoch": 2.2133333333333334, |
| "grad_norm": 0.6874639391899109, |
| "learning_rate": 5.818410261785057e-07, |
| "loss": 0.0067, |
| "reward": 0.7083333469927311, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7083333469927311, |
| "rewards/format_reward": 0.0, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.1458435058594, |
| "epoch": 2.2346666666666666, |
| "grad_norm": 0.8092372417449951, |
| "learning_rate": 5.520869423305442e-07, |
| "loss": 0.0204, |
| "reward": 0.5000000037252903, |
| "reward_std": 0.07216878235340118, |
| "rewards/accuracy_reward": 0.5000000037252903, |
| "rewards/format_reward": 0.0, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.89584732055664, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 0.8611705303192139, |
| "learning_rate": 5.22941275916667e-07, |
| "loss": 0.0223, |
| "reward": 0.7708333414047956, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.7708333414047956, |
| "rewards/format_reward": 0.0, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 524.5833511352539, |
| "epoch": 2.2773333333333334, |
| "grad_norm": 0.9545259475708008, |
| "learning_rate": 4.944227340655821e-07, |
| "loss": 0.0134, |
| "reward": 0.7083333469927311, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.7083333469927311, |
| "rewards/format_reward": 0.0, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.4791793823242, |
| "epoch": 2.2986666666666666, |
| "grad_norm": 1.5368934869766235, |
| "learning_rate": 4.6654962138647007e-07, |
| "loss": 0.0175, |
| "reward": 0.5833333395421505, |
| "reward_std": 0.32475952059030533, |
| "rewards/accuracy_reward": 0.5833333395421505, |
| "rewards/format_reward": 0.0, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 479.0625114440918, |
| "epoch": 2.32, |
| "grad_norm": 1.5656981468200684, |
| "learning_rate": 4.3933982822017883e-07, |
| "loss": -0.001, |
| "reward": 0.625000013038516, |
| "reward_std": 0.3608439117670059, |
| "rewards/accuracy_reward": 0.625000013038516, |
| "rewards/format_reward": 0.0, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.9791793823242, |
| "epoch": 2.3413333333333335, |
| "grad_norm": 1.6603009700775146, |
| "learning_rate": 4.1281081915632036e-07, |
| "loss": 0.0282, |
| "reward": 0.500000013038516, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.500000013038516, |
| "rewards/format_reward": 0.0, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.1041831970215, |
| "epoch": 2.3626666666666667, |
| "grad_norm": 1.1787548065185547, |
| "learning_rate": 3.869796218236342e-07, |
| "loss": 0.0138, |
| "reward": 0.5208333488553762, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.5208333488553762, |
| "rewards/format_reward": 0.0, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.3541793823242, |
| "epoch": 2.384, |
| "grad_norm": 1.3255914449691772, |
| "learning_rate": 3.618628159608137e-07, |
| "loss": 0.0306, |
| "reward": 0.7916666772216558, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7916666772216558, |
| "rewards/format_reward": 0.0, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 369.7083435058594, |
| "epoch": 2.405333333333333, |
| "grad_norm": 0.6214256286621094, |
| "learning_rate": 3.374765227748119e-07, |
| "loss": 0.0086, |
| "reward": 0.8541666716337204, |
| "reward_std": 0.03608439117670059, |
| "rewards/accuracy_reward": 0.8541666716337204, |
| "rewards/format_reward": 0.0, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 432.4166831970215, |
| "epoch": 2.4266666666666667, |
| "grad_norm": 2.1950066089630127, |
| "learning_rate": 3.1383639459345236e-07, |
| "loss": 0.0008, |
| "reward": 0.6875000111758709, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.6875000111758709, |
| "rewards/format_reward": 0.0, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 414.9375114440918, |
| "epoch": 2.448, |
| "grad_norm": 1.6744318008422852, |
| "learning_rate": 2.909576048189928e-07, |
| "loss": 0.0052, |
| "reward": 0.7916666865348816, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.7916666865348816, |
| "rewards/format_reward": 0.0, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 497.7083511352539, |
| "epoch": 2.469333333333333, |
| "grad_norm": 1.6432876586914062, |
| "learning_rate": 2.688548381890859e-07, |
| "loss": -0.0057, |
| "reward": 0.5416666809469461, |
| "reward_std": 0.32475952059030533, |
| "rewards/accuracy_reward": 0.5416666809469461, |
| "rewards/format_reward": 0.0, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 356.45834732055664, |
| "epoch": 2.490666666666667, |
| "grad_norm": 1.4244107007980347, |
| "learning_rate": 2.475422813513891e-07, |
| "loss": -0.019, |
| "reward": 0.854166679084301, |
| "reward_std": 0.10825317353010178, |
| "rewards/accuracy_reward": 0.854166679084301, |
| "rewards/format_reward": 0.0, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.85417556762695, |
| "epoch": 2.512, |
| "grad_norm": 0.5893205404281616, |
| "learning_rate": 2.2703361375787346e-07, |
| "loss": 0.0103, |
| "reward": 0.9791666716337204, |
| "reward_std": 0.03608439117670059, |
| "rewards/accuracy_reward": 0.9791666716337204, |
| "rewards/format_reward": 0.0, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.7083511352539, |
| "epoch": 2.533333333333333, |
| "grad_norm": 1.3833318948745728, |
| "learning_rate": 2.0734199888467554e-07, |
| "loss": -0.0016, |
| "reward": 0.7916666753590107, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.7916666753590107, |
| "rewards/format_reward": 0.0, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.9791870117188, |
| "epoch": 2.554666666666667, |
| "grad_norm": 2.314704656600952, |
| "learning_rate": 1.8848007578312686e-07, |
| "loss": 0.0038, |
| "reward": 0.5416666753590107, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.5416666753590107, |
| "rewards/format_reward": 0.0, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.9166946411133, |
| "epoch": 2.576, |
| "grad_norm": 1.5002299547195435, |
| "learning_rate": 1.7045995096738782e-07, |
| "loss": -0.0083, |
| "reward": 0.687500013038516, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.687500013038516, |
| "rewards/format_reward": 0.0, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 470.12500381469727, |
| "epoch": 2.5973333333333333, |
| "grad_norm": 1.2943968772888184, |
| "learning_rate": 1.5329319064388763e-07, |
| "loss": 0.0078, |
| "reward": 0.770833345130086, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.770833345130086, |
| "rewards/format_reward": 0.0, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 458.37501335144043, |
| "epoch": 2.618666666666667, |
| "grad_norm": 1.0128077268600464, |
| "learning_rate": 1.3699081328756263e-07, |
| "loss": 0.0287, |
| "reward": 0.7708333488553762, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.7708333488553762, |
| "rewards/format_reward": 0.0, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.0208511352539, |
| "epoch": 2.64, |
| "grad_norm": 1.820874571800232, |
| "learning_rate": 1.215632825696541e-07, |
| "loss": 0.0105, |
| "reward": 0.6875000149011612, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.6875000149011612, |
| "rewards/format_reward": 0.0, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.0625114440918, |
| "epoch": 2.6613333333333333, |
| "grad_norm": 2.199171781539917, |
| "learning_rate": 1.0702050064160684e-07, |
| "loss": 0.0138, |
| "reward": 0.6041666772216558, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6041666772216558, |
| "rewards/format_reward": 0.0, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.1458435058594, |
| "epoch": 2.6826666666666665, |
| "grad_norm": 1.681441307067871, |
| "learning_rate": 9.337180177937954e-08, |
| "loss": 0.0313, |
| "reward": 0.5416666753590107, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.5416666753590107, |
| "rewards/format_reward": 0.0, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 665.0208549499512, |
| "epoch": 2.7039999999999997, |
| "grad_norm": 1.198270559310913, |
| "learning_rate": 8.062594639224469e-08, |
| "loss": -0.0049, |
| "reward": 0.4375000111758709, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.4375000111758709, |
| "rewards/format_reward": 0.0, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 558.812520980835, |
| "epoch": 2.7253333333333334, |
| "grad_norm": 1.729133129119873, |
| "learning_rate": 6.879111539992677e-08, |
| "loss": 0.0174, |
| "reward": 0.5625000111758709, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.5625000111758709, |
| "rewards/format_reward": 0.0, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 479.9166831970215, |
| "epoch": 2.7466666666666666, |
| "grad_norm": 1.7538045644760132, |
| "learning_rate": 5.787490498168141e-08, |
| "loss": -0.0194, |
| "reward": 0.7291666865348816, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.7291666865348816, |
| "rewards/format_reward": 0.0, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 506.7083511352539, |
| "epoch": 2.768, |
| "grad_norm": 1.361107349395752, |
| "learning_rate": 4.788432170069373e-08, |
| "loss": 0.0047, |
| "reward": 0.6250000111758709, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.6250000111758709, |
| "rewards/format_reward": 0.0, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.4375076293945, |
| "epoch": 2.7893333333333334, |
| "grad_norm": 1.7667996883392334, |
| "learning_rate": 3.882577800691961e-08, |
| "loss": 0.0192, |
| "reward": 0.6250000149011612, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6250000149011612, |
| "rewards/format_reward": 0.0, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 532.6250114440918, |
| "epoch": 2.8106666666666666, |
| "grad_norm": 1.4511686563491821, |
| "learning_rate": 3.0705088121258276e-08, |
| "loss": 0.0061, |
| "reward": 0.6250000111758709, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6250000111758709, |
| "rewards/format_reward": 0.0, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 475.79168701171875, |
| "epoch": 2.832, |
| "grad_norm": 1.0118310451507568, |
| "learning_rate": 2.3527464303698676e-08, |
| "loss": 0.0164, |
| "reward": 0.729166679084301, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 0.729166679084301, |
| "rewards/format_reward": 0.0, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.2291793823242, |
| "epoch": 2.8533333333333335, |
| "grad_norm": 1.7040830850601196, |
| "learning_rate": 1.729751350783293e-08, |
| "loss": 0.0219, |
| "reward": 0.7291666902601719, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.7291666902601719, |
| "rewards/format_reward": 0.0, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 582.9166793823242, |
| "epoch": 2.8746666666666667, |
| "grad_norm": 1.8976448774337769, |
| "learning_rate": 1.2019234423885472e-08, |
| "loss": 0.0403, |
| "reward": 0.5000000111758709, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.5000000111758709, |
| "rewards/format_reward": 0.0, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 390.5416793823242, |
| "epoch": 2.896, |
| "grad_norm": 1.7262071371078491, |
| "learning_rate": 7.696014912157268e-09, |
| "loss": 0.0131, |
| "reward": 0.7500000223517418, |
| "reward_std": 0.28867512941360474, |
| "rewards/accuracy_reward": 0.7500000223517418, |
| "rewards/format_reward": 0.0, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.37501525878906, |
| "epoch": 2.9173333333333336, |
| "grad_norm": 2.062682867050171, |
| "learning_rate": 4.330629828528887e-09, |
| "loss": 0.0098, |
| "reward": 0.6875000186264515, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.6875000186264515, |
| "rewards/format_reward": 0.0, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.2083473205566, |
| "epoch": 2.9386666666666668, |
| "grad_norm": 2.000577926635742, |
| "learning_rate": 1.9252392434208623e-09, |
| "loss": 0.0269, |
| "reward": 0.6041666809469461, |
| "reward_std": 0.18042195588350296, |
| "rewards/accuracy_reward": 0.6041666809469461, |
| "rewards/format_reward": 0.0, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.1666793823242, |
| "epoch": 2.96, |
| "grad_norm": 1.6203181743621826, |
| "learning_rate": 4.81387055354221e-10, |
| "loss": 0.0127, |
| "reward": 0.7291666828095913, |
| "reward_std": 0.25259073823690414, |
| "rewards/accuracy_reward": 0.7291666828095913, |
| "rewards/format_reward": 0.0, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 521.4375095367432, |
| "epoch": 2.981333333333333, |
| "grad_norm": 2.0446486473083496, |
| "learning_rate": 0.0, |
| "loss": 0.0058, |
| "reward": 0.5833333414047956, |
| "reward_std": 0.3608439117670059, |
| "rewards/accuracy_reward": 0.5833333414047956, |
| "rewards/format_reward": 0.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.981333333333333, |
| "step": 138, |
| "total_flos": 0.0, |
| "train_loss": 0.00609967172773474, |
| "train_runtime": 8086.2821, |
| "train_samples_per_second": 0.278, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 138, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|