qwen2.5-1.5b-grpo-infer / trainer_state.json
yami2333's picture
Upload folder using huggingface_hub
3ce1370 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.981333333333333,
"eval_steps": 10,
"global_step": 138,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 479.2916717529297,
"epoch": 0.021333333333333333,
"grad_norm": 0.08470216393470764,
"learning_rate": 2.1428571428571428e-07,
"loss": 0.0578,
"reward": 0.1250000037252903,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 595.5416793823242,
"epoch": 0.042666666666666665,
"grad_norm": 0.09772875905036926,
"learning_rate": 4.2857142857142857e-07,
"loss": 0.0281,
"reward": 0.0833333358168602,
"reward_std": 0.10825317353010178,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.0,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 508.16667556762695,
"epoch": 0.064,
"grad_norm": 0.07093458622694016,
"learning_rate": 6.428571428571428e-07,
"loss": -0.0183,
"reward": 0.0833333358168602,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.0,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 514.1875190734863,
"epoch": 0.08533333333333333,
"grad_norm": 0.04470205307006836,
"learning_rate": 8.571428571428571e-07,
"loss": -0.0086,
"reward": 0.0416666679084301,
"reward_std": 0.07216878235340118,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.0,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 576.4791793823242,
"epoch": 0.10666666666666667,
"grad_norm": 0.06627917289733887,
"learning_rate": 1.0714285714285716e-06,
"loss": 0.0005,
"reward": 0.1250000037252903,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 497.37501525878906,
"epoch": 0.128,
"grad_norm": 0.03331312909722328,
"learning_rate": 1.2857142857142856e-06,
"loss": -0.0039,
"reward": 0.0416666679084301,
"reward_std": 0.07216878235340118,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.0,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 404.66667556762695,
"epoch": 0.14933333333333335,
"grad_norm": 0.0803137719631195,
"learning_rate": 1.5e-06,
"loss": -0.0083,
"reward": 0.1250000037252903,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.0,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 575.4583511352539,
"epoch": 0.17066666666666666,
"grad_norm": 0.046666789799928665,
"learning_rate": 1.7142857142857143e-06,
"loss": -0.0055,
"reward": 0.06250000186264515,
"reward_std": 0.07216878235340118,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/format_reward": 0.0,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 535.0000076293945,
"epoch": 0.192,
"grad_norm": 0.06731698662042618,
"learning_rate": 1.928571428571429e-06,
"loss": 0.001,
"reward": 0.12500000186264515,
"reward_std": 0.10825317353010178,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.0,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 510.6666793823242,
"epoch": 0.21333333333333335,
"grad_norm": 0.1065572202205658,
"learning_rate": 2.142857142857143e-06,
"loss": -0.0209,
"reward": 0.1458333358168602,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 516.1875228881836,
"epoch": 0.23466666666666666,
"grad_norm": 0.06277307868003845,
"learning_rate": 2.357142857142857e-06,
"loss": -0.0154,
"reward": 0.18750000186264515,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.02083333395421505,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 479.93751525878906,
"epoch": 0.256,
"grad_norm": 0.06429535895586014,
"learning_rate": 2.571428571428571e-06,
"loss": -0.0103,
"reward": 0.14583333767950535,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/format_reward": 0.0,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 532.8125190734863,
"epoch": 0.2773333333333333,
"grad_norm": 0.05405157431960106,
"learning_rate": 2.785714285714286e-06,
"loss": 0.0394,
"reward": 0.10416666977107525,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/format_reward": 0.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 448.45834732055664,
"epoch": 0.2986666666666667,
"grad_norm": 0.08196503669023514,
"learning_rate": 3e-06,
"loss": -0.0514,
"reward": 0.2083333395421505,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/format_reward": 0.02083333395421505,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 562.4791889190674,
"epoch": 0.32,
"grad_norm": 0.07421501725912094,
"learning_rate": 2.999518612944646e-06,
"loss": -0.049,
"reward": 0.1666666716337204,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 489.729190826416,
"epoch": 0.3413333333333333,
"grad_norm": 0.07382796704769135,
"learning_rate": 2.9980747607565792e-06,
"loss": 0.0417,
"reward": 0.2083333358168602,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/format_reward": 0.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 459.58334159851074,
"epoch": 0.3626666666666667,
"grad_norm": 0.09509492665529251,
"learning_rate": 2.995669370171471e-06,
"loss": 0.0036,
"reward": 0.10416666977107525,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/format_reward": 0.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 507.5000171661377,
"epoch": 0.384,
"grad_norm": 0.07961481809616089,
"learning_rate": 2.9923039850878425e-06,
"loss": -0.032,
"reward": 0.29166667349636555,
"reward_std": 0.32475952059030533,
"rewards/accuracy_reward": 0.27083333767950535,
"rewards/format_reward": 0.02083333395421505,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 510.0625190734863,
"epoch": 0.4053333333333333,
"grad_norm": 0.08553481101989746,
"learning_rate": 2.9879807655761146e-06,
"loss": 0.0163,
"reward": 0.20833333767950535,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/format_reward": 0.0,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 390.06251335144043,
"epoch": 0.4266666666666667,
"grad_norm": 0.0976102352142334,
"learning_rate": 2.982702486492167e-06,
"loss": -0.0533,
"reward": 0.35416667722165585,
"reward_std": 0.32475952059030533,
"rewards/accuracy_reward": 0.35416667722165585,
"rewards/format_reward": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 523.666675567627,
"epoch": 0.448,
"grad_norm": 0.06446705013513565,
"learning_rate": 2.9764725356963015e-06,
"loss": -0.0544,
"reward": 0.1875000037252903,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/format_reward": 0.0,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 358.5833435058594,
"epoch": 0.4693333333333333,
"grad_norm": 0.10436850041151047,
"learning_rate": 2.969294911878742e-06,
"loss": -0.049,
"reward": 0.5208333469927311,
"reward_std": 0.3608439117670059,
"rewards/accuracy_reward": 0.5000000111758709,
"rewards/format_reward": 0.02083333395421505,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 430.6041793823242,
"epoch": 0.49066666666666664,
"grad_norm": 0.06835100054740906,
"learning_rate": 2.9611742219930806e-06,
"loss": -0.0144,
"reward": 0.3125000037252903,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.3125000037252903,
"rewards/format_reward": 0.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 427.1041831970215,
"epoch": 0.512,
"grad_norm": 0.08951476961374283,
"learning_rate": 2.9521156782993067e-06,
"loss": -0.0214,
"reward": 0.416666679084301,
"reward_std": 0.3608439117670059,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 500.52084732055664,
"epoch": 0.5333333333333333,
"grad_norm": 0.04994317516684532,
"learning_rate": 2.942125095018319e-06,
"loss": 0.0059,
"reward": 0.5000000111758709,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.5000000111758709,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 447.3125114440918,
"epoch": 0.5546666666666666,
"grad_norm": 0.08734725415706635,
"learning_rate": 2.9312088846000733e-06,
"loss": -0.0767,
"reward": 0.5000000055879354,
"reward_std": 0.3608439117670059,
"rewards/accuracy_reward": 0.5000000055879354,
"rewards/format_reward": 0.0,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 488.0833435058594,
"epoch": 0.576,
"grad_norm": 0.052988629788160324,
"learning_rate": 2.9193740536077556e-06,
"loss": -0.017,
"reward": 0.645833345130086,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6250000093132257,
"rewards/format_reward": 0.02083333395421505,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 502.14584732055664,
"epoch": 0.5973333333333334,
"grad_norm": 0.05281541496515274,
"learning_rate": 2.906628198220621e-06,
"loss": -0.0444,
"reward": 0.479166679084301,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.479166679084301,
"rewards/format_reward": 0.0,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 409.04167556762695,
"epoch": 0.6186666666666667,
"grad_norm": 0.06433824449777603,
"learning_rate": 2.8929794993583936e-06,
"loss": 0.0073,
"reward": 0.6250000186264515,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.6250000186264515,
"rewards/format_reward": 0.0,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 564.7083473205566,
"epoch": 0.64,
"grad_norm": 0.05183090269565582,
"learning_rate": 2.878436717430346e-06,
"loss": -0.0141,
"reward": 0.5625000074505806,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.5625000074505806,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 530.3750190734863,
"epoch": 0.6613333333333333,
"grad_norm": 0.08775021880865097,
"learning_rate": 2.8630091867124373e-06,
"loss": -0.0283,
"reward": 0.5000000074505806,
"reward_std": 0.5051814764738083,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 477.12502670288086,
"epoch": 0.6826666666666666,
"grad_norm": 0.044543083757162094,
"learning_rate": 2.846706809356113e-06,
"loss": 0.0011,
"reward": 0.8333333507180214,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.8333333507180214,
"rewards/format_reward": 0.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 449.5000114440918,
"epoch": 0.704,
"grad_norm": 0.04804208129644394,
"learning_rate": 2.8295400490326126e-06,
"loss": 0.0198,
"reward": 0.7916666828095913,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7916666828095913,
"rewards/format_reward": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 408.1041736602783,
"epoch": 0.7253333333333334,
"grad_norm": 0.05096372961997986,
"learning_rate": 2.811519924216873e-06,
"loss": 0.0343,
"reward": 0.6458333469927311,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6458333469927311,
"rewards/format_reward": 0.0,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 429.2916717529297,
"epoch": 0.7466666666666667,
"grad_norm": 0.057229503989219666,
"learning_rate": 2.7926580011153244e-06,
"loss": 0.0138,
"reward": 0.7500000223517418,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.7500000223517418,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 478.27084732055664,
"epoch": 0.768,
"grad_norm": 0.03784911707043648,
"learning_rate": 2.7729663862421267e-06,
"loss": 0.0228,
"reward": 0.6458333432674408,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.6458333432674408,
"rewards/format_reward": 0.0,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 435.93750762939453,
"epoch": 0.7893333333333333,
"grad_norm": 0.03586390241980553,
"learning_rate": 2.7524577186486113e-06,
"loss": 0.0256,
"reward": 0.8750000074505806,
"reward_std": 0.10825317353010178,
"rewards/accuracy_reward": 0.8750000074505806,
"rewards/format_reward": 0.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 541.3958511352539,
"epoch": 0.8106666666666666,
"grad_norm": 0.042199280112981796,
"learning_rate": 2.731145161810915e-06,
"loss": 0.0552,
"reward": 0.645833345130086,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.645833345130086,
"rewards/format_reward": 0.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 498.04167556762695,
"epoch": 0.832,
"grad_norm": 0.05263036862015724,
"learning_rate": 2.709042395181008e-06,
"loss": 0.0344,
"reward": 0.7916666865348816,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.7916666865348816,
"rewards/format_reward": 0.0,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 447.14584732055664,
"epoch": 0.8533333333333334,
"grad_norm": 0.06595506519079208,
"learning_rate": 2.6861636054065477e-06,
"loss": 0.0311,
"reward": 0.7291666828095913,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7291666828095913,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 425.18751525878906,
"epoch": 0.8746666666666667,
"grad_norm": 0.03698687627911568,
"learning_rate": 2.6625234772251882e-06,
"loss": -0.0078,
"reward": 0.6875000149011612,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.6875000149011612,
"rewards/format_reward": 0.0,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 495.3125228881836,
"epoch": 0.896,
"grad_norm": 0.053743649274110794,
"learning_rate": 2.6381371840391863e-06,
"loss": 0.0458,
"reward": 0.7083333469927311,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.7083333469927311,
"rewards/format_reward": 0.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 533.1875152587891,
"epoch": 0.9173333333333333,
"grad_norm": 0.05754838138818741,
"learning_rate": 2.6130203781763665e-06,
"loss": 0.0311,
"reward": 0.6041666809469461,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6041666809469461,
"rewards/format_reward": 0.0,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 481.1041793823242,
"epoch": 0.9386666666666666,
"grad_norm": 0.038248177617788315,
"learning_rate": 2.58718918084368e-06,
"loss": 0.0152,
"reward": 0.6875000149011612,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6875000149011612,
"rewards/format_reward": 0.0,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 543.2291717529297,
"epoch": 0.96,
"grad_norm": 0.043112609535455704,
"learning_rate": 2.5606601717798212e-06,
"loss": 0.0341,
"reward": 0.6250000223517418,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.6250000223517418,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 502.66668701171875,
"epoch": 0.9813333333333333,
"grad_norm": 0.05423992499709129,
"learning_rate": 2.53345037861353e-06,
"loss": 0.0196,
"reward": 0.6250000204890966,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.6250000204890966,
"rewards/format_reward": 0.0,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 464.9166831970215,
"epoch": 1.0213333333333334,
"grad_norm": 0.052206046879291534,
"learning_rate": 2.5055772659344177e-06,
"loss": 0.0226,
"reward": 0.7291666902601719,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7291666902601719,
"rewards/format_reward": 0.0,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 525.8958473205566,
"epoch": 1.0426666666666666,
"grad_norm": 0.06632654368877411,
"learning_rate": 2.477058724083334e-06,
"loss": 0.0391,
"reward": 0.7708333544433117,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.7708333544433117,
"rewards/format_reward": 0.0,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 529.2916870117188,
"epoch": 1.064,
"grad_norm": 0.06171787902712822,
"learning_rate": 2.447913057669456e-06,
"loss": 0.0185,
"reward": 0.6875000186264515,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.6875000186264515,
"rewards/format_reward": 0.0,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 425.08334159851074,
"epoch": 1.0853333333333333,
"grad_norm": 0.04282655939459801,
"learning_rate": 2.4181589738214946e-06,
"loss": 0.0057,
"reward": 0.7708333507180214,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7708333507180214,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 515.3541870117188,
"epoch": 1.1066666666666667,
"grad_norm": 0.05828641727566719,
"learning_rate": 2.3878155701805258e-06,
"loss": -0.0319,
"reward": 0.6250000223517418,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.6250000223517418,
"rewards/format_reward": 0.0,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 401.5833435058594,
"epoch": 1.1280000000000001,
"grad_norm": 0.0363980270922184,
"learning_rate": 2.3569023226421886e-06,
"loss": -0.0209,
"reward": 0.8750000074505806,
"reward_std": 0.07216878235340118,
"rewards/accuracy_reward": 0.8750000074505806,
"rewards/format_reward": 0.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 445.5208435058594,
"epoch": 1.1493333333333333,
"grad_norm": 0.07585529237985611,
"learning_rate": 2.325439072856087e-06,
"loss": 0.0295,
"reward": 0.7291666865348816,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.7291666865348816,
"rewards/format_reward": 0.0,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 457.5833549499512,
"epoch": 1.1706666666666667,
"grad_norm": 0.048082854598760605,
"learning_rate": 2.2934460154904436e-06,
"loss": 0.0327,
"reward": 0.7500000186264515,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7500000186264515,
"rewards/format_reward": 0.0,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 492.0000190734863,
"epoch": 1.192,
"grad_norm": 0.05505705624818802,
"learning_rate": 2.2609436852701614e-06,
"loss": -0.0311,
"reward": 0.6250000167638063,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.6250000167638063,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 454.6250114440918,
"epoch": 1.2133333333333334,
"grad_norm": 0.04047110304236412,
"learning_rate": 2.227952943796622e-06,
"loss": -0.0065,
"reward": 0.7708333395421505,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7708333395421505,
"rewards/format_reward": 0.0,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 505.6250114440918,
"epoch": 1.2346666666666666,
"grad_norm": 0.0817456841468811,
"learning_rate": 2.194494966157681e-06,
"loss": 0.0409,
"reward": 0.7083333414047956,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7083333414047956,
"rewards/format_reward": 0.0,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 555.2708473205566,
"epoch": 1.256,
"grad_norm": 0.06136218458414078,
"learning_rate": 2.160591227336452e-06,
"loss": -0.0545,
"reward": 0.666666679084301,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.666666679084301,
"rewards/format_reward": 0.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 497.60418128967285,
"epoch": 1.2773333333333334,
"grad_norm": 0.03282522037625313,
"learning_rate": 2.126263488427595e-06,
"loss": 0.0007,
"reward": 0.7500000111758709,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7500000111758709,
"rewards/format_reward": 0.0,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 435.3750114440918,
"epoch": 1.2986666666666666,
"grad_norm": 0.08471012115478516,
"learning_rate": 2.091533782669978e-06,
"loss": -0.0221,
"reward": 0.7916666846722364,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7916666846722364,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 560.4583549499512,
"epoch": 1.32,
"grad_norm": 0.050180453807115555,
"learning_rate": 2.0564244013046517e-06,
"loss": 0.0034,
"reward": 0.6250000037252903,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.6250000037252903,
"rewards/format_reward": 0.0,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 489.43751525878906,
"epoch": 1.3413333333333333,
"grad_norm": 0.07682310044765472,
"learning_rate": 2.0209578792672304e-06,
"loss": -0.0011,
"reward": 0.6666666865348816,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.6666666865348816,
"rewards/format_reward": 0.0,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 371.00000762939453,
"epoch": 1.3626666666666667,
"grad_norm": 0.045041777193546295,
"learning_rate": 1.9851569807238573e-06,
"loss": -0.0184,
"reward": 0.6458333414047956,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.6458333414047956,
"rewards/format_reward": 0.0,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 456.08334732055664,
"epoch": 1.384,
"grad_norm": 0.04616188630461693,
"learning_rate": 1.9490446844600373e-06,
"loss": 0.0165,
"reward": 0.5416666772216558,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.5416666772216558,
"rewards/format_reward": 0.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 429.7083396911621,
"epoch": 1.4053333333333333,
"grad_norm": 0.06254471838474274,
"learning_rate": 1.912644169131717e-06,
"loss": 0.0056,
"reward": 0.7500000149011612,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7500000149011612,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 440.5833549499512,
"epoch": 1.4266666666666667,
"grad_norm": 0.058094874024391174,
"learning_rate": 1.875978798388081e-06,
"loss": 0.0496,
"reward": 0.8333333432674408,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.8333333432674408,
"rewards/format_reward": 0.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 417.60417556762695,
"epoch": 1.448,
"grad_norm": 0.032374057918787,
"learning_rate": 1.8390721058756023e-06,
"loss": 0.0034,
"reward": 0.7500000074505806,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7500000074505806,
"rewards/format_reward": 0.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 492.6250190734863,
"epoch": 1.4693333333333334,
"grad_norm": 0.057136889547109604,
"learning_rate": 1.8019477801329903e-06,
"loss": 0.0007,
"reward": 0.7500000223517418,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7500000223517418,
"rewards/format_reward": 0.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 469.52084732055664,
"epoch": 1.4906666666666666,
"grad_norm": 0.06064446642994881,
"learning_rate": 1.764629649386713e-06,
"loss": 0.0015,
"reward": 0.6875000260770321,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.6875000260770321,
"rewards/format_reward": 0.0,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 512.8125190734863,
"epoch": 1.512,
"grad_norm": 0.08758383989334106,
"learning_rate": 1.7271416662568652e-06,
"loss": 0.0576,
"reward": 0.6666666865348816,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.6666666865348816,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 439.18750953674316,
"epoch": 1.5333333333333332,
"grad_norm": 0.06427132338285446,
"learning_rate": 1.6895078923831942e-06,
"loss": 0.0174,
"reward": 0.770833358168602,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.770833358168602,
"rewards/format_reward": 0.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 487.6041831970215,
"epoch": 1.5546666666666666,
"grad_norm": 0.05160915106534958,
"learning_rate": 1.6517524829811483e-06,
"loss": -0.0068,
"reward": 0.7083333544433117,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7083333544433117,
"rewards/format_reward": 0.0,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 436.27084732055664,
"epoch": 1.576,
"grad_norm": 0.05624426528811455,
"learning_rate": 1.6138996713378693e-06,
"loss": 0.0278,
"reward": 0.791666679084301,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.791666679084301,
"rewards/format_reward": 0.0,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 467.6250114440918,
"epoch": 1.5973333333333333,
"grad_norm": 0.07060783356428146,
"learning_rate": 1.5759737532580691e-06,
"loss": 0.0241,
"reward": 0.8541666865348816,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.8541666865348816,
"rewards/format_reward": 0.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 492.8958549499512,
"epoch": 1.6186666666666667,
"grad_norm": 0.07051456719636917,
"learning_rate": 1.5379990714697819e-06,
"loss": 0.002,
"reward": 0.6666666828095913,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6666666828095913,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 474.81251335144043,
"epoch": 1.6400000000000001,
"grad_norm": 0.07857351005077362,
"learning_rate": 1.5e-06,
"loss": -0.0008,
"reward": 0.645833345130086,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.645833345130086,
"rewards/format_reward": 0.0,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 620.7708549499512,
"epoch": 1.6613333333333333,
"grad_norm": 0.03268231451511383,
"learning_rate": 1.4620009285302184e-06,
"loss": -0.0008,
"reward": 0.4583333358168602,
"reward_std": 0.10825317353010178,
"rewards/accuracy_reward": 0.4583333358168602,
"rewards/format_reward": 0.0,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 490.1250190734863,
"epoch": 1.6826666666666665,
"grad_norm": 0.06123334541916847,
"learning_rate": 1.4240262467419312e-06,
"loss": 0.0209,
"reward": 0.6666666734963655,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6666666734963655,
"rewards/format_reward": 0.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 414.6250114440918,
"epoch": 1.704,
"grad_norm": 0.051681999117136,
"learning_rate": 1.386100328662131e-06,
"loss": 0.0246,
"reward": 0.8750000149011612,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.8750000149011612,
"rewards/format_reward": 0.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 519.4583473205566,
"epoch": 1.7253333333333334,
"grad_norm": 0.053687114268541336,
"learning_rate": 1.348247517018852e-06,
"loss": 0.0154,
"reward": 0.7500000111758709,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7500000111758709,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 476.6875114440918,
"epoch": 1.7466666666666666,
"grad_norm": 0.05704135075211525,
"learning_rate": 1.3104921076168067e-06,
"loss": 0.0145,
"reward": 0.7708333432674408,
"reward_std": 0.10825317353010178,
"rewards/accuracy_reward": 0.7708333432674408,
"rewards/format_reward": 0.0,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 520.9791831970215,
"epoch": 1.768,
"grad_norm": 0.09103868156671524,
"learning_rate": 1.2728583337431355e-06,
"loss": 0.0513,
"reward": 0.687500013038516,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.687500013038516,
"rewards/format_reward": 0.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 512.7500114440918,
"epoch": 1.7893333333333334,
"grad_norm": 0.12537769973278046,
"learning_rate": 1.2353703506132877e-06,
"loss": 0.0442,
"reward": 0.6666666865348816,
"reward_std": 0.3608439117670059,
"rewards/accuracy_reward": 0.6666666865348816,
"rewards/format_reward": 0.0,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 424.41667556762695,
"epoch": 1.8106666666666666,
"grad_norm": 0.067196786403656,
"learning_rate": 1.1980522198670096e-06,
"loss": -0.018,
"reward": 0.7708333507180214,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7708333507180214,
"rewards/format_reward": 0.0,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 470.0000114440918,
"epoch": 1.8319999999999999,
"grad_norm": 0.05638742074370384,
"learning_rate": 1.1609278941243977e-06,
"loss": -0.0033,
"reward": 0.6041666734963655,
"reward_std": 0.10825317353010178,
"rewards/accuracy_reward": 0.6041666734963655,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 412.75000762939453,
"epoch": 1.8533333333333335,
"grad_norm": 0.07567507773637772,
"learning_rate": 1.1240212016119191e-06,
"loss": 0.0322,
"reward": 0.791666679084301,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.791666679084301,
"rewards/format_reward": 0.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 515.2500114440918,
"epoch": 1.8746666666666667,
"grad_norm": 0.10538745671510696,
"learning_rate": 1.087355830868283e-06,
"loss": 0.022,
"reward": 0.520833345130086,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.520833345130086,
"rewards/format_reward": 0.0,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 472.25001525878906,
"epoch": 1.896,
"grad_norm": 0.08544166386127472,
"learning_rate": 1.050955315539963e-06,
"loss": -0.0006,
"reward": 0.8333333395421505,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.8333333395421505,
"rewards/format_reward": 0.0,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 436.6250057220459,
"epoch": 1.9173333333333333,
"grad_norm": 0.2055334597826004,
"learning_rate": 1.0148430192761428e-06,
"loss": 0.0073,
"reward": 0.7291666902601719,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.7291666902601719,
"rewards/format_reward": 0.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 531.9791831970215,
"epoch": 1.9386666666666668,
"grad_norm": 0.274494469165802,
"learning_rate": 9.790421207327699e-07,
"loss": -0.0495,
"reward": 0.5833333469927311,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.5833333469927311,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 416.54168128967285,
"epoch": 1.96,
"grad_norm": 0.22600480914115906,
"learning_rate": 9.435755986953485e-07,
"loss": -0.0098,
"reward": 0.6666666846722364,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.6666666846722364,
"rewards/format_reward": 0.0,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 455.68750762939453,
"epoch": 1.9813333333333332,
"grad_norm": 0.1384221911430359,
"learning_rate": 9.084662173300225e-07,
"loss": 0.0305,
"reward": 0.6875000149011612,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.6875000149011612,
"rewards/format_reward": 0.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 435.54167556762695,
"epoch": 2.021333333333333,
"grad_norm": 0.18297049403190613,
"learning_rate": 8.737365115724057e-07,
"loss": 0.0126,
"reward": 0.7708333618938923,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.7708333618938923,
"rewards/format_reward": 0.0,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 446.9166793823242,
"epoch": 2.042666666666667,
"grad_norm": 0.1745564043521881,
"learning_rate": 8.394087726635485e-07,
"loss": 0.0131,
"reward": 0.7291666772216558,
"reward_std": 0.10825317353010178,
"rewards/accuracy_reward": 0.7291666772216558,
"rewards/format_reward": 0.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 464.5416793823242,
"epoch": 2.064,
"grad_norm": 0.148799866437912,
"learning_rate": 8.055050338423189e-07,
"loss": -0.004,
"reward": 0.5833333358168602,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.5833333358168602,
"rewards/format_reward": 0.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 507.0416736602783,
"epoch": 2.0853333333333333,
"grad_norm": 0.2118462324142456,
"learning_rate": 7.720470562033787e-07,
"loss": -0.0077,
"reward": 0.6666666865348816,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.6666666865348816,
"rewards/format_reward": 0.0,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 519.4166793823242,
"epoch": 2.1066666666666665,
"grad_norm": 0.39051565527915955,
"learning_rate": 7.390563147298395e-07,
"loss": 0.0254,
"reward": 0.4791666716337204,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.4791666716337204,
"rewards/format_reward": 0.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 559.4166793823242,
"epoch": 2.128,
"grad_norm": 0.785881519317627,
"learning_rate": 7.065539845095568e-07,
"loss": 0.0403,
"reward": 0.6666666828095913,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.6666666828095913,
"rewards/format_reward": 0.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 441.60417556762695,
"epoch": 2.1493333333333333,
"grad_norm": 0.40288758277893066,
"learning_rate": 6.74560927143913e-07,
"loss": 0.0043,
"reward": 0.7083333488553762,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.7083333488553762,
"rewards/format_reward": 0.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 396.6041793823242,
"epoch": 2.1706666666666665,
"grad_norm": 0.3716258704662323,
"learning_rate": 6.430976773578113e-07,
"loss": -0.0253,
"reward": 0.8125000149011612,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.8125000149011612,
"rewards/format_reward": 0.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 467.5416831970215,
"epoch": 2.192,
"grad_norm": 0.38535118103027344,
"learning_rate": 6.12184429819474e-07,
"loss": 0.0074,
"reward": 0.8125000074505806,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.8125000074505806,
"rewards/format_reward": 0.0,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 428.43750762939453,
"epoch": 2.2133333333333334,
"grad_norm": 0.6874639391899109,
"learning_rate": 5.818410261785057e-07,
"loss": 0.0067,
"reward": 0.7083333469927311,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7083333469927311,
"rewards/format_reward": 0.0,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 446.1458435058594,
"epoch": 2.2346666666666666,
"grad_norm": 0.8092372417449951,
"learning_rate": 5.520869423305442e-07,
"loss": 0.0204,
"reward": 0.5000000037252903,
"reward_std": 0.07216878235340118,
"rewards/accuracy_reward": 0.5000000037252903,
"rewards/format_reward": 0.0,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 452.89584732055664,
"epoch": 2.2560000000000002,
"grad_norm": 0.8611705303192139,
"learning_rate": 5.22941275916667e-07,
"loss": 0.0223,
"reward": 0.7708333414047956,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7708333414047956,
"rewards/format_reward": 0.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 524.5833511352539,
"epoch": 2.2773333333333334,
"grad_norm": 0.9545259475708008,
"learning_rate": 4.944227340655821e-07,
"loss": 0.0134,
"reward": 0.7083333469927311,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7083333469927311,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 575.4791793823242,
"epoch": 2.2986666666666666,
"grad_norm": 1.5368934869766235,
"learning_rate": 4.6654962138647007e-07,
"loss": 0.0175,
"reward": 0.5833333395421505,
"reward_std": 0.32475952059030533,
"rewards/accuracy_reward": 0.5833333395421505,
"rewards/format_reward": 0.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 479.0625114440918,
"epoch": 2.32,
"grad_norm": 1.5656981468200684,
"learning_rate": 4.3933982822017883e-07,
"loss": -0.001,
"reward": 0.625000013038516,
"reward_std": 0.3608439117670059,
"rewards/accuracy_reward": 0.625000013038516,
"rewards/format_reward": 0.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 512.9791793823242,
"epoch": 2.3413333333333335,
"grad_norm": 1.6603009700775146,
"learning_rate": 4.1281081915632036e-07,
"loss": 0.0282,
"reward": 0.500000013038516,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.500000013038516,
"rewards/format_reward": 0.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 518.1041831970215,
"epoch": 2.3626666666666667,
"grad_norm": 1.1787548065185547,
"learning_rate": 3.869796218236342e-07,
"loss": 0.0138,
"reward": 0.5208333488553762,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.5208333488553762,
"rewards/format_reward": 0.0,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 421.3541793823242,
"epoch": 2.384,
"grad_norm": 1.3255914449691772,
"learning_rate": 3.618628159608137e-07,
"loss": 0.0306,
"reward": 0.7916666772216558,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7916666772216558,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 369.7083435058594,
"epoch": 2.405333333333333,
"grad_norm": 0.6214256286621094,
"learning_rate": 3.374765227748119e-07,
"loss": 0.0086,
"reward": 0.8541666716337204,
"reward_std": 0.03608439117670059,
"rewards/accuracy_reward": 0.8541666716337204,
"rewards/format_reward": 0.0,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 432.4166831970215,
"epoch": 2.4266666666666667,
"grad_norm": 2.1950066089630127,
"learning_rate": 3.1383639459345236e-07,
"loss": 0.0008,
"reward": 0.6875000111758709,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.6875000111758709,
"rewards/format_reward": 0.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 414.9375114440918,
"epoch": 2.448,
"grad_norm": 1.6744318008422852,
"learning_rate": 2.909576048189928e-07,
"loss": 0.0052,
"reward": 0.7916666865348816,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7916666865348816,
"rewards/format_reward": 0.0,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 497.7083511352539,
"epoch": 2.469333333333333,
"grad_norm": 1.6432876586914062,
"learning_rate": 2.688548381890859e-07,
"loss": -0.0057,
"reward": 0.5416666809469461,
"reward_std": 0.32475952059030533,
"rewards/accuracy_reward": 0.5416666809469461,
"rewards/format_reward": 0.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 356.45834732055664,
"epoch": 2.490666666666667,
"grad_norm": 1.4244107007980347,
"learning_rate": 2.475422813513891e-07,
"loss": -0.019,
"reward": 0.854166679084301,
"reward_std": 0.10825317353010178,
"rewards/accuracy_reward": 0.854166679084301,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 393.85417556762695,
"epoch": 2.512,
"grad_norm": 0.5893205404281616,
"learning_rate": 2.2703361375787346e-07,
"loss": 0.0103,
"reward": 0.9791666716337204,
"reward_std": 0.03608439117670059,
"rewards/accuracy_reward": 0.9791666716337204,
"rewards/format_reward": 0.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 474.7083511352539,
"epoch": 2.533333333333333,
"grad_norm": 1.3833318948745728,
"learning_rate": 2.0734199888467554e-07,
"loss": -0.0016,
"reward": 0.7916666753590107,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7916666753590107,
"rewards/format_reward": 0.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 526.9791870117188,
"epoch": 2.554666666666667,
"grad_norm": 2.314704656600952,
"learning_rate": 1.8848007578312686e-07,
"loss": 0.0038,
"reward": 0.5416666753590107,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.5416666753590107,
"rewards/format_reward": 0.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 500.9166946411133,
"epoch": 2.576,
"grad_norm": 1.5002299547195435,
"learning_rate": 1.7045995096738782e-07,
"loss": -0.0083,
"reward": 0.687500013038516,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.687500013038516,
"rewards/format_reward": 0.0,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 470.12500381469727,
"epoch": 2.5973333333333333,
"grad_norm": 1.2943968772888184,
"learning_rate": 1.5329319064388763e-07,
"loss": 0.0078,
"reward": 0.770833345130086,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.770833345130086,
"rewards/format_reward": 0.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 458.37501335144043,
"epoch": 2.618666666666667,
"grad_norm": 1.0128077268600464,
"learning_rate": 1.3699081328756263e-07,
"loss": 0.0287,
"reward": 0.7708333488553762,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7708333488553762,
"rewards/format_reward": 0.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 520.0208511352539,
"epoch": 2.64,
"grad_norm": 1.820874571800232,
"learning_rate": 1.215632825696541e-07,
"loss": 0.0105,
"reward": 0.6875000149011612,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.6875000149011612,
"rewards/format_reward": 0.0,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 465.0625114440918,
"epoch": 2.6613333333333333,
"grad_norm": 2.199171781539917,
"learning_rate": 1.0702050064160684e-07,
"loss": 0.0138,
"reward": 0.6041666772216558,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6041666772216558,
"rewards/format_reward": 0.0,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 500.1458435058594,
"epoch": 2.6826666666666665,
"grad_norm": 1.681441307067871,
"learning_rate": 9.337180177937954e-08,
"loss": 0.0313,
"reward": 0.5416666753590107,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.5416666753590107,
"rewards/format_reward": 0.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 665.0208549499512,
"epoch": 2.7039999999999997,
"grad_norm": 1.198270559310913,
"learning_rate": 8.062594639224469e-08,
"loss": -0.0049,
"reward": 0.4375000111758709,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.4375000111758709,
"rewards/format_reward": 0.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 558.812520980835,
"epoch": 2.7253333333333334,
"grad_norm": 1.729133129119873,
"learning_rate": 6.879111539992677e-08,
"loss": 0.0174,
"reward": 0.5625000111758709,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.5625000111758709,
"rewards/format_reward": 0.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 479.9166831970215,
"epoch": 2.7466666666666666,
"grad_norm": 1.7538045644760132,
"learning_rate": 5.787490498168141e-08,
"loss": -0.0194,
"reward": 0.7291666865348816,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7291666865348816,
"rewards/format_reward": 0.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 506.7083511352539,
"epoch": 2.768,
"grad_norm": 1.361107349395752,
"learning_rate": 4.788432170069373e-08,
"loss": 0.0047,
"reward": 0.6250000111758709,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.6250000111758709,
"rewards/format_reward": 0.0,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 517.4375076293945,
"epoch": 2.7893333333333334,
"grad_norm": 1.7667996883392334,
"learning_rate": 3.882577800691961e-08,
"loss": 0.0192,
"reward": 0.6250000149011612,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6250000149011612,
"rewards/format_reward": 0.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 532.6250114440918,
"epoch": 2.8106666666666666,
"grad_norm": 1.4511686563491821,
"learning_rate": 3.0705088121258276e-08,
"loss": 0.0061,
"reward": 0.6250000111758709,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6250000111758709,
"rewards/format_reward": 0.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 475.79168701171875,
"epoch": 2.832,
"grad_norm": 1.0118310451507568,
"learning_rate": 2.3527464303698676e-08,
"loss": 0.0164,
"reward": 0.729166679084301,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.729166679084301,
"rewards/format_reward": 0.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 477.2291793823242,
"epoch": 2.8533333333333335,
"grad_norm": 1.7040830850601196,
"learning_rate": 1.729751350783293e-08,
"loss": 0.0219,
"reward": 0.7291666902601719,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.7291666902601719,
"rewards/format_reward": 0.0,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 582.9166793823242,
"epoch": 2.8746666666666667,
"grad_norm": 1.8976448774337769,
"learning_rate": 1.2019234423885472e-08,
"loss": 0.0403,
"reward": 0.5000000111758709,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.5000000111758709,
"rewards/format_reward": 0.0,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 390.5416793823242,
"epoch": 2.896,
"grad_norm": 1.7262071371078491,
"learning_rate": 7.696014912157268e-09,
"loss": 0.0131,
"reward": 0.7500000223517418,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.7500000223517418,
"rewards/format_reward": 0.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 474.37501525878906,
"epoch": 2.9173333333333336,
"grad_norm": 2.062682867050171,
"learning_rate": 4.330629828528887e-09,
"loss": 0.0098,
"reward": 0.6875000186264515,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6875000186264515,
"rewards/format_reward": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 575.2083473205566,
"epoch": 2.9386666666666668,
"grad_norm": 2.000577926635742,
"learning_rate": 1.9252392434208623e-09,
"loss": 0.0269,
"reward": 0.6041666809469461,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.6041666809469461,
"rewards/format_reward": 0.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 487.1666793823242,
"epoch": 2.96,
"grad_norm": 1.6203181743621826,
"learning_rate": 4.81387055354221e-10,
"loss": 0.0127,
"reward": 0.7291666828095913,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.7291666828095913,
"rewards/format_reward": 0.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 521.4375095367432,
"epoch": 2.981333333333333,
"grad_norm": 2.0446486473083496,
"learning_rate": 0.0,
"loss": 0.0058,
"reward": 0.5833333414047956,
"reward_std": 0.3608439117670059,
"rewards/accuracy_reward": 0.5833333414047956,
"rewards/format_reward": 0.0,
"step": 138
},
{
"epoch": 2.981333333333333,
"step": 138,
"total_flos": 0.0,
"train_loss": 0.00609967172773474,
"train_runtime": 8086.2821,
"train_samples_per_second": 0.278,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 138,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}