| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9998757609640949, |
| "eval_steps": 100, |
| "global_step": 4527, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 384.625, |
| "epoch": 0.0011043469858229456, |
| "grad_norm": 1.912142623070508, |
| "kl": 0.0005407754331827163, |
| "learning_rate": 2.2075055187637973e-07, |
| "loss": 0.0, |
| "reward": 0.59375, |
| "reward_std": 0.30935921147465706, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.45625, |
| "step": 5 |
| }, |
| { |
| "completion_length": 461.3875, |
| "epoch": 0.002208693971645891, |
| "grad_norm": 1.2625025592593675, |
| "kl": 0.00020947456359863282, |
| "learning_rate": 4.4150110375275946e-07, |
| "loss": 0.0, |
| "reward": 0.575, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.41875, |
| "step": 10 |
| }, |
| { |
| "completion_length": 385.9875, |
| "epoch": 0.0033130409574688366, |
| "grad_norm": 1.4824872365395827, |
| "kl": 0.0002246655523777008, |
| "learning_rate": 6.622516556291392e-07, |
| "loss": 0.0, |
| "reward": 0.61875, |
| "reward_std": 0.3623922191560268, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.46875, |
| "step": 15 |
| }, |
| { |
| "completion_length": 429.06875, |
| "epoch": 0.004417387943291782, |
| "grad_norm": 1.1157617082303197, |
| "kl": 0.000436440110206604, |
| "learning_rate": 8.830022075055189e-07, |
| "loss": 0.0, |
| "reward": 0.6, |
| "reward_std": 0.3712310537695885, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.46875, |
| "step": 20 |
| }, |
| { |
| "completion_length": 352.0125, |
| "epoch": 0.005521734929114728, |
| "grad_norm": 0.6004950274417132, |
| "kl": 0.002642902731895447, |
| "learning_rate": 1.1037527593818985e-06, |
| "loss": 0.0001, |
| "reward": 0.725, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.60625, |
| "step": 25 |
| }, |
| { |
| "completion_length": 294.3125, |
| "epoch": 0.006626081914937673, |
| "grad_norm": 3.738595553790936, |
| "kl": 0.015436601638793946, |
| "learning_rate": 1.3245033112582784e-06, |
| "loss": 0.0006, |
| "reward": 0.73125, |
| "reward_std": 0.32703688070178033, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.61875, |
| "step": 30 |
| }, |
| { |
| "completion_length": 302.69375, |
| "epoch": 0.007730428900760619, |
| "grad_norm": 1.655779879795067, |
| "kl": 0.019573783874511717, |
| "learning_rate": 1.545253863134658e-06, |
| "loss": 0.0008, |
| "reward": 0.79375, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.73125, |
| "step": 35 |
| }, |
| { |
| "completion_length": 375.9125, |
| "epoch": 0.008834775886583565, |
| "grad_norm": 2.572886866328185, |
| "kl": 0.018162012100219727, |
| "learning_rate": 1.7660044150110378e-06, |
| "loss": 0.0007, |
| "reward": 0.8375, |
| "reward_std": 0.30052037686109545, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.7, |
| "step": 40 |
| }, |
| { |
| "completion_length": 363.75625, |
| "epoch": 0.00993912287240651, |
| "grad_norm": 0.7289483033436792, |
| "kl": 0.017768669128417968, |
| "learning_rate": 1.9867549668874175e-06, |
| "loss": 0.0007, |
| "reward": 0.81875, |
| "reward_std": 0.2916815422475338, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.73125, |
| "step": 45 |
| }, |
| { |
| "completion_length": 317.81875, |
| "epoch": 0.011043469858229456, |
| "grad_norm": 0.608013151594999, |
| "kl": 0.014789676666259766, |
| "learning_rate": 2.207505518763797e-06, |
| "loss": 0.0006, |
| "reward": 0.85625, |
| "reward_std": 0.2916815422475338, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.75625, |
| "step": 50 |
| }, |
| { |
| "completion_length": 288.8625, |
| "epoch": 0.012147816844052401, |
| "grad_norm": 1.087832023876081, |
| "kl": 0.014481544494628906, |
| "learning_rate": 2.4282560706401767e-06, |
| "loss": 0.0006, |
| "reward": 0.8, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.71875, |
| "step": 55 |
| }, |
| { |
| "completion_length": 266.53125, |
| "epoch": 0.013252163829875346, |
| "grad_norm": 1.453031787000738, |
| "kl": 0.013717460632324218, |
| "learning_rate": 2.6490066225165567e-06, |
| "loss": 0.0005, |
| "reward": 0.8625, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.8, |
| "step": 60 |
| }, |
| { |
| "completion_length": 285.14375, |
| "epoch": 0.014356510815698293, |
| "grad_norm": 1.139133573143394, |
| "kl": 0.01951141357421875, |
| "learning_rate": 2.8697571743929364e-06, |
| "loss": 0.0008, |
| "reward": 0.90625, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.81875, |
| "step": 65 |
| }, |
| { |
| "completion_length": 265.5875, |
| "epoch": 0.015460857801521238, |
| "grad_norm": 0.8819487715358617, |
| "kl": 0.01654224395751953, |
| "learning_rate": 3.090507726269316e-06, |
| "loss": 0.0007, |
| "reward": 0.89375, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.79375, |
| "step": 70 |
| }, |
| { |
| "completion_length": 275.11875, |
| "epoch": 0.016565204787344183, |
| "grad_norm": 0.6456181700347499, |
| "kl": 0.026264095306396486, |
| "learning_rate": 3.311258278145696e-06, |
| "loss": 0.0011, |
| "reward": 0.91875, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.8, |
| "step": 75 |
| }, |
| { |
| "completion_length": 260.90625, |
| "epoch": 0.01766955177316713, |
| "grad_norm": 1.1108485585200154, |
| "kl": 0.0263336181640625, |
| "learning_rate": 3.5320088300220757e-06, |
| "loss": 0.0011, |
| "reward": 0.91875, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.83125, |
| "step": 80 |
| }, |
| { |
| "completion_length": 210.76875, |
| "epoch": 0.018773898758990076, |
| "grad_norm": 0.6452807102957274, |
| "kl": 0.03692817687988281, |
| "learning_rate": 3.752759381898455e-06, |
| "loss": 0.0015, |
| "reward": 0.98125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.90625, |
| "step": 85 |
| }, |
| { |
| "completion_length": 264.475, |
| "epoch": 0.01987824574481302, |
| "grad_norm": 0.7880782851098767, |
| "kl": 0.035125350952148436, |
| "learning_rate": 3.973509933774835e-06, |
| "loss": 0.0014, |
| "reward": 0.88125, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.8125, |
| "step": 90 |
| }, |
| { |
| "completion_length": 284.78125, |
| "epoch": 0.020982592730635966, |
| "grad_norm": 0.7085853946380432, |
| "kl": 0.023376846313476564, |
| "learning_rate": 4.1942604856512145e-06, |
| "loss": 0.0009, |
| "reward": 0.95625, |
| "reward_std": 0.2563262037932873, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.8375, |
| "step": 95 |
| }, |
| { |
| "completion_length": 278.6875, |
| "epoch": 0.022086939716458913, |
| "grad_norm": 0.8560416069975489, |
| "kl": 0.04554176330566406, |
| "learning_rate": 4.415011037527594e-06, |
| "loss": 0.0018, |
| "reward": 0.9625, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.86875, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.022086939716458913, |
| "eval_completion_length": 248.37, |
| "eval_kl": 0.046234130859375, |
| "eval_loss": 0.0018549839733168483, |
| "eval_reward": 0.985, |
| "eval_reward_std": 0.1767766922712326, |
| "eval_rewards/accuracy_reward": 0.07, |
| "eval_rewards/format_reward": 0.915, |
| "eval_runtime": 127.1942, |
| "eval_samples_per_second": 0.778, |
| "eval_steps_per_second": 0.197, |
| "step": 100 |
| }, |
| { |
| "completion_length": 252.65625, |
| "epoch": 0.023191286702281856, |
| "grad_norm": 0.8513924054195807, |
| "kl": 0.07780532836914063, |
| "learning_rate": 4.635761589403974e-06, |
| "loss": 0.0031, |
| "reward": 0.99375, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.89375, |
| "step": 105 |
| }, |
| { |
| "completion_length": 212.9375, |
| "epoch": 0.024295633688104803, |
| "grad_norm": 1.1979997527722244, |
| "kl": 0.03602142333984375, |
| "learning_rate": 4.856512141280353e-06, |
| "loss": 0.0014, |
| "reward": 0.9375, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.8875, |
| "step": 110 |
| }, |
| { |
| "completion_length": 195.11875, |
| "epoch": 0.02539998067392775, |
| "grad_norm": 0.5507213314932029, |
| "kl": 0.03738479614257813, |
| "learning_rate": 5.077262693156734e-06, |
| "loss": 0.0015, |
| "reward": 0.95625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.90625, |
| "step": 115 |
| }, |
| { |
| "completion_length": 160.11875, |
| "epoch": 0.026504327659750693, |
| "grad_norm": 1.1186714555390784, |
| "kl": 0.036508941650390626, |
| "learning_rate": 5.2980132450331135e-06, |
| "loss": 0.0015, |
| "reward": 1.0, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.93125, |
| "step": 120 |
| }, |
| { |
| "completion_length": 183.3, |
| "epoch": 0.02760867464557364, |
| "grad_norm": 1.378767133063548, |
| "kl": 0.044263458251953124, |
| "learning_rate": 5.518763796909493e-06, |
| "loss": 0.0018, |
| "reward": 0.99375, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.8875, |
| "step": 125 |
| }, |
| { |
| "completion_length": 164.45, |
| "epoch": 0.028713021631396586, |
| "grad_norm": 0.7101264478347288, |
| "kl": 0.052339935302734376, |
| "learning_rate": 5.739514348785873e-06, |
| "loss": 0.0021, |
| "reward": 1.0625, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.925, |
| "step": 130 |
| }, |
| { |
| "completion_length": 180.11875, |
| "epoch": 0.02981736861721953, |
| "grad_norm": 0.49907818395454545, |
| "kl": 0.05917434692382813, |
| "learning_rate": 5.960264900662252e-06, |
| "loss": 0.0024, |
| "reward": 1.025, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.90625, |
| "step": 135 |
| }, |
| { |
| "completion_length": 198.775, |
| "epoch": 0.030921715603042476, |
| "grad_norm": 0.8587681927090215, |
| "kl": 0.06558990478515625, |
| "learning_rate": 6.181015452538632e-06, |
| "loss": 0.0026, |
| "reward": 0.98125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.86875, |
| "step": 140 |
| }, |
| { |
| "completion_length": 177.4625, |
| "epoch": 0.03202606258886542, |
| "grad_norm": 1.5915533844114518, |
| "kl": 0.08502044677734374, |
| "learning_rate": 6.4017660044150125e-06, |
| "loss": 0.0034, |
| "reward": 0.99375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9125, |
| "step": 145 |
| }, |
| { |
| "completion_length": 204.075, |
| "epoch": 0.033130409574688366, |
| "grad_norm": 0.7038907751646122, |
| "kl": 0.06302261352539062, |
| "learning_rate": 6.622516556291392e-06, |
| "loss": 0.0025, |
| "reward": 1.03125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9375, |
| "step": 150 |
| }, |
| { |
| "completion_length": 160.18125, |
| "epoch": 0.034234756560511316, |
| "grad_norm": 1.0970259017722608, |
| "kl": 0.09358978271484375, |
| "learning_rate": 6.843267108167772e-06, |
| "loss": 0.0037, |
| "reward": 1.08125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.96875, |
| "step": 155 |
| }, |
| { |
| "completion_length": 180.5875, |
| "epoch": 0.03533910354633426, |
| "grad_norm": 0.5682863259937629, |
| "kl": 0.091546630859375, |
| "learning_rate": 7.064017660044151e-06, |
| "loss": 0.0037, |
| "reward": 1.0375, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.975, |
| "step": 160 |
| }, |
| { |
| "completion_length": 209.2625, |
| "epoch": 0.0364434505321572, |
| "grad_norm": 0.9429119760227406, |
| "kl": 0.08664703369140625, |
| "learning_rate": 7.28476821192053e-06, |
| "loss": 0.0035, |
| "reward": 1.08125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.96875, |
| "step": 165 |
| }, |
| { |
| "completion_length": 208.41875, |
| "epoch": 0.03754779751798015, |
| "grad_norm": 1.2790088963784818, |
| "kl": 0.101275634765625, |
| "learning_rate": 7.50551876379691e-06, |
| "loss": 0.0041, |
| "reward": 1.0625, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.16875, |
| "rewards/format_reward": 0.89375, |
| "step": 170 |
| }, |
| { |
| "completion_length": 259.24375, |
| "epoch": 0.038652144503803096, |
| "grad_norm": 0.46075949216263296, |
| "kl": 0.0850189208984375, |
| "learning_rate": 7.726269315673288e-06, |
| "loss": 0.0034, |
| "reward": 0.94375, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.89375, |
| "step": 175 |
| }, |
| { |
| "completion_length": 248.26875, |
| "epoch": 0.03975649148962604, |
| "grad_norm": 0.6242098331069889, |
| "kl": 0.086053466796875, |
| "learning_rate": 7.94701986754967e-06, |
| "loss": 0.0034, |
| "reward": 1.025, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.8875, |
| "step": 180 |
| }, |
| { |
| "completion_length": 238.375, |
| "epoch": 0.04086083847544899, |
| "grad_norm": 1.150282122681421, |
| "kl": 0.1117950439453125, |
| "learning_rate": 8.16777041942605e-06, |
| "loss": 0.0045, |
| "reward": 1.025, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.9125, |
| "step": 185 |
| }, |
| { |
| "completion_length": 219.4, |
| "epoch": 0.04196518546127193, |
| "grad_norm": 0.7542415145174329, |
| "kl": 0.126629638671875, |
| "learning_rate": 8.388520971302429e-06, |
| "loss": 0.0051, |
| "reward": 0.975, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.89375, |
| "step": 190 |
| }, |
| { |
| "completion_length": 198.54375, |
| "epoch": 0.043069532447094876, |
| "grad_norm": 0.6615977009838082, |
| "kl": 0.1339691162109375, |
| "learning_rate": 8.609271523178809e-06, |
| "loss": 0.0054, |
| "reward": 0.975, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.89375, |
| "step": 195 |
| }, |
| { |
| "completion_length": 230.6, |
| "epoch": 0.044173879432917826, |
| "grad_norm": 1.2480834347897671, |
| "kl": 0.1072998046875, |
| "learning_rate": 8.830022075055188e-06, |
| "loss": 0.0043, |
| "reward": 0.95625, |
| "reward_std": 0.2563262037932873, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.85, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.044173879432917826, |
| "eval_completion_length": 205.55, |
| "eval_kl": 0.0896044921875, |
| "eval_loss": 0.003579025389626622, |
| "eval_reward": 1.085, |
| "eval_reward_std": 0.1484924215078354, |
| "eval_rewards/accuracy_reward": 0.13, |
| "eval_rewards/format_reward": 0.955, |
| "eval_runtime": 103.514, |
| "eval_samples_per_second": 0.956, |
| "eval_steps_per_second": 0.242, |
| "step": 200 |
| }, |
| { |
| "completion_length": 224.75, |
| "epoch": 0.04527822641874077, |
| "grad_norm": 0.5677247970494854, |
| "kl": 0.1135223388671875, |
| "learning_rate": 9.050772626931568e-06, |
| "loss": 0.0045, |
| "reward": 1.025, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.90625, |
| "step": 205 |
| }, |
| { |
| "completion_length": 276.25625, |
| "epoch": 0.04638257340456371, |
| "grad_norm": 0.6313414085910611, |
| "kl": 0.12886962890625, |
| "learning_rate": 9.271523178807948e-06, |
| "loss": 0.0052, |
| "reward": 1.125, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.20625, |
| "rewards/format_reward": 0.91875, |
| "step": 210 |
| }, |
| { |
| "completion_length": 337.5125, |
| "epoch": 0.04748692039038666, |
| "grad_norm": 0.6076982934519585, |
| "kl": 0.1175445556640625, |
| "learning_rate": 9.492273730684327e-06, |
| "loss": 0.0047, |
| "reward": 0.9125, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.81875, |
| "step": 215 |
| }, |
| { |
| "completion_length": 331.2875, |
| "epoch": 0.048591267376209606, |
| "grad_norm": 0.887758372335282, |
| "kl": 0.11523284912109374, |
| "learning_rate": 9.713024282560707e-06, |
| "loss": 0.0046, |
| "reward": 0.94375, |
| "reward_std": 0.23864853456616403, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.85625, |
| "step": 220 |
| }, |
| { |
| "completion_length": 269.38125, |
| "epoch": 0.04969561436203255, |
| "grad_norm": 0.6864886594586257, |
| "kl": 0.13491058349609375, |
| "learning_rate": 9.933774834437086e-06, |
| "loss": 0.0054, |
| "reward": 0.95, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.875, |
| "step": 225 |
| }, |
| { |
| "completion_length": 246.025, |
| "epoch": 0.0507999613478555, |
| "grad_norm": 1.2699307354463993, |
| "kl": 0.166473388671875, |
| "learning_rate": 1.0154525386313468e-05, |
| "loss": 0.0067, |
| "reward": 1.025, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.88125, |
| "step": 230 |
| }, |
| { |
| "completion_length": 206.09375, |
| "epoch": 0.05190430833367844, |
| "grad_norm": 0.6720713057618479, |
| "kl": 0.1639678955078125, |
| "learning_rate": 1.0375275938189846e-05, |
| "loss": 0.0066, |
| "reward": 1.0375, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.925, |
| "step": 235 |
| }, |
| { |
| "completion_length": 312.0875, |
| "epoch": 0.053008655319501385, |
| "grad_norm": 0.8771338863302535, |
| "kl": 0.25369873046875, |
| "learning_rate": 1.0596026490066227e-05, |
| "loss": 0.0101, |
| "reward": 0.9, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.8125, |
| "step": 240 |
| }, |
| { |
| "completion_length": 188.65, |
| "epoch": 0.054113002305324336, |
| "grad_norm": 0.6127023135715081, |
| "kl": 0.209857177734375, |
| "learning_rate": 1.0816777041942605e-05, |
| "loss": 0.0084, |
| "reward": 1.0125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.91875, |
| "step": 245 |
| }, |
| { |
| "completion_length": 183.35, |
| "epoch": 0.05521734929114728, |
| "grad_norm": 0.9178124946472914, |
| "kl": 0.2119171142578125, |
| "learning_rate": 1.1037527593818986e-05, |
| "loss": 0.0085, |
| "reward": 0.95625, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.90625, |
| "step": 250 |
| }, |
| { |
| "completion_length": 199.725, |
| "epoch": 0.05632169627697022, |
| "grad_norm": 0.9841255331094259, |
| "kl": 0.188714599609375, |
| "learning_rate": 1.1258278145695364e-05, |
| "loss": 0.0075, |
| "reward": 0.9875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.91875, |
| "step": 255 |
| }, |
| { |
| "completion_length": 222.5875, |
| "epoch": 0.05742604326279317, |
| "grad_norm": 0.6571161565702436, |
| "kl": 1504.1844940185547, |
| "learning_rate": 1.1479028697571745e-05, |
| "loss": 60.0766, |
| "reward": 0.975, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9, |
| "step": 260 |
| }, |
| { |
| "completion_length": 238.375, |
| "epoch": 0.058530390248616115, |
| "grad_norm": 0.7793687563873339, |
| "kl": 0.16044921875, |
| "learning_rate": 1.1699779249448125e-05, |
| "loss": 0.0064, |
| "reward": 1.04375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.925, |
| "step": 265 |
| }, |
| { |
| "completion_length": 308.975, |
| "epoch": 0.05963473723443906, |
| "grad_norm": 0.6524174398492628, |
| "kl": 0.344976806640625, |
| "learning_rate": 1.1920529801324505e-05, |
| "loss": 0.0138, |
| "reward": 1.03125, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.89375, |
| "step": 270 |
| }, |
| { |
| "completion_length": 298.2875, |
| "epoch": 0.06073908422026201, |
| "grad_norm": 0.515875299778908, |
| "kl": 0.228948974609375, |
| "learning_rate": 1.2141280353200884e-05, |
| "loss": 0.0092, |
| "reward": 0.98125, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.88125, |
| "step": 275 |
| }, |
| { |
| "completion_length": 328.84375, |
| "epoch": 0.06184343120608495, |
| "grad_norm": 31.625220010701653, |
| "kl": 0.2593719482421875, |
| "learning_rate": 1.2362030905077264e-05, |
| "loss": 0.0104, |
| "reward": 1.08125, |
| "reward_std": 0.30935921147465706, |
| "rewards/accuracy_reward": 0.225, |
| "rewards/format_reward": 0.85625, |
| "step": 280 |
| }, |
| { |
| "completion_length": 338.28125, |
| "epoch": 0.0629477781919079, |
| "grad_norm": 0.8132166957068776, |
| "kl": 0.26093597412109376, |
| "learning_rate": 1.2582781456953644e-05, |
| "loss": 0.0104, |
| "reward": 0.925, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.83125, |
| "step": 285 |
| }, |
| { |
| "completion_length": 276.55625, |
| "epoch": 0.06405212517773085, |
| "grad_norm": 0.8732592923475726, |
| "kl": 0.2025054931640625, |
| "learning_rate": 1.2803532008830025e-05, |
| "loss": 0.0081, |
| "reward": 1.0125, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.8875, |
| "step": 290 |
| }, |
| { |
| "completion_length": 222.75, |
| "epoch": 0.06515647216355379, |
| "grad_norm": 0.5307249225741028, |
| "kl": 0.1828094482421875, |
| "learning_rate": 1.3024282560706403e-05, |
| "loss": 0.0073, |
| "reward": 1.0625, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9375, |
| "step": 295 |
| }, |
| { |
| "completion_length": 176.43125, |
| "epoch": 0.06626081914937673, |
| "grad_norm": 0.7065408921391656, |
| "kl": 0.169866943359375, |
| "learning_rate": 1.3245033112582784e-05, |
| "loss": 0.0068, |
| "reward": 1.11875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1625, |
| "rewards/format_reward": 0.95625, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.06626081914937673, |
| "eval_completion_length": 205.125, |
| "eval_kl": 0.178349609375, |
| "eval_loss": 0.007154763210564852, |
| "eval_reward": 1.085, |
| "eval_reward_std": 0.162634556889534, |
| "eval_rewards/accuracy_reward": 0.13, |
| "eval_rewards/format_reward": 0.955, |
| "eval_runtime": 111.3878, |
| "eval_samples_per_second": 0.889, |
| "eval_steps_per_second": 0.224, |
| "step": 300 |
| }, |
| { |
| "completion_length": 243.425, |
| "epoch": 0.06736516613519968, |
| "grad_norm": 0.7324178851810815, |
| "kl": 0.2018463134765625, |
| "learning_rate": 1.3465783664459162e-05, |
| "loss": 0.0081, |
| "reward": 1.0375, |
| "reward_std": 0.2828427076339722, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.9, |
| "step": 305 |
| }, |
| { |
| "completion_length": 246.875, |
| "epoch": 0.06846951312102263, |
| "grad_norm": 0.5430407102590828, |
| "kl": 0.199591064453125, |
| "learning_rate": 1.3686534216335543e-05, |
| "loss": 0.008, |
| "reward": 1.0, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.9125, |
| "step": 310 |
| }, |
| { |
| "completion_length": 231.8625, |
| "epoch": 0.06957386010684558, |
| "grad_norm": 2.525719433106217, |
| "kl": 0.256878662109375, |
| "learning_rate": 1.3907284768211921e-05, |
| "loss": 0.0103, |
| "reward": 1.08125, |
| "reward_std": 0.23864853456616403, |
| "rewards/accuracy_reward": 0.18125, |
| "rewards/format_reward": 0.9, |
| "step": 315 |
| }, |
| { |
| "completion_length": 269.71875, |
| "epoch": 0.07067820709266852, |
| "grad_norm": 2.2679595474497583, |
| "kl": 0.81484375, |
| "learning_rate": 1.4128035320088303e-05, |
| "loss": 0.0326, |
| "reward": 1.0, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.875, |
| "step": 320 |
| }, |
| { |
| "completion_length": 208.94375, |
| "epoch": 0.07178255407849146, |
| "grad_norm": 1.1848713457261022, |
| "kl": 0.43358154296875, |
| "learning_rate": 1.434878587196468e-05, |
| "loss": 0.0174, |
| "reward": 0.99375, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.88125, |
| "step": 325 |
| }, |
| { |
| "completion_length": 186.3375, |
| "epoch": 0.0728869010643144, |
| "grad_norm": 1.434354885038631, |
| "kl": 0.292218017578125, |
| "learning_rate": 1.456953642384106e-05, |
| "loss": 0.0117, |
| "reward": 0.74375, |
| "reward_std": 0.30935921147465706, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.66875, |
| "step": 330 |
| }, |
| { |
| "completion_length": 616.03125, |
| "epoch": 0.07399124805013735, |
| "grad_norm": 0.46617566479772593, |
| "kl": 0.228448486328125, |
| "learning_rate": 1.479028697571744e-05, |
| "loss": 0.0091, |
| "reward": 0.58125, |
| "reward_std": 0.2563262037932873, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.5, |
| "step": 335 |
| }, |
| { |
| "completion_length": 781.5875, |
| "epoch": 0.0750955950359603, |
| "grad_norm": 0.4999142123724588, |
| "kl": 0.203704833984375, |
| "learning_rate": 1.501103752759382e-05, |
| "loss": 0.0081, |
| "reward": 0.5125, |
| "reward_std": 0.30052037686109545, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.48125, |
| "step": 340 |
| }, |
| { |
| "completion_length": 438.83125, |
| "epoch": 0.07619994202178325, |
| "grad_norm": 1.1259807453052872, |
| "kl": 0.229962158203125, |
| "learning_rate": 1.52317880794702e-05, |
| "loss": 0.0092, |
| "reward": 0.99375, |
| "reward_std": 0.2916815422475338, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.8625, |
| "step": 345 |
| }, |
| { |
| "completion_length": 308.03125, |
| "epoch": 0.07730428900760619, |
| "grad_norm": 1.1381907828574633, |
| "kl": 0.246209716796875, |
| "learning_rate": 1.5452538631346577e-05, |
| "loss": 0.0098, |
| "reward": 1.01875, |
| "reward_std": 0.2740038730204105, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.8625, |
| "step": 350 |
| }, |
| { |
| "completion_length": 466.78125, |
| "epoch": 0.07840863599342913, |
| "grad_norm": 145.06654530948717, |
| "kl": 1.06409912109375, |
| "learning_rate": 1.567328918322296e-05, |
| "loss": 0.0425, |
| "reward": 0.83125, |
| "reward_std": 0.3623922191560268, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.725, |
| "step": 355 |
| }, |
| { |
| "completion_length": 601.19375, |
| "epoch": 0.07951298297925208, |
| "grad_norm": 4.173181885857791, |
| "kl": 0.5132080078125, |
| "learning_rate": 1.589403973509934e-05, |
| "loss": 0.0205, |
| "reward": 0.5625, |
| "reward_std": 0.38890872299671175, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.5125, |
| "step": 360 |
| }, |
| { |
| "completion_length": 552.525, |
| "epoch": 0.08061732996507502, |
| "grad_norm": 2.488203792347465, |
| "kl": 0.5505126953125, |
| "learning_rate": 1.6114790286975718e-05, |
| "loss": 0.022, |
| "reward": 0.675, |
| "reward_std": 0.38890872299671175, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.61875, |
| "step": 365 |
| }, |
| { |
| "completion_length": 323.6625, |
| "epoch": 0.08172167695089798, |
| "grad_norm": 0.4380180481079113, |
| "kl": 0.425970458984375, |
| "learning_rate": 1.63355408388521e-05, |
| "loss": 0.017, |
| "reward": 1.0375, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.90625, |
| "step": 370 |
| }, |
| { |
| "completion_length": 284.35, |
| "epoch": 0.08282602393672092, |
| "grad_norm": 1.2931698264509297, |
| "kl": 0.374395751953125, |
| "learning_rate": 1.6556291390728477e-05, |
| "loss": 0.015, |
| "reward": 1.01875, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9, |
| "step": 375 |
| }, |
| { |
| "completion_length": 271.5875, |
| "epoch": 0.08393037092254386, |
| "grad_norm": 2.19321510252618, |
| "kl": 0.78577880859375, |
| "learning_rate": 1.6777041942604858e-05, |
| "loss": 0.0315, |
| "reward": 1.0625, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.175, |
| "rewards/format_reward": 0.8875, |
| "step": 380 |
| }, |
| { |
| "completion_length": 247.65, |
| "epoch": 0.08503471790836681, |
| "grad_norm": 0.8492367847141669, |
| "kl": 0.65609130859375, |
| "learning_rate": 1.699779249448124e-05, |
| "loss": 0.0263, |
| "reward": 1.04375, |
| "reward_std": 0.23864853456616403, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.89375, |
| "step": 385 |
| }, |
| { |
| "completion_length": 279.06875, |
| "epoch": 0.08613906489418975, |
| "grad_norm": 1.1176439301337926, |
| "kl": 0.440093994140625, |
| "learning_rate": 1.7218543046357617e-05, |
| "loss": 0.0176, |
| "reward": 1.0125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.90625, |
| "step": 390 |
| }, |
| { |
| "completion_length": 381.325, |
| "epoch": 0.0872434118800127, |
| "grad_norm": 1.4438637661323328, |
| "kl": 1.2351318359375, |
| "learning_rate": 1.7439293598234e-05, |
| "loss": 0.0494, |
| "reward": 0.81875, |
| "reward_std": 0.30935921147465706, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.73125, |
| "step": 395 |
| }, |
| { |
| "completion_length": 393.53125, |
| "epoch": 0.08834775886583565, |
| "grad_norm": 0.8840798582686505, |
| "kl": 0.66854248046875, |
| "learning_rate": 1.7660044150110377e-05, |
| "loss": 0.0267, |
| "reward": 0.9625, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.8375, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.08834775886583565, |
| "eval_completion_length": 357.46, |
| "eval_kl": 0.44, |
| "eval_loss": 0.017609253525733948, |
| "eval_reward": 0.99, |
| "eval_reward_std": 0.24041630148887635, |
| "eval_rewards/accuracy_reward": 0.155, |
| "eval_rewards/format_reward": 0.835, |
| "eval_runtime": 153.4656, |
| "eval_samples_per_second": 0.645, |
| "eval_steps_per_second": 0.163, |
| "step": 400 |
| }, |
| { |
| "completion_length": 289.31875, |
| "epoch": 0.0894521058516586, |
| "grad_norm": 5.233032104359776, |
| "kl": 0.511737060546875, |
| "learning_rate": 1.7880794701986758e-05, |
| "loss": 0.0205, |
| "reward": 1.00625, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9125, |
| "step": 405 |
| }, |
| { |
| "completion_length": 255.6, |
| "epoch": 0.09055645283748154, |
| "grad_norm": 0.5288317975266756, |
| "kl": 0.2679931640625, |
| "learning_rate": 1.8101545253863136e-05, |
| "loss": 0.0107, |
| "reward": 1.05, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.95, |
| "step": 410 |
| }, |
| { |
| "completion_length": 283.04375, |
| "epoch": 0.09166079982330448, |
| "grad_norm": 0.44523434013578594, |
| "kl": 0.219281005859375, |
| "learning_rate": 1.8322295805739517e-05, |
| "loss": 0.0088, |
| "reward": 1.0625, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95, |
| "step": 415 |
| }, |
| { |
| "completion_length": 308.90625, |
| "epoch": 0.09276514680912742, |
| "grad_norm": 0.8815212129055245, |
| "kl": 0.264593505859375, |
| "learning_rate": 1.8543046357615895e-05, |
| "loss": 0.0106, |
| "reward": 1.0125, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9125, |
| "step": 420 |
| }, |
| { |
| "completion_length": 314.7375, |
| "epoch": 0.09386949379495037, |
| "grad_norm": 1.3789377076942395, |
| "kl": 0.3002197265625, |
| "learning_rate": 1.8763796909492276e-05, |
| "loss": 0.012, |
| "reward": 1.0, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.90625, |
| "step": 425 |
| }, |
| { |
| "completion_length": 339.475, |
| "epoch": 0.09497384078077332, |
| "grad_norm": 0.6184446711073517, |
| "kl": 0.441192626953125, |
| "learning_rate": 1.8984547461368654e-05, |
| "loss": 0.0177, |
| "reward": 0.95625, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.90625, |
| "step": 430 |
| }, |
| { |
| "completion_length": 317.0125, |
| "epoch": 0.09607818776659627, |
| "grad_norm": 1.5357040975928846, |
| "kl": 0.2957275390625, |
| "learning_rate": 1.9205298013245036e-05, |
| "loss": 0.0118, |
| "reward": 1.0, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.94375, |
| "step": 435 |
| }, |
| { |
| "completion_length": 326.9, |
| "epoch": 0.09718253475241921, |
| "grad_norm": 0.9509356833694793, |
| "kl": 0.24755859375, |
| "learning_rate": 1.9426048565121414e-05, |
| "loss": 0.0099, |
| "reward": 0.95, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.89375, |
| "step": 440 |
| }, |
| { |
| "completion_length": 247.94375, |
| "epoch": 0.09828688173824215, |
| "grad_norm": 7.331260846492792, |
| "kl": 0.458038330078125, |
| "learning_rate": 1.9646799116997795e-05, |
| "loss": 0.0183, |
| "reward": 0.9875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.93125, |
| "step": 445 |
| }, |
| { |
| "completion_length": 244.2, |
| "epoch": 0.0993912287240651, |
| "grad_norm": 2.2209695163797973, |
| "kl": 0.579638671875, |
| "learning_rate": 1.9867549668874173e-05, |
| "loss": 0.0232, |
| "reward": 0.98125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.93125, |
| "step": 450 |
| }, |
| { |
| "completion_length": 247.9875, |
| "epoch": 0.10049557570988804, |
| "grad_norm": 0.6146792415154715, |
| "kl": 0.3593017578125, |
| "learning_rate": 1.9999988107104428e-05, |
| "loss": 0.0144, |
| "reward": 1.025, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.95625, |
| "step": 455 |
| }, |
| { |
| "completion_length": 204.425, |
| "epoch": 0.101599922695711, |
| "grad_norm": 1.2131689914986956, |
| "kl": 0.246929931640625, |
| "learning_rate": 1.9999854312354064e-05, |
| "loss": 0.0099, |
| "reward": 1.08125, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.95625, |
| "step": 460 |
| }, |
| { |
| "completion_length": 215.4625, |
| "epoch": 0.10270426968153394, |
| "grad_norm": 22.462295342430743, |
| "kl": 0.606536865234375, |
| "learning_rate": 1.999957185872951e-05, |
| "loss": 0.0243, |
| "reward": 0.94375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.90625, |
| "step": 465 |
| }, |
| { |
| "completion_length": 180.38125, |
| "epoch": 0.10380861666735688, |
| "grad_norm": 0.22049287292981123, |
| "kl": 1.026416015625, |
| "learning_rate": 1.999914075042975e-05, |
| "loss": 0.0411, |
| "reward": 0.99375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.93125, |
| "step": 470 |
| }, |
| { |
| "completion_length": 227.85625, |
| "epoch": 0.10491296365317983, |
| "grad_norm": 0.5306656385299113, |
| "kl": 0.257354736328125, |
| "learning_rate": 1.9998560993863682e-05, |
| "loss": 0.0103, |
| "reward": 0.925, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.8875, |
| "step": 475 |
| }, |
| { |
| "completion_length": 173.825, |
| "epoch": 0.10601731063900277, |
| "grad_norm": 0.7697195391306778, |
| "kl": 0.243804931640625, |
| "learning_rate": 1.999783259765003e-05, |
| "loss": 0.0098, |
| "reward": 0.9875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.95, |
| "step": 480 |
| }, |
| { |
| "completion_length": 245.43125, |
| "epoch": 0.10712165762482571, |
| "grad_norm": 0.8895584020157263, |
| "kl": 0.344189453125, |
| "learning_rate": 1.9996955572617202e-05, |
| "loss": 0.0138, |
| "reward": 0.94375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.925, |
| "step": 485 |
| }, |
| { |
| "completion_length": 228.575, |
| "epoch": 0.10822600461064867, |
| "grad_norm": 0.4019967230109815, |
| "kl": 0.6487060546875, |
| "learning_rate": 1.999592993180315e-05, |
| "loss": 0.026, |
| "reward": 0.95, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9, |
| "step": 490 |
| }, |
| { |
| "completion_length": 191.1125, |
| "epoch": 0.10933035159647161, |
| "grad_norm": 2.2236898903555584, |
| "kl": 0.3437255859375, |
| "learning_rate": 1.9994755690455154e-05, |
| "loss": 0.0137, |
| "reward": 1.025, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.95625, |
| "step": 495 |
| }, |
| { |
| "completion_length": 227.65625, |
| "epoch": 0.11043469858229456, |
| "grad_norm": 0.31534477505615033, |
| "kl": 0.254638671875, |
| "learning_rate": 1.9993432866029604e-05, |
| "loss": 0.0102, |
| "reward": 1.0375, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.96875, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.11043469858229456, |
| "eval_completion_length": 305.16, |
| "eval_kl": 0.25392578125, |
| "eval_loss": 0.010133117437362671, |
| "eval_reward": 1.01, |
| "eval_reward_std": 0.09899494767189027, |
| "eval_rewards/accuracy_reward": 0.065, |
| "eval_rewards/format_reward": 0.945, |
| "eval_runtime": 127.2591, |
| "eval_samples_per_second": 0.778, |
| "eval_steps_per_second": 0.196, |
| "step": 500 |
| }, |
| { |
| "completion_length": 348.06875, |
| "epoch": 0.1115390455681175, |
| "grad_norm": 0.47067061501157526, |
| "kl": 0.3302001953125, |
| "learning_rate": 1.9991961478191753e-05, |
| "loss": 0.0132, |
| "reward": 0.95625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.9125, |
| "step": 505 |
| }, |
| { |
| "completion_length": 404.30625, |
| "epoch": 0.11264339255394044, |
| "grad_norm": 0.3909987106694373, |
| "kl": 0.290771484375, |
| "learning_rate": 1.99903415488154e-05, |
| "loss": 0.0116, |
| "reward": 0.975, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.93125, |
| "step": 510 |
| }, |
| { |
| "completion_length": 331.25, |
| "epoch": 0.11374773953976339, |
| "grad_norm": 0.13154079900966428, |
| "kl": 0.2630859375, |
| "learning_rate": 1.998857310198259e-05, |
| "loss": 0.0105, |
| "reward": 0.90625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0125, |
| "rewards/format_reward": 0.89375, |
| "step": 515 |
| }, |
| { |
| "completion_length": 294.86875, |
| "epoch": 0.11485208652558634, |
| "grad_norm": 0.600369457847733, |
| "kl": 0.209710693359375, |
| "learning_rate": 1.998665616398323e-05, |
| "loss": 0.0084, |
| "reward": 0.9875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.95, |
| "step": 520 |
| }, |
| { |
| "completion_length": 194.80625, |
| "epoch": 0.11595643351140929, |
| "grad_norm": 0.6528037119159925, |
| "kl": 0.23944091796875, |
| "learning_rate": 1.9984590763314722e-05, |
| "loss": 0.0096, |
| "reward": 1.01875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.975, |
| "step": 525 |
| }, |
| { |
| "completion_length": 150.6, |
| "epoch": 0.11706078049723223, |
| "grad_norm": 0.2685863872987803, |
| "kl": 0.19815673828125, |
| "learning_rate": 1.998237693068153e-05, |
| "loss": 0.0079, |
| "reward": 1.00625, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.975, |
| "step": 530 |
| }, |
| { |
| "completion_length": 190.7, |
| "epoch": 0.11816512748305517, |
| "grad_norm": 0.6345565715762368, |
| "kl": 0.385015869140625, |
| "learning_rate": 1.9980014698994722e-05, |
| "loss": 0.0154, |
| "reward": 1.0, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.95625, |
| "step": 535 |
| }, |
| { |
| "completion_length": 263.48125, |
| "epoch": 0.11926947446887812, |
| "grad_norm": 0.7032479277266585, |
| "kl": 0.430792236328125, |
| "learning_rate": 1.997750410337147e-05, |
| "loss": 0.0172, |
| "reward": 1.0125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.96875, |
| "step": 540 |
| }, |
| { |
| "completion_length": 322.3125, |
| "epoch": 0.12037382145470106, |
| "grad_norm": 0.7900459644547021, |
| "kl": 0.45318603515625, |
| "learning_rate": 1.997484518113456e-05, |
| "loss": 0.0181, |
| "reward": 0.9625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.925, |
| "step": 545 |
| }, |
| { |
| "completion_length": 282.39375, |
| "epoch": 0.12147816844052402, |
| "grad_norm": 1.7131720462446698, |
| "kl": 0.667669677734375, |
| "learning_rate": 1.9972037971811802e-05, |
| "loss": 0.0267, |
| "reward": 0.94375, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0125, |
| "rewards/format_reward": 0.93125, |
| "step": 550 |
| }, |
| { |
| "completion_length": 351.20625, |
| "epoch": 0.12258251542634696, |
| "grad_norm": 2.3429062472049664, |
| "kl": 0.4404296875, |
| "learning_rate": 1.9969082517135463e-05, |
| "loss": 0.0176, |
| "reward": 0.9875, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.93125, |
| "step": 555 |
| }, |
| { |
| "completion_length": 302.83125, |
| "epoch": 0.1236868624121699, |
| "grad_norm": 0.42331075217778713, |
| "kl": 0.244976806640625, |
| "learning_rate": 1.9965978861041637e-05, |
| "loss": 0.0098, |
| "reward": 1.04375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9625, |
| "step": 560 |
| }, |
| { |
| "completion_length": 227.3625, |
| "epoch": 0.12479120939799285, |
| "grad_norm": 6.878210574805294, |
| "kl": 0.490325927734375, |
| "learning_rate": 1.99627270496696e-05, |
| "loss": 0.0196, |
| "reward": 1.0125, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.95625, |
| "step": 565 |
| }, |
| { |
| "completion_length": 259.70625, |
| "epoch": 0.1258955563838158, |
| "grad_norm": 0.29828625265818887, |
| "kl": 0.570684814453125, |
| "learning_rate": 1.995932713136112e-05, |
| "loss": 0.0228, |
| "reward": 0.9625, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.925, |
| "step": 570 |
| }, |
| { |
| "completion_length": 256.55625, |
| "epoch": 0.12699990336963873, |
| "grad_norm": 0.5399903589695695, |
| "kl": 0.375726318359375, |
| "learning_rate": 1.9955779156659735e-05, |
| "loss": 0.015, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.975, |
| "step": 575 |
| }, |
| { |
| "completion_length": 242.5, |
| "epoch": 0.1281042503554617, |
| "grad_norm": 9.42984310059387, |
| "kl": 0.512103271484375, |
| "learning_rate": 1.9952083178310002e-05, |
| "loss": 0.0205, |
| "reward": 1.0375, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9625, |
| "step": 580 |
| }, |
| { |
| "completion_length": 217.3375, |
| "epoch": 0.12920859734128462, |
| "grad_norm": 4.883219507705031, |
| "kl": 0.71790771484375, |
| "learning_rate": 1.994823925125672e-05, |
| "loss": 0.0287, |
| "reward": 1.03125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.95, |
| "step": 585 |
| }, |
| { |
| "completion_length": 213.125, |
| "epoch": 0.13031294432710758, |
| "grad_norm": 0.656048665938222, |
| "kl": 0.331298828125, |
| "learning_rate": 1.994424743264412e-05, |
| "loss": 0.0132, |
| "reward": 1.10625, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.95625, |
| "step": 590 |
| }, |
| { |
| "completion_length": 175.19375, |
| "epoch": 0.13141729131293053, |
| "grad_norm": 0.4384345369089057, |
| "kl": 0.21328125, |
| "learning_rate": 1.9940107781814976e-05, |
| "loss": 0.0085, |
| "reward": 1.08125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.99375, |
| "step": 595 |
| }, |
| { |
| "completion_length": 232.13125, |
| "epoch": 0.13252163829875346, |
| "grad_norm": 9.864648628359978, |
| "kl": 0.3187255859375, |
| "learning_rate": 1.993582036030978e-05, |
| "loss": 0.0127, |
| "reward": 1.00625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.95625, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.13252163829875346, |
| "eval_completion_length": 205.38, |
| "eval_kl": 16.67103515625, |
| "eval_loss": 0.669373095035553, |
| "eval_reward": 1.06, |
| "eval_reward_std": 0.22627416610717774, |
| "eval_rewards/accuracy_reward": 0.135, |
| "eval_rewards/format_reward": 0.925, |
| "eval_runtime": 95.7714, |
| "eval_samples_per_second": 1.034, |
| "eval_steps_per_second": 0.261, |
| "step": 600 |
| }, |
| { |
| "completion_length": 220.1375, |
| "epoch": 0.13362598528457642, |
| "grad_norm": 0.9515331014381163, |
| "kl": 0.321160888671875, |
| "learning_rate": 1.993138523186578e-05, |
| "loss": 0.0129, |
| "reward": 1.06875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.98125, |
| "step": 605 |
| }, |
| { |
| "completion_length": 219.99375, |
| "epoch": 0.13473033227039935, |
| "grad_norm": 3.2832305875346033, |
| "kl": 0.55125732421875, |
| "learning_rate": 1.9926802462416054e-05, |
| "loss": 0.0221, |
| "reward": 1.04375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9625, |
| "step": 610 |
| }, |
| { |
| "completion_length": 218.5875, |
| "epoch": 0.1358346792562223, |
| "grad_norm": 1.0232056540109529, |
| "kl": 0.942523193359375, |
| "learning_rate": 1.9922072120088537e-05, |
| "loss": 0.0377, |
| "reward": 0.99375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.93125, |
| "step": 615 |
| }, |
| { |
| "completion_length": 268.2125, |
| "epoch": 0.13693902624204526, |
| "grad_norm": 1.7246486491338628, |
| "kl": 0.6009521484375, |
| "learning_rate": 1.991719427520499e-05, |
| "loss": 0.024, |
| "reward": 1.0125, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.9, |
| "step": 620 |
| }, |
| { |
| "completion_length": 240.51875, |
| "epoch": 0.1380433732278682, |
| "grad_norm": 7.217525782475475, |
| "kl": 0.579345703125, |
| "learning_rate": 1.9912169000279952e-05, |
| "loss": 0.0231, |
| "reward": 1.05, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.975, |
| "step": 625 |
| }, |
| { |
| "completion_length": 307.725, |
| "epoch": 0.13914772021369115, |
| "grad_norm": 0.32686422446894037, |
| "kl": 0.320556640625, |
| "learning_rate": 1.9906996370019692e-05, |
| "loss": 0.0128, |
| "reward": 1.05625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9375, |
| "step": 630 |
| }, |
| { |
| "completion_length": 448.2375, |
| "epoch": 0.14025206719951408, |
| "grad_norm": 4.724746821053579, |
| "kl": 0.55283203125, |
| "learning_rate": 1.990167646132107e-05, |
| "loss": 0.0221, |
| "reward": 0.8125, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.74375, |
| "step": 635 |
| }, |
| { |
| "completion_length": 466.6125, |
| "epoch": 0.14135641418533704, |
| "grad_norm": 0.3626333681552602, |
| "kl": 3.0367431640625, |
| "learning_rate": 1.9896209353270394e-05, |
| "loss": 0.1216, |
| "reward": 0.7625, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.725, |
| "step": 640 |
| }, |
| { |
| "completion_length": 351.9125, |
| "epoch": 0.14246076117115997, |
| "grad_norm": 0.3534359394092258, |
| "kl": 0.2459228515625, |
| "learning_rate": 1.989059512714227e-05, |
| "loss": 0.0098, |
| "reward": 1.01875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.925, |
| "step": 645 |
| }, |
| { |
| "completion_length": 309.93125, |
| "epoch": 0.14356510815698292, |
| "grad_norm": 0.6269748799278583, |
| "kl": 0.239697265625, |
| "learning_rate": 1.988483386639836e-05, |
| "loss": 0.0096, |
| "reward": 1.1125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.9625, |
| "step": 650 |
| }, |
| { |
| "completion_length": 304.49375, |
| "epoch": 0.14466945514280588, |
| "grad_norm": 2.4513786768690364, |
| "kl": 0.2604248046875, |
| "learning_rate": 1.9878925656686167e-05, |
| "loss": 0.0104, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.98125, |
| "step": 655 |
| }, |
| { |
| "completion_length": 307.99375, |
| "epoch": 0.1457738021286288, |
| "grad_norm": 0.5055349332276848, |
| "kl": 0.27725830078125, |
| "learning_rate": 1.9872870585837757e-05, |
| "loss": 0.0111, |
| "reward": 1.00625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.95, |
| "step": 660 |
| }, |
| { |
| "completion_length": 281.29375, |
| "epoch": 0.14687814911445177, |
| "grad_norm": 42388.24147147825, |
| "kl": 170.7101318359375, |
| "learning_rate": 1.9866668743868437e-05, |
| "loss": 6.8324, |
| "reward": 0.9375, |
| "reward_std": 0.03535533845424652, |
| "rewards/accuracy_reward": 0.0125, |
| "rewards/format_reward": 0.925, |
| "step": 665 |
| }, |
| { |
| "completion_length": 331.09375, |
| "epoch": 0.1479824961002747, |
| "grad_norm": 3.1708940830460186, |
| "kl": 1.48009033203125, |
| "learning_rate": 1.9860320222975435e-05, |
| "loss": 0.0594, |
| "reward": 0.8875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.84375, |
| "step": 670 |
| }, |
| { |
| "completion_length": 329.43125, |
| "epoch": 0.14908684308609765, |
| "grad_norm": 9.028073103998265, |
| "kl": 0.427667236328125, |
| "learning_rate": 1.9853825117536522e-05, |
| "loss": 0.0171, |
| "reward": 0.95, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.9125, |
| "step": 675 |
| }, |
| { |
| "completion_length": 295.99375, |
| "epoch": 0.1501911900719206, |
| "grad_norm": 1.3497959756065208, |
| "kl": 0.8688232421875, |
| "learning_rate": 1.9847183524108614e-05, |
| "loss": 0.0348, |
| "reward": 0.98125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.93125, |
| "step": 680 |
| }, |
| { |
| "completion_length": 256.76875, |
| "epoch": 0.15129553705774354, |
| "grad_norm": 0.5127846687930365, |
| "kl": 0.305950927734375, |
| "learning_rate": 1.9840395541426333e-05, |
| "loss": 0.0122, |
| "reward": 1.06875, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.96875, |
| "step": 685 |
| }, |
| { |
| "completion_length": 248.68125, |
| "epoch": 0.1523998840435665, |
| "grad_norm": 0.9607285175875181, |
| "kl": 0.33861083984375, |
| "learning_rate": 1.983346127040053e-05, |
| "loss": 0.0135, |
| "reward": 1.01875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.95, |
| "step": 690 |
| }, |
| { |
| "completion_length": 209.85, |
| "epoch": 0.15350423102938943, |
| "grad_norm": 2.0950203003912313, |
| "kl": 0.3568115234375, |
| "learning_rate": 1.9826380814116795e-05, |
| "loss": 0.0143, |
| "reward": 1.0, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.925, |
| "step": 695 |
| }, |
| { |
| "completion_length": 618.08125, |
| "epoch": 0.15460857801521238, |
| "grad_norm": 2.8704953934506423, |
| "kl": 1.026025390625, |
| "learning_rate": 1.9819154277833938e-05, |
| "loss": 0.041, |
| "reward": 0.71875, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.6875, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.15460857801521238, |
| "eval_completion_length": 959.67, |
| "eval_kl": 1.478125, |
| "eval_loss": 0.05926254764199257, |
| "eval_reward": 0.6, |
| "eval_reward_std": 0.38183765530586244, |
| "eval_rewards/accuracy_reward": 0.045, |
| "eval_rewards/format_reward": 0.555, |
| "eval_runtime": 261.2361, |
| "eval_samples_per_second": 0.379, |
| "eval_steps_per_second": 0.096, |
| "step": 700 |
| }, |
| { |
| "completion_length": 1003.5125, |
| "epoch": 0.1557129250010353, |
| "grad_norm": 0.28454444589509803, |
| "kl": 1.074609375, |
| "learning_rate": 1.9811781768982392e-05, |
| "loss": 0.043, |
| "reward": 0.7375, |
| "reward_std": 0.2828427076339722, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.71875, |
| "step": 705 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.15681727198685827, |
| "grad_norm": 0.20551909954742464, |
| "kl": 648601.9237182618, |
| "learning_rate": 1.980426339716264e-05, |
| "loss": 25944.5938, |
| "reward": 0.8125, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.00625, |
| "rewards/format_reward": 0.80625, |
| "step": 710 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.15792161897268123, |
| "grad_norm": 0.1216302069380067, |
| "kl": 0.264453125, |
| "learning_rate": 1.9796599274143586e-05, |
| "loss": 0.0106, |
| "reward": 0.99375, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.0125, |
| "rewards/format_reward": 0.98125, |
| "step": 715 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.15902596595850416, |
| "grad_norm": 0.13579361297724876, |
| "kl": 0.23675537109375, |
| "learning_rate": 1.9788789513860875e-05, |
| "loss": 0.0095, |
| "reward": 1.0, |
| "reward_std": 0.03535533845424652, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.98125, |
| "step": 720 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.1601303129443271, |
| "grad_norm": 0.1484233873762049, |
| "kl": 0.26705322265625, |
| "learning_rate": 1.9780834232415214e-05, |
| "loss": 0.0107, |
| "reward": 0.975, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.95625, |
| "step": 725 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.16123465993015004, |
| "grad_norm": 0.2382323266501869, |
| "kl": 0.26546630859375, |
| "learning_rate": 1.9772733548070647e-05, |
| "loss": 0.0106, |
| "reward": 0.88125, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.85625, |
| "step": 730 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.162339006915973, |
| "grad_norm": 0.22617291442101026, |
| "kl": 0.29144287109375, |
| "learning_rate": 1.9764487581252787e-05, |
| "loss": 0.0117, |
| "reward": 0.90625, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.88125, |
| "step": 735 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.16344335390179596, |
| "grad_norm": 0.27522740788676725, |
| "kl": 0.272119140625, |
| "learning_rate": 1.975609645454704e-05, |
| "loss": 0.0109, |
| "reward": 0.95625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.91875, |
| "step": 740 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.1645477008876189, |
| "grad_norm": 0.26380621267798754, |
| "kl": 0.3930419921875, |
| "learning_rate": 1.9747560292696763e-05, |
| "loss": 0.0157, |
| "reward": 0.9625, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.93125, |
| "step": 745 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.16565204787344184, |
| "grad_norm": 0.28940243726369763, |
| "kl": 0.33485107421875, |
| "learning_rate": 1.9738879222601425e-05, |
| "loss": 0.0134, |
| "reward": 0.8875, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.85625, |
| "step": 750 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.16675639485926477, |
| "grad_norm": 0.28544947271914256, |
| "kl": 0.5123046875, |
| "learning_rate": 1.9730053373314722e-05, |
| "loss": 0.0205, |
| "reward": 0.9125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.875, |
| "step": 755 |
| }, |
| { |
| "completion_length": 1020.45, |
| "epoch": 0.16786074184508773, |
| "grad_norm": 0.25645868139823796, |
| "kl": 0.56268310546875, |
| "learning_rate": 1.9721082876042644e-05, |
| "loss": 0.0225, |
| "reward": 0.95625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.925, |
| "step": 760 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.16896508883091066, |
| "grad_norm": 0.12040072575635787, |
| "kl": 0.48553466796875, |
| "learning_rate": 1.9711967864141542e-05, |
| "loss": 0.0194, |
| "reward": 0.98125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.9375, |
| "step": 765 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.17006943581673362, |
| "grad_norm": 0.17007869061264408, |
| "kl": 0.50712890625, |
| "learning_rate": 1.970270847311612e-05, |
| "loss": 0.0203, |
| "reward": 1.05625, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.975, |
| "step": 770 |
| }, |
| { |
| "completion_length": 1020.9375, |
| "epoch": 0.17117378280255657, |
| "grad_norm": 0.17581235812523752, |
| "kl": 0.49864501953125, |
| "learning_rate": 1.9693304840617456e-05, |
| "loss": 0.0199, |
| "reward": 1.0375, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.94375, |
| "step": 775 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.1722781297883795, |
| "grad_norm": 0.18169044119511507, |
| "kl": 0.66134033203125, |
| "learning_rate": 1.968375710644093e-05, |
| "loss": 0.0265, |
| "reward": 0.96875, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.8875, |
| "step": 780 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.17338247677420246, |
| "grad_norm": 0.44893330655977454, |
| "kl": 0.538287353515625, |
| "learning_rate": 1.9674065412524147e-05, |
| "loss": 0.0215, |
| "reward": 0.75, |
| "reward_std": 0.30052037686109545, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.7, |
| "step": 785 |
| }, |
| { |
| "completion_length": 1024.0, |
| "epoch": 0.1744868237600254, |
| "grad_norm": 1.1859702994440748, |
| "kl": 0.7820556640625, |
| "learning_rate": 1.9664229902944833e-05, |
| "loss": 0.0313, |
| "reward": 0.88125, |
| "reward_std": 0.2563262037932873, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.81875, |
| "step": 790 |
| }, |
| { |
| "completion_length": 1016.3125, |
| "epoch": 0.17559117074584835, |
| "grad_norm": 0.22143586568637066, |
| "kl": 0.54296875, |
| "learning_rate": 1.9654250723918706e-05, |
| "loss": 0.0217, |
| "reward": 0.99375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.925, |
| "step": 795 |
| }, |
| { |
| "completion_length": 1018.775, |
| "epoch": 0.1766955177316713, |
| "grad_norm": 22.572401118778046, |
| "kl": 1.007623291015625, |
| "learning_rate": 1.9644128023797273e-05, |
| "loss": 0.0403, |
| "reward": 1.025, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.9875, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.1766955177316713, |
| "eval_completion_length": 999.645, |
| "eval_kl": 0.35353515625, |
| "eval_loss": 0.0141792893409729, |
| "eval_reward": 1.045, |
| "eval_reward_std": 0.13435028612613678, |
| "eval_rewards/accuracy_reward": 0.085, |
| "eval_rewards/format_reward": 0.96, |
| "eval_runtime": 263.5974, |
| "eval_samples_per_second": 0.376, |
| "eval_steps_per_second": 0.095, |
| "step": 800 |
| }, |
| { |
| "completion_length": 856.90625, |
| "epoch": 0.17779986471749423, |
| "grad_norm": 0.4670617314622781, |
| "kl": 0.29603271484375, |
| "learning_rate": 1.9633861953065648e-05, |
| "loss": 0.0118, |
| "reward": 0.9125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0125, |
| "rewards/format_reward": 0.9, |
| "step": 805 |
| }, |
| { |
| "completion_length": 235.19375, |
| "epoch": 0.1789042117033172, |
| "grad_norm": 0.8767573552781158, |
| "kl": 0.281640625, |
| "learning_rate": 1.9623452664340305e-05, |
| "loss": 0.0113, |
| "reward": 0.95625, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.88125, |
| "step": 810 |
| }, |
| { |
| "completion_length": 194.6875, |
| "epoch": 0.18000855868914012, |
| "grad_norm": 0.8245609924387703, |
| "kl": 0.235968017578125, |
| "learning_rate": 1.9612900312366815e-05, |
| "loss": 0.0094, |
| "reward": 0.98125, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.89375, |
| "step": 815 |
| }, |
| { |
| "completion_length": 177.96875, |
| "epoch": 0.18111290567496308, |
| "grad_norm": 0.5059362794610306, |
| "kl": 0.2334716796875, |
| "learning_rate": 1.9602205054017534e-05, |
| "loss": 0.0093, |
| "reward": 1.04375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9625, |
| "step": 820 |
| }, |
| { |
| "completion_length": 217.3625, |
| "epoch": 0.182217252660786, |
| "grad_norm": 0.4835214879479257, |
| "kl": 0.26324462890625, |
| "learning_rate": 1.9591367048289297e-05, |
| "loss": 0.0105, |
| "reward": 1.01875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.975, |
| "step": 825 |
| }, |
| { |
| "completion_length": 221.90625, |
| "epoch": 0.18332159964660896, |
| "grad_norm": 0.6626328778695273, |
| "kl": 0.28028564453125, |
| "learning_rate": 1.9580386456301014e-05, |
| "loss": 0.0112, |
| "reward": 1.05, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.975, |
| "step": 830 |
| }, |
| { |
| "completion_length": 205.31875, |
| "epoch": 0.18442594663243192, |
| "grad_norm": 0.26541083822971634, |
| "kl": 0.27823486328125, |
| "learning_rate": 1.9569263441291312e-05, |
| "loss": 0.0111, |
| "reward": 1.00625, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.98125, |
| "step": 835 |
| }, |
| { |
| "completion_length": 206.03125, |
| "epoch": 0.18553029361825485, |
| "grad_norm": 0.12829507495547837, |
| "kl": 0.29173583984375, |
| "learning_rate": 1.9557998168616087e-05, |
| "loss": 0.0117, |
| "reward": 1.025, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.98125, |
| "step": 840 |
| }, |
| { |
| "completion_length": 197.2375, |
| "epoch": 0.1866346406040778, |
| "grad_norm": 0.632523566350679, |
| "kl": 0.3269287109375, |
| "learning_rate": 1.9546590805746054e-05, |
| "loss": 0.0131, |
| "reward": 1.0125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.96875, |
| "step": 845 |
| }, |
| { |
| "completion_length": 201.3125, |
| "epoch": 0.18773898758990074, |
| "grad_norm": 0.4218691032301411, |
| "kl": 0.254644775390625, |
| "learning_rate": 1.9535041522264256e-05, |
| "loss": 0.0102, |
| "reward": 1.03125, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.9875, |
| "step": 850 |
| }, |
| { |
| "completion_length": 215.075, |
| "epoch": 0.1888433345757237, |
| "grad_norm": 0.5263796297221296, |
| "kl": 0.24737548828125, |
| "learning_rate": 1.9523350489863545e-05, |
| "loss": 0.0099, |
| "reward": 1.01875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.98125, |
| "step": 855 |
| }, |
| { |
| "completion_length": 293.66875, |
| "epoch": 0.18994768156154665, |
| "grad_norm": 0.5106483317493103, |
| "kl": 0.24151611328125, |
| "learning_rate": 1.951151788234402e-05, |
| "loss": 0.0097, |
| "reward": 1.03125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.96875, |
| "step": 860 |
| }, |
| { |
| "completion_length": 341.4625, |
| "epoch": 0.19105202854736958, |
| "grad_norm": 0.3446610557866059, |
| "kl": 0.21561279296875, |
| "learning_rate": 1.949954387561046e-05, |
| "loss": 0.0086, |
| "reward": 1.05625, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 1.0, |
| "step": 865 |
| }, |
| { |
| "completion_length": 290.48125, |
| "epoch": 0.19215637553319254, |
| "grad_norm": 0.5188322122434208, |
| "kl": 0.22645263671875, |
| "learning_rate": 1.9487428647669688e-05, |
| "loss": 0.0091, |
| "reward": 1.05625, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9875, |
| "step": 870 |
| }, |
| { |
| "completion_length": 265.8, |
| "epoch": 0.19326072251901547, |
| "grad_norm": 0.36026950031545973, |
| "kl": 0.2487060546875, |
| "learning_rate": 1.947517237862795e-05, |
| "loss": 0.0099, |
| "reward": 1.05625, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 1.0, |
| "step": 875 |
| }, |
| { |
| "completion_length": 229.44375, |
| "epoch": 0.19436506950483842, |
| "grad_norm": 0.27419984769066713, |
| "kl": 0.247833251953125, |
| "learning_rate": 1.9462775250688208e-05, |
| "loss": 0.0099, |
| "reward": 1.05, |
| "reward_std": 0.03535533845424652, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 1.0, |
| "step": 880 |
| }, |
| { |
| "completion_length": 242.05625, |
| "epoch": 0.19546941649066138, |
| "grad_norm": 0.22671689499182268, |
| "kl": 0.2520751953125, |
| "learning_rate": 1.9450237448147463e-05, |
| "loss": 0.0101, |
| "reward": 1.0625, |
| "reward_std": 0.03535533845424652, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9875, |
| "step": 885 |
| }, |
| { |
| "completion_length": 250.46875, |
| "epoch": 0.1965737634764843, |
| "grad_norm": 0.6022152556248251, |
| "kl": 0.26463623046875, |
| "learning_rate": 1.943755915739399e-05, |
| "loss": 0.0106, |
| "reward": 1.03125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.95625, |
| "step": 890 |
| }, |
| { |
| "completion_length": 202.0625, |
| "epoch": 0.19767811046230727, |
| "grad_norm": 0.3986065730178507, |
| "kl": 0.2731201171875, |
| "learning_rate": 1.9424740566904572e-05, |
| "loss": 0.0109, |
| "reward": 1.01875, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.99375, |
| "step": 895 |
| }, |
| { |
| "completion_length": 199.31875, |
| "epoch": 0.1987824574481302, |
| "grad_norm": 0.3484175794006397, |
| "kl": 0.2666259765625, |
| "learning_rate": 1.9411781867241718e-05, |
| "loss": 0.0107, |
| "reward": 1.03125, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.9875, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.1987824574481302, |
| "eval_completion_length": 219.105, |
| "eval_kl": 0.2623046875, |
| "eval_loss": 0.010502400808036327, |
| "eval_reward": 1.085, |
| "eval_reward_std": 0.13435028612613678, |
| "eval_rewards/accuracy_reward": 0.1, |
| "eval_rewards/format_reward": 0.985, |
| "eval_runtime": 94.3452, |
| "eval_samples_per_second": 1.049, |
| "eval_steps_per_second": 0.265, |
| "step": 900 |
| }, |
| { |
| "completion_length": 249.375, |
| "epoch": 0.19988680443395315, |
| "grad_norm": 0.11204663969136651, |
| "kl": 0.28580322265625, |
| "learning_rate": 1.9398683251050796e-05, |
| "loss": 0.0114, |
| "reward": 1.04375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.9875, |
| "step": 905 |
| }, |
| { |
| "completion_length": 258.64375, |
| "epoch": 0.20099115141977608, |
| "grad_norm": 0.38371933952741333, |
| "kl": 0.28076171875, |
| "learning_rate": 1.93854449130572e-05, |
| "loss": 0.0112, |
| "reward": 1.0625, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 1.0, |
| "step": 910 |
| }, |
| { |
| "completion_length": 246.2125, |
| "epoch": 0.20209549840559904, |
| "grad_norm": 0.596227033187015, |
| "kl": 0.26915283203125, |
| "learning_rate": 1.937206705006344e-05, |
| "loss": 0.0108, |
| "reward": 1.04375, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.9875, |
| "step": 915 |
| }, |
| { |
| "completion_length": 229.68125, |
| "epoch": 0.203199845391422, |
| "grad_norm": 1.0512533587389215, |
| "kl": 0.27860107421875, |
| "learning_rate": 1.9358549860946217e-05, |
| "loss": 0.0111, |
| "reward": 0.9875, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.925, |
| "step": 920 |
| }, |
| { |
| "completion_length": 193.76875, |
| "epoch": 0.20430419237724493, |
| "grad_norm": 0.3503850314920464, |
| "kl": 0.26494140625, |
| "learning_rate": 1.934489354665347e-05, |
| "loss": 0.0106, |
| "reward": 0.8375, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.79375, |
| "step": 925 |
| }, |
| { |
| "completion_length": 307.98125, |
| "epoch": 0.20540853936306788, |
| "grad_norm": 0.49849280178686406, |
| "kl": 0.31822509765625, |
| "learning_rate": 1.9331098310201392e-05, |
| "loss": 0.0127, |
| "reward": 0.99375, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.88125, |
| "step": 930 |
| }, |
| { |
| "completion_length": 149.59375, |
| "epoch": 0.2065128863488908, |
| "grad_norm": 0.7917539737118271, |
| "kl": 0.3322509765625, |
| "learning_rate": 1.9317164356671395e-05, |
| "loss": 0.0133, |
| "reward": 1.0375, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9625, |
| "step": 935 |
| }, |
| { |
| "completion_length": 168.63125, |
| "epoch": 0.20761723333471377, |
| "grad_norm": 0.6201527306472769, |
| "kl": 0.41641845703125, |
| "learning_rate": 1.930309189320709e-05, |
| "loss": 0.0167, |
| "reward": 1.06875, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.9625, |
| "step": 940 |
| }, |
| { |
| "completion_length": 200.69375, |
| "epoch": 0.20872158032053673, |
| "grad_norm": 0.48233327830154826, |
| "kl": 0.333837890625, |
| "learning_rate": 1.9288881129011177e-05, |
| "loss": 0.0134, |
| "reward": 1.1, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.975, |
| "step": 945 |
| }, |
| { |
| "completion_length": 202.0, |
| "epoch": 0.20982592730635966, |
| "grad_norm": 0.8281740363940345, |
| "kl": 0.35965576171875, |
| "learning_rate": 1.9274532275342355e-05, |
| "loss": 0.0144, |
| "reward": 1.075, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.96875, |
| "step": 950 |
| }, |
| { |
| "completion_length": 237.83125, |
| "epoch": 0.2109302742921826, |
| "grad_norm": 0.28772905337093585, |
| "kl": 0.32879638671875, |
| "learning_rate": 1.9260045545512174e-05, |
| "loss": 0.0131, |
| "reward": 1.05625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9875, |
| "step": 955 |
| }, |
| { |
| "completion_length": 260.9375, |
| "epoch": 0.21203462127800554, |
| "grad_norm": 0.5484628404079348, |
| "kl": 0.340283203125, |
| "learning_rate": 1.9245421154881873e-05, |
| "loss": 0.0136, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.975, |
| "step": 960 |
| }, |
| { |
| "completion_length": 325.5625, |
| "epoch": 0.2131389682638285, |
| "grad_norm": 0.37202208226132655, |
| "kl": 0.40030517578125, |
| "learning_rate": 1.9230659320859157e-05, |
| "loss": 0.016, |
| "reward": 1.01875, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.9875, |
| "step": 965 |
| }, |
| { |
| "completion_length": 249.5125, |
| "epoch": 0.21424331524965143, |
| "grad_norm": 0.37717614370685104, |
| "kl": 0.35120849609375, |
| "learning_rate": 1.9215760262894982e-05, |
| "loss": 0.014, |
| "reward": 1.00625, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.975, |
| "step": 970 |
| }, |
| { |
| "completion_length": 230.45625, |
| "epoch": 0.21534766223547439, |
| "grad_norm": 0.5086005905523352, |
| "kl": 0.38538818359375, |
| "learning_rate": 1.9200724202480305e-05, |
| "loss": 0.0154, |
| "reward": 1.025, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.96875, |
| "step": 975 |
| }, |
| { |
| "completion_length": 203.0125, |
| "epoch": 0.21645200922129734, |
| "grad_norm": 0.3935754886467958, |
| "kl": 0.32562255859375, |
| "learning_rate": 1.9185551363142754e-05, |
| "loss": 0.013, |
| "reward": 1.0, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.96875, |
| "step": 980 |
| }, |
| { |
| "completion_length": 177.90625, |
| "epoch": 0.21755635620712027, |
| "grad_norm": 0.4203555792398308, |
| "kl": 0.29302978515625, |
| "learning_rate": 1.9170241970443344e-05, |
| "loss": 0.0117, |
| "reward": 1.01875, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.99375, |
| "step": 985 |
| }, |
| { |
| "completion_length": 149.94375, |
| "epoch": 0.21866070319294323, |
| "grad_norm": 0.26776293393774087, |
| "kl": 0.3010009765625, |
| "learning_rate": 1.9154796251973092e-05, |
| "loss": 0.012, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.98125, |
| "step": 990 |
| }, |
| { |
| "completion_length": 171.49375, |
| "epoch": 0.21976505017876616, |
| "grad_norm": 0.5930942674995531, |
| "kl": 0.30377197265625, |
| "learning_rate": 1.9139214437349663e-05, |
| "loss": 0.0121, |
| "reward": 1.025, |
| "reward_std": 0.03535533845424652, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.99375, |
| "step": 995 |
| }, |
| { |
| "completion_length": 182.9125, |
| "epoch": 0.22086939716458912, |
| "grad_norm": 0.5327664873628459, |
| "kl": 0.2912841796875, |
| "learning_rate": 1.9123496758213926e-05, |
| "loss": 0.0117, |
| "reward": 1.06875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9875, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.22086939716458912, |
| "eval_completion_length": 214.23, |
| "eval_kl": 0.269765625, |
| "eval_loss": 0.01077475119382143, |
| "eval_reward": 1.075, |
| "eval_reward_std": 0.1767766922712326, |
| "eval_rewards/accuracy_reward": 0.105, |
| "eval_rewards/format_reward": 0.97, |
| "eval_runtime": 90.4327, |
| "eval_samples_per_second": 1.095, |
| "eval_steps_per_second": 0.276, |
| "step": 1000 |
| }, |
| { |
| "completion_length": 192.26875, |
| "epoch": 0.22197374415041207, |
| "grad_norm": 0.607671116808556, |
| "kl": 0.29041748046875, |
| "learning_rate": 1.9107643448226536e-05, |
| "loss": 0.0116, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.975, |
| "step": 1005 |
| }, |
| { |
| "completion_length": 233.1875, |
| "epoch": 0.223078091136235, |
| "grad_norm": 0.42966284071874145, |
| "kl": 4.75654296875, |
| "learning_rate": 1.909165474306445e-05, |
| "loss": 0.1909, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.975, |
| "step": 1010 |
| }, |
| { |
| "completion_length": 451.7875, |
| "epoch": 0.22418243812205796, |
| "grad_norm": 0.2531114981656375, |
| "kl": 0.275, |
| "learning_rate": 1.9075530880417422e-05, |
| "loss": 0.011, |
| "reward": 0.925, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.9, |
| "step": 1015 |
| }, |
| { |
| "completion_length": 535.35625, |
| "epoch": 0.2252867851078809, |
| "grad_norm": 0.4277427642056076, |
| "kl": 0.28232421875, |
| "learning_rate": 1.905927209998447e-05, |
| "loss": 0.0113, |
| "reward": 0.89375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.83125, |
| "step": 1020 |
| }, |
| { |
| "completion_length": 319.4625, |
| "epoch": 0.22639113209370385, |
| "grad_norm": 0.44601881214583666, |
| "kl": 0.30135498046875, |
| "learning_rate": 1.9042878643470313e-05, |
| "loss": 0.0121, |
| "reward": 1.00625, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.98125, |
| "step": 1025 |
| }, |
| { |
| "completion_length": 219.5, |
| "epoch": 0.22749547907952677, |
| "grad_norm": 0.42697703684794175, |
| "kl": 0.3317626953125, |
| "learning_rate": 1.9026350754581782e-05, |
| "loss": 0.0133, |
| "reward": 1.0125, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.95, |
| "step": 1030 |
| }, |
| { |
| "completion_length": 203.14375, |
| "epoch": 0.22859982606534973, |
| "grad_norm": 0.2565664818941694, |
| "kl": 0.315625, |
| "learning_rate": 1.900968867902419e-05, |
| "loss": 0.0126, |
| "reward": 0.93125, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.88125, |
| "step": 1035 |
| }, |
| { |
| "completion_length": 202.94375, |
| "epoch": 0.2297041730511727, |
| "grad_norm": 0.42814411154309545, |
| "kl": 0.31634521484375, |
| "learning_rate": 1.8992892664497693e-05, |
| "loss": 0.0127, |
| "reward": 1.06875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.99375, |
| "step": 1040 |
| }, |
| { |
| "completion_length": 205.91875, |
| "epoch": 0.23080852003699562, |
| "grad_norm": 0.43764023473742697, |
| "kl": 0.2983154296875, |
| "learning_rate": 1.897596296069358e-05, |
| "loss": 0.0119, |
| "reward": 1.05, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9875, |
| "step": 1045 |
| }, |
| { |
| "completion_length": 196.5375, |
| "epoch": 0.23191286702281858, |
| "grad_norm": 0.5131717281451017, |
| "kl": 0.27991943359375, |
| "learning_rate": 1.8958899819290592e-05, |
| "loss": 0.0112, |
| "reward": 1.025, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.98125, |
| "step": 1050 |
| }, |
| { |
| "completion_length": 205.26875, |
| "epoch": 0.2330172140086415, |
| "grad_norm": 0.32785012231096045, |
| "kl": 0.290869140625, |
| "learning_rate": 1.8941703493951163e-05, |
| "loss": 0.0116, |
| "reward": 1.075, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.99375, |
| "step": 1055 |
| }, |
| { |
| "completion_length": 231.63125, |
| "epoch": 0.23412156099446446, |
| "grad_norm": 0.5254694645789388, |
| "kl": 0.28807373046875, |
| "learning_rate": 1.892437424031766e-05, |
| "loss": 0.0115, |
| "reward": 1.05625, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.99375, |
| "step": 1060 |
| }, |
| { |
| "completion_length": 242.0375, |
| "epoch": 0.23522590798028742, |
| "grad_norm": 0.5026068127158217, |
| "kl": 0.29200439453125, |
| "learning_rate": 1.890691231600856e-05, |
| "loss": 0.0117, |
| "reward": 1.06875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.9625, |
| "step": 1065 |
| }, |
| { |
| "completion_length": 246.08125, |
| "epoch": 0.23633025496611035, |
| "grad_norm": 0.13203150585306936, |
| "kl": 0.322235107421875, |
| "learning_rate": 1.8889317980614653e-05, |
| "loss": 0.0129, |
| "reward": 1.075, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.96875, |
| "step": 1070 |
| }, |
| { |
| "completion_length": 224.4125, |
| "epoch": 0.2374346019519333, |
| "grad_norm": 0.41058165320035295, |
| "kl": 0.354974365234375, |
| "learning_rate": 1.8871591495695156e-05, |
| "loss": 0.0142, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.975, |
| "step": 1075 |
| }, |
| { |
| "completion_length": 202.43125, |
| "epoch": 0.23853894893775623, |
| "grad_norm": 0.5981781477284894, |
| "kl": 0.3003173828125, |
| "learning_rate": 1.8853733124773837e-05, |
| "loss": 0.012, |
| "reward": 1.025, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.98125, |
| "step": 1080 |
| }, |
| { |
| "completion_length": 170.225, |
| "epoch": 0.2396432959235792, |
| "grad_norm": 0.4268507688185011, |
| "kl": 0.301025390625, |
| "learning_rate": 1.8835743133335096e-05, |
| "loss": 0.012, |
| "reward": 1.0625, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9875, |
| "step": 1085 |
| }, |
| { |
| "completion_length": 175.80625, |
| "epoch": 0.24074764290940212, |
| "grad_norm": 0.5747423342289615, |
| "kl": 0.3157562255859375, |
| "learning_rate": 1.8817621788820017e-05, |
| "loss": 0.0126, |
| "reward": 1.0125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9625, |
| "step": 1090 |
| }, |
| { |
| "completion_length": 167.35, |
| "epoch": 0.24185198989522508, |
| "grad_norm": 0.519815561291836, |
| "kl": 0.32388916015625, |
| "learning_rate": 1.8799369360622394e-05, |
| "loss": 0.013, |
| "reward": 1.08125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.99375, |
| "step": 1095 |
| }, |
| { |
| "completion_length": 218.21875, |
| "epoch": 0.24295633688104804, |
| "grad_norm": 0.5333785220488585, |
| "kl": 0.36580810546875, |
| "learning_rate": 1.8780986120084715e-05, |
| "loss": 0.0146, |
| "reward": 1.05, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.975, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.24295633688104804, |
| "eval_completion_length": 262.785, |
| "eval_kl": 0.3383984375, |
| "eval_loss": 0.013547366484999657, |
| "eval_reward": 1.03, |
| "eval_reward_std": 0.15556348919868468, |
| "eval_rewards/accuracy_reward": 0.075, |
| "eval_rewards/format_reward": 0.955, |
| "eval_runtime": 118.9008, |
| "eval_samples_per_second": 0.833, |
| "eval_steps_per_second": 0.21, |
| "step": 1100 |
| }, |
| { |
| "completion_length": 226.5375, |
| "epoch": 0.24406068386687096, |
| "grad_norm": 0.6802013609013249, |
| "kl": 0.3069580078125, |
| "learning_rate": 1.876247234049416e-05, |
| "loss": 0.0123, |
| "reward": 1.05, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.975, |
| "step": 1105 |
| }, |
| { |
| "completion_length": 264.125, |
| "epoch": 0.24516503085269392, |
| "grad_norm": 0.9555057313081333, |
| "kl": 0.42034912109375, |
| "learning_rate": 1.8743828297078485e-05, |
| "loss": 0.0168, |
| "reward": 0.95625, |
| "reward_std": 0.23864853456616403, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.8875, |
| "step": 1110 |
| }, |
| { |
| "completion_length": 204.44375, |
| "epoch": 0.24626937783851685, |
| "grad_norm": 0.7866089087734848, |
| "kl": 0.45830078125, |
| "learning_rate": 1.8725054267001992e-05, |
| "loss": 0.0183, |
| "reward": 0.7625, |
| "reward_std": 0.30052037686109545, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.725, |
| "step": 1115 |
| }, |
| { |
| "completion_length": 170.48125, |
| "epoch": 0.2473737248243398, |
| "grad_norm": 0.4917046420126656, |
| "kl": 0.535284423828125, |
| "learning_rate": 1.8706150529361355e-05, |
| "loss": 0.0214, |
| "reward": 0.825, |
| "reward_std": 0.30052037686109545, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.74375, |
| "step": 1120 |
| }, |
| { |
| "completion_length": 170.68125, |
| "epoch": 0.24847807181016277, |
| "grad_norm": 0.4598219209937343, |
| "kl": 0.50391845703125, |
| "learning_rate": 1.8687117365181514e-05, |
| "loss": 0.0202, |
| "reward": 0.90625, |
| "reward_std": 0.23864853456616403, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.8625, |
| "step": 1125 |
| }, |
| { |
| "completion_length": 226.35, |
| "epoch": 0.2495824187959857, |
| "grad_norm": 0.729450452143476, |
| "kl": 0.4862548828125, |
| "learning_rate": 1.8667955057411454e-05, |
| "loss": 0.0195, |
| "reward": 0.9375, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.85625, |
| "step": 1130 |
| }, |
| { |
| "completion_length": 199.03125, |
| "epoch": 0.25068676578180865, |
| "grad_norm": 0.9720760882388615, |
| "kl": 0.467034912109375, |
| "learning_rate": 1.864866389092005e-05, |
| "loss": 0.0187, |
| "reward": 1.0, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.9125, |
| "step": 1135 |
| }, |
| { |
| "completion_length": 213.5875, |
| "epoch": 0.2517911127676316, |
| "grad_norm": 0.6415965551575933, |
| "kl": 0.8406494140625, |
| "learning_rate": 1.8629244152491773e-05, |
| "loss": 0.0336, |
| "reward": 0.96875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.9125, |
| "step": 1140 |
| }, |
| { |
| "completion_length": 264.75, |
| "epoch": 0.2528954597534545, |
| "grad_norm": 3.4377172313710402, |
| "kl": 1.02249755859375, |
| "learning_rate": 1.860969613082249e-05, |
| "loss": 0.0409, |
| "reward": 0.95, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9, |
| "step": 1145 |
| }, |
| { |
| "completion_length": 262.26875, |
| "epoch": 0.25399980673927747, |
| "grad_norm": 0.5339383124761619, |
| "kl": 1.24478759765625, |
| "learning_rate": 1.8590020116515116e-05, |
| "loss": 0.0496, |
| "reward": 1.01875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.94375, |
| "step": 1150 |
| }, |
| { |
| "completion_length": 271.375, |
| "epoch": 0.2551041537251004, |
| "grad_norm": 1.6266177185437845, |
| "kl": 0.5312255859375, |
| "learning_rate": 1.8570216402075326e-05, |
| "loss": 0.0213, |
| "reward": 1.0125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.925, |
| "step": 1155 |
| }, |
| { |
| "completion_length": 274.64375, |
| "epoch": 0.2562085007109234, |
| "grad_norm": 1.0477658800658394, |
| "kl": 0.89627685546875, |
| "learning_rate": 1.8550285281907198e-05, |
| "loss": 0.0358, |
| "reward": 1.0, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.91875, |
| "step": 1160 |
| }, |
| { |
| "completion_length": 262.7, |
| "epoch": 0.25731284769674634, |
| "grad_norm": 0.4264557263560298, |
| "kl": 0.498876953125, |
| "learning_rate": 1.8530227052308843e-05, |
| "loss": 0.0199, |
| "reward": 1.0125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9375, |
| "step": 1165 |
| }, |
| { |
| "completion_length": 199.46875, |
| "epoch": 0.25841719468256924, |
| "grad_norm": 0.6299395922753566, |
| "kl": 0.44912109375, |
| "learning_rate": 1.8510042011467978e-05, |
| "loss": 0.018, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.96875, |
| "step": 1170 |
| }, |
| { |
| "completion_length": 222.3625, |
| "epoch": 0.2595215416683922, |
| "grad_norm": 0.529042632954111, |
| "kl": 0.43572998046875, |
| "learning_rate": 1.848973045945753e-05, |
| "loss": 0.0174, |
| "reward": 1.00625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.94375, |
| "step": 1175 |
| }, |
| { |
| "completion_length": 209.43125, |
| "epoch": 0.26062588865421515, |
| "grad_norm": 0.7827952040899112, |
| "kl": 0.83447265625, |
| "learning_rate": 1.8469292698231137e-05, |
| "loss": 0.0335, |
| "reward": 1.05, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.95625, |
| "step": 1180 |
| }, |
| { |
| "completion_length": 153.6875, |
| "epoch": 0.2617302356400381, |
| "grad_norm": 0.42454488448610755, |
| "kl": 0.35516357421875, |
| "learning_rate": 1.8448729031618687e-05, |
| "loss": 0.0142, |
| "reward": 1.04375, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.96875, |
| "step": 1185 |
| }, |
| { |
| "completion_length": 174.7125, |
| "epoch": 0.26283458262586107, |
| "grad_norm": 0.37149930187993413, |
| "kl": 0.365673828125, |
| "learning_rate": 1.8428039765321783e-05, |
| "loss": 0.0146, |
| "reward": 1.0375, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.96875, |
| "step": 1190 |
| }, |
| { |
| "completion_length": 265.21875, |
| "epoch": 0.26393892961168397, |
| "grad_norm": 0.66806143396866, |
| "kl": 0.3682373046875, |
| "learning_rate": 1.840722520690921e-05, |
| "loss": 0.0147, |
| "reward": 0.96875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.90625, |
| "step": 1195 |
| }, |
| { |
| "completion_length": 270.75, |
| "epoch": 0.2650432765975069, |
| "grad_norm": 0.62491488989135, |
| "kl": 0.37822265625, |
| "learning_rate": 1.838628566581236e-05, |
| "loss": 0.0151, |
| "reward": 0.91875, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.8375, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.2650432765975069, |
| "eval_completion_length": 337.06, |
| "eval_kl": 0.4419140625, |
| "eval_loss": 0.0176791213452816, |
| "eval_reward": 0.905, |
| "eval_reward_std": 0.3181980448961258, |
| "eval_rewards/accuracy_reward": 0.085, |
| "eval_rewards/format_reward": 0.82, |
| "eval_runtime": 175.1338, |
| "eval_samples_per_second": 0.565, |
| "eval_steps_per_second": 0.143, |
| "step": 1200 |
| }, |
| { |
| "completion_length": 257.49375, |
| "epoch": 0.2661476235833299, |
| "grad_norm": 0.83147670526204, |
| "kl": 0.35118408203125, |
| "learning_rate": 1.8365221453320625e-05, |
| "loss": 0.014, |
| "reward": 0.9125, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.84375, |
| "step": 1205 |
| }, |
| { |
| "completion_length": 223.31875, |
| "epoch": 0.26725197056915284, |
| "grad_norm": 0.24158167072477596, |
| "kl": 0.3875, |
| "learning_rate": 1.8344032882576784e-05, |
| "loss": 0.0155, |
| "reward": 0.95625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.90625, |
| "step": 1210 |
| }, |
| { |
| "completion_length": 163.0, |
| "epoch": 0.2683563175549758, |
| "grad_norm": 0.6479529739729771, |
| "kl": 0.405072021484375, |
| "learning_rate": 1.8322720268572333e-05, |
| "loss": 0.0162, |
| "reward": 0.99375, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.9375, |
| "step": 1215 |
| }, |
| { |
| "completion_length": 292.16875, |
| "epoch": 0.2694606645407987, |
| "grad_norm": 1.7491765816716196, |
| "kl": 0.540771484375, |
| "learning_rate": 1.83012839281428e-05, |
| "loss": 0.0216, |
| "reward": 0.95, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.8875, |
| "step": 1220 |
| }, |
| { |
| "completion_length": 358.31875, |
| "epoch": 0.27056501152662166, |
| "grad_norm": 0.5769509244571842, |
| "kl": 0.5750244140625, |
| "learning_rate": 1.827972417996306e-05, |
| "loss": 0.023, |
| "reward": 0.8375, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.7875, |
| "step": 1225 |
| }, |
| { |
| "completion_length": 263.51875, |
| "epoch": 0.2716693585124446, |
| "grad_norm": 0.47084507783132956, |
| "kl": 0.51199951171875, |
| "learning_rate": 1.8258041344542567e-05, |
| "loss": 0.0205, |
| "reward": 0.89375, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.875, |
| "step": 1230 |
| }, |
| { |
| "completion_length": 145.9875, |
| "epoch": 0.27277370549826757, |
| "grad_norm": 0.6657898991831224, |
| "kl": 0.52864990234375, |
| "learning_rate": 1.823623574422061e-05, |
| "loss": 0.0212, |
| "reward": 1.025, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9625, |
| "step": 1235 |
| }, |
| { |
| "completion_length": 151.34375, |
| "epoch": 0.27387805248409053, |
| "grad_norm": 0.3173979928590889, |
| "kl": 0.4265869140625, |
| "learning_rate": 1.821430770316151e-05, |
| "loss": 0.0171, |
| "reward": 1.0125, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.96875, |
| "step": 1240 |
| }, |
| { |
| "completion_length": 583.2, |
| "epoch": 0.27498239946991343, |
| "grad_norm": 0.3100947155151386, |
| "kl": 0.40513916015625, |
| "learning_rate": 1.8192257547349805e-05, |
| "loss": 0.0162, |
| "reward": 0.65625, |
| "reward_std": 0.2916815422475338, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.625, |
| "step": 1245 |
| }, |
| { |
| "completion_length": 315.43125, |
| "epoch": 0.2760867464557364, |
| "grad_norm": 0.1782330419853091, |
| "kl": 0.51036376953125, |
| "learning_rate": 1.817008560458541e-05, |
| "loss": 0.0204, |
| "reward": 0.84375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.8125, |
| "step": 1250 |
| }, |
| { |
| "completion_length": 87.08125, |
| "epoch": 0.27719109344155934, |
| "grad_norm": 0.2736695226581386, |
| "kl": 0.59683837890625, |
| "learning_rate": 1.814779220447872e-05, |
| "loss": 0.0239, |
| "reward": 0.9625, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.9375, |
| "step": 1255 |
| }, |
| { |
| "completion_length": 124.04375, |
| "epoch": 0.2782954404273823, |
| "grad_norm": 0.4315634020248374, |
| "kl": 0.48404541015625, |
| "learning_rate": 1.8125377678445755e-05, |
| "loss": 0.0194, |
| "reward": 1.01875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9375, |
| "step": 1260 |
| }, |
| { |
| "completion_length": 242.1875, |
| "epoch": 0.2793997874132052, |
| "grad_norm": 0.6754023429831525, |
| "kl": 0.5489013671875, |
| "learning_rate": 1.8102842359703177e-05, |
| "loss": 0.022, |
| "reward": 0.9375, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.8875, |
| "step": 1265 |
| }, |
| { |
| "completion_length": 154.88125, |
| "epoch": 0.28050413439902816, |
| "grad_norm": 0.7500021389013647, |
| "kl": 0.44857177734375, |
| "learning_rate": 1.8080186583263386e-05, |
| "loss": 0.018, |
| "reward": 0.9875, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9375, |
| "step": 1270 |
| }, |
| { |
| "completion_length": 138.74375, |
| "epoch": 0.2816084813848511, |
| "grad_norm": 0.6511896035985808, |
| "kl": 0.660400390625, |
| "learning_rate": 1.8057410685929505e-05, |
| "loss": 0.0264, |
| "reward": 1.01875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.96875, |
| "step": 1275 |
| }, |
| { |
| "completion_length": 278.21875, |
| "epoch": 0.2827128283706741, |
| "grad_norm": 0.37768724134381376, |
| "kl": 0.4253173828125, |
| "learning_rate": 1.8034515006290398e-05, |
| "loss": 0.017, |
| "reward": 0.9625, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.9375, |
| "step": 1280 |
| }, |
| { |
| "completion_length": 306.81875, |
| "epoch": 0.28381717535649703, |
| "grad_norm": 0.29452808394209296, |
| "kl": 0.38458251953125, |
| "learning_rate": 1.8011499884715616e-05, |
| "loss": 0.0154, |
| "reward": 1.0375, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.95625, |
| "step": 1285 |
| }, |
| { |
| "completion_length": 331.20625, |
| "epoch": 0.28492152234231993, |
| "grad_norm": 0.40009340612327654, |
| "kl": 0.40555419921875, |
| "learning_rate": 1.7988365663350352e-05, |
| "loss": 0.0162, |
| "reward": 1.00625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9375, |
| "step": 1290 |
| }, |
| { |
| "completion_length": 291.39375, |
| "epoch": 0.2860258693281429, |
| "grad_norm": 0.2398401379002744, |
| "kl": 0.3625, |
| "learning_rate": 1.7965112686110346e-05, |
| "loss": 0.0145, |
| "reward": 0.96875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.93125, |
| "step": 1295 |
| }, |
| { |
| "completion_length": 210.84375, |
| "epoch": 0.28713021631396585, |
| "grad_norm": 0.16393982679750796, |
| "kl": 0.36951904296875, |
| "learning_rate": 1.7941741298676777e-05, |
| "loss": 0.0148, |
| "reward": 1.0125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.96875, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.28713021631396585, |
| "eval_completion_length": 168.92, |
| "eval_kl": 0.39515625, |
| "eval_loss": 0.01582903414964676, |
| "eval_reward": 1.05, |
| "eval_reward_std": 0.11313708305358887, |
| "eval_rewards/accuracy_reward": 0.07, |
| "eval_rewards/format_reward": 0.98, |
| "eval_runtime": 82.4657, |
| "eval_samples_per_second": 1.2, |
| "eval_steps_per_second": 0.303, |
| "step": 1300 |
| }, |
| { |
| "completion_length": 157.2875, |
| "epoch": 0.2882345632997888, |
| "grad_norm": 0.5544518957919585, |
| "kl": 0.35565185546875, |
| "learning_rate": 1.7918251848491118e-05, |
| "loss": 0.0142, |
| "reward": 1.00625, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.9875, |
| "step": 1305 |
| }, |
| { |
| "completion_length": 143.9625, |
| "epoch": 0.28933891028561176, |
| "grad_norm": 1.0694156042827718, |
| "kl": 0.36668701171875, |
| "learning_rate": 1.7894644684749983e-05, |
| "loss": 0.0147, |
| "reward": 1.05, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.99375, |
| "step": 1310 |
| }, |
| { |
| "completion_length": 129.1875, |
| "epoch": 0.29044325727143466, |
| "grad_norm": 0.7797356796661203, |
| "kl": 0.363818359375, |
| "learning_rate": 1.7870920158399918e-05, |
| "loss": 0.0146, |
| "reward": 1.075, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.96875, |
| "step": 1315 |
| }, |
| { |
| "completion_length": 107.58125, |
| "epoch": 0.2915476042572576, |
| "grad_norm": 0.42231572420920943, |
| "kl": 0.415740966796875, |
| "learning_rate": 1.7847078622132202e-05, |
| "loss": 0.0166, |
| "reward": 1.0, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.95, |
| "step": 1320 |
| }, |
| { |
| "completion_length": 137.425, |
| "epoch": 0.2926519512430806, |
| "grad_norm": 0.31138070158067094, |
| "kl": 0.4673095703125, |
| "learning_rate": 1.7823120430377593e-05, |
| "loss": 0.0187, |
| "reward": 1.0, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.975, |
| "step": 1325 |
| }, |
| { |
| "completion_length": 169.975, |
| "epoch": 0.29375629822890353, |
| "grad_norm": 0.7054962096277919, |
| "kl": 0.4247802734375, |
| "learning_rate": 1.7799045939301063e-05, |
| "loss": 0.017, |
| "reward": 1.04375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.98125, |
| "step": 1330 |
| }, |
| { |
| "completion_length": 199.325, |
| "epoch": 0.2948606452147265, |
| "grad_norm": 0.5371054293908584, |
| "kl": 0.4145263671875, |
| "learning_rate": 1.7774855506796497e-05, |
| "loss": 0.0166, |
| "reward": 1.06875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.98125, |
| "step": 1335 |
| }, |
| { |
| "completion_length": 285.3625, |
| "epoch": 0.2959649922005494, |
| "grad_norm": 0.49107133614596143, |
| "kl": 0.38699951171875, |
| "learning_rate": 1.775054949248138e-05, |
| "loss": 0.0155, |
| "reward": 1.0625, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95, |
| "step": 1340 |
| }, |
| { |
| "completion_length": 195.01875, |
| "epoch": 0.29706933918637235, |
| "grad_norm": 0.18854873920164167, |
| "kl": 0.401611328125, |
| "learning_rate": 1.7726128257691447e-05, |
| "loss": 0.0161, |
| "reward": 1.08125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9875, |
| "step": 1345 |
| }, |
| { |
| "completion_length": 144.2625, |
| "epoch": 0.2981736861721953, |
| "grad_norm": 0.194977538009497, |
| "kl": 0.3897705078125, |
| "learning_rate": 1.770159216547532e-05, |
| "loss": 0.0156, |
| "reward": 1.06875, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.99375, |
| "step": 1350 |
| }, |
| { |
| "completion_length": 142.68125, |
| "epoch": 0.29927803315801826, |
| "grad_norm": 0.2721232391131242, |
| "kl": 0.4391845703125, |
| "learning_rate": 1.7676941580589097e-05, |
| "loss": 0.0176, |
| "reward": 1.01875, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.9875, |
| "step": 1355 |
| }, |
| { |
| "completion_length": 141.6625, |
| "epoch": 0.3003823801438412, |
| "grad_norm": 0.12367929505701036, |
| "kl": 0.4060302734375, |
| "learning_rate": 1.7652176869490933e-05, |
| "loss": 0.0162, |
| "reward": 1.0375, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.98125, |
| "step": 1360 |
| }, |
| { |
| "completion_length": 147.54375, |
| "epoch": 0.3014867271296641, |
| "grad_norm": 0.1722353049345247, |
| "kl": 0.43369140625, |
| "learning_rate": 1.76272984003356e-05, |
| "loss": 0.0173, |
| "reward": 1.025, |
| "reward_std": 0.03535533845424652, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 1.0, |
| "step": 1365 |
| }, |
| { |
| "completion_length": 161.99375, |
| "epoch": 0.3025910741154871, |
| "grad_norm": 0.6029005445006613, |
| "kl": 0.47828369140625, |
| "learning_rate": 1.7602306542969006e-05, |
| "loss": 0.0191, |
| "reward": 0.975, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.95625, |
| "step": 1370 |
| }, |
| { |
| "completion_length": 300.3375, |
| "epoch": 0.30369542110131004, |
| "grad_norm": 0.6594210074645137, |
| "kl": 0.5243896484375, |
| "learning_rate": 1.7577201668922702e-05, |
| "loss": 0.021, |
| "reward": 0.9, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.875, |
| "step": 1375 |
| }, |
| { |
| "completion_length": 235.9625, |
| "epoch": 0.304799768087133, |
| "grad_norm": 0.42920157132224784, |
| "kl": 0.418603515625, |
| "learning_rate": 1.7551984151408363e-05, |
| "loss": 0.0167, |
| "reward": 0.9125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.8875, |
| "step": 1380 |
| }, |
| { |
| "completion_length": 148.1375, |
| "epoch": 0.3059041150729559, |
| "grad_norm": 0.6394391510717589, |
| "kl": 0.38023681640625, |
| "learning_rate": 1.7526654365312222e-05, |
| "loss": 0.0152, |
| "reward": 1.06875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.99375, |
| "step": 1385 |
| }, |
| { |
| "completion_length": 130.94375, |
| "epoch": 0.30700846205877885, |
| "grad_norm": 0.38449234338266886, |
| "kl": 0.381109619140625, |
| "learning_rate": 1.750121268718951e-05, |
| "loss": 0.0152, |
| "reward": 1.025, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.975, |
| "step": 1390 |
| }, |
| { |
| "completion_length": 134.8125, |
| "epoch": 0.3081128090446018, |
| "grad_norm": 0.32691206341821716, |
| "kl": 0.37451171875, |
| "learning_rate": 1.7475659495258864e-05, |
| "loss": 0.015, |
| "reward": 1.00625, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.9625, |
| "step": 1395 |
| }, |
| { |
| "completion_length": 152.13125, |
| "epoch": 0.30921715603042477, |
| "grad_norm": 0.5233437303073457, |
| "kl": 0.39224853515625, |
| "learning_rate": 1.7449995169396693e-05, |
| "loss": 0.0157, |
| "reward": 0.99375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.93125, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.30921715603042477, |
| "eval_completion_length": 140.44, |
| "eval_kl": 0.42287109375, |
| "eval_loss": 0.016920818015933037, |
| "eval_reward": 1.07, |
| "eval_reward_std": 0.18384775936603545, |
| "eval_rewards/accuracy_reward": 0.115, |
| "eval_rewards/format_reward": 0.955, |
| "eval_runtime": 81.6042, |
| "eval_samples_per_second": 1.213, |
| "eval_steps_per_second": 0.306, |
| "step": 1400 |
| }, |
| { |
| "completion_length": 130.34375, |
| "epoch": 0.3103215030162477, |
| "grad_norm": 0.28398981693339564, |
| "kl": 0.38331298828125, |
| "learning_rate": 1.7424220091131536e-05, |
| "loss": 0.0153, |
| "reward": 1.01875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.95625, |
| "step": 1405 |
| }, |
| { |
| "completion_length": 180.325, |
| "epoch": 0.3114258500020706, |
| "grad_norm": 0.6590820065232371, |
| "kl": 0.417919921875, |
| "learning_rate": 1.739833464363838e-05, |
| "loss": 0.0167, |
| "reward": 0.96875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.91875, |
| "step": 1410 |
| }, |
| { |
| "completion_length": 174.08125, |
| "epoch": 0.3125301969878936, |
| "grad_norm": 0.19303045686565795, |
| "kl": 0.37972412109375, |
| "learning_rate": 1.7372339211732988e-05, |
| "loss": 0.0152, |
| "reward": 1.0125, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.95, |
| "step": 1415 |
| }, |
| { |
| "completion_length": 208.6375, |
| "epoch": 0.31363454397371654, |
| "grad_norm": 1.2704281522495193, |
| "kl": 0.3710205078125, |
| "learning_rate": 1.734623418186615e-05, |
| "loss": 0.0148, |
| "reward": 0.99375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9125, |
| "step": 1420 |
| }, |
| { |
| "completion_length": 208.275, |
| "epoch": 0.3147388909595395, |
| "grad_norm": 0.4017752883432233, |
| "kl": 0.42412109375, |
| "learning_rate": 1.7320019942117954e-05, |
| "loss": 0.017, |
| "reward": 0.99375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.94375, |
| "step": 1425 |
| }, |
| { |
| "completion_length": 263.475, |
| "epoch": 0.31584323794536245, |
| "grad_norm": 0.6704877034578468, |
| "kl": 0.443798828125, |
| "learning_rate": 1.729369688219202e-05, |
| "loss": 0.0178, |
| "reward": 0.98125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9, |
| "step": 1430 |
| }, |
| { |
| "completion_length": 268.4625, |
| "epoch": 0.31694758493118536, |
| "grad_norm": 1.1806288895312564, |
| "kl": 0.475146484375, |
| "learning_rate": 1.7267265393409684e-05, |
| "loss": 0.019, |
| "reward": 0.9875, |
| "reward_std": 0.24748736917972564, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.89375, |
| "step": 1435 |
| }, |
| { |
| "completion_length": 239.40625, |
| "epoch": 0.3180519319170083, |
| "grad_norm": 0.7720008549372368, |
| "kl": 0.47855224609375, |
| "learning_rate": 1.7240725868704218e-05, |
| "loss": 0.0192, |
| "reward": 0.98125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9125, |
| "step": 1440 |
| }, |
| { |
| "completion_length": 209.23125, |
| "epoch": 0.31915627890283127, |
| "grad_norm": 0.6559969920402181, |
| "kl": 0.36781005859375, |
| "learning_rate": 1.7214078702614946e-05, |
| "loss": 0.0147, |
| "reward": 1.0, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.93125, |
| "step": 1445 |
| }, |
| { |
| "completion_length": 181.79375, |
| "epoch": 0.3202606258886542, |
| "grad_norm": 1.033869102520755, |
| "kl": 0.44471435546875, |
| "learning_rate": 1.7187324291281423e-05, |
| "loss": 0.0178, |
| "reward": 0.95625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.925, |
| "step": 1450 |
| }, |
| { |
| "completion_length": 144.8375, |
| "epoch": 0.3213649728744772, |
| "grad_norm": 0.23531024361520275, |
| "kl": 0.55146484375, |
| "learning_rate": 1.71604630324375e-05, |
| "loss": 0.0221, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.975, |
| "step": 1455 |
| }, |
| { |
| "completion_length": 152.3375, |
| "epoch": 0.3224693198603001, |
| "grad_norm": 0.521389309576663, |
| "kl": 0.3604736328125, |
| "learning_rate": 1.7133495325405448e-05, |
| "loss": 0.0144, |
| "reward": 1.05, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9875, |
| "step": 1460 |
| }, |
| { |
| "completion_length": 147.61875, |
| "epoch": 0.32357366684612304, |
| "grad_norm": 0.5218385357295671, |
| "kl": 0.3684814453125, |
| "learning_rate": 1.7106421571090003e-05, |
| "loss": 0.0147, |
| "reward": 1.03125, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.99375, |
| "step": 1465 |
| }, |
| { |
| "completion_length": 146.7625, |
| "epoch": 0.324678013831946, |
| "grad_norm": 0.40670196743586623, |
| "kl": 0.383203125, |
| "learning_rate": 1.7079242171972417e-05, |
| "loss": 0.0153, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.98125, |
| "step": 1470 |
| }, |
| { |
| "completion_length": 183.4125, |
| "epoch": 0.32578236081776896, |
| "grad_norm": 0.17326012835635307, |
| "kl": 0.40433349609375, |
| "learning_rate": 1.705195753210446e-05, |
| "loss": 0.0162, |
| "reward": 1.05, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.99375, |
| "step": 1475 |
| }, |
| { |
| "completion_length": 252.45, |
| "epoch": 0.3268867078035919, |
| "grad_norm": 0.33618781399155934, |
| "kl": 0.43258056640625, |
| "learning_rate": 1.7024568057102423e-05, |
| "loss": 0.0173, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.975, |
| "step": 1480 |
| }, |
| { |
| "completion_length": 316.8125, |
| "epoch": 0.3279910547894148, |
| "grad_norm": 0.6985966336266197, |
| "kl": 0.4880126953125, |
| "learning_rate": 1.6997074154141097e-05, |
| "loss": 0.0195, |
| "reward": 1.0625, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9875, |
| "step": 1485 |
| }, |
| { |
| "completion_length": 227.04375, |
| "epoch": 0.3290954017752378, |
| "grad_norm": 0.2716042884976899, |
| "kl": 0.42822265625, |
| "learning_rate": 1.69694762319477e-05, |
| "loss": 0.0171, |
| "reward": 1.06875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.98125, |
| "step": 1490 |
| }, |
| { |
| "completion_length": 246.30625, |
| "epoch": 0.33019974876106073, |
| "grad_norm": 0.34216236944018125, |
| "kl": 0.432666015625, |
| "learning_rate": 1.694177470079581e-05, |
| "loss": 0.0173, |
| "reward": 0.9875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.95, |
| "step": 1495 |
| }, |
| { |
| "completion_length": 319.29375, |
| "epoch": 0.3313040957468837, |
| "grad_norm": 0.3073497949162371, |
| "kl": 0.4350341796875, |
| "learning_rate": 1.6913969972499272e-05, |
| "loss": 0.0174, |
| "reward": 1.025, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9625, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3313040957468837, |
| "eval_completion_length": 508.6, |
| "eval_kl": 0.5653125, |
| "eval_loss": 0.022641615942120552, |
| "eval_reward": 0.97, |
| "eval_reward_std": 0.15556348919868468, |
| "eval_rewards/accuracy_reward": 0.06, |
| "eval_rewards/format_reward": 0.91, |
| "eval_runtime": 242.1008, |
| "eval_samples_per_second": 0.409, |
| "eval_steps_per_second": 0.103, |
| "step": 1500 |
| }, |
| { |
| "completion_length": 219.5875, |
| "epoch": 0.33240844273270664, |
| "grad_norm": 0.5738378890078689, |
| "kl": 0.447314453125, |
| "learning_rate": 1.688606246040607e-05, |
| "loss": 0.0179, |
| "reward": 1.01875, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.90625, |
| "step": 1505 |
| }, |
| { |
| "completion_length": 175.0, |
| "epoch": 0.33351278971852955, |
| "grad_norm": 0.4329938576388711, |
| "kl": 0.36485595703125, |
| "learning_rate": 1.6858052579392182e-05, |
| "loss": 0.0146, |
| "reward": 1.06875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 1.0, |
| "step": 1510 |
| }, |
| { |
| "completion_length": 212.8375, |
| "epoch": 0.3346171367043525, |
| "grad_norm": 0.3974460378419368, |
| "kl": 0.3696533203125, |
| "learning_rate": 1.682994074585541e-05, |
| "loss": 0.0148, |
| "reward": 0.95625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.925, |
| "step": 1515 |
| }, |
| { |
| "completion_length": 198.4125, |
| "epoch": 0.33572148369017546, |
| "grad_norm": 0.6339047206651848, |
| "kl": 0.3900634765625, |
| "learning_rate": 1.6801727377709195e-05, |
| "loss": 0.0156, |
| "reward": 0.96875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.925, |
| "step": 1520 |
| }, |
| { |
| "completion_length": 175.4, |
| "epoch": 0.3368258306759984, |
| "grad_norm": 0.31233984339595194, |
| "kl": 0.36982421875, |
| "learning_rate": 1.6773412894376404e-05, |
| "loss": 0.0148, |
| "reward": 0.98125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.95625, |
| "step": 1525 |
| }, |
| { |
| "completion_length": 162.075, |
| "epoch": 0.3379301776618213, |
| "grad_norm": 0.41593882245992403, |
| "kl": 0.3514892578125, |
| "learning_rate": 1.674499771678309e-05, |
| "loss": 0.0141, |
| "reward": 1.01875, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.98125, |
| "step": 1530 |
| }, |
| { |
| "completion_length": 146.81875, |
| "epoch": 0.3390345246476443, |
| "grad_norm": 0.6916723408968213, |
| "kl": 0.4715576171875, |
| "learning_rate": 1.6716482267352234e-05, |
| "loss": 0.0189, |
| "reward": 1.04375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.9375, |
| "step": 1535 |
| }, |
| { |
| "completion_length": 146.05625, |
| "epoch": 0.34013887163346723, |
| "grad_norm": 0.16801257159790053, |
| "kl": 0.4378662109375, |
| "learning_rate": 1.6687866969997483e-05, |
| "loss": 0.0175, |
| "reward": 1.0, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.95, |
| "step": 1540 |
| }, |
| { |
| "completion_length": 158.125, |
| "epoch": 0.3412432186192902, |
| "grad_norm": 0.3924153520384322, |
| "kl": 0.3984375, |
| "learning_rate": 1.665915225011681e-05, |
| "loss": 0.0159, |
| "reward": 1.00625, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.96875, |
| "step": 1545 |
| }, |
| { |
| "completion_length": 152.525, |
| "epoch": 0.34234756560511315, |
| "grad_norm": 0.2188171820439607, |
| "kl": 0.3915771484375, |
| "learning_rate": 1.663033853458624e-05, |
| "loss": 0.0157, |
| "reward": 1.0, |
| "reward_std": 0.03535533845424652, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.98125, |
| "step": 1550 |
| }, |
| { |
| "completion_length": 185.0, |
| "epoch": 0.34345191259093605, |
| "grad_norm": 0.2492866797409777, |
| "kl": 0.446630859375, |
| "learning_rate": 1.660142625175346e-05, |
| "loss": 0.0179, |
| "reward": 1.0375, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.98125, |
| "step": 1555 |
| }, |
| { |
| "completion_length": 197.925, |
| "epoch": 0.344556259576759, |
| "grad_norm": 0.43125503310433044, |
| "kl": 0.417333984375, |
| "learning_rate": 1.6572415831431466e-05, |
| "loss": 0.0167, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.975, |
| "step": 1560 |
| }, |
| { |
| "completion_length": 231.45, |
| "epoch": 0.34566060656258196, |
| "grad_norm": 0.547580901229839, |
| "kl": 0.4208251953125, |
| "learning_rate": 1.6543307704892196e-05, |
| "loss": 0.0168, |
| "reward": 1.0125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.95, |
| "step": 1565 |
| }, |
| { |
| "completion_length": 210.36875, |
| "epoch": 0.3467649535484049, |
| "grad_norm": 0.30578684489167307, |
| "kl": 0.40220947265625, |
| "learning_rate": 1.6514102304860077e-05, |
| "loss": 0.0161, |
| "reward": 1.01875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.98125, |
| "step": 1570 |
| }, |
| { |
| "completion_length": 169.575, |
| "epoch": 0.3478693005342279, |
| "grad_norm": 0.3714599755051663, |
| "kl": 0.4043701171875, |
| "learning_rate": 1.6484800065505627e-05, |
| "loss": 0.0162, |
| "reward": 1.01875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.98125, |
| "step": 1575 |
| }, |
| { |
| "completion_length": 157.6625, |
| "epoch": 0.3489736475200508, |
| "grad_norm": 1.1408284138746587, |
| "kl": 0.51844482421875, |
| "learning_rate": 1.6455401422438984e-05, |
| "loss": 0.0207, |
| "reward": 1.0375, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.96875, |
| "step": 1580 |
| }, |
| { |
| "completion_length": 131.2875, |
| "epoch": 0.35007799450587374, |
| "grad_norm": 0.47193321313326936, |
| "kl": 0.4167236328125, |
| "learning_rate": 1.6425906812703435e-05, |
| "loss": 0.0167, |
| "reward": 1.08125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.975, |
| "step": 1585 |
| }, |
| { |
| "completion_length": 194.43125, |
| "epoch": 0.3511823414916967, |
| "grad_norm": 0.723120589080064, |
| "kl": 0.4700439453125, |
| "learning_rate": 1.6396316674768914e-05, |
| "loss": 0.0188, |
| "reward": 0.99375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.925, |
| "step": 1590 |
| }, |
| { |
| "completion_length": 216.175, |
| "epoch": 0.35228668847751965, |
| "grad_norm": 0.4975629560332776, |
| "kl": 0.42794189453125, |
| "learning_rate": 1.6366631448525486e-05, |
| "loss": 0.0171, |
| "reward": 1.075, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.95625, |
| "step": 1595 |
| }, |
| { |
| "completion_length": 195.9375, |
| "epoch": 0.3533910354633426, |
| "grad_norm": 0.33985255338891107, |
| "kl": 0.3559814453125, |
| "learning_rate": 1.6336851575276814e-05, |
| "loss": 0.0142, |
| "reward": 1.05, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9875, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.3533910354633426, |
| "eval_completion_length": 231.0, |
| "eval_kl": 0.735625, |
| "eval_loss": 0.02945670112967491, |
| "eval_reward": 1.05, |
| "eval_reward_std": 0.15556348919868468, |
| "eval_rewards/accuracy_reward": 0.1, |
| "eval_rewards/format_reward": 0.95, |
| "eval_runtime": 111.0663, |
| "eval_samples_per_second": 0.891, |
| "eval_steps_per_second": 0.225, |
| "step": 1600 |
| }, |
| { |
| "completion_length": 231.275, |
| "epoch": 0.3544953824491655, |
| "grad_norm": 0.595242649626797, |
| "kl": 0.4072265625, |
| "learning_rate": 1.630697749773359e-05, |
| "loss": 0.0163, |
| "reward": 1.05, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.96875, |
| "step": 1605 |
| }, |
| { |
| "completion_length": 278.25, |
| "epoch": 0.35559972943498847, |
| "grad_norm": 0.4801274687526583, |
| "kl": 0.40982666015625, |
| "learning_rate": 1.627700966000696e-05, |
| "loss": 0.0164, |
| "reward": 1.025, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.95, |
| "step": 1610 |
| }, |
| { |
| "completion_length": 260.71875, |
| "epoch": 0.3567040764208114, |
| "grad_norm": 0.29704464145114623, |
| "kl": 0.3713134765625, |
| "learning_rate": 1.6246948507601915e-05, |
| "loss": 0.0149, |
| "reward": 1.025, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.94375, |
| "step": 1615 |
| }, |
| { |
| "completion_length": 220.7375, |
| "epoch": 0.3578084234066344, |
| "grad_norm": 0.16551151233488073, |
| "kl": 0.33929443359375, |
| "learning_rate": 1.621679448741067e-05, |
| "loss": 0.0136, |
| "reward": 1.05, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.98125, |
| "step": 1620 |
| }, |
| { |
| "completion_length": 203.19375, |
| "epoch": 0.35891277039245734, |
| "grad_norm": 0.44232175696554416, |
| "kl": 0.3436279296875, |
| "learning_rate": 1.618654804770603e-05, |
| "loss": 0.0137, |
| "reward": 1.0875, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9875, |
| "step": 1625 |
| }, |
| { |
| "completion_length": 196.01875, |
| "epoch": 0.36001711737828024, |
| "grad_norm": 0.3595404694857126, |
| "kl": 0.33565673828125, |
| "learning_rate": 1.615620963813471e-05, |
| "loss": 0.0134, |
| "reward": 1.03125, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.9875, |
| "step": 1630 |
| }, |
| { |
| "completion_length": 220.28125, |
| "epoch": 0.3611214643641032, |
| "grad_norm": 0.09068347346699927, |
| "kl": 0.334228515625, |
| "learning_rate": 1.6125779709710668e-05, |
| "loss": 0.0134, |
| "reward": 1.04375, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.99375, |
| "step": 1635 |
| }, |
| { |
| "completion_length": 217.84375, |
| "epoch": 0.36222581134992615, |
| "grad_norm": 0.24326484641045593, |
| "kl": 0.323681640625, |
| "learning_rate": 1.6095258714808373e-05, |
| "loss": 0.0129, |
| "reward": 1.09375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.99375, |
| "step": 1640 |
| }, |
| { |
| "completion_length": 190.775, |
| "epoch": 0.3633301583357491, |
| "grad_norm": 0.32151529940248824, |
| "kl": 0.3042724609375, |
| "learning_rate": 1.606464710715612e-05, |
| "loss": 0.0122, |
| "reward": 1.04375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.975, |
| "step": 1645 |
| }, |
| { |
| "completion_length": 223.5375, |
| "epoch": 0.364434505321572, |
| "grad_norm": 0.4066387353346626, |
| "kl": 0.35045166015625, |
| "learning_rate": 1.603394534182925e-05, |
| "loss": 0.014, |
| "reward": 1.04375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.975, |
| "step": 1650 |
| }, |
| { |
| "completion_length": 200.40625, |
| "epoch": 0.36553885230739497, |
| "grad_norm": 0.6150107546663145, |
| "kl": 0.42801513671875, |
| "learning_rate": 1.600315387524339e-05, |
| "loss": 0.0171, |
| "reward": 1.05625, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.96875, |
| "step": 1655 |
| }, |
| { |
| "completion_length": 206.625, |
| "epoch": 0.3666431992932179, |
| "grad_norm": 0.3881248346947634, |
| "kl": 0.36854248046875, |
| "learning_rate": 1.5972273165147697e-05, |
| "loss": 0.0147, |
| "reward": 1.05, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.98125, |
| "step": 1660 |
| }, |
| { |
| "completion_length": 223.15625, |
| "epoch": 0.3677475462790409, |
| "grad_norm": 0.45585685791218283, |
| "kl": 0.35394287109375, |
| "learning_rate": 1.5941303670618018e-05, |
| "loss": 0.0141, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.96875, |
| "step": 1665 |
| }, |
| { |
| "completion_length": 205.44375, |
| "epoch": 0.36885189326486384, |
| "grad_norm": 0.24229473958778308, |
| "kl": 0.32738037109375, |
| "learning_rate": 1.591024585205007e-05, |
| "loss": 0.0131, |
| "reward": 1.05, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.98125, |
| "step": 1670 |
| }, |
| { |
| "completion_length": 237.66875, |
| "epoch": 0.36995624025068674, |
| "grad_norm": 0.9451634093337382, |
| "kl": 0.37305908203125, |
| "learning_rate": 1.587910017115262e-05, |
| "loss": 0.0149, |
| "reward": 1.0125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.925, |
| "step": 1675 |
| }, |
| { |
| "completion_length": 234.78125, |
| "epoch": 0.3710605872365097, |
| "grad_norm": 0.4259015577951971, |
| "kl": 0.3545654296875, |
| "learning_rate": 1.5847867090940602e-05, |
| "loss": 0.0142, |
| "reward": 1.0625, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.96875, |
| "step": 1680 |
| }, |
| { |
| "completion_length": 259.5, |
| "epoch": 0.37216493422233266, |
| "grad_norm": 0.3894739125660781, |
| "kl": 0.33232421875, |
| "learning_rate": 1.5816547075728227e-05, |
| "loss": 0.0133, |
| "reward": 1.0125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.94375, |
| "step": 1685 |
| }, |
| { |
| "completion_length": 218.225, |
| "epoch": 0.3732692812081556, |
| "grad_norm": 0.5751023644328291, |
| "kl": 0.3769775390625, |
| "learning_rate": 1.5785140591122107e-05, |
| "loss": 0.0151, |
| "reward": 1.075, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.96875, |
| "step": 1690 |
| }, |
| { |
| "completion_length": 198.24375, |
| "epoch": 0.37437362819397857, |
| "grad_norm": 0.6070740663767715, |
| "kl": 0.39683837890625, |
| "learning_rate": 1.57536481040143e-05, |
| "loss": 0.0159, |
| "reward": 1.05, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.94375, |
| "step": 1695 |
| }, |
| { |
| "completion_length": 171.06875, |
| "epoch": 0.37547797517980147, |
| "grad_norm": 0.5506629078773986, |
| "kl": 0.37344970703125, |
| "learning_rate": 1.57220700825754e-05, |
| "loss": 0.0149, |
| "reward": 1.09375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.98125, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.37547797517980147, |
| "eval_completion_length": 164.62, |
| "eval_kl": 0.39626953125, |
| "eval_loss": 0.015575483441352844, |
| "eval_reward": 1.06, |
| "eval_reward_std": 0.08485281229019165, |
| "eval_rewards/accuracy_reward": 0.085, |
| "eval_rewards/format_reward": 0.975, |
| "eval_runtime": 82.4164, |
| "eval_samples_per_second": 1.201, |
| "eval_steps_per_second": 0.303, |
| "step": 1700 |
| }, |
| { |
| "completion_length": 148.7125, |
| "epoch": 0.37658232216562443, |
| "grad_norm": 0.346175215998511, |
| "kl": 0.34801025390625, |
| "learning_rate": 1.5690406996247557e-05, |
| "loss": 0.0139, |
| "reward": 1.10625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.975, |
| "step": 1705 |
| }, |
| { |
| "completion_length": 157.10625, |
| "epoch": 0.3776866691514474, |
| "grad_norm": 0.41957587709124056, |
| "kl": 0.35477294921875, |
| "learning_rate": 1.5658659315737505e-05, |
| "loss": 0.0142, |
| "reward": 1.075, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.9625, |
| "step": 1710 |
| }, |
| { |
| "completion_length": 172.70625, |
| "epoch": 0.37879101613727034, |
| "grad_norm": 0.2876351231003489, |
| "kl": 0.35120849609375, |
| "learning_rate": 1.5626827513009565e-05, |
| "loss": 0.014, |
| "reward": 1.00625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.95625, |
| "step": 1715 |
| }, |
| { |
| "completion_length": 158.39375, |
| "epoch": 0.3798953631230933, |
| "grad_norm": 0.5026511321796595, |
| "kl": 0.3473388671875, |
| "learning_rate": 1.5594912061278627e-05, |
| "loss": 0.0139, |
| "reward": 1.04375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.975, |
| "step": 1720 |
| }, |
| { |
| "completion_length": 173.2875, |
| "epoch": 0.3809997101089162, |
| "grad_norm": 0.43958113689444284, |
| "kl": 0.3528564453125, |
| "learning_rate": 1.5562913435003113e-05, |
| "loss": 0.0141, |
| "reward": 1.025, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.96875, |
| "step": 1725 |
| }, |
| { |
| "completion_length": 195.36875, |
| "epoch": 0.38210405709473916, |
| "grad_norm": 0.7155259543987148, |
| "kl": 0.343896484375, |
| "learning_rate": 1.5530832109877932e-05, |
| "loss": 0.0138, |
| "reward": 1.0375, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9625, |
| "step": 1730 |
| }, |
| { |
| "completion_length": 157.21875, |
| "epoch": 0.3832084040805621, |
| "grad_norm": 0.3870602492613469, |
| "kl": 0.343524169921875, |
| "learning_rate": 1.5498668562827397e-05, |
| "loss": 0.0137, |
| "reward": 1.04375, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.975, |
| "step": 1735 |
| }, |
| { |
| "completion_length": 182.0625, |
| "epoch": 0.38431275106638507, |
| "grad_norm": 0.5306169810415612, |
| "kl": 0.35052490234375, |
| "learning_rate": 1.5466423271998144e-05, |
| "loss": 0.014, |
| "reward": 1.01875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.94375, |
| "step": 1740 |
| }, |
| { |
| "completion_length": 166.49375, |
| "epoch": 0.38541709805220803, |
| "grad_norm": 0.4946006693969201, |
| "kl": 0.3321044921875, |
| "learning_rate": 1.5434096716752023e-05, |
| "loss": 0.0133, |
| "reward": 1.05, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.96875, |
| "step": 1745 |
| }, |
| { |
| "completion_length": 186.98125, |
| "epoch": 0.38652144503803093, |
| "grad_norm": 0.31165636186354284, |
| "kl": 0.35806884765625, |
| "learning_rate": 1.5401689377658962e-05, |
| "loss": 0.0143, |
| "reward": 1.0625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.975, |
| "step": 1750 |
| }, |
| { |
| "completion_length": 181.3, |
| "epoch": 0.3876257920238539, |
| "grad_norm": 0.5512017419947034, |
| "kl": 0.43681640625, |
| "learning_rate": 1.536920173648984e-05, |
| "loss": 0.0175, |
| "reward": 1.0125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.95, |
| "step": 1755 |
| }, |
| { |
| "completion_length": 220.86875, |
| "epoch": 0.38873013900967684, |
| "grad_norm": 0.7898520406737889, |
| "kl": 0.3798583984375, |
| "learning_rate": 1.53366342762093e-05, |
| "loss": 0.0152, |
| "reward": 1.0, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.9125, |
| "step": 1760 |
| }, |
| { |
| "completion_length": 225.61875, |
| "epoch": 0.3898344859954998, |
| "grad_norm": 0.7541459820766353, |
| "kl": 0.3935302734375, |
| "learning_rate": 1.5303987480968607e-05, |
| "loss": 0.0157, |
| "reward": 0.9625, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.89375, |
| "step": 1765 |
| }, |
| { |
| "completion_length": 189.53125, |
| "epoch": 0.39093883298132276, |
| "grad_norm": 0.33298536402698525, |
| "kl": 0.322216796875, |
| "learning_rate": 1.5271261836098403e-05, |
| "loss": 0.0129, |
| "reward": 0.99375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.9375, |
| "step": 1770 |
| }, |
| { |
| "completion_length": 168.725, |
| "epoch": 0.39204317996714566, |
| "grad_norm": 0.7771633455821945, |
| "kl": 0.3632568359375, |
| "learning_rate": 1.5238457828101531e-05, |
| "loss": 0.0145, |
| "reward": 1.0, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.95625, |
| "step": 1775 |
| }, |
| { |
| "completion_length": 160.28125, |
| "epoch": 0.3931475269529686, |
| "grad_norm": 0.478390100746179, |
| "kl": 0.372265625, |
| "learning_rate": 1.520557594464579e-05, |
| "loss": 0.0149, |
| "reward": 0.9875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.94375, |
| "step": 1780 |
| }, |
| { |
| "completion_length": 188.59375, |
| "epoch": 0.3942518739387916, |
| "grad_norm": 0.4217931355042731, |
| "kl": 0.3974609375, |
| "learning_rate": 1.5172616674556673e-05, |
| "loss": 0.0159, |
| "reward": 0.95, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.89375, |
| "step": 1785 |
| }, |
| { |
| "completion_length": 149.525, |
| "epoch": 0.39535622092461453, |
| "grad_norm": 0.7501382613974432, |
| "kl": 0.4054931640625, |
| "learning_rate": 1.5139580507810118e-05, |
| "loss": 0.0162, |
| "reward": 0.9875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.93125, |
| "step": 1790 |
| }, |
| { |
| "completion_length": 145.725, |
| "epoch": 0.39646056791043743, |
| "grad_norm": 0.5349731933801097, |
| "kl": 0.35238037109375, |
| "learning_rate": 1.510646793552522e-05, |
| "loss": 0.0141, |
| "reward": 1.01875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.975, |
| "step": 1795 |
| }, |
| { |
| "completion_length": 142.375, |
| "epoch": 0.3975649148962604, |
| "grad_norm": 0.6112477717509665, |
| "kl": 0.4273193359375, |
| "learning_rate": 1.5073279449956916e-05, |
| "loss": 0.0171, |
| "reward": 1.05, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.99375, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.3975649148962604, |
| "eval_completion_length": 169.21, |
| "eval_kl": 0.37419921875, |
| "eval_loss": 0.014996632933616638, |
| "eval_reward": 1.055, |
| "eval_reward_std": 0.07778174459934234, |
| "eval_rewards/accuracy_reward": 0.08, |
| "eval_rewards/format_reward": 0.975, |
| "eval_runtime": 85.5248, |
| "eval_samples_per_second": 1.158, |
| "eval_steps_per_second": 0.292, |
| "step": 1800 |
| }, |
| { |
| "completion_length": 183.75625, |
| "epoch": 0.39866926188208335, |
| "grad_norm": 0.42239616616022757, |
| "kl": 0.35390625, |
| "learning_rate": 1.5040015544488689e-05, |
| "loss": 0.0142, |
| "reward": 1.0, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.96875, |
| "step": 1805 |
| }, |
| { |
| "completion_length": 231.51875, |
| "epoch": 0.3997736088679063, |
| "grad_norm": 0.38357330680196106, |
| "kl": 0.34571533203125, |
| "learning_rate": 1.5006676713625217e-05, |
| "loss": 0.0138, |
| "reward": 1.0125, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9625, |
| "step": 1810 |
| }, |
| { |
| "completion_length": 249.51875, |
| "epoch": 0.40087795585372926, |
| "grad_norm": 0.36278847988909035, |
| "kl": 0.34613037109375, |
| "learning_rate": 1.4973263452985023e-05, |
| "loss": 0.0138, |
| "reward": 1.01875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.96875, |
| "step": 1815 |
| }, |
| { |
| "completion_length": 254.9, |
| "epoch": 0.40198230283955216, |
| "grad_norm": 0.13063316008666095, |
| "kl": 0.3719482421875, |
| "learning_rate": 1.493977625929312e-05, |
| "loss": 0.0149, |
| "reward": 0.975, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.0125, |
| "rewards/format_reward": 0.9625, |
| "step": 1820 |
| }, |
| { |
| "completion_length": 222.58125, |
| "epoch": 0.4030866498253751, |
| "grad_norm": 0.4311131808891421, |
| "kl": 0.33587646484375, |
| "learning_rate": 1.4906215630373606e-05, |
| "loss": 0.0134, |
| "reward": 1.025, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.975, |
| "step": 1825 |
| }, |
| { |
| "completion_length": 221.6125, |
| "epoch": 0.4041909968111981, |
| "grad_norm": 0.6819109826661146, |
| "kl": 0.38231201171875, |
| "learning_rate": 1.4872582065142285e-05, |
| "loss": 0.0153, |
| "reward": 1.0125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.95, |
| "step": 1830 |
| }, |
| { |
| "completion_length": 238.88125, |
| "epoch": 0.40529534379702103, |
| "grad_norm": 0.43973316451031436, |
| "kl": 0.35328369140625, |
| "learning_rate": 1.4838876063599234e-05, |
| "loss": 0.0141, |
| "reward": 0.9625, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.91875, |
| "step": 1835 |
| }, |
| { |
| "completion_length": 309.99375, |
| "epoch": 0.406399690782844, |
| "grad_norm": 0.8444306019322414, |
| "kl": 0.44168701171875, |
| "learning_rate": 1.480509812682138e-05, |
| "loss": 0.0177, |
| "reward": 0.86875, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.84375, |
| "step": 1840 |
| }, |
| { |
| "completion_length": 126.275, |
| "epoch": 0.4075040377686669, |
| "grad_norm": 0.35468253817906953, |
| "kl": 0.38702392578125, |
| "learning_rate": 1.4771248756955042e-05, |
| "loss": 0.0155, |
| "reward": 1.05625, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 1.0, |
| "step": 1845 |
| }, |
| { |
| "completion_length": 123.95625, |
| "epoch": 0.40860838475448985, |
| "grad_norm": 0.5763156946426605, |
| "kl": 0.36905517578125, |
| "learning_rate": 1.4737328457208471e-05, |
| "loss": 0.0148, |
| "reward": 1.125, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.99375, |
| "step": 1850 |
| }, |
| { |
| "completion_length": 113.06875, |
| "epoch": 0.4097127317403128, |
| "grad_norm": 0.47557294439345416, |
| "kl": 0.379296875, |
| "learning_rate": 1.4703337731844374e-05, |
| "loss": 0.0152, |
| "reward": 1.0875, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 1.0, |
| "step": 1855 |
| }, |
| { |
| "completion_length": 106.64375, |
| "epoch": 0.41081707872613576, |
| "grad_norm": 0.787870748692926, |
| "kl": 0.402252197265625, |
| "learning_rate": 1.4669277086172406e-05, |
| "loss": 0.0161, |
| "reward": 1.08125, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.99375, |
| "step": 1860 |
| }, |
| { |
| "completion_length": 113.35, |
| "epoch": 0.4119214257119587, |
| "grad_norm": 0.5993634901343988, |
| "kl": 0.395849609375, |
| "learning_rate": 1.4635147026541674e-05, |
| "loss": 0.0158, |
| "reward": 1.08125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 1.0, |
| "step": 1865 |
| }, |
| { |
| "completion_length": 142.475, |
| "epoch": 0.4130257726977816, |
| "grad_norm": 0.14466786983492855, |
| "kl": 0.3989501953125, |
| "learning_rate": 1.4600948060333187e-05, |
| "loss": 0.016, |
| "reward": 1.1, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.99375, |
| "step": 1870 |
| }, |
| { |
| "completion_length": 180.33125, |
| "epoch": 0.4141301196836046, |
| "grad_norm": 0.2913646742496878, |
| "kl": 0.3666259765625, |
| "learning_rate": 1.4566680695952333e-05, |
| "loss": 0.0147, |
| "reward": 1.0375, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9875, |
| "step": 1875 |
| }, |
| { |
| "completion_length": 246.5625, |
| "epoch": 0.41523446666942754, |
| "grad_norm": 0.10584068251739959, |
| "kl": 0.3494873046875, |
| "learning_rate": 1.4532345442821323e-05, |
| "loss": 0.014, |
| "reward": 1.0375, |
| "reward_std": 0.03535533845424652, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.99375, |
| "step": 1880 |
| }, |
| { |
| "completion_length": 349.275, |
| "epoch": 0.4163388136552505, |
| "grad_norm": 0.42138126797069886, |
| "kl": 0.40732421875, |
| "learning_rate": 1.4497942811371592e-05, |
| "loss": 0.0163, |
| "reward": 0.95625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.925, |
| "step": 1885 |
| }, |
| { |
| "completion_length": 282.8125, |
| "epoch": 0.41744316064107345, |
| "grad_norm": 0.5708867419498027, |
| "kl": 0.4172607421875, |
| "learning_rate": 1.4463473313036241e-05, |
| "loss": 0.0167, |
| "reward": 0.93125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.90625, |
| "step": 1890 |
| }, |
| { |
| "completion_length": 245.1, |
| "epoch": 0.41854750762689635, |
| "grad_norm": 0.5020193212271058, |
| "kl": 0.431884765625, |
| "learning_rate": 1.4428937460242417e-05, |
| "loss": 0.0173, |
| "reward": 0.99375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.91875, |
| "step": 1895 |
| }, |
| { |
| "completion_length": 197.70625, |
| "epoch": 0.4196518546127193, |
| "grad_norm": 0.590350066389971, |
| "kl": 0.3642578125, |
| "learning_rate": 1.4394335766403703e-05, |
| "loss": 0.0146, |
| "reward": 1.0, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.95625, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.4196518546127193, |
| "eval_completion_length": 210.86, |
| "eval_kl": 0.4005078125, |
| "eval_loss": 0.016043836250901222, |
| "eval_reward": 1.025, |
| "eval_reward_std": 0.12020815074443818, |
| "eval_rewards/accuracy_reward": 0.09, |
| "eval_rewards/format_reward": 0.935, |
| "eval_runtime": 109.1917, |
| "eval_samples_per_second": 0.907, |
| "eval_steps_per_second": 0.229, |
| "step": 1900 |
| }, |
| { |
| "completion_length": 210.5125, |
| "epoch": 0.42075620159854227, |
| "grad_norm": 0.5480387591743037, |
| "kl": 0.4215087890625, |
| "learning_rate": 1.4359668745912472e-05, |
| "loss": 0.0169, |
| "reward": 0.98125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.93125, |
| "step": 1905 |
| }, |
| { |
| "completion_length": 187.575, |
| "epoch": 0.4218605485843652, |
| "grad_norm": 0.37297156781214846, |
| "kl": 0.318310546875, |
| "learning_rate": 1.4324936914132255e-05, |
| "loss": 0.0127, |
| "reward": 1.0125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.95625, |
| "step": 1910 |
| }, |
| { |
| "completion_length": 194.59375, |
| "epoch": 0.4229648955701881, |
| "grad_norm": 0.4621580856810021, |
| "kl": 0.3155029296875, |
| "learning_rate": 1.4290140787390083e-05, |
| "loss": 0.0126, |
| "reward": 1.0125, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.9875, |
| "step": 1915 |
| }, |
| { |
| "completion_length": 215.26875, |
| "epoch": 0.4240692425560111, |
| "grad_norm": 0.23236440356104268, |
| "kl": 0.31630859375, |
| "learning_rate": 1.4255280882968787e-05, |
| "loss": 0.0126, |
| "reward": 1.0375, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.98125, |
| "step": 1920 |
| }, |
| { |
| "completion_length": 257.96875, |
| "epoch": 0.42517358954183404, |
| "grad_norm": 0.1821862648072376, |
| "kl": 0.36019287109375, |
| "learning_rate": 1.4220357719099338e-05, |
| "loss": 0.0144, |
| "reward": 1.00625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.95, |
| "step": 1925 |
| }, |
| { |
| "completion_length": 278.40625, |
| "epoch": 0.426277936527657, |
| "grad_norm": 0.5171028800770205, |
| "kl": 0.341015625, |
| "learning_rate": 1.4185371814953116e-05, |
| "loss": 0.0136, |
| "reward": 0.96875, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.9125, |
| "step": 1930 |
| }, |
| { |
| "completion_length": 203.1, |
| "epoch": 0.42738228351347995, |
| "grad_norm": 0.3194976531325425, |
| "kl": 0.33564453125, |
| "learning_rate": 1.415032369063422e-05, |
| "loss": 0.0134, |
| "reward": 1.05, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.98125, |
| "step": 1935 |
| }, |
| { |
| "completion_length": 248.76875, |
| "epoch": 0.42848663049930286, |
| "grad_norm": 0.32014297487636434, |
| "kl": 0.34617919921875, |
| "learning_rate": 1.41152138671717e-05, |
| "loss": 0.0138, |
| "reward": 0.96875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.9375, |
| "step": 1940 |
| }, |
| { |
| "completion_length": 224.6875, |
| "epoch": 0.4295909774851258, |
| "grad_norm": 0.7093467850610166, |
| "kl": 0.32181396484375, |
| "learning_rate": 1.408004286651185e-05, |
| "loss": 0.0129, |
| "reward": 1.025, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.99375, |
| "step": 1945 |
| }, |
| { |
| "completion_length": 219.30625, |
| "epoch": 0.43069532447094877, |
| "grad_norm": 0.12714345360829074, |
| "kl": 0.3352294921875, |
| "learning_rate": 1.4044811211510419e-05, |
| "loss": 0.0134, |
| "reward": 1.04375, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.98125, |
| "step": 1950 |
| }, |
| { |
| "completion_length": 253.2875, |
| "epoch": 0.4317996714567717, |
| "grad_norm": 0.180488927556792, |
| "kl": 0.3677734375, |
| "learning_rate": 1.4009519425924858e-05, |
| "loss": 0.0147, |
| "reward": 1.04375, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.9875, |
| "step": 1955 |
| }, |
| { |
| "completion_length": 270.91875, |
| "epoch": 0.4329040184425947, |
| "grad_norm": 0.30372496615737005, |
| "kl": 0.301806640625, |
| "learning_rate": 1.3974168034406524e-05, |
| "loss": 0.0121, |
| "reward": 0.99375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.9625, |
| "step": 1960 |
| }, |
| { |
| "completion_length": 248.61875, |
| "epoch": 0.4340083654284176, |
| "grad_norm": 0.4701815288270493, |
| "kl": 0.33768310546875, |
| "learning_rate": 1.3938757562492873e-05, |
| "loss": 0.0135, |
| "reward": 1.04375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.98125, |
| "step": 1965 |
| }, |
| { |
| "completion_length": 225.59375, |
| "epoch": 0.43511271241424054, |
| "grad_norm": 0.36757084788553285, |
| "kl": 0.3396484375, |
| "learning_rate": 1.3903288536599668e-05, |
| "loss": 0.0136, |
| "reward": 1.0625, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.98125, |
| "step": 1970 |
| }, |
| { |
| "completion_length": 312.15, |
| "epoch": 0.4362170594000635, |
| "grad_norm": 0.48977082806979283, |
| "kl": 0.358837890625, |
| "learning_rate": 1.3867761484013135e-05, |
| "loss": 0.0144, |
| "reward": 1.01875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.95625, |
| "step": 1975 |
| }, |
| { |
| "completion_length": 307.16875, |
| "epoch": 0.43732140638588646, |
| "grad_norm": 0.49526262796042186, |
| "kl": 0.340966796875, |
| "learning_rate": 1.3832176932882136e-05, |
| "loss": 0.0136, |
| "reward": 1.0625, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9875, |
| "step": 1980 |
| }, |
| { |
| "completion_length": 252.50625, |
| "epoch": 0.4384257533717094, |
| "grad_norm": 0.6091872240850794, |
| "kl": 0.3243408203125, |
| "learning_rate": 1.3796535412210301e-05, |
| "loss": 0.013, |
| "reward": 1.025, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.96875, |
| "step": 1985 |
| }, |
| { |
| "completion_length": 230.20625, |
| "epoch": 0.4395301003575323, |
| "grad_norm": 0.29579068631634226, |
| "kl": 0.34698486328125, |
| "learning_rate": 1.3760837451848193e-05, |
| "loss": 0.0139, |
| "reward": 1.075, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.975, |
| "step": 1990 |
| }, |
| { |
| "completion_length": 246.5375, |
| "epoch": 0.4406344473433553, |
| "grad_norm": 0.3033284715610845, |
| "kl": 0.34864501953125, |
| "learning_rate": 1.3725083582485397e-05, |
| "loss": 0.0139, |
| "reward": 1.01875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.94375, |
| "step": 1995 |
| }, |
| { |
| "completion_length": 213.94375, |
| "epoch": 0.44173879432917823, |
| "grad_norm": 0.4124635672949258, |
| "kl": 0.33160400390625, |
| "learning_rate": 1.3689274335642653e-05, |
| "loss": 0.0133, |
| "reward": 1.01875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.96875, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.44173879432917823, |
| "eval_completion_length": 175.91, |
| "eval_kl": 0.3898828125, |
| "eval_loss": 0.015537865459918976, |
| "eval_reward": 1.1, |
| "eval_reward_std": 0.11313708305358887, |
| "eval_rewards/accuracy_reward": 0.11, |
| "eval_rewards/format_reward": 0.99, |
| "eval_runtime": 88.9833, |
| "eval_samples_per_second": 1.113, |
| "eval_steps_per_second": 0.281, |
| "step": 2000 |
| }, |
| { |
| "completion_length": 198.1125, |
| "epoch": 0.4428431413150012, |
| "grad_norm": 0.41251225473854053, |
| "kl": 0.32073974609375, |
| "learning_rate": 1.3653410243663953e-05, |
| "loss": 0.0128, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.98125, |
| "step": 2005 |
| }, |
| { |
| "completion_length": 174.21875, |
| "epoch": 0.44394748830082414, |
| "grad_norm": 0.6755273210792271, |
| "kl": 0.318115234375, |
| "learning_rate": 1.3617491839708614e-05, |
| "loss": 0.0127, |
| "reward": 1.05, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9875, |
| "step": 2010 |
| }, |
| { |
| "completion_length": 208.9875, |
| "epoch": 0.44505183528664705, |
| "grad_norm": 0.37867244007672246, |
| "kl": 0.32615966796875, |
| "learning_rate": 1.3581519657743365e-05, |
| "loss": 0.013, |
| "reward": 1.05625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9625, |
| "step": 2015 |
| }, |
| { |
| "completion_length": 255.75, |
| "epoch": 0.44615618227247, |
| "grad_norm": 0.6185289191665273, |
| "kl": 0.3501220703125, |
| "learning_rate": 1.3545494232534406e-05, |
| "loss": 0.014, |
| "reward": 1.06875, |
| "reward_std": 0.23864853456616403, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.9375, |
| "step": 2020 |
| }, |
| { |
| "completion_length": 327.4125, |
| "epoch": 0.44726052925829296, |
| "grad_norm": 0.8766696054374737, |
| "kl": 0.4114990234375, |
| "learning_rate": 1.3509416099639456e-05, |
| "loss": 0.0165, |
| "reward": 1.03125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.95, |
| "step": 2025 |
| }, |
| { |
| "completion_length": 336.04375, |
| "epoch": 0.4483648762441159, |
| "grad_norm": 0.26246836662809214, |
| "kl": 0.33997802734375, |
| "learning_rate": 1.3473285795399792e-05, |
| "loss": 0.0136, |
| "reward": 1.09375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.9875, |
| "step": 2030 |
| }, |
| { |
| "completion_length": 242.35625, |
| "epoch": 0.4494692232299388, |
| "grad_norm": 0.4471292709567679, |
| "kl": 0.3488037109375, |
| "learning_rate": 1.3437103856932266e-05, |
| "loss": 0.014, |
| "reward": 1.1, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.975, |
| "step": 2035 |
| }, |
| { |
| "completion_length": 209.25625, |
| "epoch": 0.4505735702157618, |
| "grad_norm": 0.73758038746274, |
| "kl": 0.3877685546875, |
| "learning_rate": 1.3400870822121348e-05, |
| "loss": 0.0155, |
| "reward": 0.9375, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.85, |
| "step": 2040 |
| }, |
| { |
| "completion_length": 196.275, |
| "epoch": 0.45167791720158473, |
| "grad_norm": 0.5384037353444987, |
| "kl": 0.373583984375, |
| "learning_rate": 1.3364587229611095e-05, |
| "loss": 0.0149, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.975, |
| "step": 2045 |
| }, |
| { |
| "completion_length": 242.125, |
| "epoch": 0.4527822641874077, |
| "grad_norm": 0.35105762795134465, |
| "kl": 0.43712158203125, |
| "learning_rate": 1.332825361879717e-05, |
| "loss": 0.0175, |
| "reward": 1.1, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.975, |
| "step": 2050 |
| }, |
| { |
| "completion_length": 226.36875, |
| "epoch": 0.45388661117323065, |
| "grad_norm": 0.5975228945667029, |
| "kl": 0.51041259765625, |
| "learning_rate": 1.3291870529818809e-05, |
| "loss": 0.0204, |
| "reward": 1.0875, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.975, |
| "step": 2055 |
| }, |
| { |
| "completion_length": 243.575, |
| "epoch": 0.45499095815905355, |
| "grad_norm": 0.4387220366275729, |
| "kl": 0.458203125, |
| "learning_rate": 1.3255438503550796e-05, |
| "loss": 0.0183, |
| "reward": 1.09375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.975, |
| "step": 2060 |
| }, |
| { |
| "completion_length": 250.0875, |
| "epoch": 0.4560953051448765, |
| "grad_norm": 0.5275971912489743, |
| "kl": 0.39151611328125, |
| "learning_rate": 1.3218958081595426e-05, |
| "loss": 0.0157, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.98125, |
| "step": 2065 |
| }, |
| { |
| "completion_length": 282.99375, |
| "epoch": 0.45719965213069946, |
| "grad_norm": 0.4169345983090638, |
| "kl": 0.4114990234375, |
| "learning_rate": 1.3182429806274442e-05, |
| "loss": 0.0165, |
| "reward": 1.0625, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.975, |
| "step": 2070 |
| }, |
| { |
| "completion_length": 242.4, |
| "epoch": 0.4583039991165224, |
| "grad_norm": 0.5265515421182827, |
| "kl": 0.48302001953125, |
| "learning_rate": 1.3145854220620981e-05, |
| "loss": 0.0193, |
| "reward": 1.0, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.94375, |
| "step": 2075 |
| }, |
| { |
| "completion_length": 241.05, |
| "epoch": 0.4594083461023454, |
| "grad_norm": 0.5585942695958915, |
| "kl": 0.4310302734375, |
| "learning_rate": 1.3109231868371511e-05, |
| "loss": 0.0172, |
| "reward": 0.9625, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.94375, |
| "step": 2080 |
| }, |
| { |
| "completion_length": 176.9125, |
| "epoch": 0.4605126930881683, |
| "grad_norm": 0.4414724029442064, |
| "kl": 0.35958251953125, |
| "learning_rate": 1.3072563293957725e-05, |
| "loss": 0.0144, |
| "reward": 1.05625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.96875, |
| "step": 2085 |
| }, |
| { |
| "completion_length": 192.8, |
| "epoch": 0.46161704007399124, |
| "grad_norm": 0.586018890860745, |
| "kl": 0.322509765625, |
| "learning_rate": 1.3035849042498462e-05, |
| "loss": 0.0129, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9625, |
| "step": 2090 |
| }, |
| { |
| "completion_length": 208.95625, |
| "epoch": 0.4627213870598142, |
| "grad_norm": 0.42268112037658245, |
| "kl": 0.319384765625, |
| "learning_rate": 1.299908965979161e-05, |
| "loss": 0.0128, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9625, |
| "step": 2095 |
| }, |
| { |
| "completion_length": 222.16875, |
| "epoch": 0.46382573404563715, |
| "grad_norm": 0.31473777919390844, |
| "kl": 0.30546875, |
| "learning_rate": 1.2962285692305964e-05, |
| "loss": 0.0122, |
| "reward": 1.0125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9625, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.46382573404563715, |
| "eval_completion_length": 192.71, |
| "eval_kl": 0.31083984375, |
| "eval_loss": 0.012439416721463203, |
| "eval_reward": 1.1, |
| "eval_reward_std": 0.21213203072547912, |
| "eval_rewards/accuracy_reward": 0.145, |
| "eval_rewards/format_reward": 0.955, |
| "eval_runtime": 96.8968, |
| "eval_samples_per_second": 1.022, |
| "eval_steps_per_second": 0.258, |
| "step": 2100 |
| }, |
| { |
| "completion_length": 216.8125, |
| "epoch": 0.4649300810314601, |
| "grad_norm": 0.35648646516795124, |
| "kl": 0.32301025390625, |
| "learning_rate": 1.2925437687173144e-05, |
| "loss": 0.0129, |
| "reward": 1.04375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9625, |
| "step": 2105 |
| }, |
| { |
| "completion_length": 241.8125, |
| "epoch": 0.466034428017283, |
| "grad_norm": 0.601644494723294, |
| "kl": 0.3125732421875, |
| "learning_rate": 1.2888546192179417e-05, |
| "loss": 0.0125, |
| "reward": 0.96875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 0.94375, |
| "step": 2110 |
| }, |
| { |
| "completion_length": 217.7625, |
| "epoch": 0.46713877500310597, |
| "grad_norm": 0.39671636795145077, |
| "kl": 0.326318359375, |
| "learning_rate": 1.2851611755757587e-05, |
| "loss": 0.013, |
| "reward": 1.025, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.9375, |
| "step": 2115 |
| }, |
| { |
| "completion_length": 210.575, |
| "epoch": 0.4682431219889289, |
| "grad_norm": 0.5031259322905296, |
| "kl": 0.35986328125, |
| "learning_rate": 1.2814634926978831e-05, |
| "loss": 0.0144, |
| "reward": 1.025, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.93125, |
| "step": 2120 |
| }, |
| { |
| "completion_length": 187.1, |
| "epoch": 0.4693474689747519, |
| "grad_norm": 0.48486411865791645, |
| "kl": 0.35367431640625, |
| "learning_rate": 1.2777616255544527e-05, |
| "loss": 0.0141, |
| "reward": 1.075, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.95, |
| "step": 2125 |
| }, |
| { |
| "completion_length": 151.975, |
| "epoch": 0.47045181596057484, |
| "grad_norm": 0.7338227984314649, |
| "kl": 0.3826904296875, |
| "learning_rate": 1.2740556291778096e-05, |
| "loss": 0.0153, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.975, |
| "step": 2130 |
| }, |
| { |
| "completion_length": 133.78125, |
| "epoch": 0.47155616294639774, |
| "grad_norm": 0.11684943721597078, |
| "kl": 0.33577880859375, |
| "learning_rate": 1.2703455586616811e-05, |
| "loss": 0.0134, |
| "reward": 1.0875, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.99375, |
| "step": 2135 |
| }, |
| { |
| "completion_length": 145.9875, |
| "epoch": 0.4726605099322207, |
| "grad_norm": 0.32149812408314604, |
| "kl": 0.38963623046875, |
| "learning_rate": 1.2666314691603615e-05, |
| "loss": 0.0156, |
| "reward": 1.13125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.9875, |
| "step": 2140 |
| }, |
| { |
| "completion_length": 232.575, |
| "epoch": 0.47376485691804365, |
| "grad_norm": 0.6480932091195085, |
| "kl": 0.3406494140625, |
| "learning_rate": 1.2629134158878919e-05, |
| "loss": 0.0136, |
| "reward": 1.05625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.96875, |
| "step": 2145 |
| }, |
| { |
| "completion_length": 253.8875, |
| "epoch": 0.4748692039038666, |
| "grad_norm": 0.36437117134621355, |
| "kl": 0.3377685546875, |
| "learning_rate": 1.259191454117239e-05, |
| "loss": 0.0135, |
| "reward": 1.04375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.975, |
| "step": 2150 |
| }, |
| { |
| "completion_length": 231.3125, |
| "epoch": 0.47597355088968957, |
| "grad_norm": 0.4391123760933655, |
| "kl": 0.3203125, |
| "learning_rate": 1.255465639179473e-05, |
| "loss": 0.0128, |
| "reward": 1.05, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.975, |
| "step": 2155 |
| }, |
| { |
| "completion_length": 260.05625, |
| "epoch": 0.47707789787551247, |
| "grad_norm": 0.34571139879091517, |
| "kl": 0.35738525390625, |
| "learning_rate": 1.2517360264629463e-05, |
| "loss": 0.0143, |
| "reward": 1.01875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.95, |
| "step": 2160 |
| }, |
| { |
| "completion_length": 221.43125, |
| "epoch": 0.4781822448613354, |
| "grad_norm": 0.45867628713278896, |
| "kl": 0.38974609375, |
| "learning_rate": 1.24800267141247e-05, |
| "loss": 0.0156, |
| "reward": 1.0375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.93125, |
| "step": 2165 |
| }, |
| { |
| "completion_length": 170.575, |
| "epoch": 0.4792865918471584, |
| "grad_norm": 0.19943826053198088, |
| "kl": 0.37861328125, |
| "learning_rate": 1.2442656295284879e-05, |
| "loss": 0.0151, |
| "reward": 1.05, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.98125, |
| "step": 2170 |
| }, |
| { |
| "completion_length": 197.7875, |
| "epoch": 0.48039093883298134, |
| "grad_norm": 0.37120994010979813, |
| "kl": 0.342919921875, |
| "learning_rate": 1.2405249563662539e-05, |
| "loss": 0.0137, |
| "reward": 0.99375, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.95625, |
| "step": 2175 |
| }, |
| { |
| "completion_length": 175.96875, |
| "epoch": 0.48149528581880424, |
| "grad_norm": 0.6013419839896456, |
| "kl": 0.3757080078125, |
| "learning_rate": 1.2367807075350036e-05, |
| "loss": 0.015, |
| "reward": 1.08125, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.95625, |
| "step": 2180 |
| }, |
| { |
| "completion_length": 184.83125, |
| "epoch": 0.4825996328046272, |
| "grad_norm": 0.5134474475685822, |
| "kl": 0.36424560546875, |
| "learning_rate": 1.23303293869713e-05, |
| "loss": 0.0146, |
| "reward": 1.0125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9625, |
| "step": 2185 |
| }, |
| { |
| "completion_length": 234.29375, |
| "epoch": 0.48370397979045016, |
| "grad_norm": 0.5230059460040423, |
| "kl": 0.34986572265625, |
| "learning_rate": 1.2292817055673543e-05, |
| "loss": 0.014, |
| "reward": 1.0125, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9375, |
| "step": 2190 |
| }, |
| { |
| "completion_length": 308.00625, |
| "epoch": 0.4848083267762731, |
| "grad_norm": 0.7592675553160979, |
| "kl": 0.3602783203125, |
| "learning_rate": 1.2255270639118984e-05, |
| "loss": 0.0144, |
| "reward": 1.0125, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9125, |
| "step": 2195 |
| }, |
| { |
| "completion_length": 273.43125, |
| "epoch": 0.48591267376209607, |
| "grad_norm": 0.2856132458649576, |
| "kl": 0.37276611328125, |
| "learning_rate": 1.2217690695476551e-05, |
| "loss": 0.0149, |
| "reward": 1.00625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.90625, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.48591267376209607, |
| "eval_completion_length": 253.775, |
| "eval_kl": 0.5310546875, |
| "eval_loss": 0.02128330059349537, |
| "eval_reward": 1.045, |
| "eval_reward_std": 0.162634556889534, |
| "eval_rewards/accuracy_reward": 0.105, |
| "eval_rewards/format_reward": 0.94, |
| "eval_runtime": 127.3028, |
| "eval_samples_per_second": 0.778, |
| "eval_steps_per_second": 0.196, |
| "step": 2200 |
| }, |
| { |
| "completion_length": 259.58125, |
| "epoch": 0.48701702074791897, |
| "grad_norm": 0.3284138468757768, |
| "kl": 0.3307373046875, |
| "learning_rate": 1.2180077783413601e-05, |
| "loss": 0.0132, |
| "reward": 1.05625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.96875, |
| "step": 2205 |
| }, |
| { |
| "completion_length": 252.30625, |
| "epoch": 0.48812136773374193, |
| "grad_norm": 0.4057901132375836, |
| "kl": 0.4347412109375, |
| "learning_rate": 1.21424324620876e-05, |
| "loss": 0.0174, |
| "reward": 0.9875, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.91875, |
| "step": 2210 |
| }, |
| { |
| "completion_length": 249.60625, |
| "epoch": 0.4892257147195649, |
| "grad_norm": 0.9156591934586986, |
| "kl": 0.3991943359375, |
| "learning_rate": 1.2104755291137797e-05, |
| "loss": 0.016, |
| "reward": 0.9875, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9375, |
| "step": 2215 |
| }, |
| { |
| "completion_length": 229.95625, |
| "epoch": 0.49033006170538784, |
| "grad_norm": 0.42300199898124896, |
| "kl": 0.374468994140625, |
| "learning_rate": 1.2067046830676947e-05, |
| "loss": 0.015, |
| "reward": 1.025, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.95, |
| "step": 2220 |
| }, |
| { |
| "completion_length": 206.26875, |
| "epoch": 0.4914344086912108, |
| "grad_norm": 0.3982306025965269, |
| "kl": 0.3041748046875, |
| "learning_rate": 1.2029307641282935e-05, |
| "loss": 0.0122, |
| "reward": 1.05, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.96875, |
| "step": 2225 |
| }, |
| { |
| "completion_length": 225.10625, |
| "epoch": 0.4925387556770337, |
| "grad_norm": 0.3221497073306949, |
| "kl": 0.30826416015625, |
| "learning_rate": 1.1991538283990483e-05, |
| "loss": 0.0123, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.95625, |
| "step": 2230 |
| }, |
| { |
| "completion_length": 193.3125, |
| "epoch": 0.49364310266285666, |
| "grad_norm": 0.13887134423717792, |
| "kl": 0.32666015625, |
| "learning_rate": 1.1953739320282778e-05, |
| "loss": 0.0131, |
| "reward": 1.06875, |
| "reward_std": 0.02651650384068489, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.99375, |
| "step": 2235 |
| }, |
| { |
| "completion_length": 208.83125, |
| "epoch": 0.4947474496486796, |
| "grad_norm": 0.3671257080599345, |
| "kl": 0.30867919921875, |
| "learning_rate": 1.191591131208315e-05, |
| "loss": 0.0123, |
| "reward": 1.0875, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.99375, |
| "step": 2240 |
| }, |
| { |
| "completion_length": 196.06875, |
| "epoch": 0.4958517966345026, |
| "grad_norm": 0.4192081679963359, |
| "kl": 0.3347412109375, |
| "learning_rate": 1.1878054821746703e-05, |
| "loss": 0.0134, |
| "reward": 1.11875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.96875, |
| "step": 2245 |
| }, |
| { |
| "completion_length": 218.25625, |
| "epoch": 0.49695614362032553, |
| "grad_norm": 0.2809177260646367, |
| "kl": 0.31444091796875, |
| "learning_rate": 1.1840170412051957e-05, |
| "loss": 0.0126, |
| "reward": 1.075, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.975, |
| "step": 2250 |
| }, |
| { |
| "completion_length": 246.13125, |
| "epoch": 0.49806049060614843, |
| "grad_norm": 0.3266723103626801, |
| "kl": 0.290673828125, |
| "learning_rate": 1.1802258646192486e-05, |
| "loss": 0.0116, |
| "reward": 1.025, |
| "reward_std": 0.03535533845424652, |
| "rewards/accuracy_reward": 0.025, |
| "rewards/format_reward": 1.0, |
| "step": 2255 |
| }, |
| { |
| "completion_length": 263.225, |
| "epoch": 0.4991648375919714, |
| "grad_norm": 0.19489451125924528, |
| "kl": 0.29127197265625, |
| "learning_rate": 1.1764320087768546e-05, |
| "loss": 0.0116, |
| "reward": 1.0875, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.975, |
| "step": 2260 |
| }, |
| { |
| "completion_length": 265.95, |
| "epoch": 0.5002691845777943, |
| "grad_norm": 0.5199830849997911, |
| "kl": 0.34532470703125, |
| "learning_rate": 1.1726355300778693e-05, |
| "loss": 0.0138, |
| "reward": 1.04375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9625, |
| "step": 2265 |
| }, |
| { |
| "completion_length": 230.08125, |
| "epoch": 0.5013735315636173, |
| "grad_norm": 0.5370502961123099, |
| "kl": 0.31375732421875, |
| "learning_rate": 1.1688364849611395e-05, |
| "loss": 0.0125, |
| "reward": 1.075, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.98125, |
| "step": 2270 |
| }, |
| { |
| "completion_length": 268.4375, |
| "epoch": 0.5024778785494403, |
| "grad_norm": 0.4955767601038962, |
| "kl": 0.28502197265625, |
| "learning_rate": 1.1650349299036656e-05, |
| "loss": 0.0114, |
| "reward": 1.0625, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.98125, |
| "step": 2275 |
| }, |
| { |
| "completion_length": 204.94375, |
| "epoch": 0.5035822255352632, |
| "grad_norm": 0.5349423661973638, |
| "kl": 0.3089111328125, |
| "learning_rate": 1.1612309214197599e-05, |
| "loss": 0.0124, |
| "reward": 1.0625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.975, |
| "step": 2280 |
| }, |
| { |
| "completion_length": 191.61875, |
| "epoch": 0.5046865725210862, |
| "grad_norm": 0.7009751890329485, |
| "kl": 0.32718505859375, |
| "learning_rate": 1.1574245160602085e-05, |
| "loss": 0.0131, |
| "reward": 1.0375, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9875, |
| "step": 2285 |
| }, |
| { |
| "completion_length": 174.96875, |
| "epoch": 0.505790919506909, |
| "grad_norm": 0.15634356767517202, |
| "kl": 0.32626953125, |
| "learning_rate": 1.153615770411429e-05, |
| "loss": 0.013, |
| "reward": 1.1125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.99375, |
| "step": 2290 |
| }, |
| { |
| "completion_length": 213.85625, |
| "epoch": 0.506895266492732, |
| "grad_norm": 0.47252723075105413, |
| "kl": 0.311212158203125, |
| "learning_rate": 1.1498047410946307e-05, |
| "loss": 0.0124, |
| "reward": 1.06875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9875, |
| "step": 2295 |
| }, |
| { |
| "completion_length": 215.2875, |
| "epoch": 0.5079996134785549, |
| "grad_norm": 0.4948372981089919, |
| "kl": 0.33463134765625, |
| "learning_rate": 1.1459914847649716e-05, |
| "loss": 0.0134, |
| "reward": 1.05, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.98125, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.5079996134785549, |
| "eval_completion_length": 230.115, |
| "eval_kl": 0.37994140625, |
| "eval_loss": 0.015226633287966251, |
| "eval_reward": 1.105, |
| "eval_reward_std": 0.13435028612613678, |
| "eval_rewards/accuracy_reward": 0.135, |
| "eval_rewards/format_reward": 0.97, |
| "eval_runtime": 115.6989, |
| "eval_samples_per_second": 0.856, |
| "eval_steps_per_second": 0.216, |
| "step": 2300 |
| }, |
| { |
| "completion_length": 215.875, |
| "epoch": 0.5091039604643779, |
| "grad_norm": 0.3666409705462901, |
| "kl": 0.35133056640625, |
| "learning_rate": 1.1421760581107164e-05, |
| "loss": 0.0141, |
| "reward": 1.04375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9625, |
| "step": 2305 |
| }, |
| { |
| "completion_length": 254.09375, |
| "epoch": 0.5102083074502008, |
| "grad_norm": 0.2975662859403391, |
| "kl": 0.34254150390625, |
| "learning_rate": 1.1383585178523955e-05, |
| "loss": 0.0137, |
| "reward": 0.98125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.94375, |
| "step": 2310 |
| }, |
| { |
| "completion_length": 256.36875, |
| "epoch": 0.5113126544360238, |
| "grad_norm": 0.719346374442343, |
| "kl": 0.35948486328125, |
| "learning_rate": 1.1345389207419588e-05, |
| "loss": 0.0144, |
| "reward": 1.01875, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.925, |
| "step": 2315 |
| }, |
| { |
| "completion_length": 209.3, |
| "epoch": 0.5124170014218468, |
| "grad_norm": 0.7163698011097263, |
| "kl": 0.32767333984375, |
| "learning_rate": 1.1307173235619342e-05, |
| "loss": 0.0131, |
| "reward": 1.05625, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.96875, |
| "step": 2320 |
| }, |
| { |
| "completion_length": 229.61875, |
| "epoch": 0.5135213484076697, |
| "grad_norm": 0.16271285683956263, |
| "kl": 0.32386474609375, |
| "learning_rate": 1.126893783124583e-05, |
| "loss": 0.013, |
| "reward": 1.04375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.96875, |
| "step": 2325 |
| }, |
| { |
| "completion_length": 241.45625, |
| "epoch": 0.5146256953934927, |
| "grad_norm": 0.29867489688674986, |
| "kl": 0.34649658203125, |
| "learning_rate": 1.1230683562710549e-05, |
| "loss": 0.0139, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.975, |
| "step": 2330 |
| }, |
| { |
| "completion_length": 241.88125, |
| "epoch": 0.5157300423793156, |
| "grad_norm": 0.2511157393264926, |
| "kl": 0.33067626953125, |
| "learning_rate": 1.1192410998705432e-05, |
| "loss": 0.0132, |
| "reward": 1.08125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.98125, |
| "step": 2335 |
| }, |
| { |
| "completion_length": 259.81875, |
| "epoch": 0.5168343893651385, |
| "grad_norm": 0.1754130257017029, |
| "kl": 0.29140625, |
| "learning_rate": 1.1154120708194398e-05, |
| "loss": 0.0117, |
| "reward": 1.0625, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9875, |
| "step": 2340 |
| }, |
| { |
| "completion_length": 238.56875, |
| "epoch": 0.5179387363509614, |
| "grad_norm": 0.2633940153329421, |
| "kl": 0.32506103515625, |
| "learning_rate": 1.1115813260404889e-05, |
| "loss": 0.013, |
| "reward": 1.05625, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.98125, |
| "step": 2345 |
| }, |
| { |
| "completion_length": 194.5625, |
| "epoch": 0.5190430833367844, |
| "grad_norm": 0.37301932919862296, |
| "kl": 0.3505615234375, |
| "learning_rate": 1.1077489224819402e-05, |
| "loss": 0.014, |
| "reward": 1.075, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.975, |
| "step": 2350 |
| }, |
| { |
| "completion_length": 202.15, |
| "epoch": 0.5201474303226074, |
| "grad_norm": 0.5495626073196226, |
| "kl": 0.377880859375, |
| "learning_rate": 1.1039149171167046e-05, |
| "loss": 0.0151, |
| "reward": 0.975, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.93125, |
| "step": 2355 |
| }, |
| { |
| "completion_length": 220.68125, |
| "epoch": 0.5212517773084303, |
| "grad_norm": 0.38846325239139723, |
| "kl": 0.3607421875, |
| "learning_rate": 1.1000793669415035e-05, |
| "loss": 0.0144, |
| "reward": 1.025, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.91875, |
| "step": 2360 |
| }, |
| { |
| "completion_length": 211.93125, |
| "epoch": 0.5223561242942533, |
| "grad_norm": 0.5335204386556052, |
| "kl": 0.3825927734375, |
| "learning_rate": 1.0962423289760254e-05, |
| "loss": 0.0153, |
| "reward": 1.0125, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.925, |
| "step": 2365 |
| }, |
| { |
| "completion_length": 195.3375, |
| "epoch": 0.5234604712800762, |
| "grad_norm": 0.5401729852880928, |
| "kl": 0.35989990234375, |
| "learning_rate": 1.0924038602620757e-05, |
| "loss": 0.0144, |
| "reward": 1.025, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.95625, |
| "step": 2370 |
| }, |
| { |
| "completion_length": 186.30625, |
| "epoch": 0.5245648182658992, |
| "grad_norm": 0.6847630132207224, |
| "kl": 0.32152099609375, |
| "learning_rate": 1.0885640178627291e-05, |
| "loss": 0.0129, |
| "reward": 1.05, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.96875, |
| "step": 2375 |
| }, |
| { |
| "completion_length": 187.09375, |
| "epoch": 0.5256691652517221, |
| "grad_norm": 0.5295522607198633, |
| "kl": 0.288751220703125, |
| "learning_rate": 1.0847228588614821e-05, |
| "loss": 0.0115, |
| "reward": 1.05, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.99375, |
| "step": 2380 |
| }, |
| { |
| "completion_length": 207.14375, |
| "epoch": 0.526773512237545, |
| "grad_norm": 0.39564621021267, |
| "kl": 0.310736083984375, |
| "learning_rate": 1.0808804403614044e-05, |
| "loss": 0.0124, |
| "reward": 1.025, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.98125, |
| "step": 2385 |
| }, |
| { |
| "completion_length": 217.94375, |
| "epoch": 0.5278778592233679, |
| "grad_norm": 0.32699751280459455, |
| "kl": 0.30042724609375, |
| "learning_rate": 1.0770368194842886e-05, |
| "loss": 0.012, |
| "reward": 1.03125, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.99375, |
| "step": 2390 |
| }, |
| { |
| "completion_length": 214.59375, |
| "epoch": 0.5289822062091909, |
| "grad_norm": 0.3390808446949196, |
| "kl": 0.321435546875, |
| "learning_rate": 1.073192053369802e-05, |
| "loss": 0.0129, |
| "reward": 1.025, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.96875, |
| "step": 2395 |
| }, |
| { |
| "completion_length": 227.125, |
| "epoch": 0.5300865531950139, |
| "grad_norm": 0.3714729764465815, |
| "kl": 0.36365966796875, |
| "learning_rate": 1.0693461991746389e-05, |
| "loss": 0.0146, |
| "reward": 1.05, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.95, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.5300865531950139, |
| "eval_completion_length": 213.925, |
| "eval_kl": 0.4396484375, |
| "eval_loss": 0.017622916027903557, |
| "eval_reward": 1.03, |
| "eval_reward_std": 0.1414213538169861, |
| "eval_rewards/accuracy_reward": 0.08, |
| "eval_rewards/format_reward": 0.95, |
| "eval_runtime": 105.7779, |
| "eval_samples_per_second": 0.936, |
| "eval_steps_per_second": 0.236, |
| "step": 2400 |
| }, |
| { |
| "completion_length": 225.29375, |
| "epoch": 0.5311909001808368, |
| "grad_norm": 0.2664948280585707, |
| "kl": 0.40863037109375, |
| "learning_rate": 1.0654993140716665e-05, |
| "loss": 0.0164, |
| "reward": 1.0, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.94375, |
| "step": 2405 |
| }, |
| { |
| "completion_length": 208.39375, |
| "epoch": 0.5322952471666598, |
| "grad_norm": 0.634943961085242, |
| "kl": 0.33333740234375, |
| "learning_rate": 1.0616514552490791e-05, |
| "loss": 0.0133, |
| "reward": 1.05625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.96875, |
| "step": 2410 |
| }, |
| { |
| "completion_length": 186.975, |
| "epoch": 0.5333995941524827, |
| "grad_norm": 0.5915967584567521, |
| "kl": 0.294097900390625, |
| "learning_rate": 1.0578026799095464e-05, |
| "loss": 0.0118, |
| "reward": 1.0125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.975, |
| "step": 2415 |
| }, |
| { |
| "completion_length": 202.1625, |
| "epoch": 0.5345039411383057, |
| "grad_norm": 0.45619021814548455, |
| "kl": 0.321533203125, |
| "learning_rate": 1.0539530452693625e-05, |
| "loss": 0.0129, |
| "reward": 1.06875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.975, |
| "step": 2420 |
| }, |
| { |
| "completion_length": 243.76875, |
| "epoch": 0.5356082881241286, |
| "grad_norm": 0.3048810072947555, |
| "kl": 0.37376708984375, |
| "learning_rate": 1.0501026085575967e-05, |
| "loss": 0.0149, |
| "reward": 1.0, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.94375, |
| "step": 2425 |
| }, |
| { |
| "completion_length": 252.825, |
| "epoch": 0.5367126351099516, |
| "grad_norm": 0.40580599059859296, |
| "kl": 0.36983642578125, |
| "learning_rate": 1.046251427015241e-05, |
| "loss": 0.0148, |
| "reward": 1.01875, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.94375, |
| "step": 2430 |
| }, |
| { |
| "completion_length": 265.5875, |
| "epoch": 0.5378169820957744, |
| "grad_norm": 0.5522550075448642, |
| "kl": 0.384912109375, |
| "learning_rate": 1.0423995578943615e-05, |
| "loss": 0.0154, |
| "reward": 1.025, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.93125, |
| "step": 2435 |
| }, |
| { |
| "completion_length": 274.49375, |
| "epoch": 0.5389213290815974, |
| "grad_norm": 0.7214328168716406, |
| "kl": 0.4843017578125, |
| "learning_rate": 1.0385470584572449e-05, |
| "loss": 0.0194, |
| "reward": 1.04375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.925, |
| "step": 2440 |
| }, |
| { |
| "completion_length": 336.85625, |
| "epoch": 0.5400256760674204, |
| "grad_norm": 0.3475290976205475, |
| "kl": 0.52220458984375, |
| "learning_rate": 1.0346939859755481e-05, |
| "loss": 0.0209, |
| "reward": 0.9625, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9, |
| "step": 2445 |
| }, |
| { |
| "completion_length": 298.64375, |
| "epoch": 0.5411300230532433, |
| "grad_norm": 0.37045136738406037, |
| "kl": 0.3713623046875, |
| "learning_rate": 1.0308403977294476e-05, |
| "loss": 0.0149, |
| "reward": 1.025, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.95, |
| "step": 2450 |
| }, |
| { |
| "completion_length": 247.96875, |
| "epoch": 0.5422343700390663, |
| "grad_norm": 1.2798359500362633, |
| "kl": 0.4294677734375, |
| "learning_rate": 1.0269863510067872e-05, |
| "loss": 0.0172, |
| "reward": 1.05625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.95, |
| "step": 2455 |
| }, |
| { |
| "completion_length": 220.65625, |
| "epoch": 0.5433387170248892, |
| "grad_norm": 0.3900864517809055, |
| "kl": 0.405615234375, |
| "learning_rate": 1.023131903102226e-05, |
| "loss": 0.0162, |
| "reward": 1.0125, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.95625, |
| "step": 2460 |
| }, |
| { |
| "completion_length": 229.73125, |
| "epoch": 0.5444430640107122, |
| "grad_norm": 0.4673057649728135, |
| "kl": 0.3134033203125, |
| "learning_rate": 1.0192771113163875e-05, |
| "loss": 0.0125, |
| "reward": 1.04375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.98125, |
| "step": 2465 |
| }, |
| { |
| "completion_length": 197.48125, |
| "epoch": 0.5455474109965351, |
| "grad_norm": 0.5576580736483916, |
| "kl": 0.34364013671875, |
| "learning_rate": 1.0154220329550076e-05, |
| "loss": 0.0137, |
| "reward": 1.06875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9875, |
| "step": 2470 |
| }, |
| { |
| "completion_length": 227.26875, |
| "epoch": 0.5466517579823581, |
| "grad_norm": 0.21804579013846342, |
| "kl": 0.40257568359375, |
| "learning_rate": 1.0115667253280817e-05, |
| "loss": 0.0161, |
| "reward": 1.05, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.9625, |
| "step": 2475 |
| }, |
| { |
| "completion_length": 209.29375, |
| "epoch": 0.5477561049681811, |
| "grad_norm": 0.3304966161490469, |
| "kl": 0.35350341796875, |
| "learning_rate": 1.0077112457490143e-05, |
| "loss": 0.0141, |
| "reward": 1.03125, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.975, |
| "step": 2480 |
| }, |
| { |
| "completion_length": 190.29375, |
| "epoch": 0.5488604519540039, |
| "grad_norm": 0.47912529430555245, |
| "kl": 0.34261474609375, |
| "learning_rate": 1.0038556515337654e-05, |
| "loss": 0.0137, |
| "reward": 1.0625, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.98125, |
| "step": 2485 |
| }, |
| { |
| "completion_length": 210.5625, |
| "epoch": 0.5499647989398269, |
| "grad_norm": 0.6442806902182178, |
| "kl": 0.3751953125, |
| "learning_rate": 1e-05, |
| "loss": 0.015, |
| "reward": 0.99375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.95625, |
| "step": 2490 |
| }, |
| { |
| "completion_length": 205.20625, |
| "epoch": 0.5510691459256498, |
| "grad_norm": 0.6416478639095549, |
| "kl": 0.412200927734375, |
| "learning_rate": 9.961443484662349e-06, |
| "loss": 0.0165, |
| "reward": 1.03125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.95625, |
| "step": 2495 |
| }, |
| { |
| "completion_length": 222.10625, |
| "epoch": 0.5521734929114728, |
| "grad_norm": 0.38828171969188574, |
| "kl": 0.35648193359375, |
| "learning_rate": 9.92288754250986e-06, |
| "loss": 0.0143, |
| "reward": 1.0125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9625, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.5521734929114728, |
| "eval_completion_length": 193.71, |
| "eval_kl": 0.44693359375, |
| "eval_loss": 0.01760600134730339, |
| "eval_reward": 1.055, |
| "eval_reward_std": 0.13435028612613678, |
| "eval_rewards/accuracy_reward": 0.085, |
| "eval_rewards/format_reward": 0.97, |
| "eval_runtime": 90.4642, |
| "eval_samples_per_second": 1.094, |
| "eval_steps_per_second": 0.276, |
| "step": 2500 |
| }, |
| { |
| "completion_length": 221.71875, |
| "epoch": 0.5532778398972957, |
| "grad_norm": 0.5618731254639551, |
| "kl": 0.420947265625, |
| "learning_rate": 9.884332746719186e-06, |
| "loss": 0.0168, |
| "reward": 1.0, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.94375, |
| "step": 2505 |
| }, |
| { |
| "completion_length": 180.55625, |
| "epoch": 0.5543821868831187, |
| "grad_norm": 0.4034995486933272, |
| "kl": 0.32989501953125, |
| "learning_rate": 9.845779670449926e-06, |
| "loss": 0.0132, |
| "reward": 1.05, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.99375, |
| "step": 2510 |
| }, |
| { |
| "completion_length": 211.625, |
| "epoch": 0.5554865338689416, |
| "grad_norm": 0.291793288012953, |
| "kl": 0.4625244140625, |
| "learning_rate": 9.807228886836128e-06, |
| "loss": 0.0185, |
| "reward": 1.05, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.975, |
| "step": 2515 |
| }, |
| { |
| "completion_length": 173.81875, |
| "epoch": 0.5565908808547646, |
| "grad_norm": 0.4350329917552652, |
| "kl": 0.319976806640625, |
| "learning_rate": 9.768680968977743e-06, |
| "loss": 0.0128, |
| "reward": 1.0875, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 1.0, |
| "step": 2520 |
| }, |
| { |
| "completion_length": 209.65, |
| "epoch": 0.5576952278405876, |
| "grad_norm": 0.32148315759606677, |
| "kl": 0.31556396484375, |
| "learning_rate": 9.730136489932133e-06, |
| "loss": 0.0126, |
| "reward": 1.1, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.975, |
| "step": 2525 |
| }, |
| { |
| "completion_length": 181.725, |
| "epoch": 0.5587995748264104, |
| "grad_norm": 0.4796485021571092, |
| "kl": 0.355078125, |
| "learning_rate": 9.691596022705527e-06, |
| "loss": 0.0142, |
| "reward": 1.0625, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9875, |
| "step": 2530 |
| }, |
| { |
| "completion_length": 217.01875, |
| "epoch": 0.5599039218122334, |
| "grad_norm": 0.6531621153503112, |
| "kl": 0.52369384765625, |
| "learning_rate": 9.653060140244524e-06, |
| "loss": 0.0209, |
| "reward": 1.0, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9375, |
| "step": 2535 |
| }, |
| { |
| "completion_length": 222.725, |
| "epoch": 0.5610082687980563, |
| "grad_norm": 0.48376942186846866, |
| "kl": 0.5350341796875, |
| "learning_rate": 9.614529415427556e-06, |
| "loss": 0.0214, |
| "reward": 1.0125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9375, |
| "step": 2540 |
| }, |
| { |
| "completion_length": 169.14375, |
| "epoch": 0.5621126157838793, |
| "grad_norm": 0.6248488648541215, |
| "kl": 0.3317138671875, |
| "learning_rate": 9.576004421056389e-06, |
| "loss": 0.0133, |
| "reward": 1.08125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.98125, |
| "step": 2545 |
| }, |
| { |
| "completion_length": 173.6875, |
| "epoch": 0.5632169627697022, |
| "grad_norm": 0.3043924981907395, |
| "kl": 0.31737060546875, |
| "learning_rate": 9.537485729847594e-06, |
| "loss": 0.0127, |
| "reward": 1.05625, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.99375, |
| "step": 2550 |
| }, |
| { |
| "completion_length": 163.68125, |
| "epoch": 0.5643213097555252, |
| "grad_norm": 0.42066464481441374, |
| "kl": 0.300604248046875, |
| "learning_rate": 9.498973914424035e-06, |
| "loss": 0.012, |
| "reward": 1.0875, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.99375, |
| "step": 2555 |
| }, |
| { |
| "completion_length": 182.5625, |
| "epoch": 0.5654256567413481, |
| "grad_norm": 0.3979137026903759, |
| "kl": 0.3016357421875, |
| "learning_rate": 9.460469547306375e-06, |
| "loss": 0.0121, |
| "reward": 1.0625, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 1.0, |
| "step": 2560 |
| }, |
| { |
| "completion_length": 186.04375, |
| "epoch": 0.5665300037271711, |
| "grad_norm": 0.21641364510353625, |
| "kl": 0.29324951171875, |
| "learning_rate": 9.421973200904538e-06, |
| "loss": 0.0117, |
| "reward": 1.04375, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.99375, |
| "step": 2565 |
| }, |
| { |
| "completion_length": 161.68125, |
| "epoch": 0.5676343507129941, |
| "grad_norm": 0.24964356866992105, |
| "kl": 0.3321533203125, |
| "learning_rate": 9.38348544750921e-06, |
| "loss": 0.0133, |
| "reward": 1.0625, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9875, |
| "step": 2570 |
| }, |
| { |
| "completion_length": 187.28125, |
| "epoch": 0.568738697698817, |
| "grad_norm": 0.1836091653753328, |
| "kl": 0.31024169921875, |
| "learning_rate": 9.345006859283338e-06, |
| "loss": 0.0124, |
| "reward": 1.10625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.99375, |
| "step": 2575 |
| }, |
| { |
| "completion_length": 212.85, |
| "epoch": 0.5698430446846399, |
| "grad_norm": 0.6849145983537044, |
| "kl": 0.339111328125, |
| "learning_rate": 9.306538008253611e-06, |
| "loss": 0.0136, |
| "reward": 1.08125, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 1.0, |
| "step": 2580 |
| }, |
| { |
| "completion_length": 204.25625, |
| "epoch": 0.5709473916704628, |
| "grad_norm": 0.8512773917093649, |
| "kl": 0.32901611328125, |
| "learning_rate": 9.268079466301978e-06, |
| "loss": 0.0132, |
| "reward": 1.11875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.96875, |
| "step": 2585 |
| }, |
| { |
| "completion_length": 223.1375, |
| "epoch": 0.5720517386562858, |
| "grad_norm": 0.18764654024348235, |
| "kl": 0.29111328125, |
| "learning_rate": 9.229631805157116e-06, |
| "loss": 0.0116, |
| "reward": 1.05, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.99375, |
| "step": 2590 |
| }, |
| { |
| "completion_length": 232.8625, |
| "epoch": 0.5731560856421087, |
| "grad_norm": 0.26470565817220254, |
| "kl": 0.30040283203125, |
| "learning_rate": 9.19119559638596e-06, |
| "loss": 0.012, |
| "reward": 1.0875, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.975, |
| "step": 2595 |
| }, |
| { |
| "completion_length": 224.3875, |
| "epoch": 0.5742604326279317, |
| "grad_norm": 0.28812544109822696, |
| "kl": 0.30372314453125, |
| "learning_rate": 9.15277141138518e-06, |
| "loss": 0.0121, |
| "reward": 1.1125, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.99375, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.5742604326279317, |
| "eval_completion_length": 200.555, |
| "eval_kl": 0.3039453125, |
| "eval_loss": 0.012129716575145721, |
| "eval_reward": 1.095, |
| "eval_reward_std": 0.12020815074443818, |
| "eval_rewards/accuracy_reward": 0.105, |
| "eval_rewards/format_reward": 0.99, |
| "eval_runtime": 84.0641, |
| "eval_samples_per_second": 1.178, |
| "eval_steps_per_second": 0.297, |
| "step": 2600 |
| }, |
| { |
| "completion_length": 206.1, |
| "epoch": 0.5753647796137547, |
| "grad_norm": 0.647401912936629, |
| "kl": 0.309521484375, |
| "learning_rate": 9.114359821372714e-06, |
| "loss": 0.0124, |
| "reward": 1.06875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9875, |
| "step": 2605 |
| }, |
| { |
| "completion_length": 232.76875, |
| "epoch": 0.5764691265995776, |
| "grad_norm": 0.4001821517172711, |
| "kl": 0.28963623046875, |
| "learning_rate": 9.075961397379247e-06, |
| "loss": 0.0116, |
| "reward": 1.06875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9875, |
| "step": 2610 |
| }, |
| { |
| "completion_length": 210.9, |
| "epoch": 0.5775734735854006, |
| "grad_norm": 0.46275481774122, |
| "kl": 0.29266357421875, |
| "learning_rate": 9.037576710239748e-06, |
| "loss": 0.0117, |
| "reward": 1.075, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 1.0, |
| "step": 2615 |
| }, |
| { |
| "completion_length": 232.54375, |
| "epoch": 0.5786778205712235, |
| "grad_norm": 0.06132598498237229, |
| "kl": 0.2859619140625, |
| "learning_rate": 8.999206330584969e-06, |
| "loss": 0.0114, |
| "reward": 1.03125, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.99375, |
| "step": 2620 |
| }, |
| { |
| "completion_length": 215.91875, |
| "epoch": 0.5797821675570465, |
| "grad_norm": 0.3911529231832668, |
| "kl": 0.29134521484375, |
| "learning_rate": 8.960850828832958e-06, |
| "loss": 0.0116, |
| "reward": 1.10625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.99375, |
| "step": 2625 |
| }, |
| { |
| "completion_length": 220.8625, |
| "epoch": 0.5808865145428693, |
| "grad_norm": 0.2442930603266259, |
| "kl": 0.31162109375, |
| "learning_rate": 8.9225107751806e-06, |
| "loss": 0.0125, |
| "reward": 1.06875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9875, |
| "step": 2630 |
| }, |
| { |
| "completion_length": 224.05625, |
| "epoch": 0.5819908615286923, |
| "grad_norm": 0.09213249286006794, |
| "kl": 0.3035400390625, |
| "learning_rate": 8.884186739595114e-06, |
| "loss": 0.0121, |
| "reward": 1.04375, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.99375, |
| "step": 2635 |
| }, |
| { |
| "completion_length": 207.7, |
| "epoch": 0.5830952085145152, |
| "grad_norm": 0.4516706074689143, |
| "kl": 0.300616455078125, |
| "learning_rate": 8.845879291805605e-06, |
| "loss": 0.012, |
| "reward": 1.05, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9875, |
| "step": 2640 |
| }, |
| { |
| "completion_length": 200.91875, |
| "epoch": 0.5841995555003382, |
| "grad_norm": 0.5391545499409273, |
| "kl": 0.29783935546875, |
| "learning_rate": 8.807589001294571e-06, |
| "loss": 0.0119, |
| "reward": 1.05, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.98125, |
| "step": 2645 |
| }, |
| { |
| "completion_length": 174.175, |
| "epoch": 0.5853039024861612, |
| "grad_norm": 0.7085634992406773, |
| "kl": 0.31053466796875, |
| "learning_rate": 8.769316437289456e-06, |
| "loss": 0.0124, |
| "reward": 1.0625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.96875, |
| "step": 2650 |
| }, |
| { |
| "completion_length": 181.50625, |
| "epoch": 0.5864082494719841, |
| "grad_norm": 0.4855936312310574, |
| "kl": 0.33160400390625, |
| "learning_rate": 8.731062168754174e-06, |
| "loss": 0.0133, |
| "reward": 1.075, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.9875, |
| "step": 2655 |
| }, |
| { |
| "completion_length": 195.2625, |
| "epoch": 0.5875125964578071, |
| "grad_norm": 0.5682639526974919, |
| "kl": 0.359893798828125, |
| "learning_rate": 8.692826764380662e-06, |
| "loss": 0.0144, |
| "reward": 1.05625, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.98125, |
| "step": 2660 |
| }, |
| { |
| "completion_length": 208.2875, |
| "epoch": 0.58861694344363, |
| "grad_norm": 0.4588795280012722, |
| "kl": 0.34942626953125, |
| "learning_rate": 8.654610792580415e-06, |
| "loss": 0.014, |
| "reward": 1.0125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.04375, |
| "rewards/format_reward": 0.96875, |
| "step": 2665 |
| }, |
| { |
| "completion_length": 168.025, |
| "epoch": 0.589721290429453, |
| "grad_norm": 0.500593183120369, |
| "kl": 0.36273193359375, |
| "learning_rate": 8.616414821476048e-06, |
| "loss": 0.0145, |
| "reward": 1.025, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9625, |
| "step": 2670 |
| }, |
| { |
| "completion_length": 183.0375, |
| "epoch": 0.5908256374152758, |
| "grad_norm": 0.5184938308817854, |
| "kl": 0.3274658203125, |
| "learning_rate": 8.57823941889284e-06, |
| "loss": 0.0131, |
| "reward": 1.04375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.98125, |
| "step": 2675 |
| }, |
| { |
| "completion_length": 158.6, |
| "epoch": 0.5919299844010988, |
| "grad_norm": 0.2914033499028057, |
| "kl": 0.289453125, |
| "learning_rate": 8.54008515235029e-06, |
| "loss": 0.0116, |
| "reward": 1.14375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.99375, |
| "step": 2680 |
| }, |
| { |
| "completion_length": 166.40625, |
| "epoch": 0.5930343313869217, |
| "grad_norm": 0.32731329886172095, |
| "kl": 0.30450439453125, |
| "learning_rate": 8.501952589053694e-06, |
| "loss": 0.0122, |
| "reward": 1.05, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 1.0, |
| "step": 2685 |
| }, |
| { |
| "completion_length": 156.31875, |
| "epoch": 0.5941386783727447, |
| "grad_norm": 0.09256103441005661, |
| "kl": 0.3406005859375, |
| "learning_rate": 8.463842295885712e-06, |
| "loss": 0.0136, |
| "reward": 1.01875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0375, |
| "rewards/format_reward": 0.98125, |
| "step": 2690 |
| }, |
| { |
| "completion_length": 183.61875, |
| "epoch": 0.5952430253585677, |
| "grad_norm": 0.5104668071449497, |
| "kl": 0.309075927734375, |
| "learning_rate": 8.425754839397917e-06, |
| "loss": 0.0124, |
| "reward": 1.0875, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.96875, |
| "step": 2695 |
| }, |
| { |
| "completion_length": 191.0875, |
| "epoch": 0.5963473723443906, |
| "grad_norm": 0.47594505089710887, |
| "kl": 0.30067138671875, |
| "learning_rate": 8.387690785802403e-06, |
| "loss": 0.012, |
| "reward": 1.05625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.96875, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.5963473723443906, |
| "eval_completion_length": 200.535, |
| "eval_kl": 0.33861328125, |
| "eval_loss": 0.013564695604145527, |
| "eval_reward": 1.105, |
| "eval_reward_std": 0.20506096243858338, |
| "eval_rewards/accuracy_reward": 0.15, |
| "eval_rewards/format_reward": 0.955, |
| "eval_runtime": 104.0245, |
| "eval_samples_per_second": 0.952, |
| "eval_steps_per_second": 0.24, |
| "step": 2700 |
| }, |
| { |
| "completion_length": 181.2, |
| "epoch": 0.5974517193302136, |
| "grad_norm": 0.38636194379006417, |
| "kl": 0.30714111328125, |
| "learning_rate": 8.349650700963346e-06, |
| "loss": 0.0123, |
| "reward": 1.14375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.99375, |
| "step": 2705 |
| }, |
| { |
| "completion_length": 216.9, |
| "epoch": 0.5985560663160365, |
| "grad_norm": 0.21083257411179623, |
| "kl": 0.30963134765625, |
| "learning_rate": 8.311635150388607e-06, |
| "loss": 0.0124, |
| "reward": 1.025, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9625, |
| "step": 2710 |
| }, |
| { |
| "completion_length": 209.09375, |
| "epoch": 0.5996604133018595, |
| "grad_norm": 0.254195239435785, |
| "kl": 0.34473876953125, |
| "learning_rate": 8.273644699221309e-06, |
| "loss": 0.0138, |
| "reward": 1.0375, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.975, |
| "step": 2715 |
| }, |
| { |
| "completion_length": 210.60625, |
| "epoch": 0.6007647602876824, |
| "grad_norm": 0.2751894182054688, |
| "kl": 0.315087890625, |
| "learning_rate": 8.235679912231456e-06, |
| "loss": 0.0126, |
| "reward": 1.01875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.96875, |
| "step": 2720 |
| }, |
| { |
| "completion_length": 184.66875, |
| "epoch": 0.6018691072735053, |
| "grad_norm": 0.5013893987512199, |
| "kl": 0.300457763671875, |
| "learning_rate": 8.197741353807515e-06, |
| "loss": 0.012, |
| "reward": 1.1, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.98125, |
| "step": 2725 |
| }, |
| { |
| "completion_length": 209.50625, |
| "epoch": 0.6029734542593282, |
| "grad_norm": 0.2668494553767842, |
| "kl": 0.343133544921875, |
| "learning_rate": 8.159829587948048e-06, |
| "loss": 0.0137, |
| "reward": 1.075, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.95625, |
| "step": 2730 |
| }, |
| { |
| "completion_length": 217.9875, |
| "epoch": 0.6040778012451512, |
| "grad_norm": 1.1790834992221495, |
| "kl": 0.34041748046875, |
| "learning_rate": 8.1219451782533e-06, |
| "loss": 0.0136, |
| "reward": 1.01875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.95625, |
| "step": 2735 |
| }, |
| { |
| "completion_length": 199.21875, |
| "epoch": 0.6051821482309742, |
| "grad_norm": 0.38803314249531284, |
| "kl": 0.31158447265625, |
| "learning_rate": 8.084088687916853e-06, |
| "loss": 0.0125, |
| "reward": 1.04375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.975, |
| "step": 2740 |
| }, |
| { |
| "completion_length": 185.13125, |
| "epoch": 0.6062864952167971, |
| "grad_norm": 0.5365344661762009, |
| "kl": 0.3183349609375, |
| "learning_rate": 8.046260679717225e-06, |
| "loss": 0.0127, |
| "reward": 1.1, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.99375, |
| "step": 2745 |
| }, |
| { |
| "completion_length": 195.78125, |
| "epoch": 0.6073908422026201, |
| "grad_norm": 0.5602246756166963, |
| "kl": 0.361181640625, |
| "learning_rate": 8.00846171600952e-06, |
| "loss": 0.0144, |
| "reward": 1.05625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.96875, |
| "step": 2750 |
| }, |
| { |
| "completion_length": 174.98125, |
| "epoch": 0.608495189188443, |
| "grad_norm": 0.45096759701250927, |
| "kl": 0.326898193359375, |
| "learning_rate": 7.970692358717067e-06, |
| "loss": 0.0131, |
| "reward": 1.0625, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.99375, |
| "step": 2755 |
| }, |
| { |
| "completion_length": 219.3125, |
| "epoch": 0.609599536174266, |
| "grad_norm": 0.52626187470055, |
| "kl": 0.349560546875, |
| "learning_rate": 7.932953169323057e-06, |
| "loss": 0.014, |
| "reward": 1.06875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9875, |
| "step": 2760 |
| }, |
| { |
| "completion_length": 220.175, |
| "epoch": 0.610703883160089, |
| "grad_norm": 0.3604233546767503, |
| "kl": 0.31280517578125, |
| "learning_rate": 7.895244708862204e-06, |
| "loss": 0.0125, |
| "reward": 1.0625, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.99375, |
| "step": 2765 |
| }, |
| { |
| "completion_length": 207.06875, |
| "epoch": 0.6118082301459118, |
| "grad_norm": 0.5288120469694233, |
| "kl": 0.33223876953125, |
| "learning_rate": 7.857567537912404e-06, |
| "loss": 0.0133, |
| "reward": 1.05625, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9875, |
| "step": 2770 |
| }, |
| { |
| "completion_length": 236.71875, |
| "epoch": 0.6129125771317347, |
| "grad_norm": 0.10425504560684959, |
| "kl": 0.32779541015625, |
| "learning_rate": 7.8199222165864e-06, |
| "loss": 0.0131, |
| "reward": 1.05625, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9875, |
| "step": 2775 |
| }, |
| { |
| "completion_length": 207.39375, |
| "epoch": 0.6140169241175577, |
| "grad_norm": 0.2589149844532793, |
| "kl": 0.283251953125, |
| "learning_rate": 7.78230930452345e-06, |
| "loss": 0.0113, |
| "reward": 1.05, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 1.0, |
| "step": 2780 |
| }, |
| { |
| "completion_length": 242.43125, |
| "epoch": 0.6151212711033807, |
| "grad_norm": 0.22583571089563595, |
| "kl": 0.27052001953125, |
| "learning_rate": 7.744729360881023e-06, |
| "loss": 0.0108, |
| "reward": 1.11875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.99375, |
| "step": 2785 |
| }, |
| { |
| "completion_length": 231.89375, |
| "epoch": 0.6162256180892036, |
| "grad_norm": 0.4749669282378495, |
| "kl": 0.278765869140625, |
| "learning_rate": 7.70718294432646e-06, |
| "loss": 0.0111, |
| "reward": 1.08125, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 1.0, |
| "step": 2790 |
| }, |
| { |
| "completion_length": 245.79375, |
| "epoch": 0.6173299650750266, |
| "grad_norm": 0.3617583523975815, |
| "kl": 0.27476806640625, |
| "learning_rate": 7.669670613028705e-06, |
| "loss": 0.011, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.98125, |
| "step": 2795 |
| }, |
| { |
| "completion_length": 227.44375, |
| "epoch": 0.6184343120608495, |
| "grad_norm": 0.2965537312612653, |
| "kl": 0.28612060546875, |
| "learning_rate": 7.632192924649969e-06, |
| "loss": 0.0114, |
| "reward": 1.08125, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 1.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.6184343120608495, |
| "eval_completion_length": 230.735, |
| "eval_kl": 0.3071484375, |
| "eval_loss": 0.01229775045067072, |
| "eval_reward": 1.095, |
| "eval_reward_std": 0.13435028612613678, |
| "eval_rewards/accuracy_reward": 0.11, |
| "eval_rewards/format_reward": 0.985, |
| "eval_runtime": 108.6588, |
| "eval_samples_per_second": 0.911, |
| "eval_steps_per_second": 0.23, |
| "step": 2800 |
| }, |
| { |
| "completion_length": 239.04375, |
| "epoch": 0.6195386590466725, |
| "grad_norm": 0.5214893247053585, |
| "kl": 0.2989501953125, |
| "learning_rate": 7.594750436337467e-06, |
| "loss": 0.012, |
| "reward": 1.0625, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 1.0, |
| "step": 2805 |
| }, |
| { |
| "completion_length": 233.93125, |
| "epoch": 0.6206430060324954, |
| "grad_norm": 0.4598794915978221, |
| "kl": 0.29241943359375, |
| "learning_rate": 7.557343704715121e-06, |
| "loss": 0.0117, |
| "reward": 1.05625, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.99375, |
| "step": 2810 |
| }, |
| { |
| "completion_length": 242.64375, |
| "epoch": 0.6217473530183184, |
| "grad_norm": 3.1244852402266345, |
| "kl": 0.31943359375, |
| "learning_rate": 7.519973285875303e-06, |
| "loss": 0.0128, |
| "reward": 1.0625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.975, |
| "step": 2815 |
| }, |
| { |
| "completion_length": 236.81875, |
| "epoch": 0.6228517000041413, |
| "grad_norm": 0.5414981338856611, |
| "kl": 0.3600830078125, |
| "learning_rate": 7.482639735370536e-06, |
| "loss": 0.0144, |
| "reward": 1.03125, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.98125, |
| "step": 2820 |
| }, |
| { |
| "completion_length": 279.33125, |
| "epoch": 0.6239560469899642, |
| "grad_norm": 0.8596187860646796, |
| "kl": 0.40133056640625, |
| "learning_rate": 7.445343608205273e-06, |
| "loss": 0.0161, |
| "reward": 1.01875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9375, |
| "step": 2825 |
| }, |
| { |
| "completion_length": 247.0875, |
| "epoch": 0.6250603939757872, |
| "grad_norm": 0.5780188447264637, |
| "kl": 0.3927734375, |
| "learning_rate": 7.408085458827612e-06, |
| "loss": 0.0157, |
| "reward": 1.05, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.95625, |
| "step": 2830 |
| }, |
| { |
| "completion_length": 249.1, |
| "epoch": 0.6261647409616101, |
| "grad_norm": 0.316789753877662, |
| "kl": 0.38126220703125, |
| "learning_rate": 7.37086584112108e-06, |
| "loss": 0.0153, |
| "reward": 1.05, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.95625, |
| "step": 2835 |
| }, |
| { |
| "completion_length": 252.71875, |
| "epoch": 0.6272690879474331, |
| "grad_norm": 0.4453525406044762, |
| "kl": 0.37978515625, |
| "learning_rate": 7.333685308396383e-06, |
| "loss": 0.0152, |
| "reward": 1.0125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9375, |
| "step": 2840 |
| }, |
| { |
| "completion_length": 195.29375, |
| "epoch": 0.628373434933256, |
| "grad_norm": 0.6694404625243146, |
| "kl": 0.33199462890625, |
| "learning_rate": 7.2965444133831905e-06, |
| "loss": 0.0133, |
| "reward": 1.04375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.975, |
| "step": 2845 |
| }, |
| { |
| "completion_length": 203.05, |
| "epoch": 0.629477781919079, |
| "grad_norm": 0.5384625756146423, |
| "kl": 0.3214599609375, |
| "learning_rate": 7.2594437082219074e-06, |
| "loss": 0.0129, |
| "reward": 1.05, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.975, |
| "step": 2850 |
| }, |
| { |
| "completion_length": 181.36875, |
| "epoch": 0.630582128904902, |
| "grad_norm": 0.5857866433127917, |
| "kl": 0.31610107421875, |
| "learning_rate": 7.222383744455477e-06, |
| "loss": 0.0126, |
| "reward": 1.10625, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 1.0, |
| "step": 2855 |
| }, |
| { |
| "completion_length": 192.7625, |
| "epoch": 0.6316864758907249, |
| "grad_norm": 0.5306577984285455, |
| "kl": 0.2734375, |
| "learning_rate": 7.185365073021171e-06, |
| "loss": 0.0109, |
| "reward": 1.10625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9875, |
| "step": 2860 |
| }, |
| { |
| "completion_length": 186.95625, |
| "epoch": 0.6327908228765479, |
| "grad_norm": 0.25146113191688085, |
| "kl": 0.2825439453125, |
| "learning_rate": 7.148388244242414e-06, |
| "loss": 0.0113, |
| "reward": 1.05625, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 1.0, |
| "step": 2865 |
| }, |
| { |
| "completion_length": 188.88125, |
| "epoch": 0.6338951698623707, |
| "grad_norm": 0.29446615942878507, |
| "kl": 0.28685302734375, |
| "learning_rate": 7.111453807820587e-06, |
| "loss": 0.0115, |
| "reward": 1.0875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9875, |
| "step": 2870 |
| }, |
| { |
| "completion_length": 215.34375, |
| "epoch": 0.6349995168481937, |
| "grad_norm": 0.11271213250290812, |
| "kl": 0.24910888671875, |
| "learning_rate": 7.0745623128268605e-06, |
| "loss": 0.01, |
| "reward": 1.05, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9875, |
| "step": 2875 |
| }, |
| { |
| "completion_length": 220.21875, |
| "epoch": 0.6361038638340166, |
| "grad_norm": 0.13827817934352812, |
| "kl": 0.265997314453125, |
| "learning_rate": 7.037714307694038e-06, |
| "loss": 0.0106, |
| "reward": 1.05, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.9875, |
| "step": 2880 |
| }, |
| { |
| "completion_length": 224.525, |
| "epoch": 0.6372082108198396, |
| "grad_norm": 0.3760142764874246, |
| "kl": 0.30487060546875, |
| "learning_rate": 7.000910340208393e-06, |
| "loss": 0.0122, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.98125, |
| "step": 2885 |
| }, |
| { |
| "completion_length": 229.125, |
| "epoch": 0.6383125578056625, |
| "grad_norm": 0.4800060583908338, |
| "kl": 0.28165283203125, |
| "learning_rate": 6.964150957501538e-06, |
| "loss": 0.0113, |
| "reward": 1.0875, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.975, |
| "step": 2890 |
| }, |
| { |
| "completion_length": 259.29375, |
| "epoch": 0.6394169047914855, |
| "grad_norm": 0.46816183628445984, |
| "kl": 0.2957763671875, |
| "learning_rate": 6.927436706042276e-06, |
| "loss": 0.0118, |
| "reward": 1.11875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.9875, |
| "step": 2895 |
| }, |
| { |
| "completion_length": 290.36875, |
| "epoch": 0.6405212517773085, |
| "grad_norm": 0.4620742885523695, |
| "kl": 0.291473388671875, |
| "learning_rate": 6.890768131628492e-06, |
| "loss": 0.0117, |
| "reward": 1.04375, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.95625, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.6405212517773085, |
| "eval_completion_length": 255.545, |
| "eval_kl": 0.29716796875, |
| "eval_loss": 0.01189742237329483, |
| "eval_reward": 1.085, |
| "eval_reward_std": 0.13435028612613678, |
| "eval_rewards/accuracy_reward": 0.1, |
| "eval_rewards/format_reward": 0.985, |
| "eval_runtime": 124.9105, |
| "eval_samples_per_second": 0.793, |
| "eval_steps_per_second": 0.2, |
| "step": 2900 |
| }, |
| { |
| "completion_length": 284.0625, |
| "epoch": 0.6416255987631314, |
| "grad_norm": 0.3834368146178332, |
| "kl": 0.2900390625, |
| "learning_rate": 6.8541457793790204e-06, |
| "loss": 0.0116, |
| "reward": 1.0375, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9625, |
| "step": 2905 |
| }, |
| { |
| "completion_length": 251.69375, |
| "epoch": 0.6427299457489544, |
| "grad_norm": 0.2517910604330981, |
| "kl": 0.2760498046875, |
| "learning_rate": 6.8175701937255645e-06, |
| "loss": 0.011, |
| "reward": 1.03125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.975, |
| "step": 2910 |
| }, |
| { |
| "completion_length": 264.5, |
| "epoch": 0.6438342927347772, |
| "grad_norm": 0.2750608123545671, |
| "kl": 0.2783935546875, |
| "learning_rate": 6.781041918404578e-06, |
| "loss": 0.0111, |
| "reward": 1.08125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.975, |
| "step": 2915 |
| }, |
| { |
| "completion_length": 258.33125, |
| "epoch": 0.6449386397206002, |
| "grad_norm": 0.3654656487363651, |
| "kl": 0.2917236328125, |
| "learning_rate": 6.744561496449208e-06, |
| "loss": 0.0117, |
| "reward": 1.0625, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9875, |
| "step": 2920 |
| }, |
| { |
| "completion_length": 245.49375, |
| "epoch": 0.6460429867064231, |
| "grad_norm": 0.6036458859995204, |
| "kl": 0.267669677734375, |
| "learning_rate": 6.708129470181197e-06, |
| "loss": 0.0107, |
| "reward": 1.15, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 1.0, |
| "step": 2925 |
| }, |
| { |
| "completion_length": 265.6875, |
| "epoch": 0.6471473336922461, |
| "grad_norm": 0.7309833094166883, |
| "kl": 0.26925048828125, |
| "learning_rate": 6.671746381202835e-06, |
| "loss": 0.0108, |
| "reward": 1.08125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9875, |
| "step": 2930 |
| }, |
| { |
| "completion_length": 252.36875, |
| "epoch": 0.648251680678069, |
| "grad_norm": 0.42288290479080043, |
| "kl": 0.2704833984375, |
| "learning_rate": 6.635412770388911e-06, |
| "loss": 0.0108, |
| "reward": 1.125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 1.0, |
| "step": 2935 |
| }, |
| { |
| "completion_length": 247.08125, |
| "epoch": 0.649356027663892, |
| "grad_norm": 0.49374444365360365, |
| "kl": 0.27628173828125, |
| "learning_rate": 6.5991291778786556e-06, |
| "loss": 0.0111, |
| "reward": 1.125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.98125, |
| "step": 2940 |
| }, |
| { |
| "completion_length": 233.7375, |
| "epoch": 0.650460374649715, |
| "grad_norm": 0.5239819846390583, |
| "kl": 0.28341064453125, |
| "learning_rate": 6.562896143067734e-06, |
| "loss": 0.0113, |
| "reward": 1.1125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9875, |
| "step": 2945 |
| }, |
| { |
| "completion_length": 255.7, |
| "epoch": 0.6515647216355379, |
| "grad_norm": 0.43425079053320653, |
| "kl": 0.2722412109375, |
| "learning_rate": 6.526714204600212e-06, |
| "loss": 0.0109, |
| "reward": 1.10625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.99375, |
| "step": 2950 |
| }, |
| { |
| "completion_length": 249.475, |
| "epoch": 0.6526690686213609, |
| "grad_norm": 0.25987542033307665, |
| "kl": 0.27745361328125, |
| "learning_rate": 6.490583900360543e-06, |
| "loss": 0.0111, |
| "reward": 1.06875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.98125, |
| "step": 2955 |
| }, |
| { |
| "completion_length": 273.275, |
| "epoch": 0.6537734156071838, |
| "grad_norm": 0.45996917419712435, |
| "kl": 0.292822265625, |
| "learning_rate": 6.4545057674655954e-06, |
| "loss": 0.0117, |
| "reward": 1.1, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.95625, |
| "step": 2960 |
| }, |
| { |
| "completion_length": 268.3625, |
| "epoch": 0.6548777625930067, |
| "grad_norm": 0.5624671363615396, |
| "kl": 0.32144775390625, |
| "learning_rate": 6.418480342256635e-06, |
| "loss": 0.0129, |
| "reward": 1.09375, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.95625, |
| "step": 2965 |
| }, |
| { |
| "completion_length": 269.64375, |
| "epoch": 0.6559821095788296, |
| "grad_norm": 0.32479361186684463, |
| "kl": 0.27386474609375, |
| "learning_rate": 6.38250816029139e-06, |
| "loss": 0.011, |
| "reward": 1.0375, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.94375, |
| "step": 2970 |
| }, |
| { |
| "completion_length": 232.7375, |
| "epoch": 0.6570864565646526, |
| "grad_norm": 0.4550467921271227, |
| "kl": 0.25955810546875, |
| "learning_rate": 6.34658975633605e-06, |
| "loss": 0.0104, |
| "reward": 1.09375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.9875, |
| "step": 2975 |
| }, |
| { |
| "completion_length": 237.9875, |
| "epoch": 0.6581908035504755, |
| "grad_norm": 0.5807970338558914, |
| "kl": 0.282135009765625, |
| "learning_rate": 6.310725664357349e-06, |
| "loss": 0.0113, |
| "reward": 1.05, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.95, |
| "step": 2980 |
| }, |
| { |
| "completion_length": 246.35625, |
| "epoch": 0.6592951505362985, |
| "grad_norm": 0.1538823043299478, |
| "kl": 0.321832275390625, |
| "learning_rate": 6.274916417514605e-06, |
| "loss": 0.0129, |
| "reward": 1.08125, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.98125, |
| "step": 2985 |
| }, |
| { |
| "completion_length": 265.53125, |
| "epoch": 0.6603994975221215, |
| "grad_norm": 0.41231875233030124, |
| "kl": 0.287841796875, |
| "learning_rate": 6.239162548151809e-06, |
| "loss": 0.0115, |
| "reward": 1.14375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.99375, |
| "step": 2990 |
| }, |
| { |
| "completion_length": 282.34375, |
| "epoch": 0.6615038445079444, |
| "grad_norm": 0.4537223091285934, |
| "kl": 0.29276123046875, |
| "learning_rate": 6.2034645877897e-06, |
| "loss": 0.0117, |
| "reward": 1.1125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.95625, |
| "step": 2995 |
| }, |
| { |
| "completion_length": 251.75, |
| "epoch": 0.6626081914937674, |
| "grad_norm": 0.5005714464427982, |
| "kl": 0.30860595703125, |
| "learning_rate": 6.167823067117868e-06, |
| "loss": 0.0123, |
| "reward": 1.1125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.975, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.6626081914937674, |
| "eval_completion_length": 255.26, |
| "eval_kl": 0.29595703125, |
| "eval_loss": 0.011857852339744568, |
| "eval_reward": 1.14, |
| "eval_reward_std": 0.12727921843528747, |
| "eval_rewards/accuracy_reward": 0.16, |
| "eval_rewards/format_reward": 0.98, |
| "eval_runtime": 109.8303, |
| "eval_samples_per_second": 0.901, |
| "eval_steps_per_second": 0.228, |
| "step": 3000 |
| }, |
| { |
| "completion_length": 300.43125, |
| "epoch": 0.6637125384795903, |
| "grad_norm": 0.32746973812290947, |
| "kl": 0.308380126953125, |
| "learning_rate": 6.132238515986868e-06, |
| "loss": 0.0123, |
| "reward": 1.04375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.94375, |
| "step": 3005 |
| }, |
| { |
| "completion_length": 243.6375, |
| "epoch": 0.6648168854654133, |
| "grad_norm": 0.4677149628391746, |
| "kl": 0.294287109375, |
| "learning_rate": 6.096711463400333e-06, |
| "loss": 0.0118, |
| "reward": 1.1125, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.98125, |
| "step": 3010 |
| }, |
| { |
| "completion_length": 247.98125, |
| "epoch": 0.6659212324512361, |
| "grad_norm": 0.5439611294896475, |
| "kl": 0.281396484375, |
| "learning_rate": 6.061242437507131e-06, |
| "loss": 0.0113, |
| "reward": 1.14375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1625, |
| "rewards/format_reward": 0.98125, |
| "step": 3015 |
| }, |
| { |
| "completion_length": 275.20625, |
| "epoch": 0.6670255794370591, |
| "grad_norm": 0.4749719977776336, |
| "kl": 0.31934814453125, |
| "learning_rate": 6.025831965593479e-06, |
| "loss": 0.0128, |
| "reward": 1.1125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.95625, |
| "step": 3020 |
| }, |
| { |
| "completion_length": 256.54375, |
| "epoch": 0.668129926422882, |
| "grad_norm": 0.3744944357004974, |
| "kl": 0.32305908203125, |
| "learning_rate": 5.990480574075143e-06, |
| "loss": 0.0129, |
| "reward": 1.00625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.94375, |
| "step": 3025 |
| }, |
| { |
| "completion_length": 245.075, |
| "epoch": 0.669234273408705, |
| "grad_norm": 0.36682653313436625, |
| "kl": 0.2878173828125, |
| "learning_rate": 5.955188788489583e-06, |
| "loss": 0.0115, |
| "reward": 1.0375, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.95, |
| "step": 3030 |
| }, |
| { |
| "completion_length": 228.79375, |
| "epoch": 0.670338620394528, |
| "grad_norm": 0.5471531880195812, |
| "kl": 0.2614013671875, |
| "learning_rate": 5.919957133488155e-06, |
| "loss": 0.0105, |
| "reward": 1.0625, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.975, |
| "step": 3035 |
| }, |
| { |
| "completion_length": 281.49375, |
| "epoch": 0.6714429673803509, |
| "grad_norm": 0.44721939894789436, |
| "kl": 0.313287353515625, |
| "learning_rate": 5.884786132828304e-06, |
| "loss": 0.0125, |
| "reward": 1.05625, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9375, |
| "step": 3040 |
| }, |
| { |
| "completion_length": 220.5375, |
| "epoch": 0.6725473143661739, |
| "grad_norm": 0.4903267420807382, |
| "kl": 0.280303955078125, |
| "learning_rate": 5.849676309365786e-06, |
| "loss": 0.0112, |
| "reward": 1.0625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.95625, |
| "step": 3045 |
| }, |
| { |
| "completion_length": 213.80625, |
| "epoch": 0.6736516613519968, |
| "grad_norm": 0.6637137632075046, |
| "kl": 0.3021484375, |
| "learning_rate": 5.814628185046884e-06, |
| "loss": 0.0121, |
| "reward": 1.075, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.94375, |
| "step": 3050 |
| }, |
| { |
| "completion_length": 193.11875, |
| "epoch": 0.6747560083378198, |
| "grad_norm": 0.48238872145583594, |
| "kl": 0.3324951171875, |
| "learning_rate": 5.779642280900668e-06, |
| "loss": 0.0133, |
| "reward": 1.05625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.96875, |
| "step": 3055 |
| }, |
| { |
| "completion_length": 157.91875, |
| "epoch": 0.6758603553236426, |
| "grad_norm": 0.3447631853492008, |
| "kl": 0.324847412109375, |
| "learning_rate": 5.744719117031217e-06, |
| "loss": 0.013, |
| "reward": 1.08125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.975, |
| "step": 3060 |
| }, |
| { |
| "completion_length": 142.34375, |
| "epoch": 0.6769647023094656, |
| "grad_norm": 0.2144181948765553, |
| "kl": 0.333251953125, |
| "learning_rate": 5.709859212609919e-06, |
| "loss": 0.0133, |
| "reward": 1.075, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.98125, |
| "step": 3065 |
| }, |
| { |
| "completion_length": 142.88125, |
| "epoch": 0.6780690492952886, |
| "grad_norm": 2.1794877397302637, |
| "kl": 0.31337890625, |
| "learning_rate": 5.675063085867747e-06, |
| "loss": 0.0125, |
| "reward": 1.0375, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.9875, |
| "step": 3070 |
| }, |
| { |
| "completion_length": 167.0625, |
| "epoch": 0.6791733962811115, |
| "grad_norm": 0.5227859895364816, |
| "kl": 0.31148681640625, |
| "learning_rate": 5.6403312540875325e-06, |
| "loss": 0.0125, |
| "reward": 1.075, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.98125, |
| "step": 3075 |
| }, |
| { |
| "completion_length": 183.3, |
| "epoch": 0.6802777432669345, |
| "grad_norm": 0.5037852453688297, |
| "kl": 0.3081787109375, |
| "learning_rate": 5.6056642335963e-06, |
| "loss": 0.0123, |
| "reward": 1.08125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.96875, |
| "step": 3080 |
| }, |
| { |
| "completion_length": 170.60625, |
| "epoch": 0.6813820902527574, |
| "grad_norm": 0.39645655269998004, |
| "kl": 0.38388671875, |
| "learning_rate": 5.571062539757582e-06, |
| "loss": 0.0154, |
| "reward": 1.0875, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9625, |
| "step": 3085 |
| }, |
| { |
| "completion_length": 178.66875, |
| "epoch": 0.6824864372385804, |
| "grad_norm": 0.3386051871333338, |
| "kl": 0.353631591796875, |
| "learning_rate": 5.536526686963762e-06, |
| "loss": 0.0141, |
| "reward": 1.08125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9625, |
| "step": 3090 |
| }, |
| { |
| "completion_length": 179.43125, |
| "epoch": 0.6835907842244033, |
| "grad_norm": 0.1674625268129377, |
| "kl": 0.311474609375, |
| "learning_rate": 5.50205718862841e-06, |
| "loss": 0.0125, |
| "reward": 1.09375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.9875, |
| "step": 3095 |
| }, |
| { |
| "completion_length": 198.5, |
| "epoch": 0.6846951312102263, |
| "grad_norm": 896.8738993114102, |
| "kl": 5.80845947265625, |
| "learning_rate": 5.467654557178679e-06, |
| "loss": 0.2331, |
| "reward": 1.0375, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9625, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.6846951312102263, |
| "eval_completion_length": 212.23, |
| "eval_kl": 0.383203125, |
| "eval_loss": 0.015370451845228672, |
| "eval_reward": 1.08, |
| "eval_reward_std": 0.15556348919868468, |
| "eval_rewards/accuracy_reward": 0.12, |
| "eval_rewards/format_reward": 0.96, |
| "eval_runtime": 115.3303, |
| "eval_samples_per_second": 0.858, |
| "eval_steps_per_second": 0.217, |
| "step": 3100 |
| }, |
| { |
| "completion_length": 191.40625, |
| "epoch": 0.6857994781960493, |
| "grad_norm": 0.23939982985583666, |
| "kl": 0.347125244140625, |
| "learning_rate": 5.433319304047666e-06, |
| "loss": 0.0139, |
| "reward": 1.0375, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.96875, |
| "step": 3105 |
| }, |
| { |
| "completion_length": 196.55, |
| "epoch": 0.6869038251818721, |
| "grad_norm": 0.35370609706150546, |
| "kl": 0.33798828125, |
| "learning_rate": 5.399051939666817e-06, |
| "loss": 0.0135, |
| "reward": 1.0875, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.975, |
| "step": 3110 |
| }, |
| { |
| "completion_length": 184.975, |
| "epoch": 0.688008172167695, |
| "grad_norm": 0.35760764478630763, |
| "kl": 0.30341796875, |
| "learning_rate": 5.36485297345833e-06, |
| "loss": 0.0121, |
| "reward": 1.1125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9875, |
| "step": 3115 |
| }, |
| { |
| "completion_length": 203.2, |
| "epoch": 0.689112519153518, |
| "grad_norm": 0.644558524459656, |
| "kl": 0.292742919921875, |
| "learning_rate": 5.330722913827594e-06, |
| "loss": 0.0117, |
| "reward": 1.0625, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.96875, |
| "step": 3120 |
| }, |
| { |
| "completion_length": 230.50625, |
| "epoch": 0.690216866139341, |
| "grad_norm": 0.3464456804053971, |
| "kl": 0.27896728515625, |
| "learning_rate": 5.29666226815563e-06, |
| "loss": 0.0112, |
| "reward": 1.1125, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.98125, |
| "step": 3125 |
| }, |
| { |
| "completion_length": 221.98125, |
| "epoch": 0.6913212131251639, |
| "grad_norm": 0.20402330430464163, |
| "kl": 0.286383056640625, |
| "learning_rate": 5.262671542791531e-06, |
| "loss": 0.0115, |
| "reward": 1.0, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.01875, |
| "rewards/format_reward": 0.98125, |
| "step": 3130 |
| }, |
| { |
| "completion_length": 218.54375, |
| "epoch": 0.6924255601109869, |
| "grad_norm": 0.5547640484775744, |
| "kl": 0.2820068359375, |
| "learning_rate": 5.228751243044961e-06, |
| "loss": 0.0113, |
| "reward": 1.04375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9625, |
| "step": 3135 |
| }, |
| { |
| "completion_length": 207.03125, |
| "epoch": 0.6935299070968098, |
| "grad_norm": 0.5409399988975135, |
| "kl": 0.27630615234375, |
| "learning_rate": 5.194901873178622e-06, |
| "loss": 0.0111, |
| "reward": 1.075, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 1.0, |
| "step": 3140 |
| }, |
| { |
| "completion_length": 188.34375, |
| "epoch": 0.6946342540826328, |
| "grad_norm": 0.15055443349583902, |
| "kl": 0.267608642578125, |
| "learning_rate": 5.1611239364007694e-06, |
| "loss": 0.0107, |
| "reward": 1.075, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.98125, |
| "step": 3145 |
| }, |
| { |
| "completion_length": 183.2125, |
| "epoch": 0.6957386010684558, |
| "grad_norm": 0.4919745608219073, |
| "kl": 0.27034912109375, |
| "learning_rate": 5.127417934857718e-06, |
| "loss": 0.0108, |
| "reward": 1.08125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.975, |
| "step": 3150 |
| }, |
| { |
| "completion_length": 193.4875, |
| "epoch": 0.6968429480542786, |
| "grad_norm": 0.08730250985788413, |
| "kl": 0.28955078125, |
| "learning_rate": 5.093784369626397e-06, |
| "loss": 0.0116, |
| "reward": 1.1125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9875, |
| "step": 3155 |
| }, |
| { |
| "completion_length": 209.58125, |
| "epoch": 0.6979472950401016, |
| "grad_norm": 0.32846616068602047, |
| "kl": 0.298028564453125, |
| "learning_rate": 5.060223740706883e-06, |
| "loss": 0.0119, |
| "reward": 1.06875, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9875, |
| "step": 3160 |
| }, |
| { |
| "completion_length": 219.4625, |
| "epoch": 0.6990516420259245, |
| "grad_norm": 1.1444441188491123, |
| "kl": 0.36575927734375, |
| "learning_rate": 5.026736547014981e-06, |
| "loss": 0.0146, |
| "reward": 1.01875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.9625, |
| "step": 3165 |
| }, |
| { |
| "completion_length": 239.91875, |
| "epoch": 0.7001559890117475, |
| "grad_norm": 0.8099775678937355, |
| "kl": 0.308984375, |
| "learning_rate": 4.993323286374787e-06, |
| "loss": 0.0124, |
| "reward": 1.025, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.95, |
| "step": 3170 |
| }, |
| { |
| "completion_length": 219.4125, |
| "epoch": 0.7012603359975704, |
| "grad_norm": 0.4265583411464448, |
| "kl": 0.29287109375, |
| "learning_rate": 4.959984455511313e-06, |
| "loss": 0.0117, |
| "reward": 1.08125, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.99375, |
| "step": 3175 |
| }, |
| { |
| "completion_length": 231.3125, |
| "epoch": 0.7023646829833934, |
| "grad_norm": 0.454103943617819, |
| "kl": 0.29195556640625, |
| "learning_rate": 4.926720550043089e-06, |
| "loss": 0.0117, |
| "reward": 1.06875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.98125, |
| "step": 3180 |
| }, |
| { |
| "completion_length": 197.88125, |
| "epoch": 0.7034690299692163, |
| "grad_norm": 0.4885755590367494, |
| "kl": 0.25499267578125, |
| "learning_rate": 4.893532064474787e-06, |
| "loss": 0.0102, |
| "reward": 1.08125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 1.0, |
| "step": 3185 |
| }, |
| { |
| "completion_length": 227.78125, |
| "epoch": 0.7045733769550393, |
| "grad_norm": 0.2719638893519356, |
| "kl": 0.270703125, |
| "learning_rate": 4.860419492189886e-06, |
| "loss": 0.0108, |
| "reward": 1.06875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.975, |
| "step": 3190 |
| }, |
| { |
| "completion_length": 216.89375, |
| "epoch": 0.7056777239408623, |
| "grad_norm": 0.6224163001095495, |
| "kl": 0.29207763671875, |
| "learning_rate": 4.827383325443331e-06, |
| "loss": 0.0117, |
| "reward": 1.05, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.9625, |
| "step": 3195 |
| }, |
| { |
| "completion_length": 260.0, |
| "epoch": 0.7067820709266852, |
| "grad_norm": 0.4086697490241937, |
| "kl": 0.273291015625, |
| "learning_rate": 4.794424055354213e-06, |
| "loss": 0.0109, |
| "reward": 1.08125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9625, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7067820709266852, |
| "eval_completion_length": 267.11, |
| "eval_kl": 0.30763671875, |
| "eval_loss": 0.012311533093452454, |
| "eval_reward": 1.09, |
| "eval_reward_std": 0.21213203012943269, |
| "eval_rewards/accuracy_reward": 0.14, |
| "eval_rewards/format_reward": 0.95, |
| "eval_runtime": 127.8367, |
| "eval_samples_per_second": 0.774, |
| "eval_steps_per_second": 0.196, |
| "step": 3200 |
| }, |
| { |
| "completion_length": 246.5875, |
| "epoch": 0.7078864179125081, |
| "grad_norm": 1.0355398852634203, |
| "kl": 0.26212158203125, |
| "learning_rate": 4.761542171898469e-06, |
| "loss": 0.0105, |
| "reward": 1.0625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.975, |
| "step": 3205 |
| }, |
| { |
| "completion_length": 282.29375, |
| "epoch": 0.708990764898331, |
| "grad_norm": 0.5137021254058943, |
| "kl": 0.3453125, |
| "learning_rate": 4.728738163901597e-06, |
| "loss": 0.0138, |
| "reward": 1.03125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.94375, |
| "step": 3210 |
| }, |
| { |
| "completion_length": 272.9, |
| "epoch": 0.710095111884154, |
| "grad_norm": 0.4557603593431217, |
| "kl": 0.3007080078125, |
| "learning_rate": 4.696012519031397e-06, |
| "loss": 0.012, |
| "reward": 1.06875, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95625, |
| "step": 3215 |
| }, |
| { |
| "completion_length": 300.46875, |
| "epoch": 0.7111994588699769, |
| "grad_norm": 0.9407435380966918, |
| "kl": 0.3644775390625, |
| "learning_rate": 4.663365723790698e-06, |
| "loss": 0.0146, |
| "reward": 1.0, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.90625, |
| "step": 3220 |
| }, |
| { |
| "completion_length": 282.19375, |
| "epoch": 0.7123038058557999, |
| "grad_norm": 0.48300892660454786, |
| "kl": 0.3069580078125, |
| "learning_rate": 4.630798263510162e-06, |
| "loss": 0.0123, |
| "reward": 1.05625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.95, |
| "step": 3225 |
| }, |
| { |
| "completion_length": 234.26875, |
| "epoch": 0.7134081528416228, |
| "grad_norm": 0.5153762702568343, |
| "kl": 0.33831787109375, |
| "learning_rate": 4.598310622341037e-06, |
| "loss": 0.0135, |
| "reward": 1.08125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9625, |
| "step": 3230 |
| }, |
| { |
| "completion_length": 283.26875, |
| "epoch": 0.7145124998274458, |
| "grad_norm": 0.2278004212721197, |
| "kl": 0.2323486328125, |
| "learning_rate": 4.565903283247981e-06, |
| "loss": 0.0093, |
| "reward": 1.13125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.975, |
| "step": 3235 |
| }, |
| { |
| "completion_length": 254.85, |
| "epoch": 0.7156168468132688, |
| "grad_norm": 0.4057412022574356, |
| "kl": 0.224237060546875, |
| "learning_rate": 4.533576728001858e-06, |
| "loss": 0.009, |
| "reward": 1.11875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.98125, |
| "step": 3240 |
| }, |
| { |
| "completion_length": 243.23125, |
| "epoch": 0.7167211937990917, |
| "grad_norm": 0.3936093803196274, |
| "kl": 0.2511962890625, |
| "learning_rate": 4.501331437172606e-06, |
| "loss": 0.01, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9625, |
| "step": 3245 |
| }, |
| { |
| "completion_length": 261.04375, |
| "epoch": 0.7178255407849147, |
| "grad_norm": 0.15034549538860667, |
| "kl": 0.28284912109375, |
| "learning_rate": 4.469167890122073e-06, |
| "loss": 0.0113, |
| "reward": 1.03125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.94375, |
| "step": 3250 |
| }, |
| { |
| "completion_length": 270.925, |
| "epoch": 0.7189298877707375, |
| "grad_norm": 0.5028341851811142, |
| "kl": 0.24959716796875, |
| "learning_rate": 4.437086564996891e-06, |
| "loss": 0.01, |
| "reward": 1.0375, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.95625, |
| "step": 3255 |
| }, |
| { |
| "completion_length": 264.66875, |
| "epoch": 0.7200342347565605, |
| "grad_norm": 0.5870825869850653, |
| "kl": 0.26156005859375, |
| "learning_rate": 4.405087938721376e-06, |
| "loss": 0.0105, |
| "reward": 1.0375, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.93125, |
| "step": 3260 |
| }, |
| { |
| "completion_length": 333.51875, |
| "epoch": 0.7211385817423834, |
| "grad_norm": 0.5363985927856229, |
| "kl": 0.268310546875, |
| "learning_rate": 4.373172486990436e-06, |
| "loss": 0.0107, |
| "reward": 1.03125, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9125, |
| "step": 3265 |
| }, |
| { |
| "completion_length": 259.7625, |
| "epoch": 0.7222429287282064, |
| "grad_norm": 0.34294137834570276, |
| "kl": 0.25391845703125, |
| "learning_rate": 4.341340684262498e-06, |
| "loss": 0.0102, |
| "reward": 1.05625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9625, |
| "step": 3270 |
| }, |
| { |
| "completion_length": 279.15625, |
| "epoch": 0.7233472757140293, |
| "grad_norm": 0.45234678819267615, |
| "kl": 0.2611572265625, |
| "learning_rate": 4.309593003752446e-06, |
| "loss": 0.0104, |
| "reward": 1.0875, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.9375, |
| "step": 3275 |
| }, |
| { |
| "completion_length": 297.00625, |
| "epoch": 0.7244516226998523, |
| "grad_norm": 0.40479995606264946, |
| "kl": 0.280712890625, |
| "learning_rate": 4.277929917424602e-06, |
| "loss": 0.0112, |
| "reward": 1.0125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.925, |
| "step": 3280 |
| }, |
| { |
| "completion_length": 241.7875, |
| "epoch": 0.7255559696856753, |
| "grad_norm": 0.31716356544020063, |
| "kl": 0.2287841796875, |
| "learning_rate": 4.246351895985702e-06, |
| "loss": 0.0091, |
| "reward": 1.04375, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.9375, |
| "step": 3285 |
| }, |
| { |
| "completion_length": 254.69375, |
| "epoch": 0.7266603166714982, |
| "grad_norm": 0.3098099830382794, |
| "kl": 0.24783935546875, |
| "learning_rate": 4.214859408877899e-06, |
| "loss": 0.0099, |
| "reward": 1.05625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9625, |
| "step": 3290 |
| }, |
| { |
| "completion_length": 236.5875, |
| "epoch": 0.7277646636573212, |
| "grad_norm": 0.2207504226236474, |
| "kl": 0.2484619140625, |
| "learning_rate": 4.183452924271776e-06, |
| "loss": 0.0099, |
| "reward": 1.0375, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.96875, |
| "step": 3295 |
| }, |
| { |
| "completion_length": 224.6625, |
| "epoch": 0.728869010643144, |
| "grad_norm": 0.5845311907558509, |
| "kl": 0.25625, |
| "learning_rate": 4.152132909059402e-06, |
| "loss": 0.0103, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.975, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.728869010643144, |
| "eval_completion_length": 241.69, |
| "eval_kl": 0.31572265625, |
| "eval_loss": 0.012639479711651802, |
| "eval_reward": 1.09, |
| "eval_reward_std": 0.1414213538169861, |
| "eval_rewards/accuracy_reward": 0.135, |
| "eval_rewards/format_reward": 0.955, |
| "eval_runtime": 118.6805, |
| "eval_samples_per_second": 0.834, |
| "eval_steps_per_second": 0.211, |
| "step": 3300 |
| }, |
| { |
| "completion_length": 215.5125, |
| "epoch": 0.729973357628967, |
| "grad_norm": 0.30909422545672033, |
| "kl": 0.245867919921875, |
| "learning_rate": 4.120899828847385e-06, |
| "loss": 0.0098, |
| "reward": 1.0875, |
| "reward_std": 0.05303300768136978, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9875, |
| "step": 3305 |
| }, |
| { |
| "completion_length": 230.26875, |
| "epoch": 0.7310777046147899, |
| "grad_norm": 0.6453873653199322, |
| "kl": 0.260516357421875, |
| "learning_rate": 4.089754147949935e-06, |
| "loss": 0.0104, |
| "reward": 1.08125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.95625, |
| "step": 3310 |
| }, |
| { |
| "completion_length": 210.56875, |
| "epoch": 0.7321820516006129, |
| "grad_norm": 0.40594249764413265, |
| "kl": 0.229119873046875, |
| "learning_rate": 4.058696329381987e-06, |
| "loss": 0.0092, |
| "reward": 1.1125, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.98125, |
| "step": 3315 |
| }, |
| { |
| "completion_length": 212.29375, |
| "epoch": 0.7332863985864359, |
| "grad_norm": 0.38422267389292253, |
| "kl": 0.2646240234375, |
| "learning_rate": 4.027726834852303e-06, |
| "loss": 0.0106, |
| "reward": 1.0875, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9625, |
| "step": 3320 |
| }, |
| { |
| "completion_length": 232.43125, |
| "epoch": 0.7343907455722588, |
| "grad_norm": 0.5042182184524241, |
| "kl": 0.2716796875, |
| "learning_rate": 3.996846124756609e-06, |
| "loss": 0.0109, |
| "reward": 1.05, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.95, |
| "step": 3325 |
| }, |
| { |
| "completion_length": 219.50625, |
| "epoch": 0.7354950925580818, |
| "grad_norm": 0.5264628768885443, |
| "kl": 0.272119140625, |
| "learning_rate": 3.966054658170754e-06, |
| "loss": 0.0109, |
| "reward": 1.0875, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.95625, |
| "step": 3330 |
| }, |
| { |
| "completion_length": 223.0125, |
| "epoch": 0.7365994395439047, |
| "grad_norm": 0.2967573269006475, |
| "kl": 0.258892822265625, |
| "learning_rate": 3.93535289284388e-06, |
| "loss": 0.0104, |
| "reward": 1.075, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.96875, |
| "step": 3335 |
| }, |
| { |
| "completion_length": 256.3625, |
| "epoch": 0.7377037865297277, |
| "grad_norm": 0.35416855035423694, |
| "kl": 0.2759521484375, |
| "learning_rate": 3.904741285191629e-06, |
| "loss": 0.011, |
| "reward": 1.08125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.94375, |
| "step": 3340 |
| }, |
| { |
| "completion_length": 237.9875, |
| "epoch": 0.7388081335155506, |
| "grad_norm": 0.5938232640376352, |
| "kl": 0.283270263671875, |
| "learning_rate": 3.874220290289337e-06, |
| "loss": 0.0113, |
| "reward": 1.15625, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.19375, |
| "rewards/format_reward": 0.9625, |
| "step": 3345 |
| }, |
| { |
| "completion_length": 249.93125, |
| "epoch": 0.7399124805013735, |
| "grad_norm": 0.25454486548911043, |
| "kl": 0.254449462890625, |
| "learning_rate": 3.8437903618652895e-06, |
| "loss": 0.0102, |
| "reward": 1.0625, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9625, |
| "step": 3350 |
| }, |
| { |
| "completion_length": 221.375, |
| "epoch": 0.7410168274871964, |
| "grad_norm": 0.2326519300763832, |
| "kl": 0.24263916015625, |
| "learning_rate": 3.8134519522939693e-06, |
| "loss": 0.0097, |
| "reward": 1.0875, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9875, |
| "step": 3355 |
| }, |
| { |
| "completion_length": 224.9375, |
| "epoch": 0.7421211744730194, |
| "grad_norm": 0.4822164383039262, |
| "kl": 0.2813232421875, |
| "learning_rate": 3.7832055125893318e-06, |
| "loss": 0.0113, |
| "reward": 1.10625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.96875, |
| "step": 3360 |
| }, |
| { |
| "completion_length": 230.09375, |
| "epoch": 0.7432255214588424, |
| "grad_norm": 0.6372609601101804, |
| "kl": 0.298992919921875, |
| "learning_rate": 3.753051492398089e-06, |
| "loss": 0.012, |
| "reward": 1.14375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.19375, |
| "rewards/format_reward": 0.95, |
| "step": 3365 |
| }, |
| { |
| "completion_length": 219.25, |
| "epoch": 0.7443298684446653, |
| "grad_norm": 0.3081391116247598, |
| "kl": 0.2989990234375, |
| "learning_rate": 3.7229903399930423e-06, |
| "loss": 0.012, |
| "reward": 1.1, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.9625, |
| "step": 3370 |
| }, |
| { |
| "completion_length": 155.59375, |
| "epoch": 0.7454342154304883, |
| "grad_norm": 0.5909119308745682, |
| "kl": 0.31363525390625, |
| "learning_rate": 3.6930225022664136e-06, |
| "loss": 0.0125, |
| "reward": 1.11875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.98125, |
| "step": 3375 |
| }, |
| { |
| "completion_length": 187.8375, |
| "epoch": 0.7465385624163112, |
| "grad_norm": 0.34179198645052977, |
| "kl": 0.3404541015625, |
| "learning_rate": 3.6631484247231896e-06, |
| "loss": 0.0136, |
| "reward": 1.04375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.95, |
| "step": 3380 |
| }, |
| { |
| "completion_length": 212.6, |
| "epoch": 0.7476429094021342, |
| "grad_norm": 0.75092071011766, |
| "kl": 0.36864013671875, |
| "learning_rate": 3.6333685514745165e-06, |
| "loss": 0.0147, |
| "reward": 1.075, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.925, |
| "step": 3385 |
| }, |
| { |
| "completion_length": 193.43125, |
| "epoch": 0.7487472563879571, |
| "grad_norm": 0.329990173152014, |
| "kl": 0.365625, |
| "learning_rate": 3.6036833252310887e-06, |
| "loss": 0.0146, |
| "reward": 1.0625, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95, |
| "step": 3390 |
| }, |
| { |
| "completion_length": 201.61875, |
| "epoch": 0.7498516033737801, |
| "grad_norm": 0.47149722693689095, |
| "kl": 0.37044677734375, |
| "learning_rate": 3.574093187296568e-06, |
| "loss": 0.0148, |
| "reward": 1.075, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.95, |
| "step": 3395 |
| }, |
| { |
| "completion_length": 210.40625, |
| "epoch": 0.7509559503596029, |
| "grad_norm": 0.6186398549798994, |
| "kl": 0.31754150390625, |
| "learning_rate": 3.544598577561016e-06, |
| "loss": 0.0127, |
| "reward": 1.06875, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.96875, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.7509559503596029, |
| "eval_completion_length": 217.785, |
| "eval_kl": 0.5054296875, |
| "eval_loss": 0.020201342180371284, |
| "eval_reward": 1.095, |
| "eval_reward_std": 0.162634556889534, |
| "eval_rewards/accuracy_reward": 0.13, |
| "eval_rewards/format_reward": 0.965, |
| "eval_runtime": 115.0896, |
| "eval_samples_per_second": 0.86, |
| "eval_steps_per_second": 0.217, |
| "step": 3400 |
| }, |
| { |
| "completion_length": 220.7875, |
| "epoch": 0.7520602973454259, |
| "grad_norm": 0.502156340630982, |
| "kl": 0.31522216796875, |
| "learning_rate": 3.515199934494373e-06, |
| "loss": 0.0126, |
| "reward": 1.0625, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9375, |
| "step": 3405 |
| }, |
| { |
| "completion_length": 235.6625, |
| "epoch": 0.7531646443312489, |
| "grad_norm": 0.405309226466982, |
| "kl": 0.34342041015625, |
| "learning_rate": 3.4858976951399237e-06, |
| "loss": 0.0137, |
| "reward": 1.075, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.9625, |
| "step": 3410 |
| }, |
| { |
| "completion_length": 207.125, |
| "epoch": 0.7542689913170718, |
| "grad_norm": 0.42471895189637104, |
| "kl": 0.37327880859375, |
| "learning_rate": 3.4566922951078086e-06, |
| "loss": 0.0149, |
| "reward": 1.10625, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.9625, |
| "step": 3415 |
| }, |
| { |
| "completion_length": 204.0625, |
| "epoch": 0.7553733383028948, |
| "grad_norm": 0.328073526920033, |
| "kl": 0.277392578125, |
| "learning_rate": 3.427584168568535e-06, |
| "loss": 0.0111, |
| "reward": 1.10625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.975, |
| "step": 3420 |
| }, |
| { |
| "completion_length": 205.0875, |
| "epoch": 0.7564776852887177, |
| "grad_norm": 0.5369831637398775, |
| "kl": 0.2722412109375, |
| "learning_rate": 3.398573748246544e-06, |
| "loss": 0.0109, |
| "reward": 1.175, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.9875, |
| "step": 3425 |
| }, |
| { |
| "completion_length": 271.4875, |
| "epoch": 0.7575820322745407, |
| "grad_norm": 0.5616908933906252, |
| "kl": 0.249615478515625, |
| "learning_rate": 3.3696614654137637e-06, |
| "loss": 0.01, |
| "reward": 0.9625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.93125, |
| "step": 3430 |
| }, |
| { |
| "completion_length": 246.775, |
| "epoch": 0.7586863792603636, |
| "grad_norm": 0.602386946053219, |
| "kl": 0.25421142578125, |
| "learning_rate": 3.3408477498831917e-06, |
| "loss": 0.0102, |
| "reward": 1.1375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.18125, |
| "rewards/format_reward": 0.95625, |
| "step": 3435 |
| }, |
| { |
| "completion_length": 227.56875, |
| "epoch": 0.7597907262461866, |
| "grad_norm": 0.5462886025152357, |
| "kl": 0.259625244140625, |
| "learning_rate": 3.3121330300025222e-06, |
| "loss": 0.0104, |
| "reward": 1.1, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.975, |
| "step": 3440 |
| }, |
| { |
| "completion_length": 217.3875, |
| "epoch": 0.7608950732320094, |
| "grad_norm": 0.513835490839363, |
| "kl": 0.26868896484375, |
| "learning_rate": 3.2835177326477675e-06, |
| "loss": 0.0108, |
| "reward": 1.1125, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9875, |
| "step": 3445 |
| }, |
| { |
| "completion_length": 221.81875, |
| "epoch": 0.7619994202178324, |
| "grad_norm": 0.44817623889299235, |
| "kl": 0.251348876953125, |
| "learning_rate": 3.2550022832169125e-06, |
| "loss": 0.0101, |
| "reward": 1.05, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.96875, |
| "step": 3450 |
| }, |
| { |
| "completion_length": 201.45625, |
| "epoch": 0.7631037672036554, |
| "grad_norm": 0.3221696210681099, |
| "kl": 0.258599853515625, |
| "learning_rate": 3.2265871056235974e-06, |
| "loss": 0.0103, |
| "reward": 1.0875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.99375, |
| "step": 3455 |
| }, |
| { |
| "completion_length": 200.58125, |
| "epoch": 0.7642081141894783, |
| "grad_norm": 0.748323660705002, |
| "kl": 0.27359619140625, |
| "learning_rate": 3.1982726222908046e-06, |
| "loss": 0.0109, |
| "reward": 1.11875, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.975, |
| "step": 3460 |
| }, |
| { |
| "completion_length": 239.84375, |
| "epoch": 0.7653124611753013, |
| "grad_norm": 0.40897691141192144, |
| "kl": 0.24927978515625, |
| "learning_rate": 3.170059254144593e-06, |
| "loss": 0.01, |
| "reward": 1.06875, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.98125, |
| "step": 3465 |
| }, |
| { |
| "completion_length": 220.5125, |
| "epoch": 0.7664168081611242, |
| "grad_norm": 0.32005616347994614, |
| "kl": 0.26856689453125, |
| "learning_rate": 3.1419474206078203e-06, |
| "loss": 0.0107, |
| "reward": 1.1625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.18125, |
| "rewards/format_reward": 0.98125, |
| "step": 3470 |
| }, |
| { |
| "completion_length": 235.375, |
| "epoch": 0.7675211551469472, |
| "grad_norm": 0.38318241182760876, |
| "kl": 0.2571533203125, |
| "learning_rate": 3.113937539593931e-06, |
| "loss": 0.0103, |
| "reward": 1.09375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.9875, |
| "step": 3475 |
| }, |
| { |
| "completion_length": 265.41875, |
| "epoch": 0.7686255021327701, |
| "grad_norm": 0.5005202602287694, |
| "kl": 0.2830078125, |
| "learning_rate": 3.086030027500728e-06, |
| "loss": 0.0113, |
| "reward": 1.09375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.98125, |
| "step": 3480 |
| }, |
| { |
| "completion_length": 263.2875, |
| "epoch": 0.7697298491185931, |
| "grad_norm": 0.07189820608786429, |
| "kl": 0.28231201171875, |
| "learning_rate": 3.058225299204195e-06, |
| "loss": 0.0113, |
| "reward": 1.05625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.98125, |
| "step": 3485 |
| }, |
| { |
| "completion_length": 255.85, |
| "epoch": 0.7708341961044161, |
| "grad_norm": 0.5124546790054572, |
| "kl": 0.28914794921875, |
| "learning_rate": 3.0305237680523046e-06, |
| "loss": 0.0116, |
| "reward": 1.1, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.98125, |
| "step": 3490 |
| }, |
| { |
| "completion_length": 243.6625, |
| "epoch": 0.7719385430902389, |
| "grad_norm": 0.2684526887471308, |
| "kl": 0.257568359375, |
| "learning_rate": 3.002925845858905e-06, |
| "loss": 0.0103, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.05625, |
| "rewards/format_reward": 0.98125, |
| "step": 3495 |
| }, |
| { |
| "completion_length": 279.725, |
| "epoch": 0.7730428900760619, |
| "grad_norm": 0.45994470057081904, |
| "kl": 0.265185546875, |
| "learning_rate": 2.9754319428975796e-06, |
| "loss": 0.0106, |
| "reward": 1.125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.1625, |
| "rewards/format_reward": 0.9625, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.7730428900760619, |
| "eval_completion_length": 227.09, |
| "eval_kl": 0.26388671875, |
| "eval_loss": 0.010543497279286385, |
| "eval_reward": 1.095, |
| "eval_reward_std": 0.1484924215078354, |
| "eval_rewards/accuracy_reward": 0.115, |
| "eval_rewards/format_reward": 0.98, |
| "eval_runtime": 102.2929, |
| "eval_samples_per_second": 0.968, |
| "eval_steps_per_second": 0.244, |
| "step": 3500 |
| }, |
| { |
| "completion_length": 277.59375, |
| "epoch": 0.7741472370618848, |
| "grad_norm": 0.4448459387591611, |
| "kl": 0.287054443359375, |
| "learning_rate": 2.948042467895544e-06, |
| "loss": 0.0115, |
| "reward": 1.05625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9625, |
| "step": 3505 |
| }, |
| { |
| "completion_length": 259.24375, |
| "epoch": 0.7752515840477078, |
| "grad_norm": 0.4966119961041164, |
| "kl": 0.29287109375, |
| "learning_rate": 2.920757828027586e-06, |
| "loss": 0.0117, |
| "reward": 1.03125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9625, |
| "step": 3510 |
| }, |
| { |
| "completion_length": 286.8, |
| "epoch": 0.7763559310335307, |
| "grad_norm": 0.484519618170077, |
| "kl": 0.2783935546875, |
| "learning_rate": 2.893578428909998e-06, |
| "loss": 0.0111, |
| "reward": 1.10625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.9625, |
| "step": 3515 |
| }, |
| { |
| "completion_length": 236.9875, |
| "epoch": 0.7774602780193537, |
| "grad_norm": 0.5777828810239061, |
| "kl": 0.27120361328125, |
| "learning_rate": 2.8665046745945555e-06, |
| "loss": 0.0109, |
| "reward": 1.1, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.95625, |
| "step": 3520 |
| }, |
| { |
| "completion_length": 269.5625, |
| "epoch": 0.7785646250051766, |
| "grad_norm": 0.21783452027900907, |
| "kl": 0.25440673828125, |
| "learning_rate": 2.839536967562504e-06, |
| "loss": 0.0102, |
| "reward": 1.10625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.98125, |
| "step": 3525 |
| }, |
| { |
| "completion_length": 254.6625, |
| "epoch": 0.7796689719909996, |
| "grad_norm": 0.5865883286348121, |
| "kl": 0.233740234375, |
| "learning_rate": 2.8126757087185797e-06, |
| "loss": 0.0093, |
| "reward": 1.1, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.975, |
| "step": 3530 |
| }, |
| { |
| "completion_length": 280.525, |
| "epoch": 0.7807733189768226, |
| "grad_norm": 0.43883058743962794, |
| "kl": 0.271136474609375, |
| "learning_rate": 2.7859212973850535e-06, |
| "loss": 0.0108, |
| "reward": 1.075, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.9375, |
| "step": 3535 |
| }, |
| { |
| "completion_length": 205.775, |
| "epoch": 0.7818776659626455, |
| "grad_norm": 0.606176189949368, |
| "kl": 0.27373046875, |
| "learning_rate": 2.759274131295787e-06, |
| "loss": 0.0109, |
| "reward": 1.11875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.975, |
| "step": 3540 |
| }, |
| { |
| "completion_length": 260.25, |
| "epoch": 0.7829820129484684, |
| "grad_norm": 0.3955866589392802, |
| "kl": 0.270849609375, |
| "learning_rate": 2.732734606590318e-06, |
| "loss": 0.0108, |
| "reward": 1.03125, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.93125, |
| "step": 3545 |
| }, |
| { |
| "completion_length": 262.69375, |
| "epoch": 0.7840863599342913, |
| "grad_norm": 0.3782847315356218, |
| "kl": 0.275299072265625, |
| "learning_rate": 2.7063031178079847e-06, |
| "loss": 0.011, |
| "reward": 1.0625, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.94375, |
| "step": 3550 |
| }, |
| { |
| "completion_length": 250.175, |
| "epoch": 0.7851907069201143, |
| "grad_norm": 0.27828806764961916, |
| "kl": 0.2863525390625, |
| "learning_rate": 2.679980057882049e-06, |
| "loss": 0.0115, |
| "reward": 1.00625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9375, |
| "step": 3555 |
| }, |
| { |
| "completion_length": 235.0625, |
| "epoch": 0.7862950539059372, |
| "grad_norm": 0.7313042767403699, |
| "kl": 0.284637451171875, |
| "learning_rate": 2.6537658181338534e-06, |
| "loss": 0.0114, |
| "reward": 1.08125, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.9375, |
| "step": 3560 |
| }, |
| { |
| "completion_length": 222.3625, |
| "epoch": 0.7873994008917602, |
| "grad_norm": 0.40644071357218936, |
| "kl": 0.286981201171875, |
| "learning_rate": 2.6276607882670135e-06, |
| "loss": 0.0115, |
| "reward": 1.075, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.9625, |
| "step": 3565 |
| }, |
| { |
| "completion_length": 231.09375, |
| "epoch": 0.7885037478775831, |
| "grad_norm": 0.6111130783275136, |
| "kl": 0.306976318359375, |
| "learning_rate": 2.60166535636162e-06, |
| "loss": 0.0123, |
| "reward": 1.1375, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.18125, |
| "rewards/format_reward": 0.95625, |
| "step": 3570 |
| }, |
| { |
| "completion_length": 216.48125, |
| "epoch": 0.7896080948634061, |
| "grad_norm": 0.49711551639970475, |
| "kl": 0.25562744140625, |
| "learning_rate": 2.5757799088684654e-06, |
| "loss": 0.0102, |
| "reward": 1.1875, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.19375, |
| "rewards/format_reward": 0.99375, |
| "step": 3575 |
| }, |
| { |
| "completion_length": 218.78125, |
| "epoch": 0.7907124418492291, |
| "grad_norm": 0.31764284101121393, |
| "kl": 0.30023193359375, |
| "learning_rate": 2.5500048306033065e-06, |
| "loss": 0.012, |
| "reward": 1.06875, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95625, |
| "step": 3580 |
| }, |
| { |
| "completion_length": 216.31875, |
| "epoch": 0.791816788835052, |
| "grad_norm": 0.4787122183538524, |
| "kl": 0.2999267578125, |
| "learning_rate": 2.5243405047411353e-06, |
| "loss": 0.012, |
| "reward": 1.09375, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.975, |
| "step": 3585 |
| }, |
| { |
| "completion_length": 249.625, |
| "epoch": 0.7929211358208749, |
| "grad_norm": 0.3111958011260876, |
| "kl": 0.28712158203125, |
| "learning_rate": 2.498787312810492e-06, |
| "loss": 0.0115, |
| "reward": 1.05, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.94375, |
| "step": 3590 |
| }, |
| { |
| "completion_length": 231.98125, |
| "epoch": 0.7940254828066978, |
| "grad_norm": 0.39194633386336397, |
| "kl": 0.28707275390625, |
| "learning_rate": 2.4733456346877817e-06, |
| "loss": 0.0115, |
| "reward": 1.05, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.95625, |
| "step": 3595 |
| }, |
| { |
| "completion_length": 175.5375, |
| "epoch": 0.7951298297925208, |
| "grad_norm": 0.42822142675950153, |
| "kl": 0.3134765625, |
| "learning_rate": 2.448015848591638e-06, |
| "loss": 0.0125, |
| "reward": 1.10625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.98125, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.7951298297925208, |
| "eval_completion_length": 171.885, |
| "eval_kl": 0.3194921875, |
| "eval_loss": 0.012775387614965439, |
| "eval_reward": 1.14, |
| "eval_reward_std": 0.15556348919868468, |
| "eval_rewards/accuracy_reward": 0.16, |
| "eval_rewards/format_reward": 0.98, |
| "eval_runtime": 92.7145, |
| "eval_samples_per_second": 1.068, |
| "eval_steps_per_second": 0.27, |
| "step": 3600 |
| }, |
| { |
| "completion_length": 201.60625, |
| "epoch": 0.7962341767783437, |
| "grad_norm": 0.5464532193593336, |
| "kl": 0.325775146484375, |
| "learning_rate": 2.4227983310772963e-06, |
| "loss": 0.013, |
| "reward": 1.075, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.95625, |
| "step": 3605 |
| }, |
| { |
| "completion_length": 176.4625, |
| "epoch": 0.7973385237641667, |
| "grad_norm": 0.5021094659088707, |
| "kl": 0.36162109375, |
| "learning_rate": 2.3976934570309974e-06, |
| "loss": 0.0145, |
| "reward": 1.1125, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.96875, |
| "step": 3610 |
| }, |
| { |
| "completion_length": 167.06875, |
| "epoch": 0.7984428707499897, |
| "grad_norm": 0.3403630562580807, |
| "kl": 0.325738525390625, |
| "learning_rate": 2.3727015996644043e-06, |
| "loss": 0.013, |
| "reward": 1.125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.98125, |
| "step": 3615 |
| }, |
| { |
| "completion_length": 138.4625, |
| "epoch": 0.7995472177358126, |
| "grad_norm": 0.6077367973458568, |
| "kl": 0.333203125, |
| "learning_rate": 2.3478231305090694e-06, |
| "loss": 0.0133, |
| "reward": 1.13125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.99375, |
| "step": 3620 |
| }, |
| { |
| "completion_length": 177.075, |
| "epoch": 0.8006515647216356, |
| "grad_norm": 0.4466360553445801, |
| "kl": 0.325604248046875, |
| "learning_rate": 2.3230584194109074e-06, |
| "loss": 0.013, |
| "reward": 1.11875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.98125, |
| "step": 3625 |
| }, |
| { |
| "completion_length": 161.88125, |
| "epoch": 0.8017559117074585, |
| "grad_norm": 0.3897741643622985, |
| "kl": 0.345849609375, |
| "learning_rate": 2.298407834524682e-06, |
| "loss": 0.0138, |
| "reward": 1.0875, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.975, |
| "step": 3630 |
| }, |
| { |
| "completion_length": 166.9, |
| "epoch": 0.8028602586932815, |
| "grad_norm": 2.30034414615901, |
| "kl": 0.372119140625, |
| "learning_rate": 2.2738717423085543e-06, |
| "loss": 0.0149, |
| "reward": 1.10625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.98125, |
| "step": 3635 |
| }, |
| { |
| "completion_length": 238.34375, |
| "epoch": 0.8039646056791043, |
| "grad_norm": 0.6076643483832027, |
| "kl": 0.309075927734375, |
| "learning_rate": 2.2494505075186234e-06, |
| "loss": 0.0124, |
| "reward": 1.0875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.16875, |
| "rewards/format_reward": 0.91875, |
| "step": 3640 |
| }, |
| { |
| "completion_length": 181.86875, |
| "epoch": 0.8050689526649273, |
| "grad_norm": 0.2992763139298062, |
| "kl": 0.269927978515625, |
| "learning_rate": 2.2251444932035094e-06, |
| "loss": 0.0108, |
| "reward": 1.125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.98125, |
| "step": 3645 |
| }, |
| { |
| "completion_length": 164.825, |
| "epoch": 0.8061732996507502, |
| "grad_norm": 0.6026739836434083, |
| "kl": 0.284381103515625, |
| "learning_rate": 2.200954060698941e-06, |
| "loss": 0.0114, |
| "reward": 1.11875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.96875, |
| "step": 3650 |
| }, |
| { |
| "completion_length": 230.73125, |
| "epoch": 0.8072776466365732, |
| "grad_norm": 0.48565298064687734, |
| "kl": 0.30531005859375, |
| "learning_rate": 2.176879569622409e-06, |
| "loss": 0.0122, |
| "reward": 1.075, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.95625, |
| "step": 3655 |
| }, |
| { |
| "completion_length": 242.775, |
| "epoch": 0.8083819936223962, |
| "grad_norm": 0.21896055218236896, |
| "kl": 0.2802001953125, |
| "learning_rate": 2.1529213778677993e-06, |
| "loss": 0.0112, |
| "reward": 1.025, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.94375, |
| "step": 3660 |
| }, |
| { |
| "completion_length": 229.15, |
| "epoch": 0.8094863406082191, |
| "grad_norm": 0.14196401938191486, |
| "kl": 0.259161376953125, |
| "learning_rate": 2.1290798416000857e-06, |
| "loss": 0.0104, |
| "reward": 1.1, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.95625, |
| "step": 3665 |
| }, |
| { |
| "completion_length": 267.93125, |
| "epoch": 0.8105906875940421, |
| "grad_norm": 0.5415824445762728, |
| "kl": 0.263916015625, |
| "learning_rate": 2.1053553152500204e-06, |
| "loss": 0.0106, |
| "reward": 1.0125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.91875, |
| "step": 3670 |
| }, |
| { |
| "completion_length": 234.425, |
| "epoch": 0.811695034579865, |
| "grad_norm": 0.44188434661367404, |
| "kl": 0.27322998046875, |
| "learning_rate": 2.081748151508883e-06, |
| "loss": 0.0109, |
| "reward": 1.075, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.95625, |
| "step": 3675 |
| }, |
| { |
| "completion_length": 223.39375, |
| "epoch": 0.812799381565688, |
| "grad_norm": 0.29953298263136474, |
| "kl": 0.2898193359375, |
| "learning_rate": 2.0582587013232268e-06, |
| "loss": 0.0116, |
| "reward": 1.05625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.95625, |
| "step": 3680 |
| }, |
| { |
| "completion_length": 242.9125, |
| "epoch": 0.8139037285515108, |
| "grad_norm": 0.5105270540146248, |
| "kl": 0.28282470703125, |
| "learning_rate": 2.0348873138896563e-06, |
| "loss": 0.0113, |
| "reward": 1.0, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.05, |
| "rewards/format_reward": 0.95, |
| "step": 3685 |
| }, |
| { |
| "completion_length": 225.31875, |
| "epoch": 0.8150080755373338, |
| "grad_norm": 0.3698502677044578, |
| "kl": 0.252008056640625, |
| "learning_rate": 2.0116343366496493e-06, |
| "loss": 0.0101, |
| "reward": 1.0625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9625, |
| "step": 3690 |
| }, |
| { |
| "completion_length": 236.0875, |
| "epoch": 0.8161124225231567, |
| "grad_norm": 1.4290601982893592, |
| "kl": 0.321392822265625, |
| "learning_rate": 1.988500115284385e-06, |
| "loss": 0.0129, |
| "reward": 1.0375, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.95625, |
| "step": 3695 |
| }, |
| { |
| "completion_length": 211.28125, |
| "epoch": 0.8172167695089797, |
| "grad_norm": 0.3911358009799874, |
| "kl": 0.278375244140625, |
| "learning_rate": 1.9654849937096033e-06, |
| "loss": 0.0111, |
| "reward": 1.0625, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.975, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.8172167695089797, |
| "eval_completion_length": 202.245, |
| "eval_kl": 0.299296875, |
| "eval_loss": 0.011967692524194717, |
| "eval_reward": 1.13, |
| "eval_reward_std": 0.11313708305358887, |
| "eval_rewards/accuracy_reward": 0.14, |
| "eval_rewards/format_reward": 0.99, |
| "eval_runtime": 97.9771, |
| "eval_samples_per_second": 1.01, |
| "eval_steps_per_second": 0.255, |
| "step": 3700 |
| }, |
| { |
| "completion_length": 245.05, |
| "epoch": 0.8183211164948027, |
| "grad_norm": 0.6857212516430605, |
| "kl": 0.283929443359375, |
| "learning_rate": 1.942589314070494e-06, |
| "loss": 0.0114, |
| "reward": 1.05, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.94375, |
| "step": 3705 |
| }, |
| { |
| "completion_length": 207.8, |
| "epoch": 0.8194254634806256, |
| "grad_norm": 0.8931067063405094, |
| "kl": 0.33228759765625, |
| "learning_rate": 1.9198134167366156e-06, |
| "loss": 0.0133, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.96875, |
| "step": 3710 |
| }, |
| { |
| "completion_length": 231.15625, |
| "epoch": 0.8205298104664486, |
| "grad_norm": 4.939225391817678, |
| "kl": 0.328973388671875, |
| "learning_rate": 1.897157640296825e-06, |
| "loss": 0.0131, |
| "reward": 1.06875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95625, |
| "step": 3715 |
| }, |
| { |
| "completion_length": 213.325, |
| "epoch": 0.8216341574522715, |
| "grad_norm": 0.5141296496399171, |
| "kl": 0.280364990234375, |
| "learning_rate": 1.8746223215542482e-06, |
| "loss": 0.0112, |
| "reward": 1.09375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.96875, |
| "step": 3720 |
| }, |
| { |
| "completion_length": 216.3625, |
| "epoch": 0.8227385044380945, |
| "grad_norm": 0.49407058755769534, |
| "kl": 0.245660400390625, |
| "learning_rate": 1.8522077955212791e-06, |
| "loss": 0.0098, |
| "reward": 1.1375, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.98125, |
| "step": 3725 |
| }, |
| { |
| "completion_length": 206.43125, |
| "epoch": 0.8238428514239174, |
| "grad_norm": 0.2188098942709737, |
| "kl": 0.278680419921875, |
| "learning_rate": 1.8299143954145926e-06, |
| "loss": 0.0111, |
| "reward": 1.1, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.95, |
| "step": 3730 |
| }, |
| { |
| "completion_length": 211.05, |
| "epoch": 0.8249471984097403, |
| "grad_norm": 0.8180293925174863, |
| "kl": 0.28306884765625, |
| "learning_rate": 1.8077424526501964e-06, |
| "loss": 0.0113, |
| "reward": 1.0875, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.96875, |
| "step": 3735 |
| }, |
| { |
| "completion_length": 216.4625, |
| "epoch": 0.8260515453955632, |
| "grad_norm": 0.6158285951662569, |
| "kl": 0.28001708984375, |
| "learning_rate": 1.7856922968384926e-06, |
| "loss": 0.0112, |
| "reward": 1.0875, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.95625, |
| "step": 3740 |
| }, |
| { |
| "completion_length": 238.0125, |
| "epoch": 0.8271558923813862, |
| "grad_norm": 0.615093259382316, |
| "kl": 0.301104736328125, |
| "learning_rate": 1.763764255779392e-06, |
| "loss": 0.012, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.94375, |
| "step": 3745 |
| }, |
| { |
| "completion_length": 212.09375, |
| "epoch": 0.8282602393672092, |
| "grad_norm": 0.5625866842898283, |
| "kl": 0.2462158203125, |
| "learning_rate": 1.7419586554574364e-06, |
| "loss": 0.0098, |
| "reward": 1.14375, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.175, |
| "rewards/format_reward": 0.96875, |
| "step": 3750 |
| }, |
| { |
| "completion_length": 244.36875, |
| "epoch": 0.8293645863530321, |
| "grad_norm": 0.6830147990367013, |
| "kl": 0.36268310546875, |
| "learning_rate": 1.720275820036944e-06, |
| "loss": 0.0145, |
| "reward": 1.05, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.925, |
| "step": 3755 |
| }, |
| { |
| "completion_length": 185.325, |
| "epoch": 0.8304689333388551, |
| "grad_norm": 0.09647577383094562, |
| "kl": 0.28330078125, |
| "learning_rate": 1.6987160718572027e-06, |
| "loss": 0.0113, |
| "reward": 1.10625, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.975, |
| "step": 3760 |
| }, |
| { |
| "completion_length": 169.075, |
| "epoch": 0.831573280324678, |
| "grad_norm": 0.42190186308598165, |
| "kl": 0.2713623046875, |
| "learning_rate": 1.6772797314276712e-06, |
| "loss": 0.0109, |
| "reward": 1.11875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.98125, |
| "step": 3765 |
| }, |
| { |
| "completion_length": 209.425, |
| "epoch": 0.832677627310501, |
| "grad_norm": 0.4838527676338876, |
| "kl": 0.3133056640625, |
| "learning_rate": 1.6559671174232195e-06, |
| "loss": 0.0125, |
| "reward": 1.03125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.95, |
| "step": 3770 |
| }, |
| { |
| "completion_length": 196.15625, |
| "epoch": 0.833781974296324, |
| "grad_norm": 0.541789987335856, |
| "kl": 0.284918212890625, |
| "learning_rate": 1.6347785466793764e-06, |
| "loss": 0.0114, |
| "reward": 1.13125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.98125, |
| "step": 3775 |
| }, |
| { |
| "completion_length": 189.86875, |
| "epoch": 0.8348863212821469, |
| "grad_norm": 0.37944243070397565, |
| "kl": 0.305487060546875, |
| "learning_rate": 1.6137143341876439e-06, |
| "loss": 0.0122, |
| "reward": 1.09375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.99375, |
| "step": 3780 |
| }, |
| { |
| "completion_length": 192.91875, |
| "epoch": 0.8359906682679697, |
| "grad_norm": 0.4016733182249456, |
| "kl": 0.266162109375, |
| "learning_rate": 1.5927747930907921e-06, |
| "loss": 0.0106, |
| "reward": 1.08125, |
| "reward_std": 0.06187184229493141, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9875, |
| "step": 3785 |
| }, |
| { |
| "completion_length": 199.01875, |
| "epoch": 0.8370950152537927, |
| "grad_norm": 0.555504541233714, |
| "kl": 0.31944580078125, |
| "learning_rate": 1.5719602346782215e-06, |
| "loss": 0.0128, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9625, |
| "step": 3790 |
| }, |
| { |
| "completion_length": 191.89375, |
| "epoch": 0.8381993622396157, |
| "grad_norm": 0.6714397875203784, |
| "kl": 0.400775146484375, |
| "learning_rate": 1.5512709683813165e-06, |
| "loss": 0.016, |
| "reward": 1.15, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.9625, |
| "step": 3795 |
| }, |
| { |
| "completion_length": 211.68125, |
| "epoch": 0.8393037092254386, |
| "grad_norm": 0.3384649202970216, |
| "kl": 0.278900146484375, |
| "learning_rate": 1.5307073017688644e-06, |
| "loss": 0.0112, |
| "reward": 1.03125, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.9625, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.8393037092254386, |
| "eval_completion_length": 195.9, |
| "eval_kl": 0.30609375, |
| "eval_loss": 0.012239097617566586, |
| "eval_reward": 1.095, |
| "eval_reward_std": 0.14849242091178894, |
| "eval_rewards/accuracy_reward": 0.13, |
| "eval_rewards/format_reward": 0.965, |
| "eval_runtime": 100.9941, |
| "eval_samples_per_second": 0.98, |
| "eval_steps_per_second": 0.248, |
| "step": 3800 |
| }, |
| { |
| "completion_length": 217.4375, |
| "epoch": 0.8404080562112616, |
| "grad_norm": 0.6534309102123147, |
| "kl": 0.330548095703125, |
| "learning_rate": 1.5102695405424738e-06, |
| "loss": 0.0132, |
| "reward": 1.08125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.95625, |
| "step": 3805 |
| }, |
| { |
| "completion_length": 207.18125, |
| "epoch": 0.8415124031970845, |
| "grad_norm": 0.35957581300760044, |
| "kl": 0.358197021484375, |
| "learning_rate": 1.4899579885320237e-06, |
| "loss": 0.0143, |
| "reward": 1.0875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.94375, |
| "step": 3810 |
| }, |
| { |
| "completion_length": 222.85625, |
| "epoch": 0.8426167501829075, |
| "grad_norm": 0.5604436470793626, |
| "kl": 0.305682373046875, |
| "learning_rate": 1.4697729476911614e-06, |
| "loss": 0.0122, |
| "reward": 1.13125, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.18125, |
| "rewards/format_reward": 0.95, |
| "step": 3815 |
| }, |
| { |
| "completion_length": 202.29375, |
| "epoch": 0.8437210971687304, |
| "grad_norm": 0.4206816591710824, |
| "kl": 0.2802001953125, |
| "learning_rate": 1.449714718092803e-06, |
| "loss": 0.0112, |
| "reward": 1.08125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.98125, |
| "step": 3820 |
| }, |
| { |
| "completion_length": 205.2375, |
| "epoch": 0.8448254441545534, |
| "grad_norm": 0.16670952436146919, |
| "kl": 0.273931884765625, |
| "learning_rate": 1.4297835979246777e-06, |
| "loss": 0.011, |
| "reward": 1.075, |
| "reward_std": 0.07071067690849304, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.9875, |
| "step": 3825 |
| }, |
| { |
| "completion_length": 238.1875, |
| "epoch": 0.8459297911403763, |
| "grad_norm": 0.613398892634411, |
| "kl": 0.276251220703125, |
| "learning_rate": 1.4099798834848855e-06, |
| "loss": 0.0111, |
| "reward": 1.10625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.95, |
| "step": 3830 |
| }, |
| { |
| "completion_length": 261.0375, |
| "epoch": 0.8470341381261992, |
| "grad_norm": 0.47116863545999577, |
| "kl": 0.307647705078125, |
| "learning_rate": 1.3903038691775095e-06, |
| "loss": 0.0123, |
| "reward": 1.09375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.94375, |
| "step": 3835 |
| }, |
| { |
| "completion_length": 211.49375, |
| "epoch": 0.8481384851120222, |
| "grad_norm": 0.3456577853406588, |
| "kl": 0.296746826171875, |
| "learning_rate": 1.370755847508226e-06, |
| "loss": 0.0119, |
| "reward": 1.1125, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.975, |
| "step": 3840 |
| }, |
| { |
| "completion_length": 210.94375, |
| "epoch": 0.8492428320978451, |
| "grad_norm": 0.3986816328493071, |
| "kl": 0.29635009765625, |
| "learning_rate": 1.3513361090799537e-06, |
| "loss": 0.0119, |
| "reward": 1.1, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.96875, |
| "step": 3845 |
| }, |
| { |
| "completion_length": 216.41875, |
| "epoch": 0.8503471790836681, |
| "grad_norm": 0.36582880270044166, |
| "kl": 0.265789794921875, |
| "learning_rate": 1.332044942588545e-06, |
| "loss": 0.0106, |
| "reward": 1.14375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.18125, |
| "rewards/format_reward": 0.9625, |
| "step": 3850 |
| }, |
| { |
| "completion_length": 196.1875, |
| "epoch": 0.851451526069491, |
| "grad_norm": 0.43079415258986453, |
| "kl": 0.3136474609375, |
| "learning_rate": 1.3128826348184886e-06, |
| "loss": 0.0125, |
| "reward": 1.1625, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.19375, |
| "rewards/format_reward": 0.96875, |
| "step": 3855 |
| }, |
| { |
| "completion_length": 205.025, |
| "epoch": 0.852555873055314, |
| "grad_norm": 0.26604232127523036, |
| "kl": 0.30478515625, |
| "learning_rate": 1.2938494706386462e-06, |
| "loss": 0.0122, |
| "reward": 1.0875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.96875, |
| "step": 3860 |
| }, |
| { |
| "completion_length": 218.6, |
| "epoch": 0.853660220041137, |
| "grad_norm": 0.4745459719079689, |
| "kl": 0.239349365234375, |
| "learning_rate": 1.2749457329980108e-06, |
| "loss": 0.0096, |
| "reward": 1.13125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.975, |
| "step": 3865 |
| }, |
| { |
| "completion_length": 221.925, |
| "epoch": 0.8547645670269599, |
| "grad_norm": 1.0389906784554162, |
| "kl": 0.282757568359375, |
| "learning_rate": 1.256171702921516e-06, |
| "loss": 0.0113, |
| "reward": 1.125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.1625, |
| "rewards/format_reward": 0.9625, |
| "step": 3870 |
| }, |
| { |
| "completion_length": 227.575, |
| "epoch": 0.8558689140127829, |
| "grad_norm": 0.5325421261443589, |
| "kl": 0.273663330078125, |
| "learning_rate": 1.237527659505846e-06, |
| "loss": 0.0109, |
| "reward": 1.0625, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95, |
| "step": 3875 |
| }, |
| { |
| "completion_length": 253.325, |
| "epoch": 0.8569732609986057, |
| "grad_norm": 0.19847592400408953, |
| "kl": 0.26175537109375, |
| "learning_rate": 1.2190138799152851e-06, |
| "loss": 0.0105, |
| "reward": 1.05, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.94375, |
| "step": 3880 |
| }, |
| { |
| "completion_length": 252.20625, |
| "epoch": 0.8580776079844287, |
| "grad_norm": 0.6511065620949663, |
| "kl": 0.270318603515625, |
| "learning_rate": 1.200630639377609e-06, |
| "loss": 0.0108, |
| "reward": 1.0625, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.94375, |
| "step": 3885 |
| }, |
| { |
| "completion_length": 222.19375, |
| "epoch": 0.8591819549702516, |
| "grad_norm": 0.510836488773378, |
| "kl": 0.246490478515625, |
| "learning_rate": 1.1823782111799843e-06, |
| "loss": 0.0099, |
| "reward": 1.1375, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.18125, |
| "rewards/format_reward": 0.95625, |
| "step": 3890 |
| }, |
| { |
| "completion_length": 252.8125, |
| "epoch": 0.8602863019560746, |
| "grad_norm": 0.4973937206692706, |
| "kl": 0.240765380859375, |
| "learning_rate": 1.1642568666649067e-06, |
| "loss": 0.0096, |
| "reward": 1.08125, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.94375, |
| "step": 3895 |
| }, |
| { |
| "completion_length": 235.05625, |
| "epoch": 0.8613906489418975, |
| "grad_norm": 0.896473119654828, |
| "kl": 0.25704345703125, |
| "learning_rate": 1.1462668752261652e-06, |
| "loss": 0.0103, |
| "reward": 1.11875, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.9625, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.8613906489418975, |
| "eval_completion_length": 235.22, |
| "eval_kl": 0.29470703125, |
| "eval_loss": 0.011815370991826057, |
| "eval_reward": 1.07, |
| "eval_reward_std": 0.12727921783924104, |
| "eval_rewards/accuracy_reward": 0.11, |
| "eval_rewards/format_reward": 0.96, |
| "eval_runtime": 109.2786, |
| "eval_samples_per_second": 0.906, |
| "eval_steps_per_second": 0.229, |
| "step": 3900 |
| }, |
| { |
| "completion_length": 233.93125, |
| "epoch": 0.8624949959277205, |
| "grad_norm": 0.791697614971569, |
| "kl": 0.280670166015625, |
| "learning_rate": 1.1284085043048465e-06, |
| "loss": 0.0112, |
| "reward": 1.05625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.95625, |
| "step": 3905 |
| }, |
| { |
| "completion_length": 262.68125, |
| "epoch": 0.8635993429135435, |
| "grad_norm": 0.5051810763575918, |
| "kl": 0.27977294921875, |
| "learning_rate": 1.1106820193853484e-06, |
| "loss": 0.0112, |
| "reward": 1.0125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.075, |
| "rewards/format_reward": 0.9375, |
| "step": 3910 |
| }, |
| { |
| "completion_length": 251.48125, |
| "epoch": 0.8647036898993664, |
| "grad_norm": 0.5025881602992487, |
| "kl": 0.273834228515625, |
| "learning_rate": 1.0930876839914418e-06, |
| "loss": 0.011, |
| "reward": 1.06875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95625, |
| "step": 3915 |
| }, |
| { |
| "completion_length": 236.35625, |
| "epoch": 0.8658080368851894, |
| "grad_norm": 0.4129347471678857, |
| "kl": 0.2613372802734375, |
| "learning_rate": 1.0756257596823427e-06, |
| "loss": 0.0105, |
| "reward": 1.075, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.95625, |
| "step": 3920 |
| }, |
| { |
| "completion_length": 265.86875, |
| "epoch": 0.8669123838710123, |
| "grad_norm": 0.4235003049667533, |
| "kl": 0.253765869140625, |
| "learning_rate": 1.058296506048836e-06, |
| "loss": 0.0101, |
| "reward": 1.1, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.95, |
| "step": 3925 |
| }, |
| { |
| "completion_length": 232.975, |
| "epoch": 0.8680167308568352, |
| "grad_norm": 0.37693409083366114, |
| "kl": 0.2826416015625, |
| "learning_rate": 1.04110018070941e-06, |
| "loss": 0.0113, |
| "reward": 1.14375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.16875, |
| "rewards/format_reward": 0.975, |
| "step": 3930 |
| }, |
| { |
| "completion_length": 256.3375, |
| "epoch": 0.8691210778426581, |
| "grad_norm": 0.47005147974118267, |
| "kl": 0.28515625, |
| "learning_rate": 1.0240370393064235e-06, |
| "loss": 0.0114, |
| "reward": 1.125, |
| "reward_std": 0.2298096999526024, |
| "rewards/accuracy_reward": 0.175, |
| "rewards/format_reward": 0.95, |
| "step": 3935 |
| }, |
| { |
| "completion_length": 258.5625, |
| "epoch": 0.8702254248284811, |
| "grad_norm": 0.27709333139181463, |
| "kl": 0.31121826171875, |
| "learning_rate": 1.0071073355023097e-06, |
| "loss": 0.0124, |
| "reward": 1.0875, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.93125, |
| "step": 3940 |
| }, |
| { |
| "completion_length": 227.4875, |
| "epoch": 0.871329771814304, |
| "grad_norm": 0.2761772502885486, |
| "kl": 0.301312255859375, |
| "learning_rate": 9.903113209758098e-07, |
| "loss": 0.012, |
| "reward": 1.11875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.96875, |
| "step": 3945 |
| }, |
| { |
| "completion_length": 225.5875, |
| "epoch": 0.872434118800127, |
| "grad_norm": 0.26781461171540255, |
| "kl": 0.31710205078125, |
| "learning_rate": 9.736492454182211e-07, |
| "loss": 0.0127, |
| "reward": 1.1, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.9625, |
| "step": 3950 |
| }, |
| { |
| "completion_length": 241.3375, |
| "epoch": 0.87353846578595, |
| "grad_norm": 1.1383280325532497, |
| "kl": 0.262933349609375, |
| "learning_rate": 9.571213565296877e-07, |
| "loss": 0.0105, |
| "reward": 1.075, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.9625, |
| "step": 3955 |
| }, |
| { |
| "completion_length": 233.5, |
| "epoch": 0.8746428127717729, |
| "grad_norm": 0.29444920945103936, |
| "kl": 0.333721923828125, |
| "learning_rate": 9.407279000155311e-07, |
| "loss": 0.0133, |
| "reward": 1.075, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.93125, |
| "step": 3960 |
| }, |
| { |
| "completion_length": 219.39375, |
| "epoch": 0.8757471597575959, |
| "grad_norm": 0.42276681745389866, |
| "kl": 0.26292724609375, |
| "learning_rate": 9.244691195825794e-07, |
| "loss": 0.0105, |
| "reward": 1.1375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.1625, |
| "rewards/format_reward": 0.975, |
| "step": 3965 |
| }, |
| { |
| "completion_length": 269.35625, |
| "epoch": 0.8768515067434188, |
| "grad_norm": 0.5714466190012454, |
| "kl": 0.2780029296875, |
| "learning_rate": 9.0834525693555e-07, |
| "loss": 0.0111, |
| "reward": 1.0625, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9375, |
| "step": 3970 |
| }, |
| { |
| "completion_length": 221.74375, |
| "epoch": 0.8779558537292417, |
| "grad_norm": 0.5132142260680984, |
| "kl": 0.23480224609375, |
| "learning_rate": 8.923565517734633e-07, |
| "loss": 0.0094, |
| "reward": 1.09375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.98125, |
| "step": 3975 |
| }, |
| { |
| "completion_length": 239.19375, |
| "epoch": 0.8790602007150646, |
| "grad_norm": 0.6209622546123578, |
| "kl": 0.246160888671875, |
| "learning_rate": 8.765032417860753e-07, |
| "loss": 0.0099, |
| "reward": 1.15625, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.19375, |
| "rewards/format_reward": 0.9625, |
| "step": 3980 |
| }, |
| { |
| "completion_length": 230.28125, |
| "epoch": 0.8801645477008876, |
| "grad_norm": 0.4959744957429339, |
| "kl": 0.328594970703125, |
| "learning_rate": 8.607855626503403e-07, |
| "loss": 0.0132, |
| "reward": 1.1125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1625, |
| "rewards/format_reward": 0.95, |
| "step": 3985 |
| }, |
| { |
| "completion_length": 242.23125, |
| "epoch": 0.8812688946867105, |
| "grad_norm": 0.6000894011738015, |
| "kl": 0.264337158203125, |
| "learning_rate": 8.452037480269082e-07, |
| "loss": 0.0106, |
| "reward": 1.09375, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.95, |
| "step": 3990 |
| }, |
| { |
| "completion_length": 241.79375, |
| "epoch": 0.8823732416725335, |
| "grad_norm": 0.5856288163785148, |
| "kl": 0.274072265625, |
| "learning_rate": 8.297580295566576e-07, |
| "loss": 0.011, |
| "reward": 1.0375, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.95, |
| "step": 3995 |
| }, |
| { |
| "completion_length": 228.125, |
| "epoch": 0.8834775886583565, |
| "grad_norm": 0.7727469678433277, |
| "kl": 0.239056396484375, |
| "learning_rate": 8.144486368572468e-07, |
| "loss": 0.0096, |
| "reward": 1.1875, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.2125, |
| "rewards/format_reward": 0.975, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.8834775886583565, |
| "eval_completion_length": 240.635, |
| "eval_kl": 0.28673828125, |
| "eval_loss": 0.011466315016150475, |
| "eval_reward": 1.09, |
| "eval_reward_std": 0.16970562398433686, |
| "eval_rewards/accuracy_reward": 0.14, |
| "eval_rewards/format_reward": 0.95, |
| "eval_runtime": 124.8847, |
| "eval_samples_per_second": 0.793, |
| "eval_steps_per_second": 0.2, |
| "step": 4000 |
| }, |
| { |
| "completion_length": 222.24375, |
| "epoch": 0.8845819356441794, |
| "grad_norm": 0.13203636774859265, |
| "kl": 0.271759033203125, |
| "learning_rate": 7.992757975196974e-07, |
| "loss": 0.0109, |
| "reward": 1.10625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.975, |
| "step": 4005 |
| }, |
| { |
| "completion_length": 217.46875, |
| "epoch": 0.8856862826300024, |
| "grad_norm": 0.3836137257482129, |
| "kl": 0.25550537109375, |
| "learning_rate": 7.842397371050181e-07, |
| "loss": 0.0102, |
| "reward": 1.075, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.975, |
| "step": 4010 |
| }, |
| { |
| "completion_length": 264.93125, |
| "epoch": 0.8867906296158253, |
| "grad_norm": 0.35191809685250214, |
| "kl": 0.23974609375, |
| "learning_rate": 7.693406791408476e-07, |
| "loss": 0.0096, |
| "reward": 1.09375, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.95625, |
| "step": 4015 |
| }, |
| { |
| "completion_length": 241.44375, |
| "epoch": 0.8878949766016483, |
| "grad_norm": 0.4672837627346682, |
| "kl": 0.26492919921875, |
| "learning_rate": 7.545788451181313e-07, |
| "loss": 0.0106, |
| "reward": 1.0625, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9625, |
| "step": 4020 |
| }, |
| { |
| "completion_length": 277.9875, |
| "epoch": 0.8889993235874711, |
| "grad_norm": 0.7088610794073225, |
| "kl": 0.29766845703125, |
| "learning_rate": 7.399544544878268e-07, |
| "loss": 0.0119, |
| "reward": 1.06875, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.94375, |
| "step": 4025 |
| }, |
| { |
| "completion_length": 226.96875, |
| "epoch": 0.8901036705732941, |
| "grad_norm": 0.29863896309113996, |
| "kl": 0.246087646484375, |
| "learning_rate": 7.25467724657647e-07, |
| "loss": 0.0098, |
| "reward": 1.09375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.98125, |
| "step": 4030 |
| }, |
| { |
| "completion_length": 261.29375, |
| "epoch": 0.891208017559117, |
| "grad_norm": 0.5220634238395366, |
| "kl": 0.238995361328125, |
| "learning_rate": 7.11118870988825e-07, |
| "loss": 0.0096, |
| "reward": 1.1125, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.96875, |
| "step": 4035 |
| }, |
| { |
| "completion_length": 227.1875, |
| "epoch": 0.89231236454494, |
| "grad_norm": 0.468574862123508, |
| "kl": 0.254180908203125, |
| "learning_rate": 6.969081067929129e-07, |
| "loss": 0.0102, |
| "reward": 1.09375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.98125, |
| "step": 4040 |
| }, |
| { |
| "completion_length": 223.36875, |
| "epoch": 0.893416711530763, |
| "grad_norm": 0.7105477246802777, |
| "kl": 0.23565673828125, |
| "learning_rate": 6.828356433286065e-07, |
| "loss": 0.0094, |
| "reward": 1.15625, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.16875, |
| "rewards/format_reward": 0.9875, |
| "step": 4045 |
| }, |
| { |
| "completion_length": 242.2875, |
| "epoch": 0.8945210585165859, |
| "grad_norm": 0.3124111487407041, |
| "kl": 0.272393798828125, |
| "learning_rate": 6.689016897986123e-07, |
| "loss": 0.0109, |
| "reward": 1.09375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.96875, |
| "step": 4050 |
| }, |
| { |
| "completion_length": 235.10625, |
| "epoch": 0.8956254055024089, |
| "grad_norm": 0.5287755229051584, |
| "kl": 0.263592529296875, |
| "learning_rate": 6.551064533465335e-07, |
| "loss": 0.0105, |
| "reward": 1.16875, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.2, |
| "rewards/format_reward": 0.96875, |
| "step": 4055 |
| }, |
| { |
| "completion_length": 210.3, |
| "epoch": 0.8967297524882318, |
| "grad_norm": 0.3865388105745784, |
| "kl": 0.243426513671875, |
| "learning_rate": 6.414501390537875e-07, |
| "loss": 0.0097, |
| "reward": 1.0875, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9875, |
| "step": 4060 |
| }, |
| { |
| "completion_length": 257.4125, |
| "epoch": 0.8978340994740548, |
| "grad_norm": 0.5827223262448566, |
| "kl": 0.28209228515625, |
| "learning_rate": 6.279329499365649e-07, |
| "loss": 0.0113, |
| "reward": 1.01875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.06875, |
| "rewards/format_reward": 0.95, |
| "step": 4065 |
| }, |
| { |
| "completion_length": 245.89375, |
| "epoch": 0.8989384464598776, |
| "grad_norm": 0.5568021946499023, |
| "kl": 0.329107666015625, |
| "learning_rate": 6.14555086942804e-07, |
| "loss": 0.0132, |
| "reward": 1.05625, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.94375, |
| "step": 4070 |
| }, |
| { |
| "completion_length": 270.7, |
| "epoch": 0.9000427934457006, |
| "grad_norm": 0.8265886341669334, |
| "kl": 0.343658447265625, |
| "learning_rate": 6.013167489492089e-07, |
| "loss": 0.0137, |
| "reward": 1.0375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9375, |
| "step": 4075 |
| }, |
| { |
| "completion_length": 235.875, |
| "epoch": 0.9011471404315236, |
| "grad_norm": 0.15521466379213147, |
| "kl": 0.21239013671875, |
| "learning_rate": 5.88218132758287e-07, |
| "loss": 0.0085, |
| "reward": 1.09375, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.9875, |
| "step": 4080 |
| }, |
| { |
| "completion_length": 263.94375, |
| "epoch": 0.9022514874173465, |
| "grad_norm": 0.3565020657376661, |
| "kl": 0.248944091796875, |
| "learning_rate": 5.752594330954275e-07, |
| "loss": 0.01, |
| "reward": 1.0875, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9625, |
| "step": 4085 |
| }, |
| { |
| "completion_length": 217.0875, |
| "epoch": 0.9033558344031695, |
| "grad_norm": 0.7599338431132417, |
| "kl": 0.256341552734375, |
| "learning_rate": 5.624408426060124e-07, |
| "loss": 0.0103, |
| "reward": 1.09375, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.99375, |
| "step": 4090 |
| }, |
| { |
| "completion_length": 240.49375, |
| "epoch": 0.9044601813889924, |
| "grad_norm": 0.3404631065084141, |
| "kl": 0.26585693359375, |
| "learning_rate": 5.497625518525374e-07, |
| "loss": 0.0106, |
| "reward": 1.1, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.96875, |
| "step": 4095 |
| }, |
| { |
| "completion_length": 240.90625, |
| "epoch": 0.9055645283748154, |
| "grad_norm": 0.4830591822507376, |
| "kl": 0.2419189453125, |
| "learning_rate": 5.372247493117921e-07, |
| "loss": 0.0097, |
| "reward": 1.0375, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.95625, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.9055645283748154, |
| "eval_completion_length": 236.99, |
| "eval_kl": 0.26658203125, |
| "eval_loss": 0.01068319845944643, |
| "eval_reward": 1.13, |
| "eval_reward_std": 0.1838477599620819, |
| "eval_rewards/accuracy_reward": 0.16, |
| "eval_rewards/format_reward": 0.97, |
| "eval_runtime": 112.8111, |
| "eval_samples_per_second": 0.878, |
| "eval_steps_per_second": 0.222, |
| "step": 4100 |
| }, |
| { |
| "completion_length": 213.69375, |
| "epoch": 0.9066688753606383, |
| "grad_norm": 0.0963839062331533, |
| "kl": 0.2247802734375, |
| "learning_rate": 5.248276213720526e-07, |
| "loss": 0.009, |
| "reward": 1.11875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.98125, |
| "step": 4105 |
| }, |
| { |
| "completion_length": 235.7625, |
| "epoch": 0.9077732223464613, |
| "grad_norm": 0.3586489435080358, |
| "kl": 113.0680419921875, |
| "learning_rate": 5.125713523303133e-07, |
| "loss": 4.5501, |
| "reward": 1.08125, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.9625, |
| "step": 4110 |
| }, |
| { |
| "completion_length": 245.6875, |
| "epoch": 0.9088775693322843, |
| "grad_norm": 0.5993938735102521, |
| "kl": 0.2639892578125, |
| "learning_rate": 5.004561243895433e-07, |
| "loss": 0.0106, |
| "reward": 1.09375, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.95625, |
| "step": 4115 |
| }, |
| { |
| "completion_length": 238.9875, |
| "epoch": 0.9099819163181071, |
| "grad_norm": 0.7059681339718733, |
| "kl": 0.25078125, |
| "learning_rate": 4.884821176559817e-07, |
| "loss": 0.01, |
| "reward": 1.09375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.95625, |
| "step": 4120 |
| }, |
| { |
| "completion_length": 214.73125, |
| "epoch": 0.91108626330393, |
| "grad_norm": 0.5836610939153032, |
| "kl": 0.248724365234375, |
| "learning_rate": 4.7664951013645875e-07, |
| "loss": 0.01, |
| "reward": 1.1, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.98125, |
| "step": 4125 |
| }, |
| { |
| "completion_length": 287.74375, |
| "epoch": 0.912190610289753, |
| "grad_norm": 0.5029836450413667, |
| "kl": 0.338372802734375, |
| "learning_rate": 4.649584777357452e-07, |
| "loss": 0.0135, |
| "reward": 1.0375, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9375, |
| "step": 4130 |
| }, |
| { |
| "completion_length": 238.50625, |
| "epoch": 0.913294957275576, |
| "grad_norm": 0.3679025891536969, |
| "kl": 0.2593505859375, |
| "learning_rate": 4.534091942539476e-07, |
| "loss": 0.0104, |
| "reward": 1.06875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.96875, |
| "step": 4135 |
| }, |
| { |
| "completion_length": 243.9, |
| "epoch": 0.9143993042613989, |
| "grad_norm": 0.36542863601047937, |
| "kl": 0.2465576171875, |
| "learning_rate": 4.420018313839147e-07, |
| "loss": 0.0099, |
| "reward": 1.1625, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.2, |
| "rewards/format_reward": 0.9625, |
| "step": 4140 |
| }, |
| { |
| "completion_length": 244.55625, |
| "epoch": 0.9155036512472219, |
| "grad_norm": 0.6654427169718511, |
| "kl": 0.266680908203125, |
| "learning_rate": 4.3073655870869093e-07, |
| "loss": 0.0107, |
| "reward": 1.09375, |
| "reward_std": 0.22097086533904076, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.95625, |
| "step": 4145 |
| }, |
| { |
| "completion_length": 261.3625, |
| "epoch": 0.9166079982330448, |
| "grad_norm": 0.4755302093000005, |
| "kl": 0.246258544921875, |
| "learning_rate": 4.1961354369898675e-07, |
| "loss": 0.0099, |
| "reward": 1.1, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.96875, |
| "step": 4150 |
| }, |
| { |
| "completion_length": 238.525, |
| "epoch": 0.9177123452188678, |
| "grad_norm": 0.3919488350765311, |
| "kl": 0.269342041015625, |
| "learning_rate": 4.086329517107046e-07, |
| "loss": 0.0108, |
| "reward": 1.175, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.2125, |
| "rewards/format_reward": 0.9625, |
| "step": 4155 |
| }, |
| { |
| "completion_length": 259.00625, |
| "epoch": 0.9188166922046908, |
| "grad_norm": 0.4320891208444687, |
| "kl": 0.283648681640625, |
| "learning_rate": 3.9779494598246484e-07, |
| "loss": 0.0113, |
| "reward": 1.1125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.95625, |
| "step": 4160 |
| }, |
| { |
| "completion_length": 261.85625, |
| "epoch": 0.9199210391905137, |
| "grad_norm": 0.4840697345614265, |
| "kl": 0.2813232421875, |
| "learning_rate": 3.8709968763318894e-07, |
| "loss": 0.0113, |
| "reward": 1.0875, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.95, |
| "step": 4165 |
| }, |
| { |
| "completion_length": 244.525, |
| "epoch": 0.9210253861763366, |
| "grad_norm": 0.29181391006128693, |
| "kl": 0.273553466796875, |
| "learning_rate": 3.7654733565969826e-07, |
| "loss": 0.0109, |
| "reward": 1.1125, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.9625, |
| "step": 4170 |
| }, |
| { |
| "completion_length": 263.0, |
| "epoch": 0.9221297331621595, |
| "grad_norm": 0.6600185394674087, |
| "kl": 0.275067138671875, |
| "learning_rate": 3.661380469343556e-07, |
| "loss": 0.011, |
| "reward": 1.01875, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.91875, |
| "step": 4175 |
| }, |
| { |
| "completion_length": 257.275, |
| "epoch": 0.9232340801479825, |
| "grad_norm": 0.3026772238703725, |
| "kl": 0.2882568359375, |
| "learning_rate": 3.558719762027307e-07, |
| "loss": 0.0115, |
| "reward": 1.075, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.9625, |
| "step": 4180 |
| }, |
| { |
| "completion_length": 238.7, |
| "epoch": 0.9243384271338054, |
| "grad_norm": 0.5128635762934807, |
| "kl": 0.256884765625, |
| "learning_rate": 3.457492760812975e-07, |
| "loss": 0.0103, |
| "reward": 1.0625, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.94375, |
| "step": 4185 |
| }, |
| { |
| "completion_length": 271.00625, |
| "epoch": 0.9254427741196284, |
| "grad_norm": 0.41061824834878685, |
| "kl": 0.3115478515625, |
| "learning_rate": 3.357700970551681e-07, |
| "loss": 0.0125, |
| "reward": 1.0875, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9625, |
| "step": 4190 |
| }, |
| { |
| "completion_length": 236.125, |
| "epoch": 0.9265471211054513, |
| "grad_norm": 0.6143144747308046, |
| "kl": 0.2658935546875, |
| "learning_rate": 3.2593458747585683e-07, |
| "loss": 0.0106, |
| "reward": 1.0625, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9625, |
| "step": 4195 |
| }, |
| { |
| "completion_length": 263.36875, |
| "epoch": 0.9276514680912743, |
| "grad_norm": 0.31447425015692126, |
| "kl": 0.238623046875, |
| "learning_rate": 3.1624289355907334e-07, |
| "loss": 0.0095, |
| "reward": 1.0875, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.975, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.9276514680912743, |
| "eval_completion_length": 263.69, |
| "eval_kl": 0.3008984375, |
| "eval_loss": 0.012053935788571835, |
| "eval_reward": 1.095, |
| "eval_reward_std": 0.1767766922712326, |
| "eval_rewards/accuracy_reward": 0.15, |
| "eval_rewards/format_reward": 0.945, |
| "eval_runtime": 126.8567, |
| "eval_samples_per_second": 0.78, |
| "eval_steps_per_second": 0.197, |
| "step": 4200 |
| }, |
| { |
| "completion_length": 227.49375, |
| "epoch": 0.9287558150770973, |
| "grad_norm": 0.3848576143966496, |
| "kl": 0.260791015625, |
| "learning_rate": 3.0669515938254404e-07, |
| "loss": 0.0104, |
| "reward": 1.08125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.96875, |
| "step": 4205 |
| }, |
| { |
| "completion_length": 242.55625, |
| "epoch": 0.9298601620629202, |
| "grad_norm": 0.660183404776511, |
| "kl": 0.3072021484375, |
| "learning_rate": 2.972915268838794e-07, |
| "loss": 0.0123, |
| "reward": 1.11875, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.9625, |
| "step": 4210 |
| }, |
| { |
| "completion_length": 237.59375, |
| "epoch": 0.9309645090487431, |
| "grad_norm": 0.38237841247452214, |
| "kl": 0.236932373046875, |
| "learning_rate": 2.8803213585846036e-07, |
| "loss": 0.0095, |
| "reward": 1.09375, |
| "reward_std": 0.07954951152205467, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.98125, |
| "step": 4215 |
| }, |
| { |
| "completion_length": 254.3125, |
| "epoch": 0.932068856034566, |
| "grad_norm": 0.938722788620333, |
| "kl": 0.304193115234375, |
| "learning_rate": 2.7891712395735513e-07, |
| "loss": 0.0122, |
| "reward": 1.03125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.94375, |
| "step": 4220 |
| }, |
| { |
| "completion_length": 270.375, |
| "epoch": 0.933173203020389, |
| "grad_norm": 0.6446127146773467, |
| "kl": 0.32706298828125, |
| "learning_rate": 2.699466266852779e-07, |
| "loss": 0.0131, |
| "reward": 1.05625, |
| "reward_std": 0.23864853456616403, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.925, |
| "step": 4225 |
| }, |
| { |
| "completion_length": 225.74375, |
| "epoch": 0.9342775500062119, |
| "grad_norm": 0.514239970716185, |
| "kl": 0.2426910400390625, |
| "learning_rate": 2.6112077739857465e-07, |
| "loss": 0.0097, |
| "reward": 1.0875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9625, |
| "step": 4230 |
| }, |
| { |
| "completion_length": 249.1625, |
| "epoch": 0.9353818969920349, |
| "grad_norm": 0.4120891261838415, |
| "kl": 0.245001220703125, |
| "learning_rate": 2.524397073032403e-07, |
| "loss": 0.0098, |
| "reward": 1.05, |
| "reward_std": 0.10606601536273956, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.96875, |
| "step": 4235 |
| }, |
| { |
| "completion_length": 240.0125, |
| "epoch": 0.9364862439778578, |
| "grad_norm": 0.4432984558834709, |
| "kl": 0.27457275390625, |
| "learning_rate": 2.4390354545296257e-07, |
| "loss": 0.011, |
| "reward": 1.09375, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.9625, |
| "step": 4240 |
| }, |
| { |
| "completion_length": 216.80625, |
| "epoch": 0.9375905909636808, |
| "grad_norm": 0.2646324742458765, |
| "kl": 0.221075439453125, |
| "learning_rate": 2.3551241874721353e-07, |
| "loss": 0.0088, |
| "reward": 1.10625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.99375, |
| "step": 4245 |
| }, |
| { |
| "completion_length": 247.775, |
| "epoch": 0.9386949379495038, |
| "grad_norm": 0.183761058738616, |
| "kl": 0.28399658203125, |
| "learning_rate": 2.272664519293566e-07, |
| "loss": 0.0114, |
| "reward": 1.1, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.96875, |
| "step": 4250 |
| }, |
| { |
| "completion_length": 245.94375, |
| "epoch": 0.9397992849353267, |
| "grad_norm": 0.49084283791698335, |
| "kl": 0.25836181640625, |
| "learning_rate": 2.1916576758478913e-07, |
| "loss": 0.0103, |
| "reward": 1.06875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.98125, |
| "step": 4255 |
| }, |
| { |
| "completion_length": 255.21875, |
| "epoch": 0.9409036319211497, |
| "grad_norm": 0.5752245613234309, |
| "kl": 0.304534912109375, |
| "learning_rate": 2.1121048613912843e-07, |
| "loss": 0.0122, |
| "reward": 1.08125, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.94375, |
| "step": 4260 |
| }, |
| { |
| "completion_length": 248.875, |
| "epoch": 0.9420079789069725, |
| "grad_norm": 0.4954935572252636, |
| "kl": 0.280999755859375, |
| "learning_rate": 2.0340072585641523e-07, |
| "loss": 0.0112, |
| "reward": 1.11875, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.9625, |
| "step": 4265 |
| }, |
| { |
| "completion_length": 274.95, |
| "epoch": 0.9431123258927955, |
| "grad_norm": 0.6291854725110926, |
| "kl": 0.3195556640625, |
| "learning_rate": 1.9573660283735974e-07, |
| "loss": 0.0128, |
| "reward": 1.09375, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.9375, |
| "step": 4270 |
| }, |
| { |
| "completion_length": 227.425, |
| "epoch": 0.9442166728786184, |
| "grad_norm": 0.4759307581588166, |
| "kl": 0.221356201171875, |
| "learning_rate": 1.8821823101760949e-07, |
| "loss": 0.0089, |
| "reward": 1.10625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.975, |
| "step": 4275 |
| }, |
| { |
| "completion_length": 222.7625, |
| "epoch": 0.9453210198644414, |
| "grad_norm": 0.5007795680830986, |
| "kl": 0.27132568359375, |
| "learning_rate": 1.8084572216606422e-07, |
| "loss": 0.0109, |
| "reward": 1.1875, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.19375, |
| "rewards/format_reward": 0.99375, |
| "step": 4280 |
| }, |
| { |
| "completion_length": 243.20625, |
| "epoch": 0.9464253668502643, |
| "grad_norm": 0.43659783572888766, |
| "kl": 0.2771484375, |
| "learning_rate": 1.736191858832048e-07, |
| "loss": 0.0111, |
| "reward": 1.1125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1625, |
| "rewards/format_reward": 0.95, |
| "step": 4285 |
| }, |
| { |
| "completion_length": 230.93125, |
| "epoch": 0.9475297138360873, |
| "grad_norm": 0.5036319473035266, |
| "kl": 0.315594482421875, |
| "learning_rate": 1.665387295994747e-07, |
| "loss": 0.0126, |
| "reward": 1.05, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.95, |
| "step": 4290 |
| }, |
| { |
| "completion_length": 257.44375, |
| "epoch": 0.9486340608219103, |
| "grad_norm": 0.21837455215402343, |
| "kl": 0.318695068359375, |
| "learning_rate": 1.5960445857367003e-07, |
| "loss": 0.0128, |
| "reward": 1.08125, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.96875, |
| "step": 4295 |
| }, |
| { |
| "completion_length": 282.2, |
| "epoch": 0.9497384078077332, |
| "grad_norm": 0.8008170036383372, |
| "kl": 0.27298583984375, |
| "learning_rate": 1.5281647589138527e-07, |
| "loss": 0.0109, |
| "reward": 1.01875, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.9375, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.9497384078077332, |
| "eval_completion_length": 243.03, |
| "eval_kl": 0.26423828125, |
| "eval_loss": 0.010587015189230442, |
| "eval_reward": 1.15, |
| "eval_reward_std": 0.1697056245803833, |
| "eval_rewards/accuracy_reward": 0.17, |
| "eval_rewards/format_reward": 0.98, |
| "eval_runtime": 110.5252, |
| "eval_samples_per_second": 0.896, |
| "eval_steps_per_second": 0.226, |
| "step": 4300 |
| }, |
| { |
| "completion_length": 233.6625, |
| "epoch": 0.9508427547935562, |
| "grad_norm": 0.7236286239926621, |
| "kl": 0.329180908203125, |
| "learning_rate": 1.4617488246348012e-07, |
| "loss": 0.0132, |
| "reward": 1.06875, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95625, |
| "step": 4305 |
| }, |
| { |
| "completion_length": 236.18125, |
| "epoch": 0.9519471017793791, |
| "grad_norm": 0.5259274208633408, |
| "kl": 0.270782470703125, |
| "learning_rate": 1.3967977702456946e-07, |
| "loss": 0.0108, |
| "reward": 1.08125, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.95625, |
| "step": 4310 |
| }, |
| { |
| "completion_length": 222.0125, |
| "epoch": 0.953051448765202, |
| "grad_norm": 0.4716816022245921, |
| "kl": 0.269232177734375, |
| "learning_rate": 1.3333125613156695e-07, |
| "loss": 0.0108, |
| "reward": 1.11875, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.975, |
| "step": 4315 |
| }, |
| { |
| "completion_length": 254.275, |
| "epoch": 0.9541557957510249, |
| "grad_norm": 0.3798073604709502, |
| "kl": 0.29171142578125, |
| "learning_rate": 1.271294141622459e-07, |
| "loss": 0.0117, |
| "reward": 1.08125, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.95625, |
| "step": 4320 |
| }, |
| { |
| "completion_length": 263.34375, |
| "epoch": 0.9552601427368479, |
| "grad_norm": 0.39932023814829193, |
| "kl": 0.31627197265625, |
| "learning_rate": 1.2107434331383504e-07, |
| "loss": 0.0126, |
| "reward": 1.075, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.93125, |
| "step": 4325 |
| }, |
| { |
| "completion_length": 233.03125, |
| "epoch": 0.9563644897226709, |
| "grad_norm": 0.915012168079641, |
| "kl": 0.30823974609375, |
| "learning_rate": 1.1516613360164408e-07, |
| "loss": 0.0123, |
| "reward": 1.1125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.96875, |
| "step": 4330 |
| }, |
| { |
| "completion_length": 235.96875, |
| "epoch": 0.9574688367084938, |
| "grad_norm": 0.6431925020300033, |
| "kl": 0.26207275390625, |
| "learning_rate": 1.094048728577346e-07, |
| "loss": 0.0105, |
| "reward": 1.0625, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.9625, |
| "step": 4335 |
| }, |
| { |
| "completion_length": 227.89375, |
| "epoch": 0.9585731836943168, |
| "grad_norm": 0.2197375620895538, |
| "kl": 0.22823486328125, |
| "learning_rate": 1.0379064672960793e-07, |
| "loss": 0.0091, |
| "reward": 1.1375, |
| "reward_std": 0.12374368458986282, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.99375, |
| "step": 4340 |
| }, |
| { |
| "completion_length": 236.425, |
| "epoch": 0.9596775306801397, |
| "grad_norm": 0.24464788267487614, |
| "kl": 0.237347412109375, |
| "learning_rate": 9.832353867893385e-08, |
| "loss": 0.0095, |
| "reward": 1.05625, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.95625, |
| "step": 4345 |
| }, |
| { |
| "completion_length": 222.8, |
| "epoch": 0.9607818776659627, |
| "grad_norm": 0.6551456803073707, |
| "kl": 0.2460235595703125, |
| "learning_rate": 9.300362998030832e-08, |
| "loss": 0.0098, |
| "reward": 1.175, |
| "reward_std": 0.21213203072547912, |
| "rewards/accuracy_reward": 0.2125, |
| "rewards/format_reward": 0.9625, |
| "step": 4350 |
| }, |
| { |
| "completion_length": 268.76875, |
| "epoch": 0.9618862246517856, |
| "grad_norm": 0.592425806579998, |
| "kl": 0.27703857421875, |
| "learning_rate": 8.783099972004882e-08, |
| "loss": 0.0111, |
| "reward": 1.0125, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.91875, |
| "step": 4355 |
| }, |
| { |
| "completion_length": 212.39375, |
| "epoch": 0.9629905716376085, |
| "grad_norm": 0.46261395016364687, |
| "kl": 0.278302001953125, |
| "learning_rate": 8.280572479501426e-08, |
| "loss": 0.0111, |
| "reward": 1.125, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.96875, |
| "step": 4360 |
| }, |
| { |
| "completion_length": 242.06875, |
| "epoch": 0.9640949186234314, |
| "grad_norm": 0.28486230034584065, |
| "kl": 0.22513427734375, |
| "learning_rate": 7.792787991146356e-08, |
| "loss": 0.009, |
| "reward": 1.09375, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.98125, |
| "step": 4365 |
| }, |
| { |
| "completion_length": 240.1875, |
| "epoch": 0.9651992656092544, |
| "grad_norm": 0.43886555296427354, |
| "kl": 0.30863037109375, |
| "learning_rate": 7.319753758394665e-08, |
| "loss": 0.0123, |
| "reward": 1.0375, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.94375, |
| "step": 4370 |
| }, |
| { |
| "completion_length": 223.30625, |
| "epoch": 0.9663036125950774, |
| "grad_norm": 0.542732303860562, |
| "kl": 0.27510986328125, |
| "learning_rate": 6.861476813422419e-08, |
| "loss": 0.011, |
| "reward": 1.06875, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.98125, |
| "step": 4375 |
| }, |
| { |
| "completion_length": 269.04375, |
| "epoch": 0.9674079595809003, |
| "grad_norm": 0.35561046298902815, |
| "kl": 0.350921630859375, |
| "learning_rate": 6.417963969022389e-08, |
| "loss": 0.014, |
| "reward": 1.04375, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.93125, |
| "step": 4380 |
| }, |
| { |
| "completion_length": 294.5625, |
| "epoch": 0.9685123065667233, |
| "grad_norm": 0.6237843740200061, |
| "kl": 0.262664794921875, |
| "learning_rate": 5.989221818502478e-08, |
| "loss": 0.0105, |
| "reward": 1.10625, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.95, |
| "step": 4385 |
| }, |
| { |
| "completion_length": 257.58125, |
| "epoch": 0.9696166535525462, |
| "grad_norm": 0.6942753855581536, |
| "kl": 0.319189453125, |
| "learning_rate": 5.5752567355883415e-08, |
| "loss": 0.0128, |
| "reward": 1.06875, |
| "reward_std": 0.15026018843054773, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.95, |
| "step": 4390 |
| }, |
| { |
| "completion_length": 259.525, |
| "epoch": 0.9707210005383692, |
| "grad_norm": 0.47467571553923327, |
| "kl": 0.288873291015625, |
| "learning_rate": 5.176074874327919e-08, |
| "loss": 0.0116, |
| "reward": 1.1, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.9625, |
| "step": 4395 |
| }, |
| { |
| "completion_length": 247.05, |
| "epoch": 0.9718253475241921, |
| "grad_norm": 0.47878465160417794, |
| "kl": 0.256573486328125, |
| "learning_rate": 4.791682169000056e-08, |
| "loss": 0.0103, |
| "reward": 1.09375, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.975, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.9718253475241921, |
| "eval_completion_length": 254.0, |
| "eval_kl": 0.27548828125, |
| "eval_loss": 0.011038653552532196, |
| "eval_reward": 1.11, |
| "eval_reward_std": 0.1838477599620819, |
| "eval_rewards/accuracy_reward": 0.16, |
| "eval_rewards/format_reward": 0.95, |
| "eval_runtime": 128.6135, |
| "eval_samples_per_second": 0.77, |
| "eval_steps_per_second": 0.194, |
| "step": 4400 |
| }, |
| { |
| "completion_length": 288.8375, |
| "epoch": 0.9729296945100151, |
| "grad_norm": 0.6464757294619069, |
| "kl": 0.3235595703125, |
| "learning_rate": 4.4220843340269105e-08, |
| "loss": 0.0129, |
| "reward": 1.025, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.9125, |
| "step": 4405 |
| }, |
| { |
| "completion_length": 245.01875, |
| "epoch": 0.9740340414958379, |
| "grad_norm": 0.6385791474694716, |
| "kl": 0.33304443359375, |
| "learning_rate": 4.067286863888131e-08, |
| "loss": 0.0133, |
| "reward": 1.10625, |
| "reward_std": 0.2563262037932873, |
| "rewards/accuracy_reward": 0.16875, |
| "rewards/format_reward": 0.9375, |
| "step": 4410 |
| }, |
| { |
| "completion_length": 219.36875, |
| "epoch": 0.9751383884816609, |
| "grad_norm": 0.6252687285926767, |
| "kl": 0.28475341796875, |
| "learning_rate": 3.727295033040035e-08, |
| "loss": 0.0114, |
| "reward": 1.11875, |
| "reward_std": 0.09722718074917794, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.99375, |
| "step": 4415 |
| }, |
| { |
| "completion_length": 266.64375, |
| "epoch": 0.9762427354674839, |
| "grad_norm": 0.4059885139351556, |
| "kl": 0.2508056640625, |
| "learning_rate": 3.402113895836445e-08, |
| "loss": 0.01, |
| "reward": 1.05, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.1, |
| "rewards/format_reward": 0.95, |
| "step": 4420 |
| }, |
| { |
| "completion_length": 234.075, |
| "epoch": 0.9773470824533068, |
| "grad_norm": 0.5571412291452437, |
| "kl": 0.258990478515625, |
| "learning_rate": 3.091748286453866e-08, |
| "loss": 0.0104, |
| "reward": 1.125, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.96875, |
| "step": 4425 |
| }, |
| { |
| "completion_length": 245.19375, |
| "epoch": 0.9784514294391298, |
| "grad_norm": 0.48222048728786027, |
| "kl": 0.279296875, |
| "learning_rate": 2.796202818819871e-08, |
| "loss": 0.0112, |
| "reward": 1.06875, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.1125, |
| "rewards/format_reward": 0.95625, |
| "step": 4430 |
| }, |
| { |
| "completion_length": 266.23125, |
| "epoch": 0.9795557764249527, |
| "grad_norm": 0.33073986574713815, |
| "kl": 0.2812255859375, |
| "learning_rate": 2.5154818865440466e-08, |
| "loss": 0.0113, |
| "reward": 1.09375, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.94375, |
| "step": 4435 |
| }, |
| { |
| "completion_length": 243.86875, |
| "epoch": 0.9806601234107757, |
| "grad_norm": 0.5979836605535603, |
| "kl": 0.3093017578125, |
| "learning_rate": 2.2495896628529355e-08, |
| "loss": 0.0124, |
| "reward": 1.10625, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.95625, |
| "step": 4440 |
| }, |
| { |
| "completion_length": 234.6375, |
| "epoch": 0.9817644703965986, |
| "grad_norm": 0.71956275463976, |
| "kl": 0.267864990234375, |
| "learning_rate": 1.9985301005280843e-08, |
| "loss": 0.0107, |
| "reward": 1.09375, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.11875, |
| "rewards/format_reward": 0.975, |
| "step": 4445 |
| }, |
| { |
| "completion_length": 250.375, |
| "epoch": 0.9828688173824216, |
| "grad_norm": 0.4915100209116087, |
| "kl": 0.26923828125, |
| "learning_rate": 1.7623069318469797e-08, |
| "loss": 0.0108, |
| "reward": 1.025, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.9375, |
| "step": 4450 |
| }, |
| { |
| "completion_length": 238.46875, |
| "epoch": 0.9839731643682446, |
| "grad_norm": 0.4934697459512418, |
| "kl": 0.263037109375, |
| "learning_rate": 1.5409236685277608e-08, |
| "loss": 0.0105, |
| "reward": 1.09375, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.9625, |
| "step": 4455 |
| }, |
| { |
| "completion_length": 242.4625, |
| "epoch": 0.9850775113540674, |
| "grad_norm": 0.7032342242514465, |
| "kl": 0.2882568359375, |
| "learning_rate": 1.3343836016772582e-08, |
| "loss": 0.0115, |
| "reward": 1.0625, |
| "reward_std": 0.1414213538169861, |
| "rewards/accuracy_reward": 0.10625, |
| "rewards/format_reward": 0.95625, |
| "step": 4460 |
| }, |
| { |
| "completion_length": 253.40625, |
| "epoch": 0.9861818583398904, |
| "grad_norm": 0.40012002703514815, |
| "kl": 0.28798828125, |
| "learning_rate": 1.1426898017412591e-08, |
| "loss": 0.0115, |
| "reward": 1.10625, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.175, |
| "rewards/format_reward": 0.93125, |
| "step": 4465 |
| }, |
| { |
| "completion_length": 231.40625, |
| "epoch": 0.9872862053257133, |
| "grad_norm": 0.5527803201803545, |
| "kl": 0.273876953125, |
| "learning_rate": 9.658451184600959e-09, |
| "loss": 0.0109, |
| "reward": 1.08125, |
| "reward_std": 0.16793785765767097, |
| "rewards/accuracy_reward": 0.13125, |
| "rewards/format_reward": 0.95, |
| "step": 4470 |
| }, |
| { |
| "completion_length": 276.0125, |
| "epoch": 0.9883905523115363, |
| "grad_norm": 0.5553612160206977, |
| "kl": 0.271929931640625, |
| "learning_rate": 8.038521808249045e-09, |
| "loss": 0.0109, |
| "reward": 1.05625, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.93125, |
| "step": 4475 |
| }, |
| { |
| "completion_length": 269.39375, |
| "epoch": 0.9894948992973592, |
| "grad_norm": 0.38046769308040707, |
| "kl": 0.31407470703125, |
| "learning_rate": 6.567133970397654e-09, |
| "loss": 0.0126, |
| "reward": 1.03125, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.0875, |
| "rewards/format_reward": 0.94375, |
| "step": 4480 |
| }, |
| { |
| "completion_length": 253.03125, |
| "epoch": 0.9905992462831822, |
| "grad_norm": 0.25828108036325964, |
| "kl": 0.36680908203125, |
| "learning_rate": 5.2443095448506674e-09, |
| "loss": 0.0147, |
| "reward": 1.025, |
| "reward_std": 0.15909902304410933, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.93125, |
| "step": 4485 |
| }, |
| { |
| "completion_length": 242.60625, |
| "epoch": 0.9917035932690051, |
| "grad_norm": 0.3615879163519988, |
| "kl": 0.23807373046875, |
| "learning_rate": 4.070068196853072e-09, |
| "loss": 0.0095, |
| "reward": 1.10625, |
| "reward_std": 0.13258251920342445, |
| "rewards/accuracy_reward": 0.1375, |
| "rewards/format_reward": 0.96875, |
| "step": 4490 |
| }, |
| { |
| "completion_length": 271.5, |
| "epoch": 0.9928079402548281, |
| "grad_norm": 0.31635821168080147, |
| "kl": 0.35159912109375, |
| "learning_rate": 3.0444273828000857e-09, |
| "loss": 0.0141, |
| "reward": 1.075, |
| "reward_std": 0.19445436149835588, |
| "rewards/accuracy_reward": 0.14375, |
| "rewards/format_reward": 0.93125, |
| "step": 4495 |
| }, |
| { |
| "completion_length": 233.03125, |
| "epoch": 0.9939122872406511, |
| "grad_norm": 0.30099890850035027, |
| "kl": 0.325830078125, |
| "learning_rate": 2.167402349972925e-09, |
| "loss": 0.013, |
| "reward": 1.0375, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.08125, |
| "rewards/format_reward": 0.95625, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.9939122872406511, |
| "eval_completion_length": 251.83, |
| "eval_kl": 0.32880859375, |
| "eval_loss": 0.013173764571547508, |
| "eval_reward": 1.115, |
| "eval_reward_std": 0.19091882765293122, |
| "eval_rewards/accuracy_reward": 0.16, |
| "eval_rewards/format_reward": 0.955, |
| "eval_runtime": 141.5462, |
| "eval_samples_per_second": 0.699, |
| "eval_steps_per_second": 0.177, |
| "step": 4500 |
| }, |
| { |
| "completion_length": 207.225, |
| "epoch": 0.9950166342264739, |
| "grad_norm": 0.7679754782825252, |
| "kl": 0.2902587890625, |
| "learning_rate": 1.4390061363189767e-09, |
| "loss": 0.0116, |
| "reward": 1.11875, |
| "reward_std": 0.18561552688479424, |
| "rewards/accuracy_reward": 0.15, |
| "rewards/format_reward": 0.96875, |
| "step": 4505 |
| }, |
| { |
| "completion_length": 278.71875, |
| "epoch": 0.9961209812122969, |
| "grad_norm": 0.44631107231254136, |
| "kl": 0.279425048828125, |
| "learning_rate": 8.592495702497427e-10, |
| "loss": 0.0112, |
| "reward": 1.0, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.90625, |
| "step": 4510 |
| }, |
| { |
| "completion_length": 246.39375, |
| "epoch": 0.9972253281981198, |
| "grad_norm": 0.36414142285577833, |
| "kl": 0.30335693359375, |
| "learning_rate": 4.2814127048873553e-10, |
| "loss": 0.0121, |
| "reward": 1.05625, |
| "reward_std": 0.1149048499763012, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.9625, |
| "step": 4515 |
| }, |
| { |
| "completion_length": 258.74375, |
| "epoch": 0.9983296751839428, |
| "grad_norm": 0.5750843977179474, |
| "kl": 0.405145263671875, |
| "learning_rate": 1.4568764593603235e-10, |
| "loss": 0.0162, |
| "reward": 1.05, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.95625, |
| "step": 4520 |
| }, |
| { |
| "completion_length": 261.95, |
| "epoch": 0.9994340221697657, |
| "grad_norm": 0.5025685502837802, |
| "kl": 0.259100341796875, |
| "learning_rate": 1.1892895576126606e-11, |
| "loss": 0.0104, |
| "reward": 1.10625, |
| "reward_std": 0.20329319611191748, |
| "rewards/accuracy_reward": 0.1625, |
| "rewards/format_reward": 0.94375, |
| "step": 4525 |
| }, |
| { |
| "completion_length": 248.6875, |
| "epoch": 0.9998757609640949, |
| "kl": 0.22357177734375, |
| "reward": 1.171875, |
| "reward_std": 0.19887377880513668, |
| "rewards/accuracy_reward": 0.203125, |
| "rewards/format_reward": 0.96875, |
| "step": 4527, |
| "total_flos": 0.0, |
| "train_loss": 28.747999461705767, |
| "train_runtime": 163973.669, |
| "train_samples_per_second": 0.442, |
| "train_steps_per_second": 0.028 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 4527, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|