| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.08237232289950576, |
| "eval_steps": 1000, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4658203125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 88.3837890625, |
| "completions/mean_terminated_length": 53.83729553222656, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.0787938493303955, |
| "epoch": 0.00041186161449752884, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1026352643966675, |
| "kl": 1.5408068257727336e-05, |
| "learning_rate": 0.0, |
| "loss": 0.0612, |
| "num_tokens": 473618.0, |
| "reward": -0.654300332069397, |
| "reward_std": 1.2014957666397095, |
| "rewards/reward_model/mean": -0.654300332069397, |
| "rewards/reward_model/std": 1.4879947900772095, |
| "step": 1, |
| "step_time": 179.40438475832343 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.521484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 95.0380859375, |
| "completions/mean_terminated_length": 59.11632537841797, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.0889650019817054, |
| "epoch": 0.0008237232289950577, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9445520043373108, |
| "kl": 1.5487904489575044e-05, |
| "learning_rate": 1.2345679012345681e-08, |
| "loss": 0.0685, |
| "num_tokens": 944384.0, |
| "reward": -0.6944406032562256, |
| "reward_std": 1.1158981323242188, |
| "rewards/reward_model/mean": -0.6944406032562256, |
| "rewards/reward_model/std": 1.4779117107391357, |
| "step": 2, |
| "step_time": 168.28568758117035 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4921875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 93.1064453125, |
| "completions/mean_terminated_length": 59.286537170410156, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.1078118681907654, |
| "epoch": 0.0012355848434925864, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9809994101524353, |
| "kl": 0.0009136445219155576, |
| "learning_rate": 2.4691358024691362e-08, |
| "loss": 0.061, |
| "num_tokens": 1417434.0, |
| "reward": -0.8067716956138611, |
| "reward_std": 1.1805193424224854, |
| "rewards/reward_model/mean": -0.8067716956138611, |
| "rewards/reward_model/std": 1.5296157598495483, |
| "step": 3, |
| "step_time": 168.7894278760068 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4912109375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 90.3203125, |
| "completions/mean_terminated_length": 53.94241714477539, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.0778324585407972, |
| "epoch": 0.0016474464579901153, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9622601270675659, |
| "kl": 0.0009502729969881329, |
| "learning_rate": 3.7037037037037036e-08, |
| "loss": 0.0854, |
| "num_tokens": 1886250.0, |
| "reward": -0.5533753037452698, |
| "reward_std": 1.0693888664245605, |
| "rewards/reward_model/mean": -0.5533753037452698, |
| "rewards/reward_model/std": 1.3799840211868286, |
| "step": 4, |
| "step_time": 167.94514833204448 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.47802734375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 88.8896484375, |
| "completions/mean_terminated_length": 53.07202911376953, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.164030898362398, |
| "epoch": 0.002059308072487644, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0810271501541138, |
| "kl": 0.0010240612220968615, |
| "learning_rate": 4.9382716049382724e-08, |
| "loss": 0.0979, |
| "num_tokens": 2372616.0, |
| "reward": -0.8290466070175171, |
| "reward_std": 1.1383775472640991, |
| "rewards/reward_model/mean": -0.8290466070175171, |
| "rewards/reward_model/std": 1.4821057319641113, |
| "step": 5, |
| "step_time": 168.5208105482161 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.44775390625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 86.54541015625, |
| "completions/mean_terminated_length": 52.934574127197266, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.121390470303595, |
| "epoch": 0.002471169686985173, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.026131272315979, |
| "kl": 0.000956788239591333, |
| "learning_rate": 6.17283950617284e-08, |
| "loss": 0.102, |
| "num_tokens": 2856453.0, |
| "reward": -0.5948619842529297, |
| "reward_std": 1.0859686136245728, |
| "rewards/reward_model/mean": -0.5948619842529297, |
| "rewards/reward_model/std": 1.4433753490447998, |
| "step": 6, |
| "step_time": 169.30755526619032 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.47998046875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 88.40185546875, |
| "completions/mean_terminated_length": 51.85258483886719, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.1293580746278167, |
| "epoch": 0.002883031301482702, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9581882953643799, |
| "kl": 0.0010509827170608332, |
| "learning_rate": 7.407407407407407e-08, |
| "loss": 0.0752, |
| "num_tokens": 3349660.0, |
| "reward": -0.8746315836906433, |
| "reward_std": 1.1371493339538574, |
| "rewards/reward_model/mean": -0.8746315836906433, |
| "rewards/reward_model/std": 1.5432283878326416, |
| "step": 7, |
| "step_time": 170.4541406123899 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.462890625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 89.59423828125, |
| "completions/mean_terminated_length": 56.49545669555664, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.199626039713621, |
| "epoch": 0.0032948929159802307, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1233900785446167, |
| "kl": 0.0011659049921490805, |
| "learning_rate": 8.641975308641976e-08, |
| "loss": 0.0835, |
| "num_tokens": 3869181.0, |
| "reward": -0.9943232536315918, |
| "reward_std": 1.099515438079834, |
| "rewards/reward_model/mean": -0.9943232536315918, |
| "rewards/reward_model/std": 1.4042030572891235, |
| "step": 8, |
| "step_time": 168.8292339304462 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.50244140625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 92.26611328125, |
| "completions/mean_terminated_length": 56.181549072265625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 2.103476638905704, |
| "epoch": 0.0037067545304777594, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0071220397949219, |
| "kl": 0.0010218678287401417, |
| "learning_rate": 9.876543209876545e-08, |
| "loss": 0.0786, |
| "num_tokens": 4330526.0, |
| "reward": -0.7287623286247253, |
| "reward_std": 1.2205724716186523, |
| "rewards/reward_model/mean": -0.7287623286247253, |
| "rewards/reward_model/std": 1.5410621166229248, |
| "step": 9, |
| "step_time": 168.4490856071934 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4384765625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 87.068359375, |
| "completions/mean_terminated_length": 55.10608673095703, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 2.068316952791065, |
| "epoch": 0.004118616144975288, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0786375999450684, |
| "kl": 0.0011272716699295415, |
| "learning_rate": 1.1111111111111111e-07, |
| "loss": 0.0808, |
| "num_tokens": 4813482.0, |
| "reward": -0.8588310480117798, |
| "reward_std": 1.1204930543899536, |
| "rewards/reward_model/mean": -0.8588310480117798, |
| "rewards/reward_model/std": 1.4020955562591553, |
| "step": 10, |
| "step_time": 169.3498973324895 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.486328125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 88.27197265625, |
| "completions/mean_terminated_length": 50.65874481201172, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.085946503095329, |
| "epoch": 0.004530477759472817, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1249303817749023, |
| "kl": 0.0010357791773003555, |
| "learning_rate": 1.234567901234568e-07, |
| "loss": 0.103, |
| "num_tokens": 5276279.0, |
| "reward": -0.7370425462722778, |
| "reward_std": 1.1393404006958008, |
| "rewards/reward_model/mean": -0.7370425462722778, |
| "rewards/reward_model/std": 1.435203194618225, |
| "step": 11, |
| "step_time": 169.61693120608106 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.435546875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 84.62060546875, |
| "completions/mean_terminated_length": 51.147926330566406, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "entropy": 2.0578739237971604, |
| "epoch": 0.004942339373970346, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1678587198257446, |
| "kl": 0.0010205526389199804, |
| "learning_rate": 1.3580246913580248e-07, |
| "loss": 0.129, |
| "num_tokens": 5750318.0, |
| "reward": -0.6621623039245605, |
| "reward_std": 1.1341545581817627, |
| "rewards/reward_model/mean": -0.6621623039245605, |
| "rewards/reward_model/std": 1.4956636428833008, |
| "step": 12, |
| "step_time": 170.6942683076486 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.48876953125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 89.955078125, |
| "completions/mean_terminated_length": 53.581661224365234, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 2.1759862853214145, |
| "epoch": 0.005354200988467875, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9353419542312622, |
| "kl": 0.0009984489861381007, |
| "learning_rate": 1.4814814814814815e-07, |
| "loss": 0.0706, |
| "num_tokens": 6237106.0, |
| "reward": -0.71650230884552, |
| "reward_std": 1.1081366539001465, |
| "rewards/reward_model/mean": -0.71650230884552, |
| "rewards/reward_model/std": 1.4882901906967163, |
| "step": 13, |
| "step_time": 168.60461562033743 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.47265625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 127.0, |
| "completions/mean_length": 90.6298828125, |
| "completions/mean_terminated_length": 57.13518524169922, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.1225685542449355, |
| "epoch": 0.005766062602965404, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9926177263259888, |
| "kl": 0.0010822901392657513, |
| "learning_rate": 1.6049382716049383e-07, |
| "loss": 0.0705, |
| "num_tokens": 6768988.0, |
| "reward": -0.8033103346824646, |
| "reward_std": 1.1658474206924438, |
| "rewards/reward_model/mean": -0.8033103346824646, |
| "rewards/reward_model/std": 1.5343424081802368, |
| "step": 14, |
| "step_time": 169.76986178942025 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.43359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 87.02978515625, |
| "completions/mean_terminated_length": 55.666378021240234, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.2208039346151054, |
| "epoch": 0.006177924217462933, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.110655426979065, |
| "kl": 0.00102766218378747, |
| "learning_rate": 1.7283950617283952e-07, |
| "loss": 0.1137, |
| "num_tokens": 7264761.0, |
| "reward": -0.8211149573326111, |
| "reward_std": 1.1067304611206055, |
| "rewards/reward_model/mean": -0.8211149573326111, |
| "rewards/reward_model/std": 1.4263983964920044, |
| "step": 15, |
| "step_time": 169.37415388552472 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.447265625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 89.8525390625, |
| "completions/mean_terminated_length": 58.984100341796875, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "entropy": 2.036483039613813, |
| "epoch": 0.006589785831960461, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0956333875656128, |
| "kl": 0.0011385848249574337, |
| "learning_rate": 1.8518518518518518e-07, |
| "loss": 0.0679, |
| "num_tokens": 7745675.0, |
| "reward": -0.5313577651977539, |
| "reward_std": 1.1804759502410889, |
| "rewards/reward_model/mean": -0.5313577651977539, |
| "rewards/reward_model/std": 1.5051146745681763, |
| "step": 16, |
| "step_time": 168.84541190741584 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46142578125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 87.02197265625, |
| "completions/mean_terminated_length": 51.91387176513672, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 2.1997494087554514, |
| "epoch": 0.00700164744645799, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0221517086029053, |
| "kl": 0.0010186115512169636, |
| "learning_rate": 1.975308641975309e-07, |
| "loss": 0.0962, |
| "num_tokens": 8243704.0, |
| "reward": -0.89983731508255, |
| "reward_std": 1.135831356048584, |
| "rewards/reward_model/mean": -0.89983731508255, |
| "rewards/reward_model/std": 1.4320958852767944, |
| "step": 17, |
| "step_time": 168.78324813907966 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.45947265625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 88.05908203125, |
| "completions/mean_terminated_length": 54.10749816894531, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.176318216137588, |
| "epoch": 0.007413509060955519, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0268303155899048, |
| "kl": 0.0010050315393073106, |
| "learning_rate": 2.0987654320987656e-07, |
| "loss": 0.0845, |
| "num_tokens": 8726801.0, |
| "reward": -0.7434755563735962, |
| "reward_std": 1.1786913871765137, |
| "rewards/reward_model/mean": -0.7434755563735962, |
| "rewards/reward_model/std": 1.4701310396194458, |
| "step": 18, |
| "step_time": 168.51915573468432 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4658203125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 88.00439453125, |
| "completions/mean_terminated_length": 53.12705993652344, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.0752989565953612, |
| "epoch": 0.007825370675453048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0904563665390015, |
| "kl": 0.00123250130627639, |
| "learning_rate": 2.2222222222222222e-07, |
| "loss": 0.0885, |
| "num_tokens": 9180858.0, |
| "reward": -0.8568893074989319, |
| "reward_std": 1.1963412761688232, |
| "rewards/reward_model/mean": -0.8568893074989319, |
| "rewards/reward_model/std": 1.5186042785644531, |
| "step": 19, |
| "step_time": 170.01141701499 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4033203125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 83.22021484375, |
| "completions/mean_terminated_length": 52.95172119140625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 2.200795284938067, |
| "epoch": 0.008237232289950576, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0073621273040771, |
| "kl": 0.0012237756848207937, |
| "learning_rate": 2.3456790123456793e-07, |
| "loss": 0.0782, |
| "num_tokens": 9702557.0, |
| "reward": -0.9474191069602966, |
| "reward_std": 1.101952314376831, |
| "rewards/reward_model/mean": -0.9474191665649414, |
| "rewards/reward_model/std": 1.514784336090088, |
| "step": 20, |
| "step_time": 168.7904914407991 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.44189453125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 127.0, |
| "completions/mean_length": 85.07275390625, |
| "completions/mean_terminated_length": 51.08399200439453, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.1343746068887413, |
| "epoch": 0.008649093904448105, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1044775247573853, |
| "kl": 0.0011666002192214364, |
| "learning_rate": 2.469135802469136e-07, |
| "loss": 0.0659, |
| "num_tokens": 10182002.0, |
| "reward": -0.8981258869171143, |
| "reward_std": 1.1897304058074951, |
| "rewards/reward_model/mean": -0.8981258869171143, |
| "rewards/reward_model/std": 1.4881244897842407, |
| "step": 21, |
| "step_time": 168.61277754418552 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4580078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 88.984375, |
| "completions/mean_terminated_length": 56.0144157409668, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.060287212021649, |
| "epoch": 0.009060955518945634, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9674479961395264, |
| "kl": 0.0012853052542141086, |
| "learning_rate": 2.5925925925925923e-07, |
| "loss": 0.0795, |
| "num_tokens": 10599858.0, |
| "reward": -0.7459607720375061, |
| "reward_std": 1.18560791015625, |
| "rewards/reward_model/mean": -0.7459607720375061, |
| "rewards/reward_model/std": 1.4447804689407349, |
| "step": 22, |
| "step_time": 168.57235636515543 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.45751953125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 87.20947265625, |
| "completions/mean_terminated_length": 52.80738067626953, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.9992840560153127, |
| "epoch": 0.009472817133443162, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2746193408966064, |
| "kl": 0.001352380840899059, |
| "learning_rate": 2.7160493827160497e-07, |
| "loss": 0.0805, |
| "num_tokens": 11135295.0, |
| "reward": -0.9941644668579102, |
| "reward_std": 1.2033442258834839, |
| "rewards/reward_model/mean": -0.9941644668579102, |
| "rewards/reward_model/std": 1.5118839740753174, |
| "step": 23, |
| "step_time": 168.93097670795396 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.458984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 88.12744140625, |
| "completions/mean_terminated_length": 54.300540924072266, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 2.1036753226071596, |
| "epoch": 0.009884678747940691, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9719477295875549, |
| "kl": 0.0014479562626092957, |
| "learning_rate": 2.839506172839506e-07, |
| "loss": 0.0792, |
| "num_tokens": 11647428.0, |
| "reward": -0.7246302366256714, |
| "reward_std": 1.1223700046539307, |
| "rewards/reward_model/mean": -0.7246302366256714, |
| "rewards/reward_model/std": 1.4486252069473267, |
| "step": 24, |
| "step_time": 168.1220847275108 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.48876953125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 92.43017578125, |
| "completions/mean_terminated_length": 58.42311477661133, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.0290252747945487, |
| "epoch": 0.01029654036243822, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.036043643951416, |
| "kl": 0.0014065650616430503, |
| "learning_rate": 2.962962962962963e-07, |
| "loss": 0.0464, |
| "num_tokens": 12175925.0, |
| "reward": -0.8139803409576416, |
| "reward_std": 1.18918776512146, |
| "rewards/reward_model/mean": -0.8139803409576416, |
| "rewards/reward_model/std": 1.5184983015060425, |
| "step": 25, |
| "step_time": 169.08092289417982 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.48193359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 91.38720703125, |
| "completions/mean_terminated_length": 57.3279914855957, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.0865674833767116, |
| "epoch": 0.01070840197693575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9221011996269226, |
| "kl": 0.0016496309149260924, |
| "learning_rate": 3.08641975308642e-07, |
| "loss": 0.0459, |
| "num_tokens": 12671022.0, |
| "reward": -0.6815944910049438, |
| "reward_std": 1.1987043619155884, |
| "rewards/reward_model/mean": -0.6815944910049438, |
| "rewards/reward_model/std": 1.503211259841919, |
| "step": 26, |
| "step_time": 169.66068721655756 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.39208984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.84814453125, |
| "completions/mean_terminated_length": 53.726104736328125, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.100129804573953, |
| "epoch": 0.011120263591433279, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0829551219940186, |
| "kl": 0.0020542718434626295, |
| "learning_rate": 3.2098765432098767e-07, |
| "loss": 0.1111, |
| "num_tokens": 13159479.0, |
| "reward": -0.7841147780418396, |
| "reward_std": 1.1083781719207764, |
| "rewards/reward_model/mean": -0.7841147780418396, |
| "rewards/reward_model/std": 1.398116946220398, |
| "step": 27, |
| "step_time": 170.63890342088416 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.41845703125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 84.29150390625, |
| "completions/mean_terminated_length": 52.84046936035156, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.0639419481158257, |
| "epoch": 0.011532125205930808, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9612188935279846, |
| "kl": 0.002871143702122936, |
| "learning_rate": 3.333333333333333e-07, |
| "loss": 0.0511, |
| "num_tokens": 13624972.0, |
| "reward": -0.41133514046669006, |
| "reward_std": 1.0870225429534912, |
| "rewards/reward_model/mean": -0.41133514046669006, |
| "rewards/reward_model/std": 1.3928031921386719, |
| "step": 28, |
| "step_time": 169.01845826301724 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3896484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.62353515625, |
| "completions/mean_terminated_length": 53.65519714355469, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.0633218064904213, |
| "epoch": 0.011943986820428337, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0220043659210205, |
| "kl": 0.004001651409907936, |
| "learning_rate": 3.4567901234567904e-07, |
| "loss": 0.0608, |
| "num_tokens": 14084265.0, |
| "reward": -0.5280731916427612, |
| "reward_std": 1.139591097831726, |
| "rewards/reward_model/mean": -0.5280731916427612, |
| "rewards/reward_model/std": 1.5284217596054077, |
| "step": 29, |
| "step_time": 169.68904952565208 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3447265625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.75439453125, |
| "completions/mean_terminated_length": 51.321163177490234, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.053064794279635, |
| "epoch": 0.012355848434925865, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0159528255462646, |
| "kl": 0.004774259470650577, |
| "learning_rate": 3.580246913580247e-07, |
| "loss": 0.0826, |
| "num_tokens": 14545778.0, |
| "reward": -0.8308598399162292, |
| "reward_std": 1.1439062356948853, |
| "rewards/reward_model/mean": -0.8308598399162292, |
| "rewards/reward_model/std": 1.4677071571350098, |
| "step": 30, |
| "step_time": 169.4162016301416 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40478515625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 84.13525390625, |
| "completions/mean_terminated_length": 54.30434799194336, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.9778149635531008, |
| "epoch": 0.012767710049423394, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9643027186393738, |
| "kl": 0.004632160428627685, |
| "learning_rate": 3.7037037037037036e-07, |
| "loss": 0.0324, |
| "num_tokens": 15021479.0, |
| "reward": -0.5928993225097656, |
| "reward_std": 1.0915915966033936, |
| "rewards/reward_model/mean": -0.5928993225097656, |
| "rewards/reward_model/std": 1.4171936511993408, |
| "step": 31, |
| "step_time": 169.55369784962386 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.39404296875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.93603515625, |
| "completions/mean_terminated_length": 51.981468200683594, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.9114997563883662, |
| "epoch": 0.013179571663920923, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9297622442245483, |
| "kl": 0.005185256256481807, |
| "learning_rate": 3.8271604938271605e-07, |
| "loss": 0.0641, |
| "num_tokens": 15490468.0, |
| "reward": -0.4294321537017822, |
| "reward_std": 1.1095049381256104, |
| "rewards/reward_model/mean": -0.4294321537017822, |
| "rewards/reward_model/std": 1.4001518487930298, |
| "step": 32, |
| "step_time": 169.97963417787105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4033203125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 84.73388671875, |
| "completions/mean_terminated_length": 55.48854446411133, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.9920633286237717, |
| "epoch": 0.013591433278418451, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9739150404930115, |
| "kl": 0.0050489629365984, |
| "learning_rate": 3.950617283950618e-07, |
| "loss": 0.069, |
| "num_tokens": 16033027.0, |
| "reward": -0.5853164792060852, |
| "reward_std": 1.1397128105163574, |
| "rewards/reward_model/mean": -0.5853164792060852, |
| "rewards/reward_model/std": 1.4342437982559204, |
| "step": 33, |
| "step_time": 170.0918092643842 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.33349609375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.16943359375, |
| "completions/mean_terminated_length": 56.236629486083984, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.052212963812053, |
| "epoch": 0.01400329489291598, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0108827352523804, |
| "kl": 0.0070345894837373635, |
| "learning_rate": 4.0740740740740737e-07, |
| "loss": 0.0717, |
| "num_tokens": 16443422.0, |
| "reward": -0.40320760011672974, |
| "reward_std": 1.023691177368164, |
| "rewards/reward_model/mean": -0.40320760011672974, |
| "rewards/reward_model/std": 1.3064631223678589, |
| "step": 34, |
| "step_time": 168.22575595136732 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3896484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.1884765625, |
| "completions/mean_terminated_length": 52.94239807128906, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.8943170690909028, |
| "epoch": 0.014415156507413509, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9463284015655518, |
| "kl": 0.006503033908302314, |
| "learning_rate": 4.197530864197531e-07, |
| "loss": 0.0771, |
| "num_tokens": 16958848.0, |
| "reward": -0.46641844511032104, |
| "reward_std": 1.1392958164215088, |
| "rewards/reward_model/mean": -0.46641844511032104, |
| "rewards/reward_model/std": 1.3904635906219482, |
| "step": 35, |
| "step_time": 169.57979472074658 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2724609375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 69.291015625, |
| "completions/mean_terminated_length": 47.3046989440918, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 2.0284548006020486, |
| "epoch": 0.014827018121911038, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.202287197113037, |
| "kl": 0.008711300118193321, |
| "learning_rate": 4.320987654320988e-07, |
| "loss": 0.0853, |
| "num_tokens": 17445812.0, |
| "reward": -0.5143425464630127, |
| "reward_std": 1.080782175064087, |
| "rewards/reward_model/mean": -0.5143425464630127, |
| "rewards/reward_model/std": 1.3849540948867798, |
| "step": 36, |
| "step_time": 169.69654387421906 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3369140625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.77197265625, |
| "completions/mean_terminated_length": 52.25110626220703, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.8757861303165555, |
| "epoch": 0.015238879736408566, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8937506675720215, |
| "kl": 0.010500569681425986, |
| "learning_rate": 4.4444444444444444e-07, |
| "loss": 0.0466, |
| "num_tokens": 17886785.0, |
| "reward": -0.2941930890083313, |
| "reward_std": 1.089874267578125, |
| "rewards/reward_model/mean": -0.2941930890083313, |
| "rewards/reward_model/std": 1.3422448635101318, |
| "step": 37, |
| "step_time": 168.3831845112145 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.33203125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.78515625, |
| "completions/mean_terminated_length": 54.32163619995117, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.8339524874463677, |
| "epoch": 0.015650741350906095, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8777432441711426, |
| "kl": 0.014370948238138226, |
| "learning_rate": 4.567901234567901e-07, |
| "loss": 0.0431, |
| "num_tokens": 18363593.0, |
| "reward": -0.21549299359321594, |
| "reward_std": 1.0654486417770386, |
| "rewards/reward_model/mean": -0.21549299359321594, |
| "rewards/reward_model/std": 1.286303997039795, |
| "step": 38, |
| "step_time": 168.6560257449746 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3623046875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.3076171875, |
| "completions/mean_terminated_length": 53.211334228515625, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.809513804037124, |
| "epoch": 0.016062602965403624, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.850346028804779, |
| "kl": 0.019628787720648688, |
| "learning_rate": 4.6913580246913586e-07, |
| "loss": -0.0144, |
| "num_tokens": 18782015.0, |
| "reward": -0.19260446727275848, |
| "reward_std": 1.0799050331115723, |
| "rewards/reward_model/mean": -0.19260446727275848, |
| "rewards/reward_model/std": 1.4198755025863647, |
| "step": 39, |
| "step_time": 169.06077374424785 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3212890625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.080078125, |
| "completions/mean_terminated_length": 54.44892120361328, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.856378594879061, |
| "epoch": 0.016474464579901153, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8225836753845215, |
| "kl": 0.023546985856228275, |
| "learning_rate": 4.814814814814815e-07, |
| "loss": 0.0268, |
| "num_tokens": 19238691.0, |
| "reward": -0.22110876441001892, |
| "reward_std": 1.0441968441009521, |
| "rewards/reward_model/mean": -0.22110876441001892, |
| "rewards/reward_model/std": 1.3271934986114502, |
| "step": 40, |
| "step_time": 169.5594472438097 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.33251953125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.1416015625, |
| "completions/mean_terminated_length": 51.805416107177734, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.764324402436614, |
| "epoch": 0.01688632619439868, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7773052453994751, |
| "kl": 0.029280464848852716, |
| "learning_rate": 4.938271604938272e-07, |
| "loss": 0.0194, |
| "num_tokens": 19755301.0, |
| "reward": -0.12512998282909393, |
| "reward_std": 1.0090844631195068, |
| "rewards/reward_model/mean": -0.12512998282909393, |
| "rewards/reward_model/std": 1.2345008850097656, |
| "step": 41, |
| "step_time": 170.39410974271595 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.27490234375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 72.548828125, |
| "completions/mean_terminated_length": 51.52592468261719, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.758555585052818, |
| "epoch": 0.01729818780889621, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8362958431243896, |
| "kl": 0.03495379232481355, |
| "learning_rate": 5.061728395061729e-07, |
| "loss": 0.0014, |
| "num_tokens": 20199209.0, |
| "reward": -0.034443896263837814, |
| "reward_std": 1.0466477870941162, |
| "rewards/reward_model/mean": -0.034443896263837814, |
| "rewards/reward_model/std": 1.2755711078643799, |
| "step": 42, |
| "step_time": 170.17393092392012 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25830078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 74.35302734375, |
| "completions/mean_terminated_length": 55.6701774597168, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.6830066749826074, |
| "epoch": 0.01771004942339374, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9207571148872375, |
| "kl": 0.03826815243519377, |
| "learning_rate": 5.185185185185185e-07, |
| "loss": 0.0197, |
| "num_tokens": 20667900.0, |
| "reward": -0.03724297881126404, |
| "reward_std": 0.9730924367904663, |
| "rewards/reward_model/mean": -0.03724297881126404, |
| "rewards/reward_model/std": 1.1648329496383667, |
| "step": 43, |
| "step_time": 168.8539799619466 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3427734375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.59326171875, |
| "completions/mean_terminated_length": 58.911590576171875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.722061135340482, |
| "epoch": 0.018121911037891267, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.691871166229248, |
| "kl": 0.035297417802212294, |
| "learning_rate": 5.308641975308642e-07, |
| "loss": 0.0252, |
| "num_tokens": 21084443.0, |
| "reward": 0.1364922821521759, |
| "reward_std": 0.9992862939834595, |
| "rewards/reward_model/mean": 0.1364922821521759, |
| "rewards/reward_model/std": 1.338813066482544, |
| "step": 44, |
| "step_time": 168.30778062017635 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2216796875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 70.23193359375, |
| "completions/mean_terminated_length": 53.77854537963867, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.6960708745755255, |
| "epoch": 0.018533772652388796, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8769946694374084, |
| "kl": 0.04808991262325435, |
| "learning_rate": 5.432098765432099e-07, |
| "loss": -0.0016, |
| "num_tokens": 21510710.0, |
| "reward": 0.1898983120918274, |
| "reward_std": 0.9757044911384583, |
| "rewards/reward_model/mean": 0.1898983120918274, |
| "rewards/reward_model/std": 1.1677379608154297, |
| "step": 45, |
| "step_time": 169.03864477854222 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.28125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.9912109375, |
| "completions/mean_terminated_length": 58.42255401611328, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.6502066934481263, |
| "epoch": 0.018945634266886325, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6699737906455994, |
| "kl": 0.04364871226789546, |
| "learning_rate": 5.555555555555555e-07, |
| "loss": 0.0298, |
| "num_tokens": 21968036.0, |
| "reward": 0.14871619641780853, |
| "reward_std": 0.8983126878738403, |
| "rewards/reward_model/mean": 0.14871619641780853, |
| "rewards/reward_model/std": 1.1425597667694092, |
| "step": 46, |
| "step_time": 169.17142802104354 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.27294921875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 73.318359375, |
| "completions/mean_terminated_length": 52.789791107177734, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.806841244455427, |
| "epoch": 0.019357495881383854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7548915147781372, |
| "kl": 0.050439021695638075, |
| "learning_rate": 5.679012345679012e-07, |
| "loss": 0.0261, |
| "num_tokens": 22428752.0, |
| "reward": 0.18833398818969727, |
| "reward_std": 0.9490935802459717, |
| "rewards/reward_model/mean": 0.18833398818969727, |
| "rewards/reward_model/std": 1.218595027923584, |
| "step": 47, |
| "step_time": 169.8053262718022 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.23583984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 67.080078125, |
| "completions/mean_terminated_length": 48.278594970703125, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.7256725700572133, |
| "epoch": 0.019769357495881382, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8408772349357605, |
| "kl": 0.05761563615669729, |
| "learning_rate": 5.80246913580247e-07, |
| "loss": -0.0234, |
| "num_tokens": 22883444.0, |
| "reward": 0.10819900035858154, |
| "reward_std": 0.9136756062507629, |
| "rewards/reward_model/mean": 0.10819900035858154, |
| "rewards/reward_model/std": 1.13023042678833, |
| "step": 48, |
| "step_time": 170.28967663506046 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25927734375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 71.54296875, |
| "completions/mean_terminated_length": 51.78114700317383, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.676765794865787, |
| "epoch": 0.02018121911037891, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6633880138397217, |
| "kl": 0.05726497815339826, |
| "learning_rate": 5.925925925925926e-07, |
| "loss": 0.0077, |
| "num_tokens": 23311980.0, |
| "reward": 0.31039929389953613, |
| "reward_std": 0.8826955556869507, |
| "rewards/reward_model/mean": 0.31039929389953613, |
| "rewards/reward_model/std": 1.1599924564361572, |
| "step": 49, |
| "step_time": 169.17084869695827 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2392578125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 73.46142578125, |
| "completions/mean_terminated_length": 56.30873107910156, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.5921450154855847, |
| "epoch": 0.02059308072487644, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7272253632545471, |
| "kl": 0.05662718684470747, |
| "learning_rate": 6.049382716049383e-07, |
| "loss": -0.0038, |
| "num_tokens": 23729245.0, |
| "reward": 0.2335912585258484, |
| "reward_std": 0.9175702929496765, |
| "rewards/reward_model/mean": 0.2335912585258484, |
| "rewards/reward_model/std": 1.1314274072647095, |
| "step": 50, |
| "step_time": 169.32695539435372 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.26220703125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.47705078125, |
| "completions/mean_terminated_length": 59.521507263183594, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.630979296285659, |
| "epoch": 0.021004942339373972, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7209051847457886, |
| "kl": 0.0596031873501488, |
| "learning_rate": 6.17283950617284e-07, |
| "loss": -0.019, |
| "num_tokens": 24171054.0, |
| "reward": 0.3881710171699524, |
| "reward_std": 0.9779696464538574, |
| "rewards/reward_model/mean": 0.3881710171699524, |
| "rewards/reward_model/std": 1.2501736879348755, |
| "step": 51, |
| "step_time": 169.39474018104374 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3837890625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 87.35888671875, |
| "completions/mean_terminated_length": 62.0467529296875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.5277671799995005, |
| "epoch": 0.0214168039538715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6657931208610535, |
| "kl": 0.04347534721819102, |
| "learning_rate": 6.296296296296296e-07, |
| "loss": 0.0084, |
| "num_tokens": 24640845.0, |
| "reward": 0.40088099241256714, |
| "reward_std": 0.8522671461105347, |
| "rewards/reward_model/mean": 0.40088099241256714, |
| "rewards/reward_model/std": 1.1760755777359009, |
| "step": 52, |
| "step_time": 169.99416326358914 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.26708984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.86572265625, |
| "completions/mean_terminated_length": 58.23118209838867, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.5355965252965689, |
| "epoch": 0.02182866556836903, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7246338725090027, |
| "kl": 0.059198625254794024, |
| "learning_rate": 6.419753086419753e-07, |
| "loss": 0.0065, |
| "num_tokens": 25146234.0, |
| "reward": 0.32493141293525696, |
| "reward_std": 0.8951080441474915, |
| "rewards/reward_model/mean": 0.32493141293525696, |
| "rewards/reward_model/std": 1.109892725944519, |
| "step": 53, |
| "step_time": 170.4134237067774 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.26220703125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.32373046875, |
| "completions/mean_terminated_length": 59.313697814941406, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.4214782847557217, |
| "epoch": 0.022240527182866558, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.663506805896759, |
| "kl": 0.07469483163731638, |
| "learning_rate": 6.54320987654321e-07, |
| "loss": -0.0106, |
| "num_tokens": 25556273.0, |
| "reward": 0.5579333305358887, |
| "reward_std": 0.8257571458816528, |
| "rewards/reward_model/mean": 0.5579333305358887, |
| "rewards/reward_model/std": 1.0652962923049927, |
| "step": 54, |
| "step_time": 168.6097109238617 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2158203125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 67.8369140625, |
| "completions/mean_terminated_length": 51.278953552246094, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.4409256265498698, |
| "epoch": 0.022652388797364087, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6998105645179749, |
| "kl": 0.09127819760760758, |
| "learning_rate": 6.666666666666666e-07, |
| "loss": -0.0145, |
| "num_tokens": 26009315.0, |
| "reward": 0.36190831661224365, |
| "reward_std": 0.8500241637229919, |
| "rewards/reward_model/mean": 0.36190831661224365, |
| "rewards/reward_model/std": 1.0737853050231934, |
| "step": 55, |
| "step_time": 169.63903413154185 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2265625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 71.69873046875, |
| "completions/mean_terminated_length": 55.20643997192383, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.4477170635946095, |
| "epoch": 0.023064250411861616, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6856215596199036, |
| "kl": 0.10684883118665311, |
| "learning_rate": 6.790123456790124e-07, |
| "loss": -0.0154, |
| "num_tokens": 26453082.0, |
| "reward": 0.5452687740325928, |
| "reward_std": 0.7654911875724792, |
| "rewards/reward_model/mean": 0.5452687740325928, |
| "rewards/reward_model/std": 0.9965056777000427, |
| "step": 56, |
| "step_time": 167.3982848683372 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.31689453125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.6259765625, |
| "completions/mean_terminated_length": 58.6490364074707, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.3876500492915511, |
| "epoch": 0.023476112026359144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5802730321884155, |
| "kl": 0.08226650860888185, |
| "learning_rate": 6.913580246913581e-07, |
| "loss": -0.0124, |
| "num_tokens": 26859036.0, |
| "reward": 0.4985049366950989, |
| "reward_std": 0.8241320252418518, |
| "rewards/reward_model/mean": 0.4985049366950989, |
| "rewards/reward_model/std": 1.177066683769226, |
| "step": 57, |
| "step_time": 168.99145932588726 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.29150390625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.41162109375, |
| "completions/mean_terminated_length": 62.24327850341797, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.3736186842434108, |
| "epoch": 0.023887973640856673, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.672706127166748, |
| "kl": 0.0943700206480571, |
| "learning_rate": 7.037037037037037e-07, |
| "loss": 0.0084, |
| "num_tokens": 27318439.0, |
| "reward": 0.7748833894729614, |
| "reward_std": 0.8400471806526184, |
| "rewards/reward_model/mean": 0.7748833894729614, |
| "rewards/reward_model/std": 1.0984324216842651, |
| "step": 58, |
| "step_time": 168.8087218273431 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.30419921875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.111328125, |
| "completions/mean_terminated_length": 57.737545013427734, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.3578353270422667, |
| "epoch": 0.024299835255354202, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6414256691932678, |
| "kl": 0.10844983752758708, |
| "learning_rate": 7.160493827160494e-07, |
| "loss": 0.0049, |
| "num_tokens": 27845483.0, |
| "reward": 0.6581840515136719, |
| "reward_std": 0.7642059326171875, |
| "rewards/reward_model/mean": 0.6581840515136719, |
| "rewards/reward_model/std": 1.0230196714401245, |
| "step": 59, |
| "step_time": 170.09878712054342 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.20947265625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 68.9013671875, |
| "completions/mean_terminated_length": 53.24150848388672, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.359937054105103, |
| "epoch": 0.02471169686985173, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6327216029167175, |
| "kl": 0.12389619748864789, |
| "learning_rate": 7.283950617283951e-07, |
| "loss": -0.0019, |
| "num_tokens": 28360609.0, |
| "reward": 0.5726691484451294, |
| "reward_std": 0.7263065576553345, |
| "rewards/reward_model/mean": 0.5726691484451294, |
| "rewards/reward_model/std": 1.0532201528549194, |
| "step": 60, |
| "step_time": 169.033332105726 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21728515625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 74.28515625, |
| "completions/mean_terminated_length": 59.37367248535156, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.2873307822737843, |
| "epoch": 0.02512355848434926, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6471663117408752, |
| "kl": 0.11947597128164489, |
| "learning_rate": 7.407407407407407e-07, |
| "loss": -0.018, |
| "num_tokens": 28796649.0, |
| "reward": 0.8057171106338501, |
| "reward_std": 0.6930927038192749, |
| "rewards/reward_model/mean": 0.8057171106338501, |
| "rewards/reward_model/std": 0.9504708647727966, |
| "step": 61, |
| "step_time": 170.3676045727916 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.27880859375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.5634765625, |
| "completions/mean_terminated_length": 62.22477722167969, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.4397811936214566, |
| "epoch": 0.025535420098846788, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5935730934143066, |
| "kl": 0.11211827301303856, |
| "learning_rate": 7.530864197530865e-07, |
| "loss": 0.005, |
| "num_tokens": 29272715.0, |
| "reward": 0.5376583337783813, |
| "reward_std": 0.7316970825195312, |
| "rewards/reward_model/mean": 0.5376583337783813, |
| "rewards/reward_model/std": 1.0817116498947144, |
| "step": 62, |
| "step_time": 170.07809142861515 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.27978515625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 83.00634765625, |
| "completions/mean_terminated_length": 65.52745819091797, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.2591755213215947, |
| "epoch": 0.025947281713344317, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6332337260246277, |
| "kl": 0.09935169246455189, |
| "learning_rate": 7.654320987654321e-07, |
| "loss": -0.0047, |
| "num_tokens": 29675416.0, |
| "reward": 0.8634133338928223, |
| "reward_std": 0.7280638217926025, |
| "rewards/reward_model/mean": 0.8634133338928223, |
| "rewards/reward_model/std": 1.0552853345870972, |
| "step": 63, |
| "step_time": 169.48511258373037 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.23291015625, |
| "completions/mean_terminated_length": 64.30420684814453, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.3228048181626946, |
| "epoch": 0.026359143327841845, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5896514058113098, |
| "kl": 0.10629570209130179, |
| "learning_rate": 7.777777777777778e-07, |
| "loss": 0.0018, |
| "num_tokens": 30103349.0, |
| "reward": 0.7413797378540039, |
| "reward_std": 0.6787456274032593, |
| "rewards/reward_model/mean": 0.7413797378540039, |
| "rewards/reward_model/std": 0.9844362735748291, |
| "step": 64, |
| "step_time": 168.49784950073808 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2080078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.34521484375, |
| "completions/mean_terminated_length": 64.04130554199219, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.3859816826879978, |
| "epoch": 0.026771004942339374, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.569113552570343, |
| "kl": 0.1355650291661732, |
| "learning_rate": 7.901234567901236e-07, |
| "loss": 0.0008, |
| "num_tokens": 30598360.0, |
| "reward": 0.8138879537582397, |
| "reward_std": 0.6921124458312988, |
| "rewards/reward_model/mean": 0.8138879537582397, |
| "rewards/reward_model/std": 1.008180856704712, |
| "step": 65, |
| "step_time": 168.94378049625084 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2333984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.275390625, |
| "completions/mean_terminated_length": 60.52738952636719, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.276806804118678, |
| "epoch": 0.027182866556836903, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.584193229675293, |
| "kl": 0.12889757197990548, |
| "learning_rate": 8.024691358024692e-07, |
| "loss": 0.0109, |
| "num_tokens": 31028524.0, |
| "reward": 0.7695643901824951, |
| "reward_std": 0.7420451641082764, |
| "rewards/reward_model/mean": 0.7695643901824951, |
| "rewards/reward_model/std": 1.1982769966125488, |
| "step": 66, |
| "step_time": 169.0462037078105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2314453125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.75390625, |
| "completions/mean_terminated_length": 67.82718658447266, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.2658436398487538, |
| "epoch": 0.02759472817133443, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5793888568878174, |
| "kl": 0.10512553945591208, |
| "learning_rate": 8.148148148148147e-07, |
| "loss": 0.019, |
| "num_tokens": 31459860.0, |
| "reward": 0.9362199306488037, |
| "reward_std": 0.6280190944671631, |
| "rewards/reward_model/mean": 0.9362199306488037, |
| "rewards/reward_model/std": 1.003322958946228, |
| "step": 67, |
| "step_time": 168.24778978247195 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.20068359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 75.294921875, |
| "completions/mean_terminated_length": 62.06230926513672, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.3218755372799933, |
| "epoch": 0.02800658978583196, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.619716465473175, |
| "kl": 0.13950362912146375, |
| "learning_rate": 8.271604938271605e-07, |
| "loss": 0.0032, |
| "num_tokens": 31900336.0, |
| "reward": 0.7856715321540833, |
| "reward_std": 0.6523309946060181, |
| "rewards/reward_model/mean": 0.7856715321540833, |
| "rewards/reward_model/std": 0.9243690371513367, |
| "step": 68, |
| "step_time": 168.62237379932776 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 72.62060546875, |
| "completions/mean_terminated_length": 59.84074783325195, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.2817874399479479, |
| "epoch": 0.02841845140032949, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6988398432731628, |
| "kl": 0.1405531533237081, |
| "learning_rate": 8.395061728395062e-07, |
| "loss": 0.0001, |
| "num_tokens": 32349991.0, |
| "reward": 0.7539228200912476, |
| "reward_std": 0.6927404403686523, |
| "rewards/reward_model/mean": 0.7539228200912476, |
| "rewards/reward_model/std": 1.1138005256652832, |
| "step": 69, |
| "step_time": 168.95463426411152 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.18310546875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.68505859375, |
| "completions/mean_terminated_length": 65.18290710449219, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "entropy": 1.3052547052502632, |
| "epoch": 0.028830313014827018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5859233140945435, |
| "kl": 0.14809596522536594, |
| "learning_rate": 8.518518518518518e-07, |
| "loss": 0.006, |
| "num_tokens": 32799778.0, |
| "reward": 0.899767279624939, |
| "reward_std": 0.6600509881973267, |
| "rewards/reward_model/mean": 0.899767279624939, |
| "rewards/reward_model/std": 1.0378800630569458, |
| "step": 70, |
| "step_time": 168.27934673754498 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21435546875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.1396484375, |
| "completions/mean_terminated_length": 63.26289749145508, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.2957828119397163, |
| "epoch": 0.029242174629324547, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6063217520713806, |
| "kl": 0.1499464159278432, |
| "learning_rate": 8.641975308641976e-07, |
| "loss": 0.0009, |
| "num_tokens": 33258528.0, |
| "reward": 0.9532963037490845, |
| "reward_std": 0.5860557556152344, |
| "rewards/reward_model/mean": 0.9532963037490845, |
| "rewards/reward_model/std": 0.9753101468086243, |
| "step": 71, |
| "step_time": 168.12876597139984 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.123046875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 71.07666015625, |
| "completions/mean_terminated_length": 63.08964538574219, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.2831250824965537, |
| "epoch": 0.029654036243822075, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.650992214679718, |
| "kl": 0.14023015335260425, |
| "learning_rate": 8.765432098765433e-07, |
| "loss": 0.0053, |
| "num_tokens": 33708125.0, |
| "reward": 0.8464133739471436, |
| "reward_std": 0.5981078147888184, |
| "rewards/reward_model/mean": 0.8464133739471436, |
| "rewards/reward_model/std": 0.9848034977912903, |
| "step": 72, |
| "step_time": 168.81821045372635 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1435546875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 70.923828125, |
| "completions/mean_terminated_length": 61.356895446777344, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.27306635864079, |
| "epoch": 0.030065897858319604, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6549474000930786, |
| "kl": 0.15989831037586555, |
| "learning_rate": 8.888888888888889e-07, |
| "loss": -0.0137, |
| "num_tokens": 34163265.0, |
| "reward": 0.9670735001564026, |
| "reward_std": 0.590969979763031, |
| "rewards/reward_model/mean": 0.9670735001564026, |
| "rewards/reward_model/std": 0.9453141689300537, |
| "step": 73, |
| "step_time": 169.4882780299522 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15234375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.10107421875, |
| "completions/mean_terminated_length": 66.77362060546875, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2641989213880152, |
| "epoch": 0.030477759472817133, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5717378854751587, |
| "kl": 0.13075158167339396, |
| "learning_rate": 9.012345679012347e-07, |
| "loss": 0.0049, |
| "num_tokens": 34635568.0, |
| "reward": 1.0525561571121216, |
| "reward_std": 0.5589165687561035, |
| "rewards/reward_model/mean": 1.0525561571121216, |
| "rewards/reward_model/std": 0.8849756121635437, |
| "step": 74, |
| "step_time": 169.85129849473014 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.18017578125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.9130859375, |
| "completions/mean_terminated_length": 65.6855239868164, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2528326134197414, |
| "epoch": 0.03088962108731466, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5799550414085388, |
| "kl": 0.13488844226230867, |
| "learning_rate": 9.135802469135802e-07, |
| "loss": 0.0233, |
| "num_tokens": 35042110.0, |
| "reward": 1.0121339559555054, |
| "reward_std": 0.6093316078186035, |
| "rewards/reward_model/mean": 1.0121339559555054, |
| "rewards/reward_model/std": 0.9795147776603699, |
| "step": 75, |
| "step_time": 168.39488552790135 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.11328125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 72.35791015625, |
| "completions/mean_terminated_length": 65.24944305419922, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.2186853648163378, |
| "epoch": 0.03130148270181219, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6350732445716858, |
| "kl": 0.13198584888596088, |
| "learning_rate": 9.259259259259259e-07, |
| "loss": 0.0245, |
| "num_tokens": 35454619.0, |
| "reward": 1.1278910636901855, |
| "reward_std": 0.6185814738273621, |
| "rewards/reward_model/mean": 1.1278910636901855, |
| "rewards/reward_model/std": 0.9232901930809021, |
| "step": 76, |
| "step_time": 169.25788368703797 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.13134765625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 70.81689453125, |
| "completions/mean_terminated_length": 62.170318603515625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.247056140564382, |
| "epoch": 0.03171334431630972, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.759410560131073, |
| "kl": 0.16601154587988276, |
| "learning_rate": 9.382716049382717e-07, |
| "loss": -0.0003, |
| "num_tokens": 35845316.0, |
| "reward": 1.0192276239395142, |
| "reward_std": 0.5931369066238403, |
| "rewards/reward_model/mean": 1.0192276239395142, |
| "rewards/reward_model/std": 0.9772949814796448, |
| "step": 77, |
| "step_time": 168.0306376479566 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09521484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 70.13916015625, |
| "completions/mean_terminated_length": 64.05018615722656, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.2540333659853786, |
| "epoch": 0.03212520593080725, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6740376353263855, |
| "kl": 0.1378997444990091, |
| "learning_rate": 9.506172839506173e-07, |
| "loss": -0.013, |
| "num_tokens": 36287137.0, |
| "reward": 0.9819941520690918, |
| "reward_std": 0.604373574256897, |
| "rewards/reward_model/mean": 0.9819941520690918, |
| "rewards/reward_model/std": 0.9436709880828857, |
| "step": 78, |
| "step_time": 168.8950103893876 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09130859375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 68.37744140625, |
| "completions/mean_terminated_length": 62.3863525390625, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.1720305329654366, |
| "epoch": 0.032537067545304776, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6466286182403564, |
| "kl": 0.12133494461886585, |
| "learning_rate": 9.62962962962963e-07, |
| "loss": -0.0048, |
| "num_tokens": 36719654.0, |
| "reward": 1.1645737886428833, |
| "reward_std": 0.5557790398597717, |
| "rewards/reward_model/mean": 1.1645737886428833, |
| "rewards/reward_model/std": 0.9391114711761475, |
| "step": 79, |
| "step_time": 169.75694013293833 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.13330078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 69.42236328125, |
| "completions/mean_terminated_length": 60.412960052490234, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.2074508473742753, |
| "epoch": 0.032948929159802305, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7178550362586975, |
| "kl": 0.16269512700091582, |
| "learning_rate": 9.753086419753088e-07, |
| "loss": 0.0024, |
| "num_tokens": 37198599.0, |
| "reward": 1.0070809125900269, |
| "reward_std": 0.6197090148925781, |
| "rewards/reward_model/mean": 1.0070809125900269, |
| "rewards/reward_model/std": 0.8695884943008423, |
| "step": 80, |
| "step_time": 170.65315298642963 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.10888671875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 69.23583984375, |
| "completions/mean_terminated_length": 62.05534362792969, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.1959036465268582, |
| "epoch": 0.033360790774299834, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6767128705978394, |
| "kl": 0.1640322696766816, |
| "learning_rate": 9.876543209876544e-07, |
| "loss": -0.0102, |
| "num_tokens": 37627498.0, |
| "reward": 1.051206111907959, |
| "reward_std": 0.6312122344970703, |
| "rewards/reward_model/mean": 1.051206111907959, |
| "rewards/reward_model/std": 1.006866455078125, |
| "step": 81, |
| "step_time": 169.01685216045007 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15234375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.1484375, |
| "completions/mean_terminated_length": 66.8294906616211, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2353556689340621, |
| "epoch": 0.03377265238879736, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6121835708618164, |
| "kl": 0.13194930272584315, |
| "learning_rate": 1e-06, |
| "loss": -0.0084, |
| "num_tokens": 38077466.0, |
| "reward": 1.1334960460662842, |
| "reward_std": 0.5429809093475342, |
| "rewards/reward_model/mean": 1.1334960460662842, |
| "rewards/reward_model/std": 1.0062233209609985, |
| "step": 82, |
| "step_time": 169.77917499747127 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1455078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 74.54931640625, |
| "completions/mean_terminated_length": 65.44742584228516, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.2487247881945223, |
| "epoch": 0.03418451400329489, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8223838210105896, |
| "kl": 0.1559760042873677, |
| "learning_rate": 1.0123456790123457e-06, |
| "loss": 0.0192, |
| "num_tokens": 38560095.0, |
| "reward": 1.0274322032928467, |
| "reward_std": 0.6149877309799194, |
| "rewards/reward_model/mean": 1.0274322032928467, |
| "rewards/reward_model/std": 0.8848612308502197, |
| "step": 83, |
| "step_time": 168.7728981245309 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0693359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 68.9453125, |
| "completions/mean_terminated_length": 64.54563903808594, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.203527741599828, |
| "epoch": 0.03459637561779242, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6430616974830627, |
| "kl": 0.16221579984994605, |
| "learning_rate": 1.0246913580246913e-06, |
| "loss": -0.0054, |
| "num_tokens": 38989743.0, |
| "reward": 1.1352043151855469, |
| "reward_std": 0.5808489918708801, |
| "rewards/reward_model/mean": 1.1352043151855469, |
| "rewards/reward_model/std": 1.0034772157669067, |
| "step": 84, |
| "step_time": 167.36356884567067 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08544921875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 73.13720703125, |
| "completions/mean_terminated_length": 68.01121520996094, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.1932638001162559, |
| "epoch": 0.03500823723228995, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.618569552898407, |
| "kl": 0.1509321930789156, |
| "learning_rate": 1.037037037037037e-06, |
| "loss": -0.0105, |
| "num_tokens": 39413960.0, |
| "reward": 1.1917307376861572, |
| "reward_std": 0.6115972995758057, |
| "rewards/reward_model/mean": 1.1917307376861572, |
| "rewards/reward_model/std": 0.8577749729156494, |
| "step": 85, |
| "step_time": 169.4022615076974 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1328125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 74.16455078125, |
| "completions/mean_terminated_length": 65.91948699951172, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.1511160423979163, |
| "epoch": 0.03542009884678748, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6170347929000854, |
| "kl": 0.16613518859958276, |
| "learning_rate": 1.0493827160493827e-06, |
| "loss": 0.0068, |
| "num_tokens": 39897081.0, |
| "reward": 1.2492460012435913, |
| "reward_std": 0.6051790714263916, |
| "rewards/reward_model/mean": 1.2492460012435913, |
| "rewards/reward_model/std": 0.8991779685020447, |
| "step": 86, |
| "step_time": 168.7876625736244 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.091796875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 73.81884765625, |
| "completions/mean_terminated_length": 68.34247589111328, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "entropy": 1.216100089251995, |
| "epoch": 0.035831960461285006, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6116214394569397, |
| "kl": 0.17693783454888035, |
| "learning_rate": 1.0617283950617285e-06, |
| "loss": 0.014, |
| "num_tokens": 40304870.0, |
| "reward": 1.327715277671814, |
| "reward_std": 0.5273313522338867, |
| "rewards/reward_model/mean": 1.327715277671814, |
| "rewards/reward_model/std": 0.8829416036605835, |
| "step": 87, |
| "step_time": 170.00215818034485 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.10498046875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.087890625, |
| "completions/mean_terminated_length": 76.70267486572266, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.2015210629906505, |
| "epoch": 0.036243822075782535, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5836432576179504, |
| "kl": 0.14079495580517687, |
| "learning_rate": 1.074074074074074e-06, |
| "loss": 0.0043, |
| "num_tokens": 40738362.0, |
| "reward": 1.2116522789001465, |
| "reward_std": 0.5785905122756958, |
| "rewards/reward_model/mean": 1.2116522789001465, |
| "rewards/reward_model/std": 0.8599736094474792, |
| "step": 88, |
| "step_time": 170.04560359567404 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07080078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 71.1552734375, |
| "completions/mean_terminated_length": 66.82395935058594, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "entropy": 1.1739569688215852, |
| "epoch": 0.036655683690280064, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5943244099617004, |
| "kl": 0.16060514625860378, |
| "learning_rate": 1.0864197530864199e-06, |
| "loss": -0.0115, |
| "num_tokens": 41174584.0, |
| "reward": 1.079056739807129, |
| "reward_std": 0.5492511987686157, |
| "rewards/reward_model/mean": 1.079056739807129, |
| "rewards/reward_model/std": 0.8875714540481567, |
| "step": 89, |
| "step_time": 170.3232544688508 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08203125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 73.50341796875, |
| "completions/mean_terminated_length": 68.63350677490234, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.2288584825582802, |
| "epoch": 0.03706754530477759, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6955690979957581, |
| "kl": 0.18743510471540503, |
| "learning_rate": 1.0987654320987655e-06, |
| "loss": 0.0392, |
| "num_tokens": 41634879.0, |
| "reward": 1.2760483026504517, |
| "reward_std": 0.5275530219078064, |
| "rewards/reward_model/mean": 1.2760483026504517, |
| "rewards/reward_model/std": 0.8526185154914856, |
| "step": 90, |
| "step_time": 169.28955688048154 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.103515625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.4619140625, |
| "completions/mean_terminated_length": 71.62635803222656, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.1926879836246371, |
| "epoch": 0.03747940691927512, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6305184364318848, |
| "kl": 0.16981972678331658, |
| "learning_rate": 1.111111111111111e-06, |
| "loss": -0.0032, |
| "num_tokens": 42093777.0, |
| "reward": 1.3892216682434082, |
| "reward_std": 0.5210399627685547, |
| "rewards/reward_model/mean": 1.3892216682434082, |
| "rewards/reward_model/std": 0.8532023429870605, |
| "step": 91, |
| "step_time": 169.93193591805175 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1005859375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.02099609375, |
| "completions/mean_terminated_length": 74.65526580810547, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.220891160191968, |
| "epoch": 0.03789126853377265, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5791942477226257, |
| "kl": 0.13990605663275346, |
| "learning_rate": 1.1234567901234568e-06, |
| "loss": 0.021, |
| "num_tokens": 42533916.0, |
| "reward": 1.3777389526367188, |
| "reward_std": 0.5628249049186707, |
| "rewards/reward_model/mean": 1.3777389526367188, |
| "rewards/reward_model/std": 0.8695874214172363, |
| "step": 92, |
| "step_time": 168.67672005156055 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.13427734375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.66845703125, |
| "completions/mean_terminated_length": 74.48223876953125, |
| "completions/min_length": 8.0, |
| "completions/min_terminated_length": 8.0, |
| "entropy": 1.1679968070238829, |
| "epoch": 0.03830313014827018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.577627956867218, |
| "kl": 0.14946075380430557, |
| "learning_rate": 1.1358024691358024e-06, |
| "loss": 0.0005, |
| "num_tokens": 42977269.0, |
| "reward": 1.2400810718536377, |
| "reward_std": 0.5488580465316772, |
| "rewards/reward_model/mean": 1.2400810718536377, |
| "rewards/reward_model/std": 0.8818415999412537, |
| "step": 93, |
| "step_time": 169.11300712404773 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1591796875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 89.02783203125, |
| "completions/mean_terminated_length": 81.64982604980469, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2256716336123645, |
| "epoch": 0.03871499176276771, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3020151853561401, |
| "kl": 0.1575723221176304, |
| "learning_rate": 1.1481481481481482e-06, |
| "loss": 0.0078, |
| "num_tokens": 43468782.0, |
| "reward": 1.2878694534301758, |
| "reward_std": 0.5034958124160767, |
| "rewards/reward_model/mean": 1.2878694534301758, |
| "rewards/reward_model/std": 0.8000524640083313, |
| "step": 94, |
| "step_time": 168.66197129152715 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.146484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.29296875, |
| "completions/mean_terminated_length": 74.4485092163086, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2125889593735337, |
| "epoch": 0.039126853377265236, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5775403380393982, |
| "kl": 0.17348909995052963, |
| "learning_rate": 1.160493827160494e-06, |
| "loss": 0.0291, |
| "num_tokens": 43925606.0, |
| "reward": 1.3064830303192139, |
| "reward_std": 0.5317621231079102, |
| "rewards/reward_model/mean": 1.3064830303192139, |
| "rewards/reward_model/std": 0.8767746090888977, |
| "step": 95, |
| "step_time": 168.66238435404375 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.11376953125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 83.8466796875, |
| "completions/mean_terminated_length": 78.17851257324219, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2149158080574125, |
| "epoch": 0.039538714991762765, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5917447209358215, |
| "kl": 0.16629101379658096, |
| "learning_rate": 1.1728395061728396e-06, |
| "loss": 0.0372, |
| "num_tokens": 44365228.0, |
| "reward": 1.310151219367981, |
| "reward_std": 0.5394536852836609, |
| "rewards/reward_model/mean": 1.310151219367981, |
| "rewards/reward_model/std": 0.8472654223442078, |
| "step": 96, |
| "step_time": 170.65063601452857 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.087890625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.22119140625, |
| "completions/mean_terminated_length": 76.7136001586914, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2331861625425518, |
| "epoch": 0.039950576606260293, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6160045266151428, |
| "kl": 0.18450818251585588, |
| "learning_rate": 1.1851851851851852e-06, |
| "loss": 0.0239, |
| "num_tokens": 44809841.0, |
| "reward": 1.388469934463501, |
| "reward_std": 0.47841960191726685, |
| "rewards/reward_model/mean": 1.388469934463501, |
| "rewards/reward_model/std": 0.7604539394378662, |
| "step": 97, |
| "step_time": 168.5139070255682 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.14404296875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 86.03857421875, |
| "completions/mean_terminated_length": 78.97718811035156, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2734931902959943, |
| "epoch": 0.04036243822075782, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8496507406234741, |
| "kl": 0.1486935554712545, |
| "learning_rate": 1.197530864197531e-06, |
| "loss": 0.024, |
| "num_tokens": 45291392.0, |
| "reward": 1.273500680923462, |
| "reward_std": 0.5131819844245911, |
| "rewards/reward_model/mean": 1.273500680923462, |
| "rewards/reward_model/std": 0.8183842301368713, |
| "step": 98, |
| "step_time": 169.52931605745107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07861328125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.30078125, |
| "completions/mean_terminated_length": 71.8897705078125, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.210193380014971, |
| "epoch": 0.04077429983525535, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6533998847007751, |
| "kl": 0.16968276555417106, |
| "learning_rate": 1.2098765432098765e-06, |
| "loss": 0.0227, |
| "num_tokens": 45769608.0, |
| "reward": 1.374595046043396, |
| "reward_std": 0.5025352835655212, |
| "rewards/reward_model/mean": 1.374595046043396, |
| "rewards/reward_model/std": 0.7355093359947205, |
| "step": 99, |
| "step_time": 170.76728575211018 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.10595703125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.27490234375, |
| "completions/mean_terminated_length": 74.61878204345703, |
| "completions/min_length": 12.0, |
| "completions/min_terminated_length": 12.0, |
| "entropy": 1.1873215795494616, |
| "epoch": 0.04118616144975288, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5959328413009644, |
| "kl": 0.14610896681551822, |
| "learning_rate": 1.2222222222222221e-06, |
| "loss": 0.0218, |
| "num_tokens": 46193339.0, |
| "reward": 1.393322229385376, |
| "reward_std": 0.5245035886764526, |
| "rewards/reward_model/mean": 1.393322229385376, |
| "rewards/reward_model/std": 0.8767962455749512, |
| "step": 100, |
| "step_time": 169.11898464756086 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1171875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.0029296875, |
| "completions/mean_terminated_length": 74.7643814086914, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.1970643543172628, |
| "epoch": 0.04159802306425041, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6192975640296936, |
| "kl": 0.17013628454878926, |
| "learning_rate": 1.234567901234568e-06, |
| "loss": 0.0146, |
| "num_tokens": 46603041.0, |
| "reward": 1.3850064277648926, |
| "reward_std": 0.5491656064987183, |
| "rewards/reward_model/mean": 1.3850064277648926, |
| "rewards/reward_model/std": 0.8517816662788391, |
| "step": 101, |
| "step_time": 172.5300747868605 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09130859375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.30126953125, |
| "completions/mean_terminated_length": 74.40784454345703, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.2366106095723808, |
| "epoch": 0.042009884678747944, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6139182448387146, |
| "kl": 0.15838144707959145, |
| "learning_rate": 1.2469135802469135e-06, |
| "loss": 0.0044, |
| "num_tokens": 47074410.0, |
| "reward": 1.4079811573028564, |
| "reward_std": 0.5216450691223145, |
| "rewards/reward_model/mean": 1.4079811573028564, |
| "rewards/reward_model/std": 0.7702781558036804, |
| "step": 102, |
| "step_time": 168.7905627740547 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 75.34375, |
| "completions/mean_terminated_length": 70.88135528564453, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.2070911666378379, |
| "epoch": 0.04242174629324547, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6438104510307312, |
| "kl": 0.18365478227497078, |
| "learning_rate": 1.2592592592592593e-06, |
| "loss": 0.0099, |
| "num_tokens": 47573482.0, |
| "reward": 1.379532814025879, |
| "reward_std": 0.5358700156211853, |
| "rewards/reward_model/mean": 1.379532814025879, |
| "rewards/reward_model/std": 0.8822428584098816, |
| "step": 103, |
| "step_time": 169.78911154950038 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.12841796875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.322265625, |
| "completions/mean_terminated_length": 73.2974853515625, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.2451989527326077, |
| "epoch": 0.042833607907743, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5943405628204346, |
| "kl": 0.15326940282830037, |
| "learning_rate": 1.271604938271605e-06, |
| "loss": 0.0054, |
| "num_tokens": 48002462.0, |
| "reward": 1.2383322715759277, |
| "reward_std": 0.5271602869033813, |
| "rewards/reward_model/mean": 1.2383322715759277, |
| "rewards/reward_model/std": 1.0347553491592407, |
| "step": 104, |
| "step_time": 168.55086909374222 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06884765625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 72.16455078125, |
| "completions/mean_terminated_length": 68.03618621826172, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.1801623778883368, |
| "epoch": 0.04324546952224053, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6378849148750305, |
| "kl": 0.15821812994545326, |
| "learning_rate": 1.2839506172839507e-06, |
| "loss": 0.0035, |
| "num_tokens": 48392175.0, |
| "reward": 1.5423574447631836, |
| "reward_std": 0.4824356138706207, |
| "rewards/reward_model/mean": 1.5423574447631836, |
| "rewards/reward_model/std": 0.8302909135818481, |
| "step": 105, |
| "step_time": 169.42736366018653 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0830078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.6953125, |
| "completions/mean_terminated_length": 74.23216247558594, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.3169332989491522, |
| "epoch": 0.04365733113673806, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5943061113357544, |
| "kl": 0.17450666631339118, |
| "learning_rate": 1.2962962962962962e-06, |
| "loss": 0.0041, |
| "num_tokens": 48840319.0, |
| "reward": 1.3128201961517334, |
| "reward_std": 0.5468074679374695, |
| "rewards/reward_model/mean": 1.3128201961517334, |
| "rewards/reward_model/std": 0.9597176909446716, |
| "step": 106, |
| "step_time": 169.50068523269147 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08642578125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 73.5673828125, |
| "completions/mean_terminated_length": 68.41796112060547, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.1956646356265992, |
| "epoch": 0.04406919275123559, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9529135823249817, |
| "kl": 0.14988234487827867, |
| "learning_rate": 1.308641975308642e-06, |
| "loss": 0.0263, |
| "num_tokens": 49281225.0, |
| "reward": 1.289332628250122, |
| "reward_std": 0.5048444867134094, |
| "rewards/reward_model/mean": 1.289332628250122, |
| "rewards/reward_model/std": 0.8929986953735352, |
| "step": 107, |
| "step_time": 169.95979618094862 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1025390625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.716796875, |
| "completions/mean_terminated_length": 75.31446838378906, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2545830043964088, |
| "epoch": 0.044481054365733116, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5836382508277893, |
| "kl": 0.14862596249440685, |
| "learning_rate": 1.3209876543209876e-06, |
| "loss": 0.0258, |
| "num_tokens": 49792933.0, |
| "reward": 1.3871095180511475, |
| "reward_std": 0.5084824562072754, |
| "rewards/reward_model/mean": 1.3871095180511475, |
| "rewards/reward_model/std": 0.8852909207344055, |
| "step": 108, |
| "step_time": 169.20244881836697 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04638671875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 70.8515625, |
| "completions/mean_terminated_length": 68.07168579101562, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2631548107601702, |
| "epoch": 0.044892915980230645, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6485495567321777, |
| "kl": 0.1719318167597521, |
| "learning_rate": 1.3333333333333332e-06, |
| "loss": -0.0057, |
| "num_tokens": 50257109.0, |
| "reward": 1.5582630634307861, |
| "reward_std": 0.5006577968597412, |
| "rewards/reward_model/mean": 1.5582630634307861, |
| "rewards/reward_model/std": 0.8305342197418213, |
| "step": 109, |
| "step_time": 168.78190125897527 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0556640625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 71.658203125, |
| "completions/mean_terminated_length": 68.33712768554688, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2364499233663082, |
| "epoch": 0.045304777594728174, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.588637113571167, |
| "kl": 0.16918645988334902, |
| "learning_rate": 1.345679012345679e-06, |
| "loss": 0.0182, |
| "num_tokens": 50728345.0, |
| "reward": 1.4181112051010132, |
| "reward_std": 0.5059062242507935, |
| "rewards/reward_model/mean": 1.4181112051010132, |
| "rewards/reward_model/std": 0.9040238857269287, |
| "step": 110, |
| "step_time": 169.8461561407894 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0908203125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.97265625, |
| "completions/mean_terminated_length": 75.17507934570312, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.3138902625069022, |
| "epoch": 0.0457166392092257, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5418975353240967, |
| "kl": 0.16301694582216442, |
| "learning_rate": 1.3580246913580248e-06, |
| "loss": 0.0118, |
| "num_tokens": 51243937.0, |
| "reward": 1.2584481239318848, |
| "reward_std": 0.4887618124485016, |
| "rewards/reward_model/mean": 1.2584481239318848, |
| "rewards/reward_model/std": 0.9598668813705444, |
| "step": 111, |
| "step_time": 169.55967817036435 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.10009765625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.0, |
| "completions/mean_terminated_length": 71.32718658447266, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2457992839626968, |
| "epoch": 0.04612850082372323, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7095631957054138, |
| "kl": 0.18068589709582739, |
| "learning_rate": 1.3703703703703704e-06, |
| "loss": 0.0037, |
| "num_tokens": 51729121.0, |
| "reward": 1.4246959686279297, |
| "reward_std": 0.44748643040657043, |
| "rewards/reward_model/mean": 1.4246959686279297, |
| "rewards/reward_model/std": 0.8918141722679138, |
| "step": 112, |
| "step_time": 169.78249773895368 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07568359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 73.623046875, |
| "completions/mean_terminated_length": 69.17062377929688, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2187957542482764, |
| "epoch": 0.04654036243822076, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9400354623794556, |
| "kl": 0.22693650811561383, |
| "learning_rate": 1.3827160493827162e-06, |
| "loss": 0.0122, |
| "num_tokens": 52150877.0, |
| "reward": 1.5303287506103516, |
| "reward_std": 0.4950566291809082, |
| "rewards/reward_model/mean": 1.5303287506103516, |
| "rewards/reward_model/std": 0.8175535798072815, |
| "step": 113, |
| "step_time": 169.6679522804916 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05517578125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.8798828125, |
| "completions/mean_terminated_length": 73.89456939697266, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.2598798810504377, |
| "epoch": 0.04695222405271829, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5663942098617554, |
| "kl": 0.18427117861574516, |
| "learning_rate": 1.3950617283950617e-06, |
| "loss": 0.0113, |
| "num_tokens": 52610119.0, |
| "reward": 1.4505566358566284, |
| "reward_std": 0.47873079776763916, |
| "rewards/reward_model/mean": 1.4505566358566284, |
| "rewards/reward_model/std": 0.9025180339813232, |
| "step": 114, |
| "step_time": 168.86495931399986 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0615234375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.5009765625, |
| "completions/mean_terminated_length": 73.12487030029297, |
| "completions/min_length": 9.0, |
| "completions/min_terminated_length": 9.0, |
| "entropy": 1.2603700500912964, |
| "epoch": 0.04736408566721582, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.594477653503418, |
| "kl": 0.1864922004751861, |
| "learning_rate": 1.4074074074074073e-06, |
| "loss": 0.0007, |
| "num_tokens": 53070697.0, |
| "reward": 1.5104587078094482, |
| "reward_std": 0.4391752779483795, |
| "rewards/reward_model/mean": 1.5104587078094482, |
| "rewards/reward_model/std": 0.7633559703826904, |
| "step": 115, |
| "step_time": 170.47059550089762 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08544921875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.56005859375, |
| "completions/mean_terminated_length": 75.03417205810547, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2630685805343091, |
| "epoch": 0.047775947281713346, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5976677536964417, |
| "kl": 0.17307385959429666, |
| "learning_rate": 1.4197530864197531e-06, |
| "loss": 0.0185, |
| "num_tokens": 53542148.0, |
| "reward": 1.419188141822815, |
| "reward_std": 0.4496381878852844, |
| "rewards/reward_model/mean": 1.419188141822815, |
| "rewards/reward_model/std": 0.8634824156761169, |
| "step": 116, |
| "step_time": 169.13070647930726 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.076171875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.9912109375, |
| "completions/mean_terminated_length": 79.2801284790039, |
| "completions/min_length": 10.0, |
| "completions/min_terminated_length": 10.0, |
| "entropy": 1.3025836027227342, |
| "epoch": 0.048187808896210875, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5448313355445862, |
| "kl": 0.1677903521922417, |
| "learning_rate": 1.4320987654320987e-06, |
| "loss": 0.0104, |
| "num_tokens": 53986130.0, |
| "reward": 1.4435430765151978, |
| "reward_std": 0.4609750807285309, |
| "rewards/reward_model/mean": 1.4435430765151978, |
| "rewards/reward_model/std": 0.7846410870552063, |
| "step": 117, |
| "step_time": 169.9405871666968 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0830078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.50244140625, |
| "completions/mean_terminated_length": 74.0218276977539, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.288459766190499, |
| "epoch": 0.048599670510708404, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5833011865615845, |
| "kl": 0.1983755056571681, |
| "learning_rate": 1.4444444444444445e-06, |
| "loss": 0.0079, |
| "num_tokens": 54439799.0, |
| "reward": 1.4345738887786865, |
| "reward_std": 0.48493313789367676, |
| "rewards/reward_model/mean": 1.4345738887786865, |
| "rewards/reward_model/std": 0.8746789693832397, |
| "step": 118, |
| "step_time": 169.1852500126697 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07373046875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.51513671875, |
| "completions/mean_terminated_length": 74.576171875, |
| "completions/min_length": 6.0, |
| "completions/min_terminated_length": 6.0, |
| "entropy": 1.224245477002114, |
| "epoch": 0.04901153212520593, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6017325520515442, |
| "kl": 0.18918700981885195, |
| "learning_rate": 1.4567901234567903e-06, |
| "loss": 0.0202, |
| "num_tokens": 54875606.0, |
| "reward": 1.5676627159118652, |
| "reward_std": 0.4177808165550232, |
| "rewards/reward_model/mean": 1.5676627159118652, |
| "rewards/reward_model/std": 0.854965329170227, |
| "step": 119, |
| "step_time": 169.29946460714564 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.10595703125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.9853515625, |
| "completions/mean_terminated_length": 72.05789184570312, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.209789669374004, |
| "epoch": 0.04942339373970346, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8375149369239807, |
| "kl": 0.20605222514132038, |
| "learning_rate": 1.4691358024691359e-06, |
| "loss": 0.016, |
| "num_tokens": 55328024.0, |
| "reward": 1.4581849575042725, |
| "reward_std": 0.4875085949897766, |
| "rewards/reward_model/mean": 1.4581849575042725, |
| "rewards/reward_model/std": 0.9500890374183655, |
| "step": 120, |
| "step_time": 169.78462026640773 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.11181640625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 84.04736328125, |
| "completions/mean_terminated_length": 78.51402282714844, |
| "completions/min_length": 10.0, |
| "completions/min_terminated_length": 10.0, |
| "entropy": 1.232045111246407, |
| "epoch": 0.04983525535420099, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5587739944458008, |
| "kl": 0.18681989194010384, |
| "learning_rate": 1.4814814814814815e-06, |
| "loss": 0.0177, |
| "num_tokens": 55771033.0, |
| "reward": 1.4740581512451172, |
| "reward_std": 0.4508310556411743, |
| "rewards/reward_model/mean": 1.4740581512451172, |
| "rewards/reward_model/std": 0.8630524277687073, |
| "step": 121, |
| "step_time": 168.61212700419128 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08349609375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.2998046875, |
| "completions/mean_terminated_length": 71.58977508544922, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2581788282841444, |
| "epoch": 0.05024711696869852, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5757594704627991, |
| "kl": 0.19532652158522978, |
| "learning_rate": 1.4938271604938272e-06, |
| "loss": 0.011, |
| "num_tokens": 56215583.0, |
| "reward": 1.4256713390350342, |
| "reward_std": 0.4282228946685791, |
| "rewards/reward_model/mean": 1.4256713390350342, |
| "rewards/reward_model/std": 0.9199231863021851, |
| "step": 122, |
| "step_time": 168.55926717165858 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0947265625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.01416015625, |
| "completions/mean_terminated_length": 72.78370666503906, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2156934614758939, |
| "epoch": 0.05065897858319605, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5999395847320557, |
| "kl": 0.19728930137353018, |
| "learning_rate": 1.506172839506173e-06, |
| "loss": 0.0113, |
| "num_tokens": 56669980.0, |
| "reward": 1.6116740703582764, |
| "reward_std": 0.4245319962501526, |
| "rewards/reward_model/mean": 1.6116740703582764, |
| "rewards/reward_model/std": 0.7885070443153381, |
| "step": 123, |
| "step_time": 168.9781666644849 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07470703125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.9921875, |
| "completions/mean_terminated_length": 72.8738784790039, |
| "completions/min_length": 8.0, |
| "completions/min_terminated_length": 8.0, |
| "entropy": 1.2410368812270463, |
| "epoch": 0.051070840197693576, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5885641574859619, |
| "kl": 0.22556541379890405, |
| "learning_rate": 1.5185185185185186e-06, |
| "loss": 0.0278, |
| "num_tokens": 57086316.0, |
| "reward": 1.7070322036743164, |
| "reward_std": 0.4219028055667877, |
| "rewards/reward_model/mean": 1.7070322036743164, |
| "rewards/reward_model/std": 0.7904437184333801, |
| "step": 124, |
| "step_time": 168.52942496724427 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.14599609375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.66748046875, |
| "completions/mean_terminated_length": 74.91766357421875, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2711665122769773, |
| "epoch": 0.051482701812191105, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5652002096176147, |
| "kl": 0.21976503301993944, |
| "learning_rate": 1.5308641975308642e-06, |
| "loss": 0.025, |
| "num_tokens": 57550787.0, |
| "reward": 1.5571563243865967, |
| "reward_std": 0.43854257464408875, |
| "rewards/reward_model/mean": 1.5571563243865967, |
| "rewards/reward_model/std": 0.9019820690155029, |
| "step": 125, |
| "step_time": 169.8311795401387 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.083984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.79931640625, |
| "completions/mean_terminated_length": 76.47174835205078, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2508959705010056, |
| "epoch": 0.051894563426688634, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.587785542011261, |
| "kl": 0.2117367441242095, |
| "learning_rate": 1.5432098765432098e-06, |
| "loss": 0.0343, |
| "num_tokens": 57980552.0, |
| "reward": 1.6625767946243286, |
| "reward_std": 0.41725289821624756, |
| "rewards/reward_model/mean": 1.6625767946243286, |
| "rewards/reward_model/std": 0.781819760799408, |
| "step": 126, |
| "step_time": 170.32025544391945 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0546875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 74.45458984375, |
| "completions/mean_terminated_length": 71.35691833496094, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2673575547523797, |
| "epoch": 0.05230642504118616, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.663336992263794, |
| "kl": 0.2514068058517296, |
| "learning_rate": 1.5555555555555556e-06, |
| "loss": 0.007, |
| "num_tokens": 58421643.0, |
| "reward": 1.6073158979415894, |
| "reward_std": 0.431104838848114, |
| "rewards/reward_model/mean": 1.6073158979415894, |
| "rewards/reward_model/std": 0.7830482125282288, |
| "step": 127, |
| "step_time": 167.69548717467114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05712890625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.8486328125, |
| "completions/mean_terminated_length": 76.93112182617188, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 1.3203896265476942, |
| "epoch": 0.05271828665568369, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5664064288139343, |
| "kl": 0.21401835474534892, |
| "learning_rate": 1.5679012345679012e-06, |
| "loss": 0.0218, |
| "num_tokens": 58904437.0, |
| "reward": 1.5991525650024414, |
| "reward_std": 0.4234989285469055, |
| "rewards/reward_model/mean": 1.5991525650024414, |
| "rewards/reward_model/std": 0.8423987030982971, |
| "step": 128, |
| "step_time": 170.09795808279887 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0693359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.70556640625, |
| "completions/mean_terminated_length": 72.88404846191406, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.2859586202539504, |
| "epoch": 0.05313014827018122, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5938490033149719, |
| "kl": 0.23812477025785483, |
| "learning_rate": 1.5802469135802472e-06, |
| "loss": 0.0198, |
| "num_tokens": 59368826.0, |
| "reward": 1.4986733198165894, |
| "reward_std": 0.48006802797317505, |
| "rewards/reward_model/mean": 1.4986733198165894, |
| "rewards/reward_model/std": 0.9735957384109497, |
| "step": 129, |
| "step_time": 168.860689394176 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07666015625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.9814453125, |
| "completions/mean_terminated_length": 74.91168975830078, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.210738726425916, |
| "epoch": 0.05354200988467875, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5862839818000793, |
| "kl": 0.21009712052182294, |
| "learning_rate": 1.5925925925925927e-06, |
| "loss": 0.0074, |
| "num_tokens": 59805460.0, |
| "reward": 1.6455633640289307, |
| "reward_std": 0.4051080346107483, |
| "rewards/reward_model/mean": 1.6455633640289307, |
| "rewards/reward_model/std": 0.902286946773529, |
| "step": 130, |
| "step_time": 169.57672298140824 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0595703125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 75.908203125, |
| "completions/mean_terminated_length": 72.6085205078125, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2665375941433012, |
| "epoch": 0.05395387149917628, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5997110605239868, |
| "kl": 0.2307097713346593, |
| "learning_rate": 1.6049382716049383e-06, |
| "loss": 0.0136, |
| "num_tokens": 60283128.0, |
| "reward": 1.6560347080230713, |
| "reward_std": 0.3901210129261017, |
| "rewards/reward_model/mean": 1.6560347080230713, |
| "rewards/reward_model/std": 0.7428885698318481, |
| "step": 131, |
| "step_time": 170.24099622154608 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05615234375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 75.75537109375, |
| "completions/mean_terminated_length": 72.64718627929688, |
| "completions/min_length": 6.0, |
| "completions/min_terminated_length": 6.0, |
| "entropy": 1.280121157411486, |
| "epoch": 0.054365733113673806, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6451330780982971, |
| "kl": 0.24510033716796897, |
| "learning_rate": 1.617283950617284e-06, |
| "loss": 0.0219, |
| "num_tokens": 60704675.0, |
| "reward": 1.5663776397705078, |
| "reward_std": 0.4158337414264679, |
| "rewards/reward_model/mean": 1.5663776397705078, |
| "rewards/reward_model/std": 0.8126929402351379, |
| "step": 132, |
| "step_time": 168.72953157825395 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03466796875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 72.27392578125, |
| "completions/mean_terminated_length": 70.27263641357422, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.255036068148911, |
| "epoch": 0.054777594728171335, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5956186056137085, |
| "kl": 0.22500447847414762, |
| "learning_rate": 1.6296296296296295e-06, |
| "loss": 0.0051, |
| "num_tokens": 61135252.0, |
| "reward": 1.5705558061599731, |
| "reward_std": 0.4293729066848755, |
| "rewards/reward_model/mean": 1.5705558061599731, |
| "rewards/reward_model/std": 0.8589349985122681, |
| "step": 133, |
| "step_time": 169.56890073092654 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0849609375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.0068359375, |
| "completions/mean_terminated_length": 76.64353942871094, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.22501984029077, |
| "epoch": 0.05518945634266886, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6271244883537292, |
| "kl": 0.19278798706363887, |
| "learning_rate": 1.6419753086419753e-06, |
| "loss": 0.0213, |
| "num_tokens": 61582082.0, |
| "reward": 1.4659600257873535, |
| "reward_std": 0.4655250310897827, |
| "rewards/reward_model/mean": 1.4659600257873535, |
| "rewards/reward_model/std": 0.8552236557006836, |
| "step": 134, |
| "step_time": 169.07396916579455 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06396484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.24267578125, |
| "completions/mean_terminated_length": 73.77412414550781, |
| "completions/min_length": 6.0, |
| "completions/min_terminated_length": 6.0, |
| "entropy": 1.2544868639670312, |
| "epoch": 0.05560131795716639, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.55135577917099, |
| "kl": 0.21390190807869658, |
| "learning_rate": 1.654320987654321e-06, |
| "loss": 0.016, |
| "num_tokens": 62051859.0, |
| "reward": 1.7084484100341797, |
| "reward_std": 0.42418497800827026, |
| "rewards/reward_model/mean": 1.7084484100341797, |
| "rewards/reward_model/std": 0.8365797996520996, |
| "step": 135, |
| "step_time": 168.55205999454483 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06396484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.744140625, |
| "completions/mean_terminated_length": 73.24151611328125, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2688625399023294, |
| "epoch": 0.05601317957166392, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6378998160362244, |
| "kl": 0.23519006837159395, |
| "learning_rate": 1.6666666666666669e-06, |
| "loss": 0.0138, |
| "num_tokens": 62528583.0, |
| "reward": 1.6745352745056152, |
| "reward_std": 0.4303410053253174, |
| "rewards/reward_model/mean": 1.6745352745056152, |
| "rewards/reward_model/std": 0.8760103583335876, |
| "step": 136, |
| "step_time": 167.9896323014982 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0771484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.08544921875, |
| "completions/mean_terminated_length": 78.2470932006836, |
| "completions/min_length": 7.0, |
| "completions/min_terminated_length": 7.0, |
| "entropy": 1.298474219162017, |
| "epoch": 0.05642504118616145, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5870314240455627, |
| "kl": 0.19887266453588381, |
| "learning_rate": 1.6790123456790125e-06, |
| "loss": 0.0049, |
| "num_tokens": 63017686.0, |
| "reward": 1.5558466911315918, |
| "reward_std": 0.41865676641464233, |
| "rewards/reward_model/mean": 1.5558466911315918, |
| "rewards/reward_model/std": 0.8572432994842529, |
| "step": 137, |
| "step_time": 168.25818043760955 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.044921875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 74.47802734375, |
| "completions/mean_terminated_length": 71.96063232421875, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.2788015282712877, |
| "epoch": 0.05683690280065898, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0465470552444458, |
| "kl": 0.28733671543886885, |
| "learning_rate": 1.691358024691358e-06, |
| "loss": 0.0283, |
| "num_tokens": 63458441.0, |
| "reward": 1.8364841938018799, |
| "reward_std": 0.3914712369441986, |
| "rewards/reward_model/mean": 1.8364841938018799, |
| "rewards/reward_model/std": 0.8168711066246033, |
| "step": 138, |
| "step_time": 167.18661137623712 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.74072265625, |
| "completions/mean_terminated_length": 73.87821960449219, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2473736926913261, |
| "epoch": 0.05724876441515651, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5875552892684937, |
| "kl": 0.22631295106839389, |
| "learning_rate": 1.7037037037037036e-06, |
| "loss": 0.0339, |
| "num_tokens": 63941110.0, |
| "reward": 1.70628821849823, |
| "reward_std": 0.4453471899032593, |
| "rewards/reward_model/mean": 1.70628821849823, |
| "rewards/reward_model/std": 0.8872243762016296, |
| "step": 139, |
| "step_time": 169.1617161957547 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08740234375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.11279296875, |
| "completions/mean_terminated_length": 74.43070983886719, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2679406760726124, |
| "epoch": 0.057660626029654036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6072338819503784, |
| "kl": 0.2250406270613894, |
| "learning_rate": 1.7160493827160492e-06, |
| "loss": 0.0179, |
| "num_tokens": 64478525.0, |
| "reward": 1.731923222541809, |
| "reward_std": 0.3971579968929291, |
| "rewards/reward_model/mean": 1.731923222541809, |
| "rewards/reward_model/std": 0.8387468457221985, |
| "step": 140, |
| "step_time": 169.69765338627622 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06689453125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.95458984375, |
| "completions/mean_terminated_length": 74.3668212890625, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.3288813619874418, |
| "epoch": 0.058072487644151564, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9750301241874695, |
| "kl": 0.22281288140220568, |
| "learning_rate": 1.7283950617283952e-06, |
| "loss": 0.0086, |
| "num_tokens": 64943904.0, |
| "reward": 1.5512080192565918, |
| "reward_std": 0.4345375895500183, |
| "rewards/reward_model/mean": 1.5512080192565918, |
| "rewards/reward_model/std": 0.8531954288482666, |
| "step": 141, |
| "step_time": 169.35747446445748 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09033203125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.90966796875, |
| "completions/mean_terminated_length": 71.8362808227539, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2554950527846813, |
| "epoch": 0.05848434925864909, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5846207141876221, |
| "kl": 0.22971162866451778, |
| "learning_rate": 1.7407407407407408e-06, |
| "loss": 0.0123, |
| "num_tokens": 65435015.0, |
| "reward": 1.6992193460464478, |
| "reward_std": 0.38921838998794556, |
| "rewards/reward_model/mean": 1.6992193460464478, |
| "rewards/reward_model/std": 0.8802086710929871, |
| "step": 142, |
| "step_time": 169.55797945754603 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0859375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.7138671875, |
| "completions/mean_terminated_length": 74.08013153076172, |
| "completions/min_length": 12.0, |
| "completions/min_terminated_length": 12.0, |
| "entropy": 1.241849954240024, |
| "epoch": 0.05889621087314662, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5389864444732666, |
| "kl": 0.19780789304059, |
| "learning_rate": 1.7530864197530866e-06, |
| "loss": 0.0162, |
| "num_tokens": 65893181.0, |
| "reward": 1.657914638519287, |
| "reward_std": 0.43264538049697876, |
| "rewards/reward_model/mean": 1.657914638519287, |
| "rewards/reward_model/std": 0.9311876893043518, |
| "step": 143, |
| "step_time": 167.69665700104088 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.060546875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.701171875, |
| "completions/mean_terminated_length": 75.52391052246094, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.3291778746061027, |
| "epoch": 0.05930807248764415, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5641889572143555, |
| "kl": 0.23325793404364958, |
| "learning_rate": 1.7654320987654322e-06, |
| "loss": 0.0106, |
| "num_tokens": 66306777.0, |
| "reward": 1.6250760555267334, |
| "reward_std": 0.45928484201431274, |
| "rewards/reward_model/mean": 1.6250760555267334, |
| "rewards/reward_model/std": 0.8440907001495361, |
| "step": 144, |
| "step_time": 167.09111699229106 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.5625, |
| "completions/mean_terminated_length": 73.81069946289062, |
| "completions/min_length": 10.0, |
| "completions/min_terminated_length": 10.0, |
| "entropy": 1.2597985244356096, |
| "epoch": 0.05971993410214168, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6075507402420044, |
| "kl": 0.2127083102823235, |
| "learning_rate": 1.7777777777777777e-06, |
| "loss": -0.0003, |
| "num_tokens": 66685465.0, |
| "reward": 1.637428641319275, |
| "reward_std": 0.4271353781223297, |
| "rewards/reward_model/mean": 1.637428641319275, |
| "rewards/reward_model/std": 0.9868490099906921, |
| "step": 145, |
| "step_time": 168.623529009521 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08837890625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.47509765625, |
| "completions/mean_terminated_length": 75.8676986694336, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.2390871203970164, |
| "epoch": 0.06013179571663921, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.738397479057312, |
| "kl": 0.29930633143521845, |
| "learning_rate": 1.7901234567901233e-06, |
| "loss": 0.0243, |
| "num_tokens": 67148166.0, |
| "reward": 1.8085708618164062, |
| "reward_std": 0.4322406053543091, |
| "rewards/reward_model/mean": 1.8085708618164062, |
| "rewards/reward_model/std": 0.8529988527297974, |
| "step": 146, |
| "step_time": 168.20390673354268 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.037109375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.32373046875, |
| "completions/mean_terminated_length": 74.33214569091797, |
| "completions/min_length": 7.0, |
| "completions/min_terminated_length": 7.0, |
| "entropy": 1.2929235147312284, |
| "epoch": 0.06054365733113674, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6655780673027039, |
| "kl": 0.22947472316445783, |
| "learning_rate": 1.8024691358024693e-06, |
| "loss": 0.0168, |
| "num_tokens": 67600669.0, |
| "reward": 1.878852128982544, |
| "reward_std": 0.38836777210235596, |
| "rewards/reward_model/mean": 1.878852128982544, |
| "rewards/reward_model/std": 0.9328538179397583, |
| "step": 147, |
| "step_time": 169.04876817949116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07568359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.455078125, |
| "completions/mean_terminated_length": 73.3164291381836, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.300009768921882, |
| "epoch": 0.060955518945634266, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5745586156845093, |
| "kl": 0.23608133487869054, |
| "learning_rate": 1.814814814814815e-06, |
| "loss": 0.0142, |
| "num_tokens": 68108001.0, |
| "reward": 1.627516269683838, |
| "reward_std": 0.43401244282722473, |
| "rewards/reward_model/mean": 1.627516269683838, |
| "rewards/reward_model/std": 0.9561209082603455, |
| "step": 148, |
| "step_time": 167.96074909390882 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06884765625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.375, |
| "completions/mean_terminated_length": 73.63188171386719, |
| "completions/min_length": 8.0, |
| "completions/min_terminated_length": 8.0, |
| "entropy": 1.2789768348447978, |
| "epoch": 0.061367380560131794, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6077526807785034, |
| "kl": 0.2504324442707002, |
| "learning_rate": 1.8271604938271605e-06, |
| "loss": 0.0138, |
| "num_tokens": 68588705.0, |
| "reward": 1.7919840812683105, |
| "reward_std": 0.40561193227767944, |
| "rewards/reward_model/mean": 1.7919840812683105, |
| "rewards/reward_model/std": 0.807359516620636, |
| "step": 149, |
| "step_time": 167.94775600917637 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0693359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.6728515625, |
| "completions/mean_terminated_length": 77.14690399169922, |
| "completions/min_length": 8.0, |
| "completions/min_terminated_length": 8.0, |
| "entropy": 1.286247756332159, |
| "epoch": 0.06177924217462932, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5670334100723267, |
| "kl": 0.22240044514182955, |
| "learning_rate": 1.839506172839506e-06, |
| "loss": 0.0155, |
| "num_tokens": 69043619.0, |
| "reward": 1.5120161771774292, |
| "reward_std": 0.4330858290195465, |
| "rewards/reward_model/mean": 1.5120161771774292, |
| "rewards/reward_model/std": 0.8432108759880066, |
| "step": 150, |
| "step_time": 168.81714980350807 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.060546875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.8125, |
| "completions/mean_terminated_length": 76.70686340332031, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2792057455517352, |
| "epoch": 0.06219110378912685, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5719607472419739, |
| "kl": 0.25582361221313477, |
| "learning_rate": 1.8518518518518519e-06, |
| "loss": 0.0076, |
| "num_tokens": 69484995.0, |
| "reward": 1.7927849292755127, |
| "reward_std": 0.41597551107406616, |
| "rewards/reward_model/mean": 1.7927849292755127, |
| "rewards/reward_model/std": 0.9223642945289612, |
| "step": 151, |
| "step_time": 167.4020181344822 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0498046875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.515625, |
| "completions/mean_terminated_length": 73.81706237792969, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.227274117525667, |
| "epoch": 0.06260296540362438, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7039728164672852, |
| "kl": 0.26282261524465866, |
| "learning_rate": 1.8641975308641975e-06, |
| "loss": 0.0056, |
| "num_tokens": 69909667.0, |
| "reward": 1.8002865314483643, |
| "reward_std": 0.41272586584091187, |
| "rewards/reward_model/mean": 1.8002865314483643, |
| "rewards/reward_model/std": 0.8000503778457642, |
| "step": 152, |
| "step_time": 167.92274621222168 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06103515625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.96826171875, |
| "completions/mean_terminated_length": 76.8460693359375, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.273845980875194, |
| "epoch": 0.06301482701812192, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5925673246383667, |
| "kl": 0.22481732474989258, |
| "learning_rate": 1.8765432098765435e-06, |
| "loss": 0.0193, |
| "num_tokens": 70384674.0, |
| "reward": 1.8214460611343384, |
| "reward_std": 0.413318932056427, |
| "rewards/reward_model/mean": 1.8214460611343384, |
| "rewards/reward_model/std": 0.742709219455719, |
| "step": 153, |
| "step_time": 170.70164536684752 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04638671875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 75.90673828125, |
| "completions/mean_terminated_length": 73.37275695800781, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2647815430536866, |
| "epoch": 0.06342668863261944, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6434057354927063, |
| "kl": 0.30792183699668385, |
| "learning_rate": 1.888888888888889e-06, |
| "loss": 0.0344, |
| "num_tokens": 70848451.0, |
| "reward": 1.8151272535324097, |
| "reward_std": 0.40342289209365845, |
| "rewards/reward_model/mean": 1.8151272535324097, |
| "rewards/reward_model/std": 0.8678472638130188, |
| "step": 154, |
| "step_time": 168.5453836512752 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04736328125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.47216796875, |
| "completions/mean_terminated_length": 74.96002197265625, |
| "completions/min_length": 10.0, |
| "completions/min_terminated_length": 10.0, |
| "entropy": 1.269783977419138, |
| "epoch": 0.06383855024711697, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6314256191253662, |
| "kl": 0.2555277651408687, |
| "learning_rate": 1.9012345679012346e-06, |
| "loss": 0.0251, |
| "num_tokens": 71328554.0, |
| "reward": 1.7019236087799072, |
| "reward_std": 0.4237982928752899, |
| "rewards/reward_model/mean": 1.7019236087799072, |
| "rewards/reward_model/std": 0.913361132144928, |
| "step": 155, |
| "step_time": 169.69989513559267 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09521484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 87.9814453125, |
| "completions/mean_terminated_length": 83.77010345458984, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.262259678915143, |
| "epoch": 0.0642504118616145, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5448090434074402, |
| "kl": 0.22080097059370019, |
| "learning_rate": 1.9135802469135804e-06, |
| "loss": 0.0095, |
| "num_tokens": 71843364.0, |
| "reward": 1.6485573053359985, |
| "reward_std": 0.442721962928772, |
| "rewards/reward_model/mean": 1.6485573053359985, |
| "rewards/reward_model/std": 0.9059621691703796, |
| "step": 156, |
| "step_time": 168.91886990657076 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.076171875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 83.45654296875, |
| "completions/mean_terminated_length": 79.78382873535156, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.2155877850018442, |
| "epoch": 0.06466227347611203, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6289539933204651, |
| "kl": 0.2453711698180996, |
| "learning_rate": 1.925925925925926e-06, |
| "loss": 0.026, |
| "num_tokens": 72273771.0, |
| "reward": 1.784334659576416, |
| "reward_std": 0.4161246418952942, |
| "rewards/reward_model/mean": 1.784334659576416, |
| "rewards/reward_model/std": 0.757505476474762, |
| "step": 157, |
| "step_time": 167.44150482024997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08154296875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.76904296875, |
| "completions/mean_terminated_length": 77.66453552246094, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.235411421628669, |
| "epoch": 0.06507413509060955, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5421144366264343, |
| "kl": 0.22543006701744162, |
| "learning_rate": 1.9382716049382716e-06, |
| "loss": 0.0316, |
| "num_tokens": 72800146.0, |
| "reward": 1.7696501016616821, |
| "reward_std": 0.3846498727798462, |
| "rewards/reward_model/mean": 1.7696501016616821, |
| "rewards/reward_model/std": 0.8624175786972046, |
| "step": 158, |
| "step_time": 169.24183974647895 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08642578125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.5419921875, |
| "completions/mean_terminated_length": 76.05238342285156, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.185378910973668, |
| "epoch": 0.06548599670510709, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5627242922782898, |
| "kl": 0.23744830160285346, |
| "learning_rate": 1.9506172839506176e-06, |
| "loss": 0.0326, |
| "num_tokens": 73252072.0, |
| "reward": 1.8059046268463135, |
| "reward_std": 0.3923932611942291, |
| "rewards/reward_model/mean": 1.8059046268463135, |
| "rewards/reward_model/std": 0.8797404766082764, |
| "step": 159, |
| "step_time": 169.05729650333524 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.080078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.8115234375, |
| "completions/mean_terminated_length": 77.79087829589844, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 1.23535214853473, |
| "epoch": 0.06589785831960461, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5441705584526062, |
| "kl": 0.24602960981428623, |
| "learning_rate": 1.962962962962963e-06, |
| "loss": 0.0213, |
| "num_tokens": 73739494.0, |
| "reward": 1.7673776149749756, |
| "reward_std": 0.4512333571910858, |
| "rewards/reward_model/mean": 1.7673776149749756, |
| "rewards/reward_model/std": 0.9724159240722656, |
| "step": 160, |
| "step_time": 168.9636302953586 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06982421875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.20166015625, |
| "completions/mean_terminated_length": 74.46351623535156, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2300576204434037, |
| "epoch": 0.06630971993410215, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5943037271499634, |
| "kl": 0.24898978223791346, |
| "learning_rate": 1.9753086419753087e-06, |
| "loss": 0.0222, |
| "num_tokens": 74167715.0, |
| "reward": 1.926592469215393, |
| "reward_std": 0.41215306520462036, |
| "rewards/reward_model/mean": 1.926592469215393, |
| "rewards/reward_model/std": 1.001451015472412, |
| "step": 161, |
| "step_time": 167.6384666627273 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06884765625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 73.8603515625, |
| "completions/mean_terminated_length": 69.85736846923828, |
| "completions/min_length": 12.0, |
| "completions/min_terminated_length": 12.0, |
| "entropy": 1.198827504646033, |
| "epoch": 0.06672158154859967, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5532698035240173, |
| "kl": 0.27067929052282125, |
| "learning_rate": 1.9876543209876543e-06, |
| "loss": 0.0219, |
| "num_tokens": 74647397.0, |
| "reward": 1.9008184671401978, |
| "reward_std": 0.38047462701797485, |
| "rewards/reward_model/mean": 1.9008184671401978, |
| "rewards/reward_model/std": 0.9279819130897522, |
| "step": 162, |
| "step_time": 167.92858559498563 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0458984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 75.6513671875, |
| "completions/mean_terminated_length": 73.133056640625, |
| "completions/min_length": 9.0, |
| "completions/min_terminated_length": 9.0, |
| "entropy": 1.2209027321077883, |
| "epoch": 0.0671334431630972, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6697596907615662, |
| "kl": 0.27582967511261813, |
| "learning_rate": 2e-06, |
| "loss": 0.0013, |
| "num_tokens": 75100251.0, |
| "reward": 1.778894305229187, |
| "reward_std": 0.40786850452423096, |
| "rewards/reward_model/mean": 1.778894305229187, |
| "rewards/reward_model/std": 0.9986056089401245, |
| "step": 163, |
| "step_time": 168.67196059180424 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04736328125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 76.04296875, |
| "completions/mean_terminated_length": 73.45977020263672, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.1774590322747827, |
| "epoch": 0.06754530477759473, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5625696182250977, |
| "kl": 0.24044052005046979, |
| "learning_rate": 2.012345679012346e-06, |
| "loss": 0.0079, |
| "num_tokens": 75646451.0, |
| "reward": 1.668849229812622, |
| "reward_std": 0.40883883833885193, |
| "rewards/reward_model/mean": 1.668849229812622, |
| "rewards/reward_model/std": 0.9833309650421143, |
| "step": 164, |
| "step_time": 168.17798956111073 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0556640625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.30810546875, |
| "completions/mean_terminated_length": 75.3790054321289, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.2012779735960066, |
| "epoch": 0.06795716639209226, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.647919774055481, |
| "kl": 0.24133505855570547, |
| "learning_rate": 2.0246913580246915e-06, |
| "loss": -0.0057, |
| "num_tokens": 76075114.0, |
| "reward": 1.8032076358795166, |
| "reward_std": 0.4206048548221588, |
| "rewards/reward_model/mean": 1.8032076358795166, |
| "rewards/reward_model/std": 0.8603482842445374, |
| "step": 165, |
| "step_time": 167.6303828060627 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0712890625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.9677734375, |
| "completions/mean_terminated_length": 79.51103973388672, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2237120219506323, |
| "epoch": 0.06836902800658978, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5906450748443604, |
| "kl": 0.23315264994744211, |
| "learning_rate": 2.037037037037037e-06, |
| "loss": 0.0018, |
| "num_tokens": 76518888.0, |
| "reward": 1.6204566955566406, |
| "reward_std": 0.43912777304649353, |
| "rewards/reward_model/mean": 1.6204566955566406, |
| "rewards/reward_model/std": 0.8727616667747498, |
| "step": 166, |
| "step_time": 169.20850368216634 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04345703125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 78.6572265625, |
| "completions/mean_terminated_length": 76.41551971435547, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.1896002353169024, |
| "epoch": 0.06878088962108732, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6026791930198669, |
| "kl": 0.27406785456696525, |
| "learning_rate": 2.0493827160493827e-06, |
| "loss": 0.0206, |
| "num_tokens": 77015562.0, |
| "reward": 1.8125802278518677, |
| "reward_std": 0.3493039608001709, |
| "rewards/reward_model/mean": 1.8125802278518677, |
| "rewards/reward_model/std": 0.8256182074546814, |
| "step": 167, |
| "step_time": 167.97593408357352 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0576171875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.49560546875, |
| "completions/mean_terminated_length": 77.59119415283203, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 1.2243428956717253, |
| "epoch": 0.06919275123558484, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5614490509033203, |
| "kl": 0.27035874902503565, |
| "learning_rate": 2.0617283950617282e-06, |
| "loss": 0.0263, |
| "num_tokens": 77461185.0, |
| "reward": 1.7474470138549805, |
| "reward_std": 0.345928430557251, |
| "rewards/reward_model/mean": 1.7474470138549805, |
| "rewards/reward_model/std": 0.783444344997406, |
| "step": 168, |
| "step_time": 167.4764309921302 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1171875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 87.9755859375, |
| "completions/mean_terminated_length": 82.66261291503906, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2144196771550924, |
| "epoch": 0.06960461285008238, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7678112387657166, |
| "kl": 0.2838464966916945, |
| "learning_rate": 2.074074074074074e-06, |
| "loss": 0.0225, |
| "num_tokens": 77914863.0, |
| "reward": 1.836325764656067, |
| "reward_std": 0.39190569519996643, |
| "rewards/reward_model/mean": 1.836325764656067, |
| "rewards/reward_model/std": 0.8917631506919861, |
| "step": 169, |
| "step_time": 167.54137054365128 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1064453125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 83.44287109375, |
| "completions/mean_terminated_length": 78.13497924804688, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.215761701343581, |
| "epoch": 0.0700164744645799, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.551048219203949, |
| "kl": 0.2747056516527664, |
| "learning_rate": 2.08641975308642e-06, |
| "loss": 0.0378, |
| "num_tokens": 78387770.0, |
| "reward": 1.7706103324890137, |
| "reward_std": 0.3842095136642456, |
| "rewards/reward_model/mean": 1.7706103324890137, |
| "rewards/reward_model/std": 0.9329609274864197, |
| "step": 170, |
| "step_time": 168.65306692710146 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.11474609375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 86.2822265625, |
| "completions/mean_terminated_length": 80.87478637695312, |
| "completions/min_length": 8.0, |
| "completions/min_terminated_length": 8.0, |
| "entropy": 1.2296040374785662, |
| "epoch": 0.07042833607907743, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6304351091384888, |
| "kl": 0.2613049374194816, |
| "learning_rate": 2.0987654320987654e-06, |
| "loss": 0.0244, |
| "num_tokens": 78848220.0, |
| "reward": 1.8255634307861328, |
| "reward_std": 0.3948723077774048, |
| "rewards/reward_model/mean": 1.8255634307861328, |
| "rewards/reward_model/std": 0.8326601982116699, |
| "step": 171, |
| "step_time": 168.58243151567876 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1396484375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 89.66064453125, |
| "completions/mean_terminated_length": 83.43757629394531, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.1852441588416696, |
| "epoch": 0.07084019769357495, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5359068512916565, |
| "kl": 0.25176908579305746, |
| "learning_rate": 2.1111111111111114e-06, |
| "loss": 0.0259, |
| "num_tokens": 79300933.0, |
| "reward": 1.806993007659912, |
| "reward_std": 0.37374427914619446, |
| "rewards/reward_model/mean": 1.806993007659912, |
| "rewards/reward_model/std": 0.8301866054534912, |
| "step": 172, |
| "step_time": 168.35271108709276 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.13720703125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 85.8330078125, |
| "completions/mean_terminated_length": 79.12733459472656, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2571615048218518, |
| "epoch": 0.07125205930807249, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5572459101676941, |
| "kl": 0.2960540017229505, |
| "learning_rate": 2.123456790123457e-06, |
| "loss": 0.0209, |
| "num_tokens": 79797039.0, |
| "reward": 1.8621840476989746, |
| "reward_std": 0.40364766120910645, |
| "rewards/reward_model/mean": 1.8621840476989746, |
| "rewards/reward_model/std": 0.9432704448699951, |
| "step": 173, |
| "step_time": 167.40115015720949 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0693359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.65771484375, |
| "completions/mean_terminated_length": 76.05613708496094, |
| "completions/min_length": 15.0, |
| "completions/min_terminated_length": 15.0, |
| "entropy": 1.2446925514377654, |
| "epoch": 0.07166392092257001, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5800526142120361, |
| "kl": 0.2971960572176613, |
| "learning_rate": 2.1358024691358026e-06, |
| "loss": 0.0368, |
| "num_tokens": 80245682.0, |
| "reward": 1.9920142889022827, |
| "reward_std": 0.3743855655193329, |
| "rewards/reward_model/mean": 1.9920142889022827, |
| "rewards/reward_model/std": 0.9964386820793152, |
| "step": 174, |
| "step_time": 168.70972929289564 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05517578125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.859375, |
| "completions/mean_terminated_length": 78.10645294189453, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2962157218717039, |
| "epoch": 0.07207578253706755, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6903020143508911, |
| "kl": 0.2973472154699266, |
| "learning_rate": 2.148148148148148e-06, |
| "loss": 0.0265, |
| "num_tokens": 80705938.0, |
| "reward": 1.7820935249328613, |
| "reward_std": 0.37934696674346924, |
| "rewards/reward_model/mean": 1.7820935249328613, |
| "rewards/reward_model/std": 0.902191162109375, |
| "step": 175, |
| "step_time": 169.36274442402646 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.068359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.72021484375, |
| "completions/mean_terminated_length": 78.32441711425781, |
| "completions/min_length": 14.0, |
| "completions/min_terminated_length": 14.0, |
| "entropy": 1.27019348484464, |
| "epoch": 0.07248764415156507, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5628058910369873, |
| "kl": 0.2966971327550709, |
| "learning_rate": 2.160493827160494e-06, |
| "loss": 0.0268, |
| "num_tokens": 81138997.0, |
| "reward": 1.7612276077270508, |
| "reward_std": 0.41708946228027344, |
| "rewards/reward_model/mean": 1.7612276077270508, |
| "rewards/reward_model/std": 0.9293939471244812, |
| "step": 176, |
| "step_time": 169.23447634931654 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0537109375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.7333984375, |
| "completions/mean_terminated_length": 74.88029479980469, |
| "completions/min_length": 6.0, |
| "completions/min_terminated_length": 6.0, |
| "entropy": 1.2512029868084937, |
| "epoch": 0.0728995057660626, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5666512250900269, |
| "kl": 0.30354723357595503, |
| "learning_rate": 2.1728395061728397e-06, |
| "loss": 0.0181, |
| "num_tokens": 81605171.0, |
| "reward": 1.9841545820236206, |
| "reward_std": 0.3431292474269867, |
| "rewards/reward_model/mean": 1.9841545820236206, |
| "rewards/reward_model/std": 0.8817236423492432, |
| "step": 177, |
| "step_time": 168.3823292935267 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07666015625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 75.712890625, |
| "completions/mean_terminated_length": 71.37175750732422, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.263297796715051, |
| "epoch": 0.07331136738056013, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6307470798492432, |
| "kl": 0.3325087181874551, |
| "learning_rate": 2.1851851851851853e-06, |
| "loss": 0.0256, |
| "num_tokens": 82068551.0, |
| "reward": 1.8173794746398926, |
| "reward_std": 0.3982235789299011, |
| "rewards/reward_model/mean": 1.8173794746398926, |
| "rewards/reward_model/std": 1.106950283050537, |
| "step": 178, |
| "step_time": 167.39255077391863 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06103515625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.169921875, |
| "completions/mean_terminated_length": 75.99583435058594, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.271299744490534, |
| "epoch": 0.07372322899505766, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6578369736671448, |
| "kl": 0.33446174120763317, |
| "learning_rate": 2.197530864197531e-06, |
| "loss": 0.0284, |
| "num_tokens": 82517795.0, |
| "reward": 1.7375619411468506, |
| "reward_std": 0.3846127390861511, |
| "rewards/reward_model/mean": 1.7375619411468506, |
| "rewards/reward_model/std": 0.8862913250923157, |
| "step": 179, |
| "step_time": 168.1992179742083 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0419921875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.02197265625, |
| "completions/mean_terminated_length": 77.91896057128906, |
| "completions/min_length": 16.0, |
| "completions/min_terminated_length": 16.0, |
| "entropy": 1.2483481515664607, |
| "epoch": 0.07413509060955518, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5584097504615784, |
| "kl": 0.30480305349919945, |
| "learning_rate": 2.2098765432098765e-06, |
| "loss": 0.0213, |
| "num_tokens": 82938224.0, |
| "reward": 1.9653817415237427, |
| "reward_std": 0.37414827942848206, |
| "rewards/reward_model/mean": 1.9653817415237427, |
| "rewards/reward_model/std": 0.8811721205711365, |
| "step": 180, |
| "step_time": 167.49799311021343 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.02587890625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 74.77294921875, |
| "completions/mean_terminated_length": 73.35889434814453, |
| "completions/min_length": 6.0, |
| "completions/min_terminated_length": 6.0, |
| "entropy": 1.2098995437845588, |
| "epoch": 0.07454695222405272, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7208864688873291, |
| "kl": 0.3534813959267922, |
| "learning_rate": 2.222222222222222e-06, |
| "loss": 0.0354, |
| "num_tokens": 83381023.0, |
| "reward": 2.026184320449829, |
| "reward_std": 0.3664648234844208, |
| "rewards/reward_model/mean": 2.026184320449829, |
| "rewards/reward_model/std": 0.780437707901001, |
| "step": 181, |
| "step_time": 168.94889666279778 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.02685546875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 74.79931640625, |
| "completions/mean_terminated_length": 73.33116149902344, |
| "completions/min_length": 12.0, |
| "completions/min_terminated_length": 12.0, |
| "entropy": 1.2554190349765122, |
| "epoch": 0.07495881383855024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6133463382720947, |
| "kl": 0.3488931337487884, |
| "learning_rate": 2.234567901234568e-06, |
| "loss": -0.0, |
| "num_tokens": 83815236.0, |
| "reward": 2.015033006668091, |
| "reward_std": 0.35350939631462097, |
| "rewards/reward_model/mean": 2.015033006668091, |
| "rewards/reward_model/std": 1.0514765977859497, |
| "step": 182, |
| "step_time": 168.09779750416055 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0869140625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.35546875, |
| "completions/mean_terminated_length": 74.72513580322266, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.199432454770431, |
| "epoch": 0.07537067545304778, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6157463192939758, |
| "kl": 0.3277318494510837, |
| "learning_rate": 2.2469135802469137e-06, |
| "loss": 0.0357, |
| "num_tokens": 84297404.0, |
| "reward": 1.8543448448181152, |
| "reward_std": 0.391369104385376, |
| "rewards/reward_model/mean": 1.8543448448181152, |
| "rewards/reward_model/std": 0.9162046909332275, |
| "step": 183, |
| "step_time": 167.68394365813583 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0869140625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 84.412109375, |
| "completions/mean_terminated_length": 80.26309967041016, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2098595479037613, |
| "epoch": 0.0757825370675453, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5533583760261536, |
| "kl": 0.31157187622739, |
| "learning_rate": 2.2592592592592592e-06, |
| "loss": 0.0297, |
| "num_tokens": 84797320.0, |
| "reward": 1.829999327659607, |
| "reward_std": 0.39016926288604736, |
| "rewards/reward_model/mean": 1.829999327659607, |
| "rewards/reward_model/std": 0.919818103313446, |
| "step": 184, |
| "step_time": 169.95830805273727 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08642578125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.021484375, |
| "completions/mean_terminated_length": 72.1988296508789, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "entropy": 1.216996781527996, |
| "epoch": 0.07619439868204284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5928105115890503, |
| "kl": 0.3597461999161169, |
| "learning_rate": 2.271604938271605e-06, |
| "loss": 0.0241, |
| "num_tokens": 85345332.0, |
| "reward": 1.7781141996383667, |
| "reward_std": 0.40030625462532043, |
| "rewards/reward_model/mean": 1.7781143188476562, |
| "rewards/reward_model/std": 1.0846326351165771, |
| "step": 185, |
| "step_time": 168.48916833195835 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.12060546875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 86.26416015625, |
| "completions/mean_terminated_length": 80.54025268554688, |
| "completions/min_length": 17.0, |
| "completions/min_terminated_length": 17.0, |
| "entropy": 1.1919847468379885, |
| "epoch": 0.07660626029654036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5492737889289856, |
| "kl": 0.30091724882368, |
| "learning_rate": 2.2839506172839504e-06, |
| "loss": 0.0307, |
| "num_tokens": 85790705.0, |
| "reward": 1.9125442504882812, |
| "reward_std": 0.45980480313301086, |
| "rewards/reward_model/mean": 1.9125442504882812, |
| "rewards/reward_model/std": 0.884122371673584, |
| "step": 186, |
| "step_time": 168.23294077534229 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.083984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.7451171875, |
| "completions/mean_terminated_length": 75.3208999633789, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.1483043180778623, |
| "epoch": 0.07701812191103789, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5400401949882507, |
| "kl": 0.31244106526719406, |
| "learning_rate": 2.2962962962962964e-06, |
| "loss": 0.0363, |
| "num_tokens": 86241287.0, |
| "reward": 1.9626367092132568, |
| "reward_std": 0.37234199047088623, |
| "rewards/reward_model/mean": 1.9626367092132568, |
| "rewards/reward_model/std": 0.9630360007286072, |
| "step": 187, |
| "step_time": 168.14725243346766 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0634765625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.87939453125, |
| "completions/mean_terminated_length": 78.75338745117188, |
| "completions/min_length": 13.0, |
| "completions/min_terminated_length": 13.0, |
| "entropy": 1.2079863131511956, |
| "epoch": 0.07742998352553541, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.582582414150238, |
| "kl": 0.3190855036955327, |
| "learning_rate": 2.308641975308642e-06, |
| "loss": 0.0175, |
| "num_tokens": 86728272.0, |
| "reward": 1.9138296842575073, |
| "reward_std": 0.3768790364265442, |
| "rewards/reward_model/mean": 1.9138296842575073, |
| "rewards/reward_model/std": 1.0191558599472046, |
| "step": 188, |
| "step_time": 168.40571281081066 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.11376953125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 85.12548828125, |
| "completions/mean_terminated_length": 79.62149047851562, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.169044690206647, |
| "epoch": 0.07784184514003295, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5737549662590027, |
| "kl": 0.31175463960971683, |
| "learning_rate": 2.320987654320988e-06, |
| "loss": 0.0287, |
| "num_tokens": 87171473.0, |
| "reward": 1.9407716989517212, |
| "reward_std": 0.4355354905128479, |
| "rewards/reward_model/mean": 1.9407716989517212, |
| "rewards/reward_model/std": 1.0851298570632935, |
| "step": 189, |
| "step_time": 167.8595218854025 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08837890625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 83.955078125, |
| "completions/mean_terminated_length": 79.68505859375, |
| "completions/min_length": 11.0, |
| "completions/min_terminated_length": 11.0, |
| "entropy": 1.1897474566940218, |
| "epoch": 0.07825370675453047, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5536565184593201, |
| "kl": 0.32399573898874223, |
| "learning_rate": 2.3333333333333336e-06, |
| "loss": 0.0254, |
| "num_tokens": 87624149.0, |
| "reward": 1.953005313873291, |
| "reward_std": 0.39668840169906616, |
| "rewards/reward_model/mean": 1.953005313873291, |
| "rewards/reward_model/std": 0.9170160889625549, |
| "step": 190, |
| "step_time": 167.70779052050784 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07958984375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 87.31494140625, |
| "completions/mean_terminated_length": 83.79681396484375, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.1992020884063095, |
| "epoch": 0.07866556836902801, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.534305214881897, |
| "kl": 0.2823739905725233, |
| "learning_rate": 2.345679012345679e-06, |
| "loss": 0.0207, |
| "num_tokens": 88065530.0, |
| "reward": 1.8692455291748047, |
| "reward_std": 0.3706667721271515, |
| "rewards/reward_model/mean": 1.8692455291748047, |
| "rewards/reward_model/std": 0.7108120918273926, |
| "step": 191, |
| "step_time": 168.080767756328 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.06982421875, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.57177734375, |
| "completions/mean_terminated_length": 77.01155090332031, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2336446167901158, |
| "epoch": 0.07907742998352553, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7091127634048462, |
| "kl": 0.3430620247963816, |
| "learning_rate": 2.3580246913580247e-06, |
| "loss": 0.0272, |
| "num_tokens": 88569293.0, |
| "reward": 1.8590911626815796, |
| "reward_std": 0.3997899293899536, |
| "rewards/reward_model/mean": 1.8590911626815796, |
| "rewards/reward_model/std": 1.0301216840744019, |
| "step": 192, |
| "step_time": 168.8303936952725 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05224609375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.47802734375, |
| "completions/mean_terminated_length": 74.69293975830078, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2118491139262915, |
| "epoch": 0.07948929159802307, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6019130349159241, |
| "kl": 0.35962342529091984, |
| "learning_rate": 2.3703703703703703e-06, |
| "loss": 0.0434, |
| "num_tokens": 88987872.0, |
| "reward": 2.0519325733184814, |
| "reward_std": 0.383076012134552, |
| "rewards/reward_model/mean": 2.0519325733184814, |
| "rewards/reward_model/std": 0.8245954513549805, |
| "step": 193, |
| "step_time": 166.53060166956857 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03369140625, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 75.05859375, |
| "completions/mean_terminated_length": 73.21273040771484, |
| "completions/min_length": 6.0, |
| "completions/min_terminated_length": 6.0, |
| "entropy": 1.186548251658678, |
| "epoch": 0.07990115321252059, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7078003287315369, |
| "kl": 0.3844323490629904, |
| "learning_rate": 2.3827160493827163e-06, |
| "loss": 0.0161, |
| "num_tokens": 89383064.0, |
| "reward": 2.240997076034546, |
| "reward_std": 0.3178167939186096, |
| "rewards/reward_model/mean": 2.240997076034546, |
| "rewards/reward_model/std": 0.84669429063797, |
| "step": 194, |
| "step_time": 167.91490666009486 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0439453125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.45703125, |
| "completions/mean_terminated_length": 75.1338119506836, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.1919686342589557, |
| "epoch": 0.08031301482701812, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5621170401573181, |
| "kl": 0.3380963269737549, |
| "learning_rate": 2.395061728395062e-06, |
| "loss": 0.0135, |
| "num_tokens": 89828864.0, |
| "reward": 1.9205503463745117, |
| "reward_std": 0.37170130014419556, |
| "rewards/reward_model/mean": 1.9205503463745117, |
| "rewards/reward_model/std": 0.847759485244751, |
| "step": 195, |
| "step_time": 167.42564077628776 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05078125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 77.94970703125, |
| "completions/mean_terminated_length": 75.2721176147461, |
| "completions/min_length": 22.0, |
| "completions/min_terminated_length": 22.0, |
| "entropy": 1.1974371783435345, |
| "epoch": 0.08072487644151564, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6827316880226135, |
| "kl": 0.3753180764615536, |
| "learning_rate": 2.4074074074074075e-06, |
| "loss": 0.0271, |
| "num_tokens": 90260377.0, |
| "reward": 2.031513214111328, |
| "reward_std": 0.3799358606338501, |
| "rewards/reward_model/mean": 2.031513214111328, |
| "rewards/reward_model/std": 1.0462548732757568, |
| "step": 196, |
| "step_time": 166.99446870852262 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07568359375, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.01416015625, |
| "completions/mean_terminated_length": 76.08504486083984, |
| "completions/min_length": 5.0, |
| "completions/min_terminated_length": 5.0, |
| "entropy": 1.2283128183335066, |
| "epoch": 0.08113673805601318, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.570841372013092, |
| "kl": 0.3397949834470637, |
| "learning_rate": 2.419753086419753e-06, |
| "loss": 0.0326, |
| "num_tokens": 90748310.0, |
| "reward": 1.893890380859375, |
| "reward_std": 0.3679511547088623, |
| "rewards/reward_model/mean": 1.893890380859375, |
| "rewards/reward_model/std": 0.8536400198936462, |
| "step": 197, |
| "step_time": 167.43677308317274 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04736328125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 79.32080078125, |
| "completions/mean_terminated_length": 76.90056610107422, |
| "completions/min_length": 12.0, |
| "completions/min_terminated_length": 12.0, |
| "entropy": 1.2077249235007912, |
| "epoch": 0.0815485996705107, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5746736526489258, |
| "kl": 0.3641646616742946, |
| "learning_rate": 2.4320987654320987e-06, |
| "loss": 0.029, |
| "num_tokens": 91213543.0, |
| "reward": 2.129655361175537, |
| "reward_std": 0.3387081027030945, |
| "rewards/reward_model/mean": 2.129655361175537, |
| "rewards/reward_model/std": 0.9924630522727966, |
| "step": 198, |
| "step_time": 168.12879333738238 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08642578125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 80.84912109375, |
| "completions/mean_terminated_length": 76.38856506347656, |
| "completions/min_length": 17.0, |
| "completions/min_terminated_length": 17.0, |
| "entropy": 1.106790901394561, |
| "epoch": 0.08196046128500824, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6154218316078186, |
| "kl": 0.3942809420404956, |
| "learning_rate": 2.4444444444444442e-06, |
| "loss": 0.0462, |
| "num_tokens": 91686258.0, |
| "reward": 2.0349197387695312, |
| "reward_std": 0.38085871934890747, |
| "rewards/reward_model/mean": 2.0349197387695312, |
| "rewards/reward_model/std": 1.0781173706054688, |
| "step": 199, |
| "step_time": 166.76508231228217 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.064453125, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 81.89501953125, |
| "completions/mean_terminated_length": 78.71868896484375, |
| "completions/min_length": 15.0, |
| "completions/min_terminated_length": 15.0, |
| "entropy": 1.1731834115926176, |
| "epoch": 0.08237232289950576, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5466514825820923, |
| "kl": 0.34953623195178807, |
| "learning_rate": 2.4567901234567902e-06, |
| "loss": 0.0346, |
| "num_tokens": 92113083.0, |
| "reward": 2.178508758544922, |
| "reward_std": 0.3616424798965454, |
| "rewards/reward_model/mean": 2.178508758544922, |
| "rewards/reward_model/std": 0.908944845199585, |
| "step": 200, |
| "step_time": 168.23503723321483 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 2428, |
| "num_input_tokens_seen": 92113083, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|