diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17031 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8954893867924528, + "eval_steps": 500, + "global_step": 48600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00036851415094339625, + "grad_norm": 4.0625, + "learning_rate": 1.999999832962315e-05, + "loss": 4.6337, + "step": 20 + }, + { + "epoch": 0.0007370283018867925, + "grad_norm": 2.40625, + "learning_rate": 1.9999993308452055e-05, + "loss": 3.7762, + "step": 40 + }, + { + "epoch": 0.0011055424528301887, + "grad_norm": 3.15625, + "learning_rate": 1.9999984936484642e-05, + "loss": 3.3166, + "step": 60 + }, + { + "epoch": 0.001474056603773585, + "grad_norm": 2.171875, + "learning_rate": 1.999997321372371e-05, + "loss": 3.1428, + "step": 80 + }, + { + "epoch": 0.0018425707547169812, + "grad_norm": 2.75, + "learning_rate": 1.999995814017319e-05, + "loss": 3.0268, + "step": 100 + }, + { + "epoch": 0.0022110849056603773, + "grad_norm": 2.296875, + "learning_rate": 1.9999939715838126e-05, + "loss": 2.9612, + "step": 120 + }, + { + "epoch": 0.0025795990566037734, + "grad_norm": 5.28125, + "learning_rate": 1.99999179407247e-05, + "loss": 2.8761, + "step": 140 + }, + { + "epoch": 0.00294811320754717, + "grad_norm": 2.421875, + "learning_rate": 1.9999892814840206e-05, + "loss": 2.8318, + "step": 160 + }, + { + "epoch": 0.003316627358490566, + "grad_norm": 2.546875, + "learning_rate": 1.999986433819306e-05, + "loss": 2.802, + "step": 180 + }, + { + "epoch": 0.0036851415094339623, + "grad_norm": 4.53125, + "learning_rate": 1.9999832510792806e-05, + "loss": 2.7623, + "step": 200 + }, + { + "epoch": 0.0040536556603773585, + "grad_norm": 2.59375, + "learning_rate": 1.999979733265011e-05, + "loss": 2.7681, + "step": 220 + }, + { + "epoch": 0.004422169811320755, + "grad_norm": 2.5625, + "learning_rate": 1.9999758803776753e-05, + "loss": 2.7284, + "step": 240 + }, + { + "epoch": 0.004790683962264151, + "grad_norm": 2.65625, + "learning_rate": 1.9999716924185656e-05, + "loss": 2.7284, + "step": 260 + }, + { + "epoch": 0.005159198113207547, + "grad_norm": 2.625, + "learning_rate": 1.9999671693890846e-05, + "loss": 2.7128, + "step": 280 + }, + { + "epoch": 0.005527712264150943, + "grad_norm": 2.265625, + "learning_rate": 1.999962311290748e-05, + "loss": 2.7093, + "step": 300 + }, + { + "epoch": 0.00589622641509434, + "grad_norm": 2.609375, + "learning_rate": 1.9999571181251835e-05, + "loss": 2.7153, + "step": 320 + }, + { + "epoch": 0.006264740566037736, + "grad_norm": 2.1875, + "learning_rate": 1.999951589894131e-05, + "loss": 2.6294, + "step": 340 + }, + { + "epoch": 0.006633254716981132, + "grad_norm": 2.265625, + "learning_rate": 1.9999457265994434e-05, + "loss": 2.6397, + "step": 360 + }, + { + "epoch": 0.0070017688679245285, + "grad_norm": 2.359375, + "learning_rate": 1.9999395282430854e-05, + "loss": 2.6118, + "step": 380 + }, + { + "epoch": 0.007370283018867925, + "grad_norm": 2.3125, + "learning_rate": 1.9999329948271334e-05, + "loss": 2.6485, + "step": 400 + }, + { + "epoch": 0.007738797169811321, + "grad_norm": 3.1875, + "learning_rate": 1.999926126353777e-05, + "loss": 2.6522, + "step": 420 + }, + { + "epoch": 0.008107311320754717, + "grad_norm": 2.640625, + "learning_rate": 1.9999189228253177e-05, + "loss": 2.6205, + "step": 440 + }, + { + "epoch": 0.008475825471698114, + "grad_norm": 2.453125, + "learning_rate": 1.9999113842441692e-05, + "loss": 2.622, + "step": 460 + }, + { + "epoch": 0.00884433962264151, + "grad_norm": 2.5625, + "learning_rate": 1.9999035106128578e-05, + "loss": 2.5942, + "step": 480 + }, + { + "epoch": 0.009212853773584906, + "grad_norm": 2.640625, + "learning_rate": 1.999895301934021e-05, + "loss": 2.5851, + "step": 500 + }, + { + "epoch": 0.009581367924528301, + "grad_norm": 2.921875, + "learning_rate": 1.99988675821041e-05, + "loss": 2.5908, + "step": 520 + }, + { + "epoch": 0.009949882075471699, + "grad_norm": 2.4375, + "learning_rate": 1.9998778794448873e-05, + "loss": 2.5743, + "step": 540 + }, + { + "epoch": 0.010318396226415094, + "grad_norm": 3.03125, + "learning_rate": 1.9998686656404287e-05, + "loss": 2.5624, + "step": 560 + }, + { + "epoch": 0.01068691037735849, + "grad_norm": 2.484375, + "learning_rate": 1.9998591168001207e-05, + "loss": 2.5294, + "step": 580 + }, + { + "epoch": 0.011055424528301886, + "grad_norm": 2.90625, + "learning_rate": 1.9998492329271634e-05, + "loss": 2.5118, + "step": 600 + }, + { + "epoch": 0.011423938679245283, + "grad_norm": 2.75, + "learning_rate": 1.9998390140248684e-05, + "loss": 2.5358, + "step": 620 + }, + { + "epoch": 0.01179245283018868, + "grad_norm": 3.0625, + "learning_rate": 1.99982846009666e-05, + "loss": 2.5115, + "step": 640 + }, + { + "epoch": 0.012160966981132075, + "grad_norm": 3.15625, + "learning_rate": 1.999817571146075e-05, + "loss": 2.5454, + "step": 660 + }, + { + "epoch": 0.012529481132075472, + "grad_norm": 3.25, + "learning_rate": 1.9998063471767614e-05, + "loss": 2.4975, + "step": 680 + }, + { + "epoch": 0.012897995283018868, + "grad_norm": 2.484375, + "learning_rate": 1.9997947881924805e-05, + "loss": 2.5246, + "step": 700 + }, + { + "epoch": 0.013266509433962265, + "grad_norm": 2.875, + "learning_rate": 1.9997828941971053e-05, + "loss": 2.5284, + "step": 720 + }, + { + "epoch": 0.01363502358490566, + "grad_norm": 2.3125, + "learning_rate": 1.9997706651946213e-05, + "loss": 2.5309, + "step": 740 + }, + { + "epoch": 0.014003537735849057, + "grad_norm": 2.78125, + "learning_rate": 1.999758101189126e-05, + "loss": 2.4922, + "step": 760 + }, + { + "epoch": 0.014372051886792452, + "grad_norm": 2.609375, + "learning_rate": 1.9997452021848298e-05, + "loss": 2.4883, + "step": 780 + }, + { + "epoch": 0.01474056603773585, + "grad_norm": 2.5, + "learning_rate": 1.9997319681860548e-05, + "loss": 2.5011, + "step": 800 + }, + { + "epoch": 0.015109080188679245, + "grad_norm": 2.3125, + "learning_rate": 1.9997183991972352e-05, + "loss": 2.5154, + "step": 820 + }, + { + "epoch": 0.015477594339622642, + "grad_norm": 2.484375, + "learning_rate": 1.999704495222918e-05, + "loss": 2.5559, + "step": 840 + }, + { + "epoch": 0.015846108490566037, + "grad_norm": 3.484375, + "learning_rate": 1.9996902562677614e-05, + "loss": 2.5122, + "step": 860 + }, + { + "epoch": 0.016214622641509434, + "grad_norm": 3.296875, + "learning_rate": 1.9996756823365377e-05, + "loss": 2.4959, + "step": 880 + }, + { + "epoch": 0.01658313679245283, + "grad_norm": 3.015625, + "learning_rate": 1.99966077343413e-05, + "loss": 2.4906, + "step": 900 + }, + { + "epoch": 0.016951650943396228, + "grad_norm": 2.6875, + "learning_rate": 1.999645529565533e-05, + "loss": 2.4744, + "step": 920 + }, + { + "epoch": 0.01732016509433962, + "grad_norm": 3.1875, + "learning_rate": 1.9996299507358557e-05, + "loss": 2.4662, + "step": 940 + }, + { + "epoch": 0.01768867924528302, + "grad_norm": 2.40625, + "learning_rate": 1.9996140369503177e-05, + "loss": 2.4652, + "step": 960 + }, + { + "epoch": 0.018057193396226415, + "grad_norm": 2.390625, + "learning_rate": 1.9995977882142517e-05, + "loss": 2.5311, + "step": 980 + }, + { + "epoch": 0.018425707547169812, + "grad_norm": 2.390625, + "learning_rate": 1.9995812045331023e-05, + "loss": 2.4729, + "step": 1000 + }, + { + "epoch": 0.018794221698113206, + "grad_norm": 2.921875, + "learning_rate": 1.999564285912426e-05, + "loss": 2.4889, + "step": 1020 + }, + { + "epoch": 0.019162735849056603, + "grad_norm": 2.609375, + "learning_rate": 1.999547032357893e-05, + "loss": 2.4565, + "step": 1040 + }, + { + "epoch": 0.01953125, + "grad_norm": 2.71875, + "learning_rate": 1.999529443875283e-05, + "loss": 2.4716, + "step": 1060 + }, + { + "epoch": 0.019899764150943397, + "grad_norm": 2.484375, + "learning_rate": 1.9995115204704904e-05, + "loss": 2.494, + "step": 1080 + }, + { + "epoch": 0.020268278301886794, + "grad_norm": 2.921875, + "learning_rate": 1.999493262149521e-05, + "loss": 2.4769, + "step": 1100 + }, + { + "epoch": 0.020636792452830188, + "grad_norm": 2.609375, + "learning_rate": 1.9994746689184928e-05, + "loss": 2.464, + "step": 1120 + }, + { + "epoch": 0.021005306603773585, + "grad_norm": 2.359375, + "learning_rate": 1.9994557407836358e-05, + "loss": 2.4484, + "step": 1140 + }, + { + "epoch": 0.02137382075471698, + "grad_norm": 2.578125, + "learning_rate": 1.9994364777512926e-05, + "loss": 2.4654, + "step": 1160 + }, + { + "epoch": 0.02174233490566038, + "grad_norm": 2.59375, + "learning_rate": 1.999416879827918e-05, + "loss": 2.4778, + "step": 1180 + }, + { + "epoch": 0.022110849056603772, + "grad_norm": 2.828125, + "learning_rate": 1.9993969470200783e-05, + "loss": 2.4848, + "step": 1200 + }, + { + "epoch": 0.02247936320754717, + "grad_norm": 2.5625, + "learning_rate": 1.999376679334453e-05, + "loss": 2.4295, + "step": 1220 + }, + { + "epoch": 0.022847877358490566, + "grad_norm": 2.46875, + "learning_rate": 1.9993560767778336e-05, + "loss": 2.4425, + "step": 1240 + }, + { + "epoch": 0.023216391509433963, + "grad_norm": 2.671875, + "learning_rate": 1.9993351393571233e-05, + "loss": 2.4577, + "step": 1260 + }, + { + "epoch": 0.02358490566037736, + "grad_norm": 2.625, + "learning_rate": 1.9993138670793378e-05, + "loss": 2.4652, + "step": 1280 + }, + { + "epoch": 0.023953419811320754, + "grad_norm": 3.265625, + "learning_rate": 1.999292259951605e-05, + "loss": 2.4901, + "step": 1300 + }, + { + "epoch": 0.02432193396226415, + "grad_norm": 2.828125, + "learning_rate": 1.999270317981165e-05, + "loss": 2.4733, + "step": 1320 + }, + { + "epoch": 0.024690448113207548, + "grad_norm": 2.96875, + "learning_rate": 1.9992480411753704e-05, + "loss": 2.4059, + "step": 1340 + }, + { + "epoch": 0.025058962264150945, + "grad_norm": 2.46875, + "learning_rate": 1.9992254295416854e-05, + "loss": 2.431, + "step": 1360 + }, + { + "epoch": 0.02542747641509434, + "grad_norm": 2.671875, + "learning_rate": 1.999202483087687e-05, + "loss": 2.4219, + "step": 1380 + }, + { + "epoch": 0.025795990566037735, + "grad_norm": 3.03125, + "learning_rate": 1.9991792018210638e-05, + "loss": 2.4157, + "step": 1400 + }, + { + "epoch": 0.026164504716981132, + "grad_norm": 2.640625, + "learning_rate": 1.999155585749617e-05, + "loss": 2.43, + "step": 1420 + }, + { + "epoch": 0.02653301886792453, + "grad_norm": 2.953125, + "learning_rate": 1.9991316348812596e-05, + "loss": 2.4583, + "step": 1440 + }, + { + "epoch": 0.026901533018867923, + "grad_norm": 2.578125, + "learning_rate": 1.9991073492240175e-05, + "loss": 2.4235, + "step": 1460 + }, + { + "epoch": 0.02727004716981132, + "grad_norm": 3.65625, + "learning_rate": 1.999082728786028e-05, + "loss": 2.4199, + "step": 1480 + }, + { + "epoch": 0.027638561320754717, + "grad_norm": 2.671875, + "learning_rate": 1.9990577735755415e-05, + "loss": 2.4105, + "step": 1500 + }, + { + "epoch": 0.028007075471698114, + "grad_norm": 3.171875, + "learning_rate": 1.9990324836009193e-05, + "loss": 2.4124, + "step": 1520 + }, + { + "epoch": 0.02837558962264151, + "grad_norm": 3.359375, + "learning_rate": 1.9990068588706356e-05, + "loss": 2.415, + "step": 1540 + }, + { + "epoch": 0.028744103773584904, + "grad_norm": 2.6875, + "learning_rate": 1.998980899393277e-05, + "loss": 2.4471, + "step": 1560 + }, + { + "epoch": 0.0291126179245283, + "grad_norm": 3.1875, + "learning_rate": 1.9989546051775422e-05, + "loss": 2.4945, + "step": 1580 + }, + { + "epoch": 0.0294811320754717, + "grad_norm": 3.125, + "learning_rate": 1.9989279762322417e-05, + "loss": 2.4305, + "step": 1600 + }, + { + "epoch": 0.029849646226415096, + "grad_norm": 2.703125, + "learning_rate": 1.9989010125662974e-05, + "loss": 2.3965, + "step": 1620 + }, + { + "epoch": 0.03021816037735849, + "grad_norm": 2.265625, + "learning_rate": 1.9988737141887456e-05, + "loss": 2.4122, + "step": 1640 + }, + { + "epoch": 0.030586674528301886, + "grad_norm": 2.578125, + "learning_rate": 1.9988460811087333e-05, + "loss": 2.4092, + "step": 1660 + }, + { + "epoch": 0.030955188679245283, + "grad_norm": 4.03125, + "learning_rate": 1.998818113335519e-05, + "loss": 2.3695, + "step": 1680 + }, + { + "epoch": 0.03132370283018868, + "grad_norm": 2.640625, + "learning_rate": 1.9987898108784746e-05, + "loss": 2.4168, + "step": 1700 + }, + { + "epoch": 0.031692216981132074, + "grad_norm": 2.921875, + "learning_rate": 1.998761173747084e-05, + "loss": 2.4487, + "step": 1720 + }, + { + "epoch": 0.03206073113207547, + "grad_norm": 2.59375, + "learning_rate": 1.9987322019509423e-05, + "loss": 2.4003, + "step": 1740 + }, + { + "epoch": 0.03242924528301887, + "grad_norm": 2.796875, + "learning_rate": 1.9987028954997576e-05, + "loss": 2.414, + "step": 1760 + }, + { + "epoch": 0.032797759433962265, + "grad_norm": 2.796875, + "learning_rate": 1.9986732544033498e-05, + "loss": 2.4195, + "step": 1780 + }, + { + "epoch": 0.03316627358490566, + "grad_norm": 2.796875, + "learning_rate": 1.9986432786716517e-05, + "loss": 2.4008, + "step": 1800 + }, + { + "epoch": 0.03353478773584906, + "grad_norm": 2.9375, + "learning_rate": 1.9986129683147065e-05, + "loss": 2.3768, + "step": 1820 + }, + { + "epoch": 0.033903301886792456, + "grad_norm": 2.40625, + "learning_rate": 1.9985823233426712e-05, + "loss": 2.4047, + "step": 1840 + }, + { + "epoch": 0.034271816037735846, + "grad_norm": 2.828125, + "learning_rate": 1.9985513437658145e-05, + "loss": 2.4112, + "step": 1860 + }, + { + "epoch": 0.03464033018867924, + "grad_norm": 2.484375, + "learning_rate": 1.9985200295945166e-05, + "loss": 2.4062, + "step": 1880 + }, + { + "epoch": 0.03500884433962264, + "grad_norm": 3.28125, + "learning_rate": 1.9984883808392706e-05, + "loss": 2.3751, + "step": 1900 + }, + { + "epoch": 0.03537735849056604, + "grad_norm": 2.59375, + "learning_rate": 1.998456397510681e-05, + "loss": 2.4026, + "step": 1920 + }, + { + "epoch": 0.035745872641509434, + "grad_norm": 2.578125, + "learning_rate": 1.998424079619465e-05, + "loss": 2.4188, + "step": 1940 + }, + { + "epoch": 0.03611438679245283, + "grad_norm": 3.015625, + "learning_rate": 1.9983914271764516e-05, + "loss": 2.3998, + "step": 1960 + }, + { + "epoch": 0.03648290094339623, + "grad_norm": 3.828125, + "learning_rate": 1.9983584401925816e-05, + "loss": 2.3757, + "step": 1980 + }, + { + "epoch": 0.036851415094339625, + "grad_norm": 2.828125, + "learning_rate": 1.9983251186789093e-05, + "loss": 2.3714, + "step": 2000 + }, + { + "epoch": 0.03721992924528302, + "grad_norm": 3.625, + "learning_rate": 1.9982914626465994e-05, + "loss": 2.38, + "step": 2020 + }, + { + "epoch": 0.03758844339622641, + "grad_norm": 3.109375, + "learning_rate": 1.998257472106929e-05, + "loss": 2.4026, + "step": 2040 + }, + { + "epoch": 0.03795695754716981, + "grad_norm": 2.8125, + "learning_rate": 1.9982231470712887e-05, + "loss": 2.3723, + "step": 2060 + }, + { + "epoch": 0.038325471698113206, + "grad_norm": 2.90625, + "learning_rate": 1.9981884875511786e-05, + "loss": 2.3875, + "step": 2080 + }, + { + "epoch": 0.0386939858490566, + "grad_norm": 3.203125, + "learning_rate": 1.998153493558214e-05, + "loss": 2.3626, + "step": 2100 + }, + { + "epoch": 0.0390625, + "grad_norm": 2.546875, + "learning_rate": 1.9981181651041198e-05, + "loss": 2.3627, + "step": 2120 + }, + { + "epoch": 0.0394310141509434, + "grad_norm": 3.09375, + "learning_rate": 1.9980825022007338e-05, + "loss": 2.3458, + "step": 2140 + }, + { + "epoch": 0.039799528301886794, + "grad_norm": 2.640625, + "learning_rate": 1.998046504860006e-05, + "loss": 2.4021, + "step": 2160 + }, + { + "epoch": 0.04016804245283019, + "grad_norm": 2.765625, + "learning_rate": 1.9980101730939992e-05, + "loss": 2.3817, + "step": 2180 + }, + { + "epoch": 0.04053655660377359, + "grad_norm": 2.890625, + "learning_rate": 1.9979735069148863e-05, + "loss": 2.3742, + "step": 2200 + }, + { + "epoch": 0.04090507075471698, + "grad_norm": 3.53125, + "learning_rate": 1.9979365063349538e-05, + "loss": 2.3618, + "step": 2220 + }, + { + "epoch": 0.041273584905660375, + "grad_norm": 2.921875, + "learning_rate": 1.9978991713666002e-05, + "loss": 2.3453, + "step": 2240 + }, + { + "epoch": 0.04164209905660377, + "grad_norm": 2.5, + "learning_rate": 1.997861502022335e-05, + "loss": 2.3524, + "step": 2260 + }, + { + "epoch": 0.04201061320754717, + "grad_norm": 2.296875, + "learning_rate": 1.9978234983147812e-05, + "loss": 2.3399, + "step": 2280 + }, + { + "epoch": 0.042379127358490566, + "grad_norm": 2.9375, + "learning_rate": 1.9977851602566726e-05, + "loss": 2.3865, + "step": 2300 + }, + { + "epoch": 0.04274764150943396, + "grad_norm": 2.484375, + "learning_rate": 1.9977464878608555e-05, + "loss": 2.3481, + "step": 2320 + }, + { + "epoch": 0.04311615566037736, + "grad_norm": 2.9375, + "learning_rate": 1.9977074811402884e-05, + "loss": 2.383, + "step": 2340 + }, + { + "epoch": 0.04348466981132076, + "grad_norm": 2.828125, + "learning_rate": 1.9976681401080417e-05, + "loss": 2.3312, + "step": 2360 + }, + { + "epoch": 0.043853183962264154, + "grad_norm": 2.765625, + "learning_rate": 1.997628464777298e-05, + "loss": 2.3908, + "step": 2380 + }, + { + "epoch": 0.044221698113207544, + "grad_norm": 2.953125, + "learning_rate": 1.997588455161351e-05, + "loss": 2.3233, + "step": 2400 + }, + { + "epoch": 0.04459021226415094, + "grad_norm": 2.625, + "learning_rate": 1.997548111273608e-05, + "loss": 2.3725, + "step": 2420 + }, + { + "epoch": 0.04495872641509434, + "grad_norm": 4.3125, + "learning_rate": 1.9975074331275866e-05, + "loss": 2.352, + "step": 2440 + }, + { + "epoch": 0.045327240566037735, + "grad_norm": 2.84375, + "learning_rate": 1.9974664207369175e-05, + "loss": 2.3746, + "step": 2460 + }, + { + "epoch": 0.04569575471698113, + "grad_norm": 2.734375, + "learning_rate": 1.9974250741153435e-05, + "loss": 2.3701, + "step": 2480 + }, + { + "epoch": 0.04606426886792453, + "grad_norm": 2.484375, + "learning_rate": 1.9973833932767188e-05, + "loss": 2.324, + "step": 2500 + }, + { + "epoch": 0.046432783018867926, + "grad_norm": 2.90625, + "learning_rate": 1.9973413782350096e-05, + "loss": 2.3418, + "step": 2520 + }, + { + "epoch": 0.04680129716981132, + "grad_norm": 2.515625, + "learning_rate": 1.9972990290042946e-05, + "loss": 2.3926, + "step": 2540 + }, + { + "epoch": 0.04716981132075472, + "grad_norm": 3.078125, + "learning_rate": 1.997256345598764e-05, + "loss": 2.335, + "step": 2560 + }, + { + "epoch": 0.04753832547169811, + "grad_norm": 2.484375, + "learning_rate": 1.9972133280327202e-05, + "loss": 2.3253, + "step": 2580 + }, + { + "epoch": 0.04790683962264151, + "grad_norm": 2.65625, + "learning_rate": 1.997169976320577e-05, + "loss": 2.3334, + "step": 2600 + }, + { + "epoch": 0.048275353773584904, + "grad_norm": 2.640625, + "learning_rate": 1.997126290476862e-05, + "loss": 2.3359, + "step": 2620 + }, + { + "epoch": 0.0486438679245283, + "grad_norm": 2.53125, + "learning_rate": 1.997082270516212e-05, + "loss": 2.3433, + "step": 2640 + }, + { + "epoch": 0.0490123820754717, + "grad_norm": 2.828125, + "learning_rate": 1.997037916453378e-05, + "loss": 2.3231, + "step": 2660 + }, + { + "epoch": 0.049380896226415096, + "grad_norm": 2.609375, + "learning_rate": 1.996993228303222e-05, + "loss": 2.3979, + "step": 2680 + }, + { + "epoch": 0.04974941037735849, + "grad_norm": 2.46875, + "learning_rate": 1.996948206080718e-05, + "loss": 2.3811, + "step": 2700 + }, + { + "epoch": 0.05011792452830189, + "grad_norm": 3.25, + "learning_rate": 1.9969028498009523e-05, + "loss": 2.3442, + "step": 2720 + }, + { + "epoch": 0.05048643867924528, + "grad_norm": 2.65625, + "learning_rate": 1.9968571594791226e-05, + "loss": 2.3484, + "step": 2740 + }, + { + "epoch": 0.05085495283018868, + "grad_norm": 2.703125, + "learning_rate": 1.996811135130539e-05, + "loss": 2.3639, + "step": 2760 + }, + { + "epoch": 0.051223466981132074, + "grad_norm": 2.84375, + "learning_rate": 1.996764776770623e-05, + "loss": 2.3566, + "step": 2780 + }, + { + "epoch": 0.05159198113207547, + "grad_norm": 2.671875, + "learning_rate": 1.9967180844149084e-05, + "loss": 2.3054, + "step": 2800 + }, + { + "epoch": 0.05196049528301887, + "grad_norm": 2.625, + "learning_rate": 1.9966710580790418e-05, + "loss": 2.3744, + "step": 2820 + }, + { + "epoch": 0.052329009433962265, + "grad_norm": 2.65625, + "learning_rate": 1.9966236977787794e-05, + "loss": 2.3233, + "step": 2840 + }, + { + "epoch": 0.05269752358490566, + "grad_norm": 2.40625, + "learning_rate": 1.9965760035299917e-05, + "loss": 2.3336, + "step": 2860 + }, + { + "epoch": 0.05306603773584906, + "grad_norm": 3.171875, + "learning_rate": 1.9965279753486594e-05, + "loss": 2.3973, + "step": 2880 + }, + { + "epoch": 0.053434551886792456, + "grad_norm": 3.09375, + "learning_rate": 1.9964796132508765e-05, + "loss": 2.3725, + "step": 2900 + }, + { + "epoch": 0.053803066037735846, + "grad_norm": 2.46875, + "learning_rate": 1.996430917252847e-05, + "loss": 2.3438, + "step": 2920 + }, + { + "epoch": 0.05417158018867924, + "grad_norm": 2.578125, + "learning_rate": 1.9963818873708894e-05, + "loss": 2.3029, + "step": 2940 + }, + { + "epoch": 0.05454009433962264, + "grad_norm": 2.75, + "learning_rate": 1.9963325236214315e-05, + "loss": 2.3102, + "step": 2960 + }, + { + "epoch": 0.05490860849056604, + "grad_norm": 2.921875, + "learning_rate": 1.9962828260210146e-05, + "loss": 2.3644, + "step": 2980 + }, + { + "epoch": 0.055277122641509434, + "grad_norm": 2.96875, + "learning_rate": 1.9962327945862913e-05, + "loss": 2.3726, + "step": 3000 + }, + { + "epoch": 0.05564563679245283, + "grad_norm": 2.578125, + "learning_rate": 1.996182429334026e-05, + "loss": 2.3339, + "step": 3020 + }, + { + "epoch": 0.05601415094339623, + "grad_norm": 2.453125, + "learning_rate": 1.9961317302810953e-05, + "loss": 2.3412, + "step": 3040 + }, + { + "epoch": 0.056382665094339625, + "grad_norm": 2.734375, + "learning_rate": 1.9960806974444873e-05, + "loss": 2.338, + "step": 3060 + }, + { + "epoch": 0.05675117924528302, + "grad_norm": 2.421875, + "learning_rate": 1.996029330841302e-05, + "loss": 2.3501, + "step": 3080 + }, + { + "epoch": 0.05711969339622641, + "grad_norm": 4.28125, + "learning_rate": 1.9959776304887516e-05, + "loss": 2.3331, + "step": 3100 + }, + { + "epoch": 0.05748820754716981, + "grad_norm": 2.828125, + "learning_rate": 1.9959255964041595e-05, + "loss": 2.3302, + "step": 3120 + }, + { + "epoch": 0.057856721698113206, + "grad_norm": 2.65625, + "learning_rate": 1.9958732286049613e-05, + "loss": 2.2736, + "step": 3140 + }, + { + "epoch": 0.0582252358490566, + "grad_norm": 2.609375, + "learning_rate": 1.995820527108705e-05, + "loss": 2.3018, + "step": 3160 + }, + { + "epoch": 0.05859375, + "grad_norm": 2.796875, + "learning_rate": 1.9957674919330488e-05, + "loss": 2.2994, + "step": 3180 + }, + { + "epoch": 0.0589622641509434, + "grad_norm": 2.625, + "learning_rate": 1.9957141230957642e-05, + "loss": 2.3014, + "step": 3200 + }, + { + "epoch": 0.059330778301886794, + "grad_norm": 2.625, + "learning_rate": 1.9956604206147347e-05, + "loss": 2.3717, + "step": 3220 + }, + { + "epoch": 0.05969929245283019, + "grad_norm": 2.625, + "learning_rate": 1.995606384507954e-05, + "loss": 2.3192, + "step": 3240 + }, + { + "epoch": 0.06006780660377359, + "grad_norm": 2.90625, + "learning_rate": 1.9955520147935287e-05, + "loss": 2.294, + "step": 3260 + }, + { + "epoch": 0.06043632075471698, + "grad_norm": 2.765625, + "learning_rate": 1.995497311489677e-05, + "loss": 2.3148, + "step": 3280 + }, + { + "epoch": 0.060804834905660375, + "grad_norm": 2.5, + "learning_rate": 1.995442274614729e-05, + "loss": 2.3255, + "step": 3300 + }, + { + "epoch": 0.06117334905660377, + "grad_norm": 3.09375, + "learning_rate": 1.9953869041871265e-05, + "loss": 2.2866, + "step": 3320 + }, + { + "epoch": 0.06154186320754717, + "grad_norm": 3.09375, + "learning_rate": 1.995331200225423e-05, + "loss": 2.3077, + "step": 3340 + }, + { + "epoch": 0.061910377358490566, + "grad_norm": 3.3125, + "learning_rate": 1.9952751627482836e-05, + "loss": 2.3115, + "step": 3360 + }, + { + "epoch": 0.06227889150943396, + "grad_norm": 2.671875, + "learning_rate": 1.9952187917744853e-05, + "loss": 2.3319, + "step": 3380 + }, + { + "epoch": 0.06264740566037735, + "grad_norm": 3.3125, + "learning_rate": 1.995162087322917e-05, + "loss": 2.3563, + "step": 3400 + }, + { + "epoch": 0.06301591981132075, + "grad_norm": 2.515625, + "learning_rate": 1.9951050494125797e-05, + "loss": 2.3455, + "step": 3420 + }, + { + "epoch": 0.06338443396226415, + "grad_norm": 2.578125, + "learning_rate": 1.9950476780625848e-05, + "loss": 2.3373, + "step": 3440 + }, + { + "epoch": 0.06375294811320754, + "grad_norm": 3.296875, + "learning_rate": 1.994989973292157e-05, + "loss": 2.3371, + "step": 3460 + }, + { + "epoch": 0.06412146226415094, + "grad_norm": 3.484375, + "learning_rate": 1.9949319351206315e-05, + "loss": 2.3368, + "step": 3480 + }, + { + "epoch": 0.06448997641509434, + "grad_norm": 2.921875, + "learning_rate": 1.9948735635674557e-05, + "loss": 2.3236, + "step": 3500 + }, + { + "epoch": 0.06485849056603774, + "grad_norm": 3.1875, + "learning_rate": 1.994814858652189e-05, + "loss": 2.3423, + "step": 3520 + }, + { + "epoch": 0.06522700471698113, + "grad_norm": 3.171875, + "learning_rate": 1.9947558203945022e-05, + "loss": 2.367, + "step": 3540 + }, + { + "epoch": 0.06559551886792453, + "grad_norm": 2.578125, + "learning_rate": 1.9946964488141783e-05, + "loss": 2.3072, + "step": 3560 + }, + { + "epoch": 0.06596403301886793, + "grad_norm": 2.609375, + "learning_rate": 1.9946367439311106e-05, + "loss": 2.2992, + "step": 3580 + }, + { + "epoch": 0.06633254716981132, + "grad_norm": 2.71875, + "learning_rate": 1.9945767057653055e-05, + "loss": 2.2545, + "step": 3600 + }, + { + "epoch": 0.06670106132075472, + "grad_norm": 2.671875, + "learning_rate": 1.9945163343368802e-05, + "loss": 2.3545, + "step": 3620 + }, + { + "epoch": 0.06706957547169812, + "grad_norm": 2.546875, + "learning_rate": 1.9944556296660644e-05, + "loss": 2.3025, + "step": 3640 + }, + { + "epoch": 0.06743808962264151, + "grad_norm": 2.546875, + "learning_rate": 1.994394591773199e-05, + "loss": 2.3014, + "step": 3660 + }, + { + "epoch": 0.06780660377358491, + "grad_norm": 2.59375, + "learning_rate": 1.9943332206787363e-05, + "loss": 2.2447, + "step": 3680 + }, + { + "epoch": 0.06817511792452831, + "grad_norm": 2.625, + "learning_rate": 1.994271516403241e-05, + "loss": 2.3549, + "step": 3700 + }, + { + "epoch": 0.06854363207547169, + "grad_norm": 2.828125, + "learning_rate": 1.9942094789673882e-05, + "loss": 2.3593, + "step": 3720 + }, + { + "epoch": 0.06891214622641509, + "grad_norm": 2.71875, + "learning_rate": 1.994147108391966e-05, + "loss": 2.3225, + "step": 3740 + }, + { + "epoch": 0.06928066037735849, + "grad_norm": 2.953125, + "learning_rate": 1.9940844046978732e-05, + "loss": 2.2987, + "step": 3760 + }, + { + "epoch": 0.06964917452830188, + "grad_norm": 2.390625, + "learning_rate": 1.994021367906121e-05, + "loss": 2.2655, + "step": 3780 + }, + { + "epoch": 0.07001768867924528, + "grad_norm": 3.1875, + "learning_rate": 1.993957998037831e-05, + "loss": 2.3055, + "step": 3800 + }, + { + "epoch": 0.07038620283018868, + "grad_norm": 2.671875, + "learning_rate": 1.9938942951142378e-05, + "loss": 2.2973, + "step": 3820 + }, + { + "epoch": 0.07075471698113207, + "grad_norm": 2.4375, + "learning_rate": 1.9938302591566866e-05, + "loss": 2.2998, + "step": 3840 + }, + { + "epoch": 0.07112323113207547, + "grad_norm": 3.1875, + "learning_rate": 1.993765890186635e-05, + "loss": 2.3037, + "step": 3860 + }, + { + "epoch": 0.07149174528301887, + "grad_norm": 3.046875, + "learning_rate": 1.9937011882256513e-05, + "loss": 2.3077, + "step": 3880 + }, + { + "epoch": 0.07186025943396226, + "grad_norm": 2.953125, + "learning_rate": 1.993636153295416e-05, + "loss": 2.3247, + "step": 3900 + }, + { + "epoch": 0.07222877358490566, + "grad_norm": 3.34375, + "learning_rate": 1.993570785417721e-05, + "loss": 2.3064, + "step": 3920 + }, + { + "epoch": 0.07259728773584906, + "grad_norm": 2.59375, + "learning_rate": 1.99350508461447e-05, + "loss": 2.2559, + "step": 3940 + }, + { + "epoch": 0.07296580188679246, + "grad_norm": 3.0625, + "learning_rate": 1.9934390509076772e-05, + "loss": 2.321, + "step": 3960 + }, + { + "epoch": 0.07333431603773585, + "grad_norm": 3.09375, + "learning_rate": 1.9933726843194705e-05, + "loss": 2.2866, + "step": 3980 + }, + { + "epoch": 0.07370283018867925, + "grad_norm": 2.546875, + "learning_rate": 1.9933059848720866e-05, + "loss": 2.2997, + "step": 4000 + }, + { + "epoch": 0.07407134433962265, + "grad_norm": 3.140625, + "learning_rate": 1.993238952587876e-05, + "loss": 2.2821, + "step": 4020 + }, + { + "epoch": 0.07443985849056604, + "grad_norm": 2.484375, + "learning_rate": 1.9931715874892998e-05, + "loss": 2.2995, + "step": 4040 + }, + { + "epoch": 0.07480837264150944, + "grad_norm": 3.015625, + "learning_rate": 1.9931038895989304e-05, + "loss": 2.3547, + "step": 4060 + }, + { + "epoch": 0.07517688679245282, + "grad_norm": 2.546875, + "learning_rate": 1.993035858939452e-05, + "loss": 2.2587, + "step": 4080 + }, + { + "epoch": 0.07554540094339622, + "grad_norm": 3.0, + "learning_rate": 1.9929674955336605e-05, + "loss": 2.3447, + "step": 4100 + }, + { + "epoch": 0.07591391509433962, + "grad_norm": 2.6875, + "learning_rate": 1.9928987994044632e-05, + "loss": 2.3017, + "step": 4120 + }, + { + "epoch": 0.07628242924528301, + "grad_norm": 2.453125, + "learning_rate": 1.9928297705748785e-05, + "loss": 2.2733, + "step": 4140 + }, + { + "epoch": 0.07665094339622641, + "grad_norm": 2.703125, + "learning_rate": 1.992760409068037e-05, + "loss": 2.3115, + "step": 4160 + }, + { + "epoch": 0.07701945754716981, + "grad_norm": 2.65625, + "learning_rate": 1.9926907149071795e-05, + "loss": 2.295, + "step": 4180 + }, + { + "epoch": 0.0773879716981132, + "grad_norm": 2.875, + "learning_rate": 1.99262068811566e-05, + "loss": 2.3058, + "step": 4200 + }, + { + "epoch": 0.0777564858490566, + "grad_norm": 2.65625, + "learning_rate": 1.9925503287169427e-05, + "loss": 2.3227, + "step": 4220 + }, + { + "epoch": 0.078125, + "grad_norm": 2.671875, + "learning_rate": 1.9924796367346034e-05, + "loss": 2.2713, + "step": 4240 + }, + { + "epoch": 0.0784935141509434, + "grad_norm": 2.671875, + "learning_rate": 1.99240861219233e-05, + "loss": 2.2706, + "step": 4260 + }, + { + "epoch": 0.0788620283018868, + "grad_norm": 3.625, + "learning_rate": 1.9923372551139212e-05, + "loss": 2.2696, + "step": 4280 + }, + { + "epoch": 0.07923054245283019, + "grad_norm": 2.828125, + "learning_rate": 1.9922655655232872e-05, + "loss": 2.2872, + "step": 4300 + }, + { + "epoch": 0.07959905660377359, + "grad_norm": 2.34375, + "learning_rate": 1.99219354344445e-05, + "loss": 2.3168, + "step": 4320 + }, + { + "epoch": 0.07996757075471699, + "grad_norm": 2.8125, + "learning_rate": 1.9921211889015426e-05, + "loss": 2.3078, + "step": 4340 + }, + { + "epoch": 0.08033608490566038, + "grad_norm": 2.625, + "learning_rate": 1.9920485019188094e-05, + "loss": 2.3168, + "step": 4360 + }, + { + "epoch": 0.08070459905660378, + "grad_norm": 2.546875, + "learning_rate": 1.9919754825206063e-05, + "loss": 2.2452, + "step": 4380 + }, + { + "epoch": 0.08107311320754718, + "grad_norm": 3.15625, + "learning_rate": 1.991902130731401e-05, + "loss": 2.2668, + "step": 4400 + }, + { + "epoch": 0.08144162735849056, + "grad_norm": 2.65625, + "learning_rate": 1.9918284465757724e-05, + "loss": 2.2829, + "step": 4420 + }, + { + "epoch": 0.08181014150943396, + "grad_norm": 2.484375, + "learning_rate": 1.9917544300784097e-05, + "loss": 2.3184, + "step": 4440 + }, + { + "epoch": 0.08217865566037735, + "grad_norm": 2.8125, + "learning_rate": 1.9916800812641152e-05, + "loss": 2.3033, + "step": 4460 + }, + { + "epoch": 0.08254716981132075, + "grad_norm": 2.59375, + "learning_rate": 1.991605400157801e-05, + "loss": 2.2779, + "step": 4480 + }, + { + "epoch": 0.08291568396226415, + "grad_norm": 2.96875, + "learning_rate": 1.9915303867844917e-05, + "loss": 2.2609, + "step": 4500 + }, + { + "epoch": 0.08328419811320754, + "grad_norm": 2.609375, + "learning_rate": 1.9914550411693226e-05, + "loss": 2.2794, + "step": 4520 + }, + { + "epoch": 0.08365271226415094, + "grad_norm": 2.625, + "learning_rate": 1.9913793633375405e-05, + "loss": 2.3026, + "step": 4540 + }, + { + "epoch": 0.08402122641509434, + "grad_norm": 2.546875, + "learning_rate": 1.9913033533145035e-05, + "loss": 2.2928, + "step": 4560 + }, + { + "epoch": 0.08438974056603774, + "grad_norm": 3.0625, + "learning_rate": 1.9912270111256812e-05, + "loss": 2.3455, + "step": 4580 + }, + { + "epoch": 0.08475825471698113, + "grad_norm": 3.984375, + "learning_rate": 1.9911503367966544e-05, + "loss": 2.2684, + "step": 4600 + }, + { + "epoch": 0.08512676886792453, + "grad_norm": 2.484375, + "learning_rate": 1.9910733303531146e-05, + "loss": 2.2849, + "step": 4620 + }, + { + "epoch": 0.08549528301886793, + "grad_norm": 2.484375, + "learning_rate": 1.9909959918208653e-05, + "loss": 2.2638, + "step": 4640 + }, + { + "epoch": 0.08586379716981132, + "grad_norm": 2.59375, + "learning_rate": 1.9909183212258216e-05, + "loss": 2.2667, + "step": 4660 + }, + { + "epoch": 0.08623231132075472, + "grad_norm": 3.03125, + "learning_rate": 1.9908403185940092e-05, + "loss": 2.2898, + "step": 4680 + }, + { + "epoch": 0.08660082547169812, + "grad_norm": 3.140625, + "learning_rate": 1.9907619839515643e-05, + "loss": 2.2718, + "step": 4700 + }, + { + "epoch": 0.08696933962264151, + "grad_norm": 2.28125, + "learning_rate": 1.9906833173247363e-05, + "loss": 2.2487, + "step": 4720 + }, + { + "epoch": 0.08733785377358491, + "grad_norm": 2.59375, + "learning_rate": 1.9906043187398843e-05, + "loss": 2.2671, + "step": 4740 + }, + { + "epoch": 0.08770636792452831, + "grad_norm": 3.1875, + "learning_rate": 1.9905249882234794e-05, + "loss": 2.2698, + "step": 4760 + }, + { + "epoch": 0.08807488207547169, + "grad_norm": 2.8125, + "learning_rate": 1.9904453258021035e-05, + "loss": 2.27, + "step": 4780 + }, + { + "epoch": 0.08844339622641509, + "grad_norm": 2.5, + "learning_rate": 1.9903653315024498e-05, + "loss": 2.2832, + "step": 4800 + }, + { + "epoch": 0.08881191037735849, + "grad_norm": 2.71875, + "learning_rate": 1.9902850053513233e-05, + "loss": 2.3455, + "step": 4820 + }, + { + "epoch": 0.08918042452830188, + "grad_norm": 2.703125, + "learning_rate": 1.990204347375639e-05, + "loss": 2.2916, + "step": 4840 + }, + { + "epoch": 0.08954893867924528, + "grad_norm": 2.90625, + "learning_rate": 1.990123357602424e-05, + "loss": 2.2803, + "step": 4860 + }, + { + "epoch": 0.08991745283018868, + "grad_norm": 2.546875, + "learning_rate": 1.990042036058816e-05, + "loss": 2.3099, + "step": 4880 + }, + { + "epoch": 0.09028596698113207, + "grad_norm": 2.859375, + "learning_rate": 1.9899603827720654e-05, + "loss": 2.2785, + "step": 4900 + }, + { + "epoch": 0.09065448113207547, + "grad_norm": 2.8125, + "learning_rate": 1.989878397769531e-05, + "loss": 2.2702, + "step": 4920 + }, + { + "epoch": 0.09102299528301887, + "grad_norm": 3.453125, + "learning_rate": 1.9897960810786854e-05, + "loss": 2.2704, + "step": 4940 + }, + { + "epoch": 0.09139150943396226, + "grad_norm": 2.765625, + "learning_rate": 1.9897134327271107e-05, + "loss": 2.2672, + "step": 4960 + }, + { + "epoch": 0.09176002358490566, + "grad_norm": 2.90625, + "learning_rate": 1.9896304527425013e-05, + "loss": 2.2522, + "step": 4980 + }, + { + "epoch": 0.09212853773584906, + "grad_norm": 2.8125, + "learning_rate": 1.9895471411526617e-05, + "loss": 2.2769, + "step": 5000 + }, + { + "epoch": 0.09249705188679246, + "grad_norm": 2.625, + "learning_rate": 1.989463497985508e-05, + "loss": 2.2928, + "step": 5020 + }, + { + "epoch": 0.09286556603773585, + "grad_norm": 2.8125, + "learning_rate": 1.9893795232690673e-05, + "loss": 2.2772, + "step": 5040 + }, + { + "epoch": 0.09323408018867925, + "grad_norm": 3.046875, + "learning_rate": 1.9892952170314778e-05, + "loss": 2.2655, + "step": 5060 + }, + { + "epoch": 0.09360259433962265, + "grad_norm": 2.78125, + "learning_rate": 1.9892105793009886e-05, + "loss": 2.2947, + "step": 5080 + }, + { + "epoch": 0.09397110849056604, + "grad_norm": 2.71875, + "learning_rate": 1.989125610105961e-05, + "loss": 2.2651, + "step": 5100 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 2.6875, + "learning_rate": 1.9890403094748655e-05, + "loss": 2.2816, + "step": 5120 + }, + { + "epoch": 0.09470813679245282, + "grad_norm": 3.109375, + "learning_rate": 1.988954677436285e-05, + "loss": 2.2814, + "step": 5140 + }, + { + "epoch": 0.09507665094339622, + "grad_norm": 3.234375, + "learning_rate": 1.9888687140189132e-05, + "loss": 2.2207, + "step": 5160 + }, + { + "epoch": 0.09544516509433962, + "grad_norm": 2.53125, + "learning_rate": 1.988782419251554e-05, + "loss": 2.2843, + "step": 5180 + }, + { + "epoch": 0.09581367924528301, + "grad_norm": 2.6875, + "learning_rate": 1.988695793163124e-05, + "loss": 2.254, + "step": 5200 + }, + { + "epoch": 0.09618219339622641, + "grad_norm": 2.703125, + "learning_rate": 1.9886088357826497e-05, + "loss": 2.2509, + "step": 5220 + }, + { + "epoch": 0.09655070754716981, + "grad_norm": 2.796875, + "learning_rate": 1.988521547139268e-05, + "loss": 2.2411, + "step": 5240 + }, + { + "epoch": 0.0969192216981132, + "grad_norm": 2.421875, + "learning_rate": 1.9884339272622286e-05, + "loss": 2.2333, + "step": 5260 + }, + { + "epoch": 0.0972877358490566, + "grad_norm": 3.265625, + "learning_rate": 1.9883459761808905e-05, + "loss": 2.2519, + "step": 5280 + }, + { + "epoch": 0.09765625, + "grad_norm": 2.671875, + "learning_rate": 1.9882576939247245e-05, + "loss": 2.2501, + "step": 5300 + }, + { + "epoch": 0.0980247641509434, + "grad_norm": 2.546875, + "learning_rate": 1.988169080523312e-05, + "loss": 2.2727, + "step": 5320 + }, + { + "epoch": 0.0983932783018868, + "grad_norm": 3.09375, + "learning_rate": 1.988080136006346e-05, + "loss": 2.2653, + "step": 5340 + }, + { + "epoch": 0.09876179245283019, + "grad_norm": 2.96875, + "learning_rate": 1.9879908604036296e-05, + "loss": 2.3189, + "step": 5360 + }, + { + "epoch": 0.09913030660377359, + "grad_norm": 3.03125, + "learning_rate": 1.9879012537450774e-05, + "loss": 2.2751, + "step": 5380 + }, + { + "epoch": 0.09949882075471699, + "grad_norm": 3.40625, + "learning_rate": 1.9878113160607148e-05, + "loss": 2.2711, + "step": 5400 + }, + { + "epoch": 0.09986733490566038, + "grad_norm": 2.75, + "learning_rate": 1.9877210473806782e-05, + "loss": 2.256, + "step": 5420 + }, + { + "epoch": 0.10023584905660378, + "grad_norm": 2.859375, + "learning_rate": 1.9876304477352145e-05, + "loss": 2.2709, + "step": 5440 + }, + { + "epoch": 0.10060436320754718, + "grad_norm": 2.71875, + "learning_rate": 1.9875395171546824e-05, + "loss": 2.2381, + "step": 5460 + }, + { + "epoch": 0.10097287735849056, + "grad_norm": 2.515625, + "learning_rate": 1.9874482556695506e-05, + "loss": 2.2772, + "step": 5480 + }, + { + "epoch": 0.10134139150943396, + "grad_norm": 2.40625, + "learning_rate": 1.9873566633103988e-05, + "loss": 2.2711, + "step": 5500 + }, + { + "epoch": 0.10170990566037735, + "grad_norm": 3.375, + "learning_rate": 1.9872647401079176e-05, + "loss": 2.2634, + "step": 5520 + }, + { + "epoch": 0.10207841981132075, + "grad_norm": 2.84375, + "learning_rate": 1.9871724860929092e-05, + "loss": 2.2548, + "step": 5540 + }, + { + "epoch": 0.10244693396226415, + "grad_norm": 2.5, + "learning_rate": 1.9870799012962857e-05, + "loss": 2.2644, + "step": 5560 + }, + { + "epoch": 0.10281544811320754, + "grad_norm": 2.734375, + "learning_rate": 1.9869869857490703e-05, + "loss": 2.2462, + "step": 5580 + }, + { + "epoch": 0.10318396226415094, + "grad_norm": 2.9375, + "learning_rate": 1.9868937394823978e-05, + "loss": 2.2915, + "step": 5600 + }, + { + "epoch": 0.10355247641509434, + "grad_norm": 2.40625, + "learning_rate": 1.986800162527512e-05, + "loss": 2.2914, + "step": 5620 + }, + { + "epoch": 0.10392099056603774, + "grad_norm": 2.5625, + "learning_rate": 1.98670625491577e-05, + "loss": 2.2561, + "step": 5640 + }, + { + "epoch": 0.10428950471698113, + "grad_norm": 3.03125, + "learning_rate": 1.9866120166786372e-05, + "loss": 2.3031, + "step": 5660 + }, + { + "epoch": 0.10465801886792453, + "grad_norm": 2.515625, + "learning_rate": 1.9865174478476914e-05, + "loss": 2.2546, + "step": 5680 + }, + { + "epoch": 0.10502653301886793, + "grad_norm": 3.921875, + "learning_rate": 1.9864225484546208e-05, + "loss": 2.2614, + "step": 5700 + }, + { + "epoch": 0.10539504716981132, + "grad_norm": 2.609375, + "learning_rate": 1.9863273185312238e-05, + "loss": 2.2843, + "step": 5720 + }, + { + "epoch": 0.10576356132075472, + "grad_norm": 2.84375, + "learning_rate": 1.986231758109411e-05, + "loss": 2.256, + "step": 5740 + }, + { + "epoch": 0.10613207547169812, + "grad_norm": 3.234375, + "learning_rate": 1.986135867221202e-05, + "loss": 2.2477, + "step": 5760 + }, + { + "epoch": 0.10650058962264151, + "grad_norm": 2.578125, + "learning_rate": 1.986039645898728e-05, + "loss": 2.2996, + "step": 5780 + }, + { + "epoch": 0.10686910377358491, + "grad_norm": 2.65625, + "learning_rate": 1.9859430941742307e-05, + "loss": 2.2632, + "step": 5800 + }, + { + "epoch": 0.10723761792452831, + "grad_norm": 2.765625, + "learning_rate": 1.985846212080063e-05, + "loss": 2.223, + "step": 5820 + }, + { + "epoch": 0.10760613207547169, + "grad_norm": 2.59375, + "learning_rate": 1.985748999648688e-05, + "loss": 2.2402, + "step": 5840 + }, + { + "epoch": 0.10797464622641509, + "grad_norm": 2.5, + "learning_rate": 1.9856514569126794e-05, + "loss": 2.2325, + "step": 5860 + }, + { + "epoch": 0.10834316037735849, + "grad_norm": 2.71875, + "learning_rate": 1.9855535839047225e-05, + "loss": 2.2522, + "step": 5880 + }, + { + "epoch": 0.10871167452830188, + "grad_norm": 2.734375, + "learning_rate": 1.9854553806576115e-05, + "loss": 2.2272, + "step": 5900 + }, + { + "epoch": 0.10908018867924528, + "grad_norm": 2.796875, + "learning_rate": 1.9853568472042532e-05, + "loss": 2.2565, + "step": 5920 + }, + { + "epoch": 0.10944870283018868, + "grad_norm": 2.71875, + "learning_rate": 1.9852579835776638e-05, + "loss": 2.2662, + "step": 5940 + }, + { + "epoch": 0.10981721698113207, + "grad_norm": 2.34375, + "learning_rate": 1.9851587898109707e-05, + "loss": 2.2504, + "step": 5960 + }, + { + "epoch": 0.11018573113207547, + "grad_norm": 2.5625, + "learning_rate": 1.9850592659374117e-05, + "loss": 2.2663, + "step": 5980 + }, + { + "epoch": 0.11055424528301887, + "grad_norm": 2.75, + "learning_rate": 1.984959411990335e-05, + "loss": 2.2752, + "step": 6000 + }, + { + "epoch": 0.11092275943396226, + "grad_norm": 2.84375, + "learning_rate": 1.9848592280032e-05, + "loss": 2.2625, + "step": 6020 + }, + { + "epoch": 0.11129127358490566, + "grad_norm": 2.71875, + "learning_rate": 1.984758714009576e-05, + "loss": 2.2441, + "step": 6040 + }, + { + "epoch": 0.11165978773584906, + "grad_norm": 2.5625, + "learning_rate": 1.9846578700431433e-05, + "loss": 2.2496, + "step": 6060 + }, + { + "epoch": 0.11202830188679246, + "grad_norm": 2.515625, + "learning_rate": 1.984556696137693e-05, + "loss": 2.2548, + "step": 6080 + }, + { + "epoch": 0.11239681603773585, + "grad_norm": 2.453125, + "learning_rate": 1.984455192327126e-05, + "loss": 2.2686, + "step": 6100 + }, + { + "epoch": 0.11276533018867925, + "grad_norm": 2.296875, + "learning_rate": 1.9843533586454544e-05, + "loss": 2.2316, + "step": 6120 + }, + { + "epoch": 0.11313384433962265, + "grad_norm": 2.578125, + "learning_rate": 1.9842511951268007e-05, + "loss": 2.279, + "step": 6140 + }, + { + "epoch": 0.11350235849056604, + "grad_norm": 2.5, + "learning_rate": 1.9841487018053976e-05, + "loss": 2.2182, + "step": 6160 + }, + { + "epoch": 0.11387087264150944, + "grad_norm": 2.6875, + "learning_rate": 1.984045878715589e-05, + "loss": 2.2587, + "step": 6180 + }, + { + "epoch": 0.11423938679245282, + "grad_norm": 2.65625, + "learning_rate": 1.983942725891828e-05, + "loss": 2.2549, + "step": 6200 + }, + { + "epoch": 0.11460790094339622, + "grad_norm": 2.46875, + "learning_rate": 1.98383924336868e-05, + "loss": 2.25, + "step": 6220 + }, + { + "epoch": 0.11497641509433962, + "grad_norm": 2.828125, + "learning_rate": 1.983735431180819e-05, + "loss": 2.2156, + "step": 6240 + }, + { + "epoch": 0.11534492924528301, + "grad_norm": 3.515625, + "learning_rate": 1.9836312893630315e-05, + "loss": 2.2451, + "step": 6260 + }, + { + "epoch": 0.11571344339622641, + "grad_norm": 2.59375, + "learning_rate": 1.983526817950212e-05, + "loss": 2.227, + "step": 6280 + }, + { + "epoch": 0.11608195754716981, + "grad_norm": 2.734375, + "learning_rate": 1.983422016977368e-05, + "loss": 2.2483, + "step": 6300 + }, + { + "epoch": 0.1164504716981132, + "grad_norm": 2.953125, + "learning_rate": 1.983316886479615e-05, + "loss": 2.2234, + "step": 6320 + }, + { + "epoch": 0.1168189858490566, + "grad_norm": 2.734375, + "learning_rate": 1.983211426492181e-05, + "loss": 2.2457, + "step": 6340 + }, + { + "epoch": 0.1171875, + "grad_norm": 2.828125, + "learning_rate": 1.9831056370504036e-05, + "loss": 2.2379, + "step": 6360 + }, + { + "epoch": 0.1175560141509434, + "grad_norm": 3.0, + "learning_rate": 1.9829995181897298e-05, + "loss": 2.2439, + "step": 6380 + }, + { + "epoch": 0.1179245283018868, + "grad_norm": 2.90625, + "learning_rate": 1.982893069945719e-05, + "loss": 2.2418, + "step": 6400 + }, + { + "epoch": 0.11829304245283019, + "grad_norm": 2.59375, + "learning_rate": 1.982786292354039e-05, + "loss": 2.2512, + "step": 6420 + }, + { + "epoch": 0.11866155660377359, + "grad_norm": 2.5, + "learning_rate": 1.9826791854504693e-05, + "loss": 2.2769, + "step": 6440 + }, + { + "epoch": 0.11903007075471699, + "grad_norm": 2.796875, + "learning_rate": 1.9825717492708988e-05, + "loss": 2.2344, + "step": 6460 + }, + { + "epoch": 0.11939858490566038, + "grad_norm": 2.734375, + "learning_rate": 1.9824639838513276e-05, + "loss": 2.2451, + "step": 6480 + }, + { + "epoch": 0.11976709905660378, + "grad_norm": 2.734375, + "learning_rate": 1.9823558892278662e-05, + "loss": 2.2622, + "step": 6500 + }, + { + "epoch": 0.12013561320754718, + "grad_norm": 3.234375, + "learning_rate": 1.9822474654367335e-05, + "loss": 2.2662, + "step": 6520 + }, + { + "epoch": 0.12050412735849056, + "grad_norm": 3.125, + "learning_rate": 1.9821387125142614e-05, + "loss": 2.2422, + "step": 6540 + }, + { + "epoch": 0.12087264150943396, + "grad_norm": 2.921875, + "learning_rate": 1.9820296304968907e-05, + "loss": 2.2187, + "step": 6560 + }, + { + "epoch": 0.12124115566037735, + "grad_norm": 3.265625, + "learning_rate": 1.9819202194211716e-05, + "loss": 2.2485, + "step": 6580 + }, + { + "epoch": 0.12160966981132075, + "grad_norm": 2.90625, + "learning_rate": 1.9818104793237668e-05, + "loss": 2.2277, + "step": 6600 + }, + { + "epoch": 0.12197818396226415, + "grad_norm": 2.421875, + "learning_rate": 1.9817004102414474e-05, + "loss": 2.2287, + "step": 6620 + }, + { + "epoch": 0.12234669811320754, + "grad_norm": 3.0625, + "learning_rate": 1.981590012211095e-05, + "loss": 2.2658, + "step": 6640 + }, + { + "epoch": 0.12271521226415094, + "grad_norm": 2.515625, + "learning_rate": 1.9814792852697023e-05, + "loss": 2.2202, + "step": 6660 + }, + { + "epoch": 0.12308372641509434, + "grad_norm": 3.109375, + "learning_rate": 1.9813682294543714e-05, + "loss": 2.2677, + "step": 6680 + }, + { + "epoch": 0.12345224056603774, + "grad_norm": 3.015625, + "learning_rate": 1.981256844802315e-05, + "loss": 2.1904, + "step": 6700 + }, + { + "epoch": 0.12382075471698113, + "grad_norm": 3.390625, + "learning_rate": 1.981145131350856e-05, + "loss": 2.2323, + "step": 6720 + }, + { + "epoch": 0.12418926886792453, + "grad_norm": 2.96875, + "learning_rate": 1.981033089137427e-05, + "loss": 2.2479, + "step": 6740 + }, + { + "epoch": 0.12455778301886793, + "grad_norm": 2.75, + "learning_rate": 1.9809207181995714e-05, + "loss": 2.2513, + "step": 6760 + }, + { + "epoch": 0.12492629716981132, + "grad_norm": 2.890625, + "learning_rate": 1.980808018574942e-05, + "loss": 2.242, + "step": 6780 + }, + { + "epoch": 0.1252948113207547, + "grad_norm": 2.796875, + "learning_rate": 1.9806949903013022e-05, + "loss": 2.1989, + "step": 6800 + }, + { + "epoch": 0.12566332547169812, + "grad_norm": 3.1875, + "learning_rate": 1.9805816334165258e-05, + "loss": 2.231, + "step": 6820 + }, + { + "epoch": 0.1260318396226415, + "grad_norm": 2.953125, + "learning_rate": 1.9804679479585968e-05, + "loss": 2.2514, + "step": 6840 + }, + { + "epoch": 0.1264003537735849, + "grad_norm": 2.71875, + "learning_rate": 1.9803539339656083e-05, + "loss": 2.2144, + "step": 6860 + }, + { + "epoch": 0.1267688679245283, + "grad_norm": 2.359375, + "learning_rate": 1.980239591475764e-05, + "loss": 2.2229, + "step": 6880 + }, + { + "epoch": 0.1271373820754717, + "grad_norm": 2.78125, + "learning_rate": 1.9801249205273783e-05, + "loss": 2.2269, + "step": 6900 + }, + { + "epoch": 0.1275058962264151, + "grad_norm": 2.65625, + "learning_rate": 1.9800099211588747e-05, + "loss": 2.216, + "step": 6920 + }, + { + "epoch": 0.1278744103773585, + "grad_norm": 2.515625, + "learning_rate": 1.9798945934087874e-05, + "loss": 2.1745, + "step": 6940 + }, + { + "epoch": 0.12824292452830188, + "grad_norm": 2.890625, + "learning_rate": 1.9797789373157602e-05, + "loss": 2.2307, + "step": 6960 + }, + { + "epoch": 0.1286114386792453, + "grad_norm": 2.453125, + "learning_rate": 1.9796629529185472e-05, + "loss": 2.2562, + "step": 6980 + }, + { + "epoch": 0.12897995283018868, + "grad_norm": 2.59375, + "learning_rate": 1.9795466402560124e-05, + "loss": 2.2438, + "step": 7000 + }, + { + "epoch": 0.1293484669811321, + "grad_norm": 3.375, + "learning_rate": 1.97942999936713e-05, + "loss": 2.2086, + "step": 7020 + }, + { + "epoch": 0.12971698113207547, + "grad_norm": 2.78125, + "learning_rate": 1.9793130302909838e-05, + "loss": 2.2197, + "step": 7040 + }, + { + "epoch": 0.13008549528301888, + "grad_norm": 3.0, + "learning_rate": 1.9791957330667682e-05, + "loss": 2.2593, + "step": 7060 + }, + { + "epoch": 0.13045400943396226, + "grad_norm": 2.921875, + "learning_rate": 1.9790781077337864e-05, + "loss": 2.238, + "step": 7080 + }, + { + "epoch": 0.13082252358490565, + "grad_norm": 2.859375, + "learning_rate": 1.978960154331453e-05, + "loss": 2.2766, + "step": 7100 + }, + { + "epoch": 0.13119103773584906, + "grad_norm": 2.3125, + "learning_rate": 1.9788418728992914e-05, + "loss": 2.2291, + "step": 7120 + }, + { + "epoch": 0.13155955188679244, + "grad_norm": 2.5625, + "learning_rate": 1.9787232634769357e-05, + "loss": 2.242, + "step": 7140 + }, + { + "epoch": 0.13192806603773585, + "grad_norm": 3.015625, + "learning_rate": 1.978604326104129e-05, + "loss": 2.2222, + "step": 7160 + }, + { + "epoch": 0.13229658018867924, + "grad_norm": 2.765625, + "learning_rate": 1.978485060820725e-05, + "loss": 2.2447, + "step": 7180 + }, + { + "epoch": 0.13266509433962265, + "grad_norm": 2.671875, + "learning_rate": 1.9783654676666875e-05, + "loss": 2.2322, + "step": 7200 + }, + { + "epoch": 0.13303360849056603, + "grad_norm": 3.015625, + "learning_rate": 1.9782455466820893e-05, + "loss": 2.2134, + "step": 7220 + }, + { + "epoch": 0.13340212264150944, + "grad_norm": 2.390625, + "learning_rate": 1.9781252979071135e-05, + "loss": 2.2347, + "step": 7240 + }, + { + "epoch": 0.13377063679245282, + "grad_norm": 2.609375, + "learning_rate": 1.9780047213820534e-05, + "loss": 2.2538, + "step": 7260 + }, + { + "epoch": 0.13413915094339623, + "grad_norm": 2.96875, + "learning_rate": 1.9778838171473114e-05, + "loss": 2.2252, + "step": 7280 + }, + { + "epoch": 0.13450766509433962, + "grad_norm": 3.265625, + "learning_rate": 1.9777625852434002e-05, + "loss": 2.2305, + "step": 7300 + }, + { + "epoch": 0.13487617924528303, + "grad_norm": 3.21875, + "learning_rate": 1.9776410257109424e-05, + "loss": 2.2282, + "step": 7320 + }, + { + "epoch": 0.1352446933962264, + "grad_norm": 2.5625, + "learning_rate": 1.97751913859067e-05, + "loss": 2.1966, + "step": 7340 + }, + { + "epoch": 0.13561320754716982, + "grad_norm": 3.03125, + "learning_rate": 1.9773969239234247e-05, + "loss": 2.2329, + "step": 7360 + }, + { + "epoch": 0.1359817216981132, + "grad_norm": 2.3125, + "learning_rate": 1.9772743817501583e-05, + "loss": 2.2342, + "step": 7380 + }, + { + "epoch": 0.13635023584905662, + "grad_norm": 2.84375, + "learning_rate": 1.977151512111932e-05, + "loss": 2.2592, + "step": 7400 + }, + { + "epoch": 0.13671875, + "grad_norm": 3.375, + "learning_rate": 1.9770283150499177e-05, + "loss": 2.2089, + "step": 7420 + }, + { + "epoch": 0.13708726415094338, + "grad_norm": 2.515625, + "learning_rate": 1.9769047906053954e-05, + "loss": 2.2167, + "step": 7440 + }, + { + "epoch": 0.1374557783018868, + "grad_norm": 2.59375, + "learning_rate": 1.9767809388197563e-05, + "loss": 2.2144, + "step": 7460 + }, + { + "epoch": 0.13782429245283018, + "grad_norm": 2.625, + "learning_rate": 1.9766567597345e-05, + "loss": 2.2292, + "step": 7480 + }, + { + "epoch": 0.1381928066037736, + "grad_norm": 2.421875, + "learning_rate": 1.9765322533912374e-05, + "loss": 2.2102, + "step": 7500 + }, + { + "epoch": 0.13856132075471697, + "grad_norm": 2.84375, + "learning_rate": 1.9764074198316866e-05, + "loss": 2.2313, + "step": 7520 + }, + { + "epoch": 0.13892983490566038, + "grad_norm": 2.609375, + "learning_rate": 1.976282259097678e-05, + "loss": 2.1986, + "step": 7540 + }, + { + "epoch": 0.13929834905660377, + "grad_norm": 2.609375, + "learning_rate": 1.9761567712311504e-05, + "loss": 2.2222, + "step": 7560 + }, + { + "epoch": 0.13966686320754718, + "grad_norm": 3.265625, + "learning_rate": 1.9760309562741513e-05, + "loss": 2.1785, + "step": 7580 + }, + { + "epoch": 0.14003537735849056, + "grad_norm": 2.515625, + "learning_rate": 1.97590481426884e-05, + "loss": 2.2736, + "step": 7600 + }, + { + "epoch": 0.14040389150943397, + "grad_norm": 2.4375, + "learning_rate": 1.975778345257483e-05, + "loss": 2.1893, + "step": 7620 + }, + { + "epoch": 0.14077240566037735, + "grad_norm": 3.171875, + "learning_rate": 1.975651549282458e-05, + "loss": 2.2248, + "step": 7640 + }, + { + "epoch": 0.14114091981132076, + "grad_norm": 2.71875, + "learning_rate": 1.9755244263862524e-05, + "loss": 2.2277, + "step": 7660 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 2.84375, + "learning_rate": 1.9753969766114612e-05, + "loss": 2.1736, + "step": 7680 + }, + { + "epoch": 0.14187794811320756, + "grad_norm": 2.4375, + "learning_rate": 1.9752692000007914e-05, + "loss": 2.2122, + "step": 7700 + }, + { + "epoch": 0.14224646226415094, + "grad_norm": 2.890625, + "learning_rate": 1.9751410965970582e-05, + "loss": 2.2241, + "step": 7720 + }, + { + "epoch": 0.14261497641509435, + "grad_norm": 3.109375, + "learning_rate": 1.975012666443186e-05, + "loss": 2.176, + "step": 7740 + }, + { + "epoch": 0.14298349056603774, + "grad_norm": 3.03125, + "learning_rate": 1.9748839095822096e-05, + "loss": 2.2095, + "step": 7760 + }, + { + "epoch": 0.14335200471698112, + "grad_norm": 2.671875, + "learning_rate": 1.9747548260572723e-05, + "loss": 2.2581, + "step": 7780 + }, + { + "epoch": 0.14372051886792453, + "grad_norm": 2.640625, + "learning_rate": 1.9746254159116276e-05, + "loss": 2.2525, + "step": 7800 + }, + { + "epoch": 0.1440890330188679, + "grad_norm": 2.765625, + "learning_rate": 1.9744956791886387e-05, + "loss": 2.2686, + "step": 7820 + }, + { + "epoch": 0.14445754716981132, + "grad_norm": 3.3125, + "learning_rate": 1.9743656159317775e-05, + "loss": 2.2171, + "step": 7840 + }, + { + "epoch": 0.1448260613207547, + "grad_norm": 2.828125, + "learning_rate": 1.974235226184625e-05, + "loss": 2.2372, + "step": 7860 + }, + { + "epoch": 0.14519457547169812, + "grad_norm": 2.921875, + "learning_rate": 1.974104509990873e-05, + "loss": 2.2186, + "step": 7880 + }, + { + "epoch": 0.1455630896226415, + "grad_norm": 2.734375, + "learning_rate": 1.9739734673943218e-05, + "loss": 2.2089, + "step": 7900 + }, + { + "epoch": 0.1459316037735849, + "grad_norm": 3.109375, + "learning_rate": 1.97384209843888e-05, + "loss": 2.2254, + "step": 7920 + }, + { + "epoch": 0.1463001179245283, + "grad_norm": 2.71875, + "learning_rate": 1.9737104031685682e-05, + "loss": 2.2486, + "step": 7940 + }, + { + "epoch": 0.1466686320754717, + "grad_norm": 2.75, + "learning_rate": 1.9735783816275143e-05, + "loss": 2.2414, + "step": 7960 + }, + { + "epoch": 0.1470371462264151, + "grad_norm": 2.46875, + "learning_rate": 1.9734460338599557e-05, + "loss": 2.2233, + "step": 7980 + }, + { + "epoch": 0.1474056603773585, + "grad_norm": 2.671875, + "learning_rate": 1.9733133599102395e-05, + "loss": 2.2269, + "step": 8000 + }, + { + "epoch": 0.14777417452830188, + "grad_norm": 2.484375, + "learning_rate": 1.9731803598228228e-05, + "loss": 2.2209, + "step": 8020 + }, + { + "epoch": 0.1481426886792453, + "grad_norm": 2.65625, + "learning_rate": 1.97304703364227e-05, + "loss": 2.2111, + "step": 8040 + }, + { + "epoch": 0.14851120283018868, + "grad_norm": 2.828125, + "learning_rate": 1.9729133814132576e-05, + "loss": 2.2264, + "step": 8060 + }, + { + "epoch": 0.1488797169811321, + "grad_norm": 2.765625, + "learning_rate": 1.9727794031805685e-05, + "loss": 2.2108, + "step": 8080 + }, + { + "epoch": 0.14924823113207547, + "grad_norm": 2.9375, + "learning_rate": 1.9726450989890968e-05, + "loss": 2.2133, + "step": 8100 + }, + { + "epoch": 0.14961674528301888, + "grad_norm": 2.703125, + "learning_rate": 1.9725104688838444e-05, + "loss": 2.2374, + "step": 8120 + }, + { + "epoch": 0.14998525943396226, + "grad_norm": 2.921875, + "learning_rate": 1.972375512909924e-05, + "loss": 2.2321, + "step": 8140 + }, + { + "epoch": 0.15035377358490565, + "grad_norm": 2.765625, + "learning_rate": 1.9722402311125557e-05, + "loss": 2.2477, + "step": 8160 + }, + { + "epoch": 0.15072228773584906, + "grad_norm": 2.96875, + "learning_rate": 1.9721046235370706e-05, + "loss": 2.2122, + "step": 8180 + }, + { + "epoch": 0.15109080188679244, + "grad_norm": 2.703125, + "learning_rate": 1.9719686902289076e-05, + "loss": 2.2299, + "step": 8200 + }, + { + "epoch": 0.15145931603773585, + "grad_norm": 2.796875, + "learning_rate": 1.9718324312336155e-05, + "loss": 2.2272, + "step": 8220 + }, + { + "epoch": 0.15182783018867924, + "grad_norm": 2.859375, + "learning_rate": 1.9716958465968514e-05, + "loss": 2.2218, + "step": 8240 + }, + { + "epoch": 0.15219634433962265, + "grad_norm": 3.0, + "learning_rate": 1.971558936364383e-05, + "loss": 2.2032, + "step": 8260 + }, + { + "epoch": 0.15256485849056603, + "grad_norm": 2.65625, + "learning_rate": 1.971421700582085e-05, + "loss": 2.2077, + "step": 8280 + }, + { + "epoch": 0.15293337264150944, + "grad_norm": 2.671875, + "learning_rate": 1.971284139295943e-05, + "loss": 2.19, + "step": 8300 + }, + { + "epoch": 0.15330188679245282, + "grad_norm": 2.5625, + "learning_rate": 1.971146252552051e-05, + "loss": 2.2047, + "step": 8320 + }, + { + "epoch": 0.15367040094339623, + "grad_norm": 2.71875, + "learning_rate": 1.971008040396612e-05, + "loss": 2.2468, + "step": 8340 + }, + { + "epoch": 0.15403891509433962, + "grad_norm": 2.734375, + "learning_rate": 1.9708695028759384e-05, + "loss": 2.2304, + "step": 8360 + }, + { + "epoch": 0.15440742924528303, + "grad_norm": 2.765625, + "learning_rate": 1.970730640036451e-05, + "loss": 2.1885, + "step": 8380 + }, + { + "epoch": 0.1547759433962264, + "grad_norm": 2.84375, + "learning_rate": 1.9705914519246794e-05, + "loss": 2.2055, + "step": 8400 + }, + { + "epoch": 0.15514445754716982, + "grad_norm": 2.765625, + "learning_rate": 1.9704519385872638e-05, + "loss": 2.1955, + "step": 8420 + }, + { + "epoch": 0.1555129716981132, + "grad_norm": 3.34375, + "learning_rate": 1.9703121000709517e-05, + "loss": 2.216, + "step": 8440 + }, + { + "epoch": 0.15588148584905662, + "grad_norm": 2.515625, + "learning_rate": 1.9701719364226004e-05, + "loss": 2.2677, + "step": 8460 + }, + { + "epoch": 0.15625, + "grad_norm": 2.703125, + "learning_rate": 1.9700314476891757e-05, + "loss": 2.1771, + "step": 8480 + }, + { + "epoch": 0.15661851415094338, + "grad_norm": 2.875, + "learning_rate": 1.9698906339177527e-05, + "loss": 2.1936, + "step": 8500 + }, + { + "epoch": 0.1569870283018868, + "grad_norm": 2.828125, + "learning_rate": 1.9697494951555155e-05, + "loss": 2.2509, + "step": 8520 + }, + { + "epoch": 0.15735554245283018, + "grad_norm": 3.21875, + "learning_rate": 1.9696080314497565e-05, + "loss": 2.2185, + "step": 8540 + }, + { + "epoch": 0.1577240566037736, + "grad_norm": 3.015625, + "learning_rate": 1.9694662428478768e-05, + "loss": 2.2016, + "step": 8560 + }, + { + "epoch": 0.15809257075471697, + "grad_norm": 2.609375, + "learning_rate": 1.969324129397388e-05, + "loss": 2.2313, + "step": 8580 + }, + { + "epoch": 0.15846108490566038, + "grad_norm": 2.984375, + "learning_rate": 1.969181691145909e-05, + "loss": 2.2057, + "step": 8600 + }, + { + "epoch": 0.15882959905660377, + "grad_norm": 3.09375, + "learning_rate": 1.969038928141168e-05, + "loss": 2.2409, + "step": 8620 + }, + { + "epoch": 0.15919811320754718, + "grad_norm": 2.9375, + "learning_rate": 1.9688958404310018e-05, + "loss": 2.2247, + "step": 8640 + }, + { + "epoch": 0.15956662735849056, + "grad_norm": 3.046875, + "learning_rate": 1.9687524280633562e-05, + "loss": 2.1909, + "step": 8660 + }, + { + "epoch": 0.15993514150943397, + "grad_norm": 3.1875, + "learning_rate": 1.9686086910862863e-05, + "loss": 2.2172, + "step": 8680 + }, + { + "epoch": 0.16030365566037735, + "grad_norm": 3.171875, + "learning_rate": 1.968464629547955e-05, + "loss": 2.1848, + "step": 8700 + }, + { + "epoch": 0.16067216981132076, + "grad_norm": 2.8125, + "learning_rate": 1.9683202434966347e-05, + "loss": 2.2269, + "step": 8720 + }, + { + "epoch": 0.16104068396226415, + "grad_norm": 2.765625, + "learning_rate": 1.968175532980706e-05, + "loss": 2.1916, + "step": 8740 + }, + { + "epoch": 0.16140919811320756, + "grad_norm": 2.578125, + "learning_rate": 1.9680304980486586e-05, + "loss": 2.1978, + "step": 8760 + }, + { + "epoch": 0.16177771226415094, + "grad_norm": 2.96875, + "learning_rate": 1.9678851387490907e-05, + "loss": 2.2192, + "step": 8780 + }, + { + "epoch": 0.16214622641509435, + "grad_norm": 2.734375, + "learning_rate": 1.9677394551307094e-05, + "loss": 2.1702, + "step": 8800 + }, + { + "epoch": 0.16251474056603774, + "grad_norm": 2.859375, + "learning_rate": 1.9675934472423304e-05, + "loss": 2.216, + "step": 8820 + }, + { + "epoch": 0.16288325471698112, + "grad_norm": 2.84375, + "learning_rate": 1.967447115132878e-05, + "loss": 2.234, + "step": 8840 + }, + { + "epoch": 0.16325176886792453, + "grad_norm": 2.703125, + "learning_rate": 1.9673004588513847e-05, + "loss": 2.2003, + "step": 8860 + }, + { + "epoch": 0.1636202830188679, + "grad_norm": 3.25, + "learning_rate": 1.9671534784469924e-05, + "loss": 2.2197, + "step": 8880 + }, + { + "epoch": 0.16398879716981132, + "grad_norm": 2.671875, + "learning_rate": 1.9670061739689515e-05, + "loss": 2.2036, + "step": 8900 + }, + { + "epoch": 0.1643573113207547, + "grad_norm": 3.265625, + "learning_rate": 1.9668585454666208e-05, + "loss": 2.2055, + "step": 8920 + }, + { + "epoch": 0.16472582547169812, + "grad_norm": 2.671875, + "learning_rate": 1.966710592989467e-05, + "loss": 2.1999, + "step": 8940 + }, + { + "epoch": 0.1650943396226415, + "grad_norm": 2.734375, + "learning_rate": 1.966562316587066e-05, + "loss": 2.183, + "step": 8960 + }, + { + "epoch": 0.1654628537735849, + "grad_norm": 2.921875, + "learning_rate": 1.9664137163091035e-05, + "loss": 2.1926, + "step": 8980 + }, + { + "epoch": 0.1658313679245283, + "grad_norm": 2.625, + "learning_rate": 1.966264792205371e-05, + "loss": 2.1721, + "step": 9000 + }, + { + "epoch": 0.1661998820754717, + "grad_norm": 2.5, + "learning_rate": 1.9661155443257706e-05, + "loss": 2.2361, + "step": 9020 + }, + { + "epoch": 0.1665683962264151, + "grad_norm": 2.859375, + "learning_rate": 1.965965972720312e-05, + "loss": 2.2472, + "step": 9040 + }, + { + "epoch": 0.1669369103773585, + "grad_norm": 2.765625, + "learning_rate": 1.9658160774391142e-05, + "loss": 2.1874, + "step": 9060 + }, + { + "epoch": 0.16730542452830188, + "grad_norm": 3.015625, + "learning_rate": 1.965665858532404e-05, + "loss": 2.2539, + "step": 9080 + }, + { + "epoch": 0.1676739386792453, + "grad_norm": 3.25, + "learning_rate": 1.965515316050516e-05, + "loss": 2.2167, + "step": 9100 + }, + { + "epoch": 0.16804245283018868, + "grad_norm": 2.734375, + "learning_rate": 1.9653644500438945e-05, + "loss": 2.1935, + "step": 9120 + }, + { + "epoch": 0.1684109669811321, + "grad_norm": 3.203125, + "learning_rate": 1.9652132605630917e-05, + "loss": 2.2202, + "step": 9140 + }, + { + "epoch": 0.16877948113207547, + "grad_norm": 2.71875, + "learning_rate": 1.9650617476587673e-05, + "loss": 2.2077, + "step": 9160 + }, + { + "epoch": 0.16914799528301888, + "grad_norm": 2.90625, + "learning_rate": 1.9649099113816917e-05, + "loss": 2.2349, + "step": 9180 + }, + { + "epoch": 0.16951650943396226, + "grad_norm": 2.484375, + "learning_rate": 1.964757751782741e-05, + "loss": 2.172, + "step": 9200 + }, + { + "epoch": 0.16988502358490565, + "grad_norm": 2.921875, + "learning_rate": 1.9646052689129015e-05, + "loss": 2.2375, + "step": 9220 + }, + { + "epoch": 0.17025353773584906, + "grad_norm": 2.6875, + "learning_rate": 1.9644524628232672e-05, + "loss": 2.1993, + "step": 9240 + }, + { + "epoch": 0.17062205188679244, + "grad_norm": 2.84375, + "learning_rate": 1.9642993335650394e-05, + "loss": 2.2055, + "step": 9260 + }, + { + "epoch": 0.17099056603773585, + "grad_norm": 2.890625, + "learning_rate": 1.9641458811895293e-05, + "loss": 2.2012, + "step": 9280 + }, + { + "epoch": 0.17135908018867924, + "grad_norm": 3.046875, + "learning_rate": 1.963992105748156e-05, + "loss": 2.2252, + "step": 9300 + }, + { + "epoch": 0.17172759433962265, + "grad_norm": 2.59375, + "learning_rate": 1.9638380072924458e-05, + "loss": 2.1948, + "step": 9320 + }, + { + "epoch": 0.17209610849056603, + "grad_norm": 2.8125, + "learning_rate": 1.9636835858740342e-05, + "loss": 2.1465, + "step": 9340 + }, + { + "epoch": 0.17246462264150944, + "grad_norm": 2.65625, + "learning_rate": 1.9635288415446654e-05, + "loss": 2.2019, + "step": 9360 + }, + { + "epoch": 0.17283313679245282, + "grad_norm": 2.8125, + "learning_rate": 1.9633737743561906e-05, + "loss": 2.2203, + "step": 9380 + }, + { + "epoch": 0.17320165094339623, + "grad_norm": 3.53125, + "learning_rate": 1.9632183843605694e-05, + "loss": 2.2318, + "step": 9400 + }, + { + "epoch": 0.17357016509433962, + "grad_norm": 2.71875, + "learning_rate": 1.9630626716098705e-05, + "loss": 2.2084, + "step": 9420 + }, + { + "epoch": 0.17393867924528303, + "grad_norm": 2.640625, + "learning_rate": 1.9629066361562693e-05, + "loss": 2.1651, + "step": 9440 + }, + { + "epoch": 0.1743071933962264, + "grad_norm": 3.15625, + "learning_rate": 1.9627502780520505e-05, + "loss": 2.2012, + "step": 9460 + }, + { + "epoch": 0.17467570754716982, + "grad_norm": 3.015625, + "learning_rate": 1.962593597349607e-05, + "loss": 2.2012, + "step": 9480 + }, + { + "epoch": 0.1750442216981132, + "grad_norm": 2.515625, + "learning_rate": 1.962436594101439e-05, + "loss": 2.2011, + "step": 9500 + }, + { + "epoch": 0.17541273584905662, + "grad_norm": 2.59375, + "learning_rate": 1.962279268360155e-05, + "loss": 2.2029, + "step": 9520 + }, + { + "epoch": 0.17578125, + "grad_norm": 2.5625, + "learning_rate": 1.962121620178472e-05, + "loss": 2.1761, + "step": 9540 + }, + { + "epoch": 0.17614976415094338, + "grad_norm": 3.0625, + "learning_rate": 1.961963649609214e-05, + "loss": 2.2287, + "step": 9560 + }, + { + "epoch": 0.1765182783018868, + "grad_norm": 2.734375, + "learning_rate": 1.9618053567053148e-05, + "loss": 2.2123, + "step": 9580 + }, + { + "epoch": 0.17688679245283018, + "grad_norm": 2.875, + "learning_rate": 1.9616467415198143e-05, + "loss": 2.2023, + "step": 9600 + }, + { + "epoch": 0.1772553066037736, + "grad_norm": 3.828125, + "learning_rate": 1.961487804105862e-05, + "loss": 2.1981, + "step": 9620 + }, + { + "epoch": 0.17762382075471697, + "grad_norm": 2.78125, + "learning_rate": 1.9613285445167137e-05, + "loss": 2.196, + "step": 9640 + }, + { + "epoch": 0.17799233490566038, + "grad_norm": 2.546875, + "learning_rate": 1.961168962805735e-05, + "loss": 2.2064, + "step": 9660 + }, + { + "epoch": 0.17836084905660377, + "grad_norm": 2.796875, + "learning_rate": 1.961009059026398e-05, + "loss": 2.2242, + "step": 9680 + }, + { + "epoch": 0.17872936320754718, + "grad_norm": 2.90625, + "learning_rate": 1.9608488332322834e-05, + "loss": 2.2152, + "step": 9700 + }, + { + "epoch": 0.17909787735849056, + "grad_norm": 2.875, + "learning_rate": 1.9606882854770798e-05, + "loss": 2.1737, + "step": 9720 + }, + { + "epoch": 0.17946639150943397, + "grad_norm": 2.59375, + "learning_rate": 1.9605274158145828e-05, + "loss": 2.2247, + "step": 9740 + }, + { + "epoch": 0.17983490566037735, + "grad_norm": 2.703125, + "learning_rate": 1.9603662242986972e-05, + "loss": 2.2023, + "step": 9760 + }, + { + "epoch": 0.18020341981132076, + "grad_norm": 2.546875, + "learning_rate": 1.9602047109834354e-05, + "loss": 2.1773, + "step": 9780 + }, + { + "epoch": 0.18057193396226415, + "grad_norm": 2.859375, + "learning_rate": 1.9600428759229166e-05, + "loss": 2.1494, + "step": 9800 + }, + { + "epoch": 0.18094044811320756, + "grad_norm": 2.640625, + "learning_rate": 1.9598807191713685e-05, + "loss": 2.177, + "step": 9820 + }, + { + "epoch": 0.18130896226415094, + "grad_norm": 2.90625, + "learning_rate": 1.9597182407831267e-05, + "loss": 2.1773, + "step": 9840 + }, + { + "epoch": 0.18167747641509435, + "grad_norm": 2.609375, + "learning_rate": 1.9595554408126346e-05, + "loss": 2.1588, + "step": 9860 + }, + { + "epoch": 0.18204599056603774, + "grad_norm": 3.09375, + "learning_rate": 1.959392319314443e-05, + "loss": 2.2005, + "step": 9880 + }, + { + "epoch": 0.18241450471698112, + "grad_norm": 2.71875, + "learning_rate": 1.9592288763432102e-05, + "loss": 2.2097, + "step": 9900 + }, + { + "epoch": 0.18278301886792453, + "grad_norm": 2.9375, + "learning_rate": 1.9590651119537035e-05, + "loss": 2.1897, + "step": 9920 + }, + { + "epoch": 0.1831515330188679, + "grad_norm": 3.0, + "learning_rate": 1.9589010262007967e-05, + "loss": 2.1971, + "step": 9940 + }, + { + "epoch": 0.18352004716981132, + "grad_norm": 2.765625, + "learning_rate": 1.958736619139472e-05, + "loss": 2.2039, + "step": 9960 + }, + { + "epoch": 0.1838885613207547, + "grad_norm": 2.859375, + "learning_rate": 1.9585718908248178e-05, + "loss": 2.2018, + "step": 9980 + }, + { + "epoch": 0.18425707547169812, + "grad_norm": 2.671875, + "learning_rate": 1.9584068413120323e-05, + "loss": 2.2061, + "step": 10000 + }, + { + "epoch": 0.1846255896226415, + "grad_norm": 2.765625, + "learning_rate": 1.9582414706564197e-05, + "loss": 2.2435, + "step": 10020 + }, + { + "epoch": 0.1849941037735849, + "grad_norm": 2.78125, + "learning_rate": 1.958075778913393e-05, + "loss": 2.1743, + "step": 10040 + }, + { + "epoch": 0.1853626179245283, + "grad_norm": 2.71875, + "learning_rate": 1.9579097661384713e-05, + "loss": 2.1747, + "step": 10060 + }, + { + "epoch": 0.1857311320754717, + "grad_norm": 2.703125, + "learning_rate": 1.9577434323872825e-05, + "loss": 2.2214, + "step": 10080 + }, + { + "epoch": 0.1860996462264151, + "grad_norm": 2.9375, + "learning_rate": 1.9575767777155622e-05, + "loss": 2.1744, + "step": 10100 + }, + { + "epoch": 0.1864681603773585, + "grad_norm": 3.03125, + "learning_rate": 1.9574098021791522e-05, + "loss": 2.1662, + "step": 10120 + }, + { + "epoch": 0.18683667452830188, + "grad_norm": 2.6875, + "learning_rate": 1.9572425058340032e-05, + "loss": 2.1934, + "step": 10140 + }, + { + "epoch": 0.1872051886792453, + "grad_norm": 2.9375, + "learning_rate": 1.9570748887361728e-05, + "loss": 2.2172, + "step": 10160 + }, + { + "epoch": 0.18757370283018868, + "grad_norm": 3.015625, + "learning_rate": 1.9569069509418256e-05, + "loss": 2.2179, + "step": 10180 + }, + { + "epoch": 0.1879422169811321, + "grad_norm": 2.5625, + "learning_rate": 1.956738692507235e-05, + "loss": 2.2501, + "step": 10200 + }, + { + "epoch": 0.18831073113207547, + "grad_norm": 2.625, + "learning_rate": 1.95657011348878e-05, + "loss": 2.1705, + "step": 10220 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 2.609375, + "learning_rate": 1.9564012139429488e-05, + "loss": 2.1915, + "step": 10240 + }, + { + "epoch": 0.18904775943396226, + "grad_norm": 2.625, + "learning_rate": 1.9562319939263362e-05, + "loss": 2.187, + "step": 10260 + }, + { + "epoch": 0.18941627358490565, + "grad_norm": 2.6875, + "learning_rate": 1.9560624534956444e-05, + "loss": 2.2455, + "step": 10280 + }, + { + "epoch": 0.18978478773584906, + "grad_norm": 3.09375, + "learning_rate": 1.9558925927076827e-05, + "loss": 2.1893, + "step": 10300 + }, + { + "epoch": 0.19015330188679244, + "grad_norm": 2.59375, + "learning_rate": 1.9557224116193678e-05, + "loss": 2.1642, + "step": 10320 + }, + { + "epoch": 0.19052181603773585, + "grad_norm": 2.546875, + "learning_rate": 1.955551910287725e-05, + "loss": 2.1781, + "step": 10340 + }, + { + "epoch": 0.19089033018867924, + "grad_norm": 2.984375, + "learning_rate": 1.9553810887698846e-05, + "loss": 2.1971, + "step": 10360 + }, + { + "epoch": 0.19125884433962265, + "grad_norm": 3.734375, + "learning_rate": 1.9552099471230862e-05, + "loss": 2.17, + "step": 10380 + }, + { + "epoch": 0.19162735849056603, + "grad_norm": 3.0625, + "learning_rate": 1.9550384854046754e-05, + "loss": 2.2178, + "step": 10400 + }, + { + "epoch": 0.19199587264150944, + "grad_norm": 2.796875, + "learning_rate": 1.9548667036721065e-05, + "loss": 2.1904, + "step": 10420 + }, + { + "epoch": 0.19236438679245282, + "grad_norm": 2.890625, + "learning_rate": 1.954694601982939e-05, + "loss": 2.2315, + "step": 10440 + }, + { + "epoch": 0.19273290094339623, + "grad_norm": 2.859375, + "learning_rate": 1.9545221803948418e-05, + "loss": 2.2029, + "step": 10460 + }, + { + "epoch": 0.19310141509433962, + "grad_norm": 2.8125, + "learning_rate": 1.9543494389655888e-05, + "loss": 2.2074, + "step": 10480 + }, + { + "epoch": 0.19346992924528303, + "grad_norm": 3.375, + "learning_rate": 1.954176377753063e-05, + "loss": 2.1917, + "step": 10500 + }, + { + "epoch": 0.1938384433962264, + "grad_norm": 3.109375, + "learning_rate": 1.954002996815253e-05, + "loss": 2.2439, + "step": 10520 + }, + { + "epoch": 0.19420695754716982, + "grad_norm": 3.125, + "learning_rate": 1.9538292962102563e-05, + "loss": 2.1897, + "step": 10540 + }, + { + "epoch": 0.1945754716981132, + "grad_norm": 2.78125, + "learning_rate": 1.9536552759962752e-05, + "loss": 2.1723, + "step": 10560 + }, + { + "epoch": 0.19494398584905662, + "grad_norm": 2.953125, + "learning_rate": 1.9534809362316215e-05, + "loss": 2.1991, + "step": 10580 + }, + { + "epoch": 0.1953125, + "grad_norm": 3.03125, + "learning_rate": 1.9533062769747124e-05, + "loss": 2.2021, + "step": 10600 + }, + { + "epoch": 0.19568101415094338, + "grad_norm": 2.828125, + "learning_rate": 1.9531312982840728e-05, + "loss": 2.1835, + "step": 10620 + }, + { + "epoch": 0.1960495283018868, + "grad_norm": 2.71875, + "learning_rate": 1.9529560002183343e-05, + "loss": 2.1696, + "step": 10640 + }, + { + "epoch": 0.19641804245283018, + "grad_norm": 2.875, + "learning_rate": 1.952780382836236e-05, + "loss": 2.1571, + "step": 10660 + }, + { + "epoch": 0.1967865566037736, + "grad_norm": 3.03125, + "learning_rate": 1.952604446196624e-05, + "loss": 2.2236, + "step": 10680 + }, + { + "epoch": 0.19715507075471697, + "grad_norm": 2.828125, + "learning_rate": 1.9524281903584502e-05, + "loss": 2.2019, + "step": 10700 + }, + { + "epoch": 0.19752358490566038, + "grad_norm": 2.765625, + "learning_rate": 1.9522516153807755e-05, + "loss": 2.184, + "step": 10720 + }, + { + "epoch": 0.19789209905660377, + "grad_norm": 2.890625, + "learning_rate": 1.952074721322766e-05, + "loss": 2.2392, + "step": 10740 + }, + { + "epoch": 0.19826061320754718, + "grad_norm": 2.703125, + "learning_rate": 1.9518975082436952e-05, + "loss": 2.1852, + "step": 10760 + }, + { + "epoch": 0.19862912735849056, + "grad_norm": 3.203125, + "learning_rate": 1.951719976202944e-05, + "loss": 2.1956, + "step": 10780 + }, + { + "epoch": 0.19899764150943397, + "grad_norm": 2.671875, + "learning_rate": 1.95154212526e-05, + "loss": 2.2048, + "step": 10800 + }, + { + "epoch": 0.19936615566037735, + "grad_norm": 3.234375, + "learning_rate": 1.9513639554744566e-05, + "loss": 2.1917, + "step": 10820 + }, + { + "epoch": 0.19973466981132076, + "grad_norm": 2.78125, + "learning_rate": 1.951185466906016e-05, + "loss": 2.1877, + "step": 10840 + }, + { + "epoch": 0.20010318396226415, + "grad_norm": 3.203125, + "learning_rate": 1.9510066596144856e-05, + "loss": 2.1866, + "step": 10860 + }, + { + "epoch": 0.20047169811320756, + "grad_norm": 2.75, + "learning_rate": 1.95082753365978e-05, + "loss": 2.1954, + "step": 10880 + }, + { + "epoch": 0.20084021226415094, + "grad_norm": 3.015625, + "learning_rate": 1.9506480891019208e-05, + "loss": 2.2313, + "step": 10900 + }, + { + "epoch": 0.20120872641509435, + "grad_norm": 2.671875, + "learning_rate": 1.9504683260010363e-05, + "loss": 2.1872, + "step": 10920 + }, + { + "epoch": 0.20157724056603774, + "grad_norm": 2.890625, + "learning_rate": 1.950288244417361e-05, + "loss": 2.1607, + "step": 10940 + }, + { + "epoch": 0.20194575471698112, + "grad_norm": 3.078125, + "learning_rate": 1.9501078444112376e-05, + "loss": 2.1636, + "step": 10960 + }, + { + "epoch": 0.20231426886792453, + "grad_norm": 4.0625, + "learning_rate": 1.949927126043114e-05, + "loss": 2.2125, + "step": 10980 + }, + { + "epoch": 0.2026827830188679, + "grad_norm": 2.71875, + "learning_rate": 1.9497460893735452e-05, + "loss": 2.2025, + "step": 11000 + }, + { + "epoch": 0.20305129716981132, + "grad_norm": 2.65625, + "learning_rate": 1.949564734463193e-05, + "loss": 2.1849, + "step": 11020 + }, + { + "epoch": 0.2034198113207547, + "grad_norm": 2.765625, + "learning_rate": 1.9493830613728257e-05, + "loss": 2.1765, + "step": 11040 + }, + { + "epoch": 0.20378832547169812, + "grad_norm": 2.8125, + "learning_rate": 1.9492010701633182e-05, + "loss": 2.1845, + "step": 11060 + }, + { + "epoch": 0.2041568396226415, + "grad_norm": 2.734375, + "learning_rate": 1.949018760895653e-05, + "loss": 2.1858, + "step": 11080 + }, + { + "epoch": 0.2045253537735849, + "grad_norm": 2.75, + "learning_rate": 1.9488361336309168e-05, + "loss": 2.1937, + "step": 11100 + }, + { + "epoch": 0.2048938679245283, + "grad_norm": 2.734375, + "learning_rate": 1.9486531884303056e-05, + "loss": 2.1936, + "step": 11120 + }, + { + "epoch": 0.2052623820754717, + "grad_norm": 2.65625, + "learning_rate": 1.9484699253551195e-05, + "loss": 2.1871, + "step": 11140 + }, + { + "epoch": 0.2056308962264151, + "grad_norm": 2.609375, + "learning_rate": 1.9482863444667674e-05, + "loss": 2.2244, + "step": 11160 + }, + { + "epoch": 0.2059994103773585, + "grad_norm": 2.625, + "learning_rate": 1.9481024458267628e-05, + "loss": 2.1866, + "step": 11180 + }, + { + "epoch": 0.20636792452830188, + "grad_norm": 2.90625, + "learning_rate": 1.9479182294967266e-05, + "loss": 2.214, + "step": 11200 + }, + { + "epoch": 0.2067364386792453, + "grad_norm": 2.546875, + "learning_rate": 1.947733695538386e-05, + "loss": 2.2307, + "step": 11220 + }, + { + "epoch": 0.20710495283018868, + "grad_norm": 3.078125, + "learning_rate": 1.947548844013575e-05, + "loss": 2.211, + "step": 11240 + }, + { + "epoch": 0.2074734669811321, + "grad_norm": 2.765625, + "learning_rate": 1.9473636749842328e-05, + "loss": 2.137, + "step": 11260 + }, + { + "epoch": 0.20784198113207547, + "grad_norm": 2.828125, + "learning_rate": 1.9471781885124067e-05, + "loss": 2.2172, + "step": 11280 + }, + { + "epoch": 0.20821049528301888, + "grad_norm": 3.203125, + "learning_rate": 1.946992384660249e-05, + "loss": 2.2324, + "step": 11300 + }, + { + "epoch": 0.20857900943396226, + "grad_norm": 2.734375, + "learning_rate": 1.9468062634900183e-05, + "loss": 2.2034, + "step": 11320 + }, + { + "epoch": 0.20894752358490565, + "grad_norm": 2.34375, + "learning_rate": 1.9466198250640815e-05, + "loss": 2.181, + "step": 11340 + }, + { + "epoch": 0.20931603773584906, + "grad_norm": 2.890625, + "learning_rate": 1.946433069444909e-05, + "loss": 2.1792, + "step": 11360 + }, + { + "epoch": 0.20968455188679244, + "grad_norm": 3.21875, + "learning_rate": 1.9462459966950796e-05, + "loss": 2.1863, + "step": 11380 + }, + { + "epoch": 0.21005306603773585, + "grad_norm": 2.640625, + "learning_rate": 1.9460586068772773e-05, + "loss": 2.2087, + "step": 11400 + }, + { + "epoch": 0.21042158018867924, + "grad_norm": 2.6875, + "learning_rate": 1.9458709000542926e-05, + "loss": 2.2149, + "step": 11420 + }, + { + "epoch": 0.21079009433962265, + "grad_norm": 2.765625, + "learning_rate": 1.9456828762890225e-05, + "loss": 2.1759, + "step": 11440 + }, + { + "epoch": 0.21115860849056603, + "grad_norm": 2.9375, + "learning_rate": 1.94549453564447e-05, + "loss": 2.1707, + "step": 11460 + }, + { + "epoch": 0.21152712264150944, + "grad_norm": 2.703125, + "learning_rate": 1.9453058781837438e-05, + "loss": 2.2034, + "step": 11480 + }, + { + "epoch": 0.21189563679245282, + "grad_norm": 2.53125, + "learning_rate": 1.9451169039700596e-05, + "loss": 2.183, + "step": 11500 + }, + { + "epoch": 0.21226415094339623, + "grad_norm": 2.8125, + "learning_rate": 1.9449276130667393e-05, + "loss": 2.1882, + "step": 11520 + }, + { + "epoch": 0.21263266509433962, + "grad_norm": 2.671875, + "learning_rate": 1.9447380055372094e-05, + "loss": 2.1667, + "step": 11540 + }, + { + "epoch": 0.21300117924528303, + "grad_norm": 2.53125, + "learning_rate": 1.9445480814450043e-05, + "loss": 2.2198, + "step": 11560 + }, + { + "epoch": 0.2133696933962264, + "grad_norm": 2.875, + "learning_rate": 1.9443578408537636e-05, + "loss": 2.1313, + "step": 11580 + }, + { + "epoch": 0.21373820754716982, + "grad_norm": 2.8125, + "learning_rate": 1.944167283827233e-05, + "loss": 2.19, + "step": 11600 + }, + { + "epoch": 0.2141067216981132, + "grad_norm": 2.703125, + "learning_rate": 1.943976410429264e-05, + "loss": 2.1992, + "step": 11620 + }, + { + "epoch": 0.21447523584905662, + "grad_norm": 3.1875, + "learning_rate": 1.9437852207238155e-05, + "loss": 2.1599, + "step": 11640 + }, + { + "epoch": 0.21484375, + "grad_norm": 2.90625, + "learning_rate": 1.9435937147749505e-05, + "loss": 2.161, + "step": 11660 + }, + { + "epoch": 0.21521226415094338, + "grad_norm": 2.71875, + "learning_rate": 1.9434018926468385e-05, + "loss": 2.2104, + "step": 11680 + }, + { + "epoch": 0.2155807783018868, + "grad_norm": 2.71875, + "learning_rate": 1.943209754403756e-05, + "loss": 2.1673, + "step": 11700 + }, + { + "epoch": 0.21594929245283018, + "grad_norm": 2.59375, + "learning_rate": 1.9430173001100843e-05, + "loss": 2.2167, + "step": 11720 + }, + { + "epoch": 0.2163178066037736, + "grad_norm": 2.734375, + "learning_rate": 1.9428245298303108e-05, + "loss": 2.1566, + "step": 11740 + }, + { + "epoch": 0.21668632075471697, + "grad_norm": 2.84375, + "learning_rate": 1.9426314436290292e-05, + "loss": 2.1789, + "step": 11760 + }, + { + "epoch": 0.21705483490566038, + "grad_norm": 2.859375, + "learning_rate": 1.9424380415709386e-05, + "loss": 2.1768, + "step": 11780 + }, + { + "epoch": 0.21742334905660377, + "grad_norm": 3.484375, + "learning_rate": 1.9422443237208443e-05, + "loss": 2.1829, + "step": 11800 + }, + { + "epoch": 0.21779186320754718, + "grad_norm": 2.765625, + "learning_rate": 1.942050290143657e-05, + "loss": 2.1469, + "step": 11820 + }, + { + "epoch": 0.21816037735849056, + "grad_norm": 3.1875, + "learning_rate": 1.941855940904394e-05, + "loss": 2.1677, + "step": 11840 + }, + { + "epoch": 0.21852889150943397, + "grad_norm": 2.921875, + "learning_rate": 1.941661276068177e-05, + "loss": 2.21, + "step": 11860 + }, + { + "epoch": 0.21889740566037735, + "grad_norm": 2.59375, + "learning_rate": 1.941466295700235e-05, + "loss": 2.1879, + "step": 11880 + }, + { + "epoch": 0.21926591981132076, + "grad_norm": 2.6875, + "learning_rate": 1.9412709998659013e-05, + "loss": 2.1822, + "step": 11900 + }, + { + "epoch": 0.21963443396226415, + "grad_norm": 3.046875, + "learning_rate": 1.9410753886306164e-05, + "loss": 2.2249, + "step": 11920 + }, + { + "epoch": 0.22000294811320756, + "grad_norm": 2.671875, + "learning_rate": 1.9408794620599248e-05, + "loss": 2.173, + "step": 11940 + }, + { + "epoch": 0.22037146226415094, + "grad_norm": 2.71875, + "learning_rate": 1.9406832202194787e-05, + "loss": 2.2354, + "step": 11960 + }, + { + "epoch": 0.22073997641509435, + "grad_norm": 2.65625, + "learning_rate": 1.9404866631750337e-05, + "loss": 2.1934, + "step": 11980 + }, + { + "epoch": 0.22110849056603774, + "grad_norm": 2.578125, + "learning_rate": 1.9402897909924526e-05, + "loss": 2.1216, + "step": 12000 + }, + { + "epoch": 0.22147700471698112, + "grad_norm": 3.203125, + "learning_rate": 1.940092603737703e-05, + "loss": 2.2075, + "step": 12020 + }, + { + "epoch": 0.22184551886792453, + "grad_norm": 2.9375, + "learning_rate": 1.939895101476859e-05, + "loss": 2.1562, + "step": 12040 + }, + { + "epoch": 0.2222140330188679, + "grad_norm": 2.71875, + "learning_rate": 1.939697284276099e-05, + "loss": 2.1846, + "step": 12060 + }, + { + "epoch": 0.22258254716981132, + "grad_norm": 2.734375, + "learning_rate": 1.9394991522017076e-05, + "loss": 2.2001, + "step": 12080 + }, + { + "epoch": 0.2229510613207547, + "grad_norm": 2.78125, + "learning_rate": 1.939300705320075e-05, + "loss": 2.1834, + "step": 12100 + }, + { + "epoch": 0.22331957547169812, + "grad_norm": 2.515625, + "learning_rate": 1.9391019436976976e-05, + "loss": 2.1401, + "step": 12120 + }, + { + "epoch": 0.2236880896226415, + "grad_norm": 2.828125, + "learning_rate": 1.9389028674011746e-05, + "loss": 2.1861, + "step": 12140 + }, + { + "epoch": 0.2240566037735849, + "grad_norm": 3.5625, + "learning_rate": 1.938703476497214e-05, + "loss": 2.1592, + "step": 12160 + }, + { + "epoch": 0.2244251179245283, + "grad_norm": 3.125, + "learning_rate": 1.9385037710526272e-05, + "loss": 2.194, + "step": 12180 + }, + { + "epoch": 0.2247936320754717, + "grad_norm": 2.734375, + "learning_rate": 1.9383037511343316e-05, + "loss": 2.1809, + "step": 12200 + }, + { + "epoch": 0.2251621462264151, + "grad_norm": 2.78125, + "learning_rate": 1.9381034168093495e-05, + "loss": 2.1728, + "step": 12220 + }, + { + "epoch": 0.2255306603773585, + "grad_norm": 3.09375, + "learning_rate": 1.9379027681448085e-05, + "loss": 2.2559, + "step": 12240 + }, + { + "epoch": 0.22589917452830188, + "grad_norm": 2.671875, + "learning_rate": 1.937701805207943e-05, + "loss": 2.1879, + "step": 12260 + }, + { + "epoch": 0.2262676886792453, + "grad_norm": 3.203125, + "learning_rate": 1.937500528066091e-05, + "loss": 2.1747, + "step": 12280 + }, + { + "epoch": 0.22663620283018868, + "grad_norm": 2.796875, + "learning_rate": 1.937298936786697e-05, + "loss": 2.2168, + "step": 12300 + }, + { + "epoch": 0.2270047169811321, + "grad_norm": 2.75, + "learning_rate": 1.9370970314373093e-05, + "loss": 2.198, + "step": 12320 + }, + { + "epoch": 0.22737323113207547, + "grad_norm": 2.765625, + "learning_rate": 1.9368948120855825e-05, + "loss": 2.1662, + "step": 12340 + }, + { + "epoch": 0.22774174528301888, + "grad_norm": 2.875, + "learning_rate": 1.936692278799277e-05, + "loss": 2.1737, + "step": 12360 + }, + { + "epoch": 0.22811025943396226, + "grad_norm": 2.890625, + "learning_rate": 1.9364894316462566e-05, + "loss": 2.2166, + "step": 12380 + }, + { + "epoch": 0.22847877358490565, + "grad_norm": 2.703125, + "learning_rate": 1.9362862706944923e-05, + "loss": 2.1581, + "step": 12400 + }, + { + "epoch": 0.22884728773584906, + "grad_norm": 2.578125, + "learning_rate": 1.9360827960120584e-05, + "loss": 2.1962, + "step": 12420 + }, + { + "epoch": 0.22921580188679244, + "grad_norm": 2.859375, + "learning_rate": 1.9358790076671353e-05, + "loss": 2.1781, + "step": 12440 + }, + { + "epoch": 0.22958431603773585, + "grad_norm": 3.390625, + "learning_rate": 1.935674905728009e-05, + "loss": 2.171, + "step": 12460 + }, + { + "epoch": 0.22995283018867924, + "grad_norm": 2.640625, + "learning_rate": 1.9354704902630693e-05, + "loss": 2.1746, + "step": 12480 + }, + { + "epoch": 0.23032134433962265, + "grad_norm": 2.765625, + "learning_rate": 1.9352657613408115e-05, + "loss": 2.2096, + "step": 12500 + }, + { + "epoch": 0.23068985849056603, + "grad_norm": 3.203125, + "learning_rate": 1.9350607190298372e-05, + "loss": 2.2068, + "step": 12520 + }, + { + "epoch": 0.23105837264150944, + "grad_norm": 3.546875, + "learning_rate": 1.9348553633988512e-05, + "loss": 2.182, + "step": 12540 + }, + { + "epoch": 0.23142688679245282, + "grad_norm": 2.71875, + "learning_rate": 1.9346496945166638e-05, + "loss": 2.1932, + "step": 12560 + }, + { + "epoch": 0.23179540094339623, + "grad_norm": 2.671875, + "learning_rate": 1.9344437124521912e-05, + "loss": 2.1559, + "step": 12580 + }, + { + "epoch": 0.23216391509433962, + "grad_norm": 3.046875, + "learning_rate": 1.9342374172744535e-05, + "loss": 2.1569, + "step": 12600 + }, + { + "epoch": 0.23253242924528303, + "grad_norm": 3.0625, + "learning_rate": 1.9340308090525756e-05, + "loss": 2.1386, + "step": 12620 + }, + { + "epoch": 0.2329009433962264, + "grad_norm": 2.984375, + "learning_rate": 1.9338238878557885e-05, + "loss": 2.1713, + "step": 12640 + }, + { + "epoch": 0.23326945754716982, + "grad_norm": 2.921875, + "learning_rate": 1.9336166537534267e-05, + "loss": 2.1949, + "step": 12660 + }, + { + "epoch": 0.2336379716981132, + "grad_norm": 3.1875, + "learning_rate": 1.933409106814931e-05, + "loss": 2.1613, + "step": 12680 + }, + { + "epoch": 0.23400648584905662, + "grad_norm": 2.765625, + "learning_rate": 1.9332012471098455e-05, + "loss": 2.1366, + "step": 12700 + }, + { + "epoch": 0.234375, + "grad_norm": 2.609375, + "learning_rate": 1.93299307470782e-05, + "loss": 2.1564, + "step": 12720 + }, + { + "epoch": 0.23474351415094338, + "grad_norm": 2.96875, + "learning_rate": 1.9327845896786085e-05, + "loss": 2.1603, + "step": 12740 + }, + { + "epoch": 0.2351120283018868, + "grad_norm": 2.84375, + "learning_rate": 1.932575792092071e-05, + "loss": 2.1884, + "step": 12760 + }, + { + "epoch": 0.23548054245283018, + "grad_norm": 2.875, + "learning_rate": 1.9323666820181704e-05, + "loss": 2.1291, + "step": 12780 + }, + { + "epoch": 0.2358490566037736, + "grad_norm": 2.578125, + "learning_rate": 1.9321572595269763e-05, + "loss": 2.1912, + "step": 12800 + }, + { + "epoch": 0.23621757075471697, + "grad_norm": 3.125, + "learning_rate": 1.9319475246886613e-05, + "loss": 2.137, + "step": 12820 + }, + { + "epoch": 0.23658608490566038, + "grad_norm": 2.5625, + "learning_rate": 1.931737477573503e-05, + "loss": 2.1809, + "step": 12840 + }, + { + "epoch": 0.23695459905660377, + "grad_norm": 2.75, + "learning_rate": 1.9315271182518844e-05, + "loss": 2.1598, + "step": 12860 + }, + { + "epoch": 0.23732311320754718, + "grad_norm": 2.484375, + "learning_rate": 1.931316446794293e-05, + "loss": 2.1775, + "step": 12880 + }, + { + "epoch": 0.23769162735849056, + "grad_norm": 2.625, + "learning_rate": 1.93110546327132e-05, + "loss": 2.1766, + "step": 12900 + }, + { + "epoch": 0.23806014150943397, + "grad_norm": 3.015625, + "learning_rate": 1.930894167753662e-05, + "loss": 2.1767, + "step": 12920 + }, + { + "epoch": 0.23842865566037735, + "grad_norm": 3.0, + "learning_rate": 1.93068256031212e-05, + "loss": 2.1713, + "step": 12940 + }, + { + "epoch": 0.23879716981132076, + "grad_norm": 2.734375, + "learning_rate": 1.9304706410175987e-05, + "loss": 2.2024, + "step": 12960 + }, + { + "epoch": 0.23916568396226415, + "grad_norm": 2.921875, + "learning_rate": 1.9302584099411087e-05, + "loss": 2.1636, + "step": 12980 + }, + { + "epoch": 0.23953419811320756, + "grad_norm": 2.796875, + "learning_rate": 1.9300458671537642e-05, + "loss": 2.1515, + "step": 13000 + }, + { + "epoch": 0.23990271226415094, + "grad_norm": 2.984375, + "learning_rate": 1.929833012726784e-05, + "loss": 2.1901, + "step": 13020 + }, + { + "epoch": 0.24027122641509435, + "grad_norm": 2.734375, + "learning_rate": 1.9296198467314912e-05, + "loss": 2.187, + "step": 13040 + }, + { + "epoch": 0.24063974056603774, + "grad_norm": 2.734375, + "learning_rate": 1.9294063692393135e-05, + "loss": 2.2164, + "step": 13060 + }, + { + "epoch": 0.24100825471698112, + "grad_norm": 2.640625, + "learning_rate": 1.929192580321783e-05, + "loss": 2.1526, + "step": 13080 + }, + { + "epoch": 0.24137676886792453, + "grad_norm": 2.828125, + "learning_rate": 1.928978480050536e-05, + "loss": 2.1956, + "step": 13100 + }, + { + "epoch": 0.2417452830188679, + "grad_norm": 2.546875, + "learning_rate": 1.928764068497313e-05, + "loss": 2.1537, + "step": 13120 + }, + { + "epoch": 0.24211379716981132, + "grad_norm": 2.875, + "learning_rate": 1.928549345733959e-05, + "loss": 2.1991, + "step": 13140 + }, + { + "epoch": 0.2424823113207547, + "grad_norm": 2.8125, + "learning_rate": 1.928334311832424e-05, + "loss": 2.1916, + "step": 13160 + }, + { + "epoch": 0.24285082547169812, + "grad_norm": 2.828125, + "learning_rate": 1.9281189668647603e-05, + "loss": 2.1168, + "step": 13180 + }, + { + "epoch": 0.2432193396226415, + "grad_norm": 3.125, + "learning_rate": 1.927903310903127e-05, + "loss": 2.1565, + "step": 13200 + }, + { + "epoch": 0.2435878537735849, + "grad_norm": 3.015625, + "learning_rate": 1.927687344019785e-05, + "loss": 2.1816, + "step": 13220 + }, + { + "epoch": 0.2439563679245283, + "grad_norm": 2.75, + "learning_rate": 1.927471066287101e-05, + "loss": 2.1787, + "step": 13240 + }, + { + "epoch": 0.2443248820754717, + "grad_norm": 2.796875, + "learning_rate": 1.9272544777775447e-05, + "loss": 2.1878, + "step": 13260 + }, + { + "epoch": 0.2446933962264151, + "grad_norm": 2.828125, + "learning_rate": 1.9270375785636915e-05, + "loss": 2.1771, + "step": 13280 + }, + { + "epoch": 0.2450619103773585, + "grad_norm": 2.703125, + "learning_rate": 1.9268203687182195e-05, + "loss": 2.2025, + "step": 13300 + }, + { + "epoch": 0.24543042452830188, + "grad_norm": 3.125, + "learning_rate": 1.926602848313911e-05, + "loss": 2.1905, + "step": 13320 + }, + { + "epoch": 0.2457989386792453, + "grad_norm": 2.75, + "learning_rate": 1.926385017423653e-05, + "loss": 2.1505, + "step": 13340 + }, + { + "epoch": 0.24616745283018868, + "grad_norm": 2.546875, + "learning_rate": 1.9261668761204367e-05, + "loss": 2.1885, + "step": 13360 + }, + { + "epoch": 0.2465359669811321, + "grad_norm": 2.6875, + "learning_rate": 1.9259484244773562e-05, + "loss": 2.1688, + "step": 13380 + }, + { + "epoch": 0.24690448113207547, + "grad_norm": 2.75, + "learning_rate": 1.92572966256761e-05, + "loss": 2.1699, + "step": 13400 + }, + { + "epoch": 0.24727299528301888, + "grad_norm": 2.734375, + "learning_rate": 1.9255105904645016e-05, + "loss": 2.1857, + "step": 13420 + }, + { + "epoch": 0.24764150943396226, + "grad_norm": 2.953125, + "learning_rate": 1.9252912082414374e-05, + "loss": 2.1981, + "step": 13440 + }, + { + "epoch": 0.24801002358490565, + "grad_norm": 2.578125, + "learning_rate": 1.9250715159719276e-05, + "loss": 2.1907, + "step": 13460 + }, + { + "epoch": 0.24837853773584906, + "grad_norm": 2.796875, + "learning_rate": 1.9248515137295873e-05, + "loss": 2.1653, + "step": 13480 + }, + { + "epoch": 0.24874705188679244, + "grad_norm": 3.15625, + "learning_rate": 1.9246312015881344e-05, + "loss": 2.2101, + "step": 13500 + }, + { + "epoch": 0.24911556603773585, + "grad_norm": 2.828125, + "learning_rate": 1.924410579621391e-05, + "loss": 2.206, + "step": 13520 + }, + { + "epoch": 0.24948408018867924, + "grad_norm": 2.890625, + "learning_rate": 1.924189647903283e-05, + "loss": 2.156, + "step": 13540 + }, + { + "epoch": 0.24985259433962265, + "grad_norm": 2.8125, + "learning_rate": 1.9239684065078406e-05, + "loss": 2.1212, + "step": 13560 + }, + { + "epoch": 0.25022110849056606, + "grad_norm": 2.984375, + "learning_rate": 1.923746855509197e-05, + "loss": 2.1842, + "step": 13580 + }, + { + "epoch": 0.2505896226415094, + "grad_norm": 2.71875, + "learning_rate": 1.92352499498159e-05, + "loss": 2.1492, + "step": 13600 + }, + { + "epoch": 0.2509581367924528, + "grad_norm": 3.140625, + "learning_rate": 1.9233028249993592e-05, + "loss": 2.1803, + "step": 13620 + }, + { + "epoch": 0.25132665094339623, + "grad_norm": 3.90625, + "learning_rate": 1.9230803456369508e-05, + "loss": 2.1996, + "step": 13640 + }, + { + "epoch": 0.25169516509433965, + "grad_norm": 3.421875, + "learning_rate": 1.9228575569689128e-05, + "loss": 2.1902, + "step": 13660 + }, + { + "epoch": 0.252063679245283, + "grad_norm": 2.546875, + "learning_rate": 1.9226344590698968e-05, + "loss": 2.1778, + "step": 13680 + }, + { + "epoch": 0.2524321933962264, + "grad_norm": 3.0, + "learning_rate": 1.9224110520146584e-05, + "loss": 2.1767, + "step": 13700 + }, + { + "epoch": 0.2528007075471698, + "grad_norm": 3.203125, + "learning_rate": 1.922187335878057e-05, + "loss": 2.1827, + "step": 13720 + }, + { + "epoch": 0.25316922169811323, + "grad_norm": 2.640625, + "learning_rate": 1.921963310735056e-05, + "loss": 2.171, + "step": 13740 + }, + { + "epoch": 0.2535377358490566, + "grad_norm": 3.140625, + "learning_rate": 1.9217389766607206e-05, + "loss": 2.2012, + "step": 13760 + }, + { + "epoch": 0.25390625, + "grad_norm": 2.921875, + "learning_rate": 1.921514333730221e-05, + "loss": 2.2255, + "step": 13780 + }, + { + "epoch": 0.2542747641509434, + "grad_norm": 2.796875, + "learning_rate": 1.921289382018831e-05, + "loss": 2.1376, + "step": 13800 + }, + { + "epoch": 0.25464327830188677, + "grad_norm": 2.9375, + "learning_rate": 1.9210641216019267e-05, + "loss": 2.1376, + "step": 13820 + }, + { + "epoch": 0.2550117924528302, + "grad_norm": 2.703125, + "learning_rate": 1.920838552554989e-05, + "loss": 2.1492, + "step": 13840 + }, + { + "epoch": 0.2553803066037736, + "grad_norm": 3.171875, + "learning_rate": 1.920612674953601e-05, + "loss": 2.1431, + "step": 13860 + }, + { + "epoch": 0.255748820754717, + "grad_norm": 2.734375, + "learning_rate": 1.92038648887345e-05, + "loss": 2.1595, + "step": 13880 + }, + { + "epoch": 0.25611733490566035, + "grad_norm": 2.984375, + "learning_rate": 1.9201599943903264e-05, + "loss": 2.1492, + "step": 13900 + }, + { + "epoch": 0.25648584905660377, + "grad_norm": 3.21875, + "learning_rate": 1.9199331915801237e-05, + "loss": 2.1742, + "step": 13920 + }, + { + "epoch": 0.2568543632075472, + "grad_norm": 3.5625, + "learning_rate": 1.9197060805188395e-05, + "loss": 2.1794, + "step": 13940 + }, + { + "epoch": 0.2572228773584906, + "grad_norm": 2.84375, + "learning_rate": 1.9194786612825735e-05, + "loss": 2.1234, + "step": 13960 + }, + { + "epoch": 0.25759139150943394, + "grad_norm": 2.90625, + "learning_rate": 1.9192509339475295e-05, + "loss": 2.1943, + "step": 13980 + }, + { + "epoch": 0.25795990566037735, + "grad_norm": 2.84375, + "learning_rate": 1.9190228985900145e-05, + "loss": 2.197, + "step": 14000 + }, + { + "epoch": 0.25832841981132076, + "grad_norm": 2.8125, + "learning_rate": 1.9187945552864384e-05, + "loss": 2.1698, + "step": 14020 + }, + { + "epoch": 0.2586969339622642, + "grad_norm": 3.109375, + "learning_rate": 1.9185659041133146e-05, + "loss": 2.1901, + "step": 14040 + }, + { + "epoch": 0.25906544811320753, + "grad_norm": 2.953125, + "learning_rate": 1.9183369451472595e-05, + "loss": 2.1671, + "step": 14060 + }, + { + "epoch": 0.25943396226415094, + "grad_norm": 3.015625, + "learning_rate": 1.9181076784649922e-05, + "loss": 2.1326, + "step": 14080 + }, + { + "epoch": 0.25980247641509435, + "grad_norm": 3.171875, + "learning_rate": 1.917878104143336e-05, + "loss": 2.1916, + "step": 14100 + }, + { + "epoch": 0.26017099056603776, + "grad_norm": 2.921875, + "learning_rate": 1.9176482222592163e-05, + "loss": 2.1864, + "step": 14120 + }, + { + "epoch": 0.2605395047169811, + "grad_norm": 2.9375, + "learning_rate": 1.9174180328896617e-05, + "loss": 2.2096, + "step": 14140 + }, + { + "epoch": 0.26090801886792453, + "grad_norm": 3.171875, + "learning_rate": 1.9171875361118047e-05, + "loss": 2.1361, + "step": 14160 + }, + { + "epoch": 0.26127653301886794, + "grad_norm": 3.09375, + "learning_rate": 1.9169567320028797e-05, + "loss": 2.1914, + "step": 14180 + }, + { + "epoch": 0.2616450471698113, + "grad_norm": 2.59375, + "learning_rate": 1.916725620640224e-05, + "loss": 2.1943, + "step": 14200 + }, + { + "epoch": 0.2620135613207547, + "grad_norm": 2.65625, + "learning_rate": 1.9164942021012792e-05, + "loss": 2.1549, + "step": 14220 + }, + { + "epoch": 0.2623820754716981, + "grad_norm": 2.828125, + "learning_rate": 1.9162624764635883e-05, + "loss": 2.1648, + "step": 14240 + }, + { + "epoch": 0.26275058962264153, + "grad_norm": 3.1875, + "learning_rate": 1.9160304438047986e-05, + "loss": 2.1784, + "step": 14260 + }, + { + "epoch": 0.2631191037735849, + "grad_norm": 3.171875, + "learning_rate": 1.915798104202659e-05, + "loss": 2.1763, + "step": 14280 + }, + { + "epoch": 0.2634876179245283, + "grad_norm": 2.90625, + "learning_rate": 1.915565457735022e-05, + "loss": 2.1553, + "step": 14300 + }, + { + "epoch": 0.2638561320754717, + "grad_norm": 2.75, + "learning_rate": 1.9153325044798428e-05, + "loss": 2.1448, + "step": 14320 + }, + { + "epoch": 0.2642246462264151, + "grad_norm": 2.875, + "learning_rate": 1.9150992445151794e-05, + "loss": 2.1435, + "step": 14340 + }, + { + "epoch": 0.26459316037735847, + "grad_norm": 2.671875, + "learning_rate": 1.9148656779191922e-05, + "loss": 2.1554, + "step": 14360 + }, + { + "epoch": 0.2649616745283019, + "grad_norm": 2.859375, + "learning_rate": 1.914631804770145e-05, + "loss": 2.1896, + "step": 14380 + }, + { + "epoch": 0.2653301886792453, + "grad_norm": 2.9375, + "learning_rate": 1.9143976251464037e-05, + "loss": 2.1833, + "step": 14400 + }, + { + "epoch": 0.2656987028301887, + "grad_norm": 2.9375, + "learning_rate": 1.9141631391264374e-05, + "loss": 2.1638, + "step": 14420 + }, + { + "epoch": 0.26606721698113206, + "grad_norm": 3.1875, + "learning_rate": 1.9139283467888178e-05, + "loss": 2.1604, + "step": 14440 + }, + { + "epoch": 0.26643573113207547, + "grad_norm": 2.71875, + "learning_rate": 1.913693248212219e-05, + "loss": 2.1643, + "step": 14460 + }, + { + "epoch": 0.2668042452830189, + "grad_norm": 3.859375, + "learning_rate": 1.913457843475417e-05, + "loss": 2.1358, + "step": 14480 + }, + { + "epoch": 0.26717275943396224, + "grad_norm": 3.1875, + "learning_rate": 1.9132221326572923e-05, + "loss": 2.1575, + "step": 14500 + }, + { + "epoch": 0.26754127358490565, + "grad_norm": 2.84375, + "learning_rate": 1.9129861158368264e-05, + "loss": 2.1842, + "step": 14520 + }, + { + "epoch": 0.26790978773584906, + "grad_norm": 2.8125, + "learning_rate": 1.9127497930931037e-05, + "loss": 2.1555, + "step": 14540 + }, + { + "epoch": 0.26827830188679247, + "grad_norm": 3.046875, + "learning_rate": 1.9125131645053112e-05, + "loss": 2.2042, + "step": 14560 + }, + { + "epoch": 0.2686468160377358, + "grad_norm": 2.953125, + "learning_rate": 1.9122762301527384e-05, + "loss": 2.1665, + "step": 14580 + }, + { + "epoch": 0.26901533018867924, + "grad_norm": 2.921875, + "learning_rate": 1.9120389901147772e-05, + "loss": 2.1274, + "step": 14600 + }, + { + "epoch": 0.26938384433962265, + "grad_norm": 2.78125, + "learning_rate": 1.9118014444709225e-05, + "loss": 2.1558, + "step": 14620 + }, + { + "epoch": 0.26975235849056606, + "grad_norm": 2.953125, + "learning_rate": 1.9115635933007702e-05, + "loss": 2.1567, + "step": 14640 + }, + { + "epoch": 0.2701208726415094, + "grad_norm": 2.90625, + "learning_rate": 1.9113254366840198e-05, + "loss": 2.1372, + "step": 14660 + }, + { + "epoch": 0.2704893867924528, + "grad_norm": 2.5625, + "learning_rate": 1.911086974700473e-05, + "loss": 2.1078, + "step": 14680 + }, + { + "epoch": 0.27085790094339623, + "grad_norm": 2.9375, + "learning_rate": 1.9108482074300333e-05, + "loss": 2.1708, + "step": 14700 + }, + { + "epoch": 0.27122641509433965, + "grad_norm": 3.109375, + "learning_rate": 1.910609134952707e-05, + "loss": 2.137, + "step": 14720 + }, + { + "epoch": 0.271594929245283, + "grad_norm": 2.84375, + "learning_rate": 1.9103697573486022e-05, + "loss": 2.1504, + "step": 14740 + }, + { + "epoch": 0.2719634433962264, + "grad_norm": 3.171875, + "learning_rate": 1.9101300746979297e-05, + "loss": 2.1509, + "step": 14760 + }, + { + "epoch": 0.2723319575471698, + "grad_norm": 2.578125, + "learning_rate": 1.9098900870810024e-05, + "loss": 2.146, + "step": 14780 + }, + { + "epoch": 0.27270047169811323, + "grad_norm": 3.21875, + "learning_rate": 1.9096497945782353e-05, + "loss": 2.184, + "step": 14800 + }, + { + "epoch": 0.2730689858490566, + "grad_norm": 3.203125, + "learning_rate": 1.9094091972701455e-05, + "loss": 2.1656, + "step": 14820 + }, + { + "epoch": 0.2734375, + "grad_norm": 2.828125, + "learning_rate": 1.9091682952373525e-05, + "loss": 2.1637, + "step": 14840 + }, + { + "epoch": 0.2738060141509434, + "grad_norm": 2.953125, + "learning_rate": 1.9089270885605776e-05, + "loss": 2.1594, + "step": 14860 + }, + { + "epoch": 0.27417452830188677, + "grad_norm": 2.9375, + "learning_rate": 1.908685577320644e-05, + "loss": 2.1719, + "step": 14880 + }, + { + "epoch": 0.2745430424528302, + "grad_norm": 3.203125, + "learning_rate": 1.9084437615984773e-05, + "loss": 2.1709, + "step": 14900 + }, + { + "epoch": 0.2749115566037736, + "grad_norm": 2.78125, + "learning_rate": 1.9082016414751057e-05, + "loss": 2.1721, + "step": 14920 + }, + { + "epoch": 0.275280070754717, + "grad_norm": 2.609375, + "learning_rate": 1.907959217031658e-05, + "loss": 2.1734, + "step": 14940 + }, + { + "epoch": 0.27564858490566035, + "grad_norm": 3.09375, + "learning_rate": 1.9077164883493663e-05, + "loss": 2.1863, + "step": 14960 + }, + { + "epoch": 0.27601709905660377, + "grad_norm": 3.171875, + "learning_rate": 1.907473455509564e-05, + "loss": 2.1571, + "step": 14980 + }, + { + "epoch": 0.2763856132075472, + "grad_norm": 2.65625, + "learning_rate": 1.9072301185936864e-05, + "loss": 2.1802, + "step": 15000 + }, + { + "epoch": 0.2767541273584906, + "grad_norm": 2.859375, + "learning_rate": 1.9069864776832707e-05, + "loss": 2.19, + "step": 15020 + }, + { + "epoch": 0.27712264150943394, + "grad_norm": 2.90625, + "learning_rate": 1.906742532859956e-05, + "loss": 2.1843, + "step": 15040 + }, + { + "epoch": 0.27749115566037735, + "grad_norm": 2.78125, + "learning_rate": 1.9064982842054835e-05, + "loss": 2.1317, + "step": 15060 + }, + { + "epoch": 0.27785966981132076, + "grad_norm": 2.625, + "learning_rate": 1.9062537318016962e-05, + "loss": 2.1768, + "step": 15080 + }, + { + "epoch": 0.2782281839622642, + "grad_norm": 3.421875, + "learning_rate": 1.9060088757305383e-05, + "loss": 2.1304, + "step": 15100 + }, + { + "epoch": 0.27859669811320753, + "grad_norm": 2.75, + "learning_rate": 1.9057637160740565e-05, + "loss": 2.1623, + "step": 15120 + }, + { + "epoch": 0.27896521226415094, + "grad_norm": 2.90625, + "learning_rate": 1.9055182529143983e-05, + "loss": 2.1323, + "step": 15140 + }, + { + "epoch": 0.27933372641509435, + "grad_norm": 2.734375, + "learning_rate": 1.905272486333814e-05, + "loss": 2.1543, + "step": 15160 + }, + { + "epoch": 0.27970224056603776, + "grad_norm": 3.65625, + "learning_rate": 1.9050264164146546e-05, + "loss": 2.1516, + "step": 15180 + }, + { + "epoch": 0.2800707547169811, + "grad_norm": 2.90625, + "learning_rate": 1.9047800432393735e-05, + "loss": 2.1678, + "step": 15200 + }, + { + "epoch": 0.28043926886792453, + "grad_norm": 3.09375, + "learning_rate": 1.9045333668905258e-05, + "loss": 2.1892, + "step": 15220 + }, + { + "epoch": 0.28080778301886794, + "grad_norm": 3.0625, + "learning_rate": 1.904286387450767e-05, + "loss": 2.1769, + "step": 15240 + }, + { + "epoch": 0.2811762971698113, + "grad_norm": 3.0, + "learning_rate": 1.904039105002855e-05, + "loss": 2.1437, + "step": 15260 + }, + { + "epoch": 0.2815448113207547, + "grad_norm": 2.796875, + "learning_rate": 1.9037915196296495e-05, + "loss": 2.1227, + "step": 15280 + }, + { + "epoch": 0.2819133254716981, + "grad_norm": 2.890625, + "learning_rate": 1.9035436314141114e-05, + "loss": 2.1997, + "step": 15300 + }, + { + "epoch": 0.28228183962264153, + "grad_norm": 2.890625, + "learning_rate": 1.9032954404393024e-05, + "loss": 2.1561, + "step": 15320 + }, + { + "epoch": 0.2826503537735849, + "grad_norm": 2.75, + "learning_rate": 1.9030469467883875e-05, + "loss": 2.1522, + "step": 15340 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 2.828125, + "learning_rate": 1.9027981505446307e-05, + "loss": 2.181, + "step": 15360 + }, + { + "epoch": 0.2833873820754717, + "grad_norm": 2.890625, + "learning_rate": 1.9025490517913995e-05, + "loss": 2.19, + "step": 15380 + }, + { + "epoch": 0.2837558962264151, + "grad_norm": 2.703125, + "learning_rate": 1.902299650612161e-05, + "loss": 2.163, + "step": 15400 + }, + { + "epoch": 0.28412441037735847, + "grad_norm": 2.625, + "learning_rate": 1.9020499470904853e-05, + "loss": 2.1437, + "step": 15420 + }, + { + "epoch": 0.2844929245283019, + "grad_norm": 2.625, + "learning_rate": 1.901799941310043e-05, + "loss": 2.168, + "step": 15440 + }, + { + "epoch": 0.2848614386792453, + "grad_norm": 3.078125, + "learning_rate": 1.9015496333546054e-05, + "loss": 2.1434, + "step": 15460 + }, + { + "epoch": 0.2852299528301887, + "grad_norm": 2.8125, + "learning_rate": 1.901299023308046e-05, + "loss": 2.1587, + "step": 15480 + }, + { + "epoch": 0.28559846698113206, + "grad_norm": 2.875, + "learning_rate": 1.9010481112543392e-05, + "loss": 2.1512, + "step": 15500 + }, + { + "epoch": 0.28596698113207547, + "grad_norm": 2.78125, + "learning_rate": 1.9007968972775606e-05, + "loss": 2.1489, + "step": 15520 + }, + { + "epoch": 0.2863354952830189, + "grad_norm": 2.921875, + "learning_rate": 1.9005453814618868e-05, + "loss": 2.1382, + "step": 15540 + }, + { + "epoch": 0.28670400943396224, + "grad_norm": 2.734375, + "learning_rate": 1.900293563891596e-05, + "loss": 2.1607, + "step": 15560 + }, + { + "epoch": 0.28707252358490565, + "grad_norm": 2.953125, + "learning_rate": 1.9000414446510664e-05, + "loss": 2.153, + "step": 15580 + }, + { + "epoch": 0.28744103773584906, + "grad_norm": 3.140625, + "learning_rate": 1.899789023824779e-05, + "loss": 2.1512, + "step": 15600 + }, + { + "epoch": 0.28780955188679247, + "grad_norm": 3.125, + "learning_rate": 1.8995363014973144e-05, + "loss": 2.158, + "step": 15620 + }, + { + "epoch": 0.2881780660377358, + "grad_norm": 2.8125, + "learning_rate": 1.899283277753355e-05, + "loss": 2.1615, + "step": 15640 + }, + { + "epoch": 0.28854658018867924, + "grad_norm": 3.125, + "learning_rate": 1.899029952677684e-05, + "loss": 2.1297, + "step": 15660 + }, + { + "epoch": 0.28891509433962265, + "grad_norm": 3.53125, + "learning_rate": 1.8987763263551855e-05, + "loss": 2.1461, + "step": 15680 + }, + { + "epoch": 0.28928360849056606, + "grad_norm": 3.421875, + "learning_rate": 1.8985223988708442e-05, + "loss": 2.1371, + "step": 15700 + }, + { + "epoch": 0.2896521226415094, + "grad_norm": 2.828125, + "learning_rate": 1.8982681703097467e-05, + "loss": 2.1224, + "step": 15720 + }, + { + "epoch": 0.2900206367924528, + "grad_norm": 2.796875, + "learning_rate": 1.8980136407570794e-05, + "loss": 2.1418, + "step": 15740 + }, + { + "epoch": 0.29038915094339623, + "grad_norm": 3.25, + "learning_rate": 1.8977588102981303e-05, + "loss": 2.1667, + "step": 15760 + }, + { + "epoch": 0.29075766509433965, + "grad_norm": 2.8125, + "learning_rate": 1.8975036790182876e-05, + "loss": 2.1816, + "step": 15780 + }, + { + "epoch": 0.291126179245283, + "grad_norm": 2.921875, + "learning_rate": 1.897248247003041e-05, + "loss": 2.1645, + "step": 15800 + }, + { + "epoch": 0.2914946933962264, + "grad_norm": 3.078125, + "learning_rate": 1.8969925143379806e-05, + "loss": 2.1502, + "step": 15820 + }, + { + "epoch": 0.2918632075471698, + "grad_norm": 3.109375, + "learning_rate": 1.896736481108797e-05, + "loss": 2.1501, + "step": 15840 + }, + { + "epoch": 0.29223172169811323, + "grad_norm": 2.796875, + "learning_rate": 1.896480147401282e-05, + "loss": 2.1455, + "step": 15860 + }, + { + "epoch": 0.2926002358490566, + "grad_norm": 2.984375, + "learning_rate": 1.896223513301328e-05, + "loss": 2.1757, + "step": 15880 + }, + { + "epoch": 0.29296875, + "grad_norm": 3.109375, + "learning_rate": 1.8959665788949274e-05, + "loss": 2.1702, + "step": 15900 + }, + { + "epoch": 0.2933372641509434, + "grad_norm": 2.984375, + "learning_rate": 1.8957093442681742e-05, + "loss": 2.1361, + "step": 15920 + }, + { + "epoch": 0.29370577830188677, + "grad_norm": 2.75, + "learning_rate": 1.8954518095072627e-05, + "loss": 2.1688, + "step": 15940 + }, + { + "epoch": 0.2940742924528302, + "grad_norm": 3.28125, + "learning_rate": 1.895193974698487e-05, + "loss": 2.1378, + "step": 15960 + }, + { + "epoch": 0.2944428066037736, + "grad_norm": 3.140625, + "learning_rate": 1.8949358399282428e-05, + "loss": 2.1559, + "step": 15980 + }, + { + "epoch": 0.294811320754717, + "grad_norm": 3.53125, + "learning_rate": 1.8946774052830257e-05, + "loss": 2.2027, + "step": 16000 + }, + { + "epoch": 0.29517983490566035, + "grad_norm": 2.75, + "learning_rate": 1.8944186708494318e-05, + "loss": 2.1607, + "step": 16020 + }, + { + "epoch": 0.29554834905660377, + "grad_norm": 2.640625, + "learning_rate": 1.894159636714158e-05, + "loss": 2.2009, + "step": 16040 + }, + { + "epoch": 0.2959168632075472, + "grad_norm": 2.78125, + "learning_rate": 1.8939003029640018e-05, + "loss": 2.153, + "step": 16060 + }, + { + "epoch": 0.2962853773584906, + "grad_norm": 2.96875, + "learning_rate": 1.8936406696858597e-05, + "loss": 2.1766, + "step": 16080 + }, + { + "epoch": 0.29665389150943394, + "grad_norm": 2.609375, + "learning_rate": 1.8933807369667307e-05, + "loss": 2.1287, + "step": 16100 + }, + { + "epoch": 0.29702240566037735, + "grad_norm": 2.953125, + "learning_rate": 1.8931205048937125e-05, + "loss": 2.1777, + "step": 16120 + }, + { + "epoch": 0.29739091981132076, + "grad_norm": 2.84375, + "learning_rate": 1.8928599735540034e-05, + "loss": 2.152, + "step": 16140 + }, + { + "epoch": 0.2977594339622642, + "grad_norm": 2.828125, + "learning_rate": 1.8925991430349024e-05, + "loss": 2.1945, + "step": 16160 + }, + { + "epoch": 0.29812794811320753, + "grad_norm": 3.03125, + "learning_rate": 1.892338013423809e-05, + "loss": 2.1513, + "step": 16180 + }, + { + "epoch": 0.29849646226415094, + "grad_norm": 3.0, + "learning_rate": 1.8920765848082216e-05, + "loss": 2.1607, + "step": 16200 + }, + { + "epoch": 0.29886497641509435, + "grad_norm": 2.84375, + "learning_rate": 1.8918148572757403e-05, + "loss": 2.1527, + "step": 16220 + }, + { + "epoch": 0.29923349056603776, + "grad_norm": 2.890625, + "learning_rate": 1.8915528309140648e-05, + "loss": 2.117, + "step": 16240 + }, + { + "epoch": 0.2996020047169811, + "grad_norm": 2.59375, + "learning_rate": 1.8912905058109943e-05, + "loss": 2.1877, + "step": 16260 + }, + { + "epoch": 0.29997051886792453, + "grad_norm": 2.78125, + "learning_rate": 1.891027882054429e-05, + "loss": 2.1822, + "step": 16280 + }, + { + "epoch": 0.30033903301886794, + "grad_norm": 2.71875, + "learning_rate": 1.8907649597323686e-05, + "loss": 2.1328, + "step": 16300 + }, + { + "epoch": 0.3007075471698113, + "grad_norm": 3.28125, + "learning_rate": 1.8905017389329136e-05, + "loss": 2.1326, + "step": 16320 + }, + { + "epoch": 0.3010760613207547, + "grad_norm": 3.1875, + "learning_rate": 1.890238219744264e-05, + "loss": 2.1622, + "step": 16340 + }, + { + "epoch": 0.3014445754716981, + "grad_norm": 3.0, + "learning_rate": 1.8899744022547185e-05, + "loss": 2.1617, + "step": 16360 + }, + { + "epoch": 0.30181308962264153, + "grad_norm": 2.96875, + "learning_rate": 1.8897102865526785e-05, + "loss": 2.1641, + "step": 16380 + }, + { + "epoch": 0.3021816037735849, + "grad_norm": 3.171875, + "learning_rate": 1.8894458727266434e-05, + "loss": 2.1654, + "step": 16400 + }, + { + "epoch": 0.3025501179245283, + "grad_norm": 2.671875, + "learning_rate": 1.8891811608652127e-05, + "loss": 2.1704, + "step": 16420 + }, + { + "epoch": 0.3029186320754717, + "grad_norm": 3.078125, + "learning_rate": 1.8889161510570863e-05, + "loss": 2.1336, + "step": 16440 + }, + { + "epoch": 0.3032871462264151, + "grad_norm": 2.859375, + "learning_rate": 1.8886508433910637e-05, + "loss": 2.1977, + "step": 16460 + }, + { + "epoch": 0.30365566037735847, + "grad_norm": 3.3125, + "learning_rate": 1.8883852379560433e-05, + "loss": 2.2209, + "step": 16480 + }, + { + "epoch": 0.3040241745283019, + "grad_norm": 2.890625, + "learning_rate": 1.8881193348410254e-05, + "loss": 2.1204, + "step": 16500 + }, + { + "epoch": 0.3043926886792453, + "grad_norm": 2.84375, + "learning_rate": 1.887853134135108e-05, + "loss": 2.1969, + "step": 16520 + }, + { + "epoch": 0.3047612028301887, + "grad_norm": 2.90625, + "learning_rate": 1.88758663592749e-05, + "loss": 2.171, + "step": 16540 + }, + { + "epoch": 0.30512971698113206, + "grad_norm": 3.390625, + "learning_rate": 1.887319840307469e-05, + "loss": 2.1667, + "step": 16560 + }, + { + "epoch": 0.30549823113207547, + "grad_norm": 2.78125, + "learning_rate": 1.8870527473644435e-05, + "loss": 2.1483, + "step": 16580 + }, + { + "epoch": 0.3058667452830189, + "grad_norm": 2.671875, + "learning_rate": 1.8867853571879103e-05, + "loss": 2.1885, + "step": 16600 + }, + { + "epoch": 0.30623525943396224, + "grad_norm": 2.625, + "learning_rate": 1.886517669867467e-05, + "loss": 2.1656, + "step": 16620 + }, + { + "epoch": 0.30660377358490565, + "grad_norm": 3.0, + "learning_rate": 1.88624968549281e-05, + "loss": 2.16, + "step": 16640 + }, + { + "epoch": 0.30697228773584906, + "grad_norm": 3.046875, + "learning_rate": 1.8859814041537358e-05, + "loss": 2.1618, + "step": 16660 + }, + { + "epoch": 0.30734080188679247, + "grad_norm": 2.734375, + "learning_rate": 1.8857128259401393e-05, + "loss": 2.1143, + "step": 16680 + }, + { + "epoch": 0.3077093160377358, + "grad_norm": 3.046875, + "learning_rate": 1.8854439509420167e-05, + "loss": 2.1811, + "step": 16700 + }, + { + "epoch": 0.30807783018867924, + "grad_norm": 2.78125, + "learning_rate": 1.8851747792494616e-05, + "loss": 2.1405, + "step": 16720 + }, + { + "epoch": 0.30844634433962265, + "grad_norm": 3.125, + "learning_rate": 1.8849053109526686e-05, + "loss": 2.15, + "step": 16740 + }, + { + "epoch": 0.30881485849056606, + "grad_norm": 2.71875, + "learning_rate": 1.884635546141931e-05, + "loss": 2.1865, + "step": 16760 + }, + { + "epoch": 0.3091833726415094, + "grad_norm": 3.171875, + "learning_rate": 1.8843654849076416e-05, + "loss": 2.186, + "step": 16780 + }, + { + "epoch": 0.3095518867924528, + "grad_norm": 3.234375, + "learning_rate": 1.884095127340292e-05, + "loss": 2.1652, + "step": 16800 + }, + { + "epoch": 0.30992040094339623, + "grad_norm": 2.921875, + "learning_rate": 1.8838244735304743e-05, + "loss": 2.1517, + "step": 16820 + }, + { + "epoch": 0.31028891509433965, + "grad_norm": 3.03125, + "learning_rate": 1.8835535235688786e-05, + "loss": 2.1291, + "step": 16840 + }, + { + "epoch": 0.310657429245283, + "grad_norm": 2.734375, + "learning_rate": 1.883282277546295e-05, + "loss": 2.1574, + "step": 16860 + }, + { + "epoch": 0.3110259433962264, + "grad_norm": 3.1875, + "learning_rate": 1.8830107355536127e-05, + "loss": 2.1717, + "step": 16880 + }, + { + "epoch": 0.3113944575471698, + "grad_norm": 2.890625, + "learning_rate": 1.8827388976818194e-05, + "loss": 2.1408, + "step": 16900 + }, + { + "epoch": 0.31176297169811323, + "grad_norm": 2.828125, + "learning_rate": 1.8824667640220037e-05, + "loss": 2.1629, + "step": 16920 + }, + { + "epoch": 0.3121314858490566, + "grad_norm": 2.84375, + "learning_rate": 1.8821943346653503e-05, + "loss": 2.1132, + "step": 16940 + }, + { + "epoch": 0.3125, + "grad_norm": 2.703125, + "learning_rate": 1.8819216097031464e-05, + "loss": 2.1376, + "step": 16960 + }, + { + "epoch": 0.3128685141509434, + "grad_norm": 2.75, + "learning_rate": 1.881648589226776e-05, + "loss": 2.1849, + "step": 16980 + }, + { + "epoch": 0.31323702830188677, + "grad_norm": 3.03125, + "learning_rate": 1.8813752733277227e-05, + "loss": 2.1329, + "step": 17000 + }, + { + "epoch": 0.3136055424528302, + "grad_norm": 2.890625, + "learning_rate": 1.8811016620975694e-05, + "loss": 2.1427, + "step": 17020 + }, + { + "epoch": 0.3139740566037736, + "grad_norm": 2.671875, + "learning_rate": 1.8808277556279972e-05, + "loss": 2.1706, + "step": 17040 + }, + { + "epoch": 0.314342570754717, + "grad_norm": 3.078125, + "learning_rate": 1.8805535540107873e-05, + "loss": 2.1697, + "step": 17060 + }, + { + "epoch": 0.31471108490566035, + "grad_norm": 3.4375, + "learning_rate": 1.8802790573378186e-05, + "loss": 2.1438, + "step": 17080 + }, + { + "epoch": 0.31507959905660377, + "grad_norm": 3.234375, + "learning_rate": 1.88000426570107e-05, + "loss": 2.182, + "step": 17100 + }, + { + "epoch": 0.3154481132075472, + "grad_norm": 3.578125, + "learning_rate": 1.879729179192618e-05, + "loss": 2.1375, + "step": 17120 + }, + { + "epoch": 0.3158166273584906, + "grad_norm": 3.125, + "learning_rate": 1.879453797904639e-05, + "loss": 2.1322, + "step": 17140 + }, + { + "epoch": 0.31618514150943394, + "grad_norm": 2.8125, + "learning_rate": 1.8791781219294075e-05, + "loss": 2.147, + "step": 17160 + }, + { + "epoch": 0.31655365566037735, + "grad_norm": 3.15625, + "learning_rate": 1.878902151359297e-05, + "loss": 2.1713, + "step": 17180 + }, + { + "epoch": 0.31692216981132076, + "grad_norm": 2.734375, + "learning_rate": 1.8786258862867796e-05, + "loss": 2.1771, + "step": 17200 + }, + { + "epoch": 0.3172906839622642, + "grad_norm": 2.828125, + "learning_rate": 1.8783493268044265e-05, + "loss": 2.1331, + "step": 17220 + }, + { + "epoch": 0.31765919811320753, + "grad_norm": 3.046875, + "learning_rate": 1.8780724730049065e-05, + "loss": 2.1921, + "step": 17240 + }, + { + "epoch": 0.31802771226415094, + "grad_norm": 2.859375, + "learning_rate": 1.8777953249809884e-05, + "loss": 2.1393, + "step": 17260 + }, + { + "epoch": 0.31839622641509435, + "grad_norm": 3.140625, + "learning_rate": 1.877517882825539e-05, + "loss": 2.1511, + "step": 17280 + }, + { + "epoch": 0.31876474056603776, + "grad_norm": 2.953125, + "learning_rate": 1.8772401466315232e-05, + "loss": 2.1816, + "step": 17300 + }, + { + "epoch": 0.3191332547169811, + "grad_norm": 3.03125, + "learning_rate": 1.8769621164920047e-05, + "loss": 2.1191, + "step": 17320 + }, + { + "epoch": 0.31950176886792453, + "grad_norm": 2.96875, + "learning_rate": 1.8766837925001465e-05, + "loss": 2.153, + "step": 17340 + }, + { + "epoch": 0.31987028301886794, + "grad_norm": 3.125, + "learning_rate": 1.8764051747492086e-05, + "loss": 2.1172, + "step": 17360 + }, + { + "epoch": 0.3202387971698113, + "grad_norm": 2.734375, + "learning_rate": 1.8761262633325503e-05, + "loss": 2.1495, + "step": 17380 + }, + { + "epoch": 0.3206073113207547, + "grad_norm": 3.390625, + "learning_rate": 1.8758470583436295e-05, + "loss": 2.1872, + "step": 17400 + }, + { + "epoch": 0.3209758254716981, + "grad_norm": 3.625, + "learning_rate": 1.875567559876002e-05, + "loss": 2.1244, + "step": 17420 + }, + { + "epoch": 0.32134433962264153, + "grad_norm": 3.109375, + "learning_rate": 1.8752877680233222e-05, + "loss": 2.1425, + "step": 17440 + }, + { + "epoch": 0.3217128537735849, + "grad_norm": 2.6875, + "learning_rate": 1.875007682879342e-05, + "loss": 2.1428, + "step": 17460 + }, + { + "epoch": 0.3220813679245283, + "grad_norm": 3.5, + "learning_rate": 1.874727304537914e-05, + "loss": 2.1357, + "step": 17480 + }, + { + "epoch": 0.3224498820754717, + "grad_norm": 2.703125, + "learning_rate": 1.8744466330929857e-05, + "loss": 2.0858, + "step": 17500 + }, + { + "epoch": 0.3228183962264151, + "grad_norm": 3.21875, + "learning_rate": 1.874165668638605e-05, + "loss": 2.1629, + "step": 17520 + }, + { + "epoch": 0.32318691037735847, + "grad_norm": 2.984375, + "learning_rate": 1.8738844112689176e-05, + "loss": 2.1865, + "step": 17540 + }, + { + "epoch": 0.3235554245283019, + "grad_norm": 3.265625, + "learning_rate": 1.873602861078167e-05, + "loss": 2.1169, + "step": 17560 + }, + { + "epoch": 0.3239239386792453, + "grad_norm": 3.4375, + "learning_rate": 1.873321018160695e-05, + "loss": 2.1662, + "step": 17580 + }, + { + "epoch": 0.3242924528301887, + "grad_norm": 3.53125, + "learning_rate": 1.8730388826109417e-05, + "loss": 2.1585, + "step": 17600 + }, + { + "epoch": 0.32466096698113206, + "grad_norm": 2.9375, + "learning_rate": 1.8727564545234446e-05, + "loss": 2.1407, + "step": 17620 + }, + { + "epoch": 0.32502948113207547, + "grad_norm": 2.71875, + "learning_rate": 1.87247373399284e-05, + "loss": 2.1182, + "step": 17640 + }, + { + "epoch": 0.3253979952830189, + "grad_norm": 3.484375, + "learning_rate": 1.872190721113862e-05, + "loss": 2.1546, + "step": 17660 + }, + { + "epoch": 0.32576650943396224, + "grad_norm": 3.046875, + "learning_rate": 1.871907415981342e-05, + "loss": 2.1697, + "step": 17680 + }, + { + "epoch": 0.32613502358490565, + "grad_norm": 2.734375, + "learning_rate": 1.8716238186902102e-05, + "loss": 2.1076, + "step": 17700 + }, + { + "epoch": 0.32650353773584906, + "grad_norm": 3.140625, + "learning_rate": 1.8713399293354943e-05, + "loss": 2.174, + "step": 17720 + }, + { + "epoch": 0.32687205188679247, + "grad_norm": 2.8125, + "learning_rate": 1.87105574801232e-05, + "loss": 2.1302, + "step": 17740 + }, + { + "epoch": 0.3272405660377358, + "grad_norm": 2.8125, + "learning_rate": 1.8707712748159103e-05, + "loss": 2.1603, + "step": 17760 + }, + { + "epoch": 0.32760908018867924, + "grad_norm": 2.984375, + "learning_rate": 1.870486509841587e-05, + "loss": 2.1416, + "step": 17780 + }, + { + "epoch": 0.32797759433962265, + "grad_norm": 3.484375, + "learning_rate": 1.8702014531847688e-05, + "loss": 2.1762, + "step": 17800 + }, + { + "epoch": 0.32834610849056606, + "grad_norm": 3.03125, + "learning_rate": 1.869916104940972e-05, + "loss": 2.1762, + "step": 17820 + }, + { + "epoch": 0.3287146226415094, + "grad_norm": 3.109375, + "learning_rate": 1.8696304652058123e-05, + "loss": 2.1353, + "step": 17840 + }, + { + "epoch": 0.3290831367924528, + "grad_norm": 2.921875, + "learning_rate": 1.8693445340750007e-05, + "loss": 2.1399, + "step": 17860 + }, + { + "epoch": 0.32945165094339623, + "grad_norm": 2.96875, + "learning_rate": 1.8690583116443473e-05, + "loss": 2.1186, + "step": 17880 + }, + { + "epoch": 0.32982016509433965, + "grad_norm": 2.9375, + "learning_rate": 1.86877179800976e-05, + "loss": 2.1271, + "step": 17900 + }, + { + "epoch": 0.330188679245283, + "grad_norm": 3.03125, + "learning_rate": 1.8684849932672425e-05, + "loss": 2.1335, + "step": 17920 + }, + { + "epoch": 0.3305571933962264, + "grad_norm": 2.828125, + "learning_rate": 1.8681978975128982e-05, + "loss": 2.1764, + "step": 17940 + }, + { + "epoch": 0.3309257075471698, + "grad_norm": 3.0, + "learning_rate": 1.8679105108429273e-05, + "loss": 2.1702, + "step": 17960 + }, + { + "epoch": 0.33129422169811323, + "grad_norm": 2.890625, + "learning_rate": 1.8676228333536268e-05, + "loss": 2.1537, + "step": 17980 + }, + { + "epoch": 0.3316627358490566, + "grad_norm": 3.15625, + "learning_rate": 1.8673348651413914e-05, + "loss": 2.1314, + "step": 18000 + }, + { + "epoch": 0.33203125, + "grad_norm": 3.484375, + "learning_rate": 1.8670466063027143e-05, + "loss": 2.147, + "step": 18020 + }, + { + "epoch": 0.3323997641509434, + "grad_norm": 2.859375, + "learning_rate": 1.8667580569341845e-05, + "loss": 2.1365, + "step": 18040 + }, + { + "epoch": 0.33276827830188677, + "grad_norm": 3.28125, + "learning_rate": 1.8664692171324895e-05, + "loss": 2.1773, + "step": 18060 + }, + { + "epoch": 0.3331367924528302, + "grad_norm": 2.984375, + "learning_rate": 1.8661800869944136e-05, + "loss": 2.1221, + "step": 18080 + }, + { + "epoch": 0.3335053066037736, + "grad_norm": 2.71875, + "learning_rate": 1.8658906666168382e-05, + "loss": 2.1838, + "step": 18100 + }, + { + "epoch": 0.333873820754717, + "grad_norm": 2.6875, + "learning_rate": 1.8656009560967424e-05, + "loss": 2.1181, + "step": 18120 + }, + { + "epoch": 0.33424233490566035, + "grad_norm": 2.875, + "learning_rate": 1.8653109555312023e-05, + "loss": 2.1232, + "step": 18140 + }, + { + "epoch": 0.33461084905660377, + "grad_norm": 2.734375, + "learning_rate": 1.8650206650173914e-05, + "loss": 2.1832, + "step": 18160 + }, + { + "epoch": 0.3349793632075472, + "grad_norm": 2.875, + "learning_rate": 1.8647300846525805e-05, + "loss": 2.1253, + "step": 18180 + }, + { + "epoch": 0.3353478773584906, + "grad_norm": 3.265625, + "learning_rate": 1.8644392145341362e-05, + "loss": 2.1543, + "step": 18200 + }, + { + "epoch": 0.33571639150943394, + "grad_norm": 3.109375, + "learning_rate": 1.8641480547595246e-05, + "loss": 2.1213, + "step": 18220 + }, + { + "epoch": 0.33608490566037735, + "grad_norm": 2.859375, + "learning_rate": 1.8638566054263067e-05, + "loss": 2.1502, + "step": 18240 + }, + { + "epoch": 0.33645341981132076, + "grad_norm": 2.828125, + "learning_rate": 1.8635648666321412e-05, + "loss": 2.1206, + "step": 18260 + }, + { + "epoch": 0.3368219339622642, + "grad_norm": 3.390625, + "learning_rate": 1.863272838474784e-05, + "loss": 2.1366, + "step": 18280 + }, + { + "epoch": 0.33719044811320753, + "grad_norm": 3.09375, + "learning_rate": 1.8629805210520877e-05, + "loss": 2.1422, + "step": 18300 + }, + { + "epoch": 0.33755896226415094, + "grad_norm": 3.078125, + "learning_rate": 1.8626879144620026e-05, + "loss": 2.1681, + "step": 18320 + }, + { + "epoch": 0.33792747641509435, + "grad_norm": 3.046875, + "learning_rate": 1.8623950188025746e-05, + "loss": 2.1426, + "step": 18340 + }, + { + "epoch": 0.33829599056603776, + "grad_norm": 2.859375, + "learning_rate": 1.8621018341719477e-05, + "loss": 2.1518, + "step": 18360 + }, + { + "epoch": 0.3386645047169811, + "grad_norm": 2.90625, + "learning_rate": 1.8618083606683615e-05, + "loss": 2.1708, + "step": 18380 + }, + { + "epoch": 0.33903301886792453, + "grad_norm": 3.015625, + "learning_rate": 1.8615145983901536e-05, + "loss": 2.089, + "step": 18400 + }, + { + "epoch": 0.33940153301886794, + "grad_norm": 2.9375, + "learning_rate": 1.861220547435758e-05, + "loss": 2.1611, + "step": 18420 + }, + { + "epoch": 0.3397700471698113, + "grad_norm": 3.125, + "learning_rate": 1.860926207903705e-05, + "loss": 2.1302, + "step": 18440 + }, + { + "epoch": 0.3401385613207547, + "grad_norm": 3.09375, + "learning_rate": 1.860631579892621e-05, + "loss": 2.1493, + "step": 18460 + }, + { + "epoch": 0.3405070754716981, + "grad_norm": 2.921875, + "learning_rate": 1.860336663501231e-05, + "loss": 2.1336, + "step": 18480 + }, + { + "epoch": 0.34087558962264153, + "grad_norm": 3.0625, + "learning_rate": 1.8600414588283553e-05, + "loss": 2.133, + "step": 18500 + }, + { + "epoch": 0.3412441037735849, + "grad_norm": 3.359375, + "learning_rate": 1.8597459659729108e-05, + "loss": 2.1998, + "step": 18520 + }, + { + "epoch": 0.3416126179245283, + "grad_norm": 2.75, + "learning_rate": 1.8594501850339116e-05, + "loss": 2.134, + "step": 18540 + }, + { + "epoch": 0.3419811320754717, + "grad_norm": 2.875, + "learning_rate": 1.8591541161104672e-05, + "loss": 2.1432, + "step": 18560 + }, + { + "epoch": 0.3423496462264151, + "grad_norm": 3.203125, + "learning_rate": 1.8588577593017853e-05, + "loss": 2.1442, + "step": 18580 + }, + { + "epoch": 0.34271816037735847, + "grad_norm": 2.828125, + "learning_rate": 1.8585611147071686e-05, + "loss": 2.1579, + "step": 18600 + }, + { + "epoch": 0.3430866745283019, + "grad_norm": 3.0, + "learning_rate": 1.8582641824260164e-05, + "loss": 2.1636, + "step": 18620 + }, + { + "epoch": 0.3434551886792453, + "grad_norm": 3.03125, + "learning_rate": 1.8579669625578253e-05, + "loss": 2.1174, + "step": 18640 + }, + { + "epoch": 0.3438237028301887, + "grad_norm": 3.171875, + "learning_rate": 1.8576694552021876e-05, + "loss": 2.1466, + "step": 18660 + }, + { + "epoch": 0.34419221698113206, + "grad_norm": 3.40625, + "learning_rate": 1.8573716604587918e-05, + "loss": 2.1458, + "step": 18680 + }, + { + "epoch": 0.34456073113207547, + "grad_norm": 3.375, + "learning_rate": 1.857073578427423e-05, + "loss": 2.1809, + "step": 18700 + }, + { + "epoch": 0.3449292452830189, + "grad_norm": 3.03125, + "learning_rate": 1.8567752092079624e-05, + "loss": 2.1821, + "step": 18720 + }, + { + "epoch": 0.34529775943396224, + "grad_norm": 2.796875, + "learning_rate": 1.8564765529003876e-05, + "loss": 2.1267, + "step": 18740 + }, + { + "epoch": 0.34566627358490565, + "grad_norm": 2.984375, + "learning_rate": 1.8561776096047722e-05, + "loss": 2.1451, + "step": 18760 + }, + { + "epoch": 0.34603478773584906, + "grad_norm": 2.890625, + "learning_rate": 1.8558783794212863e-05, + "loss": 2.1484, + "step": 18780 + }, + { + "epoch": 0.34640330188679247, + "grad_norm": 2.96875, + "learning_rate": 1.855578862450196e-05, + "loss": 2.1757, + "step": 18800 + }, + { + "epoch": 0.3467718160377358, + "grad_norm": 2.953125, + "learning_rate": 1.855279058791863e-05, + "loss": 2.116, + "step": 18820 + }, + { + "epoch": 0.34714033018867924, + "grad_norm": 3.421875, + "learning_rate": 1.8549789685467457e-05, + "loss": 2.1225, + "step": 18840 + }, + { + "epoch": 0.34750884433962265, + "grad_norm": 2.828125, + "learning_rate": 1.8546785918153983e-05, + "loss": 2.1317, + "step": 18860 + }, + { + "epoch": 0.34787735849056606, + "grad_norm": 2.875, + "learning_rate": 1.854377928698471e-05, + "loss": 2.1367, + "step": 18880 + }, + { + "epoch": 0.3482458726415094, + "grad_norm": 2.734375, + "learning_rate": 1.85407697929671e-05, + "loss": 2.1493, + "step": 18900 + }, + { + "epoch": 0.3486143867924528, + "grad_norm": 2.859375, + "learning_rate": 1.853775743710957e-05, + "loss": 2.1496, + "step": 18920 + }, + { + "epoch": 0.34898290094339623, + "grad_norm": 2.875, + "learning_rate": 1.8534742220421506e-05, + "loss": 2.1709, + "step": 18940 + }, + { + "epoch": 0.34935141509433965, + "grad_norm": 3.203125, + "learning_rate": 1.853172414391324e-05, + "loss": 2.1609, + "step": 18960 + }, + { + "epoch": 0.349719929245283, + "grad_norm": 3.59375, + "learning_rate": 1.8528703208596072e-05, + "loss": 2.1435, + "step": 18980 + }, + { + "epoch": 0.3500884433962264, + "grad_norm": 3.046875, + "learning_rate": 1.8525679415482258e-05, + "loss": 2.1527, + "step": 19000 + }, + { + "epoch": 0.3504569575471698, + "grad_norm": 3.015625, + "learning_rate": 1.8522652765585007e-05, + "loss": 2.1373, + "step": 19020 + }, + { + "epoch": 0.35082547169811323, + "grad_norm": 2.875, + "learning_rate": 1.851962325991849e-05, + "loss": 2.1219, + "step": 19040 + }, + { + "epoch": 0.3511939858490566, + "grad_norm": 3.1875, + "learning_rate": 1.8516590899497833e-05, + "loss": 2.1429, + "step": 19060 + }, + { + "epoch": 0.3515625, + "grad_norm": 3.1875, + "learning_rate": 1.8513555685339123e-05, + "loss": 2.1246, + "step": 19080 + }, + { + "epoch": 0.3519310141509434, + "grad_norm": 3.59375, + "learning_rate": 1.851051761845939e-05, + "loss": 2.1342, + "step": 19100 + }, + { + "epoch": 0.35229952830188677, + "grad_norm": 3.109375, + "learning_rate": 1.850747669987664e-05, + "loss": 2.1436, + "step": 19120 + }, + { + "epoch": 0.3526680424528302, + "grad_norm": 2.890625, + "learning_rate": 1.8504432930609813e-05, + "loss": 2.1539, + "step": 19140 + }, + { + "epoch": 0.3530365566037736, + "grad_norm": 3.109375, + "learning_rate": 1.850138631167882e-05, + "loss": 2.1323, + "step": 19160 + }, + { + "epoch": 0.353405070754717, + "grad_norm": 3.03125, + "learning_rate": 1.8498336844104525e-05, + "loss": 2.1585, + "step": 19180 + }, + { + "epoch": 0.35377358490566035, + "grad_norm": 3.140625, + "learning_rate": 1.8495284528908733e-05, + "loss": 2.1624, + "step": 19200 + }, + { + "epoch": 0.35414209905660377, + "grad_norm": 3.75, + "learning_rate": 1.8492229367114225e-05, + "loss": 2.1667, + "step": 19220 + }, + { + "epoch": 0.3545106132075472, + "grad_norm": 3.046875, + "learning_rate": 1.8489171359744715e-05, + "loss": 2.1319, + "step": 19240 + }, + { + "epoch": 0.3548791273584906, + "grad_norm": 2.953125, + "learning_rate": 1.8486110507824885e-05, + "loss": 2.1467, + "step": 19260 + }, + { + "epoch": 0.35524764150943394, + "grad_norm": 3.125, + "learning_rate": 1.8483046812380367e-05, + "loss": 2.1388, + "step": 19280 + }, + { + "epoch": 0.35561615566037735, + "grad_norm": 2.96875, + "learning_rate": 1.8479980274437734e-05, + "loss": 2.1106, + "step": 19300 + }, + { + "epoch": 0.35598466981132076, + "grad_norm": 3.046875, + "learning_rate": 1.847691089502453e-05, + "loss": 2.1358, + "step": 19320 + }, + { + "epoch": 0.3563531839622642, + "grad_norm": 14.1875, + "learning_rate": 1.847383867516924e-05, + "loss": 2.1524, + "step": 19340 + }, + { + "epoch": 0.35672169811320753, + "grad_norm": 2.890625, + "learning_rate": 1.84707636159013e-05, + "loss": 2.1367, + "step": 19360 + }, + { + "epoch": 0.35709021226415094, + "grad_norm": 3.015625, + "learning_rate": 1.8467685718251103e-05, + "loss": 2.1622, + "step": 19380 + }, + { + "epoch": 0.35745872641509435, + "grad_norm": 3.046875, + "learning_rate": 1.8464604983249994e-05, + "loss": 2.1503, + "step": 19400 + }, + { + "epoch": 0.35782724056603776, + "grad_norm": 3.453125, + "learning_rate": 1.846152141193026e-05, + "loss": 2.1388, + "step": 19420 + }, + { + "epoch": 0.3581957547169811, + "grad_norm": 3.578125, + "learning_rate": 1.8458435005325145e-05, + "loss": 2.1478, + "step": 19440 + }, + { + "epoch": 0.35856426886792453, + "grad_norm": 3.078125, + "learning_rate": 1.845534576446884e-05, + "loss": 2.1405, + "step": 19460 + }, + { + "epoch": 0.35893278301886794, + "grad_norm": 2.953125, + "learning_rate": 1.845225369039649e-05, + "loss": 2.1245, + "step": 19480 + }, + { + "epoch": 0.3593012971698113, + "grad_norm": 3.3125, + "learning_rate": 1.8449158784144184e-05, + "loss": 2.1822, + "step": 19500 + }, + { + "epoch": 0.3596698113207547, + "grad_norm": 4.15625, + "learning_rate": 1.8446061046748968e-05, + "loss": 2.0976, + "step": 19520 + }, + { + "epoch": 0.3600383254716981, + "grad_norm": 3.3125, + "learning_rate": 1.844296047924883e-05, + "loss": 2.1337, + "step": 19540 + }, + { + "epoch": 0.36040683962264153, + "grad_norm": 3.15625, + "learning_rate": 1.84398570826827e-05, + "loss": 2.1477, + "step": 19560 + }, + { + "epoch": 0.3607753537735849, + "grad_norm": 2.96875, + "learning_rate": 1.843675085809047e-05, + "loss": 2.1332, + "step": 19580 + }, + { + "epoch": 0.3611438679245283, + "grad_norm": 3.5, + "learning_rate": 1.8433641806512973e-05, + "loss": 2.1056, + "step": 19600 + }, + { + "epoch": 0.3615123820754717, + "grad_norm": 2.9375, + "learning_rate": 1.8430529928991993e-05, + "loss": 2.1225, + "step": 19620 + }, + { + "epoch": 0.3618808962264151, + "grad_norm": 2.90625, + "learning_rate": 1.842741522657025e-05, + "loss": 2.1148, + "step": 19640 + }, + { + "epoch": 0.36224941037735847, + "grad_norm": 3.15625, + "learning_rate": 1.8424297700291426e-05, + "loss": 2.1455, + "step": 19660 + }, + { + "epoch": 0.3626179245283019, + "grad_norm": 2.78125, + "learning_rate": 1.8421177351200134e-05, + "loss": 2.151, + "step": 19680 + }, + { + "epoch": 0.3629864386792453, + "grad_norm": 2.984375, + "learning_rate": 1.8418054180341944e-05, + "loss": 2.0987, + "step": 19700 + }, + { + "epoch": 0.3633549528301887, + "grad_norm": 3.25, + "learning_rate": 1.8414928188763365e-05, + "loss": 2.1511, + "step": 19720 + }, + { + "epoch": 0.36372346698113206, + "grad_norm": 2.921875, + "learning_rate": 1.8411799377511857e-05, + "loss": 2.1728, + "step": 19740 + }, + { + "epoch": 0.36409198113207547, + "grad_norm": 3.203125, + "learning_rate": 1.840866774763582e-05, + "loss": 2.1587, + "step": 19760 + }, + { + "epoch": 0.3644604952830189, + "grad_norm": 3.109375, + "learning_rate": 1.8405533300184603e-05, + "loss": 2.1166, + "step": 19780 + }, + { + "epoch": 0.36482900943396224, + "grad_norm": 3.234375, + "learning_rate": 1.840239603620849e-05, + "loss": 2.1486, + "step": 19800 + }, + { + "epoch": 0.36519752358490565, + "grad_norm": 3.390625, + "learning_rate": 1.839925595675872e-05, + "loss": 2.1205, + "step": 19820 + }, + { + "epoch": 0.36556603773584906, + "grad_norm": 3.015625, + "learning_rate": 1.8396113062887467e-05, + "loss": 2.1438, + "step": 19840 + }, + { + "epoch": 0.36593455188679247, + "grad_norm": 3.0625, + "learning_rate": 1.839296735564786e-05, + "loss": 2.1371, + "step": 19860 + }, + { + "epoch": 0.3663030660377358, + "grad_norm": 2.84375, + "learning_rate": 1.8389818836093948e-05, + "loss": 2.1706, + "step": 19880 + }, + { + "epoch": 0.36667158018867924, + "grad_norm": 3.0625, + "learning_rate": 1.8386667505280745e-05, + "loss": 2.1741, + "step": 19900 + }, + { + "epoch": 0.36704009433962265, + "grad_norm": 2.953125, + "learning_rate": 1.83835133642642e-05, + "loss": 2.1718, + "step": 19920 + }, + { + "epoch": 0.36740860849056606, + "grad_norm": 3.25, + "learning_rate": 1.8380356414101198e-05, + "loss": 2.1092, + "step": 19940 + }, + { + "epoch": 0.3677771226415094, + "grad_norm": 2.9375, + "learning_rate": 1.837719665584957e-05, + "loss": 2.1488, + "step": 19960 + }, + { + "epoch": 0.3681456367924528, + "grad_norm": 2.953125, + "learning_rate": 1.837403409056809e-05, + "loss": 2.1615, + "step": 19980 + }, + { + "epoch": 0.36851415094339623, + "grad_norm": 3.59375, + "learning_rate": 1.837086871931647e-05, + "loss": 2.1571, + "step": 20000 + }, + { + "epoch": 0.36888266509433965, + "grad_norm": 2.953125, + "learning_rate": 1.8367700543155355e-05, + "loss": 2.143, + "step": 20020 + }, + { + "epoch": 0.369251179245283, + "grad_norm": 2.859375, + "learning_rate": 1.8364529563146348e-05, + "loss": 2.1786, + "step": 20040 + }, + { + "epoch": 0.3696196933962264, + "grad_norm": 2.9375, + "learning_rate": 1.836135578035197e-05, + "loss": 2.1709, + "step": 20060 + }, + { + "epoch": 0.3699882075471698, + "grad_norm": 2.875, + "learning_rate": 1.83581791958357e-05, + "loss": 2.1577, + "step": 20080 + }, + { + "epoch": 0.37035672169811323, + "grad_norm": 3.1875, + "learning_rate": 1.8354999810661942e-05, + "loss": 2.1248, + "step": 20100 + }, + { + "epoch": 0.3707252358490566, + "grad_norm": 3.4375, + "learning_rate": 1.8351817625896046e-05, + "loss": 2.0981, + "step": 20120 + }, + { + "epoch": 0.37109375, + "grad_norm": 3.265625, + "learning_rate": 1.8348632642604297e-05, + "loss": 2.1319, + "step": 20140 + }, + { + "epoch": 0.3714622641509434, + "grad_norm": 3.34375, + "learning_rate": 1.8345444861853922e-05, + "loss": 2.158, + "step": 20160 + }, + { + "epoch": 0.37183077830188677, + "grad_norm": 3.125, + "learning_rate": 1.834225428471308e-05, + "loss": 2.1569, + "step": 20180 + }, + { + "epoch": 0.3721992924528302, + "grad_norm": 2.875, + "learning_rate": 1.833906091225087e-05, + "loss": 2.1187, + "step": 20200 + }, + { + "epoch": 0.3725678066037736, + "grad_norm": 3.75, + "learning_rate": 1.8335864745537323e-05, + "loss": 2.084, + "step": 20220 + }, + { + "epoch": 0.372936320754717, + "grad_norm": 3.09375, + "learning_rate": 1.8332665785643414e-05, + "loss": 2.1292, + "step": 20240 + }, + { + "epoch": 0.37330483490566035, + "grad_norm": 3.09375, + "learning_rate": 1.8329464033641048e-05, + "loss": 2.1364, + "step": 20260 + }, + { + "epoch": 0.37367334905660377, + "grad_norm": 3.546875, + "learning_rate": 1.832625949060307e-05, + "loss": 2.1171, + "step": 20280 + }, + { + "epoch": 0.3740418632075472, + "grad_norm": 2.875, + "learning_rate": 1.8323052157603256e-05, + "loss": 2.1316, + "step": 20300 + }, + { + "epoch": 0.3744103773584906, + "grad_norm": 2.765625, + "learning_rate": 1.831984203571632e-05, + "loss": 2.1371, + "step": 20320 + }, + { + "epoch": 0.37477889150943394, + "grad_norm": 2.953125, + "learning_rate": 1.8316629126017906e-05, + "loss": 2.1643, + "step": 20340 + }, + { + "epoch": 0.37514740566037735, + "grad_norm": 3.0625, + "learning_rate": 1.8313413429584596e-05, + "loss": 2.141, + "step": 20360 + }, + { + "epoch": 0.37551591981132076, + "grad_norm": 3.03125, + "learning_rate": 1.8310194947493907e-05, + "loss": 2.1553, + "step": 20380 + }, + { + "epoch": 0.3758844339622642, + "grad_norm": 3.34375, + "learning_rate": 1.830697368082429e-05, + "loss": 2.141, + "step": 20400 + }, + { + "epoch": 0.37625294811320753, + "grad_norm": 2.84375, + "learning_rate": 1.8303749630655125e-05, + "loss": 2.1192, + "step": 20420 + }, + { + "epoch": 0.37662146226415094, + "grad_norm": 2.8125, + "learning_rate": 1.830052279806672e-05, + "loss": 2.1507, + "step": 20440 + }, + { + "epoch": 0.37698997641509435, + "grad_norm": 2.953125, + "learning_rate": 1.829729318414033e-05, + "loss": 2.1606, + "step": 20460 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 2.890625, + "learning_rate": 1.8294060789958128e-05, + "loss": 2.119, + "step": 20480 + }, + { + "epoch": 0.3777270047169811, + "grad_norm": 3.140625, + "learning_rate": 1.829082561660323e-05, + "loss": 2.142, + "step": 20500 + }, + { + "epoch": 0.37809551886792453, + "grad_norm": 2.859375, + "learning_rate": 1.8287587665159668e-05, + "loss": 2.1511, + "step": 20520 + }, + { + "epoch": 0.37846403301886794, + "grad_norm": 3.125, + "learning_rate": 1.8284346936712425e-05, + "loss": 2.1568, + "step": 20540 + }, + { + "epoch": 0.3788325471698113, + "grad_norm": 3.171875, + "learning_rate": 1.8281103432347397e-05, + "loss": 2.1746, + "step": 20560 + }, + { + "epoch": 0.3792010613207547, + "grad_norm": 2.984375, + "learning_rate": 1.827785715315142e-05, + "loss": 2.1027, + "step": 20580 + }, + { + "epoch": 0.3795695754716981, + "grad_norm": 2.984375, + "learning_rate": 1.8274608100212253e-05, + "loss": 2.1648, + "step": 20600 + }, + { + "epoch": 0.37993808962264153, + "grad_norm": 2.921875, + "learning_rate": 1.8271356274618594e-05, + "loss": 2.1202, + "step": 20620 + }, + { + "epoch": 0.3803066037735849, + "grad_norm": 2.84375, + "learning_rate": 1.8268101677460056e-05, + "loss": 2.1337, + "step": 20640 + }, + { + "epoch": 0.3806751179245283, + "grad_norm": 2.828125, + "learning_rate": 1.8264844309827195e-05, + "loss": 2.1404, + "step": 20660 + }, + { + "epoch": 0.3810436320754717, + "grad_norm": 3.015625, + "learning_rate": 1.8261584172811494e-05, + "loss": 2.146, + "step": 20680 + }, + { + "epoch": 0.3814121462264151, + "grad_norm": 2.953125, + "learning_rate": 1.825832126750535e-05, + "loss": 2.1321, + "step": 20700 + }, + { + "epoch": 0.38178066037735847, + "grad_norm": 3.109375, + "learning_rate": 1.82550555950021e-05, + "loss": 2.1391, + "step": 20720 + }, + { + "epoch": 0.3821491745283019, + "grad_norm": 3.234375, + "learning_rate": 1.8251787156396002e-05, + "loss": 2.1119, + "step": 20740 + }, + { + "epoch": 0.3825176886792453, + "grad_norm": 3.65625, + "learning_rate": 1.8248515952782254e-05, + "loss": 2.1723, + "step": 20760 + }, + { + "epoch": 0.3828862028301887, + "grad_norm": 3.09375, + "learning_rate": 1.824524198525696e-05, + "loss": 2.15, + "step": 20780 + }, + { + "epoch": 0.38325471698113206, + "grad_norm": 3.046875, + "learning_rate": 1.8241965254917168e-05, + "loss": 2.1459, + "step": 20800 + }, + { + "epoch": 0.38362323113207547, + "grad_norm": 3.015625, + "learning_rate": 1.8238685762860835e-05, + "loss": 2.1504, + "step": 20820 + }, + { + "epoch": 0.3839917452830189, + "grad_norm": 3.03125, + "learning_rate": 1.8235403510186863e-05, + "loss": 2.1402, + "step": 20840 + }, + { + "epoch": 0.38436025943396224, + "grad_norm": 3.140625, + "learning_rate": 1.8232118497995058e-05, + "loss": 2.0967, + "step": 20860 + }, + { + "epoch": 0.38472877358490565, + "grad_norm": 3.53125, + "learning_rate": 1.8228830727386175e-05, + "loss": 2.1234, + "step": 20880 + }, + { + "epoch": 0.38509728773584906, + "grad_norm": 3.03125, + "learning_rate": 1.822554019946187e-05, + "loss": 2.1255, + "step": 20900 + }, + { + "epoch": 0.38546580188679247, + "grad_norm": 3.5, + "learning_rate": 1.8222246915324734e-05, + "loss": 2.1575, + "step": 20920 + }, + { + "epoch": 0.3858343160377358, + "grad_norm": 3.0625, + "learning_rate": 1.821895087607828e-05, + "loss": 2.137, + "step": 20940 + }, + { + "epoch": 0.38620283018867924, + "grad_norm": 2.9375, + "learning_rate": 1.8215652082826945e-05, + "loss": 2.1387, + "step": 20960 + }, + { + "epoch": 0.38657134433962265, + "grad_norm": 2.796875, + "learning_rate": 1.8212350536676093e-05, + "loss": 2.1258, + "step": 20980 + }, + { + "epoch": 0.38693985849056606, + "grad_norm": 2.953125, + "learning_rate": 1.8209046238731998e-05, + "loss": 2.1492, + "step": 21000 + }, + { + "epoch": 0.3873083726415094, + "grad_norm": 2.8125, + "learning_rate": 1.820573919010187e-05, + "loss": 2.102, + "step": 21020 + }, + { + "epoch": 0.3876768867924528, + "grad_norm": 2.828125, + "learning_rate": 1.8202429391893826e-05, + "loss": 2.1411, + "step": 21040 + }, + { + "epoch": 0.38804540094339623, + "grad_norm": 3.28125, + "learning_rate": 1.8199116845216923e-05, + "loss": 2.1437, + "step": 21060 + }, + { + "epoch": 0.38841391509433965, + "grad_norm": 3.84375, + "learning_rate": 1.819580155118112e-05, + "loss": 2.1214, + "step": 21080 + }, + { + "epoch": 0.388782429245283, + "grad_norm": 3.109375, + "learning_rate": 1.819248351089731e-05, + "loss": 2.1245, + "step": 21100 + }, + { + "epoch": 0.3891509433962264, + "grad_norm": 2.78125, + "learning_rate": 1.81891627254773e-05, + "loss": 2.1232, + "step": 21120 + }, + { + "epoch": 0.3895194575471698, + "grad_norm": 3.609375, + "learning_rate": 1.818583919603382e-05, + "loss": 2.1421, + "step": 21140 + }, + { + "epoch": 0.38988797169811323, + "grad_norm": 2.828125, + "learning_rate": 1.8182512923680515e-05, + "loss": 2.1378, + "step": 21160 + }, + { + "epoch": 0.3902564858490566, + "grad_norm": 3.0, + "learning_rate": 1.8179183909531955e-05, + "loss": 2.1637, + "step": 21180 + }, + { + "epoch": 0.390625, + "grad_norm": 2.875, + "learning_rate": 1.8175852154703624e-05, + "loss": 2.1196, + "step": 21200 + }, + { + "epoch": 0.3909935141509434, + "grad_norm": 3.5, + "learning_rate": 1.8172517660311926e-05, + "loss": 2.1074, + "step": 21220 + }, + { + "epoch": 0.39136202830188677, + "grad_norm": 3.21875, + "learning_rate": 1.816918042747418e-05, + "loss": 2.1149, + "step": 21240 + }, + { + "epoch": 0.3917305424528302, + "grad_norm": 3.109375, + "learning_rate": 1.816584045730863e-05, + "loss": 2.119, + "step": 21260 + }, + { + "epoch": 0.3920990566037736, + "grad_norm": 3.25, + "learning_rate": 1.816249775093443e-05, + "loss": 2.1522, + "step": 21280 + }, + { + "epoch": 0.392467570754717, + "grad_norm": 3.015625, + "learning_rate": 1.8159152309471655e-05, + "loss": 2.1093, + "step": 21300 + }, + { + "epoch": 0.39283608490566035, + "grad_norm": 3.078125, + "learning_rate": 1.8155804134041294e-05, + "loss": 2.1086, + "step": 21320 + }, + { + "epoch": 0.39320459905660377, + "grad_norm": 2.9375, + "learning_rate": 1.8152453225765256e-05, + "loss": 2.0902, + "step": 21340 + }, + { + "epoch": 0.3935731132075472, + "grad_norm": 2.84375, + "learning_rate": 1.8149099585766362e-05, + "loss": 2.1318, + "step": 21360 + }, + { + "epoch": 0.3939416273584906, + "grad_norm": 3.234375, + "learning_rate": 1.8145743215168343e-05, + "loss": 2.1114, + "step": 21380 + }, + { + "epoch": 0.39431014150943394, + "grad_norm": 3.171875, + "learning_rate": 1.8142384115095857e-05, + "loss": 2.1218, + "step": 21400 + }, + { + "epoch": 0.39467865566037735, + "grad_norm": 3.4375, + "learning_rate": 1.8139022286674473e-05, + "loss": 2.1447, + "step": 21420 + }, + { + "epoch": 0.39504716981132076, + "grad_norm": 3.140625, + "learning_rate": 1.8135657731030666e-05, + "loss": 2.1679, + "step": 21440 + }, + { + "epoch": 0.3954156839622642, + "grad_norm": 3.0, + "learning_rate": 1.8132290449291834e-05, + "loss": 2.1448, + "step": 21460 + }, + { + "epoch": 0.39578419811320753, + "grad_norm": 3.3125, + "learning_rate": 1.8128920442586285e-05, + "loss": 2.1042, + "step": 21480 + }, + { + "epoch": 0.39615271226415094, + "grad_norm": 2.84375, + "learning_rate": 1.812554771204324e-05, + "loss": 2.172, + "step": 21500 + }, + { + "epoch": 0.39652122641509435, + "grad_norm": 3.015625, + "learning_rate": 1.8122172258792835e-05, + "loss": 2.1405, + "step": 21520 + }, + { + "epoch": 0.39688974056603776, + "grad_norm": 3.53125, + "learning_rate": 1.8118794083966112e-05, + "loss": 2.1284, + "step": 21540 + }, + { + "epoch": 0.3972582547169811, + "grad_norm": 3.03125, + "learning_rate": 1.8115413188695032e-05, + "loss": 2.1332, + "step": 21560 + }, + { + "epoch": 0.39762676886792453, + "grad_norm": 3.4375, + "learning_rate": 1.8112029574112465e-05, + "loss": 2.0887, + "step": 21580 + }, + { + "epoch": 0.39799528301886794, + "grad_norm": 2.984375, + "learning_rate": 1.810864324135219e-05, + "loss": 2.1434, + "step": 21600 + }, + { + "epoch": 0.3983637971698113, + "grad_norm": 3.09375, + "learning_rate": 1.8105254191548907e-05, + "loss": 2.134, + "step": 21620 + }, + { + "epoch": 0.3987323113207547, + "grad_norm": 3.234375, + "learning_rate": 1.810186242583821e-05, + "loss": 2.1366, + "step": 21640 + }, + { + "epoch": 0.3991008254716981, + "grad_norm": 2.84375, + "learning_rate": 1.8098467945356612e-05, + "loss": 2.1292, + "step": 21660 + }, + { + "epoch": 0.39946933962264153, + "grad_norm": 3.015625, + "learning_rate": 1.8095070751241537e-05, + "loss": 2.1645, + "step": 21680 + }, + { + "epoch": 0.3998378537735849, + "grad_norm": 2.9375, + "learning_rate": 1.8091670844631317e-05, + "loss": 2.1426, + "step": 21700 + }, + { + "epoch": 0.4002063679245283, + "grad_norm": 2.875, + "learning_rate": 1.808826822666519e-05, + "loss": 2.0578, + "step": 21720 + }, + { + "epoch": 0.4005748820754717, + "grad_norm": 2.875, + "learning_rate": 1.8084862898483304e-05, + "loss": 2.1516, + "step": 21740 + }, + { + "epoch": 0.4009433962264151, + "grad_norm": 2.984375, + "learning_rate": 1.8081454861226723e-05, + "loss": 2.1321, + "step": 21760 + }, + { + "epoch": 0.40131191037735847, + "grad_norm": 3.53125, + "learning_rate": 1.80780441160374e-05, + "loss": 2.0971, + "step": 21780 + }, + { + "epoch": 0.4016804245283019, + "grad_norm": 2.859375, + "learning_rate": 1.8074630664058218e-05, + "loss": 2.1387, + "step": 21800 + }, + { + "epoch": 0.4020489386792453, + "grad_norm": 3.03125, + "learning_rate": 1.807121450643295e-05, + "loss": 2.1327, + "step": 21820 + }, + { + "epoch": 0.4024174528301887, + "grad_norm": 2.984375, + "learning_rate": 1.8067795644306284e-05, + "loss": 2.1619, + "step": 21840 + }, + { + "epoch": 0.40278596698113206, + "grad_norm": 2.984375, + "learning_rate": 1.8064374078823807e-05, + "loss": 2.1556, + "step": 21860 + }, + { + "epoch": 0.40315448113207547, + "grad_norm": 2.953125, + "learning_rate": 1.8060949811132024e-05, + "loss": 2.1516, + "step": 21880 + }, + { + "epoch": 0.4035229952830189, + "grad_norm": 2.84375, + "learning_rate": 1.8057522842378333e-05, + "loss": 2.1261, + "step": 21900 + }, + { + "epoch": 0.40389150943396224, + "grad_norm": 3.0, + "learning_rate": 1.8054093173711046e-05, + "loss": 2.1082, + "step": 21920 + }, + { + "epoch": 0.40426002358490565, + "grad_norm": 2.96875, + "learning_rate": 1.8050660806279374e-05, + "loss": 2.1472, + "step": 21940 + }, + { + "epoch": 0.40462853773584906, + "grad_norm": 3.359375, + "learning_rate": 1.804722574123343e-05, + "loss": 2.1633, + "step": 21960 + }, + { + "epoch": 0.40499705188679247, + "grad_norm": 3.1875, + "learning_rate": 1.8043787979724242e-05, + "loss": 2.0893, + "step": 21980 + }, + { + "epoch": 0.4053655660377358, + "grad_norm": 3.609375, + "learning_rate": 1.804034752290373e-05, + "loss": 2.1281, + "step": 22000 + }, + { + "epoch": 0.40573408018867924, + "grad_norm": 2.984375, + "learning_rate": 1.8036904371924724e-05, + "loss": 2.1456, + "step": 22020 + }, + { + "epoch": 0.40610259433962265, + "grad_norm": 3.28125, + "learning_rate": 1.8033458527940956e-05, + "loss": 2.126, + "step": 22040 + }, + { + "epoch": 0.40647110849056606, + "grad_norm": 3.15625, + "learning_rate": 1.8030009992107055e-05, + "loss": 2.1024, + "step": 22060 + }, + { + "epoch": 0.4068396226415094, + "grad_norm": 3.1875, + "learning_rate": 1.802655876557856e-05, + "loss": 2.152, + "step": 22080 + }, + { + "epoch": 0.4072081367924528, + "grad_norm": 3.0, + "learning_rate": 1.8023104849511902e-05, + "loss": 2.1175, + "step": 22100 + }, + { + "epoch": 0.40757665094339623, + "grad_norm": 2.828125, + "learning_rate": 1.8019648245064423e-05, + "loss": 2.1067, + "step": 22120 + }, + { + "epoch": 0.40794516509433965, + "grad_norm": 2.96875, + "learning_rate": 1.801618895339436e-05, + "loss": 2.1639, + "step": 22140 + }, + { + "epoch": 0.408313679245283, + "grad_norm": 3.140625, + "learning_rate": 1.8012726975660856e-05, + "loss": 2.1444, + "step": 22160 + }, + { + "epoch": 0.4086821933962264, + "grad_norm": 3.28125, + "learning_rate": 1.800926231302394e-05, + "loss": 2.1353, + "step": 22180 + }, + { + "epoch": 0.4090507075471698, + "grad_norm": 3.15625, + "learning_rate": 1.800579496664456e-05, + "loss": 2.1148, + "step": 22200 + }, + { + "epoch": 0.40941922169811323, + "grad_norm": 3.25, + "learning_rate": 1.8002324937684552e-05, + "loss": 2.1141, + "step": 22220 + }, + { + "epoch": 0.4097877358490566, + "grad_norm": 3.328125, + "learning_rate": 1.7998852227306655e-05, + "loss": 2.1446, + "step": 22240 + }, + { + "epoch": 0.41015625, + "grad_norm": 3.046875, + "learning_rate": 1.7995376836674495e-05, + "loss": 2.142, + "step": 22260 + }, + { + "epoch": 0.4105247641509434, + "grad_norm": 2.703125, + "learning_rate": 1.7991898766952614e-05, + "loss": 2.1361, + "step": 22280 + }, + { + "epoch": 0.41089327830188677, + "grad_norm": 2.890625, + "learning_rate": 1.798841801930644e-05, + "loss": 2.1513, + "step": 22300 + }, + { + "epoch": 0.4112617924528302, + "grad_norm": 3.0, + "learning_rate": 1.7984934594902302e-05, + "loss": 2.1668, + "step": 22320 + }, + { + "epoch": 0.4116303066037736, + "grad_norm": 3.109375, + "learning_rate": 1.7981448494907424e-05, + "loss": 2.1427, + "step": 22340 + }, + { + "epoch": 0.411998820754717, + "grad_norm": 3.25, + "learning_rate": 1.7977959720489932e-05, + "loss": 2.1502, + "step": 22360 + }, + { + "epoch": 0.41236733490566035, + "grad_norm": 3.046875, + "learning_rate": 1.7974468272818844e-05, + "loss": 2.1545, + "step": 22380 + }, + { + "epoch": 0.41273584905660377, + "grad_norm": 3.03125, + "learning_rate": 1.7970974153064068e-05, + "loss": 2.1608, + "step": 22400 + }, + { + "epoch": 0.4131043632075472, + "grad_norm": 2.890625, + "learning_rate": 1.7967477362396413e-05, + "loss": 2.1392, + "step": 22420 + }, + { + "epoch": 0.4134728773584906, + "grad_norm": 3.78125, + "learning_rate": 1.796397790198759e-05, + "loss": 2.1319, + "step": 22440 + }, + { + "epoch": 0.41384139150943394, + "grad_norm": 3.109375, + "learning_rate": 1.7960475773010193e-05, + "loss": 2.1243, + "step": 22460 + }, + { + "epoch": 0.41420990566037735, + "grad_norm": 3.046875, + "learning_rate": 1.7956970976637715e-05, + "loss": 2.1267, + "step": 22480 + }, + { + "epoch": 0.41457841981132076, + "grad_norm": 3.4375, + "learning_rate": 1.7953463514044545e-05, + "loss": 2.1328, + "step": 22500 + }, + { + "epoch": 0.4149469339622642, + "grad_norm": 3.34375, + "learning_rate": 1.7949953386405962e-05, + "loss": 2.1287, + "step": 22520 + }, + { + "epoch": 0.41531544811320753, + "grad_norm": 3.296875, + "learning_rate": 1.794644059489814e-05, + "loss": 2.1032, + "step": 22540 + }, + { + "epoch": 0.41568396226415094, + "grad_norm": 2.796875, + "learning_rate": 1.794292514069814e-05, + "loss": 2.1593, + "step": 22560 + }, + { + "epoch": 0.41605247641509435, + "grad_norm": 3.171875, + "learning_rate": 1.7939407024983927e-05, + "loss": 2.0842, + "step": 22580 + }, + { + "epoch": 0.41642099056603776, + "grad_norm": 2.953125, + "learning_rate": 1.793588624893434e-05, + "loss": 2.1518, + "step": 22600 + }, + { + "epoch": 0.4167895047169811, + "grad_norm": 3.3125, + "learning_rate": 1.7932362813729134e-05, + "loss": 2.1093, + "step": 22620 + }, + { + "epoch": 0.41715801886792453, + "grad_norm": 3.078125, + "learning_rate": 1.792883672054893e-05, + "loss": 2.1334, + "step": 22640 + }, + { + "epoch": 0.41752653301886794, + "grad_norm": 2.78125, + "learning_rate": 1.7925307970575262e-05, + "loss": 2.1116, + "step": 22660 + }, + { + "epoch": 0.4178950471698113, + "grad_norm": 3.140625, + "learning_rate": 1.7921776564990528e-05, + "loss": 2.1439, + "step": 22680 + }, + { + "epoch": 0.4182635613207547, + "grad_norm": 3.375, + "learning_rate": 1.7918242504978047e-05, + "loss": 2.1297, + "step": 22700 + }, + { + "epoch": 0.4186320754716981, + "grad_norm": 3.3125, + "learning_rate": 1.7914705791721994e-05, + "loss": 2.1313, + "step": 22720 + }, + { + "epoch": 0.41900058962264153, + "grad_norm": 3.921875, + "learning_rate": 1.7911166426407467e-05, + "loss": 2.133, + "step": 22740 + }, + { + "epoch": 0.4193691037735849, + "grad_norm": 3.0625, + "learning_rate": 1.7907624410220425e-05, + "loss": 2.1792, + "step": 22760 + }, + { + "epoch": 0.4197376179245283, + "grad_norm": 3.21875, + "learning_rate": 1.7904079744347732e-05, + "loss": 2.1513, + "step": 22780 + }, + { + "epoch": 0.4201061320754717, + "grad_norm": 2.84375, + "learning_rate": 1.790053242997713e-05, + "loss": 2.1098, + "step": 22800 + }, + { + "epoch": 0.4204746462264151, + "grad_norm": 3.015625, + "learning_rate": 1.7896982468297255e-05, + "loss": 2.1393, + "step": 22820 + }, + { + "epoch": 0.42084316037735847, + "grad_norm": 3.109375, + "learning_rate": 1.789342986049763e-05, + "loss": 2.1585, + "step": 22840 + }, + { + "epoch": 0.4212116745283019, + "grad_norm": 2.921875, + "learning_rate": 1.7889874607768656e-05, + "loss": 2.109, + "step": 22860 + }, + { + "epoch": 0.4215801886792453, + "grad_norm": 3.296875, + "learning_rate": 1.7886316711301632e-05, + "loss": 2.1263, + "step": 22880 + }, + { + "epoch": 0.4219487028301887, + "grad_norm": 3.265625, + "learning_rate": 1.7882756172288735e-05, + "loss": 2.1326, + "step": 22900 + }, + { + "epoch": 0.42231721698113206, + "grad_norm": 3.515625, + "learning_rate": 1.787919299192303e-05, + "loss": 2.1264, + "step": 22920 + }, + { + "epoch": 0.42268573113207547, + "grad_norm": 3.203125, + "learning_rate": 1.787562717139847e-05, + "loss": 2.1372, + "step": 22940 + }, + { + "epoch": 0.4230542452830189, + "grad_norm": 3.28125, + "learning_rate": 1.7872058711909884e-05, + "loss": 2.1423, + "step": 22960 + }, + { + "epoch": 0.42342275943396224, + "grad_norm": 3.15625, + "learning_rate": 1.7868487614653e-05, + "loss": 2.1375, + "step": 22980 + }, + { + "epoch": 0.42379127358490565, + "grad_norm": 3.28125, + "learning_rate": 1.786491388082441e-05, + "loss": 2.1331, + "step": 23000 + }, + { + "epoch": 0.42415978773584906, + "grad_norm": 2.8125, + "learning_rate": 1.7861337511621604e-05, + "loss": 2.1154, + "step": 23020 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 2.984375, + "learning_rate": 1.7857758508242956e-05, + "loss": 2.0976, + "step": 23040 + }, + { + "epoch": 0.4248968160377358, + "grad_norm": 3.78125, + "learning_rate": 1.7854176871887713e-05, + "loss": 2.1478, + "step": 23060 + }, + { + "epoch": 0.42526533018867924, + "grad_norm": 3.625, + "learning_rate": 1.7850592603756008e-05, + "loss": 2.12, + "step": 23080 + }, + { + "epoch": 0.42563384433962265, + "grad_norm": 3.4375, + "learning_rate": 1.784700570504886e-05, + "loss": 2.1355, + "step": 23100 + }, + { + "epoch": 0.42600235849056606, + "grad_norm": 3.34375, + "learning_rate": 1.7843416176968167e-05, + "loss": 2.158, + "step": 23120 + }, + { + "epoch": 0.4263708726415094, + "grad_norm": 3.109375, + "learning_rate": 1.783982402071671e-05, + "loss": 2.1448, + "step": 23140 + }, + { + "epoch": 0.4267393867924528, + "grad_norm": 3.296875, + "learning_rate": 1.7836229237498138e-05, + "loss": 2.1255, + "step": 23160 + }, + { + "epoch": 0.42710790094339623, + "grad_norm": 3.078125, + "learning_rate": 1.7832631828517004e-05, + "loss": 2.1228, + "step": 23180 + }, + { + "epoch": 0.42747641509433965, + "grad_norm": 3.375, + "learning_rate": 1.7829031794978717e-05, + "loss": 2.1655, + "step": 23200 + }, + { + "epoch": 0.427844929245283, + "grad_norm": 3.171875, + "learning_rate": 1.782542913808958e-05, + "loss": 2.1015, + "step": 23220 + }, + { + "epoch": 0.4282134433962264, + "grad_norm": 2.875, + "learning_rate": 1.7821823859056772e-05, + "loss": 2.1445, + "step": 23240 + }, + { + "epoch": 0.4285819575471698, + "grad_norm": 3.328125, + "learning_rate": 1.7818215959088345e-05, + "loss": 2.1144, + "step": 23260 + }, + { + "epoch": 0.42895047169811323, + "grad_norm": 2.78125, + "learning_rate": 1.7814605439393233e-05, + "loss": 2.146, + "step": 23280 + }, + { + "epoch": 0.4293189858490566, + "grad_norm": 3.34375, + "learning_rate": 1.7810992301181254e-05, + "loss": 2.1545, + "step": 23300 + }, + { + "epoch": 0.4296875, + "grad_norm": 3.765625, + "learning_rate": 1.7807376545663096e-05, + "loss": 2.1139, + "step": 23320 + }, + { + "epoch": 0.4300560141509434, + "grad_norm": 2.71875, + "learning_rate": 1.7803758174050325e-05, + "loss": 2.1525, + "step": 23340 + }, + { + "epoch": 0.43042452830188677, + "grad_norm": 2.9375, + "learning_rate": 1.7800137187555385e-05, + "loss": 2.1035, + "step": 23360 + }, + { + "epoch": 0.4307930424528302, + "grad_norm": 2.921875, + "learning_rate": 1.7796513587391593e-05, + "loss": 2.1553, + "step": 23380 + }, + { + "epoch": 0.4311615566037736, + "grad_norm": 2.75, + "learning_rate": 1.7792887374773148e-05, + "loss": 2.1165, + "step": 23400 + }, + { + "epoch": 0.431530070754717, + "grad_norm": 2.90625, + "learning_rate": 1.778925855091512e-05, + "loss": 2.1233, + "step": 23420 + }, + { + "epoch": 0.43189858490566035, + "grad_norm": 3.453125, + "learning_rate": 1.7785627117033453e-05, + "loss": 2.1225, + "step": 23440 + }, + { + "epoch": 0.43226709905660377, + "grad_norm": 3.03125, + "learning_rate": 1.7781993074344967e-05, + "loss": 2.1293, + "step": 23460 + }, + { + "epoch": 0.4326356132075472, + "grad_norm": 2.890625, + "learning_rate": 1.777835642406736e-05, + "loss": 2.117, + "step": 23480 + }, + { + "epoch": 0.4330041273584906, + "grad_norm": 2.921875, + "learning_rate": 1.7774717167419197e-05, + "loss": 2.115, + "step": 23500 + }, + { + "epoch": 0.43337264150943394, + "grad_norm": 3.140625, + "learning_rate": 1.777107530561992e-05, + "loss": 2.1211, + "step": 23520 + }, + { + "epoch": 0.43374115566037735, + "grad_norm": 3.21875, + "learning_rate": 1.7767430839889848e-05, + "loss": 2.1171, + "step": 23540 + }, + { + "epoch": 0.43410966981132076, + "grad_norm": 3.890625, + "learning_rate": 1.7763783771450162e-05, + "loss": 2.1048, + "step": 23560 + }, + { + "epoch": 0.4344781839622642, + "grad_norm": 2.984375, + "learning_rate": 1.7760134101522925e-05, + "loss": 2.1184, + "step": 23580 + }, + { + "epoch": 0.43484669811320753, + "grad_norm": 3.21875, + "learning_rate": 1.7756481831331064e-05, + "loss": 2.1203, + "step": 23600 + }, + { + "epoch": 0.43521521226415094, + "grad_norm": 3.125, + "learning_rate": 1.7752826962098387e-05, + "loss": 2.0978, + "step": 23620 + }, + { + "epoch": 0.43558372641509435, + "grad_norm": 3.0625, + "learning_rate": 1.7749169495049563e-05, + "loss": 2.1117, + "step": 23640 + }, + { + "epoch": 0.43595224056603776, + "grad_norm": 3.0625, + "learning_rate": 1.7745509431410136e-05, + "loss": 2.1117, + "step": 23660 + }, + { + "epoch": 0.4363207547169811, + "grad_norm": 3.046875, + "learning_rate": 1.7741846772406518e-05, + "loss": 2.1639, + "step": 23680 + }, + { + "epoch": 0.43668926886792453, + "grad_norm": 2.875, + "learning_rate": 1.7738181519265993e-05, + "loss": 2.1325, + "step": 23700 + }, + { + "epoch": 0.43705778301886794, + "grad_norm": 3.09375, + "learning_rate": 1.7734513673216714e-05, + "loss": 2.118, + "step": 23720 + }, + { + "epoch": 0.4374262971698113, + "grad_norm": 3.421875, + "learning_rate": 1.7730843235487706e-05, + "loss": 2.1086, + "step": 23740 + }, + { + "epoch": 0.4377948113207547, + "grad_norm": 2.890625, + "learning_rate": 1.7727170207308857e-05, + "loss": 2.1196, + "step": 23760 + }, + { + "epoch": 0.4381633254716981, + "grad_norm": 3.328125, + "learning_rate": 1.772349458991092e-05, + "loss": 2.1551, + "step": 23780 + }, + { + "epoch": 0.43853183962264153, + "grad_norm": 3.671875, + "learning_rate": 1.7719816384525524e-05, + "loss": 2.1576, + "step": 23800 + }, + { + "epoch": 0.4389003537735849, + "grad_norm": 3.109375, + "learning_rate": 1.7716135592385164e-05, + "loss": 2.1547, + "step": 23820 + }, + { + "epoch": 0.4392688679245283, + "grad_norm": 3.125, + "learning_rate": 1.7712452214723197e-05, + "loss": 2.132, + "step": 23840 + }, + { + "epoch": 0.4396373820754717, + "grad_norm": 3.40625, + "learning_rate": 1.7708766252773845e-05, + "loss": 2.1712, + "step": 23860 + }, + { + "epoch": 0.4400058962264151, + "grad_norm": 3.484375, + "learning_rate": 1.7705077707772204e-05, + "loss": 2.1462, + "step": 23880 + }, + { + "epoch": 0.44037441037735847, + "grad_norm": 3.046875, + "learning_rate": 1.7701386580954228e-05, + "loss": 2.1322, + "step": 23900 + }, + { + "epoch": 0.4407429245283019, + "grad_norm": 3.28125, + "learning_rate": 1.7697692873556747e-05, + "loss": 2.1108, + "step": 23920 + }, + { + "epoch": 0.4411114386792453, + "grad_norm": 3.0, + "learning_rate": 1.7693996586817436e-05, + "loss": 2.103, + "step": 23940 + }, + { + "epoch": 0.4414799528301887, + "grad_norm": 3.046875, + "learning_rate": 1.7690297721974852e-05, + "loss": 2.1343, + "step": 23960 + }, + { + "epoch": 0.44184846698113206, + "grad_norm": 3.1875, + "learning_rate": 1.7686596280268417e-05, + "loss": 2.1304, + "step": 23980 + }, + { + "epoch": 0.44221698113207547, + "grad_norm": 3.109375, + "learning_rate": 1.76828922629384e-05, + "loss": 2.0849, + "step": 24000 + }, + { + "epoch": 0.4425854952830189, + "grad_norm": 3.59375, + "learning_rate": 1.7679185671225944e-05, + "loss": 2.1174, + "step": 24020 + }, + { + "epoch": 0.44295400943396224, + "grad_norm": 3.328125, + "learning_rate": 1.7675476506373055e-05, + "loss": 2.1207, + "step": 24040 + }, + { + "epoch": 0.44332252358490565, + "grad_norm": 3.34375, + "learning_rate": 1.76717647696226e-05, + "loss": 2.1014, + "step": 24060 + }, + { + "epoch": 0.44369103773584906, + "grad_norm": 3.53125, + "learning_rate": 1.7668050462218308e-05, + "loss": 2.1058, + "step": 24080 + }, + { + "epoch": 0.44405955188679247, + "grad_norm": 3.3125, + "learning_rate": 1.7664333585404768e-05, + "loss": 2.141, + "step": 24100 + }, + { + "epoch": 0.4444280660377358, + "grad_norm": 3.640625, + "learning_rate": 1.7660614140427427e-05, + "loss": 2.1292, + "step": 24120 + }, + { + "epoch": 0.44479658018867924, + "grad_norm": 3.78125, + "learning_rate": 1.7656892128532596e-05, + "loss": 2.085, + "step": 24140 + }, + { + "epoch": 0.44516509433962265, + "grad_norm": 3.78125, + "learning_rate": 1.7653167550967453e-05, + "loss": 2.1328, + "step": 24160 + }, + { + "epoch": 0.44553360849056606, + "grad_norm": 3.140625, + "learning_rate": 1.7649440408980026e-05, + "loss": 2.121, + "step": 24180 + }, + { + "epoch": 0.4459021226415094, + "grad_norm": 2.890625, + "learning_rate": 1.7645710703819202e-05, + "loss": 2.1395, + "step": 24200 + }, + { + "epoch": 0.4462706367924528, + "grad_norm": 2.890625, + "learning_rate": 1.7641978436734732e-05, + "loss": 2.0883, + "step": 24220 + }, + { + "epoch": 0.44663915094339623, + "grad_norm": 3.03125, + "learning_rate": 1.7638243608977225e-05, + "loss": 2.0543, + "step": 24240 + }, + { + "epoch": 0.44700766509433965, + "grad_norm": 2.84375, + "learning_rate": 1.7634506221798144e-05, + "loss": 2.1409, + "step": 24260 + }, + { + "epoch": 0.447376179245283, + "grad_norm": 3.15625, + "learning_rate": 1.7630766276449813e-05, + "loss": 2.1232, + "step": 24280 + }, + { + "epoch": 0.4477446933962264, + "grad_norm": 3.0625, + "learning_rate": 1.7627023774185412e-05, + "loss": 2.1183, + "step": 24300 + }, + { + "epoch": 0.4481132075471698, + "grad_norm": 3.109375, + "learning_rate": 1.762327871625898e-05, + "loss": 2.1403, + "step": 24320 + }, + { + "epoch": 0.44848172169811323, + "grad_norm": 3.078125, + "learning_rate": 1.761953110392541e-05, + "loss": 2.1545, + "step": 24340 + }, + { + "epoch": 0.4488502358490566, + "grad_norm": 2.984375, + "learning_rate": 1.761578093844045e-05, + "loss": 2.1455, + "step": 24360 + }, + { + "epoch": 0.44921875, + "grad_norm": 3.109375, + "learning_rate": 1.7612028221060705e-05, + "loss": 2.1115, + "step": 24380 + }, + { + "epoch": 0.4495872641509434, + "grad_norm": 2.890625, + "learning_rate": 1.7608272953043636e-05, + "loss": 2.1404, + "step": 24400 + }, + { + "epoch": 0.44995577830188677, + "grad_norm": 3.0625, + "learning_rate": 1.760451513564756e-05, + "loss": 2.1152, + "step": 24420 + }, + { + "epoch": 0.4503242924528302, + "grad_norm": 2.859375, + "learning_rate": 1.760075477013164e-05, + "loss": 2.0829, + "step": 24440 + }, + { + "epoch": 0.4506928066037736, + "grad_norm": 3.328125, + "learning_rate": 1.7596991857755903e-05, + "loss": 2.1456, + "step": 24460 + }, + { + "epoch": 0.451061320754717, + "grad_norm": 3.09375, + "learning_rate": 1.7593226399781222e-05, + "loss": 2.1525, + "step": 24480 + }, + { + "epoch": 0.45142983490566035, + "grad_norm": 3.390625, + "learning_rate": 1.7589458397469333e-05, + "loss": 2.1396, + "step": 24500 + }, + { + "epoch": 0.45179834905660377, + "grad_norm": 2.953125, + "learning_rate": 1.758568785208281e-05, + "loss": 2.1288, + "step": 24520 + }, + { + "epoch": 0.4521668632075472, + "grad_norm": 3.109375, + "learning_rate": 1.758191476488509e-05, + "loss": 2.1443, + "step": 24540 + }, + { + "epoch": 0.4525353773584906, + "grad_norm": 3.15625, + "learning_rate": 1.7578139137140455e-05, + "loss": 2.1629, + "step": 24560 + }, + { + "epoch": 0.45290389150943394, + "grad_norm": 2.859375, + "learning_rate": 1.757436097011405e-05, + "loss": 2.1198, + "step": 24580 + }, + { + "epoch": 0.45327240566037735, + "grad_norm": 3.25, + "learning_rate": 1.7570580265071855e-05, + "loss": 2.1392, + "step": 24600 + }, + { + "epoch": 0.45364091981132076, + "grad_norm": 3.375, + "learning_rate": 1.7566797023280712e-05, + "loss": 2.1132, + "step": 24620 + }, + { + "epoch": 0.4540094339622642, + "grad_norm": 3.015625, + "learning_rate": 1.7563011246008307e-05, + "loss": 2.133, + "step": 24640 + }, + { + "epoch": 0.45437794811320753, + "grad_norm": 3.0625, + "learning_rate": 1.7559222934523177e-05, + "loss": 2.0993, + "step": 24660 + }, + { + "epoch": 0.45474646226415094, + "grad_norm": 3.34375, + "learning_rate": 1.7555432090094716e-05, + "loss": 2.1368, + "step": 24680 + }, + { + "epoch": 0.45511497641509435, + "grad_norm": 3.0625, + "learning_rate": 1.7551638713993145e-05, + "loss": 2.1112, + "step": 24700 + }, + { + "epoch": 0.45548349056603776, + "grad_norm": 3.15625, + "learning_rate": 1.7547842807489566e-05, + "loss": 2.0985, + "step": 24720 + }, + { + "epoch": 0.4558520047169811, + "grad_norm": 2.890625, + "learning_rate": 1.7544044371855897e-05, + "loss": 2.1309, + "step": 24740 + }, + { + "epoch": 0.45622051886792453, + "grad_norm": 3.0, + "learning_rate": 1.7540243408364924e-05, + "loss": 2.1224, + "step": 24760 + }, + { + "epoch": 0.45658903301886794, + "grad_norm": 3.078125, + "learning_rate": 1.7536439918290272e-05, + "loss": 2.1435, + "step": 24780 + }, + { + "epoch": 0.4569575471698113, + "grad_norm": 2.984375, + "learning_rate": 1.7532633902906414e-05, + "loss": 2.1285, + "step": 24800 + }, + { + "epoch": 0.4573260613207547, + "grad_norm": 3.703125, + "learning_rate": 1.7528825363488664e-05, + "loss": 2.0995, + "step": 24820 + }, + { + "epoch": 0.4576945754716981, + "grad_norm": 3.578125, + "learning_rate": 1.752501430131319e-05, + "loss": 2.1435, + "step": 24840 + }, + { + "epoch": 0.45806308962264153, + "grad_norm": 3.390625, + "learning_rate": 1.752120071765701e-05, + "loss": 2.1642, + "step": 24860 + }, + { + "epoch": 0.4584316037735849, + "grad_norm": 3.21875, + "learning_rate": 1.751738461379797e-05, + "loss": 2.1628, + "step": 24880 + }, + { + "epoch": 0.4588001179245283, + "grad_norm": 3.1875, + "learning_rate": 1.751356599101477e-05, + "loss": 2.1578, + "step": 24900 + }, + { + "epoch": 0.4591686320754717, + "grad_norm": 3.0, + "learning_rate": 1.750974485058696e-05, + "loss": 2.0953, + "step": 24920 + }, + { + "epoch": 0.4595371462264151, + "grad_norm": 3.046875, + "learning_rate": 1.7505921193794916e-05, + "loss": 2.1533, + "step": 24940 + }, + { + "epoch": 0.45990566037735847, + "grad_norm": 3.0, + "learning_rate": 1.750209502191988e-05, + "loss": 2.1063, + "step": 24960 + }, + { + "epoch": 0.4602741745283019, + "grad_norm": 2.953125, + "learning_rate": 1.749826633624392e-05, + "loss": 2.1202, + "step": 24980 + }, + { + "epoch": 0.4606426886792453, + "grad_norm": 3.09375, + "learning_rate": 1.749443513804995e-05, + "loss": 2.1018, + "step": 25000 + }, + { + "epoch": 0.4610112028301887, + "grad_norm": 3.0, + "learning_rate": 1.749060142862173e-05, + "loss": 2.1387, + "step": 25020 + }, + { + "epoch": 0.46137971698113206, + "grad_norm": 3.125, + "learning_rate": 1.7486765209243858e-05, + "loss": 2.1327, + "step": 25040 + }, + { + "epoch": 0.46174823113207547, + "grad_norm": 2.984375, + "learning_rate": 1.7482926481201775e-05, + "loss": 2.1339, + "step": 25060 + }, + { + "epoch": 0.4621167452830189, + "grad_norm": 2.921875, + "learning_rate": 1.7479085245781754e-05, + "loss": 2.0826, + "step": 25080 + }, + { + "epoch": 0.46248525943396224, + "grad_norm": 3.28125, + "learning_rate": 1.747524150427093e-05, + "loss": 2.1175, + "step": 25100 + }, + { + "epoch": 0.46285377358490565, + "grad_norm": 3.265625, + "learning_rate": 1.747139525795725e-05, + "loss": 2.1519, + "step": 25120 + }, + { + "epoch": 0.46322228773584906, + "grad_norm": 3.3125, + "learning_rate": 1.746754650812952e-05, + "loss": 2.137, + "step": 25140 + }, + { + "epoch": 0.46359080188679247, + "grad_norm": 3.25, + "learning_rate": 1.7463695256077376e-05, + "loss": 2.1531, + "step": 25160 + }, + { + "epoch": 0.4639593160377358, + "grad_norm": 3.0625, + "learning_rate": 1.7459841503091296e-05, + "loss": 2.1473, + "step": 25180 + }, + { + "epoch": 0.46432783018867924, + "grad_norm": 3.296875, + "learning_rate": 1.7455985250462597e-05, + "loss": 2.1023, + "step": 25200 + }, + { + "epoch": 0.46469634433962265, + "grad_norm": 3.4375, + "learning_rate": 1.745212649948343e-05, + "loss": 2.1229, + "step": 25220 + }, + { + "epoch": 0.46506485849056606, + "grad_norm": 3.140625, + "learning_rate": 1.7448265251446783e-05, + "loss": 2.1078, + "step": 25240 + }, + { + "epoch": 0.4654333726415094, + "grad_norm": 3.203125, + "learning_rate": 1.7444401507646488e-05, + "loss": 2.0845, + "step": 25260 + }, + { + "epoch": 0.4658018867924528, + "grad_norm": 3.109375, + "learning_rate": 1.7440535269377198e-05, + "loss": 2.1378, + "step": 25280 + }, + { + "epoch": 0.46617040094339623, + "grad_norm": 3.09375, + "learning_rate": 1.743666653793442e-05, + "loss": 2.1199, + "step": 25300 + }, + { + "epoch": 0.46653891509433965, + "grad_norm": 2.9375, + "learning_rate": 1.7432795314614484e-05, + "loss": 2.0979, + "step": 25320 + }, + { + "epoch": 0.466907429245283, + "grad_norm": 3.234375, + "learning_rate": 1.7428921600714566e-05, + "loss": 2.1255, + "step": 25340 + }, + { + "epoch": 0.4672759433962264, + "grad_norm": 3.03125, + "learning_rate": 1.7425045397532657e-05, + "loss": 2.0863, + "step": 25360 + }, + { + "epoch": 0.4676444575471698, + "grad_norm": 3.09375, + "learning_rate": 1.7421166706367604e-05, + "loss": 2.1003, + "step": 25380 + }, + { + "epoch": 0.46801297169811323, + "grad_norm": 3.171875, + "learning_rate": 1.7417285528519078e-05, + "loss": 2.1553, + "step": 25400 + }, + { + "epoch": 0.4683814858490566, + "grad_norm": 3.46875, + "learning_rate": 1.7413401865287576e-05, + "loss": 2.1474, + "step": 25420 + }, + { + "epoch": 0.46875, + "grad_norm": 3.296875, + "learning_rate": 1.740951571797444e-05, + "loss": 2.1382, + "step": 25440 + }, + { + "epoch": 0.4691185141509434, + "grad_norm": 2.984375, + "learning_rate": 1.7405627087881846e-05, + "loss": 2.1027, + "step": 25460 + }, + { + "epoch": 0.46948702830188677, + "grad_norm": 3.125, + "learning_rate": 1.7401735976312786e-05, + "loss": 2.1001, + "step": 25480 + }, + { + "epoch": 0.4698555424528302, + "grad_norm": 3.046875, + "learning_rate": 1.7397842384571098e-05, + "loss": 2.121, + "step": 25500 + }, + { + "epoch": 0.4702240566037736, + "grad_norm": 3.359375, + "learning_rate": 1.7393946313961444e-05, + "loss": 2.1108, + "step": 25520 + }, + { + "epoch": 0.470592570754717, + "grad_norm": 3.34375, + "learning_rate": 1.739004776578932e-05, + "loss": 2.0882, + "step": 25540 + }, + { + "epoch": 0.47096108490566035, + "grad_norm": 2.859375, + "learning_rate": 1.738614674136105e-05, + "loss": 2.142, + "step": 25560 + }, + { + "epoch": 0.47132959905660377, + "grad_norm": 3.078125, + "learning_rate": 1.7382243241983793e-05, + "loss": 2.0937, + "step": 25580 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 3.359375, + "learning_rate": 1.7378337268965525e-05, + "loss": 2.1117, + "step": 25600 + }, + { + "epoch": 0.4720666273584906, + "grad_norm": 3.265625, + "learning_rate": 1.7374428823615065e-05, + "loss": 2.1576, + "step": 25620 + }, + { + "epoch": 0.47243514150943394, + "grad_norm": 2.96875, + "learning_rate": 1.7370517907242054e-05, + "loss": 2.0894, + "step": 25640 + }, + { + "epoch": 0.47280365566037735, + "grad_norm": 2.828125, + "learning_rate": 1.7366604521156958e-05, + "loss": 2.1266, + "step": 25660 + }, + { + "epoch": 0.47317216981132076, + "grad_norm": 3.28125, + "learning_rate": 1.7362688666671077e-05, + "loss": 2.0865, + "step": 25680 + }, + { + "epoch": 0.4735406839622642, + "grad_norm": 3.46875, + "learning_rate": 1.7358770345096536e-05, + "loss": 2.1112, + "step": 25700 + }, + { + "epoch": 0.47390919811320753, + "grad_norm": 3.078125, + "learning_rate": 1.7354849557746275e-05, + "loss": 2.0876, + "step": 25720 + }, + { + "epoch": 0.47427771226415094, + "grad_norm": 3.3125, + "learning_rate": 1.7350926305934085e-05, + "loss": 2.1161, + "step": 25740 + }, + { + "epoch": 0.47464622641509435, + "grad_norm": 3.0, + "learning_rate": 1.7347000590974564e-05, + "loss": 2.1218, + "step": 25760 + }, + { + "epoch": 0.47501474056603776, + "grad_norm": 3.59375, + "learning_rate": 1.7343072414183136e-05, + "loss": 2.1245, + "step": 25780 + }, + { + "epoch": 0.4753832547169811, + "grad_norm": 3.34375, + "learning_rate": 1.733914177687606e-05, + "loss": 2.1292, + "step": 25800 + }, + { + "epoch": 0.47575176886792453, + "grad_norm": 3.484375, + "learning_rate": 1.733520868037041e-05, + "loss": 2.1409, + "step": 25820 + }, + { + "epoch": 0.47612028301886794, + "grad_norm": 3.078125, + "learning_rate": 1.7331273125984086e-05, + "loss": 2.1766, + "step": 25840 + }, + { + "epoch": 0.4764887971698113, + "grad_norm": 2.859375, + "learning_rate": 1.7327335115035814e-05, + "loss": 2.078, + "step": 25860 + }, + { + "epoch": 0.4768573113207547, + "grad_norm": 3.140625, + "learning_rate": 1.7323394648845145e-05, + "loss": 2.1016, + "step": 25880 + }, + { + "epoch": 0.4772258254716981, + "grad_norm": 2.9375, + "learning_rate": 1.7319451728732448e-05, + "loss": 2.1116, + "step": 25900 + }, + { + "epoch": 0.47759433962264153, + "grad_norm": 3.65625, + "learning_rate": 1.7315506356018912e-05, + "loss": 2.1183, + "step": 25920 + }, + { + "epoch": 0.4779628537735849, + "grad_norm": 4.03125, + "learning_rate": 1.731155853202656e-05, + "loss": 2.1163, + "step": 25940 + }, + { + "epoch": 0.4783313679245283, + "grad_norm": 3.453125, + "learning_rate": 1.730760825807822e-05, + "loss": 2.1244, + "step": 25960 + }, + { + "epoch": 0.4786998820754717, + "grad_norm": 2.9375, + "learning_rate": 1.7303655535497555e-05, + "loss": 2.1521, + "step": 25980 + }, + { + "epoch": 0.4790683962264151, + "grad_norm": 3.078125, + "learning_rate": 1.729970036560904e-05, + "loss": 2.151, + "step": 26000 + }, + { + "epoch": 0.47943691037735847, + "grad_norm": 3.140625, + "learning_rate": 1.7295742749737975e-05, + "loss": 2.1323, + "step": 26020 + }, + { + "epoch": 0.4798054245283019, + "grad_norm": 3.984375, + "learning_rate": 1.7291782689210475e-05, + "loss": 2.1302, + "step": 26040 + }, + { + "epoch": 0.4801739386792453, + "grad_norm": 3.109375, + "learning_rate": 1.7287820185353478e-05, + "loss": 2.1194, + "step": 26060 + }, + { + "epoch": 0.4805424528301887, + "grad_norm": 3.5, + "learning_rate": 1.728385523949474e-05, + "loss": 2.1525, + "step": 26080 + }, + { + "epoch": 0.48091096698113206, + "grad_norm": 3.3125, + "learning_rate": 1.727988785296283e-05, + "loss": 2.094, + "step": 26100 + }, + { + "epoch": 0.48127948113207547, + "grad_norm": 3.09375, + "learning_rate": 1.7275918027087146e-05, + "loss": 2.1166, + "step": 26120 + }, + { + "epoch": 0.4816479952830189, + "grad_norm": 2.984375, + "learning_rate": 1.727194576319789e-05, + "loss": 2.1332, + "step": 26140 + }, + { + "epoch": 0.48201650943396224, + "grad_norm": 3.046875, + "learning_rate": 1.7267971062626095e-05, + "loss": 2.1741, + "step": 26160 + }, + { + "epoch": 0.48238502358490565, + "grad_norm": 3.46875, + "learning_rate": 1.7263993926703593e-05, + "loss": 2.1613, + "step": 26180 + }, + { + "epoch": 0.48275353773584906, + "grad_norm": 2.8125, + "learning_rate": 1.726001435676305e-05, + "loss": 2.1198, + "step": 26200 + }, + { + "epoch": 0.48312205188679247, + "grad_norm": 3.1875, + "learning_rate": 1.725603235413794e-05, + "loss": 2.1052, + "step": 26220 + }, + { + "epoch": 0.4834905660377358, + "grad_norm": 3.296875, + "learning_rate": 1.7252047920162552e-05, + "loss": 2.1504, + "step": 26240 + }, + { + "epoch": 0.48385908018867924, + "grad_norm": 3.203125, + "learning_rate": 1.7248061056171984e-05, + "loss": 2.1091, + "step": 26260 + }, + { + "epoch": 0.48422759433962265, + "grad_norm": 3.015625, + "learning_rate": 1.7244071763502157e-05, + "loss": 2.1197, + "step": 26280 + }, + { + "epoch": 0.48459610849056606, + "grad_norm": 3.0, + "learning_rate": 1.72400800434898e-05, + "loss": 2.1539, + "step": 26300 + }, + { + "epoch": 0.4849646226415094, + "grad_norm": 3.375, + "learning_rate": 1.7236085897472463e-05, + "loss": 2.1292, + "step": 26320 + }, + { + "epoch": 0.4853331367924528, + "grad_norm": 3.28125, + "learning_rate": 1.7232089326788503e-05, + "loss": 2.1028, + "step": 26340 + }, + { + "epoch": 0.48570165094339623, + "grad_norm": 3.140625, + "learning_rate": 1.7228090332777088e-05, + "loss": 2.0949, + "step": 26360 + }, + { + "epoch": 0.48607016509433965, + "grad_norm": 3.578125, + "learning_rate": 1.72240889167782e-05, + "loss": 2.1299, + "step": 26380 + }, + { + "epoch": 0.486438679245283, + "grad_norm": 3.296875, + "learning_rate": 1.7220085080132634e-05, + "loss": 2.1269, + "step": 26400 + }, + { + "epoch": 0.4868071933962264, + "grad_norm": 3.1875, + "learning_rate": 1.7216078824181997e-05, + "loss": 2.1192, + "step": 26420 + }, + { + "epoch": 0.4871757075471698, + "grad_norm": 2.953125, + "learning_rate": 1.7212070150268702e-05, + "loss": 2.145, + "step": 26440 + }, + { + "epoch": 0.48754422169811323, + "grad_norm": 2.953125, + "learning_rate": 1.7208059059735975e-05, + "loss": 2.1215, + "step": 26460 + }, + { + "epoch": 0.4879127358490566, + "grad_norm": 3.3125, + "learning_rate": 1.720404555392785e-05, + "loss": 2.11, + "step": 26480 + }, + { + "epoch": 0.48828125, + "grad_norm": 2.8125, + "learning_rate": 1.720002963418918e-05, + "loss": 2.1165, + "step": 26500 + }, + { + "epoch": 0.4886497641509434, + "grad_norm": 3.375, + "learning_rate": 1.7196011301865608e-05, + "loss": 2.128, + "step": 26520 + }, + { + "epoch": 0.48901827830188677, + "grad_norm": 3.8125, + "learning_rate": 1.7191990558303607e-05, + "loss": 2.1379, + "step": 26540 + }, + { + "epoch": 0.4893867924528302, + "grad_norm": 2.96875, + "learning_rate": 1.7187967404850434e-05, + "loss": 2.1048, + "step": 26560 + }, + { + "epoch": 0.4897553066037736, + "grad_norm": 3.171875, + "learning_rate": 1.718394184285418e-05, + "loss": 2.137, + "step": 26580 + }, + { + "epoch": 0.490123820754717, + "grad_norm": 3.25, + "learning_rate": 1.717991387366372e-05, + "loss": 2.09, + "step": 26600 + }, + { + "epoch": 0.49049233490566035, + "grad_norm": 3.09375, + "learning_rate": 1.717588349862875e-05, + "loss": 2.135, + "step": 26620 + }, + { + "epoch": 0.49086084905660377, + "grad_norm": 3.015625, + "learning_rate": 1.7171850719099768e-05, + "loss": 2.101, + "step": 26640 + }, + { + "epoch": 0.4912293632075472, + "grad_norm": 3.03125, + "learning_rate": 1.7167815536428072e-05, + "loss": 2.0965, + "step": 26660 + }, + { + "epoch": 0.4915978773584906, + "grad_norm": 3.125, + "learning_rate": 1.716377795196578e-05, + "loss": 2.1131, + "step": 26680 + }, + { + "epoch": 0.49196639150943394, + "grad_norm": 3.53125, + "learning_rate": 1.7159737967065795e-05, + "loss": 2.1014, + "step": 26700 + }, + { + "epoch": 0.49233490566037735, + "grad_norm": 2.875, + "learning_rate": 1.715569558308184e-05, + "loss": 2.074, + "step": 26720 + }, + { + "epoch": 0.49270341981132076, + "grad_norm": 2.921875, + "learning_rate": 1.715165080136844e-05, + "loss": 2.1504, + "step": 26740 + }, + { + "epoch": 0.4930719339622642, + "grad_norm": 3.171875, + "learning_rate": 1.714760362328091e-05, + "loss": 2.1179, + "step": 26760 + }, + { + "epoch": 0.49344044811320753, + "grad_norm": 3.25, + "learning_rate": 1.7143554050175386e-05, + "loss": 2.0793, + "step": 26780 + }, + { + "epoch": 0.49380896226415094, + "grad_norm": 3.09375, + "learning_rate": 1.7139502083408794e-05, + "loss": 2.1105, + "step": 26800 + }, + { + "epoch": 0.49417747641509435, + "grad_norm": 3.3125, + "learning_rate": 1.713544772433887e-05, + "loss": 2.1033, + "step": 26820 + }, + { + "epoch": 0.49454599056603776, + "grad_norm": 3.359375, + "learning_rate": 1.7131390974324143e-05, + "loss": 2.1311, + "step": 26840 + }, + { + "epoch": 0.4949145047169811, + "grad_norm": 3.265625, + "learning_rate": 1.7127331834723952e-05, + "loss": 2.1812, + "step": 26860 + }, + { + "epoch": 0.49528301886792453, + "grad_norm": 3.46875, + "learning_rate": 1.7123270306898435e-05, + "loss": 2.1726, + "step": 26880 + }, + { + "epoch": 0.49565153301886794, + "grad_norm": 3.28125, + "learning_rate": 1.7119206392208527e-05, + "loss": 2.1126, + "step": 26900 + }, + { + "epoch": 0.4960200471698113, + "grad_norm": 2.9375, + "learning_rate": 1.7115140092015955e-05, + "loss": 2.0991, + "step": 26920 + }, + { + "epoch": 0.4963885613207547, + "grad_norm": 3.09375, + "learning_rate": 1.7111071407683266e-05, + "loss": 2.1499, + "step": 26940 + }, + { + "epoch": 0.4967570754716981, + "grad_norm": 3.140625, + "learning_rate": 1.710700034057379e-05, + "loss": 2.1414, + "step": 26960 + }, + { + "epoch": 0.49712558962264153, + "grad_norm": 3.0625, + "learning_rate": 1.710292689205166e-05, + "loss": 2.1106, + "step": 26980 + }, + { + "epoch": 0.4974941037735849, + "grad_norm": 3.15625, + "learning_rate": 1.7098851063481805e-05, + "loss": 2.0956, + "step": 27000 + }, + { + "epoch": 0.4978626179245283, + "grad_norm": 3.0625, + "learning_rate": 1.7094772856229954e-05, + "loss": 2.1349, + "step": 27020 + }, + { + "epoch": 0.4982311320754717, + "grad_norm": 2.984375, + "learning_rate": 1.7090692271662633e-05, + "loss": 2.1225, + "step": 27040 + }, + { + "epoch": 0.4985996462264151, + "grad_norm": 3.078125, + "learning_rate": 1.708660931114716e-05, + "loss": 2.0913, + "step": 27060 + }, + { + "epoch": 0.49896816037735847, + "grad_norm": 3.640625, + "learning_rate": 1.7082523976051658e-05, + "loss": 2.0946, + "step": 27080 + }, + { + "epoch": 0.4993366745283019, + "grad_norm": 3.125, + "learning_rate": 1.7078436267745035e-05, + "loss": 2.1417, + "step": 27100 + }, + { + "epoch": 0.4997051886792453, + "grad_norm": 2.9375, + "learning_rate": 1.7074346187597008e-05, + "loss": 2.1212, + "step": 27120 + }, + { + "epoch": 0.5000737028301887, + "grad_norm": 3.6875, + "learning_rate": 1.707025373697807e-05, + "loss": 2.134, + "step": 27140 + }, + { + "epoch": 0.5004422169811321, + "grad_norm": 3.09375, + "learning_rate": 1.7066158917259525e-05, + "loss": 2.1129, + "step": 27160 + }, + { + "epoch": 0.5008107311320755, + "grad_norm": 3.296875, + "learning_rate": 1.7062061729813468e-05, + "loss": 2.125, + "step": 27180 + }, + { + "epoch": 0.5011792452830188, + "grad_norm": 3.25, + "learning_rate": 1.7057962176012773e-05, + "loss": 2.1097, + "step": 27200 + }, + { + "epoch": 0.5015477594339622, + "grad_norm": 3.5625, + "learning_rate": 1.7053860257231133e-05, + "loss": 2.1353, + "step": 27220 + }, + { + "epoch": 0.5019162735849056, + "grad_norm": 3.4375, + "learning_rate": 1.7049755974843003e-05, + "loss": 2.1239, + "step": 27240 + }, + { + "epoch": 0.5022847877358491, + "grad_norm": 3.0625, + "learning_rate": 1.7045649330223657e-05, + "loss": 2.1204, + "step": 27260 + }, + { + "epoch": 0.5026533018867925, + "grad_norm": 3.03125, + "learning_rate": 1.704154032474914e-05, + "loss": 2.1124, + "step": 27280 + }, + { + "epoch": 0.5030218160377359, + "grad_norm": 2.796875, + "learning_rate": 1.7037428959796303e-05, + "loss": 2.0783, + "step": 27300 + }, + { + "epoch": 0.5033903301886793, + "grad_norm": 3.640625, + "learning_rate": 1.7033315236742777e-05, + "loss": 2.0785, + "step": 27320 + }, + { + "epoch": 0.5037588443396226, + "grad_norm": 3.03125, + "learning_rate": 1.7029199156966996e-05, + "loss": 2.1203, + "step": 27340 + }, + { + "epoch": 0.504127358490566, + "grad_norm": 3.75, + "learning_rate": 1.7025080721848165e-05, + "loss": 2.1039, + "step": 27360 + }, + { + "epoch": 0.5044958726415094, + "grad_norm": 3.234375, + "learning_rate": 1.7020959932766295e-05, + "loss": 2.1441, + "step": 27380 + }, + { + "epoch": 0.5048643867924528, + "grad_norm": 3.203125, + "learning_rate": 1.7016836791102177e-05, + "loss": 2.1242, + "step": 27400 + }, + { + "epoch": 0.5052329009433962, + "grad_norm": 3.484375, + "learning_rate": 1.7012711298237398e-05, + "loss": 2.0798, + "step": 27420 + }, + { + "epoch": 0.5056014150943396, + "grad_norm": 3.078125, + "learning_rate": 1.7008583455554318e-05, + "loss": 2.0915, + "step": 27440 + }, + { + "epoch": 0.5059699292452831, + "grad_norm": 3.390625, + "learning_rate": 1.7004453264436098e-05, + "loss": 2.1192, + "step": 27460 + }, + { + "epoch": 0.5063384433962265, + "grad_norm": 3.375, + "learning_rate": 1.7000320726266688e-05, + "loss": 2.1011, + "step": 27480 + }, + { + "epoch": 0.5067069575471698, + "grad_norm": 2.84375, + "learning_rate": 1.699618584243081e-05, + "loss": 2.1148, + "step": 27500 + }, + { + "epoch": 0.5070754716981132, + "grad_norm": 3.25, + "learning_rate": 1.6992048614313986e-05, + "loss": 2.1073, + "step": 27520 + }, + { + "epoch": 0.5074439858490566, + "grad_norm": 2.96875, + "learning_rate": 1.698790904330251e-05, + "loss": 2.0886, + "step": 27540 + }, + { + "epoch": 0.5078125, + "grad_norm": 3.359375, + "learning_rate": 1.698376713078348e-05, + "loss": 2.1294, + "step": 27560 + }, + { + "epoch": 0.5081810141509434, + "grad_norm": 3.1875, + "learning_rate": 1.697962287814476e-05, + "loss": 2.0925, + "step": 27580 + }, + { + "epoch": 0.5085495283018868, + "grad_norm": 3.375, + "learning_rate": 1.6975476286775006e-05, + "loss": 2.1111, + "step": 27600 + }, + { + "epoch": 0.5089180424528302, + "grad_norm": 3.0625, + "learning_rate": 1.697132735806366e-05, + "loss": 2.1074, + "step": 27620 + }, + { + "epoch": 0.5092865566037735, + "grad_norm": 3.296875, + "learning_rate": 1.6967176093400944e-05, + "loss": 2.1538, + "step": 27640 + }, + { + "epoch": 0.5096550707547169, + "grad_norm": 3.53125, + "learning_rate": 1.696302249417786e-05, + "loss": 2.1191, + "step": 27660 + }, + { + "epoch": 0.5100235849056604, + "grad_norm": 3.015625, + "learning_rate": 1.6958866561786198e-05, + "loss": 2.1381, + "step": 27680 + }, + { + "epoch": 0.5103920990566038, + "grad_norm": 3.4375, + "learning_rate": 1.6954708297618525e-05, + "loss": 2.1281, + "step": 27700 + }, + { + "epoch": 0.5107606132075472, + "grad_norm": 3.046875, + "learning_rate": 1.6950547703068197e-05, + "loss": 2.1153, + "step": 27720 + }, + { + "epoch": 0.5111291273584906, + "grad_norm": 3.15625, + "learning_rate": 1.6946384779529338e-05, + "loss": 2.141, + "step": 27740 + }, + { + "epoch": 0.511497641509434, + "grad_norm": 3.34375, + "learning_rate": 1.6942219528396866e-05, + "loss": 2.1423, + "step": 27760 + }, + { + "epoch": 0.5118661556603774, + "grad_norm": 3.390625, + "learning_rate": 1.6938051951066468e-05, + "loss": 2.0736, + "step": 27780 + }, + { + "epoch": 0.5122346698113207, + "grad_norm": 3.328125, + "learning_rate": 1.693388204893462e-05, + "loss": 2.1011, + "step": 27800 + }, + { + "epoch": 0.5126031839622641, + "grad_norm": 2.890625, + "learning_rate": 1.692970982339857e-05, + "loss": 2.1192, + "step": 27820 + }, + { + "epoch": 0.5129716981132075, + "grad_norm": 3.15625, + "learning_rate": 1.6925535275856342e-05, + "loss": 2.0992, + "step": 27840 + }, + { + "epoch": 0.5133402122641509, + "grad_norm": 3.296875, + "learning_rate": 1.6921358407706748e-05, + "loss": 2.1022, + "step": 27860 + }, + { + "epoch": 0.5137087264150944, + "grad_norm": 3.390625, + "learning_rate": 1.6917179220349377e-05, + "loss": 2.1182, + "step": 27880 + }, + { + "epoch": 0.5140772405660378, + "grad_norm": 3.234375, + "learning_rate": 1.6912997715184582e-05, + "loss": 2.1275, + "step": 27900 + }, + { + "epoch": 0.5144457547169812, + "grad_norm": 3.046875, + "learning_rate": 1.6908813893613504e-05, + "loss": 2.1185, + "step": 27920 + }, + { + "epoch": 0.5148142688679245, + "grad_norm": 3.9375, + "learning_rate": 1.690462775703806e-05, + "loss": 2.1168, + "step": 27940 + }, + { + "epoch": 0.5151827830188679, + "grad_norm": 3.359375, + "learning_rate": 1.6900439306860933e-05, + "loss": 2.1472, + "step": 27960 + }, + { + "epoch": 0.5155512971698113, + "grad_norm": 3.09375, + "learning_rate": 1.6896248544485596e-05, + "loss": 2.1059, + "step": 27980 + }, + { + "epoch": 0.5159198113207547, + "grad_norm": 3.234375, + "learning_rate": 1.6892055471316285e-05, + "loss": 2.1059, + "step": 28000 + }, + { + "epoch": 0.5162883254716981, + "grad_norm": 3.484375, + "learning_rate": 1.6887860088758016e-05, + "loss": 2.1528, + "step": 28020 + }, + { + "epoch": 0.5166568396226415, + "grad_norm": 3.265625, + "learning_rate": 1.6883662398216577e-05, + "loss": 2.0992, + "step": 28040 + }, + { + "epoch": 0.5170253537735849, + "grad_norm": 3.015625, + "learning_rate": 1.6879462401098526e-05, + "loss": 2.1459, + "step": 28060 + }, + { + "epoch": 0.5173938679245284, + "grad_norm": 3.25, + "learning_rate": 1.6875260098811206e-05, + "loss": 2.1381, + "step": 28080 + }, + { + "epoch": 0.5177623820754716, + "grad_norm": 3.1875, + "learning_rate": 1.687105549276271e-05, + "loss": 2.1417, + "step": 28100 + }, + { + "epoch": 0.5181308962264151, + "grad_norm": 2.859375, + "learning_rate": 1.6866848584361936e-05, + "loss": 2.1024, + "step": 28120 + }, + { + "epoch": 0.5184994103773585, + "grad_norm": 2.875, + "learning_rate": 1.6862639375018515e-05, + "loss": 2.099, + "step": 28140 + }, + { + "epoch": 0.5188679245283019, + "grad_norm": 3.421875, + "learning_rate": 1.6858427866142883e-05, + "loss": 2.0997, + "step": 28160 + }, + { + "epoch": 0.5192364386792453, + "grad_norm": 3.140625, + "learning_rate": 1.685421405914622e-05, + "loss": 2.1347, + "step": 28180 + }, + { + "epoch": 0.5196049528301887, + "grad_norm": 3.40625, + "learning_rate": 1.6849997955440494e-05, + "loss": 2.1522, + "step": 28200 + }, + { + "epoch": 0.5199734669811321, + "grad_norm": 3.15625, + "learning_rate": 1.6845779556438437e-05, + "loss": 2.0702, + "step": 28220 + }, + { + "epoch": 0.5203419811320755, + "grad_norm": 2.8125, + "learning_rate": 1.6841558863553546e-05, + "loss": 2.1233, + "step": 28240 + }, + { + "epoch": 0.5207104952830188, + "grad_norm": 3.109375, + "learning_rate": 1.6837335878200095e-05, + "loss": 2.1357, + "step": 28260 + }, + { + "epoch": 0.5210790094339622, + "grad_norm": 3.359375, + "learning_rate": 1.6833110601793118e-05, + "loss": 2.0715, + "step": 28280 + }, + { + "epoch": 0.5214475235849056, + "grad_norm": 3.5, + "learning_rate": 1.6828883035748423e-05, + "loss": 2.1097, + "step": 28300 + }, + { + "epoch": 0.5218160377358491, + "grad_norm": 3.578125, + "learning_rate": 1.682465318148258e-05, + "loss": 2.1355, + "step": 28320 + }, + { + "epoch": 0.5221845518867925, + "grad_norm": 3.109375, + "learning_rate": 1.682042104041292e-05, + "loss": 2.1086, + "step": 28340 + }, + { + "epoch": 0.5225530660377359, + "grad_norm": 3.3125, + "learning_rate": 1.6816186613957563e-05, + "loss": 2.0917, + "step": 28360 + }, + { + "epoch": 0.5229215801886793, + "grad_norm": 2.890625, + "learning_rate": 1.6811949903535376e-05, + "loss": 2.1283, + "step": 28380 + }, + { + "epoch": 0.5232900943396226, + "grad_norm": 3.484375, + "learning_rate": 1.6807710910565987e-05, + "loss": 2.1299, + "step": 28400 + }, + { + "epoch": 0.523658608490566, + "grad_norm": 3.1875, + "learning_rate": 1.6803469636469804e-05, + "loss": 2.1105, + "step": 28420 + }, + { + "epoch": 0.5240271226415094, + "grad_norm": 3.0625, + "learning_rate": 1.679922608266799e-05, + "loss": 2.1365, + "step": 28440 + }, + { + "epoch": 0.5243956367924528, + "grad_norm": 3.59375, + "learning_rate": 1.679498025058248e-05, + "loss": 2.113, + "step": 28460 + }, + { + "epoch": 0.5247641509433962, + "grad_norm": 3.15625, + "learning_rate": 1.6790732141635963e-05, + "loss": 2.138, + "step": 28480 + }, + { + "epoch": 0.5251326650943396, + "grad_norm": 3.125, + "learning_rate": 1.6786481757251887e-05, + "loss": 2.1557, + "step": 28500 + }, + { + "epoch": 0.5255011792452831, + "grad_norm": 3.234375, + "learning_rate": 1.678222909885448e-05, + "loss": 2.1213, + "step": 28520 + }, + { + "epoch": 0.5258696933962265, + "grad_norm": 3.328125, + "learning_rate": 1.6777974167868723e-05, + "loss": 2.0666, + "step": 28540 + }, + { + "epoch": 0.5262382075471698, + "grad_norm": 3.34375, + "learning_rate": 1.677371696572035e-05, + "loss": 2.1072, + "step": 28560 + }, + { + "epoch": 0.5266067216981132, + "grad_norm": 3.046875, + "learning_rate": 1.6769457493835866e-05, + "loss": 2.1163, + "step": 28580 + }, + { + "epoch": 0.5269752358490566, + "grad_norm": 3.15625, + "learning_rate": 1.6765195753642543e-05, + "loss": 2.1088, + "step": 28600 + }, + { + "epoch": 0.52734375, + "grad_norm": 3.375, + "learning_rate": 1.6760931746568394e-05, + "loss": 2.1301, + "step": 28620 + }, + { + "epoch": 0.5277122641509434, + "grad_norm": 3.46875, + "learning_rate": 1.6756665474042205e-05, + "loss": 2.0952, + "step": 28640 + }, + { + "epoch": 0.5280807783018868, + "grad_norm": 3.578125, + "learning_rate": 1.675239693749352e-05, + "loss": 2.1082, + "step": 28660 + }, + { + "epoch": 0.5284492924528302, + "grad_norm": 3.03125, + "learning_rate": 1.6748126138352635e-05, + "loss": 2.1074, + "step": 28680 + }, + { + "epoch": 0.5288178066037735, + "grad_norm": 3.59375, + "learning_rate": 1.6743853078050612e-05, + "loss": 2.1327, + "step": 28700 + }, + { + "epoch": 0.5291863207547169, + "grad_norm": 3.5, + "learning_rate": 1.6739577758019266e-05, + "loss": 2.1211, + "step": 28720 + }, + { + "epoch": 0.5295548349056604, + "grad_norm": 3.1875, + "learning_rate": 1.6735300179691174e-05, + "loss": 2.1004, + "step": 28740 + }, + { + "epoch": 0.5299233490566038, + "grad_norm": 3.3125, + "learning_rate": 1.6731020344499664e-05, + "loss": 2.0895, + "step": 28760 + }, + { + "epoch": 0.5302918632075472, + "grad_norm": 3.109375, + "learning_rate": 1.672673825387882e-05, + "loss": 2.1301, + "step": 28780 + }, + { + "epoch": 0.5306603773584906, + "grad_norm": 3.0625, + "learning_rate": 1.6722453909263488e-05, + "loss": 2.1035, + "step": 28800 + }, + { + "epoch": 0.531028891509434, + "grad_norm": 3.28125, + "learning_rate": 1.6718167312089267e-05, + "loss": 2.114, + "step": 28820 + }, + { + "epoch": 0.5313974056603774, + "grad_norm": 3.796875, + "learning_rate": 1.6713878463792507e-05, + "loss": 2.1436, + "step": 28840 + }, + { + "epoch": 0.5317659198113207, + "grad_norm": 3.328125, + "learning_rate": 1.6709587365810312e-05, + "loss": 2.1323, + "step": 28860 + }, + { + "epoch": 0.5321344339622641, + "grad_norm": 3.5, + "learning_rate": 1.6705294019580543e-05, + "loss": 2.1156, + "step": 28880 + }, + { + "epoch": 0.5325029481132075, + "grad_norm": 3.265625, + "learning_rate": 1.670099842654182e-05, + "loss": 2.0944, + "step": 28900 + }, + { + "epoch": 0.5328714622641509, + "grad_norm": 3.4375, + "learning_rate": 1.6696700588133504e-05, + "loss": 2.0737, + "step": 28920 + }, + { + "epoch": 0.5332399764150944, + "grad_norm": 3.4375, + "learning_rate": 1.6692400505795716e-05, + "loss": 2.0919, + "step": 28940 + }, + { + "epoch": 0.5336084905660378, + "grad_norm": 3.15625, + "learning_rate": 1.6688098180969323e-05, + "loss": 2.1254, + "step": 28960 + }, + { + "epoch": 0.5339770047169812, + "grad_norm": 3.25, + "learning_rate": 1.6683793615095955e-05, + "loss": 2.1216, + "step": 28980 + }, + { + "epoch": 0.5343455188679245, + "grad_norm": 3.203125, + "learning_rate": 1.6679486809617977e-05, + "loss": 2.0596, + "step": 29000 + }, + { + "epoch": 0.5347140330188679, + "grad_norm": 3.1875, + "learning_rate": 1.667517776597852e-05, + "loss": 2.1399, + "step": 29020 + }, + { + "epoch": 0.5350825471698113, + "grad_norm": 3.296875, + "learning_rate": 1.6670866485621455e-05, + "loss": 2.124, + "step": 29040 + }, + { + "epoch": 0.5354510613207547, + "grad_norm": 3.125, + "learning_rate": 1.6666552969991407e-05, + "loss": 2.1632, + "step": 29060 + }, + { + "epoch": 0.5358195754716981, + "grad_norm": 4.0, + "learning_rate": 1.666223722053374e-05, + "loss": 2.0757, + "step": 29080 + }, + { + "epoch": 0.5361880896226415, + "grad_norm": 3.21875, + "learning_rate": 1.6657919238694583e-05, + "loss": 2.1003, + "step": 29100 + }, + { + "epoch": 0.5365566037735849, + "grad_norm": 3.25, + "learning_rate": 1.6653599025920807e-05, + "loss": 2.066, + "step": 29120 + }, + { + "epoch": 0.5369251179245284, + "grad_norm": 3.421875, + "learning_rate": 1.664927658366002e-05, + "loss": 2.1059, + "step": 29140 + }, + { + "epoch": 0.5372936320754716, + "grad_norm": 3.078125, + "learning_rate": 1.664495191336059e-05, + "loss": 2.0888, + "step": 29160 + }, + { + "epoch": 0.5376621462264151, + "grad_norm": 3.171875, + "learning_rate": 1.6640625016471626e-05, + "loss": 2.1071, + "step": 29180 + }, + { + "epoch": 0.5380306603773585, + "grad_norm": 3.125, + "learning_rate": 1.6636295894442986e-05, + "loss": 2.1389, + "step": 29200 + }, + { + "epoch": 0.5383991745283019, + "grad_norm": 3.015625, + "learning_rate": 1.6631964548725264e-05, + "loss": 2.1034, + "step": 29220 + }, + { + "epoch": 0.5387676886792453, + "grad_norm": 3.1875, + "learning_rate": 1.662763098076982e-05, + "loss": 2.1374, + "step": 29240 + }, + { + "epoch": 0.5391362028301887, + "grad_norm": 3.078125, + "learning_rate": 1.6623295192028736e-05, + "loss": 2.1081, + "step": 29260 + }, + { + "epoch": 0.5395047169811321, + "grad_norm": 3.484375, + "learning_rate": 1.6618957183954845e-05, + "loss": 2.1362, + "step": 29280 + }, + { + "epoch": 0.5398732311320755, + "grad_norm": 3.140625, + "learning_rate": 1.6614616958001733e-05, + "loss": 2.1465, + "step": 29300 + }, + { + "epoch": 0.5402417452830188, + "grad_norm": 3.078125, + "learning_rate": 1.661027451562372e-05, + "loss": 2.1049, + "step": 29320 + }, + { + "epoch": 0.5406102594339622, + "grad_norm": 3.59375, + "learning_rate": 1.660592985827587e-05, + "loss": 2.149, + "step": 29340 + }, + { + "epoch": 0.5409787735849056, + "grad_norm": 3.984375, + "learning_rate": 1.660158298741399e-05, + "loss": 2.1061, + "step": 29360 + }, + { + "epoch": 0.5413472877358491, + "grad_norm": 3.0, + "learning_rate": 1.6597233904494627e-05, + "loss": 2.1071, + "step": 29380 + }, + { + "epoch": 0.5417158018867925, + "grad_norm": 3.140625, + "learning_rate": 1.659288261097508e-05, + "loss": 2.0882, + "step": 29400 + }, + { + "epoch": 0.5420843160377359, + "grad_norm": 3.421875, + "learning_rate": 1.6588529108313367e-05, + "loss": 2.0827, + "step": 29420 + }, + { + "epoch": 0.5424528301886793, + "grad_norm": 3.140625, + "learning_rate": 1.6584173397968268e-05, + "loss": 2.1535, + "step": 29440 + }, + { + "epoch": 0.5428213443396226, + "grad_norm": 3.234375, + "learning_rate": 1.6579815481399295e-05, + "loss": 2.0688, + "step": 29460 + }, + { + "epoch": 0.543189858490566, + "grad_norm": 3.125, + "learning_rate": 1.6575455360066687e-05, + "loss": 2.1156, + "step": 29480 + }, + { + "epoch": 0.5435583726415094, + "grad_norm": 3.25, + "learning_rate": 1.6571093035431448e-05, + "loss": 2.1133, + "step": 29500 + }, + { + "epoch": 0.5439268867924528, + "grad_norm": 3.234375, + "learning_rate": 1.6566728508955293e-05, + "loss": 2.1475, + "step": 29520 + }, + { + "epoch": 0.5442954009433962, + "grad_norm": 3.21875, + "learning_rate": 1.656236178210069e-05, + "loss": 2.0925, + "step": 29540 + }, + { + "epoch": 0.5446639150943396, + "grad_norm": 3.109375, + "learning_rate": 1.6557992856330846e-05, + "loss": 2.0996, + "step": 29560 + }, + { + "epoch": 0.5450324292452831, + "grad_norm": 3.21875, + "learning_rate": 1.6553621733109693e-05, + "loss": 2.0961, + "step": 29580 + }, + { + "epoch": 0.5454009433962265, + "grad_norm": 3.59375, + "learning_rate": 1.6549248413901913e-05, + "loss": 2.1165, + "step": 29600 + }, + { + "epoch": 0.5457694575471698, + "grad_norm": 3.3125, + "learning_rate": 1.6544872900172913e-05, + "loss": 2.0771, + "step": 29620 + }, + { + "epoch": 0.5461379716981132, + "grad_norm": 3.59375, + "learning_rate": 1.654049519338884e-05, + "loss": 2.1503, + "step": 29640 + }, + { + "epoch": 0.5465064858490566, + "grad_norm": 3.25, + "learning_rate": 1.6536115295016575e-05, + "loss": 2.098, + "step": 29660 + }, + { + "epoch": 0.546875, + "grad_norm": 3.40625, + "learning_rate": 1.6531733206523736e-05, + "loss": 2.136, + "step": 29680 + }, + { + "epoch": 0.5472435141509434, + "grad_norm": 3.15625, + "learning_rate": 1.652734892937867e-05, + "loss": 2.1036, + "step": 29700 + }, + { + "epoch": 0.5476120283018868, + "grad_norm": 3.015625, + "learning_rate": 1.6522962465050463e-05, + "loss": 2.0969, + "step": 29720 + }, + { + "epoch": 0.5479805424528302, + "grad_norm": 3.3125, + "learning_rate": 1.6518573815008926e-05, + "loss": 2.1452, + "step": 29740 + }, + { + "epoch": 0.5483490566037735, + "grad_norm": 3.265625, + "learning_rate": 1.6514182980724612e-05, + "loss": 2.1045, + "step": 29760 + }, + { + "epoch": 0.5487175707547169, + "grad_norm": 3.625, + "learning_rate": 1.65097899636688e-05, + "loss": 2.096, + "step": 29780 + }, + { + "epoch": 0.5490860849056604, + "grad_norm": 3.0625, + "learning_rate": 1.65053947653135e-05, + "loss": 2.0872, + "step": 29800 + }, + { + "epoch": 0.5494545990566038, + "grad_norm": 3.75, + "learning_rate": 1.6500997387131454e-05, + "loss": 2.1158, + "step": 29820 + }, + { + "epoch": 0.5498231132075472, + "grad_norm": 3.53125, + "learning_rate": 1.6496597830596136e-05, + "loss": 2.1147, + "step": 29840 + }, + { + "epoch": 0.5501916273584906, + "grad_norm": 3.25, + "learning_rate": 1.649219609718175e-05, + "loss": 2.1235, + "step": 29860 + }, + { + "epoch": 0.550560141509434, + "grad_norm": 3.5625, + "learning_rate": 1.6487792188363227e-05, + "loss": 2.107, + "step": 29880 + }, + { + "epoch": 0.5509286556603774, + "grad_norm": 3.296875, + "learning_rate": 1.648338610561623e-05, + "loss": 2.1255, + "step": 29900 + }, + { + "epoch": 0.5512971698113207, + "grad_norm": 3.265625, + "learning_rate": 1.6478977850417145e-05, + "loss": 2.1388, + "step": 29920 + }, + { + "epoch": 0.5516656839622641, + "grad_norm": 3.5, + "learning_rate": 1.6474567424243092e-05, + "loss": 2.1388, + "step": 29940 + }, + { + "epoch": 0.5520341981132075, + "grad_norm": 3.4375, + "learning_rate": 1.6470154828571913e-05, + "loss": 2.1508, + "step": 29960 + }, + { + "epoch": 0.5524027122641509, + "grad_norm": 2.984375, + "learning_rate": 1.6465740064882185e-05, + "loss": 2.0786, + "step": 29980 + }, + { + "epoch": 0.5527712264150944, + "grad_norm": 3.140625, + "learning_rate": 1.64613231346532e-05, + "loss": 2.1553, + "step": 30000 + }, + { + "epoch": 0.5531397405660378, + "grad_norm": 3.390625, + "learning_rate": 1.645690403936499e-05, + "loss": 2.1237, + "step": 30020 + }, + { + "epoch": 0.5535082547169812, + "grad_norm": 3.234375, + "learning_rate": 1.64524827804983e-05, + "loss": 2.1196, + "step": 30040 + }, + { + "epoch": 0.5538767688679245, + "grad_norm": 3.015625, + "learning_rate": 1.6448059359534604e-05, + "loss": 2.0663, + "step": 30060 + }, + { + "epoch": 0.5542452830188679, + "grad_norm": 2.9375, + "learning_rate": 1.6443633777956102e-05, + "loss": 2.0698, + "step": 30080 + }, + { + "epoch": 0.5546137971698113, + "grad_norm": 3.359375, + "learning_rate": 1.6439206037245722e-05, + "loss": 2.0849, + "step": 30100 + }, + { + "epoch": 0.5549823113207547, + "grad_norm": 3.265625, + "learning_rate": 1.64347761388871e-05, + "loss": 2.0845, + "step": 30120 + }, + { + "epoch": 0.5553508254716981, + "grad_norm": 4.0, + "learning_rate": 1.643034408436462e-05, + "loss": 2.0833, + "step": 30140 + }, + { + "epoch": 0.5557193396226415, + "grad_norm": 3.25, + "learning_rate": 1.642590987516336e-05, + "loss": 2.1202, + "step": 30160 + }, + { + "epoch": 0.5560878537735849, + "grad_norm": 3.109375, + "learning_rate": 1.6421473512769146e-05, + "loss": 2.1402, + "step": 30180 + }, + { + "epoch": 0.5564563679245284, + "grad_norm": 3.1875, + "learning_rate": 1.6417034998668506e-05, + "loss": 2.1134, + "step": 30200 + }, + { + "epoch": 0.5568248820754716, + "grad_norm": 2.90625, + "learning_rate": 1.6412594334348698e-05, + "loss": 2.116, + "step": 30220 + }, + { + "epoch": 0.5571933962264151, + "grad_norm": 3.28125, + "learning_rate": 1.6408151521297705e-05, + "loss": 2.1117, + "step": 30240 + }, + { + "epoch": 0.5575619103773585, + "grad_norm": 3.15625, + "learning_rate": 1.6403706561004215e-05, + "loss": 2.1289, + "step": 30260 + }, + { + "epoch": 0.5579304245283019, + "grad_norm": 3.5625, + "learning_rate": 1.639925945495765e-05, + "loss": 2.0787, + "step": 30280 + }, + { + "epoch": 0.5582989386792453, + "grad_norm": 3.296875, + "learning_rate": 1.6394810204648146e-05, + "loss": 2.1112, + "step": 30300 + }, + { + "epoch": 0.5586674528301887, + "grad_norm": 3.359375, + "learning_rate": 1.6390358811566556e-05, + "loss": 2.125, + "step": 30320 + }, + { + "epoch": 0.5590359669811321, + "grad_norm": 3.390625, + "learning_rate": 1.638590527720445e-05, + "loss": 2.1344, + "step": 30340 + }, + { + "epoch": 0.5594044811320755, + "grad_norm": 3.59375, + "learning_rate": 1.6381449603054122e-05, + "loss": 2.1065, + "step": 30360 + }, + { + "epoch": 0.5597729952830188, + "grad_norm": 3.046875, + "learning_rate": 1.6376991790608577e-05, + "loss": 2.1022, + "step": 30380 + }, + { + "epoch": 0.5601415094339622, + "grad_norm": 3.53125, + "learning_rate": 1.6372531841361538e-05, + "loss": 2.113, + "step": 30400 + }, + { + "epoch": 0.5605100235849056, + "grad_norm": 3.40625, + "learning_rate": 1.636806975680744e-05, + "loss": 2.1399, + "step": 30420 + }, + { + "epoch": 0.5608785377358491, + "grad_norm": 3.171875, + "learning_rate": 1.6363605538441444e-05, + "loss": 2.0953, + "step": 30440 + }, + { + "epoch": 0.5612470518867925, + "grad_norm": 3.71875, + "learning_rate": 1.6359139187759416e-05, + "loss": 2.1264, + "step": 30460 + }, + { + "epoch": 0.5616155660377359, + "grad_norm": 3.046875, + "learning_rate": 1.635467070625794e-05, + "loss": 2.1137, + "step": 30480 + }, + { + "epoch": 0.5619840801886793, + "grad_norm": 3.1875, + "learning_rate": 1.635020009543432e-05, + "loss": 2.1346, + "step": 30500 + }, + { + "epoch": 0.5623525943396226, + "grad_norm": 3.6875, + "learning_rate": 1.634572735678656e-05, + "loss": 2.1459, + "step": 30520 + }, + { + "epoch": 0.562721108490566, + "grad_norm": 3.21875, + "learning_rate": 1.6341252491813388e-05, + "loss": 2.1499, + "step": 30540 + }, + { + "epoch": 0.5630896226415094, + "grad_norm": 3.34375, + "learning_rate": 1.6336775502014236e-05, + "loss": 2.11, + "step": 30560 + }, + { + "epoch": 0.5634581367924528, + "grad_norm": 3.28125, + "learning_rate": 1.6332296388889264e-05, + "loss": 2.12, + "step": 30580 + }, + { + "epoch": 0.5638266509433962, + "grad_norm": 3.1875, + "learning_rate": 1.6327815153939325e-05, + "loss": 2.0948, + "step": 30600 + }, + { + "epoch": 0.5641951650943396, + "grad_norm": 3.34375, + "learning_rate": 1.6323331798665992e-05, + "loss": 2.0656, + "step": 30620 + }, + { + "epoch": 0.5645636792452831, + "grad_norm": 3.078125, + "learning_rate": 1.6318846324571546e-05, + "loss": 2.1065, + "step": 30640 + }, + { + "epoch": 0.5649321933962265, + "grad_norm": 3.359375, + "learning_rate": 1.6314358733158982e-05, + "loss": 2.086, + "step": 30660 + }, + { + "epoch": 0.5653007075471698, + "grad_norm": 3.140625, + "learning_rate": 1.6309869025931995e-05, + "loss": 2.1219, + "step": 30680 + }, + { + "epoch": 0.5656692216981132, + "grad_norm": 3.140625, + "learning_rate": 1.6305377204395005e-05, + "loss": 2.1086, + "step": 30700 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 3.4375, + "learning_rate": 1.630088327005312e-05, + "loss": 2.0926, + "step": 30720 + }, + { + "epoch": 0.56640625, + "grad_norm": 3.171875, + "learning_rate": 1.629638722441218e-05, + "loss": 2.1312, + "step": 30740 + }, + { + "epoch": 0.5667747641509434, + "grad_norm": 3.359375, + "learning_rate": 1.6291889068978706e-05, + "loss": 2.1144, + "step": 30760 + }, + { + "epoch": 0.5671432783018868, + "grad_norm": 3.078125, + "learning_rate": 1.628738880525995e-05, + "loss": 2.1522, + "step": 30780 + }, + { + "epoch": 0.5675117924528302, + "grad_norm": 3.375, + "learning_rate": 1.6282886434763853e-05, + "loss": 2.0975, + "step": 30800 + }, + { + "epoch": 0.5678803066037735, + "grad_norm": 3.421875, + "learning_rate": 1.627838195899907e-05, + "loss": 2.1094, + "step": 30820 + }, + { + "epoch": 0.5682488207547169, + "grad_norm": 3.34375, + "learning_rate": 1.627387537947496e-05, + "loss": 2.1036, + "step": 30840 + }, + { + "epoch": 0.5686173349056604, + "grad_norm": 2.84375, + "learning_rate": 1.626936669770159e-05, + "loss": 2.109, + "step": 30860 + }, + { + "epoch": 0.5689858490566038, + "grad_norm": 3.125, + "learning_rate": 1.6264855915189725e-05, + "loss": 2.1405, + "step": 30880 + }, + { + "epoch": 0.5693543632075472, + "grad_norm": 3.390625, + "learning_rate": 1.626034303345084e-05, + "loss": 2.1259, + "step": 30900 + }, + { + "epoch": 0.5697228773584906, + "grad_norm": 3.203125, + "learning_rate": 1.6255828053997108e-05, + "loss": 2.1401, + "step": 30920 + }, + { + "epoch": 0.570091391509434, + "grad_norm": 3.21875, + "learning_rate": 1.625131097834141e-05, + "loss": 2.1035, + "step": 30940 + }, + { + "epoch": 0.5704599056603774, + "grad_norm": 3.3125, + "learning_rate": 1.6246791807997326e-05, + "loss": 2.0843, + "step": 30960 + }, + { + "epoch": 0.5708284198113207, + "grad_norm": 3.203125, + "learning_rate": 1.6242270544479138e-05, + "loss": 2.1549, + "step": 30980 + }, + { + "epoch": 0.5711969339622641, + "grad_norm": 3.171875, + "learning_rate": 1.6237747189301832e-05, + "loss": 2.1758, + "step": 31000 + }, + { + "epoch": 0.5715654481132075, + "grad_norm": 3.578125, + "learning_rate": 1.6233221743981092e-05, + "loss": 2.0923, + "step": 31020 + }, + { + "epoch": 0.5719339622641509, + "grad_norm": 3.09375, + "learning_rate": 1.6228694210033308e-05, + "loss": 2.1146, + "step": 31040 + }, + { + "epoch": 0.5723024764150944, + "grad_norm": 3.046875, + "learning_rate": 1.6224164588975555e-05, + "loss": 2.1167, + "step": 31060 + }, + { + "epoch": 0.5726709905660378, + "grad_norm": 3.484375, + "learning_rate": 1.621963288232563e-05, + "loss": 2.0899, + "step": 31080 + }, + { + "epoch": 0.5730395047169812, + "grad_norm": 3.546875, + "learning_rate": 1.6215099091602006e-05, + "loss": 2.1209, + "step": 31100 + }, + { + "epoch": 0.5734080188679245, + "grad_norm": 3.234375, + "learning_rate": 1.6210563218323873e-05, + "loss": 2.1065, + "step": 31120 + }, + { + "epoch": 0.5737765330188679, + "grad_norm": 3.078125, + "learning_rate": 1.6206025264011112e-05, + "loss": 2.1053, + "step": 31140 + }, + { + "epoch": 0.5741450471698113, + "grad_norm": 3.375, + "learning_rate": 1.620148523018429e-05, + "loss": 2.1133, + "step": 31160 + }, + { + "epoch": 0.5745135613207547, + "grad_norm": 2.96875, + "learning_rate": 1.6196943118364687e-05, + "loss": 2.1074, + "step": 31180 + }, + { + "epoch": 0.5748820754716981, + "grad_norm": 3.1875, + "learning_rate": 1.6192398930074274e-05, + "loss": 2.1308, + "step": 31200 + }, + { + "epoch": 0.5752505896226415, + "grad_norm": 3.046875, + "learning_rate": 1.6187852666835717e-05, + "loss": 2.0995, + "step": 31220 + }, + { + "epoch": 0.5756191037735849, + "grad_norm": 3.171875, + "learning_rate": 1.6183304330172375e-05, + "loss": 2.1354, + "step": 31240 + }, + { + "epoch": 0.5759876179245284, + "grad_norm": 3.328125, + "learning_rate": 1.6178753921608303e-05, + "loss": 2.1337, + "step": 31260 + }, + { + "epoch": 0.5763561320754716, + "grad_norm": 3.46875, + "learning_rate": 1.6174201442668256e-05, + "loss": 2.0895, + "step": 31280 + }, + { + "epoch": 0.5767246462264151, + "grad_norm": 3.3125, + "learning_rate": 1.6169646894877675e-05, + "loss": 2.1219, + "step": 31300 + }, + { + "epoch": 0.5770931603773585, + "grad_norm": 3.34375, + "learning_rate": 1.6165090279762695e-05, + "loss": 2.1344, + "step": 31320 + }, + { + "epoch": 0.5774616745283019, + "grad_norm": 3.21875, + "learning_rate": 1.616053159885015e-05, + "loss": 2.1243, + "step": 31340 + }, + { + "epoch": 0.5778301886792453, + "grad_norm": 3.140625, + "learning_rate": 1.615597085366756e-05, + "loss": 2.1082, + "step": 31360 + }, + { + "epoch": 0.5781987028301887, + "grad_norm": 3.21875, + "learning_rate": 1.615140804574314e-05, + "loss": 2.0908, + "step": 31380 + }, + { + "epoch": 0.5785672169811321, + "grad_norm": 3.53125, + "learning_rate": 1.6146843176605797e-05, + "loss": 2.1109, + "step": 31400 + }, + { + "epoch": 0.5789357311320755, + "grad_norm": 3.421875, + "learning_rate": 1.6142276247785124e-05, + "loss": 2.0969, + "step": 31420 + }, + { + "epoch": 0.5793042452830188, + "grad_norm": 3.40625, + "learning_rate": 1.6137707260811404e-05, + "loss": 2.1241, + "step": 31440 + }, + { + "epoch": 0.5796727594339622, + "grad_norm": 3.203125, + "learning_rate": 1.6133136217215617e-05, + "loss": 2.1208, + "step": 31460 + }, + { + "epoch": 0.5800412735849056, + "grad_norm": 3.171875, + "learning_rate": 1.6128563118529426e-05, + "loss": 2.096, + "step": 31480 + }, + { + "epoch": 0.5804097877358491, + "grad_norm": 3.21875, + "learning_rate": 1.6123987966285182e-05, + "loss": 2.1166, + "step": 31500 + }, + { + "epoch": 0.5807783018867925, + "grad_norm": 3.671875, + "learning_rate": 1.6119410762015933e-05, + "loss": 2.1222, + "step": 31520 + }, + { + "epoch": 0.5811468160377359, + "grad_norm": 3.625, + "learning_rate": 1.61148315072554e-05, + "loss": 2.1365, + "step": 31540 + }, + { + "epoch": 0.5815153301886793, + "grad_norm": 3.390625, + "learning_rate": 1.6110250203538006e-05, + "loss": 2.1082, + "step": 31560 + }, + { + "epoch": 0.5818838443396226, + "grad_norm": 3.390625, + "learning_rate": 1.6105666852398848e-05, + "loss": 2.1201, + "step": 31580 + }, + { + "epoch": 0.582252358490566, + "grad_norm": 3.0625, + "learning_rate": 1.6101081455373717e-05, + "loss": 2.0914, + "step": 31600 + }, + { + "epoch": 0.5826208726415094, + "grad_norm": 3.4375, + "learning_rate": 1.6096494013999087e-05, + "loss": 2.0913, + "step": 31620 + }, + { + "epoch": 0.5829893867924528, + "grad_norm": 3.21875, + "learning_rate": 1.609190452981212e-05, + "loss": 2.115, + "step": 31640 + }, + { + "epoch": 0.5833579009433962, + "grad_norm": 3.1875, + "learning_rate": 1.6087313004350652e-05, + "loss": 2.1655, + "step": 31660 + }, + { + "epoch": 0.5837264150943396, + "grad_norm": 3.296875, + "learning_rate": 1.608271943915322e-05, + "loss": 2.0728, + "step": 31680 + }, + { + "epoch": 0.5840949292452831, + "grad_norm": 3.109375, + "learning_rate": 1.607812383575903e-05, + "loss": 2.097, + "step": 31700 + }, + { + "epoch": 0.5844634433962265, + "grad_norm": 3.015625, + "learning_rate": 1.6073526195707974e-05, + "loss": 2.1056, + "step": 31720 + }, + { + "epoch": 0.5848319575471698, + "grad_norm": 3.203125, + "learning_rate": 1.6068926520540634e-05, + "loss": 2.1336, + "step": 31740 + }, + { + "epoch": 0.5852004716981132, + "grad_norm": 3.375, + "learning_rate": 1.6064324811798268e-05, + "loss": 2.0836, + "step": 31760 + }, + { + "epoch": 0.5855689858490566, + "grad_norm": 3.359375, + "learning_rate": 1.6059721071022814e-05, + "loss": 2.0948, + "step": 31780 + }, + { + "epoch": 0.5859375, + "grad_norm": 3.484375, + "learning_rate": 1.605511529975689e-05, + "loss": 2.1618, + "step": 31800 + }, + { + "epoch": 0.5863060141509434, + "grad_norm": 3.21875, + "learning_rate": 1.60505074995438e-05, + "loss": 2.078, + "step": 31820 + }, + { + "epoch": 0.5866745283018868, + "grad_norm": 3.515625, + "learning_rate": 1.6045897671927527e-05, + "loss": 2.1072, + "step": 31840 + }, + { + "epoch": 0.5870430424528302, + "grad_norm": 3.734375, + "learning_rate": 1.6041285818452728e-05, + "loss": 2.1491, + "step": 31860 + }, + { + "epoch": 0.5874115566037735, + "grad_norm": 3.09375, + "learning_rate": 1.6036671940664744e-05, + "loss": 2.0958, + "step": 31880 + }, + { + "epoch": 0.5877800707547169, + "grad_norm": 2.921875, + "learning_rate": 1.6032056040109593e-05, + "loss": 2.1387, + "step": 31900 + }, + { + "epoch": 0.5881485849056604, + "grad_norm": 3.046875, + "learning_rate": 1.6027438118333968e-05, + "loss": 2.1167, + "step": 31920 + }, + { + "epoch": 0.5885170990566038, + "grad_norm": 3.296875, + "learning_rate": 1.6022818176885244e-05, + "loss": 2.1133, + "step": 31940 + }, + { + "epoch": 0.5888856132075472, + "grad_norm": 3.21875, + "learning_rate": 1.6018196217311467e-05, + "loss": 2.1085, + "step": 31960 + }, + { + "epoch": 0.5892541273584906, + "grad_norm": 3.53125, + "learning_rate": 1.6013572241161368e-05, + "loss": 2.1096, + "step": 31980 + }, + { + "epoch": 0.589622641509434, + "grad_norm": 3.25, + "learning_rate": 1.6008946249984343e-05, + "loss": 2.1176, + "step": 32000 + }, + { + "epoch": 0.5899911556603774, + "grad_norm": 3.3125, + "learning_rate": 1.6004318245330474e-05, + "loss": 2.0892, + "step": 32020 + }, + { + "epoch": 0.5903596698113207, + "grad_norm": 3.421875, + "learning_rate": 1.5999688228750506e-05, + "loss": 2.1184, + "step": 32040 + }, + { + "epoch": 0.5907281839622641, + "grad_norm": 3.078125, + "learning_rate": 1.599505620179586e-05, + "loss": 2.125, + "step": 32060 + }, + { + "epoch": 0.5910966981132075, + "grad_norm": 3.5, + "learning_rate": 1.5990422166018653e-05, + "loss": 2.0913, + "step": 32080 + }, + { + "epoch": 0.5914652122641509, + "grad_norm": 3.1875, + "learning_rate": 1.598578612297164e-05, + "loss": 2.1402, + "step": 32100 + }, + { + "epoch": 0.5918337264150944, + "grad_norm": 2.984375, + "learning_rate": 1.5981148074208275e-05, + "loss": 2.0741, + "step": 32120 + }, + { + "epoch": 0.5922022405660378, + "grad_norm": 3.1875, + "learning_rate": 1.597650802128267e-05, + "loss": 2.1666, + "step": 32140 + }, + { + "epoch": 0.5925707547169812, + "grad_norm": 3.21875, + "learning_rate": 1.5971865965749612e-05, + "loss": 2.1096, + "step": 32160 + }, + { + "epoch": 0.5929392688679245, + "grad_norm": 3.3125, + "learning_rate": 1.5967221909164565e-05, + "loss": 2.0818, + "step": 32180 + }, + { + "epoch": 0.5933077830188679, + "grad_norm": 3.390625, + "learning_rate": 1.5962575853083656e-05, + "loss": 2.0928, + "step": 32200 + }, + { + "epoch": 0.5936762971698113, + "grad_norm": 3.671875, + "learning_rate": 1.5957927799063688e-05, + "loss": 2.1074, + "step": 32220 + }, + { + "epoch": 0.5940448113207547, + "grad_norm": 3.234375, + "learning_rate": 1.5953277748662124e-05, + "loss": 2.1507, + "step": 32240 + }, + { + "epoch": 0.5944133254716981, + "grad_norm": 3.203125, + "learning_rate": 1.5948625703437105e-05, + "loss": 2.133, + "step": 32260 + }, + { + "epoch": 0.5947818396226415, + "grad_norm": 3.078125, + "learning_rate": 1.5943971664947446e-05, + "loss": 2.0641, + "step": 32280 + }, + { + "epoch": 0.5951503537735849, + "grad_norm": 3.375, + "learning_rate": 1.593931563475261e-05, + "loss": 2.0883, + "step": 32300 + }, + { + "epoch": 0.5955188679245284, + "grad_norm": 3.640625, + "learning_rate": 1.593465761441274e-05, + "loss": 2.0975, + "step": 32320 + }, + { + "epoch": 0.5958873820754716, + "grad_norm": 3.015625, + "learning_rate": 1.592999760548865e-05, + "loss": 2.1338, + "step": 32340 + }, + { + "epoch": 0.5962558962264151, + "grad_norm": 3.578125, + "learning_rate": 1.5925335609541813e-05, + "loss": 2.0861, + "step": 32360 + }, + { + "epoch": 0.5966244103773585, + "grad_norm": 3.375, + "learning_rate": 1.5920671628134374e-05, + "loss": 2.0639, + "step": 32380 + }, + { + "epoch": 0.5969929245283019, + "grad_norm": 3.703125, + "learning_rate": 1.591600566282913e-05, + "loss": 2.1162, + "step": 32400 + }, + { + "epoch": 0.5973614386792453, + "grad_norm": 3.4375, + "learning_rate": 1.591133771518956e-05, + "loss": 2.097, + "step": 32420 + }, + { + "epoch": 0.5977299528301887, + "grad_norm": 3.296875, + "learning_rate": 1.5906667786779795e-05, + "loss": 2.1414, + "step": 32440 + }, + { + "epoch": 0.5980984669811321, + "grad_norm": 3.515625, + "learning_rate": 1.5901995879164635e-05, + "loss": 2.0951, + "step": 32460 + }, + { + "epoch": 0.5984669811320755, + "grad_norm": 3.40625, + "learning_rate": 1.5897321993909543e-05, + "loss": 2.0926, + "step": 32480 + }, + { + "epoch": 0.5988354952830188, + "grad_norm": 3.171875, + "learning_rate": 1.5892646132580646e-05, + "loss": 2.0849, + "step": 32500 + }, + { + "epoch": 0.5992040094339622, + "grad_norm": 3.25, + "learning_rate": 1.5887968296744722e-05, + "loss": 2.1281, + "step": 32520 + }, + { + "epoch": 0.5995725235849056, + "grad_norm": 3.234375, + "learning_rate": 1.5883288487969228e-05, + "loss": 2.0833, + "step": 32540 + }, + { + "epoch": 0.5999410377358491, + "grad_norm": 3.421875, + "learning_rate": 1.5878606707822273e-05, + "loss": 2.1247, + "step": 32560 + }, + { + "epoch": 0.6003095518867925, + "grad_norm": 2.71875, + "learning_rate": 1.5873922957872627e-05, + "loss": 2.1174, + "step": 32580 + }, + { + "epoch": 0.6006780660377359, + "grad_norm": 3.5, + "learning_rate": 1.5869237239689716e-05, + "loss": 2.1338, + "step": 32600 + }, + { + "epoch": 0.6010465801886793, + "grad_norm": 3.234375, + "learning_rate": 1.5864549554843634e-05, + "loss": 2.1307, + "step": 32620 + }, + { + "epoch": 0.6014150943396226, + "grad_norm": 3.546875, + "learning_rate": 1.585985990490513e-05, + "loss": 2.1102, + "step": 32640 + }, + { + "epoch": 0.601783608490566, + "grad_norm": 3.21875, + "learning_rate": 1.585516829144561e-05, + "loss": 2.0847, + "step": 32660 + }, + { + "epoch": 0.6021521226415094, + "grad_norm": 3.28125, + "learning_rate": 1.5850474716037133e-05, + "loss": 2.0907, + "step": 32680 + }, + { + "epoch": 0.6025206367924528, + "grad_norm": 3.203125, + "learning_rate": 1.584577918025243e-05, + "loss": 2.0681, + "step": 32700 + }, + { + "epoch": 0.6028891509433962, + "grad_norm": 3.21875, + "learning_rate": 1.584108168566488e-05, + "loss": 2.1142, + "step": 32720 + }, + { + "epoch": 0.6032576650943396, + "grad_norm": 3.125, + "learning_rate": 1.5836382233848514e-05, + "loss": 2.0764, + "step": 32740 + }, + { + "epoch": 0.6036261792452831, + "grad_norm": 3.21875, + "learning_rate": 1.583168082637803e-05, + "loss": 2.1118, + "step": 32760 + }, + { + "epoch": 0.6039946933962265, + "grad_norm": 3.0, + "learning_rate": 1.582697746482877e-05, + "loss": 2.1332, + "step": 32780 + }, + { + "epoch": 0.6043632075471698, + "grad_norm": 3.4375, + "learning_rate": 1.5822272150776736e-05, + "loss": 2.1052, + "step": 32800 + }, + { + "epoch": 0.6047317216981132, + "grad_norm": 2.875, + "learning_rate": 1.5817564885798583e-05, + "loss": 2.1052, + "step": 32820 + }, + { + "epoch": 0.6051002358490566, + "grad_norm": 3.140625, + "learning_rate": 1.5812855671471626e-05, + "loss": 2.1038, + "step": 32840 + }, + { + "epoch": 0.60546875, + "grad_norm": 3.453125, + "learning_rate": 1.5808144509373823e-05, + "loss": 2.1094, + "step": 32860 + }, + { + "epoch": 0.6058372641509434, + "grad_norm": 3.78125, + "learning_rate": 1.5803431401083792e-05, + "loss": 2.1275, + "step": 32880 + }, + { + "epoch": 0.6062057783018868, + "grad_norm": 3.40625, + "learning_rate": 1.57987163481808e-05, + "loss": 2.1049, + "step": 32900 + }, + { + "epoch": 0.6065742924528302, + "grad_norm": 3.453125, + "learning_rate": 1.5793999352244762e-05, + "loss": 2.09, + "step": 32920 + }, + { + "epoch": 0.6069428066037735, + "grad_norm": 3.3125, + "learning_rate": 1.5789280414856256e-05, + "loss": 2.0811, + "step": 32940 + }, + { + "epoch": 0.6073113207547169, + "grad_norm": 3.171875, + "learning_rate": 1.5784559537596496e-05, + "loss": 2.0784, + "step": 32960 + }, + { + "epoch": 0.6076798349056604, + "grad_norm": 3.296875, + "learning_rate": 1.577983672204736e-05, + "loss": 2.0816, + "step": 32980 + }, + { + "epoch": 0.6080483490566038, + "grad_norm": 3.453125, + "learning_rate": 1.577511196979136e-05, + "loss": 2.1284, + "step": 33000 + }, + { + "epoch": 0.6084168632075472, + "grad_norm": 3.765625, + "learning_rate": 1.5770385282411674e-05, + "loss": 2.0767, + "step": 33020 + }, + { + "epoch": 0.6087853773584906, + "grad_norm": 3.375, + "learning_rate": 1.576565666149211e-05, + "loss": 2.1032, + "step": 33040 + }, + { + "epoch": 0.609153891509434, + "grad_norm": 3.109375, + "learning_rate": 1.5760926108617144e-05, + "loss": 2.1236, + "step": 33060 + }, + { + "epoch": 0.6095224056603774, + "grad_norm": 3.265625, + "learning_rate": 1.575619362537188e-05, + "loss": 2.0994, + "step": 33080 + }, + { + "epoch": 0.6098909198113207, + "grad_norm": 2.828125, + "learning_rate": 1.5751459213342088e-05, + "loss": 2.0793, + "step": 33100 + }, + { + "epoch": 0.6102594339622641, + "grad_norm": 3.203125, + "learning_rate": 1.5746722874114162e-05, + "loss": 2.1406, + "step": 33120 + }, + { + "epoch": 0.6106279481132075, + "grad_norm": 3.75, + "learning_rate": 1.5741984609275162e-05, + "loss": 2.1067, + "step": 33140 + }, + { + "epoch": 0.6109964622641509, + "grad_norm": 3.40625, + "learning_rate": 1.5737244420412778e-05, + "loss": 2.1355, + "step": 33160 + }, + { + "epoch": 0.6113649764150944, + "grad_norm": 3.1875, + "learning_rate": 1.573250230911536e-05, + "loss": 2.1155, + "step": 33180 + }, + { + "epoch": 0.6117334905660378, + "grad_norm": 3.0, + "learning_rate": 1.5727758276971897e-05, + "loss": 2.0982, + "step": 33200 + }, + { + "epoch": 0.6121020047169812, + "grad_norm": 3.6875, + "learning_rate": 1.5723012325572002e-05, + "loss": 2.1193, + "step": 33220 + }, + { + "epoch": 0.6124705188679245, + "grad_norm": 3.296875, + "learning_rate": 1.5718264456505964e-05, + "loss": 2.1304, + "step": 33240 + }, + { + "epoch": 0.6128390330188679, + "grad_norm": 3.546875, + "learning_rate": 1.571351467136469e-05, + "loss": 2.1315, + "step": 33260 + }, + { + "epoch": 0.6132075471698113, + "grad_norm": 3.203125, + "learning_rate": 1.5708762971739742e-05, + "loss": 2.095, + "step": 33280 + }, + { + "epoch": 0.6135760613207547, + "grad_norm": 2.984375, + "learning_rate": 1.5704009359223314e-05, + "loss": 2.1351, + "step": 33300 + }, + { + "epoch": 0.6139445754716981, + "grad_norm": 3.21875, + "learning_rate": 1.569925383540825e-05, + "loss": 2.0973, + "step": 33320 + }, + { + "epoch": 0.6143130896226415, + "grad_norm": 3.09375, + "learning_rate": 1.5694496401888025e-05, + "loss": 2.0883, + "step": 33340 + }, + { + "epoch": 0.6146816037735849, + "grad_norm": 3.296875, + "learning_rate": 1.5689737060256763e-05, + "loss": 2.0947, + "step": 33360 + }, + { + "epoch": 0.6150501179245284, + "grad_norm": 3.21875, + "learning_rate": 1.5684975812109225e-05, + "loss": 2.1122, + "step": 33380 + }, + { + "epoch": 0.6154186320754716, + "grad_norm": 3.21875, + "learning_rate": 1.5680212659040807e-05, + "loss": 2.1138, + "step": 33400 + }, + { + "epoch": 0.6157871462264151, + "grad_norm": 3.09375, + "learning_rate": 1.5675447602647548e-05, + "loss": 2.1138, + "step": 33420 + }, + { + "epoch": 0.6161556603773585, + "grad_norm": 3.59375, + "learning_rate": 1.5670680644526117e-05, + "loss": 2.0779, + "step": 33440 + }, + { + "epoch": 0.6165241745283019, + "grad_norm": 3.21875, + "learning_rate": 1.566591178627383e-05, + "loss": 2.1267, + "step": 33460 + }, + { + "epoch": 0.6168926886792453, + "grad_norm": 3.484375, + "learning_rate": 1.5661141029488636e-05, + "loss": 2.0754, + "step": 33480 + }, + { + "epoch": 0.6172612028301887, + "grad_norm": 3.21875, + "learning_rate": 1.565636837576912e-05, + "loss": 2.1245, + "step": 33500 + }, + { + "epoch": 0.6176297169811321, + "grad_norm": 3.078125, + "learning_rate": 1.5651593826714492e-05, + "loss": 2.1274, + "step": 33520 + }, + { + "epoch": 0.6179982311320755, + "grad_norm": 3.6875, + "learning_rate": 1.5646817383924623e-05, + "loss": 2.1112, + "step": 33540 + }, + { + "epoch": 0.6183667452830188, + "grad_norm": 3.25, + "learning_rate": 1.5642039048999992e-05, + "loss": 2.0871, + "step": 33560 + }, + { + "epoch": 0.6187352594339622, + "grad_norm": 3.265625, + "learning_rate": 1.5637258823541727e-05, + "loss": 2.1051, + "step": 33580 + }, + { + "epoch": 0.6191037735849056, + "grad_norm": 3.578125, + "learning_rate": 1.563247670915158e-05, + "loss": 2.1105, + "step": 33600 + }, + { + "epoch": 0.6194722877358491, + "grad_norm": 3.296875, + "learning_rate": 1.562769270743195e-05, + "loss": 2.1106, + "step": 33620 + }, + { + "epoch": 0.6198408018867925, + "grad_norm": 3.140625, + "learning_rate": 1.5622906819985853e-05, + "loss": 2.0923, + "step": 33640 + }, + { + "epoch": 0.6202093160377359, + "grad_norm": 3.34375, + "learning_rate": 1.5618119048416946e-05, + "loss": 2.0838, + "step": 33660 + }, + { + "epoch": 0.6205778301886793, + "grad_norm": 3.125, + "learning_rate": 1.561332939432951e-05, + "loss": 2.0986, + "step": 33680 + }, + { + "epoch": 0.6209463443396226, + "grad_norm": 3.640625, + "learning_rate": 1.5608537859328473e-05, + "loss": 2.112, + "step": 33700 + }, + { + "epoch": 0.621314858490566, + "grad_norm": 3.3125, + "learning_rate": 1.5603744445019372e-05, + "loss": 2.082, + "step": 33720 + }, + { + "epoch": 0.6216833726415094, + "grad_norm": 3.15625, + "learning_rate": 1.5598949153008385e-05, + "loss": 2.0741, + "step": 33740 + }, + { + "epoch": 0.6220518867924528, + "grad_norm": 3.21875, + "learning_rate": 1.5594151984902318e-05, + "loss": 2.0891, + "step": 33760 + }, + { + "epoch": 0.6224204009433962, + "grad_norm": 3.546875, + "learning_rate": 1.5589352942308608e-05, + "loss": 2.0967, + "step": 33780 + }, + { + "epoch": 0.6227889150943396, + "grad_norm": 3.4375, + "learning_rate": 1.5584552026835317e-05, + "loss": 2.0972, + "step": 33800 + }, + { + "epoch": 0.6231574292452831, + "grad_norm": 3.3125, + "learning_rate": 1.557974924009113e-05, + "loss": 2.1074, + "step": 33820 + }, + { + "epoch": 0.6235259433962265, + "grad_norm": 3.109375, + "learning_rate": 1.5574944583685375e-05, + "loss": 2.0972, + "step": 33840 + }, + { + "epoch": 0.6238944575471698, + "grad_norm": 3.4375, + "learning_rate": 1.5570138059227985e-05, + "loss": 2.0761, + "step": 33860 + }, + { + "epoch": 0.6242629716981132, + "grad_norm": 3.40625, + "learning_rate": 1.5565329668329532e-05, + "loss": 2.1143, + "step": 33880 + }, + { + "epoch": 0.6246314858490566, + "grad_norm": 3.328125, + "learning_rate": 1.556051941260121e-05, + "loss": 2.0916, + "step": 33900 + }, + { + "epoch": 0.625, + "grad_norm": 3.3125, + "learning_rate": 1.5555707293654844e-05, + "loss": 2.1213, + "step": 33920 + }, + { + "epoch": 0.6253685141509434, + "grad_norm": 3.203125, + "learning_rate": 1.5550893313102874e-05, + "loss": 2.1046, + "step": 33940 + }, + { + "epoch": 0.6257370283018868, + "grad_norm": 3.96875, + "learning_rate": 1.554607747255837e-05, + "loss": 2.1199, + "step": 33960 + }, + { + "epoch": 0.6261055424528302, + "grad_norm": 3.375, + "learning_rate": 1.5541259773635023e-05, + "loss": 2.0947, + "step": 33980 + }, + { + "epoch": 0.6264740566037735, + "grad_norm": 3.21875, + "learning_rate": 1.5536440217947143e-05, + "loss": 2.1181, + "step": 34000 + }, + { + "epoch": 0.6268425707547169, + "grad_norm": 3.453125, + "learning_rate": 1.553161880710967e-05, + "loss": 2.0968, + "step": 34020 + }, + { + "epoch": 0.6272110849056604, + "grad_norm": 3.75, + "learning_rate": 1.552679554273816e-05, + "loss": 2.097, + "step": 34040 + }, + { + "epoch": 0.6275795990566038, + "grad_norm": 3.265625, + "learning_rate": 1.5521970426448793e-05, + "loss": 2.097, + "step": 34060 + }, + { + "epoch": 0.6279481132075472, + "grad_norm": 3.296875, + "learning_rate": 1.551714345985837e-05, + "loss": 2.1032, + "step": 34080 + }, + { + "epoch": 0.6283166273584906, + "grad_norm": 3.109375, + "learning_rate": 1.5512314644584306e-05, + "loss": 2.1243, + "step": 34100 + }, + { + "epoch": 0.628685141509434, + "grad_norm": 3.765625, + "learning_rate": 1.550748398224464e-05, + "loss": 2.1123, + "step": 34120 + }, + { + "epoch": 0.6290536556603774, + "grad_norm": 3.3125, + "learning_rate": 1.5502651474458037e-05, + "loss": 2.1147, + "step": 34140 + }, + { + "epoch": 0.6294221698113207, + "grad_norm": 3.390625, + "learning_rate": 1.549781712284376e-05, + "loss": 2.1101, + "step": 34160 + }, + { + "epoch": 0.6297906839622641, + "grad_norm": 3.3125, + "learning_rate": 1.5492980929021716e-05, + "loss": 2.1171, + "step": 34180 + }, + { + "epoch": 0.6301591981132075, + "grad_norm": 3.671875, + "learning_rate": 1.5488142894612407e-05, + "loss": 2.1478, + "step": 34200 + }, + { + "epoch": 0.6305277122641509, + "grad_norm": 3.125, + "learning_rate": 1.5483303021236965e-05, + "loss": 2.1263, + "step": 34220 + }, + { + "epoch": 0.6308962264150944, + "grad_norm": 3.546875, + "learning_rate": 1.547846131051713e-05, + "loss": 2.1086, + "step": 34240 + }, + { + "epoch": 0.6312647405660378, + "grad_norm": 3.25, + "learning_rate": 1.547361776407527e-05, + "loss": 2.0889, + "step": 34260 + }, + { + "epoch": 0.6316332547169812, + "grad_norm": 3.46875, + "learning_rate": 1.546877238353435e-05, + "loss": 2.1358, + "step": 34280 + }, + { + "epoch": 0.6320017688679245, + "grad_norm": 3.5625, + "learning_rate": 1.546392517051797e-05, + "loss": 2.1093, + "step": 34300 + }, + { + "epoch": 0.6323702830188679, + "grad_norm": 3.140625, + "learning_rate": 1.5459076126650325e-05, + "loss": 2.116, + "step": 34320 + }, + { + "epoch": 0.6327387971698113, + "grad_norm": 3.390625, + "learning_rate": 1.5454225253556233e-05, + "loss": 2.1398, + "step": 34340 + }, + { + "epoch": 0.6331073113207547, + "grad_norm": 3.28125, + "learning_rate": 1.5449372552861124e-05, + "loss": 2.0971, + "step": 34360 + }, + { + "epoch": 0.6334758254716981, + "grad_norm": 3.734375, + "learning_rate": 1.5444518026191045e-05, + "loss": 2.0697, + "step": 34380 + }, + { + "epoch": 0.6338443396226415, + "grad_norm": 3.296875, + "learning_rate": 1.543966167517265e-05, + "loss": 2.1125, + "step": 34400 + }, + { + "epoch": 0.6342128537735849, + "grad_norm": 3.265625, + "learning_rate": 1.5434803501433196e-05, + "loss": 2.1267, + "step": 34420 + }, + { + "epoch": 0.6345813679245284, + "grad_norm": 3.484375, + "learning_rate": 1.5429943506600564e-05, + "loss": 2.1156, + "step": 34440 + }, + { + "epoch": 0.6349498820754716, + "grad_norm": 3.34375, + "learning_rate": 1.5425081692303244e-05, + "loss": 2.1481, + "step": 34460 + }, + { + "epoch": 0.6353183962264151, + "grad_norm": 4.3125, + "learning_rate": 1.5420218060170328e-05, + "loss": 2.0679, + "step": 34480 + }, + { + "epoch": 0.6356869103773585, + "grad_norm": 3.375, + "learning_rate": 1.541535261183152e-05, + "loss": 2.1112, + "step": 34500 + }, + { + "epoch": 0.6360554245283019, + "grad_norm": 3.75, + "learning_rate": 1.5410485348917137e-05, + "loss": 2.0833, + "step": 34520 + }, + { + "epoch": 0.6364239386792453, + "grad_norm": 3.328125, + "learning_rate": 1.54056162730581e-05, + "loss": 2.1314, + "step": 34540 + }, + { + "epoch": 0.6367924528301887, + "grad_norm": 3.6875, + "learning_rate": 1.5400745385885937e-05, + "loss": 2.1177, + "step": 34560 + }, + { + "epoch": 0.6371609669811321, + "grad_norm": 3.921875, + "learning_rate": 1.5395872689032784e-05, + "loss": 2.085, + "step": 34580 + }, + { + "epoch": 0.6375294811320755, + "grad_norm": 2.9375, + "learning_rate": 1.5390998184131387e-05, + "loss": 2.0938, + "step": 34600 + }, + { + "epoch": 0.6378979952830188, + "grad_norm": 3.484375, + "learning_rate": 1.5386121872815084e-05, + "loss": 2.0917, + "step": 34620 + }, + { + "epoch": 0.6382665094339622, + "grad_norm": 3.25, + "learning_rate": 1.5381243756717844e-05, + "loss": 2.1288, + "step": 34640 + }, + { + "epoch": 0.6386350235849056, + "grad_norm": 3.21875, + "learning_rate": 1.5376363837474214e-05, + "loss": 2.0898, + "step": 34660 + }, + { + "epoch": 0.6390035377358491, + "grad_norm": 3.5, + "learning_rate": 1.5371482116719366e-05, + "loss": 2.0933, + "step": 34680 + }, + { + "epoch": 0.6393720518867925, + "grad_norm": 3.171875, + "learning_rate": 1.5366598596089056e-05, + "loss": 2.127, + "step": 34700 + }, + { + "epoch": 0.6397405660377359, + "grad_norm": 3.5, + "learning_rate": 1.536171327721966e-05, + "loss": 2.0661, + "step": 34720 + }, + { + "epoch": 0.6401090801886793, + "grad_norm": 3.546875, + "learning_rate": 1.535682616174815e-05, + "loss": 2.1255, + "step": 34740 + }, + { + "epoch": 0.6404775943396226, + "grad_norm": 3.53125, + "learning_rate": 1.53519372513121e-05, + "loss": 2.0836, + "step": 34760 + }, + { + "epoch": 0.640846108490566, + "grad_norm": 3.40625, + "learning_rate": 1.5347046547549676e-05, + "loss": 2.0905, + "step": 34780 + }, + { + "epoch": 0.6412146226415094, + "grad_norm": 3.15625, + "learning_rate": 1.5342154052099666e-05, + "loss": 2.0995, + "step": 34800 + }, + { + "epoch": 0.6415831367924528, + "grad_norm": 3.234375, + "learning_rate": 1.5337259766601444e-05, + "loss": 2.1153, + "step": 34820 + }, + { + "epoch": 0.6419516509433962, + "grad_norm": 3.421875, + "learning_rate": 1.5332363692694984e-05, + "loss": 2.1164, + "step": 34840 + }, + { + "epoch": 0.6423201650943396, + "grad_norm": 3.1875, + "learning_rate": 1.5327465832020863e-05, + "loss": 2.1025, + "step": 34860 + }, + { + "epoch": 0.6426886792452831, + "grad_norm": 3.359375, + "learning_rate": 1.5322566186220253e-05, + "loss": 2.1183, + "step": 34880 + }, + { + "epoch": 0.6430571933962265, + "grad_norm": 3.3125, + "learning_rate": 1.531766475693493e-05, + "loss": 2.0804, + "step": 34900 + }, + { + "epoch": 0.6434257075471698, + "grad_norm": 2.984375, + "learning_rate": 1.5312761545807262e-05, + "loss": 2.1158, + "step": 34920 + }, + { + "epoch": 0.6437942216981132, + "grad_norm": 3.234375, + "learning_rate": 1.530785655448022e-05, + "loss": 2.0885, + "step": 34940 + }, + { + "epoch": 0.6441627358490566, + "grad_norm": 3.359375, + "learning_rate": 1.5302949784597362e-05, + "loss": 2.1016, + "step": 34960 + }, + { + "epoch": 0.64453125, + "grad_norm": 3.265625, + "learning_rate": 1.5298041237802857e-05, + "loss": 2.1238, + "step": 34980 + }, + { + "epoch": 0.6448997641509434, + "grad_norm": 3.453125, + "learning_rate": 1.529313091574145e-05, + "loss": 2.0982, + "step": 35000 + }, + { + "epoch": 0.6452682783018868, + "grad_norm": 3.5, + "learning_rate": 1.528821882005849e-05, + "loss": 2.073, + "step": 35020 + }, + { + "epoch": 0.6456367924528302, + "grad_norm": 3.09375, + "learning_rate": 1.5283304952399937e-05, + "loss": 2.0919, + "step": 35040 + }, + { + "epoch": 0.6460053066037735, + "grad_norm": 4.03125, + "learning_rate": 1.527838931441231e-05, + "loss": 2.1388, + "step": 35060 + }, + { + "epoch": 0.6463738207547169, + "grad_norm": 3.375, + "learning_rate": 1.5273471907742752e-05, + "loss": 2.0894, + "step": 35080 + }, + { + "epoch": 0.6467423349056604, + "grad_norm": 3.359375, + "learning_rate": 1.5268552734038983e-05, + "loss": 2.1167, + "step": 35100 + }, + { + "epoch": 0.6471108490566038, + "grad_norm": 3.515625, + "learning_rate": 1.5263631794949317e-05, + "loss": 2.1077, + "step": 35120 + }, + { + "epoch": 0.6474793632075472, + "grad_norm": 3.421875, + "learning_rate": 1.5258709092122668e-05, + "loss": 2.1183, + "step": 35140 + }, + { + "epoch": 0.6478478773584906, + "grad_norm": 3.265625, + "learning_rate": 1.5253784627208532e-05, + "loss": 2.0784, + "step": 35160 + }, + { + "epoch": 0.648216391509434, + "grad_norm": 3.5625, + "learning_rate": 1.5248858401856992e-05, + "loss": 2.1205, + "step": 35180 + }, + { + "epoch": 0.6485849056603774, + "grad_norm": 3.78125, + "learning_rate": 1.5243930417718735e-05, + "loss": 2.1164, + "step": 35200 + }, + { + "epoch": 0.6489534198113207, + "grad_norm": 3.59375, + "learning_rate": 1.5239000676445024e-05, + "loss": 2.0973, + "step": 35220 + }, + { + "epoch": 0.6493219339622641, + "grad_norm": 4.03125, + "learning_rate": 1.5234069179687718e-05, + "loss": 2.0572, + "step": 35240 + }, + { + "epoch": 0.6496904481132075, + "grad_norm": 3.5625, + "learning_rate": 1.5229135929099263e-05, + "loss": 2.123, + "step": 35260 + }, + { + "epoch": 0.6500589622641509, + "grad_norm": 3.296875, + "learning_rate": 1.5224200926332691e-05, + "loss": 2.1165, + "step": 35280 + }, + { + "epoch": 0.6504274764150944, + "grad_norm": 3.640625, + "learning_rate": 1.5219264173041623e-05, + "loss": 2.1117, + "step": 35300 + }, + { + "epoch": 0.6507959905660378, + "grad_norm": 3.609375, + "learning_rate": 1.5214325670880263e-05, + "loss": 2.119, + "step": 35320 + }, + { + "epoch": 0.6511645047169812, + "grad_norm": 3.390625, + "learning_rate": 1.520938542150341e-05, + "loss": 2.1614, + "step": 35340 + }, + { + "epoch": 0.6515330188679245, + "grad_norm": 3.078125, + "learning_rate": 1.5204443426566428e-05, + "loss": 2.0537, + "step": 35360 + }, + { + "epoch": 0.6519015330188679, + "grad_norm": 3.515625, + "learning_rate": 1.5199499687725296e-05, + "loss": 2.0929, + "step": 35380 + }, + { + "epoch": 0.6522700471698113, + "grad_norm": 3.53125, + "learning_rate": 1.5194554206636551e-05, + "loss": 2.0968, + "step": 35400 + }, + { + "epoch": 0.6526385613207547, + "grad_norm": 3.40625, + "learning_rate": 1.5189606984957332e-05, + "loss": 2.0787, + "step": 35420 + }, + { + "epoch": 0.6530070754716981, + "grad_norm": 3.546875, + "learning_rate": 1.5184658024345345e-05, + "loss": 2.1348, + "step": 35440 + }, + { + "epoch": 0.6533755896226415, + "grad_norm": 3.34375, + "learning_rate": 1.517970732645889e-05, + "loss": 2.1166, + "step": 35460 + }, + { + "epoch": 0.6537441037735849, + "grad_norm": 3.25, + "learning_rate": 1.5174754892956851e-05, + "loss": 2.1258, + "step": 35480 + }, + { + "epoch": 0.6541126179245284, + "grad_norm": 3.546875, + "learning_rate": 1.5169800725498683e-05, + "loss": 2.0838, + "step": 35500 + }, + { + "epoch": 0.6544811320754716, + "grad_norm": 3.578125, + "learning_rate": 1.5164844825744429e-05, + "loss": 2.0714, + "step": 35520 + }, + { + "epoch": 0.6548496462264151, + "grad_norm": 3.5, + "learning_rate": 1.5159887195354712e-05, + "loss": 2.127, + "step": 35540 + }, + { + "epoch": 0.6552181603773585, + "grad_norm": 3.359375, + "learning_rate": 1.5154927835990734e-05, + "loss": 2.1199, + "step": 35560 + }, + { + "epoch": 0.6555866745283019, + "grad_norm": 3.28125, + "learning_rate": 1.5149966749314277e-05, + "loss": 2.1091, + "step": 35580 + }, + { + "epoch": 0.6559551886792453, + "grad_norm": 3.4375, + "learning_rate": 1.51450039369877e-05, + "loss": 2.1034, + "step": 35600 + }, + { + "epoch": 0.6563237028301887, + "grad_norm": 3.109375, + "learning_rate": 1.514003940067394e-05, + "loss": 2.1349, + "step": 35620 + }, + { + "epoch": 0.6566922169811321, + "grad_norm": 3.84375, + "learning_rate": 1.5135073142036518e-05, + "loss": 2.1192, + "step": 35640 + }, + { + "epoch": 0.6570607311320755, + "grad_norm": 3.375, + "learning_rate": 1.5130105162739522e-05, + "loss": 2.0836, + "step": 35660 + }, + { + "epoch": 0.6574292452830188, + "grad_norm": 3.875, + "learning_rate": 1.5125135464447624e-05, + "loss": 2.1, + "step": 35680 + }, + { + "epoch": 0.6577977594339622, + "grad_norm": 3.171875, + "learning_rate": 1.5120164048826072e-05, + "loss": 2.1112, + "step": 35700 + }, + { + "epoch": 0.6581662735849056, + "grad_norm": 3.75, + "learning_rate": 1.5115190917540683e-05, + "loss": 2.1084, + "step": 35720 + }, + { + "epoch": 0.6585347877358491, + "grad_norm": 3.4375, + "learning_rate": 1.5110216072257855e-05, + "loss": 2.129, + "step": 35740 + }, + { + "epoch": 0.6589033018867925, + "grad_norm": 3.71875, + "learning_rate": 1.5105239514644558e-05, + "loss": 2.1222, + "step": 35760 + }, + { + "epoch": 0.6592718160377359, + "grad_norm": 3.625, + "learning_rate": 1.5100261246368337e-05, + "loss": 2.0847, + "step": 35780 + }, + { + "epoch": 0.6596403301886793, + "grad_norm": 3.390625, + "learning_rate": 1.5095281269097313e-05, + "loss": 2.0854, + "step": 35800 + }, + { + "epoch": 0.6600088443396226, + "grad_norm": 3.3125, + "learning_rate": 1.5090299584500168e-05, + "loss": 2.1134, + "step": 35820 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 3.578125, + "learning_rate": 1.5085316194246171e-05, + "loss": 2.0991, + "step": 35840 + }, + { + "epoch": 0.6607458726415094, + "grad_norm": 3.1875, + "learning_rate": 1.5080331100005152e-05, + "loss": 2.0939, + "step": 35860 + }, + { + "epoch": 0.6611143867924528, + "grad_norm": 3.65625, + "learning_rate": 1.5075344303447517e-05, + "loss": 2.0985, + "step": 35880 + }, + { + "epoch": 0.6614829009433962, + "grad_norm": 3.015625, + "learning_rate": 1.5070355806244239e-05, + "loss": 2.0825, + "step": 35900 + }, + { + "epoch": 0.6618514150943396, + "grad_norm": 3.328125, + "learning_rate": 1.5065365610066867e-05, + "loss": 2.0984, + "step": 35920 + }, + { + "epoch": 0.6622199292452831, + "grad_norm": 3.390625, + "learning_rate": 1.506037371658751e-05, + "loss": 2.1199, + "step": 35940 + }, + { + "epoch": 0.6625884433962265, + "grad_norm": 3.546875, + "learning_rate": 1.5055380127478859e-05, + "loss": 2.0801, + "step": 35960 + }, + { + "epoch": 0.6629569575471698, + "grad_norm": 3.3125, + "learning_rate": 1.5050384844414157e-05, + "loss": 2.1015, + "step": 35980 + }, + { + "epoch": 0.6633254716981132, + "grad_norm": 3.53125, + "learning_rate": 1.5045387869067225e-05, + "loss": 2.07, + "step": 36000 + }, + { + "epoch": 0.6636939858490566, + "grad_norm": 3.453125, + "learning_rate": 1.504038920311245e-05, + "loss": 2.0846, + "step": 36020 + }, + { + "epoch": 0.6640625, + "grad_norm": 3.8125, + "learning_rate": 1.5035388848224786e-05, + "loss": 2.1028, + "step": 36040 + }, + { + "epoch": 0.6644310141509434, + "grad_norm": 3.34375, + "learning_rate": 1.5030386806079747e-05, + "loss": 2.0821, + "step": 36060 + }, + { + "epoch": 0.6647995283018868, + "grad_norm": 3.5625, + "learning_rate": 1.5025383078353421e-05, + "loss": 2.0867, + "step": 36080 + }, + { + "epoch": 0.6651680424528302, + "grad_norm": 3.875, + "learning_rate": 1.502037766672245e-05, + "loss": 2.1109, + "step": 36100 + }, + { + "epoch": 0.6655365566037735, + "grad_norm": 3.21875, + "learning_rate": 1.5015370572864054e-05, + "loss": 2.0823, + "step": 36120 + }, + { + "epoch": 0.6659050707547169, + "grad_norm": 3.34375, + "learning_rate": 1.5010361798456004e-05, + "loss": 2.1172, + "step": 36140 + }, + { + "epoch": 0.6662735849056604, + "grad_norm": 3.40625, + "learning_rate": 1.5005351345176644e-05, + "loss": 2.0979, + "step": 36160 + }, + { + "epoch": 0.6666420990566038, + "grad_norm": 3.171875, + "learning_rate": 1.500033921470487e-05, + "loss": 2.1038, + "step": 36180 + }, + { + "epoch": 0.6670106132075472, + "grad_norm": 4.125, + "learning_rate": 1.4995325408720152e-05, + "loss": 2.1081, + "step": 36200 + }, + { + "epoch": 0.6673791273584906, + "grad_norm": 3.625, + "learning_rate": 1.4990309928902511e-05, + "loss": 2.0902, + "step": 36220 + }, + { + "epoch": 0.667747641509434, + "grad_norm": 3.453125, + "learning_rate": 1.4985292776932535e-05, + "loss": 2.0761, + "step": 36240 + }, + { + "epoch": 0.6681161556603774, + "grad_norm": 3.5, + "learning_rate": 1.4980273954491371e-05, + "loss": 2.0727, + "step": 36260 + }, + { + "epoch": 0.6684846698113207, + "grad_norm": 3.34375, + "learning_rate": 1.4975253463260723e-05, + "loss": 2.1055, + "step": 36280 + }, + { + "epoch": 0.6688531839622641, + "grad_norm": 3.65625, + "learning_rate": 1.497023130492286e-05, + "loss": 2.1052, + "step": 36300 + }, + { + "epoch": 0.6692216981132075, + "grad_norm": 3.71875, + "learning_rate": 1.4965207481160604e-05, + "loss": 2.0807, + "step": 36320 + }, + { + "epoch": 0.6695902122641509, + "grad_norm": 3.484375, + "learning_rate": 1.4960181993657336e-05, + "loss": 2.1427, + "step": 36340 + }, + { + "epoch": 0.6699587264150944, + "grad_norm": 3.65625, + "learning_rate": 1.4955154844097e-05, + "loss": 2.0833, + "step": 36360 + }, + { + "epoch": 0.6703272405660378, + "grad_norm": 3.15625, + "learning_rate": 1.495012603416409e-05, + "loss": 2.0746, + "step": 36380 + }, + { + "epoch": 0.6706957547169812, + "grad_norm": 3.421875, + "learning_rate": 1.4945095565543655e-05, + "loss": 2.0818, + "step": 36400 + }, + { + "epoch": 0.6710642688679245, + "grad_norm": 3.34375, + "learning_rate": 1.4940063439921306e-05, + "loss": 2.0967, + "step": 36420 + }, + { + "epoch": 0.6714327830188679, + "grad_norm": 3.34375, + "learning_rate": 1.4935029658983213e-05, + "loss": 2.1237, + "step": 36440 + }, + { + "epoch": 0.6718012971698113, + "grad_norm": 3.328125, + "learning_rate": 1.4929994224416088e-05, + "loss": 2.0839, + "step": 36460 + }, + { + "epoch": 0.6721698113207547, + "grad_norm": 3.265625, + "learning_rate": 1.4924957137907207e-05, + "loss": 2.1374, + "step": 36480 + }, + { + "epoch": 0.6725383254716981, + "grad_norm": 3.34375, + "learning_rate": 1.4919918401144394e-05, + "loss": 2.1441, + "step": 36500 + }, + { + "epoch": 0.6729068396226415, + "grad_norm": 3.125, + "learning_rate": 1.4914878015816028e-05, + "loss": 2.0652, + "step": 36520 + }, + { + "epoch": 0.6732753537735849, + "grad_norm": 3.359375, + "learning_rate": 1.490983598361104e-05, + "loss": 2.0931, + "step": 36540 + }, + { + "epoch": 0.6736438679245284, + "grad_norm": 2.921875, + "learning_rate": 1.4904792306218919e-05, + "loss": 2.0941, + "step": 36560 + }, + { + "epoch": 0.6740123820754716, + "grad_norm": 3.765625, + "learning_rate": 1.489974698532969e-05, + "loss": 2.0943, + "step": 36580 + }, + { + "epoch": 0.6743808962264151, + "grad_norm": 3.671875, + "learning_rate": 1.4894700022633948e-05, + "loss": 2.119, + "step": 36600 + }, + { + "epoch": 0.6747494103773585, + "grad_norm": 3.1875, + "learning_rate": 1.4889651419822824e-05, + "loss": 2.1263, + "step": 36620 + }, + { + "epoch": 0.6751179245283019, + "grad_norm": 3.296875, + "learning_rate": 1.4884601178588002e-05, + "loss": 2.1175, + "step": 36640 + }, + { + "epoch": 0.6754864386792453, + "grad_norm": 3.28125, + "learning_rate": 1.4879549300621721e-05, + "loss": 2.0747, + "step": 36660 + }, + { + "epoch": 0.6758549528301887, + "grad_norm": 3.515625, + "learning_rate": 1.4874495787616758e-05, + "loss": 2.1207, + "step": 36680 + }, + { + "epoch": 0.6762234669811321, + "grad_norm": 3.609375, + "learning_rate": 1.4869440641266448e-05, + "loss": 2.107, + "step": 36700 + }, + { + "epoch": 0.6765919811320755, + "grad_norm": 3.78125, + "learning_rate": 1.4864383863264661e-05, + "loss": 2.0793, + "step": 36720 + }, + { + "epoch": 0.6769604952830188, + "grad_norm": 3.484375, + "learning_rate": 1.4859325455305834e-05, + "loss": 2.1108, + "step": 36740 + }, + { + "epoch": 0.6773290094339622, + "grad_norm": 3.34375, + "learning_rate": 1.4854265419084928e-05, + "loss": 2.0975, + "step": 36760 + }, + { + "epoch": 0.6776975235849056, + "grad_norm": 3.40625, + "learning_rate": 1.4849203756297458e-05, + "loss": 2.0622, + "step": 36780 + }, + { + "epoch": 0.6780660377358491, + "grad_norm": 3.390625, + "learning_rate": 1.4844140468639494e-05, + "loss": 2.0965, + "step": 36800 + }, + { + "epoch": 0.6784345518867925, + "grad_norm": 3.4375, + "learning_rate": 1.4839075557807632e-05, + "loss": 2.0836, + "step": 36820 + }, + { + "epoch": 0.6788030660377359, + "grad_norm": 3.203125, + "learning_rate": 1.483400902549903e-05, + "loss": 2.1022, + "step": 36840 + }, + { + "epoch": 0.6791715801886793, + "grad_norm": 3.375, + "learning_rate": 1.4828940873411375e-05, + "loss": 2.1058, + "step": 36860 + }, + { + "epoch": 0.6795400943396226, + "grad_norm": 3.078125, + "learning_rate": 1.4823871103242909e-05, + "loss": 2.1112, + "step": 36880 + }, + { + "epoch": 0.679908608490566, + "grad_norm": 3.734375, + "learning_rate": 1.4818799716692398e-05, + "loss": 2.0981, + "step": 36900 + }, + { + "epoch": 0.6802771226415094, + "grad_norm": 3.140625, + "learning_rate": 1.4813726715459175e-05, + "loss": 2.0928, + "step": 36920 + }, + { + "epoch": 0.6806456367924528, + "grad_norm": 3.71875, + "learning_rate": 1.4808652101243093e-05, + "loss": 2.0995, + "step": 36940 + }, + { + "epoch": 0.6810141509433962, + "grad_norm": 3.640625, + "learning_rate": 1.4803575875744555e-05, + "loss": 2.1109, + "step": 36960 + }, + { + "epoch": 0.6813826650943396, + "grad_norm": 3.6875, + "learning_rate": 1.4798498040664502e-05, + "loss": 2.1201, + "step": 36980 + }, + { + "epoch": 0.6817511792452831, + "grad_norm": 3.28125, + "learning_rate": 1.4793418597704414e-05, + "loss": 2.1435, + "step": 37000 + }, + { + "epoch": 0.6821196933962265, + "grad_norm": 3.78125, + "learning_rate": 1.4788337548566309e-05, + "loss": 2.1398, + "step": 37020 + }, + { + "epoch": 0.6824882075471698, + "grad_norm": 3.171875, + "learning_rate": 1.4783254894952746e-05, + "loss": 2.0525, + "step": 37040 + }, + { + "epoch": 0.6828567216981132, + "grad_norm": 3.828125, + "learning_rate": 1.477817063856682e-05, + "loss": 2.1254, + "step": 37060 + }, + { + "epoch": 0.6832252358490566, + "grad_norm": 3.1875, + "learning_rate": 1.477308478111216e-05, + "loss": 2.1092, + "step": 37080 + }, + { + "epoch": 0.68359375, + "grad_norm": 3.21875, + "learning_rate": 1.4767997324292935e-05, + "loss": 2.1244, + "step": 37100 + }, + { + "epoch": 0.6839622641509434, + "grad_norm": 3.59375, + "learning_rate": 1.4762908269813852e-05, + "loss": 2.1023, + "step": 37120 + }, + { + "epoch": 0.6843307783018868, + "grad_norm": 3.25, + "learning_rate": 1.475781761938015e-05, + "loss": 2.0715, + "step": 37140 + }, + { + "epoch": 0.6846992924528302, + "grad_norm": 3.359375, + "learning_rate": 1.4752725374697601e-05, + "loss": 2.1126, + "step": 37160 + }, + { + "epoch": 0.6850678066037735, + "grad_norm": 3.734375, + "learning_rate": 1.4747631537472518e-05, + "loss": 2.0691, + "step": 37180 + }, + { + "epoch": 0.6854363207547169, + "grad_norm": 3.53125, + "learning_rate": 1.4742536109411739e-05, + "loss": 2.0857, + "step": 37200 + }, + { + "epoch": 0.6858048349056604, + "grad_norm": 3.375, + "learning_rate": 1.473743909222264e-05, + "loss": 2.1071, + "step": 37220 + }, + { + "epoch": 0.6861733490566038, + "grad_norm": 3.515625, + "learning_rate": 1.4732340487613128e-05, + "loss": 2.0655, + "step": 37240 + }, + { + "epoch": 0.6865418632075472, + "grad_norm": 3.21875, + "learning_rate": 1.4727240297291647e-05, + "loss": 2.1294, + "step": 37260 + }, + { + "epoch": 0.6869103773584906, + "grad_norm": 3.484375, + "learning_rate": 1.4722138522967164e-05, + "loss": 2.1172, + "step": 37280 + }, + { + "epoch": 0.687278891509434, + "grad_norm": 3.390625, + "learning_rate": 1.4717035166349182e-05, + "loss": 2.0924, + "step": 37300 + }, + { + "epoch": 0.6876474056603774, + "grad_norm": 3.234375, + "learning_rate": 1.4711930229147734e-05, + "loss": 2.096, + "step": 37320 + }, + { + "epoch": 0.6880159198113207, + "grad_norm": 3.40625, + "learning_rate": 1.4706823713073381e-05, + "loss": 2.0756, + "step": 37340 + }, + { + "epoch": 0.6883844339622641, + "grad_norm": 3.53125, + "learning_rate": 1.4701715619837216e-05, + "loss": 2.0828, + "step": 37360 + }, + { + "epoch": 0.6887529481132075, + "grad_norm": 3.625, + "learning_rate": 1.469660595115085e-05, + "loss": 2.1118, + "step": 37380 + }, + { + "epoch": 0.6891214622641509, + "grad_norm": 3.390625, + "learning_rate": 1.4691494708726441e-05, + "loss": 2.1331, + "step": 37400 + }, + { + "epoch": 0.6894899764150944, + "grad_norm": 3.9375, + "learning_rate": 1.4686381894276654e-05, + "loss": 2.1244, + "step": 37420 + }, + { + "epoch": 0.6898584905660378, + "grad_norm": 3.28125, + "learning_rate": 1.4681267509514695e-05, + "loss": 2.0784, + "step": 37440 + }, + { + "epoch": 0.6902270047169812, + "grad_norm": 3.6875, + "learning_rate": 1.467615155615429e-05, + "loss": 2.1134, + "step": 37460 + }, + { + "epoch": 0.6905955188679245, + "grad_norm": 3.265625, + "learning_rate": 1.4671034035909693e-05, + "loss": 2.1015, + "step": 37480 + }, + { + "epoch": 0.6909640330188679, + "grad_norm": 3.328125, + "learning_rate": 1.4665914950495682e-05, + "loss": 2.0944, + "step": 37500 + }, + { + "epoch": 0.6913325471698113, + "grad_norm": 3.4375, + "learning_rate": 1.4660794301627559e-05, + "loss": 2.126, + "step": 37520 + }, + { + "epoch": 0.6917010613207547, + "grad_norm": 3.5625, + "learning_rate": 1.4655672091021149e-05, + "loss": 2.0863, + "step": 37540 + }, + { + "epoch": 0.6920695754716981, + "grad_norm": 3.359375, + "learning_rate": 1.4650548320392802e-05, + "loss": 2.0932, + "step": 37560 + }, + { + "epoch": 0.6924380896226415, + "grad_norm": 3.171875, + "learning_rate": 1.464542299145939e-05, + "loss": 2.0946, + "step": 37580 + }, + { + "epoch": 0.6928066037735849, + "grad_norm": 3.359375, + "learning_rate": 1.4640296105938313e-05, + "loss": 2.0955, + "step": 37600 + }, + { + "epoch": 0.6931751179245284, + "grad_norm": 3.046875, + "learning_rate": 1.4635167665547478e-05, + "loss": 2.108, + "step": 37620 + }, + { + "epoch": 0.6935436320754716, + "grad_norm": 3.5625, + "learning_rate": 1.4630037672005329e-05, + "loss": 2.1076, + "step": 37640 + }, + { + "epoch": 0.6939121462264151, + "grad_norm": 3.703125, + "learning_rate": 1.462490612703082e-05, + "loss": 2.1149, + "step": 37660 + }, + { + "epoch": 0.6942806603773585, + "grad_norm": 3.53125, + "learning_rate": 1.4619773032343428e-05, + "loss": 2.0765, + "step": 37680 + }, + { + "epoch": 0.6946491745283019, + "grad_norm": 3.390625, + "learning_rate": 1.4614638389663153e-05, + "loss": 2.1238, + "step": 37700 + }, + { + "epoch": 0.6950176886792453, + "grad_norm": 3.640625, + "learning_rate": 1.4609502200710508e-05, + "loss": 2.0727, + "step": 37720 + }, + { + "epoch": 0.6953862028301887, + "grad_norm": 3.765625, + "learning_rate": 1.4604364467206526e-05, + "loss": 2.0892, + "step": 37740 + }, + { + "epoch": 0.6957547169811321, + "grad_norm": 3.390625, + "learning_rate": 1.4599225190872762e-05, + "loss": 2.1132, + "step": 37760 + }, + { + "epoch": 0.6961232311320755, + "grad_norm": 3.484375, + "learning_rate": 1.4594084373431276e-05, + "loss": 2.1027, + "step": 37780 + }, + { + "epoch": 0.6964917452830188, + "grad_norm": 3.265625, + "learning_rate": 1.4588942016604661e-05, + "loss": 2.1167, + "step": 37800 + }, + { + "epoch": 0.6968602594339622, + "grad_norm": 3.515625, + "learning_rate": 1.458379812211601e-05, + "loss": 2.0969, + "step": 37820 + }, + { + "epoch": 0.6972287735849056, + "grad_norm": 3.53125, + "learning_rate": 1.4578652691688943e-05, + "loss": 2.0904, + "step": 37840 + }, + { + "epoch": 0.6975972877358491, + "grad_norm": 3.609375, + "learning_rate": 1.4573505727047592e-05, + "loss": 2.1168, + "step": 37860 + }, + { + "epoch": 0.6979658018867925, + "grad_norm": 3.46875, + "learning_rate": 1.4568357229916594e-05, + "loss": 2.1109, + "step": 37880 + }, + { + "epoch": 0.6983343160377359, + "grad_norm": 3.21875, + "learning_rate": 1.4563207202021113e-05, + "loss": 2.0662, + "step": 37900 + }, + { + "epoch": 0.6987028301886793, + "grad_norm": 3.609375, + "learning_rate": 1.4558055645086817e-05, + "loss": 2.1272, + "step": 37920 + }, + { + "epoch": 0.6990713443396226, + "grad_norm": 3.09375, + "learning_rate": 1.455290256083989e-05, + "loss": 2.1354, + "step": 37940 + }, + { + "epoch": 0.699439858490566, + "grad_norm": 3.40625, + "learning_rate": 1.4547747951007022e-05, + "loss": 2.1058, + "step": 37960 + }, + { + "epoch": 0.6998083726415094, + "grad_norm": 3.4375, + "learning_rate": 1.4542591817315427e-05, + "loss": 2.116, + "step": 37980 + }, + { + "epoch": 0.7001768867924528, + "grad_norm": 3.734375, + "learning_rate": 1.4537434161492819e-05, + "loss": 2.0868, + "step": 38000 + }, + { + "epoch": 0.7005454009433962, + "grad_norm": 3.4375, + "learning_rate": 1.4532274985267418e-05, + "loss": 2.073, + "step": 38020 + }, + { + "epoch": 0.7009139150943396, + "grad_norm": 3.4375, + "learning_rate": 1.452711429036797e-05, + "loss": 2.1002, + "step": 38040 + }, + { + "epoch": 0.7012824292452831, + "grad_norm": 3.21875, + "learning_rate": 1.4521952078523711e-05, + "loss": 2.1144, + "step": 38060 + }, + { + "epoch": 0.7016509433962265, + "grad_norm": 3.453125, + "learning_rate": 1.4516788351464399e-05, + "loss": 2.0728, + "step": 38080 + }, + { + "epoch": 0.7020194575471698, + "grad_norm": 3.28125, + "learning_rate": 1.4511623110920297e-05, + "loss": 2.1343, + "step": 38100 + }, + { + "epoch": 0.7023879716981132, + "grad_norm": 3.609375, + "learning_rate": 1.4506456358622168e-05, + "loss": 2.1142, + "step": 38120 + }, + { + "epoch": 0.7027564858490566, + "grad_norm": 3.34375, + "learning_rate": 1.450128809630129e-05, + "loss": 2.119, + "step": 38140 + }, + { + "epoch": 0.703125, + "grad_norm": 3.546875, + "learning_rate": 1.4496118325689441e-05, + "loss": 2.13, + "step": 38160 + }, + { + "epoch": 0.7034935141509434, + "grad_norm": 3.21875, + "learning_rate": 1.4490947048518907e-05, + "loss": 2.0826, + "step": 38180 + }, + { + "epoch": 0.7038620283018868, + "grad_norm": 3.125, + "learning_rate": 1.4485774266522483e-05, + "loss": 2.0798, + "step": 38200 + }, + { + "epoch": 0.7042305424528302, + "grad_norm": 3.921875, + "learning_rate": 1.4480599981433456e-05, + "loss": 2.0837, + "step": 38220 + }, + { + "epoch": 0.7045990566037735, + "grad_norm": 3.171875, + "learning_rate": 1.4475424194985639e-05, + "loss": 2.1153, + "step": 38240 + }, + { + "epoch": 0.7049675707547169, + "grad_norm": 3.21875, + "learning_rate": 1.4470246908913319e-05, + "loss": 2.1019, + "step": 38260 + }, + { + "epoch": 0.7053360849056604, + "grad_norm": 3.9375, + "learning_rate": 1.4465068124951308e-05, + "loss": 2.0902, + "step": 38280 + }, + { + "epoch": 0.7057045990566038, + "grad_norm": 3.28125, + "learning_rate": 1.4459887844834908e-05, + "loss": 2.1011, + "step": 38300 + }, + { + "epoch": 0.7060731132075472, + "grad_norm": 3.21875, + "learning_rate": 1.4454706070299929e-05, + "loss": 2.0861, + "step": 38320 + }, + { + "epoch": 0.7064416273584906, + "grad_norm": 3.328125, + "learning_rate": 1.444952280308268e-05, + "loss": 2.1042, + "step": 38340 + }, + { + "epoch": 0.706810141509434, + "grad_norm": 3.6875, + "learning_rate": 1.4444338044919967e-05, + "loss": 2.1038, + "step": 38360 + }, + { + "epoch": 0.7071786556603774, + "grad_norm": 3.78125, + "learning_rate": 1.4439151797549103e-05, + "loss": 2.0971, + "step": 38380 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 3.296875, + "learning_rate": 1.4433964062707886e-05, + "loss": 2.1052, + "step": 38400 + }, + { + "epoch": 0.7079156839622641, + "grad_norm": 3.515625, + "learning_rate": 1.4428774842134634e-05, + "loss": 2.0734, + "step": 38420 + }, + { + "epoch": 0.7082841981132075, + "grad_norm": 3.140625, + "learning_rate": 1.4423584137568137e-05, + "loss": 2.0995, + "step": 38440 + }, + { + "epoch": 0.7086527122641509, + "grad_norm": 3.53125, + "learning_rate": 1.4418391950747706e-05, + "loss": 2.1332, + "step": 38460 + }, + { + "epoch": 0.7090212264150944, + "grad_norm": 3.875, + "learning_rate": 1.441319828341313e-05, + "loss": 2.1169, + "step": 38480 + }, + { + "epoch": 0.7093897405660378, + "grad_norm": 3.375, + "learning_rate": 1.440800313730471e-05, + "loss": 2.0614, + "step": 38500 + }, + { + "epoch": 0.7097582547169812, + "grad_norm": 3.625, + "learning_rate": 1.4402806514163228e-05, + "loss": 2.1203, + "step": 38520 + }, + { + "epoch": 0.7101267688679245, + "grad_norm": 3.15625, + "learning_rate": 1.4397608415729974e-05, + "loss": 2.1268, + "step": 38540 + }, + { + "epoch": 0.7104952830188679, + "grad_norm": 3.390625, + "learning_rate": 1.4392408843746722e-05, + "loss": 2.094, + "step": 38560 + }, + { + "epoch": 0.7108637971698113, + "grad_norm": 3.0625, + "learning_rate": 1.4387207799955743e-05, + "loss": 2.0892, + "step": 38580 + }, + { + "epoch": 0.7112323113207547, + "grad_norm": 3.265625, + "learning_rate": 1.4382005286099805e-05, + "loss": 2.0893, + "step": 38600 + }, + { + "epoch": 0.7116008254716981, + "grad_norm": 3.546875, + "learning_rate": 1.4376801303922161e-05, + "loss": 2.0827, + "step": 38620 + }, + { + "epoch": 0.7119693396226415, + "grad_norm": 3.828125, + "learning_rate": 1.4371595855166569e-05, + "loss": 2.0946, + "step": 38640 + }, + { + "epoch": 0.7123378537735849, + "grad_norm": 3.203125, + "learning_rate": 1.4366388941577261e-05, + "loss": 2.0905, + "step": 38660 + }, + { + "epoch": 0.7127063679245284, + "grad_norm": 3.0, + "learning_rate": 1.4361180564898976e-05, + "loss": 2.0803, + "step": 38680 + }, + { + "epoch": 0.7130748820754716, + "grad_norm": 3.703125, + "learning_rate": 1.4355970726876928e-05, + "loss": 2.0758, + "step": 38700 + }, + { + "epoch": 0.7134433962264151, + "grad_norm": 3.5, + "learning_rate": 1.4350759429256842e-05, + "loss": 2.146, + "step": 38720 + }, + { + "epoch": 0.7138119103773585, + "grad_norm": 3.34375, + "learning_rate": 1.4345546673784902e-05, + "loss": 2.1191, + "step": 38740 + }, + { + "epoch": 0.7141804245283019, + "grad_norm": 3.21875, + "learning_rate": 1.4340332462207814e-05, + "loss": 2.0651, + "step": 38760 + }, + { + "epoch": 0.7145489386792453, + "grad_norm": 3.328125, + "learning_rate": 1.433511679627274e-05, + "loss": 2.086, + "step": 38780 + }, + { + "epoch": 0.7149174528301887, + "grad_norm": 3.21875, + "learning_rate": 1.4329899677727357e-05, + "loss": 2.1065, + "step": 38800 + }, + { + "epoch": 0.7152859669811321, + "grad_norm": 3.25, + "learning_rate": 1.4324681108319807e-05, + "loss": 2.0784, + "step": 38820 + }, + { + "epoch": 0.7156544811320755, + "grad_norm": 3.5, + "learning_rate": 1.4319461089798732e-05, + "loss": 2.1246, + "step": 38840 + }, + { + "epoch": 0.7160229952830188, + "grad_norm": 3.1875, + "learning_rate": 1.431423962391326e-05, + "loss": 2.0903, + "step": 38860 + }, + { + "epoch": 0.7163915094339622, + "grad_norm": 3.609375, + "learning_rate": 1.4309016712412986e-05, + "loss": 2.1025, + "step": 38880 + }, + { + "epoch": 0.7167600235849056, + "grad_norm": 3.453125, + "learning_rate": 1.4303792357048016e-05, + "loss": 2.114, + "step": 38900 + }, + { + "epoch": 0.7171285377358491, + "grad_norm": 3.609375, + "learning_rate": 1.4298566559568916e-05, + "loss": 2.098, + "step": 38920 + }, + { + "epoch": 0.7174970518867925, + "grad_norm": 3.5625, + "learning_rate": 1.4293339321726753e-05, + "loss": 2.0719, + "step": 38940 + }, + { + "epoch": 0.7178655660377359, + "grad_norm": 3.5625, + "learning_rate": 1.4288110645273064e-05, + "loss": 2.089, + "step": 38960 + }, + { + "epoch": 0.7182340801886793, + "grad_norm": 3.4375, + "learning_rate": 1.428288053195988e-05, + "loss": 2.1086, + "step": 38980 + }, + { + "epoch": 0.7186025943396226, + "grad_norm": 3.59375, + "learning_rate": 1.4277648983539694e-05, + "loss": 2.0995, + "step": 39000 + }, + { + "epoch": 0.718971108490566, + "grad_norm": 3.53125, + "learning_rate": 1.4272416001765506e-05, + "loss": 2.087, + "step": 39020 + }, + { + "epoch": 0.7193396226415094, + "grad_norm": 3.59375, + "learning_rate": 1.4267181588390772e-05, + "loss": 2.0992, + "step": 39040 + }, + { + "epoch": 0.7197081367924528, + "grad_norm": 4.28125, + "learning_rate": 1.426194574516945e-05, + "loss": 2.0745, + "step": 39060 + }, + { + "epoch": 0.7200766509433962, + "grad_norm": 3.375, + "learning_rate": 1.4256708473855954e-05, + "loss": 2.0645, + "step": 39080 + }, + { + "epoch": 0.7204451650943396, + "grad_norm": 3.515625, + "learning_rate": 1.4251469776205195e-05, + "loss": 2.0913, + "step": 39100 + }, + { + "epoch": 0.7208136792452831, + "grad_norm": 3.328125, + "learning_rate": 1.4246229653972556e-05, + "loss": 2.0756, + "step": 39120 + }, + { + "epoch": 0.7211821933962265, + "grad_norm": 3.75, + "learning_rate": 1.4240988108913889e-05, + "loss": 2.1318, + "step": 39140 + }, + { + "epoch": 0.7215507075471698, + "grad_norm": 3.640625, + "learning_rate": 1.4235745142785538e-05, + "loss": 2.1585, + "step": 39160 + }, + { + "epoch": 0.7219192216981132, + "grad_norm": 3.703125, + "learning_rate": 1.4230500757344308e-05, + "loss": 2.0931, + "step": 39180 + }, + { + "epoch": 0.7222877358490566, + "grad_norm": 3.203125, + "learning_rate": 1.4225254954347497e-05, + "loss": 2.0734, + "step": 39200 + }, + { + "epoch": 0.72265625, + "grad_norm": 3.640625, + "learning_rate": 1.4220007735552856e-05, + "loss": 2.1068, + "step": 39220 + }, + { + "epoch": 0.7230247641509434, + "grad_norm": 3.5, + "learning_rate": 1.4214759102718631e-05, + "loss": 2.087, + "step": 39240 + }, + { + "epoch": 0.7233932783018868, + "grad_norm": 3.5, + "learning_rate": 1.4209509057603532e-05, + "loss": 2.1092, + "step": 39260 + }, + { + "epoch": 0.7237617924528302, + "grad_norm": 3.546875, + "learning_rate": 1.420425760196674e-05, + "loss": 2.0949, + "step": 39280 + }, + { + "epoch": 0.7241303066037735, + "grad_norm": 3.40625, + "learning_rate": 1.4199004737567915e-05, + "loss": 2.1012, + "step": 39300 + }, + { + "epoch": 0.7244988207547169, + "grad_norm": 3.6875, + "learning_rate": 1.4193750466167184e-05, + "loss": 2.0661, + "step": 39320 + }, + { + "epoch": 0.7248673349056604, + "grad_norm": 3.390625, + "learning_rate": 1.418849478952515e-05, + "loss": 2.0822, + "step": 39340 + }, + { + "epoch": 0.7252358490566038, + "grad_norm": 3.421875, + "learning_rate": 1.4183237709402881e-05, + "loss": 2.118, + "step": 39360 + }, + { + "epoch": 0.7256043632075472, + "grad_norm": 3.25, + "learning_rate": 1.4177979227561921e-05, + "loss": 2.0932, + "step": 39380 + }, + { + "epoch": 0.7259728773584906, + "grad_norm": 3.640625, + "learning_rate": 1.4172719345764281e-05, + "loss": 2.1111, + "step": 39400 + }, + { + "epoch": 0.726341391509434, + "grad_norm": 3.4375, + "learning_rate": 1.4167458065772443e-05, + "loss": 2.118, + "step": 39420 + }, + { + "epoch": 0.7267099056603774, + "grad_norm": 3.09375, + "learning_rate": 1.416219538934935e-05, + "loss": 2.1181, + "step": 39440 + }, + { + "epoch": 0.7270784198113207, + "grad_norm": 3.28125, + "learning_rate": 1.4156931318258425e-05, + "loss": 2.1094, + "step": 39460 + }, + { + "epoch": 0.7274469339622641, + "grad_norm": 3.40625, + "learning_rate": 1.415166585426355e-05, + "loss": 2.0935, + "step": 39480 + }, + { + "epoch": 0.7278154481132075, + "grad_norm": 3.8125, + "learning_rate": 1.4146398999129077e-05, + "loss": 2.1367, + "step": 39500 + }, + { + "epoch": 0.7281839622641509, + "grad_norm": 3.3125, + "learning_rate": 1.4141130754619821e-05, + "loss": 2.1076, + "step": 39520 + }, + { + "epoch": 0.7285524764150944, + "grad_norm": 3.484375, + "learning_rate": 1.4135861122501067e-05, + "loss": 2.0496, + "step": 39540 + }, + { + "epoch": 0.7289209905660378, + "grad_norm": 3.5625, + "learning_rate": 1.413059010453856e-05, + "loss": 2.118, + "step": 39560 + }, + { + "epoch": 0.7292895047169812, + "grad_norm": 3.640625, + "learning_rate": 1.4125317702498513e-05, + "loss": 2.1051, + "step": 39580 + }, + { + "epoch": 0.7296580188679245, + "grad_norm": 3.34375, + "learning_rate": 1.4120043918147604e-05, + "loss": 2.0852, + "step": 39600 + }, + { + "epoch": 0.7300265330188679, + "grad_norm": 3.3125, + "learning_rate": 1.4114768753252968e-05, + "loss": 2.068, + "step": 39620 + }, + { + "epoch": 0.7303950471698113, + "grad_norm": 3.53125, + "learning_rate": 1.410949220958221e-05, + "loss": 2.0859, + "step": 39640 + }, + { + "epoch": 0.7307635613207547, + "grad_norm": 4.0, + "learning_rate": 1.410421428890339e-05, + "loss": 2.0805, + "step": 39660 + }, + { + "epoch": 0.7311320754716981, + "grad_norm": 3.21875, + "learning_rate": 1.4098934992985038e-05, + "loss": 2.1178, + "step": 39680 + }, + { + "epoch": 0.7315005896226415, + "grad_norm": 3.3125, + "learning_rate": 1.4093654323596133e-05, + "loss": 2.0823, + "step": 39700 + }, + { + "epoch": 0.7318691037735849, + "grad_norm": 3.25, + "learning_rate": 1.4088372282506126e-05, + "loss": 2.0719, + "step": 39720 + }, + { + "epoch": 0.7322376179245284, + "grad_norm": 3.21875, + "learning_rate": 1.4083088871484923e-05, + "loss": 2.0899, + "step": 39740 + }, + { + "epoch": 0.7326061320754716, + "grad_norm": 3.390625, + "learning_rate": 1.4077804092302883e-05, + "loss": 2.0709, + "step": 39760 + }, + { + "epoch": 0.7329746462264151, + "grad_norm": 3.28125, + "learning_rate": 1.4072517946730835e-05, + "loss": 2.1359, + "step": 39780 + }, + { + "epoch": 0.7333431603773585, + "grad_norm": 3.25, + "learning_rate": 1.4067230436540058e-05, + "loss": 2.1006, + "step": 39800 + }, + { + "epoch": 0.7337116745283019, + "grad_norm": 3.5625, + "learning_rate": 1.4061941563502289e-05, + "loss": 2.0951, + "step": 39820 + }, + { + "epoch": 0.7340801886792453, + "grad_norm": 3.65625, + "learning_rate": 1.4056651329389724e-05, + "loss": 2.0352, + "step": 39840 + }, + { + "epoch": 0.7344487028301887, + "grad_norm": 3.375, + "learning_rate": 1.4051359735975012e-05, + "loss": 2.1079, + "step": 39860 + }, + { + "epoch": 0.7348172169811321, + "grad_norm": 3.609375, + "learning_rate": 1.404606678503126e-05, + "loss": 2.0916, + "step": 39880 + }, + { + "epoch": 0.7351857311320755, + "grad_norm": 3.671875, + "learning_rate": 1.4040772478332034e-05, + "loss": 2.058, + "step": 39900 + }, + { + "epoch": 0.7355542452830188, + "grad_norm": 3.484375, + "learning_rate": 1.4035476817651344e-05, + "loss": 2.1246, + "step": 39920 + }, + { + "epoch": 0.7359227594339622, + "grad_norm": 3.46875, + "learning_rate": 1.403017980476366e-05, + "loss": 2.0558, + "step": 39940 + }, + { + "epoch": 0.7362912735849056, + "grad_norm": 3.453125, + "learning_rate": 1.4024881441443905e-05, + "loss": 2.1148, + "step": 39960 + }, + { + "epoch": 0.7366597877358491, + "grad_norm": 3.578125, + "learning_rate": 1.4019581729467453e-05, + "loss": 2.1314, + "step": 39980 + }, + { + "epoch": 0.7370283018867925, + "grad_norm": 3.46875, + "learning_rate": 1.4014280670610134e-05, + "loss": 2.0798, + "step": 40000 + }, + { + "epoch": 0.7373968160377359, + "grad_norm": 3.578125, + "learning_rate": 1.4008978266648224e-05, + "loss": 2.1203, + "step": 40020 + }, + { + "epoch": 0.7377653301886793, + "grad_norm": 3.203125, + "learning_rate": 1.400367451935845e-05, + "loss": 2.1077, + "step": 40040 + }, + { + "epoch": 0.7381338443396226, + "grad_norm": 3.421875, + "learning_rate": 1.3998369430517994e-05, + "loss": 2.0818, + "step": 40060 + }, + { + "epoch": 0.738502358490566, + "grad_norm": 3.5, + "learning_rate": 1.3993063001904481e-05, + "loss": 2.1, + "step": 40080 + }, + { + "epoch": 0.7388708726415094, + "grad_norm": 3.609375, + "learning_rate": 1.398775523529599e-05, + "loss": 2.1003, + "step": 40100 + }, + { + "epoch": 0.7392393867924528, + "grad_norm": 3.9375, + "learning_rate": 1.3982446132471048e-05, + "loss": 2.0904, + "step": 40120 + }, + { + "epoch": 0.7396079009433962, + "grad_norm": 3.578125, + "learning_rate": 1.3977135695208625e-05, + "loss": 2.1335, + "step": 40140 + }, + { + "epoch": 0.7399764150943396, + "grad_norm": 3.390625, + "learning_rate": 1.3971823925288146e-05, + "loss": 2.0561, + "step": 40160 + }, + { + "epoch": 0.7403449292452831, + "grad_norm": 3.546875, + "learning_rate": 1.3966510824489477e-05, + "loss": 2.1061, + "step": 40180 + }, + { + "epoch": 0.7407134433962265, + "grad_norm": 3.140625, + "learning_rate": 1.3961196394592924e-05, + "loss": 2.1275, + "step": 40200 + }, + { + "epoch": 0.7410819575471698, + "grad_norm": 3.578125, + "learning_rate": 1.3955880637379257e-05, + "loss": 2.0904, + "step": 40220 + }, + { + "epoch": 0.7414504716981132, + "grad_norm": 3.296875, + "learning_rate": 1.3950563554629673e-05, + "loss": 2.1006, + "step": 40240 + }, + { + "epoch": 0.7418189858490566, + "grad_norm": 3.78125, + "learning_rate": 1.3945245148125821e-05, + "loss": 2.129, + "step": 40260 + }, + { + "epoch": 0.7421875, + "grad_norm": 3.359375, + "learning_rate": 1.3939925419649787e-05, + "loss": 2.0687, + "step": 40280 + }, + { + "epoch": 0.7425560141509434, + "grad_norm": 3.3125, + "learning_rate": 1.393460437098411e-05, + "loss": 2.0922, + "step": 40300 + }, + { + "epoch": 0.7429245283018868, + "grad_norm": 3.46875, + "learning_rate": 1.3929282003911766e-05, + "loss": 2.1088, + "step": 40320 + }, + { + "epoch": 0.7432930424528302, + "grad_norm": 3.234375, + "learning_rate": 1.3923958320216172e-05, + "loss": 2.1105, + "step": 40340 + }, + { + "epoch": 0.7436615566037735, + "grad_norm": 3.46875, + "learning_rate": 1.3918633321681185e-05, + "loss": 2.0803, + "step": 40360 + }, + { + "epoch": 0.7440300707547169, + "grad_norm": 3.28125, + "learning_rate": 1.3913307010091109e-05, + "loss": 2.0775, + "step": 40380 + }, + { + "epoch": 0.7443985849056604, + "grad_norm": 3.625, + "learning_rate": 1.3907979387230679e-05, + "loss": 2.1188, + "step": 40400 + }, + { + "epoch": 0.7447670990566038, + "grad_norm": 3.53125, + "learning_rate": 1.3902650454885073e-05, + "loss": 2.0727, + "step": 40420 + }, + { + "epoch": 0.7451356132075472, + "grad_norm": 3.703125, + "learning_rate": 1.389732021483992e-05, + "loss": 2.1374, + "step": 40440 + }, + { + "epoch": 0.7455041273584906, + "grad_norm": 3.28125, + "learning_rate": 1.3891988668881262e-05, + "loss": 2.1171, + "step": 40460 + }, + { + "epoch": 0.745872641509434, + "grad_norm": 3.578125, + "learning_rate": 1.3886655818795607e-05, + "loss": 2.0877, + "step": 40480 + }, + { + "epoch": 0.7462411556603774, + "grad_norm": 3.625, + "learning_rate": 1.388132166636987e-05, + "loss": 2.0778, + "step": 40500 + }, + { + "epoch": 0.7466096698113207, + "grad_norm": 3.25, + "learning_rate": 1.387598621339143e-05, + "loss": 2.0805, + "step": 40520 + }, + { + "epoch": 0.7469781839622641, + "grad_norm": 3.625, + "learning_rate": 1.3870649461648084e-05, + "loss": 2.0748, + "step": 40540 + }, + { + "epoch": 0.7473466981132075, + "grad_norm": 4.53125, + "learning_rate": 1.3865311412928075e-05, + "loss": 2.1069, + "step": 40560 + }, + { + "epoch": 0.7477152122641509, + "grad_norm": 3.734375, + "learning_rate": 1.3859972069020068e-05, + "loss": 2.1014, + "step": 40580 + }, + { + "epoch": 0.7480837264150944, + "grad_norm": 3.5, + "learning_rate": 1.3854631431713175e-05, + "loss": 2.1297, + "step": 40600 + }, + { + "epoch": 0.7484522405660378, + "grad_norm": 3.78125, + "learning_rate": 1.3849289502796936e-05, + "loss": 2.085, + "step": 40620 + }, + { + "epoch": 0.7488207547169812, + "grad_norm": 3.421875, + "learning_rate": 1.384394628406132e-05, + "loss": 2.094, + "step": 40640 + }, + { + "epoch": 0.7491892688679245, + "grad_norm": 3.3125, + "learning_rate": 1.383860177729674e-05, + "loss": 2.1054, + "step": 40660 + }, + { + "epoch": 0.7495577830188679, + "grad_norm": 4.125, + "learning_rate": 1.383325598429402e-05, + "loss": 2.0918, + "step": 40680 + }, + { + "epoch": 0.7499262971698113, + "grad_norm": 3.78125, + "learning_rate": 1.3827908906844441e-05, + "loss": 2.158, + "step": 40700 + }, + { + "epoch": 0.7502948113207547, + "grad_norm": 3.46875, + "learning_rate": 1.3822560546739689e-05, + "loss": 2.1069, + "step": 40720 + }, + { + "epoch": 0.7506633254716981, + "grad_norm": 3.234375, + "learning_rate": 1.38172109057719e-05, + "loss": 2.0954, + "step": 40740 + }, + { + "epoch": 0.7510318396226415, + "grad_norm": 3.53125, + "learning_rate": 1.3811859985733627e-05, + "loss": 2.0775, + "step": 40760 + }, + { + "epoch": 0.7514003537735849, + "grad_norm": 3.796875, + "learning_rate": 1.380650778841786e-05, + "loss": 2.0638, + "step": 40780 + }, + { + "epoch": 0.7517688679245284, + "grad_norm": 3.5625, + "learning_rate": 1.3801154315618007e-05, + "loss": 2.0885, + "step": 40800 + }, + { + "epoch": 0.7521373820754716, + "grad_norm": 3.4375, + "learning_rate": 1.379579956912791e-05, + "loss": 2.0732, + "step": 40820 + }, + { + "epoch": 0.7525058962264151, + "grad_norm": 3.234375, + "learning_rate": 1.379044355074184e-05, + "loss": 2.0763, + "step": 40840 + }, + { + "epoch": 0.7528744103773585, + "grad_norm": 3.140625, + "learning_rate": 1.378508626225449e-05, + "loss": 2.0928, + "step": 40860 + }, + { + "epoch": 0.7532429245283019, + "grad_norm": 3.765625, + "learning_rate": 1.3779727705460975e-05, + "loss": 2.0842, + "step": 40880 + }, + { + "epoch": 0.7536114386792453, + "grad_norm": 3.234375, + "learning_rate": 1.3774367882156845e-05, + "loss": 2.0705, + "step": 40900 + }, + { + "epoch": 0.7539799528301887, + "grad_norm": 3.59375, + "learning_rate": 1.3769006794138067e-05, + "loss": 2.1031, + "step": 40920 + }, + { + "epoch": 0.7543484669811321, + "grad_norm": 3.390625, + "learning_rate": 1.376364444320103e-05, + "loss": 2.0728, + "step": 40940 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 3.625, + "learning_rate": 1.3758280831142557e-05, + "loss": 2.1029, + "step": 40960 + }, + { + "epoch": 0.7550854952830188, + "grad_norm": 3.546875, + "learning_rate": 1.3752915959759876e-05, + "loss": 2.1108, + "step": 40980 + }, + { + "epoch": 0.7554540094339622, + "grad_norm": 3.796875, + "learning_rate": 1.3747549830850661e-05, + "loss": 2.0893, + "step": 41000 + }, + { + "epoch": 0.7558225235849056, + "grad_norm": 3.546875, + "learning_rate": 1.374218244621298e-05, + "loss": 2.1123, + "step": 41020 + }, + { + "epoch": 0.7561910377358491, + "grad_norm": 3.4375, + "learning_rate": 1.3736813807645344e-05, + "loss": 2.1372, + "step": 41040 + }, + { + "epoch": 0.7565595518867925, + "grad_norm": 3.34375, + "learning_rate": 1.3731443916946674e-05, + "loss": 2.0996, + "step": 41060 + }, + { + "epoch": 0.7569280660377359, + "grad_norm": 3.625, + "learning_rate": 1.372607277591631e-05, + "loss": 2.1195, + "step": 41080 + }, + { + "epoch": 0.7572965801886793, + "grad_norm": 3.296875, + "learning_rate": 1.3720700386354018e-05, + "loss": 2.0931, + "step": 41100 + }, + { + "epoch": 0.7576650943396226, + "grad_norm": 3.390625, + "learning_rate": 1.3715326750059972e-05, + "loss": 2.0804, + "step": 41120 + }, + { + "epoch": 0.758033608490566, + "grad_norm": 3.5, + "learning_rate": 1.3709951868834772e-05, + "loss": 2.0863, + "step": 41140 + }, + { + "epoch": 0.7584021226415094, + "grad_norm": 3.140625, + "learning_rate": 1.3704575744479432e-05, + "loss": 2.1055, + "step": 41160 + }, + { + "epoch": 0.7587706367924528, + "grad_norm": 3.546875, + "learning_rate": 1.3699198378795382e-05, + "loss": 2.0796, + "step": 41180 + }, + { + "epoch": 0.7591391509433962, + "grad_norm": 3.375, + "learning_rate": 1.3693819773584472e-05, + "loss": 2.125, + "step": 41200 + }, + { + "epoch": 0.7595076650943396, + "grad_norm": 3.515625, + "learning_rate": 1.3688439930648963e-05, + "loss": 2.1321, + "step": 41220 + }, + { + "epoch": 0.7598761792452831, + "grad_norm": 3.40625, + "learning_rate": 1.3683058851791527e-05, + "loss": 2.0773, + "step": 41240 + }, + { + "epoch": 0.7602446933962265, + "grad_norm": 3.9375, + "learning_rate": 1.3677676538815261e-05, + "loss": 2.0873, + "step": 41260 + }, + { + "epoch": 0.7606132075471698, + "grad_norm": 3.796875, + "learning_rate": 1.3672292993523668e-05, + "loss": 2.0912, + "step": 41280 + }, + { + "epoch": 0.7609817216981132, + "grad_norm": 3.53125, + "learning_rate": 1.3666908217720664e-05, + "loss": 2.0786, + "step": 41300 + }, + { + "epoch": 0.7613502358490566, + "grad_norm": 3.265625, + "learning_rate": 1.3661522213210583e-05, + "loss": 2.1001, + "step": 41320 + }, + { + "epoch": 0.76171875, + "grad_norm": 3.96875, + "learning_rate": 1.3656134981798164e-05, + "loss": 2.1018, + "step": 41340 + }, + { + "epoch": 0.7620872641509434, + "grad_norm": 3.171875, + "learning_rate": 1.3650746525288558e-05, + "loss": 2.0987, + "step": 41360 + }, + { + "epoch": 0.7624557783018868, + "grad_norm": 4.46875, + "learning_rate": 1.3645356845487332e-05, + "loss": 2.1085, + "step": 41380 + }, + { + "epoch": 0.7628242924528302, + "grad_norm": 3.71875, + "learning_rate": 1.3639965944200455e-05, + "loss": 2.0722, + "step": 41400 + }, + { + "epoch": 0.7631928066037735, + "grad_norm": 3.90625, + "learning_rate": 1.3634573823234314e-05, + "loss": 2.1317, + "step": 41420 + }, + { + "epoch": 0.7635613207547169, + "grad_norm": 3.59375, + "learning_rate": 1.3629180484395696e-05, + "loss": 2.0615, + "step": 41440 + }, + { + "epoch": 0.7639298349056604, + "grad_norm": 3.59375, + "learning_rate": 1.3623785929491802e-05, + "loss": 2.078, + "step": 41460 + }, + { + "epoch": 0.7642983490566038, + "grad_norm": 3.390625, + "learning_rate": 1.361839016033024e-05, + "loss": 2.1012, + "step": 41480 + }, + { + "epoch": 0.7646668632075472, + "grad_norm": 3.6875, + "learning_rate": 1.3612993178719021e-05, + "loss": 2.0835, + "step": 41500 + }, + { + "epoch": 0.7650353773584906, + "grad_norm": 3.625, + "learning_rate": 1.3607594986466566e-05, + "loss": 2.0691, + "step": 41520 + }, + { + "epoch": 0.765403891509434, + "grad_norm": 3.359375, + "learning_rate": 1.36021955853817e-05, + "loss": 2.114, + "step": 41540 + }, + { + "epoch": 0.7657724056603774, + "grad_norm": 3.796875, + "learning_rate": 1.3596794977273655e-05, + "loss": 2.1038, + "step": 41560 + }, + { + "epoch": 0.7661409198113207, + "grad_norm": 3.578125, + "learning_rate": 1.3591393163952064e-05, + "loss": 2.0951, + "step": 41580 + }, + { + "epoch": 0.7665094339622641, + "grad_norm": 3.5625, + "learning_rate": 1.3585990147226967e-05, + "loss": 2.077, + "step": 41600 + }, + { + "epoch": 0.7668779481132075, + "grad_norm": 3.546875, + "learning_rate": 1.3580585928908808e-05, + "loss": 2.0624, + "step": 41620 + }, + { + "epoch": 0.7672464622641509, + "grad_norm": 3.515625, + "learning_rate": 1.3575180510808425e-05, + "loss": 2.075, + "step": 41640 + }, + { + "epoch": 0.7676149764150944, + "grad_norm": 3.84375, + "learning_rate": 1.3569773894737073e-05, + "loss": 2.0686, + "step": 41660 + }, + { + "epoch": 0.7679834905660378, + "grad_norm": 3.625, + "learning_rate": 1.3564366082506395e-05, + "loss": 2.1025, + "step": 41680 + }, + { + "epoch": 0.7683520047169812, + "grad_norm": 3.53125, + "learning_rate": 1.3558957075928442e-05, + "loss": 2.1076, + "step": 41700 + }, + { + "epoch": 0.7687205188679245, + "grad_norm": 3.625, + "learning_rate": 1.3553546876815662e-05, + "loss": 2.1399, + "step": 41720 + }, + { + "epoch": 0.7690890330188679, + "grad_norm": 3.609375, + "learning_rate": 1.3548135486980904e-05, + "loss": 2.0815, + "step": 41740 + }, + { + "epoch": 0.7694575471698113, + "grad_norm": 3.625, + "learning_rate": 1.3542722908237415e-05, + "loss": 2.0644, + "step": 41760 + }, + { + "epoch": 0.7698260613207547, + "grad_norm": 3.46875, + "learning_rate": 1.3537309142398845e-05, + "loss": 2.0897, + "step": 41780 + }, + { + "epoch": 0.7701945754716981, + "grad_norm": 3.75, + "learning_rate": 1.3531894191279235e-05, + "loss": 2.1041, + "step": 41800 + }, + { + "epoch": 0.7705630896226415, + "grad_norm": 3.15625, + "learning_rate": 1.3526478056693025e-05, + "loss": 2.0835, + "step": 41820 + }, + { + "epoch": 0.7709316037735849, + "grad_norm": 3.453125, + "learning_rate": 1.3521060740455053e-05, + "loss": 2.0779, + "step": 41840 + }, + { + "epoch": 0.7713001179245284, + "grad_norm": 3.484375, + "learning_rate": 1.3515642244380553e-05, + "loss": 2.0881, + "step": 41860 + }, + { + "epoch": 0.7716686320754716, + "grad_norm": 3.328125, + "learning_rate": 1.3510222570285154e-05, + "loss": 2.1035, + "step": 41880 + }, + { + "epoch": 0.7720371462264151, + "grad_norm": 3.359375, + "learning_rate": 1.350480171998488e-05, + "loss": 2.0974, + "step": 41900 + }, + { + "epoch": 0.7724056603773585, + "grad_norm": 3.578125, + "learning_rate": 1.349937969529615e-05, + "loss": 2.0843, + "step": 41920 + }, + { + "epoch": 0.7727741745283019, + "grad_norm": 3.140625, + "learning_rate": 1.3493956498035772e-05, + "loss": 2.0871, + "step": 41940 + }, + { + "epoch": 0.7731426886792453, + "grad_norm": 3.71875, + "learning_rate": 1.3488532130020952e-05, + "loss": 2.1291, + "step": 41960 + }, + { + "epoch": 0.7735112028301887, + "grad_norm": 3.109375, + "learning_rate": 1.3483106593069286e-05, + "loss": 2.0851, + "step": 41980 + }, + { + "epoch": 0.7738797169811321, + "grad_norm": 3.328125, + "learning_rate": 1.3477679888998763e-05, + "loss": 2.0888, + "step": 42000 + }, + { + "epoch": 0.7742482311320755, + "grad_norm": 3.4375, + "learning_rate": 1.3472252019627763e-05, + "loss": 2.0906, + "step": 42020 + }, + { + "epoch": 0.7746167452830188, + "grad_norm": 3.359375, + "learning_rate": 1.346682298677505e-05, + "loss": 2.0937, + "step": 42040 + }, + { + "epoch": 0.7749852594339622, + "grad_norm": 3.328125, + "learning_rate": 1.3461392792259793e-05, + "loss": 2.0685, + "step": 42060 + }, + { + "epoch": 0.7753537735849056, + "grad_norm": 3.40625, + "learning_rate": 1.3455961437901532e-05, + "loss": 2.1033, + "step": 42080 + }, + { + "epoch": 0.7757222877358491, + "grad_norm": 3.359375, + "learning_rate": 1.3450528925520212e-05, + "loss": 2.1145, + "step": 42100 + }, + { + "epoch": 0.7760908018867925, + "grad_norm": 3.546875, + "learning_rate": 1.3445095256936154e-05, + "loss": 2.1189, + "step": 42120 + }, + { + "epoch": 0.7764593160377359, + "grad_norm": 3.515625, + "learning_rate": 1.3439660433970068e-05, + "loss": 2.0858, + "step": 42140 + }, + { + "epoch": 0.7768278301886793, + "grad_norm": 3.359375, + "learning_rate": 1.3434224458443059e-05, + "loss": 2.088, + "step": 42160 + }, + { + "epoch": 0.7771963443396226, + "grad_norm": 3.4375, + "learning_rate": 1.342878733217661e-05, + "loss": 2.08, + "step": 42180 + }, + { + "epoch": 0.777564858490566, + "grad_norm": 3.8125, + "learning_rate": 1.3423349056992596e-05, + "loss": 2.0844, + "step": 42200 + }, + { + "epoch": 0.7779333726415094, + "grad_norm": 3.78125, + "learning_rate": 1.3417909634713269e-05, + "loss": 2.0528, + "step": 42220 + }, + { + "epoch": 0.7783018867924528, + "grad_norm": 3.46875, + "learning_rate": 1.3412469067161276e-05, + "loss": 2.1108, + "step": 42240 + }, + { + "epoch": 0.7786704009433962, + "grad_norm": 3.421875, + "learning_rate": 1.340702735615963e-05, + "loss": 2.075, + "step": 42260 + }, + { + "epoch": 0.7790389150943396, + "grad_norm": 3.671875, + "learning_rate": 1.3401584503531753e-05, + "loss": 2.0889, + "step": 42280 + }, + { + "epoch": 0.7794074292452831, + "grad_norm": 3.390625, + "learning_rate": 1.3396140511101425e-05, + "loss": 2.1017, + "step": 42300 + }, + { + "epoch": 0.7797759433962265, + "grad_norm": 3.453125, + "learning_rate": 1.3390695380692824e-05, + "loss": 2.1067, + "step": 42320 + }, + { + "epoch": 0.7801444575471698, + "grad_norm": 3.5, + "learning_rate": 1.33852491141305e-05, + "loss": 2.1052, + "step": 42340 + }, + { + "epoch": 0.7805129716981132, + "grad_norm": 3.359375, + "learning_rate": 1.3379801713239388e-05, + "loss": 2.0839, + "step": 42360 + }, + { + "epoch": 0.7808814858490566, + "grad_norm": 3.671875, + "learning_rate": 1.3374353179844803e-05, + "loss": 2.11, + "step": 42380 + }, + { + "epoch": 0.78125, + "grad_norm": 3.28125, + "learning_rate": 1.3368903515772437e-05, + "loss": 2.0597, + "step": 42400 + }, + { + "epoch": 0.7816185141509434, + "grad_norm": 3.328125, + "learning_rate": 1.336345272284837e-05, + "loss": 2.0703, + "step": 42420 + }, + { + "epoch": 0.7819870283018868, + "grad_norm": 3.1875, + "learning_rate": 1.335800080289904e-05, + "loss": 2.1121, + "step": 42440 + }, + { + "epoch": 0.7823555424528302, + "grad_norm": 4.0625, + "learning_rate": 1.3352547757751287e-05, + "loss": 2.1068, + "step": 42460 + }, + { + "epoch": 0.7827240566037735, + "grad_norm": 3.5625, + "learning_rate": 1.3347093589232309e-05, + "loss": 2.0991, + "step": 42480 + }, + { + "epoch": 0.7830925707547169, + "grad_norm": 3.65625, + "learning_rate": 1.3341638299169694e-05, + "loss": 2.1281, + "step": 42500 + }, + { + "epoch": 0.7834610849056604, + "grad_norm": 3.359375, + "learning_rate": 1.3336181889391395e-05, + "loss": 2.1012, + "step": 42520 + }, + { + "epoch": 0.7838295990566038, + "grad_norm": 3.796875, + "learning_rate": 1.333072436172575e-05, + "loss": 2.0771, + "step": 42540 + }, + { + "epoch": 0.7841981132075472, + "grad_norm": 3.5, + "learning_rate": 1.332526571800146e-05, + "loss": 2.1212, + "step": 42560 + }, + { + "epoch": 0.7845666273584906, + "grad_norm": 3.40625, + "learning_rate": 1.3319805960047616e-05, + "loss": 2.1109, + "step": 42580 + }, + { + "epoch": 0.784935141509434, + "grad_norm": 3.609375, + "learning_rate": 1.331434508969366e-05, + "loss": 2.0779, + "step": 42600 + }, + { + "epoch": 0.7853036556603774, + "grad_norm": 3.671875, + "learning_rate": 1.3308883108769431e-05, + "loss": 2.0979, + "step": 42620 + }, + { + "epoch": 0.7856721698113207, + "grad_norm": 3.265625, + "learning_rate": 1.3303420019105125e-05, + "loss": 2.0982, + "step": 42640 + }, + { + "epoch": 0.7860406839622641, + "grad_norm": 3.25, + "learning_rate": 1.329795582253131e-05, + "loss": 2.1035, + "step": 42660 + }, + { + "epoch": 0.7864091981132075, + "grad_norm": 3.703125, + "learning_rate": 1.3292490520878936e-05, + "loss": 2.1214, + "step": 42680 + }, + { + "epoch": 0.7867777122641509, + "grad_norm": 3.140625, + "learning_rate": 1.3287024115979303e-05, + "loss": 2.1119, + "step": 42700 + }, + { + "epoch": 0.7871462264150944, + "grad_norm": 3.75, + "learning_rate": 1.3281556609664108e-05, + "loss": 2.1547, + "step": 42720 + }, + { + "epoch": 0.7875147405660378, + "grad_norm": 3.59375, + "learning_rate": 1.327608800376539e-05, + "loss": 2.1138, + "step": 42740 + }, + { + "epoch": 0.7878832547169812, + "grad_norm": 3.640625, + "learning_rate": 1.3270618300115578e-05, + "loss": 2.1169, + "step": 42760 + }, + { + "epoch": 0.7882517688679245, + "grad_norm": 3.53125, + "learning_rate": 1.326514750054745e-05, + "loss": 2.0912, + "step": 42780 + }, + { + "epoch": 0.7886202830188679, + "grad_norm": 3.90625, + "learning_rate": 1.3259675606894171e-05, + "loss": 2.0859, + "step": 42800 + }, + { + "epoch": 0.7889887971698113, + "grad_norm": 3.609375, + "learning_rate": 1.3254202620989254e-05, + "loss": 2.1029, + "step": 42820 + }, + { + "epoch": 0.7893573113207547, + "grad_norm": 3.78125, + "learning_rate": 1.3248728544666592e-05, + "loss": 2.0684, + "step": 42840 + }, + { + "epoch": 0.7897258254716981, + "grad_norm": 3.671875, + "learning_rate": 1.3243253379760433e-05, + "loss": 2.105, + "step": 42860 + }, + { + "epoch": 0.7900943396226415, + "grad_norm": 3.578125, + "learning_rate": 1.3237777128105398e-05, + "loss": 2.1283, + "step": 42880 + }, + { + "epoch": 0.7904628537735849, + "grad_norm": 3.859375, + "learning_rate": 1.3232299791536468e-05, + "loss": 2.1238, + "step": 42900 + }, + { + "epoch": 0.7908313679245284, + "grad_norm": 3.578125, + "learning_rate": 1.3226821371888985e-05, + "loss": 2.0742, + "step": 42920 + }, + { + "epoch": 0.7911998820754716, + "grad_norm": 3.96875, + "learning_rate": 1.322134187099866e-05, + "loss": 2.0957, + "step": 42940 + }, + { + "epoch": 0.7915683962264151, + "grad_norm": 3.5, + "learning_rate": 1.3215861290701563e-05, + "loss": 2.0695, + "step": 42960 + }, + { + "epoch": 0.7919369103773585, + "grad_norm": 3.578125, + "learning_rate": 1.3210379632834131e-05, + "loss": 2.0747, + "step": 42980 + }, + { + "epoch": 0.7923054245283019, + "grad_norm": 3.75, + "learning_rate": 1.3204896899233147e-05, + "loss": 2.1209, + "step": 43000 + }, + { + "epoch": 0.7926739386792453, + "grad_norm": 3.609375, + "learning_rate": 1.319941309173577e-05, + "loss": 2.1193, + "step": 43020 + }, + { + "epoch": 0.7930424528301887, + "grad_norm": 3.6875, + "learning_rate": 1.3193928212179515e-05, + "loss": 2.0677, + "step": 43040 + }, + { + "epoch": 0.7934109669811321, + "grad_norm": 3.53125, + "learning_rate": 1.3188442262402253e-05, + "loss": 2.0752, + "step": 43060 + }, + { + "epoch": 0.7937794811320755, + "grad_norm": 3.25, + "learning_rate": 1.3182955244242212e-05, + "loss": 2.0645, + "step": 43080 + }, + { + "epoch": 0.7941479952830188, + "grad_norm": 3.84375, + "learning_rate": 1.3177467159537987e-05, + "loss": 2.1051, + "step": 43100 + }, + { + "epoch": 0.7945165094339622, + "grad_norm": 3.6875, + "learning_rate": 1.3171978010128522e-05, + "loss": 2.0718, + "step": 43120 + }, + { + "epoch": 0.7948850235849056, + "grad_norm": 3.578125, + "learning_rate": 1.316648779785312e-05, + "loss": 2.0992, + "step": 43140 + }, + { + "epoch": 0.7952535377358491, + "grad_norm": 3.40625, + "learning_rate": 1.3160996524551442e-05, + "loss": 2.081, + "step": 43160 + }, + { + "epoch": 0.7956220518867925, + "grad_norm": 3.578125, + "learning_rate": 1.31555041920635e-05, + "loss": 2.122, + "step": 43180 + }, + { + "epoch": 0.7959905660377359, + "grad_norm": 3.53125, + "learning_rate": 1.3150010802229667e-05, + "loss": 2.1131, + "step": 43200 + }, + { + "epoch": 0.7963590801886793, + "grad_norm": 3.90625, + "learning_rate": 1.3144516356890664e-05, + "loss": 2.1123, + "step": 43220 + }, + { + "epoch": 0.7967275943396226, + "grad_norm": 3.59375, + "learning_rate": 1.3139020857887575e-05, + "loss": 2.0926, + "step": 43240 + }, + { + "epoch": 0.797096108490566, + "grad_norm": 3.421875, + "learning_rate": 1.3133524307061825e-05, + "loss": 2.1165, + "step": 43260 + }, + { + "epoch": 0.7974646226415094, + "grad_norm": 3.921875, + "learning_rate": 1.3128026706255199e-05, + "loss": 2.0997, + "step": 43280 + }, + { + "epoch": 0.7978331367924528, + "grad_norm": 3.375, + "learning_rate": 1.3122528057309836e-05, + "loss": 2.0881, + "step": 43300 + }, + { + "epoch": 0.7982016509433962, + "grad_norm": 3.484375, + "learning_rate": 1.3117028362068216e-05, + "loss": 2.0996, + "step": 43320 + }, + { + "epoch": 0.7985701650943396, + "grad_norm": 3.375, + "learning_rate": 1.3111527622373181e-05, + "loss": 2.0802, + "step": 43340 + }, + { + "epoch": 0.7989386792452831, + "grad_norm": 3.625, + "learning_rate": 1.3106025840067917e-05, + "loss": 2.0876, + "step": 43360 + }, + { + "epoch": 0.7993071933962265, + "grad_norm": 3.6875, + "learning_rate": 1.310052301699596e-05, + "loss": 2.1131, + "step": 43380 + }, + { + "epoch": 0.7996757075471698, + "grad_norm": 3.609375, + "learning_rate": 1.3095019155001196e-05, + "loss": 2.0812, + "step": 43400 + }, + { + "epoch": 0.8000442216981132, + "grad_norm": 3.796875, + "learning_rate": 1.3089514255927855e-05, + "loss": 2.0674, + "step": 43420 + }, + { + "epoch": 0.8004127358490566, + "grad_norm": 3.359375, + "learning_rate": 1.3084008321620525e-05, + "loss": 2.0876, + "step": 43440 + }, + { + "epoch": 0.80078125, + "grad_norm": 3.234375, + "learning_rate": 1.3078501353924124e-05, + "loss": 2.0707, + "step": 43460 + }, + { + "epoch": 0.8011497641509434, + "grad_norm": 3.359375, + "learning_rate": 1.3072993354683934e-05, + "loss": 2.0917, + "step": 43480 + }, + { + "epoch": 0.8015182783018868, + "grad_norm": 3.40625, + "learning_rate": 1.3067484325745569e-05, + "loss": 2.099, + "step": 43500 + }, + { + "epoch": 0.8018867924528302, + "grad_norm": 4.5, + "learning_rate": 1.3061974268955e-05, + "loss": 2.0977, + "step": 43520 + }, + { + "epoch": 0.8022553066037735, + "grad_norm": 3.5625, + "learning_rate": 1.3056463186158528e-05, + "loss": 2.1044, + "step": 43540 + }, + { + "epoch": 0.8026238207547169, + "grad_norm": 3.328125, + "learning_rate": 1.3050951079202814e-05, + "loss": 2.1305, + "step": 43560 + }, + { + "epoch": 0.8029923349056604, + "grad_norm": 3.5, + "learning_rate": 1.3045437949934847e-05, + "loss": 2.0935, + "step": 43580 + }, + { + "epoch": 0.8033608490566038, + "grad_norm": 3.5625, + "learning_rate": 1.3039923800201968e-05, + "loss": 2.1133, + "step": 43600 + }, + { + "epoch": 0.8037293632075472, + "grad_norm": 4.15625, + "learning_rate": 1.303440863185186e-05, + "loss": 2.1092, + "step": 43620 + }, + { + "epoch": 0.8040978773584906, + "grad_norm": 3.3125, + "learning_rate": 1.3028892446732542e-05, + "loss": 2.0989, + "step": 43640 + }, + { + "epoch": 0.804466391509434, + "grad_norm": 3.625, + "learning_rate": 1.302337524669238e-05, + "loss": 2.0902, + "step": 43660 + }, + { + "epoch": 0.8048349056603774, + "grad_norm": 3.6875, + "learning_rate": 1.301785703358007e-05, + "loss": 2.0903, + "step": 43680 + }, + { + "epoch": 0.8052034198113207, + "grad_norm": 3.34375, + "learning_rate": 1.3012337809244661e-05, + "loss": 2.0618, + "step": 43700 + }, + { + "epoch": 0.8055719339622641, + "grad_norm": 3.421875, + "learning_rate": 1.300681757553553e-05, + "loss": 2.12, + "step": 43720 + }, + { + "epoch": 0.8059404481132075, + "grad_norm": 3.671875, + "learning_rate": 1.3001296334302398e-05, + "loss": 2.0935, + "step": 43740 + }, + { + "epoch": 0.8063089622641509, + "grad_norm": 3.359375, + "learning_rate": 1.2995774087395319e-05, + "loss": 2.1094, + "step": 43760 + }, + { + "epoch": 0.8066774764150944, + "grad_norm": 3.578125, + "learning_rate": 1.2990250836664691e-05, + "loss": 2.0813, + "step": 43780 + }, + { + "epoch": 0.8070459905660378, + "grad_norm": 3.34375, + "learning_rate": 1.2984726583961238e-05, + "loss": 2.0757, + "step": 43800 + }, + { + "epoch": 0.8074145047169812, + "grad_norm": 3.75, + "learning_rate": 1.2979201331136034e-05, + "loss": 2.0955, + "step": 43820 + }, + { + "epoch": 0.8077830188679245, + "grad_norm": 3.328125, + "learning_rate": 1.2973675080040469e-05, + "loss": 2.1002, + "step": 43840 + }, + { + "epoch": 0.8081515330188679, + "grad_norm": 4.0625, + "learning_rate": 1.2968147832526292e-05, + "loss": 2.1288, + "step": 43860 + }, + { + "epoch": 0.8085200471698113, + "grad_norm": 3.421875, + "learning_rate": 1.2962619590445562e-05, + "loss": 2.0832, + "step": 43880 + }, + { + "epoch": 0.8088885613207547, + "grad_norm": 3.484375, + "learning_rate": 1.2957090355650684e-05, + "loss": 2.0747, + "step": 43900 + }, + { + "epoch": 0.8092570754716981, + "grad_norm": 3.625, + "learning_rate": 1.2951560129994394e-05, + "loss": 2.1125, + "step": 43920 + }, + { + "epoch": 0.8096255896226415, + "grad_norm": 3.5625, + "learning_rate": 1.294602891532976e-05, + "loss": 2.0695, + "step": 43940 + }, + { + "epoch": 0.8099941037735849, + "grad_norm": 3.546875, + "learning_rate": 1.2940496713510178e-05, + "loss": 2.1101, + "step": 43960 + }, + { + "epoch": 0.8103626179245284, + "grad_norm": 3.59375, + "learning_rate": 1.2934963526389378e-05, + "loss": 2.1301, + "step": 43980 + }, + { + "epoch": 0.8107311320754716, + "grad_norm": 3.90625, + "learning_rate": 1.2929429355821426e-05, + "loss": 2.0964, + "step": 44000 + }, + { + "epoch": 0.8110996462264151, + "grad_norm": 3.671875, + "learning_rate": 1.2923894203660699e-05, + "loss": 2.0785, + "step": 44020 + }, + { + "epoch": 0.8114681603773585, + "grad_norm": 3.59375, + "learning_rate": 1.2918358071761927e-05, + "loss": 2.1087, + "step": 44040 + }, + { + "epoch": 0.8118366745283019, + "grad_norm": 3.65625, + "learning_rate": 1.2912820961980149e-05, + "loss": 2.0711, + "step": 44060 + }, + { + "epoch": 0.8122051886792453, + "grad_norm": 3.640625, + "learning_rate": 1.2907282876170742e-05, + "loss": 2.1072, + "step": 44080 + }, + { + "epoch": 0.8125737028301887, + "grad_norm": 3.53125, + "learning_rate": 1.2901743816189401e-05, + "loss": 2.0843, + "step": 44100 + }, + { + "epoch": 0.8129422169811321, + "grad_norm": 3.3125, + "learning_rate": 1.2896203783892166e-05, + "loss": 2.1055, + "step": 44120 + }, + { + "epoch": 0.8133107311320755, + "grad_norm": 3.96875, + "learning_rate": 1.2890662781135374e-05, + "loss": 2.084, + "step": 44140 + }, + { + "epoch": 0.8136792452830188, + "grad_norm": 3.53125, + "learning_rate": 1.288512080977572e-05, + "loss": 2.1311, + "step": 44160 + }, + { + "epoch": 0.8140477594339622, + "grad_norm": 3.859375, + "learning_rate": 1.2879577871670195e-05, + "loss": 2.1068, + "step": 44180 + }, + { + "epoch": 0.8144162735849056, + "grad_norm": 3.40625, + "learning_rate": 1.2874033968676129e-05, + "loss": 2.118, + "step": 44200 + }, + { + "epoch": 0.8147847877358491, + "grad_norm": 3.296875, + "learning_rate": 1.2868489102651176e-05, + "loss": 2.1207, + "step": 44220 + }, + { + "epoch": 0.8151533018867925, + "grad_norm": 3.484375, + "learning_rate": 1.2862943275453301e-05, + "loss": 2.0789, + "step": 44240 + }, + { + "epoch": 0.8155218160377359, + "grad_norm": 3.421875, + "learning_rate": 1.285739648894081e-05, + "loss": 2.0941, + "step": 44260 + }, + { + "epoch": 0.8158903301886793, + "grad_norm": 3.609375, + "learning_rate": 1.2851848744972308e-05, + "loss": 2.0851, + "step": 44280 + }, + { + "epoch": 0.8162588443396226, + "grad_norm": 3.09375, + "learning_rate": 1.2846300045406744e-05, + "loss": 2.1026, + "step": 44300 + }, + { + "epoch": 0.816627358490566, + "grad_norm": 3.703125, + "learning_rate": 1.2840750392103363e-05, + "loss": 2.0944, + "step": 44320 + }, + { + "epoch": 0.8169958726415094, + "grad_norm": 3.53125, + "learning_rate": 1.2835199786921752e-05, + "loss": 2.086, + "step": 44340 + }, + { + "epoch": 0.8173643867924528, + "grad_norm": 3.53125, + "learning_rate": 1.28296482317218e-05, + "loss": 2.0992, + "step": 44360 + }, + { + "epoch": 0.8177329009433962, + "grad_norm": 3.3125, + "learning_rate": 1.2824095728363727e-05, + "loss": 2.0837, + "step": 44380 + }, + { + "epoch": 0.8181014150943396, + "grad_norm": 3.328125, + "learning_rate": 1.2818542278708059e-05, + "loss": 2.0892, + "step": 44400 + }, + { + "epoch": 0.8184699292452831, + "grad_norm": 3.4375, + "learning_rate": 1.2812987884615654e-05, + "loss": 2.0656, + "step": 44420 + }, + { + "epoch": 0.8188384433962265, + "grad_norm": 3.515625, + "learning_rate": 1.2807432547947666e-05, + "loss": 2.0975, + "step": 44440 + }, + { + "epoch": 0.8192069575471698, + "grad_norm": 3.578125, + "learning_rate": 1.2801876270565587e-05, + "loss": 2.0869, + "step": 44460 + }, + { + "epoch": 0.8195754716981132, + "grad_norm": 3.46875, + "learning_rate": 1.279631905433121e-05, + "loss": 2.1011, + "step": 44480 + }, + { + "epoch": 0.8199439858490566, + "grad_norm": 3.40625, + "learning_rate": 1.2790760901106643e-05, + "loss": 2.1214, + "step": 44500 + }, + { + "epoch": 0.8203125, + "grad_norm": 3.609375, + "learning_rate": 1.2785201812754315e-05, + "loss": 2.0635, + "step": 44520 + }, + { + "epoch": 0.8206810141509434, + "grad_norm": 3.90625, + "learning_rate": 1.277964179113696e-05, + "loss": 2.126, + "step": 44540 + }, + { + "epoch": 0.8210495283018868, + "grad_norm": 3.328125, + "learning_rate": 1.2774080838117638e-05, + "loss": 2.0748, + "step": 44560 + }, + { + "epoch": 0.8214180424528302, + "grad_norm": 3.3125, + "learning_rate": 1.2768518955559698e-05, + "loss": 2.0776, + "step": 44580 + }, + { + "epoch": 0.8217865566037735, + "grad_norm": 3.234375, + "learning_rate": 1.276295614532683e-05, + "loss": 2.0908, + "step": 44600 + }, + { + "epoch": 0.8221550707547169, + "grad_norm": 3.546875, + "learning_rate": 1.2757392409283012e-05, + "loss": 2.058, + "step": 44620 + }, + { + "epoch": 0.8225235849056604, + "grad_norm": 3.5, + "learning_rate": 1.275182774929254e-05, + "loss": 2.083, + "step": 44640 + }, + { + "epoch": 0.8228920990566038, + "grad_norm": 3.90625, + "learning_rate": 1.2746262167220023e-05, + "loss": 2.0858, + "step": 44660 + }, + { + "epoch": 0.8232606132075472, + "grad_norm": 3.59375, + "learning_rate": 1.2740695664930368e-05, + "loss": 2.0732, + "step": 44680 + }, + { + "epoch": 0.8236291273584906, + "grad_norm": 3.0625, + "learning_rate": 1.2735128244288808e-05, + "loss": 2.0663, + "step": 44700 + }, + { + "epoch": 0.823997641509434, + "grad_norm": 3.65625, + "learning_rate": 1.2729559907160867e-05, + "loss": 2.0469, + "step": 44720 + }, + { + "epoch": 0.8243661556603774, + "grad_norm": 3.6875, + "learning_rate": 1.2723990655412383e-05, + "loss": 2.1048, + "step": 44740 + }, + { + "epoch": 0.8247346698113207, + "grad_norm": 3.046875, + "learning_rate": 1.27184204909095e-05, + "loss": 2.1214, + "step": 44760 + }, + { + "epoch": 0.8251031839622641, + "grad_norm": 3.609375, + "learning_rate": 1.2712849415518674e-05, + "loss": 2.1122, + "step": 44780 + }, + { + "epoch": 0.8254716981132075, + "grad_norm": 3.421875, + "learning_rate": 1.2707277431106648e-05, + "loss": 2.1357, + "step": 44800 + }, + { + "epoch": 0.8258402122641509, + "grad_norm": 3.640625, + "learning_rate": 1.2701704539540492e-05, + "loss": 2.095, + "step": 44820 + }, + { + "epoch": 0.8262087264150944, + "grad_norm": 3.546875, + "learning_rate": 1.2696130742687567e-05, + "loss": 2.1051, + "step": 44840 + }, + { + "epoch": 0.8265772405660378, + "grad_norm": 3.734375, + "learning_rate": 1.2690556042415539e-05, + "loss": 2.0719, + "step": 44860 + }, + { + "epoch": 0.8269457547169812, + "grad_norm": 3.484375, + "learning_rate": 1.2684980440592378e-05, + "loss": 2.0638, + "step": 44880 + }, + { + "epoch": 0.8273142688679245, + "grad_norm": 3.84375, + "learning_rate": 1.2679403939086354e-05, + "loss": 2.1086, + "step": 44900 + }, + { + "epoch": 0.8276827830188679, + "grad_norm": 3.34375, + "learning_rate": 1.2673826539766045e-05, + "loss": 2.0985, + "step": 44920 + }, + { + "epoch": 0.8280512971698113, + "grad_norm": 3.671875, + "learning_rate": 1.2668248244500322e-05, + "loss": 2.1123, + "step": 44940 + }, + { + "epoch": 0.8284198113207547, + "grad_norm": 4.28125, + "learning_rate": 1.266266905515836e-05, + "loss": 2.0812, + "step": 44960 + }, + { + "epoch": 0.8287883254716981, + "grad_norm": 3.34375, + "learning_rate": 1.2657088973609634e-05, + "loss": 2.112, + "step": 44980 + }, + { + "epoch": 0.8291568396226415, + "grad_norm": 3.6875, + "learning_rate": 1.2651508001723916e-05, + "loss": 2.123, + "step": 45000 + }, + { + "epoch": 0.8295253537735849, + "grad_norm": 4.0, + "learning_rate": 1.2645926141371276e-05, + "loss": 2.0791, + "step": 45020 + }, + { + "epoch": 0.8298938679245284, + "grad_norm": 3.21875, + "learning_rate": 1.2640343394422084e-05, + "loss": 2.0585, + "step": 45040 + }, + { + "epoch": 0.8302623820754716, + "grad_norm": 3.171875, + "learning_rate": 1.2634759762747007e-05, + "loss": 2.0989, + "step": 45060 + }, + { + "epoch": 0.8306308962264151, + "grad_norm": 3.734375, + "learning_rate": 1.2629175248217006e-05, + "loss": 2.1006, + "step": 45080 + }, + { + "epoch": 0.8309994103773585, + "grad_norm": 3.5625, + "learning_rate": 1.262358985270334e-05, + "loss": 2.1098, + "step": 45100 + }, + { + "epoch": 0.8313679245283019, + "grad_norm": 3.515625, + "learning_rate": 1.2618003578077562e-05, + "loss": 2.0465, + "step": 45120 + }, + { + "epoch": 0.8317364386792453, + "grad_norm": 3.265625, + "learning_rate": 1.2612416426211521e-05, + "loss": 2.0996, + "step": 45140 + }, + { + "epoch": 0.8321049528301887, + "grad_norm": 3.65625, + "learning_rate": 1.260682839897736e-05, + "loss": 2.1105, + "step": 45160 + }, + { + "epoch": 0.8324734669811321, + "grad_norm": 3.546875, + "learning_rate": 1.2601239498247511e-05, + "loss": 2.094, + "step": 45180 + }, + { + "epoch": 0.8328419811320755, + "grad_norm": 3.203125, + "learning_rate": 1.2595649725894706e-05, + "loss": 2.0642, + "step": 45200 + }, + { + "epoch": 0.8332104952830188, + "grad_norm": 3.515625, + "learning_rate": 1.2590059083791961e-05, + "loss": 2.0927, + "step": 45220 + }, + { + "epoch": 0.8335790094339622, + "grad_norm": 3.65625, + "learning_rate": 1.2584467573812591e-05, + "loss": 2.0953, + "step": 45240 + }, + { + "epoch": 0.8339475235849056, + "grad_norm": 3.28125, + "learning_rate": 1.2578875197830195e-05, + "loss": 2.1321, + "step": 45260 + }, + { + "epoch": 0.8343160377358491, + "grad_norm": 3.875, + "learning_rate": 1.2573281957718671e-05, + "loss": 2.0757, + "step": 45280 + }, + { + "epoch": 0.8346845518867925, + "grad_norm": 3.625, + "learning_rate": 1.2567687855352198e-05, + "loss": 2.0963, + "step": 45300 + }, + { + "epoch": 0.8350530660377359, + "grad_norm": 3.484375, + "learning_rate": 1.2562092892605244e-05, + "loss": 2.0919, + "step": 45320 + }, + { + "epoch": 0.8354215801886793, + "grad_norm": 3.921875, + "learning_rate": 1.255649707135257e-05, + "loss": 2.0942, + "step": 45340 + }, + { + "epoch": 0.8357900943396226, + "grad_norm": 3.734375, + "learning_rate": 1.2550900393469227e-05, + "loss": 2.1035, + "step": 45360 + }, + { + "epoch": 0.836158608490566, + "grad_norm": 3.25, + "learning_rate": 1.2545302860830545e-05, + "loss": 2.0789, + "step": 45380 + }, + { + "epoch": 0.8365271226415094, + "grad_norm": 3.640625, + "learning_rate": 1.253970447531215e-05, + "loss": 2.1036, + "step": 45400 + }, + { + "epoch": 0.8368956367924528, + "grad_norm": 3.671875, + "learning_rate": 1.2534105238789938e-05, + "loss": 2.1038, + "step": 45420 + }, + { + "epoch": 0.8372641509433962, + "grad_norm": 3.609375, + "learning_rate": 1.2528505153140107e-05, + "loss": 2.0894, + "step": 45440 + }, + { + "epoch": 0.8376326650943396, + "grad_norm": 3.515625, + "learning_rate": 1.2522904220239133e-05, + "loss": 2.0581, + "step": 45460 + }, + { + "epoch": 0.8380011792452831, + "grad_norm": 3.640625, + "learning_rate": 1.2517302441963774e-05, + "loss": 2.0894, + "step": 45480 + }, + { + "epoch": 0.8383696933962265, + "grad_norm": 3.59375, + "learning_rate": 1.2511699820191074e-05, + "loss": 2.07, + "step": 45500 + }, + { + "epoch": 0.8387382075471698, + "grad_norm": 3.40625, + "learning_rate": 1.2506096356798361e-05, + "loss": 2.0902, + "step": 45520 + }, + { + "epoch": 0.8391067216981132, + "grad_norm": 4.0, + "learning_rate": 1.2500492053663239e-05, + "loss": 2.1315, + "step": 45540 + }, + { + "epoch": 0.8394752358490566, + "grad_norm": 3.203125, + "learning_rate": 1.2494886912663592e-05, + "loss": 2.0948, + "step": 45560 + }, + { + "epoch": 0.83984375, + "grad_norm": 5.71875, + "learning_rate": 1.2489280935677606e-05, + "loss": 2.0732, + "step": 45580 + }, + { + "epoch": 0.8402122641509434, + "grad_norm": 3.65625, + "learning_rate": 1.2483674124583716e-05, + "loss": 2.0729, + "step": 45600 + }, + { + "epoch": 0.8405807783018868, + "grad_norm": 3.515625, + "learning_rate": 1.2478066481260658e-05, + "loss": 2.0901, + "step": 45620 + }, + { + "epoch": 0.8409492924528302, + "grad_norm": 3.53125, + "learning_rate": 1.2472458007587436e-05, + "loss": 2.054, + "step": 45640 + }, + { + "epoch": 0.8413178066037735, + "grad_norm": 3.265625, + "learning_rate": 1.2466848705443344e-05, + "loss": 2.1036, + "step": 45660 + }, + { + "epoch": 0.8416863207547169, + "grad_norm": 3.59375, + "learning_rate": 1.246123857670794e-05, + "loss": 2.0909, + "step": 45680 + }, + { + "epoch": 0.8420548349056604, + "grad_norm": 3.296875, + "learning_rate": 1.2455627623261066e-05, + "loss": 2.0639, + "step": 45700 + }, + { + "epoch": 0.8424233490566038, + "grad_norm": 3.453125, + "learning_rate": 1.245001584698284e-05, + "loss": 2.1087, + "step": 45720 + }, + { + "epoch": 0.8427918632075472, + "grad_norm": 3.734375, + "learning_rate": 1.2444403249753659e-05, + "loss": 2.0685, + "step": 45740 + }, + { + "epoch": 0.8431603773584906, + "grad_norm": 3.296875, + "learning_rate": 1.2438789833454184e-05, + "loss": 2.0691, + "step": 45760 + }, + { + "epoch": 0.843528891509434, + "grad_norm": 4.03125, + "learning_rate": 1.2433175599965359e-05, + "loss": 2.1046, + "step": 45780 + }, + { + "epoch": 0.8438974056603774, + "grad_norm": 3.640625, + "learning_rate": 1.242756055116841e-05, + "loss": 2.1264, + "step": 45800 + }, + { + "epoch": 0.8442659198113207, + "grad_norm": 3.4375, + "learning_rate": 1.2421944688944814e-05, + "loss": 2.0904, + "step": 45820 + }, + { + "epoch": 0.8446344339622641, + "grad_norm": 3.40625, + "learning_rate": 1.2416328015176342e-05, + "loss": 2.108, + "step": 45840 + }, + { + "epoch": 0.8450029481132075, + "grad_norm": 3.78125, + "learning_rate": 1.2410710531745023e-05, + "loss": 2.109, + "step": 45860 + }, + { + "epoch": 0.8453714622641509, + "grad_norm": 3.21875, + "learning_rate": 1.2405092240533168e-05, + "loss": 2.1021, + "step": 45880 + }, + { + "epoch": 0.8457399764150944, + "grad_norm": 4.0, + "learning_rate": 1.2399473143423346e-05, + "loss": 2.0902, + "step": 45900 + }, + { + "epoch": 0.8461084905660378, + "grad_norm": 3.78125, + "learning_rate": 1.2393853242298413e-05, + "loss": 2.1032, + "step": 45920 + }, + { + "epoch": 0.8464770047169812, + "grad_norm": 3.546875, + "learning_rate": 1.2388232539041473e-05, + "loss": 2.0596, + "step": 45940 + }, + { + "epoch": 0.8468455188679245, + "grad_norm": 3.28125, + "learning_rate": 1.2382611035535921e-05, + "loss": 2.0816, + "step": 45960 + }, + { + "epoch": 0.8472140330188679, + "grad_norm": 3.84375, + "learning_rate": 1.2376988733665403e-05, + "loss": 2.1004, + "step": 45980 + }, + { + "epoch": 0.8475825471698113, + "grad_norm": 3.53125, + "learning_rate": 1.2371365635313839e-05, + "loss": 2.0398, + "step": 46000 + }, + { + "epoch": 0.8479510613207547, + "grad_norm": 3.46875, + "learning_rate": 1.2365741742365423e-05, + "loss": 2.0892, + "step": 46020 + }, + { + "epoch": 0.8483195754716981, + "grad_norm": 4.125, + "learning_rate": 1.2360117056704597e-05, + "loss": 2.0939, + "step": 46040 + }, + { + "epoch": 0.8486880896226415, + "grad_norm": 3.296875, + "learning_rate": 1.2354491580216092e-05, + "loss": 2.063, + "step": 46060 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 3.421875, + "learning_rate": 1.2348865314784881e-05, + "loss": 2.0694, + "step": 46080 + }, + { + "epoch": 0.8494251179245284, + "grad_norm": 3.71875, + "learning_rate": 1.2343238262296222e-05, + "loss": 2.0937, + "step": 46100 + }, + { + "epoch": 0.8497936320754716, + "grad_norm": 3.40625, + "learning_rate": 1.2337610424635617e-05, + "loss": 2.118, + "step": 46120 + }, + { + "epoch": 0.8501621462264151, + "grad_norm": 3.21875, + "learning_rate": 1.2331981803688853e-05, + "loss": 2.0822, + "step": 46140 + }, + { + "epoch": 0.8505306603773585, + "grad_norm": 3.734375, + "learning_rate": 1.2326352401341954e-05, + "loss": 2.0677, + "step": 46160 + }, + { + "epoch": 0.8508991745283019, + "grad_norm": 3.546875, + "learning_rate": 1.2320722219481231e-05, + "loss": 2.0968, + "step": 46180 + }, + { + "epoch": 0.8512676886792453, + "grad_norm": 3.90625, + "learning_rate": 1.2315091259993237e-05, + "loss": 2.0992, + "step": 46200 + }, + { + "epoch": 0.8516362028301887, + "grad_norm": 3.421875, + "learning_rate": 1.2309459524764798e-05, + "loss": 2.0887, + "step": 46220 + }, + { + "epoch": 0.8520047169811321, + "grad_norm": 3.953125, + "learning_rate": 1.2303827015682992e-05, + "loss": 2.102, + "step": 46240 + }, + { + "epoch": 0.8523732311320755, + "grad_norm": 3.390625, + "learning_rate": 1.2298193734635158e-05, + "loss": 2.1055, + "step": 46260 + }, + { + "epoch": 0.8527417452830188, + "grad_norm": 4.0625, + "learning_rate": 1.22925596835089e-05, + "loss": 2.0694, + "step": 46280 + }, + { + "epoch": 0.8531102594339622, + "grad_norm": 3.40625, + "learning_rate": 1.228692486419207e-05, + "loss": 2.1021, + "step": 46300 + }, + { + "epoch": 0.8534787735849056, + "grad_norm": 3.671875, + "learning_rate": 1.2281289278572786e-05, + "loss": 2.0664, + "step": 46320 + }, + { + "epoch": 0.8538472877358491, + "grad_norm": 3.34375, + "learning_rate": 1.2275652928539415e-05, + "loss": 2.0728, + "step": 46340 + }, + { + "epoch": 0.8542158018867925, + "grad_norm": 3.328125, + "learning_rate": 1.2270015815980591e-05, + "loss": 2.0508, + "step": 46360 + }, + { + "epoch": 0.8545843160377359, + "grad_norm": 3.21875, + "learning_rate": 1.2264377942785188e-05, + "loss": 2.0972, + "step": 46380 + }, + { + "epoch": 0.8549528301886793, + "grad_norm": 3.375, + "learning_rate": 1.2258739310842354e-05, + "loss": 2.0938, + "step": 46400 + }, + { + "epoch": 0.8553213443396226, + "grad_norm": 3.421875, + "learning_rate": 1.225309992204147e-05, + "loss": 2.0869, + "step": 46420 + }, + { + "epoch": 0.855689858490566, + "grad_norm": 3.703125, + "learning_rate": 1.2247459778272192e-05, + "loss": 2.1025, + "step": 46440 + }, + { + "epoch": 0.8560583726415094, + "grad_norm": 3.328125, + "learning_rate": 1.2241818881424414e-05, + "loss": 2.0439, + "step": 46460 + }, + { + "epoch": 0.8564268867924528, + "grad_norm": 3.34375, + "learning_rate": 1.2236177233388287e-05, + "loss": 2.0916, + "step": 46480 + }, + { + "epoch": 0.8567954009433962, + "grad_norm": 3.578125, + "learning_rate": 1.2230534836054212e-05, + "loss": 2.0989, + "step": 46500 + }, + { + "epoch": 0.8571639150943396, + "grad_norm": 3.5625, + "learning_rate": 1.2224891691312844e-05, + "loss": 2.0896, + "step": 46520 + }, + { + "epoch": 0.8575324292452831, + "grad_norm": 3.65625, + "learning_rate": 1.2219247801055088e-05, + "loss": 2.0734, + "step": 46540 + }, + { + "epoch": 0.8579009433962265, + "grad_norm": 3.578125, + "learning_rate": 1.2213603167172094e-05, + "loss": 2.0917, + "step": 46560 + }, + { + "epoch": 0.8582694575471698, + "grad_norm": 3.140625, + "learning_rate": 1.2207957791555274e-05, + "loss": 2.0836, + "step": 46580 + }, + { + "epoch": 0.8586379716981132, + "grad_norm": 3.78125, + "learning_rate": 1.2202311676096267e-05, + "loss": 2.0915, + "step": 46600 + }, + { + "epoch": 0.8590064858490566, + "grad_norm": 3.578125, + "learning_rate": 1.2196664822686985e-05, + "loss": 2.0994, + "step": 46620 + }, + { + "epoch": 0.859375, + "grad_norm": 3.234375, + "learning_rate": 1.2191017233219569e-05, + "loss": 2.0955, + "step": 46640 + }, + { + "epoch": 0.8597435141509434, + "grad_norm": 3.46875, + "learning_rate": 1.2185368909586411e-05, + "loss": 2.0847, + "step": 46660 + }, + { + "epoch": 0.8601120283018868, + "grad_norm": 3.5625, + "learning_rate": 1.2179719853680152e-05, + "loss": 2.0765, + "step": 46680 + }, + { + "epoch": 0.8604805424528302, + "grad_norm": 3.203125, + "learning_rate": 1.2174070067393677e-05, + "loss": 2.0842, + "step": 46700 + }, + { + "epoch": 0.8608490566037735, + "grad_norm": 4.0625, + "learning_rate": 1.2168419552620116e-05, + "loss": 2.0784, + "step": 46720 + }, + { + "epoch": 0.8612175707547169, + "grad_norm": 3.203125, + "learning_rate": 1.2162768311252843e-05, + "loss": 2.0988, + "step": 46740 + }, + { + "epoch": 0.8615860849056604, + "grad_norm": 3.3125, + "learning_rate": 1.2157116345185475e-05, + "loss": 2.1334, + "step": 46760 + }, + { + "epoch": 0.8619545990566038, + "grad_norm": 4.03125, + "learning_rate": 1.215146365631187e-05, + "loss": 2.0927, + "step": 46780 + }, + { + "epoch": 0.8623231132075472, + "grad_norm": 3.859375, + "learning_rate": 1.2145810246526133e-05, + "loss": 2.0939, + "step": 46800 + }, + { + "epoch": 0.8626916273584906, + "grad_norm": 4.15625, + "learning_rate": 1.2140156117722607e-05, + "loss": 2.0799, + "step": 46820 + }, + { + "epoch": 0.863060141509434, + "grad_norm": 3.359375, + "learning_rate": 1.2134501271795873e-05, + "loss": 2.0955, + "step": 46840 + }, + { + "epoch": 0.8634286556603774, + "grad_norm": 3.375, + "learning_rate": 1.2128845710640762e-05, + "loss": 2.1047, + "step": 46860 + }, + { + "epoch": 0.8637971698113207, + "grad_norm": 3.703125, + "learning_rate": 1.2123189436152334e-05, + "loss": 2.0664, + "step": 46880 + }, + { + "epoch": 0.8641656839622641, + "grad_norm": 3.96875, + "learning_rate": 1.2117532450225897e-05, + "loss": 2.0963, + "step": 46900 + }, + { + "epoch": 0.8645341981132075, + "grad_norm": 3.4375, + "learning_rate": 1.2111874754756987e-05, + "loss": 2.0484, + "step": 46920 + }, + { + "epoch": 0.8649027122641509, + "grad_norm": 3.328125, + "learning_rate": 1.210621635164139e-05, + "loss": 2.0713, + "step": 46940 + }, + { + "epoch": 0.8652712264150944, + "grad_norm": 3.609375, + "learning_rate": 1.2100557242775118e-05, + "loss": 2.1012, + "step": 46960 + }, + { + "epoch": 0.8656397405660378, + "grad_norm": 3.46875, + "learning_rate": 1.2094897430054426e-05, + "loss": 2.0863, + "step": 46980 + }, + { + "epoch": 0.8660082547169812, + "grad_norm": 3.96875, + "learning_rate": 1.2089236915375805e-05, + "loss": 2.0976, + "step": 47000 + }, + { + "epoch": 0.8663767688679245, + "grad_norm": 3.484375, + "learning_rate": 1.2083575700635976e-05, + "loss": 2.0411, + "step": 47020 + }, + { + "epoch": 0.8667452830188679, + "grad_norm": 3.453125, + "learning_rate": 1.2077913787731904e-05, + "loss": 2.0856, + "step": 47040 + }, + { + "epoch": 0.8671137971698113, + "grad_norm": 3.484375, + "learning_rate": 1.2072251178560777e-05, + "loss": 2.1243, + "step": 47060 + }, + { + "epoch": 0.8674823113207547, + "grad_norm": 3.625, + "learning_rate": 1.2066587875020019e-05, + "loss": 2.1039, + "step": 47080 + }, + { + "epoch": 0.8678508254716981, + "grad_norm": 3.4375, + "learning_rate": 1.2060923879007295e-05, + "loss": 2.0781, + "step": 47100 + }, + { + "epoch": 0.8682193396226415, + "grad_norm": 3.875, + "learning_rate": 1.2055259192420492e-05, + "loss": 2.0591, + "step": 47120 + }, + { + "epoch": 0.8685878537735849, + "grad_norm": 3.78125, + "learning_rate": 1.2049593817157734e-05, + "loss": 2.1065, + "step": 47140 + }, + { + "epoch": 0.8689563679245284, + "grad_norm": 3.515625, + "learning_rate": 1.2043927755117374e-05, + "loss": 2.112, + "step": 47160 + }, + { + "epoch": 0.8693248820754716, + "grad_norm": 3.578125, + "learning_rate": 1.2038261008197996e-05, + "loss": 2.0932, + "step": 47180 + }, + { + "epoch": 0.8696933962264151, + "grad_norm": 3.921875, + "learning_rate": 1.203259357829841e-05, + "loss": 2.0653, + "step": 47200 + }, + { + "epoch": 0.8700619103773585, + "grad_norm": 4.0, + "learning_rate": 1.2026925467317662e-05, + "loss": 2.0944, + "step": 47220 + }, + { + "epoch": 0.8704304245283019, + "grad_norm": 3.203125, + "learning_rate": 1.2021256677155017e-05, + "loss": 2.0835, + "step": 47240 + }, + { + "epoch": 0.8707989386792453, + "grad_norm": 3.6875, + "learning_rate": 1.2015587209709976e-05, + "loss": 2.0894, + "step": 47260 + }, + { + "epoch": 0.8711674528301887, + "grad_norm": 3.84375, + "learning_rate": 1.200991706688226e-05, + "loss": 2.0649, + "step": 47280 + }, + { + "epoch": 0.8715359669811321, + "grad_norm": 3.609375, + "learning_rate": 1.2004246250571823e-05, + "loss": 2.0844, + "step": 47300 + }, + { + "epoch": 0.8719044811320755, + "grad_norm": 3.859375, + "learning_rate": 1.1998574762678838e-05, + "loss": 2.079, + "step": 47320 + }, + { + "epoch": 0.8722729952830188, + "grad_norm": 3.5, + "learning_rate": 1.199290260510371e-05, + "loss": 2.0992, + "step": 47340 + }, + { + "epoch": 0.8726415094339622, + "grad_norm": 3.921875, + "learning_rate": 1.1987229779747061e-05, + "loss": 2.1176, + "step": 47360 + }, + { + "epoch": 0.8730100235849056, + "grad_norm": 3.6875, + "learning_rate": 1.1981556288509746e-05, + "loss": 2.0564, + "step": 47380 + }, + { + "epoch": 0.8733785377358491, + "grad_norm": 3.5625, + "learning_rate": 1.197588213329283e-05, + "loss": 2.0785, + "step": 47400 + }, + { + "epoch": 0.8737470518867925, + "grad_norm": 3.625, + "learning_rate": 1.1970207315997616e-05, + "loss": 2.0641, + "step": 47420 + }, + { + "epoch": 0.8741155660377359, + "grad_norm": 3.3125, + "learning_rate": 1.1964531838525618e-05, + "loss": 2.0346, + "step": 47440 + }, + { + "epoch": 0.8744840801886793, + "grad_norm": 3.4375, + "learning_rate": 1.195885570277857e-05, + "loss": 2.089, + "step": 47460 + }, + { + "epoch": 0.8748525943396226, + "grad_norm": 3.640625, + "learning_rate": 1.1953178910658434e-05, + "loss": 2.0708, + "step": 47480 + }, + { + "epoch": 0.875221108490566, + "grad_norm": 3.671875, + "learning_rate": 1.194750146406739e-05, + "loss": 2.1006, + "step": 47500 + }, + { + "epoch": 0.8755896226415094, + "grad_norm": 3.5625, + "learning_rate": 1.1941823364907834e-05, + "loss": 2.1384, + "step": 47520 + }, + { + "epoch": 0.8759581367924528, + "grad_norm": 3.671875, + "learning_rate": 1.1936144615082386e-05, + "loss": 2.0861, + "step": 47540 + }, + { + "epoch": 0.8763266509433962, + "grad_norm": 3.8125, + "learning_rate": 1.1930465216493876e-05, + "loss": 2.0892, + "step": 47560 + }, + { + "epoch": 0.8766951650943396, + "grad_norm": 3.359375, + "learning_rate": 1.1924785171045356e-05, + "loss": 2.0679, + "step": 47580 + }, + { + "epoch": 0.8770636792452831, + "grad_norm": 3.484375, + "learning_rate": 1.19191044806401e-05, + "loss": 2.0788, + "step": 47600 + }, + { + "epoch": 0.8774321933962265, + "grad_norm": 3.25, + "learning_rate": 1.1913423147181588e-05, + "loss": 2.0898, + "step": 47620 + }, + { + "epoch": 0.8778007075471698, + "grad_norm": 3.609375, + "learning_rate": 1.1907741172573525e-05, + "loss": 2.0817, + "step": 47640 + }, + { + "epoch": 0.8781692216981132, + "grad_norm": 3.34375, + "learning_rate": 1.190205855871982e-05, + "loss": 2.0582, + "step": 47660 + }, + { + "epoch": 0.8785377358490566, + "grad_norm": 3.5, + "learning_rate": 1.189637530752461e-05, + "loss": 2.0596, + "step": 47680 + }, + { + "epoch": 0.87890625, + "grad_norm": 3.578125, + "learning_rate": 1.1890691420892232e-05, + "loss": 2.0721, + "step": 47700 + }, + { + "epoch": 0.8792747641509434, + "grad_norm": 3.671875, + "learning_rate": 1.1885006900727245e-05, + "loss": 2.0976, + "step": 47720 + }, + { + "epoch": 0.8796432783018868, + "grad_norm": 3.265625, + "learning_rate": 1.1879321748934417e-05, + "loss": 2.0751, + "step": 47740 + }, + { + "epoch": 0.8800117924528302, + "grad_norm": 3.453125, + "learning_rate": 1.1873635967418725e-05, + "loss": 2.1256, + "step": 47760 + }, + { + "epoch": 0.8803803066037735, + "grad_norm": 3.765625, + "learning_rate": 1.1867949558085361e-05, + "loss": 2.0828, + "step": 47780 + }, + { + "epoch": 0.8807488207547169, + "grad_norm": 3.359375, + "learning_rate": 1.1862262522839724e-05, + "loss": 2.0425, + "step": 47800 + }, + { + "epoch": 0.8811173349056604, + "grad_norm": 3.46875, + "learning_rate": 1.1856574863587435e-05, + "loss": 2.082, + "step": 47820 + }, + { + "epoch": 0.8814858490566038, + "grad_norm": 3.28125, + "learning_rate": 1.1850886582234302e-05, + "loss": 2.0578, + "step": 47840 + }, + { + "epoch": 0.8818543632075472, + "grad_norm": 3.890625, + "learning_rate": 1.1845197680686362e-05, + "loss": 2.0976, + "step": 47860 + }, + { + "epoch": 0.8822228773584906, + "grad_norm": 3.84375, + "learning_rate": 1.1839508160849845e-05, + "loss": 2.0823, + "step": 47880 + }, + { + "epoch": 0.882591391509434, + "grad_norm": 3.8125, + "learning_rate": 1.18338180246312e-05, + "loss": 2.1151, + "step": 47900 + }, + { + "epoch": 0.8829599056603774, + "grad_norm": 3.5625, + "learning_rate": 1.182812727393707e-05, + "loss": 2.137, + "step": 47920 + }, + { + "epoch": 0.8833284198113207, + "grad_norm": 3.703125, + "learning_rate": 1.1822435910674322e-05, + "loss": 2.0991, + "step": 47940 + }, + { + "epoch": 0.8836969339622641, + "grad_norm": 3.453125, + "learning_rate": 1.1816743936750005e-05, + "loss": 2.1074, + "step": 47960 + }, + { + "epoch": 0.8840654481132075, + "grad_norm": 3.859375, + "learning_rate": 1.1811051354071392e-05, + "loss": 2.1239, + "step": 47980 + }, + { + "epoch": 0.8844339622641509, + "grad_norm": 3.421875, + "learning_rate": 1.180535816454595e-05, + "loss": 2.0832, + "step": 48000 + }, + { + "epoch": 0.8848024764150944, + "grad_norm": 3.71875, + "learning_rate": 1.1799664370081356e-05, + "loss": 2.0969, + "step": 48020 + }, + { + "epoch": 0.8851709905660378, + "grad_norm": 3.234375, + "learning_rate": 1.1793969972585484e-05, + "loss": 2.0816, + "step": 48040 + }, + { + "epoch": 0.8855395047169812, + "grad_norm": 3.265625, + "learning_rate": 1.1788274973966405e-05, + "loss": 2.0754, + "step": 48060 + }, + { + "epoch": 0.8859080188679245, + "grad_norm": 3.484375, + "learning_rate": 1.1782579376132411e-05, + "loss": 2.119, + "step": 48080 + }, + { + "epoch": 0.8862765330188679, + "grad_norm": 3.4375, + "learning_rate": 1.177688318099197e-05, + "loss": 2.0759, + "step": 48100 + }, + { + "epoch": 0.8866450471698113, + "grad_norm": 3.703125, + "learning_rate": 1.177118639045377e-05, + "loss": 2.1019, + "step": 48120 + }, + { + "epoch": 0.8870135613207547, + "grad_norm": 3.609375, + "learning_rate": 1.1765489006426687e-05, + "loss": 2.1048, + "step": 48140 + }, + { + "epoch": 0.8873820754716981, + "grad_norm": 3.484375, + "learning_rate": 1.1759791030819803e-05, + "loss": 2.0767, + "step": 48160 + }, + { + "epoch": 0.8877505896226415, + "grad_norm": 3.5625, + "learning_rate": 1.175409246554239e-05, + "loss": 2.0854, + "step": 48180 + }, + { + "epoch": 0.8881191037735849, + "grad_norm": 3.265625, + "learning_rate": 1.1748393312503927e-05, + "loss": 2.0574, + "step": 48200 + }, + { + "epoch": 0.8884876179245284, + "grad_norm": 4.09375, + "learning_rate": 1.1742693573614084e-05, + "loss": 2.1037, + "step": 48220 + }, + { + "epoch": 0.8888561320754716, + "grad_norm": 3.609375, + "learning_rate": 1.1736993250782729e-05, + "loss": 2.0847, + "step": 48240 + }, + { + "epoch": 0.8892246462264151, + "grad_norm": 3.4375, + "learning_rate": 1.1731292345919923e-05, + "loss": 2.0909, + "step": 48260 + }, + { + "epoch": 0.8895931603773585, + "grad_norm": 3.734375, + "learning_rate": 1.1725590860935927e-05, + "loss": 2.1199, + "step": 48280 + }, + { + "epoch": 0.8899616745283019, + "grad_norm": 3.703125, + "learning_rate": 1.1719888797741193e-05, + "loss": 2.0595, + "step": 48300 + }, + { + "epoch": 0.8903301886792453, + "grad_norm": 3.421875, + "learning_rate": 1.1714186158246365e-05, + "loss": 2.1344, + "step": 48320 + }, + { + "epoch": 0.8906987028301887, + "grad_norm": 3.28125, + "learning_rate": 1.1708482944362292e-05, + "loss": 2.1185, + "step": 48340 + }, + { + "epoch": 0.8910672169811321, + "grad_norm": 3.359375, + "learning_rate": 1.170277915799999e-05, + "loss": 2.1044, + "step": 48360 + }, + { + "epoch": 0.8914357311320755, + "grad_norm": 3.796875, + "learning_rate": 1.1697074801070697e-05, + "loss": 2.0712, + "step": 48380 + }, + { + "epoch": 0.8918042452830188, + "grad_norm": 3.234375, + "learning_rate": 1.1691369875485822e-05, + "loss": 2.0635, + "step": 48400 + }, + { + "epoch": 0.8921727594339622, + "grad_norm": 3.671875, + "learning_rate": 1.1685664383156972e-05, + "loss": 2.0526, + "step": 48420 + }, + { + "epoch": 0.8925412735849056, + "grad_norm": 3.640625, + "learning_rate": 1.167995832599594e-05, + "loss": 2.112, + "step": 48440 + }, + { + "epoch": 0.8929097877358491, + "grad_norm": 3.78125, + "learning_rate": 1.167425170591472e-05, + "loss": 2.0968, + "step": 48460 + }, + { + "epoch": 0.8932783018867925, + "grad_norm": 3.21875, + "learning_rate": 1.1668544524825472e-05, + "loss": 2.0906, + "step": 48480 + }, + { + "epoch": 0.8936468160377359, + "grad_norm": 3.34375, + "learning_rate": 1.1662836784640566e-05, + "loss": 2.0853, + "step": 48500 + }, + { + "epoch": 0.8940153301886793, + "grad_norm": 4.09375, + "learning_rate": 1.165712848727255e-05, + "loss": 2.08, + "step": 48520 + }, + { + "epoch": 0.8943838443396226, + "grad_norm": 3.296875, + "learning_rate": 1.1651419634634156e-05, + "loss": 2.1222, + "step": 48540 + }, + { + "epoch": 0.894752358490566, + "grad_norm": 3.625, + "learning_rate": 1.1645710228638308e-05, + "loss": 2.1328, + "step": 48560 + }, + { + "epoch": 0.8951208726415094, + "grad_norm": 3.71875, + "learning_rate": 1.1640000271198114e-05, + "loss": 2.0506, + "step": 48580 + }, + { + "epoch": 0.8954893867924528, + "grad_norm": 3.359375, + "learning_rate": 1.1634289764226862e-05, + "loss": 2.087, + "step": 48600 + } + ], + "logging_steps": 20, + "max_steps": 108544, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 200, + "total_flos": 4.874097472667438e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}