| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9994972347913524, |
| "eval_steps": 750, |
| "global_step": 1491, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0033517680576504107, |
| "grad_norm": 14.694869995117188, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 53.6406, |
| "mean_token_accuracy": 0.5338318642228842, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.006703536115300821, |
| "grad_norm": 14.033230781555176, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 52.3838, |
| "mean_token_accuracy": 0.5248840853571892, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.010055304172951232, |
| "grad_norm": 6.804769039154053, |
| "learning_rate": 2e-05, |
| "loss": 47.9105, |
| "mean_token_accuracy": 0.5399681400507689, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.013407072230601643, |
| "grad_norm": 7.750083923339844, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 41.8861, |
| "mean_token_accuracy": 0.55653104968369, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.01675884028825205, |
| "grad_norm": 6.184543132781982, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 37.33, |
| "mean_token_accuracy": 0.5655230440199375, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.020110608345902465, |
| "grad_norm": 4.537179946899414, |
| "learning_rate": 4e-05, |
| "loss": 32.7503, |
| "mean_token_accuracy": 0.587661711126566, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.023462376403552875, |
| "grad_norm": 3.6645753383636475, |
| "learning_rate": 4.666666666666667e-05, |
| "loss": 29.1892, |
| "mean_token_accuracy": 0.6075583577156067, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.026814144461203285, |
| "grad_norm": 3.7526533603668213, |
| "learning_rate": 5.333333333333333e-05, |
| "loss": 26.3524, |
| "mean_token_accuracy": 0.6198613092303276, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.030165912518853696, |
| "grad_norm": 3.0561397075653076, |
| "learning_rate": 6e-05, |
| "loss": 24.1513, |
| "mean_token_accuracy": 0.6353930421173573, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.0335176805765041, |
| "grad_norm": 2.857618808746338, |
| "learning_rate": 6.666666666666667e-05, |
| "loss": 23.5029, |
| "mean_token_accuracy": 0.6437373287975788, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03686944863415452, |
| "grad_norm": 2.7901978492736816, |
| "learning_rate": 7.333333333333333e-05, |
| "loss": 22.9387, |
| "mean_token_accuracy": 0.646886795759201, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.04022121669180493, |
| "grad_norm": 2.8266501426696777, |
| "learning_rate": 8e-05, |
| "loss": 22.0359, |
| "mean_token_accuracy": 0.6525138475000858, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.04357298474945534, |
| "grad_norm": 2.5010733604431152, |
| "learning_rate": 8.666666666666667e-05, |
| "loss": 21.5158, |
| "mean_token_accuracy": 0.6548139773309231, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.04692475280710575, |
| "grad_norm": 2.5834386348724365, |
| "learning_rate": 9.333333333333334e-05, |
| "loss": 21.5409, |
| "mean_token_accuracy": 0.6478891499340534, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.05027652086475616, |
| "grad_norm": 2.6927576065063477, |
| "learning_rate": 0.0001, |
| "loss": 20.1017, |
| "mean_token_accuracy": 0.6757474772632122, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.05362828892240657, |
| "grad_norm": 2.0276572704315186, |
| "learning_rate": 9.964689265536724e-05, |
| "loss": 19.9912, |
| "mean_token_accuracy": 0.6763999305665493, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.05698005698005698, |
| "grad_norm": 2.4628567695617676, |
| "learning_rate": 9.929378531073446e-05, |
| "loss": 19.9089, |
| "mean_token_accuracy": 0.672279854118824, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.06033182503770739, |
| "grad_norm": 2.258838415145874, |
| "learning_rate": 9.89406779661017e-05, |
| "loss": 19.7132, |
| "mean_token_accuracy": 0.6713059276342392, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0636835930953578, |
| "grad_norm": 2.447565793991089, |
| "learning_rate": 9.858757062146892e-05, |
| "loss": 18.7631, |
| "mean_token_accuracy": 0.6825208596885204, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.0670353611530082, |
| "grad_norm": 2.1105902194976807, |
| "learning_rate": 9.823446327683616e-05, |
| "loss": 19.4631, |
| "mean_token_accuracy": 0.6674435302615166, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07038712921065862, |
| "grad_norm": 2.309248447418213, |
| "learning_rate": 9.78813559322034e-05, |
| "loss": 19.0249, |
| "mean_token_accuracy": 0.6734571024775505, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.07373889726830904, |
| "grad_norm": 2.101681709289551, |
| "learning_rate": 9.752824858757063e-05, |
| "loss": 18.593, |
| "mean_token_accuracy": 0.6875097192823887, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.07709066532595944, |
| "grad_norm": 2.157726526260376, |
| "learning_rate": 9.717514124293787e-05, |
| "loss": 18.5973, |
| "mean_token_accuracy": 0.6829216606914997, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.08044243338360986, |
| "grad_norm": 2.0711209774017334, |
| "learning_rate": 9.682203389830509e-05, |
| "loss": 19.1541, |
| "mean_token_accuracy": 0.6785640828311443, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.08379420144126026, |
| "grad_norm": 2.015594959259033, |
| "learning_rate": 9.646892655367233e-05, |
| "loss": 18.9493, |
| "mean_token_accuracy": 0.6861244946718216, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.08714596949891068, |
| "grad_norm": 2.1295998096466064, |
| "learning_rate": 9.611581920903955e-05, |
| "loss": 18.5125, |
| "mean_token_accuracy": 0.6793887488543987, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.09049773755656108, |
| "grad_norm": 2.2496395111083984, |
| "learning_rate": 9.576271186440679e-05, |
| "loss": 18.4019, |
| "mean_token_accuracy": 0.6890006221830844, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.0938495056142115, |
| "grad_norm": 2.1168577671051025, |
| "learning_rate": 9.540960451977402e-05, |
| "loss": 18.7305, |
| "mean_token_accuracy": 0.6841622419655323, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0972012736718619, |
| "grad_norm": 1.8554915189743042, |
| "learning_rate": 9.505649717514125e-05, |
| "loss": 18.6606, |
| "mean_token_accuracy": 0.6859239712357521, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.10055304172951232, |
| "grad_norm": 1.9698066711425781, |
| "learning_rate": 9.470338983050848e-05, |
| "loss": 19.1065, |
| "mean_token_accuracy": 0.6759489566087723, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.10390480978716272, |
| "grad_norm": 2.2483623027801514, |
| "learning_rate": 9.43502824858757e-05, |
| "loss": 18.8041, |
| "mean_token_accuracy": 0.68142851293087, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.10725657784481314, |
| "grad_norm": 1.8570690155029297, |
| "learning_rate": 9.399717514124294e-05, |
| "loss": 18.8862, |
| "mean_token_accuracy": 0.6791303649544715, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.11060834590246355, |
| "grad_norm": 2.143021583557129, |
| "learning_rate": 9.364406779661016e-05, |
| "loss": 18.7605, |
| "mean_token_accuracy": 0.681893227249384, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.11396011396011396, |
| "grad_norm": 1.8951307535171509, |
| "learning_rate": 9.32909604519774e-05, |
| "loss": 18.3005, |
| "mean_token_accuracy": 0.6897541806101799, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.11731188201776437, |
| "grad_norm": 1.971745252609253, |
| "learning_rate": 9.293785310734464e-05, |
| "loss": 18.8995, |
| "mean_token_accuracy": 0.6820204116404056, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.12066365007541478, |
| "grad_norm": 1.910328984260559, |
| "learning_rate": 9.258474576271187e-05, |
| "loss": 18.8808, |
| "mean_token_accuracy": 0.6812884464859963, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.12401541813306519, |
| "grad_norm": 1.730974555015564, |
| "learning_rate": 9.223163841807911e-05, |
| "loss": 18.0871, |
| "mean_token_accuracy": 0.6907590143382549, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.1273671861907156, |
| "grad_norm": 2.125452995300293, |
| "learning_rate": 9.187853107344633e-05, |
| "loss": 18.1569, |
| "mean_token_accuracy": 0.689236406236887, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.13071895424836602, |
| "grad_norm": 2.0234949588775635, |
| "learning_rate": 9.152542372881357e-05, |
| "loss": 18.3342, |
| "mean_token_accuracy": 0.6902932204306126, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.1340707223060164, |
| "grad_norm": 1.9802364110946655, |
| "learning_rate": 9.11723163841808e-05, |
| "loss": 18.7942, |
| "mean_token_accuracy": 0.6788501650094986, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.13742249036366683, |
| "grad_norm": 1.8897534608840942, |
| "learning_rate": 9.081920903954803e-05, |
| "loss": 18.4679, |
| "mean_token_accuracy": 0.6900524459779263, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.14077425842131724, |
| "grad_norm": 1.9040635824203491, |
| "learning_rate": 9.046610169491526e-05, |
| "loss": 18.0058, |
| "mean_token_accuracy": 0.690093420445919, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.14412602647896766, |
| "grad_norm": 2.0558955669403076, |
| "learning_rate": 9.011299435028249e-05, |
| "loss": 17.5489, |
| "mean_token_accuracy": 0.7006829999387264, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.14747779453661808, |
| "grad_norm": 1.7952055931091309, |
| "learning_rate": 8.975988700564972e-05, |
| "loss": 18.2907, |
| "mean_token_accuracy": 0.6876891441643238, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.15082956259426847, |
| "grad_norm": 1.8588192462921143, |
| "learning_rate": 8.940677966101694e-05, |
| "loss": 18.4005, |
| "mean_token_accuracy": 0.6897859051823616, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.15418133065191889, |
| "grad_norm": 1.9269477128982544, |
| "learning_rate": 8.905367231638418e-05, |
| "loss": 18.2096, |
| "mean_token_accuracy": 0.6909494370222091, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.1575330987095693, |
| "grad_norm": 1.8693301677703857, |
| "learning_rate": 8.870056497175142e-05, |
| "loss": 18.394, |
| "mean_token_accuracy": 0.6836515329778194, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.16088486676721972, |
| "grad_norm": 1.787061333656311, |
| "learning_rate": 8.834745762711864e-05, |
| "loss": 18.1503, |
| "mean_token_accuracy": 0.6907145738601684, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.1642366348248701, |
| "grad_norm": 1.8895225524902344, |
| "learning_rate": 8.799435028248588e-05, |
| "loss": 18.3026, |
| "mean_token_accuracy": 0.6878940775990486, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.16758840288252053, |
| "grad_norm": 1.835693120956421, |
| "learning_rate": 8.764124293785311e-05, |
| "loss": 17.9347, |
| "mean_token_accuracy": 0.6917316012084485, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.17094017094017094, |
| "grad_norm": 1.7408661842346191, |
| "learning_rate": 8.728813559322035e-05, |
| "loss": 18.0051, |
| "mean_token_accuracy": 0.689583633840084, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.17429193899782136, |
| "grad_norm": 1.9096996784210205, |
| "learning_rate": 8.693502824858759e-05, |
| "loss": 17.6064, |
| "mean_token_accuracy": 0.6965925216674804, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.17764370705547175, |
| "grad_norm": 1.9822146892547607, |
| "learning_rate": 8.658192090395481e-05, |
| "loss": 17.6301, |
| "mean_token_accuracy": 0.7005406267940998, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.18099547511312217, |
| "grad_norm": 1.8383901119232178, |
| "learning_rate": 8.622881355932204e-05, |
| "loss": 17.9114, |
| "mean_token_accuracy": 0.6876685306429863, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.18434724317077258, |
| "grad_norm": 1.7920355796813965, |
| "learning_rate": 8.587570621468927e-05, |
| "loss": 18.1271, |
| "mean_token_accuracy": 0.689356567710638, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.187699011228423, |
| "grad_norm": 1.6455663442611694, |
| "learning_rate": 8.55225988700565e-05, |
| "loss": 17.787, |
| "mean_token_accuracy": 0.6919776491820813, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.1910507792860734, |
| "grad_norm": 1.9442647695541382, |
| "learning_rate": 8.516949152542373e-05, |
| "loss": 17.6019, |
| "mean_token_accuracy": 0.6980393722653389, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.1944025473437238, |
| "grad_norm": 2.294377565383911, |
| "learning_rate": 8.481638418079096e-05, |
| "loss": 17.8778, |
| "mean_token_accuracy": 0.6954585202038288, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.19775431540137423, |
| "grad_norm": 1.8009259700775146, |
| "learning_rate": 8.44632768361582e-05, |
| "loss": 17.5257, |
| "mean_token_accuracy": 0.6998075112700463, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.20110608345902464, |
| "grad_norm": 2.015516757965088, |
| "learning_rate": 8.411016949152542e-05, |
| "loss": 17.7554, |
| "mean_token_accuracy": 0.6968327619135379, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.20445785151667506, |
| "grad_norm": 1.5640082359313965, |
| "learning_rate": 8.375706214689266e-05, |
| "loss": 17.3438, |
| "mean_token_accuracy": 0.69996168166399, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.20780961957432545, |
| "grad_norm": 1.9527899026870728, |
| "learning_rate": 8.340395480225988e-05, |
| "loss": 17.6883, |
| "mean_token_accuracy": 0.6988407798111439, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.21116138763197587, |
| "grad_norm": 1.8222606182098389, |
| "learning_rate": 8.305084745762712e-05, |
| "loss": 17.0646, |
| "mean_token_accuracy": 0.7061679445207119, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.21451315568962628, |
| "grad_norm": 1.8560868501663208, |
| "learning_rate": 8.269774011299435e-05, |
| "loss": 17.8875, |
| "mean_token_accuracy": 0.6941629223525524, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.2178649237472767, |
| "grad_norm": 1.7588037252426147, |
| "learning_rate": 8.234463276836159e-05, |
| "loss": 17.6412, |
| "mean_token_accuracy": 0.6954927705228329, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.2212166918049271, |
| "grad_norm": 1.738242268562317, |
| "learning_rate": 8.199152542372883e-05, |
| "loss": 17.8251, |
| "mean_token_accuracy": 0.6898994512856007, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.2245684598625775, |
| "grad_norm": 1.8485089540481567, |
| "learning_rate": 8.163841807909605e-05, |
| "loss": 17.3078, |
| "mean_token_accuracy": 0.7000270999968052, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.22792022792022792, |
| "grad_norm": 1.8579105138778687, |
| "learning_rate": 8.128531073446328e-05, |
| "loss": 17.3078, |
| "mean_token_accuracy": 0.6995702408254146, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.23127199597787834, |
| "grad_norm": 1.7994352579116821, |
| "learning_rate": 8.093220338983051e-05, |
| "loss": 17.7557, |
| "mean_token_accuracy": 0.6928035505115986, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.23462376403552873, |
| "grad_norm": 1.9240634441375732, |
| "learning_rate": 8.057909604519774e-05, |
| "loss": 17.4329, |
| "mean_token_accuracy": 0.6960855178534985, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.23797553209317915, |
| "grad_norm": 1.6718952655792236, |
| "learning_rate": 8.022598870056498e-05, |
| "loss": 17.5951, |
| "mean_token_accuracy": 0.6947735913097859, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.24132730015082957, |
| "grad_norm": 1.6835826635360718, |
| "learning_rate": 7.98728813559322e-05, |
| "loss": 18.1085, |
| "mean_token_accuracy": 0.6882089108228684, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.24467906820847998, |
| "grad_norm": 1.7387073040008545, |
| "learning_rate": 7.951977401129944e-05, |
| "loss": 17.799, |
| "mean_token_accuracy": 0.6932998545467853, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.24803083626613037, |
| "grad_norm": 2.0071725845336914, |
| "learning_rate": 7.916666666666666e-05, |
| "loss": 17.4076, |
| "mean_token_accuracy": 0.6961173862218857, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.2513826043237808, |
| "grad_norm": 2.326915740966797, |
| "learning_rate": 7.88135593220339e-05, |
| "loss": 17.3121, |
| "mean_token_accuracy": 0.7005321949720382, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.2547343723814312, |
| "grad_norm": 2.1876060962677, |
| "learning_rate": 7.846045197740113e-05, |
| "loss": 17.9069, |
| "mean_token_accuracy": 0.6906426399946213, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.2580861404390816, |
| "grad_norm": 1.849671483039856, |
| "learning_rate": 7.810734463276837e-05, |
| "loss": 17.483, |
| "mean_token_accuracy": 0.7000573620200157, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.26143790849673204, |
| "grad_norm": 1.6676862239837646, |
| "learning_rate": 7.775423728813561e-05, |
| "loss": 16.8936, |
| "mean_token_accuracy": 0.7045633904635906, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.26478967655438246, |
| "grad_norm": 1.6702505350112915, |
| "learning_rate": 7.740112994350283e-05, |
| "loss": 17.904, |
| "mean_token_accuracy": 0.6874841086566448, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.2681414446120328, |
| "grad_norm": 1.7280704975128174, |
| "learning_rate": 7.704802259887007e-05, |
| "loss": 17.4515, |
| "mean_token_accuracy": 0.7018027983605861, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.27149321266968324, |
| "grad_norm": 1.8801991939544678, |
| "learning_rate": 7.669491525423729e-05, |
| "loss": 17.43, |
| "mean_token_accuracy": 0.7009049601852894, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.27484498072733365, |
| "grad_norm": 1.9758073091506958, |
| "learning_rate": 7.634180790960453e-05, |
| "loss": 17.5984, |
| "mean_token_accuracy": 0.6948069363832474, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.27819674878498407, |
| "grad_norm": 1.5747147798538208, |
| "learning_rate": 7.598870056497176e-05, |
| "loss": 18.3079, |
| "mean_token_accuracy": 0.6853139907121658, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.2815485168426345, |
| "grad_norm": 1.6292234659194946, |
| "learning_rate": 7.563559322033898e-05, |
| "loss": 17.4527, |
| "mean_token_accuracy": 0.697540608048439, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.2849002849002849, |
| "grad_norm": 1.6185086965560913, |
| "learning_rate": 7.528248587570622e-05, |
| "loss": 17.4193, |
| "mean_token_accuracy": 0.7012022204697133, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.2882520529579353, |
| "grad_norm": 1.8361762762069702, |
| "learning_rate": 7.492937853107344e-05, |
| "loss": 17.4544, |
| "mean_token_accuracy": 0.698820473998785, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.29160382101558574, |
| "grad_norm": 1.7740592956542969, |
| "learning_rate": 7.457627118644068e-05, |
| "loss": 18.0507, |
| "mean_token_accuracy": 0.6881603226065636, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.29495558907323616, |
| "grad_norm": 1.8252911567687988, |
| "learning_rate": 7.42231638418079e-05, |
| "loss": 17.155, |
| "mean_token_accuracy": 0.7065504610538482, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.2983073571308865, |
| "grad_norm": 1.8424382209777832, |
| "learning_rate": 7.387005649717514e-05, |
| "loss": 17.3055, |
| "mean_token_accuracy": 0.6978819817304611, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.30165912518853694, |
| "grad_norm": 1.7494243383407593, |
| "learning_rate": 7.351694915254238e-05, |
| "loss": 16.8365, |
| "mean_token_accuracy": 0.7099504336714745, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.30501089324618735, |
| "grad_norm": 1.936540961265564, |
| "learning_rate": 7.316384180790961e-05, |
| "loss": 18.2753, |
| "mean_token_accuracy": 0.6913827233016491, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.30836266130383777, |
| "grad_norm": 1.810272216796875, |
| "learning_rate": 7.281073446327685e-05, |
| "loss": 17.0536, |
| "mean_token_accuracy": 0.6986232809722424, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.3117144293614882, |
| "grad_norm": 1.6832094192504883, |
| "learning_rate": 7.245762711864407e-05, |
| "loss": 17.2231, |
| "mean_token_accuracy": 0.702030860632658, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.3150661974191386, |
| "grad_norm": 1.8872151374816895, |
| "learning_rate": 7.21045197740113e-05, |
| "loss": 17.5502, |
| "mean_token_accuracy": 0.6932449921965599, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.318417965476789, |
| "grad_norm": 1.788021445274353, |
| "learning_rate": 7.175141242937854e-05, |
| "loss": 16.8596, |
| "mean_token_accuracy": 0.7096694305539131, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.32176973353443944, |
| "grad_norm": 1.8025559186935425, |
| "learning_rate": 7.139830508474577e-05, |
| "loss": 16.662, |
| "mean_token_accuracy": 0.7063573338091373, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.3251215015920898, |
| "grad_norm": 2.274674654006958, |
| "learning_rate": 7.1045197740113e-05, |
| "loss": 17.5965, |
| "mean_token_accuracy": 0.6934389650821686, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.3284732696497402, |
| "grad_norm": 1.6426053047180176, |
| "learning_rate": 7.069209039548022e-05, |
| "loss": 17.0914, |
| "mean_token_accuracy": 0.7049042917788029, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.33182503770739064, |
| "grad_norm": 1.6252586841583252, |
| "learning_rate": 7.033898305084746e-05, |
| "loss": 17.6078, |
| "mean_token_accuracy": 0.6924709647893905, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.33517680576504105, |
| "grad_norm": 1.7185930013656616, |
| "learning_rate": 6.998587570621468e-05, |
| "loss": 17.314, |
| "mean_token_accuracy": 0.7039985358715057, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.33852857382269147, |
| "grad_norm": 1.7891852855682373, |
| "learning_rate": 6.963276836158192e-05, |
| "loss": 17.2188, |
| "mean_token_accuracy": 0.6977060906589031, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.3418803418803419, |
| "grad_norm": 1.9103929996490479, |
| "learning_rate": 6.927966101694916e-05, |
| "loss": 17.4467, |
| "mean_token_accuracy": 0.6982413403689861, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.3452321099379923, |
| "grad_norm": 1.8996375799179077, |
| "learning_rate": 6.892655367231638e-05, |
| "loss": 16.9608, |
| "mean_token_accuracy": 0.7054095402359962, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.3485838779956427, |
| "grad_norm": 2.0335419178009033, |
| "learning_rate": 6.857344632768362e-05, |
| "loss": 17.3361, |
| "mean_token_accuracy": 0.7016568422317505, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.35193564605329314, |
| "grad_norm": 1.9008755683898926, |
| "learning_rate": 6.822033898305085e-05, |
| "loss": 16.9694, |
| "mean_token_accuracy": 0.7059390284121037, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.3552874141109435, |
| "grad_norm": 1.8340988159179688, |
| "learning_rate": 6.786723163841809e-05, |
| "loss": 17.3528, |
| "mean_token_accuracy": 0.7033507622778415, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.3586391821685939, |
| "grad_norm": 1.6903594732284546, |
| "learning_rate": 6.751412429378532e-05, |
| "loss": 17.3021, |
| "mean_token_accuracy": 0.7001501135528088, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.36199095022624433, |
| "grad_norm": 1.8101950883865356, |
| "learning_rate": 6.716101694915255e-05, |
| "loss": 17.938, |
| "mean_token_accuracy": 0.6908830553293228, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.36534271828389475, |
| "grad_norm": 1.6470075845718384, |
| "learning_rate": 6.680790960451978e-05, |
| "loss": 17.6612, |
| "mean_token_accuracy": 0.6923478744924069, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.36869448634154517, |
| "grad_norm": 2.1860337257385254, |
| "learning_rate": 6.6454802259887e-05, |
| "loss": 17.5684, |
| "mean_token_accuracy": 0.6983748801052571, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.3720462543991956, |
| "grad_norm": 1.717653512954712, |
| "learning_rate": 6.610169491525424e-05, |
| "loss": 17.1166, |
| "mean_token_accuracy": 0.7025655619800091, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.375398022456846, |
| "grad_norm": 1.9525723457336426, |
| "learning_rate": 6.574858757062147e-05, |
| "loss": 17.2908, |
| "mean_token_accuracy": 0.6997996769845486, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.3787497905144964, |
| "grad_norm": 1.6053602695465088, |
| "learning_rate": 6.53954802259887e-05, |
| "loss": 17.3894, |
| "mean_token_accuracy": 0.698741364479065, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.3821015585721468, |
| "grad_norm": 1.7356934547424316, |
| "learning_rate": 6.504237288135594e-05, |
| "loss": 17.1546, |
| "mean_token_accuracy": 0.7013543620705605, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.3854533266297972, |
| "grad_norm": 1.7188559770584106, |
| "learning_rate": 6.468926553672316e-05, |
| "loss": 17.7637, |
| "mean_token_accuracy": 0.6936320647597313, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.3888050946874476, |
| "grad_norm": 1.8413478136062622, |
| "learning_rate": 6.43361581920904e-05, |
| "loss": 17.8498, |
| "mean_token_accuracy": 0.695782047510147, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.39215686274509803, |
| "grad_norm": 1.5715190172195435, |
| "learning_rate": 6.398305084745762e-05, |
| "loss": 17.4304, |
| "mean_token_accuracy": 0.6989135831594467, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.39550863080274845, |
| "grad_norm": 1.8729442358016968, |
| "learning_rate": 6.362994350282486e-05, |
| "loss": 16.9125, |
| "mean_token_accuracy": 0.708356649428606, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.39886039886039887, |
| "grad_norm": 2.099592685699463, |
| "learning_rate": 6.327683615819209e-05, |
| "loss": 17.542, |
| "mean_token_accuracy": 0.6888726130127907, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.4022121669180493, |
| "grad_norm": 1.6204314231872559, |
| "learning_rate": 6.292372881355933e-05, |
| "loss": 16.9305, |
| "mean_token_accuracy": 0.7038852870464325, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.4055639349756997, |
| "grad_norm": 2.12034010887146, |
| "learning_rate": 6.257062146892656e-05, |
| "loss": 17.0389, |
| "mean_token_accuracy": 0.704576326906681, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.4089157030333501, |
| "grad_norm": 1.6821502447128296, |
| "learning_rate": 6.221751412429379e-05, |
| "loss": 16.788, |
| "mean_token_accuracy": 0.7000284940004349, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.4122674710910005, |
| "grad_norm": 1.8137435913085938, |
| "learning_rate": 6.186440677966102e-05, |
| "loss": 17.5926, |
| "mean_token_accuracy": 0.6961537927389145, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.4156192391486509, |
| "grad_norm": 1.6652235984802246, |
| "learning_rate": 6.151129943502825e-05, |
| "loss": 17.3539, |
| "mean_token_accuracy": 0.7028377398848533, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.4189710072063013, |
| "grad_norm": 1.766480803489685, |
| "learning_rate": 6.115819209039548e-05, |
| "loss": 17.529, |
| "mean_token_accuracy": 0.6905739739537239, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.42232277526395173, |
| "grad_norm": 1.6319854259490967, |
| "learning_rate": 6.080508474576272e-05, |
| "loss": 16.9847, |
| "mean_token_accuracy": 0.7060947254300117, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.42567454332160215, |
| "grad_norm": 2.1006696224212646, |
| "learning_rate": 6.045197740112994e-05, |
| "loss": 16.9317, |
| "mean_token_accuracy": 0.7015593230724335, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.42902631137925257, |
| "grad_norm": 1.7353427410125732, |
| "learning_rate": 6.009887005649718e-05, |
| "loss": 17.4744, |
| "mean_token_accuracy": 0.7001501567661762, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.432378079436903, |
| "grad_norm": 1.9449700117111206, |
| "learning_rate": 5.974576271186441e-05, |
| "loss": 16.8705, |
| "mean_token_accuracy": 0.7026407413184643, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.4357298474945534, |
| "grad_norm": 1.6030067205429077, |
| "learning_rate": 5.9392655367231644e-05, |
| "loss": 16.8924, |
| "mean_token_accuracy": 0.702277285605669, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.43908161555220376, |
| "grad_norm": 1.5722424983978271, |
| "learning_rate": 5.903954802259888e-05, |
| "loss": 17.364, |
| "mean_token_accuracy": 0.6959278948605061, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.4424333836098542, |
| "grad_norm": 1.8168216943740845, |
| "learning_rate": 5.86864406779661e-05, |
| "loss": 16.704, |
| "mean_token_accuracy": 0.7045813865959645, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.4457851516675046, |
| "grad_norm": 1.905402660369873, |
| "learning_rate": 5.833333333333334e-05, |
| "loss": 16.8896, |
| "mean_token_accuracy": 0.7026248089969158, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.449136919725155, |
| "grad_norm": 1.7437454462051392, |
| "learning_rate": 5.798022598870056e-05, |
| "loss": 17.0496, |
| "mean_token_accuracy": 0.702862861007452, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.45248868778280543, |
| "grad_norm": 1.7496871948242188, |
| "learning_rate": 5.76271186440678e-05, |
| "loss": 16.7024, |
| "mean_token_accuracy": 0.7073140636086463, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.45584045584045585, |
| "grad_norm": 1.6521803140640259, |
| "learning_rate": 5.727401129943503e-05, |
| "loss": 17.4437, |
| "mean_token_accuracy": 0.6910906590521335, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.45919222389810627, |
| "grad_norm": 1.7904677391052246, |
| "learning_rate": 5.6920903954802264e-05, |
| "loss": 17.4803, |
| "mean_token_accuracy": 0.6987466789782047, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.4625439919557567, |
| "grad_norm": 2.4545388221740723, |
| "learning_rate": 5.65677966101695e-05, |
| "loss": 17.2987, |
| "mean_token_accuracy": 0.699196208268404, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.46589576001340705, |
| "grad_norm": 1.6428866386413574, |
| "learning_rate": 5.6214689265536723e-05, |
| "loss": 16.7636, |
| "mean_token_accuracy": 0.7029999569058418, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.46924752807105746, |
| "grad_norm": 1.9685977697372437, |
| "learning_rate": 5.586158192090396e-05, |
| "loss": 17.3887, |
| "mean_token_accuracy": 0.6938736639916897, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.4725992961287079, |
| "grad_norm": 1.5567928552627563, |
| "learning_rate": 5.550847457627118e-05, |
| "loss": 17.1879, |
| "mean_token_accuracy": 0.7024729043245316, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.4759510641863583, |
| "grad_norm": 1.6846567392349243, |
| "learning_rate": 5.515536723163842e-05, |
| "loss": 16.8679, |
| "mean_token_accuracy": 0.7025640495121479, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.4793028322440087, |
| "grad_norm": 1.6596832275390625, |
| "learning_rate": 5.480225988700565e-05, |
| "loss": 16.7137, |
| "mean_token_accuracy": 0.7031160019338131, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.48265460030165913, |
| "grad_norm": 2.04453444480896, |
| "learning_rate": 5.4449152542372885e-05, |
| "loss": 17.0646, |
| "mean_token_accuracy": 0.7018779084086418, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.48600636835930955, |
| "grad_norm": 1.7244528532028198, |
| "learning_rate": 5.409604519774012e-05, |
| "loss": 17.1897, |
| "mean_token_accuracy": 0.6981223806738853, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.48935813641695997, |
| "grad_norm": 1.6929802894592285, |
| "learning_rate": 5.3742937853107344e-05, |
| "loss": 17.2678, |
| "mean_token_accuracy": 0.6996262572705746, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.4927099044746104, |
| "grad_norm": 1.7945303916931152, |
| "learning_rate": 5.338983050847458e-05, |
| "loss": 17.1465, |
| "mean_token_accuracy": 0.7002299666404724, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.49606167253226074, |
| "grad_norm": 1.5936013460159302, |
| "learning_rate": 5.30367231638418e-05, |
| "loss": 17.0265, |
| "mean_token_accuracy": 0.6998031720519066, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.49941344058991116, |
| "grad_norm": 1.553004264831543, |
| "learning_rate": 5.268361581920904e-05, |
| "loss": 16.7301, |
| "mean_token_accuracy": 0.7022854961454869, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.5027652086475616, |
| "grad_norm": 1.7667690515518188, |
| "learning_rate": 5.2330508474576275e-05, |
| "loss": 16.8576, |
| "mean_token_accuracy": 0.7085686258971691, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.5027652086475616, |
| "eval_loss": 1.0600364208221436, |
| "eval_mean_token_accuracy": 0.7049777010093035, |
| "eval_runtime": 1736.5707, |
| "eval_samples_per_second": 1.392, |
| "eval_steps_per_second": 0.174, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.506116976705212, |
| "grad_norm": 1.4901829957962036, |
| "learning_rate": 5.1977401129943505e-05, |
| "loss": 17.0004, |
| "mean_token_accuracy": 0.6990960523486137, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.5094687447628624, |
| "grad_norm": 1.8451662063598633, |
| "learning_rate": 5.162429378531074e-05, |
| "loss": 17.2012, |
| "mean_token_accuracy": 0.7007680244743824, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.5128205128205128, |
| "grad_norm": 1.6952011585235596, |
| "learning_rate": 5.1271186440677964e-05, |
| "loss": 17.612, |
| "mean_token_accuracy": 0.6927438467741013, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.5161722808781632, |
| "grad_norm": 1.7307817935943604, |
| "learning_rate": 5.09180790960452e-05, |
| "loss": 16.8776, |
| "mean_token_accuracy": 0.706513649225235, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.5195240489358136, |
| "grad_norm": 1.6692585945129395, |
| "learning_rate": 5.056497175141243e-05, |
| "loss": 17.0364, |
| "mean_token_accuracy": 0.704279126226902, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.5228758169934641, |
| "grad_norm": 1.6963402032852173, |
| "learning_rate": 5.0211864406779666e-05, |
| "loss": 16.8957, |
| "mean_token_accuracy": 0.7085353158414364, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.5262275850511144, |
| "grad_norm": 1.678458571434021, |
| "learning_rate": 4.9858757062146896e-05, |
| "loss": 17.7932, |
| "mean_token_accuracy": 0.6964584030210972, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.5295793531087649, |
| "grad_norm": 1.7449827194213867, |
| "learning_rate": 4.9505649717514125e-05, |
| "loss": 16.8765, |
| "mean_token_accuracy": 0.7036922007799149, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.5329311211664153, |
| "grad_norm": 1.7107524871826172, |
| "learning_rate": 4.915254237288136e-05, |
| "loss": 17.243, |
| "mean_token_accuracy": 0.6997682720422744, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.5362828892240656, |
| "grad_norm": 1.6416223049163818, |
| "learning_rate": 4.879943502824859e-05, |
| "loss": 16.7253, |
| "mean_token_accuracy": 0.7050332672894001, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5396346572817161, |
| "grad_norm": 1.867213249206543, |
| "learning_rate": 4.844632768361582e-05, |
| "loss": 16.8566, |
| "mean_token_accuracy": 0.7032786093652248, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.5429864253393665, |
| "grad_norm": 1.6539360284805298, |
| "learning_rate": 4.809322033898305e-05, |
| "loss": 16.6993, |
| "mean_token_accuracy": 0.7117977932095527, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.546338193397017, |
| "grad_norm": 1.752715826034546, |
| "learning_rate": 4.7740112994350286e-05, |
| "loss": 17.5809, |
| "mean_token_accuracy": 0.6992670528590679, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.5496899614546673, |
| "grad_norm": 1.806174397468567, |
| "learning_rate": 4.7387005649717516e-05, |
| "loss": 17.1588, |
| "mean_token_accuracy": 0.6960965767502785, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.5530417295123178, |
| "grad_norm": 1.719764232635498, |
| "learning_rate": 4.703389830508475e-05, |
| "loss": 16.8685, |
| "mean_token_accuracy": 0.7025568410754204, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.5563934975699681, |
| "grad_norm": 1.7800629138946533, |
| "learning_rate": 4.668079096045198e-05, |
| "loss": 16.8872, |
| "mean_token_accuracy": 0.6994628652930259, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.5597452656276186, |
| "grad_norm": 1.7011103630065918, |
| "learning_rate": 4.632768361581921e-05, |
| "loss": 17.2342, |
| "mean_token_accuracy": 0.7006913289427757, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.563097033685269, |
| "grad_norm": 1.6887695789337158, |
| "learning_rate": 4.597457627118644e-05, |
| "loss": 16.7385, |
| "mean_token_accuracy": 0.7045929700136184, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.5664488017429193, |
| "grad_norm": 1.9496142864227295, |
| "learning_rate": 4.562146892655367e-05, |
| "loss": 16.8387, |
| "mean_token_accuracy": 0.7083131410181522, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.5698005698005698, |
| "grad_norm": 1.7757388353347778, |
| "learning_rate": 4.5268361581920906e-05, |
| "loss": 17.3856, |
| "mean_token_accuracy": 0.6994826771318913, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5731523378582202, |
| "grad_norm": 1.7115302085876465, |
| "learning_rate": 4.491525423728814e-05, |
| "loss": 16.5993, |
| "mean_token_accuracy": 0.7093915119767189, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.5765041059158706, |
| "grad_norm": 1.7968231439590454, |
| "learning_rate": 4.456214689265537e-05, |
| "loss": 16.8983, |
| "mean_token_accuracy": 0.7087731070816516, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.579855873973521, |
| "grad_norm": 1.6066899299621582, |
| "learning_rate": 4.42090395480226e-05, |
| "loss": 16.7126, |
| "mean_token_accuracy": 0.7053335346281528, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.5832076420311715, |
| "grad_norm": 1.6380205154418945, |
| "learning_rate": 4.385593220338983e-05, |
| "loss": 17.0037, |
| "mean_token_accuracy": 0.7038719221949578, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.5865594100888218, |
| "grad_norm": 1.8956695795059204, |
| "learning_rate": 4.350282485875706e-05, |
| "loss": 16.9679, |
| "mean_token_accuracy": 0.6983371920883655, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.5899111781464723, |
| "grad_norm": 1.625135064125061, |
| "learning_rate": 4.314971751412429e-05, |
| "loss": 17.0642, |
| "mean_token_accuracy": 0.7067640118300915, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.5932629462041227, |
| "grad_norm": 1.6344581842422485, |
| "learning_rate": 4.279661016949153e-05, |
| "loss": 16.3079, |
| "mean_token_accuracy": 0.7225491903722286, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.596614714261773, |
| "grad_norm": 1.7680976390838623, |
| "learning_rate": 4.244350282485876e-05, |
| "loss": 16.7187, |
| "mean_token_accuracy": 0.7041032016277313, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.5999664823194235, |
| "grad_norm": 1.8056613206863403, |
| "learning_rate": 4.209039548022599e-05, |
| "loss": 17.3536, |
| "mean_token_accuracy": 0.6975419208407402, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.6033182503770739, |
| "grad_norm": 1.8398966789245605, |
| "learning_rate": 4.173728813559322e-05, |
| "loss": 16.6245, |
| "mean_token_accuracy": 0.7088275127112865, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6066700184347243, |
| "grad_norm": 1.8332566022872925, |
| "learning_rate": 4.138418079096045e-05, |
| "loss": 17.0128, |
| "mean_token_accuracy": 0.7018843114376068, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.6100217864923747, |
| "grad_norm": 1.6582337617874146, |
| "learning_rate": 4.103107344632768e-05, |
| "loss": 16.8948, |
| "mean_token_accuracy": 0.7051651798188686, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.6133735545500252, |
| "grad_norm": 1.7373839616775513, |
| "learning_rate": 4.067796610169492e-05, |
| "loss": 16.9138, |
| "mean_token_accuracy": 0.7022108249366283, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.6167253226076755, |
| "grad_norm": 1.6373577117919922, |
| "learning_rate": 4.0324858757062154e-05, |
| "loss": 17.0573, |
| "mean_token_accuracy": 0.7042267486453057, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.620077090665326, |
| "grad_norm": 1.581024408340454, |
| "learning_rate": 3.997175141242938e-05, |
| "loss": 16.6234, |
| "mean_token_accuracy": 0.7054463028907776, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.6234288587229764, |
| "grad_norm": 1.6900616884231567, |
| "learning_rate": 3.961864406779661e-05, |
| "loss": 17.0468, |
| "mean_token_accuracy": 0.7014504976570606, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.6267806267806267, |
| "grad_norm": 1.6560430526733398, |
| "learning_rate": 3.926553672316384e-05, |
| "loss": 16.909, |
| "mean_token_accuracy": 0.7064756542444229, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.6301323948382772, |
| "grad_norm": 1.8687000274658203, |
| "learning_rate": 3.891242937853107e-05, |
| "loss": 17.0047, |
| "mean_token_accuracy": 0.7055176287889481, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.6334841628959276, |
| "grad_norm": 1.777716040611267, |
| "learning_rate": 3.855932203389831e-05, |
| "loss": 16.556, |
| "mean_token_accuracy": 0.7047871246933937, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.636835930953578, |
| "grad_norm": 1.6830016374588013, |
| "learning_rate": 3.820621468926554e-05, |
| "loss": 16.5832, |
| "mean_token_accuracy": 0.7049862682819367, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.6401876990112284, |
| "grad_norm": 1.5959638357162476, |
| "learning_rate": 3.7853107344632774e-05, |
| "loss": 16.8336, |
| "mean_token_accuracy": 0.7072055459022522, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.6435394670688789, |
| "grad_norm": 1.82794189453125, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 16.6644, |
| "mean_token_accuracy": 0.7058505766093731, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.6468912351265292, |
| "grad_norm": 1.6554478406906128, |
| "learning_rate": 3.714689265536723e-05, |
| "loss": 16.2796, |
| "mean_token_accuracy": 0.7101977132260799, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.6502430031841796, |
| "grad_norm": 1.8698370456695557, |
| "learning_rate": 3.679378531073446e-05, |
| "loss": 16.1934, |
| "mean_token_accuracy": 0.7142874717712402, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.6535947712418301, |
| "grad_norm": 1.8040566444396973, |
| "learning_rate": 3.644067796610169e-05, |
| "loss": 16.5345, |
| "mean_token_accuracy": 0.7125143676996231, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.6569465392994804, |
| "grad_norm": 1.6644558906555176, |
| "learning_rate": 3.608757062146893e-05, |
| "loss": 16.508, |
| "mean_token_accuracy": 0.7078846462070942, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.6602983073571309, |
| "grad_norm": 1.7228506803512573, |
| "learning_rate": 3.573446327683616e-05, |
| "loss": 16.8474, |
| "mean_token_accuracy": 0.7084795109927654, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.6636500754147813, |
| "grad_norm": 1.486241102218628, |
| "learning_rate": 3.5381355932203394e-05, |
| "loss": 17.1453, |
| "mean_token_accuracy": 0.6975291892886162, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.6670018434724317, |
| "grad_norm": 1.7130765914916992, |
| "learning_rate": 3.5028248587570624e-05, |
| "loss": 16.458, |
| "mean_token_accuracy": 0.7106956362724304, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.6703536115300821, |
| "grad_norm": 1.863926649093628, |
| "learning_rate": 3.467514124293785e-05, |
| "loss": 17.3095, |
| "mean_token_accuracy": 0.6962033234536648, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6737053795877326, |
| "grad_norm": 1.6535072326660156, |
| "learning_rate": 3.432203389830508e-05, |
| "loss": 16.6846, |
| "mean_token_accuracy": 0.7084034703671932, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.6770571476453829, |
| "grad_norm": 1.7278594970703125, |
| "learning_rate": 3.396892655367232e-05, |
| "loss": 16.9805, |
| "mean_token_accuracy": 0.7026786416769027, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.6804089157030333, |
| "grad_norm": 1.9055004119873047, |
| "learning_rate": 3.361581920903955e-05, |
| "loss": 17.2562, |
| "mean_token_accuracy": 0.6977267302572727, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.6837606837606838, |
| "grad_norm": 1.6398614645004272, |
| "learning_rate": 3.326271186440678e-05, |
| "loss": 17.3378, |
| "mean_token_accuracy": 0.6958214737474918, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.6871124518183341, |
| "grad_norm": 1.926950454711914, |
| "learning_rate": 3.2909604519774014e-05, |
| "loss": 16.6536, |
| "mean_token_accuracy": 0.7083842910826206, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.6904642198759846, |
| "grad_norm": 1.8061659336090088, |
| "learning_rate": 3.2556497175141244e-05, |
| "loss": 16.643, |
| "mean_token_accuracy": 0.7093963578343392, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.693815987933635, |
| "grad_norm": 1.6816084384918213, |
| "learning_rate": 3.2203389830508473e-05, |
| "loss": 16.9696, |
| "mean_token_accuracy": 0.7000316813588142, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.6971677559912854, |
| "grad_norm": 1.630842685699463, |
| "learning_rate": 3.185028248587571e-05, |
| "loss": 16.587, |
| "mean_token_accuracy": 0.7107978977262974, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.7005195240489358, |
| "grad_norm": 1.755123257637024, |
| "learning_rate": 3.149717514124294e-05, |
| "loss": 17.0736, |
| "mean_token_accuracy": 0.7017260067164898, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.7038712921065863, |
| "grad_norm": 1.4850029945373535, |
| "learning_rate": 3.114406779661017e-05, |
| "loss": 16.3165, |
| "mean_token_accuracy": 0.7119720429182053, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.7072230601642366, |
| "grad_norm": 1.916961908340454, |
| "learning_rate": 3.0790960451977405e-05, |
| "loss": 17.0237, |
| "mean_token_accuracy": 0.6976533338427544, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.710574828221887, |
| "grad_norm": 1.5003294944763184, |
| "learning_rate": 3.043785310734463e-05, |
| "loss": 16.8504, |
| "mean_token_accuracy": 0.7056308597326278, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.7139265962795375, |
| "grad_norm": 1.9166836738586426, |
| "learning_rate": 3.0084745762711864e-05, |
| "loss": 16.8231, |
| "mean_token_accuracy": 0.7023352533578873, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.7172783643371878, |
| "grad_norm": 1.7789411544799805, |
| "learning_rate": 2.97316384180791e-05, |
| "loss": 17.3132, |
| "mean_token_accuracy": 0.6994914725422859, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.7206301323948383, |
| "grad_norm": 1.7289875745773315, |
| "learning_rate": 2.937853107344633e-05, |
| "loss": 17.3902, |
| "mean_token_accuracy": 0.69447166249156, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.7239819004524887, |
| "grad_norm": 1.4835467338562012, |
| "learning_rate": 2.902542372881356e-05, |
| "loss": 16.751, |
| "mean_token_accuracy": 0.7052346661686897, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.7273336685101391, |
| "grad_norm": 1.5802119970321655, |
| "learning_rate": 2.8672316384180792e-05, |
| "loss": 16.6574, |
| "mean_token_accuracy": 0.7059398606419564, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.7306854365677895, |
| "grad_norm": 1.8420851230621338, |
| "learning_rate": 2.8319209039548022e-05, |
| "loss": 16.9315, |
| "mean_token_accuracy": 0.7063411138951778, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.7340372046254399, |
| "grad_norm": 1.7593777179718018, |
| "learning_rate": 2.7966101694915255e-05, |
| "loss": 16.8653, |
| "mean_token_accuracy": 0.7089171193540096, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.7373889726830903, |
| "grad_norm": 1.681443452835083, |
| "learning_rate": 2.7612994350282488e-05, |
| "loss": 16.9878, |
| "mean_token_accuracy": 0.7057393230497837, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.7407407407407407, |
| "grad_norm": 1.6064281463623047, |
| "learning_rate": 2.725988700564972e-05, |
| "loss": 16.6153, |
| "mean_token_accuracy": 0.7038764618337154, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.7440925087983912, |
| "grad_norm": 1.5632483959197998, |
| "learning_rate": 2.690677966101695e-05, |
| "loss": 16.0927, |
| "mean_token_accuracy": 0.7171440742909908, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.7474442768560415, |
| "grad_norm": 1.8588156700134277, |
| "learning_rate": 2.6553672316384183e-05, |
| "loss": 16.5765, |
| "mean_token_accuracy": 0.7098327249288559, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.750796044913692, |
| "grad_norm": 1.5576221942901611, |
| "learning_rate": 2.6200564971751413e-05, |
| "loss": 16.6568, |
| "mean_token_accuracy": 0.7029327027499676, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.7541478129713424, |
| "grad_norm": 1.645244836807251, |
| "learning_rate": 2.5847457627118642e-05, |
| "loss": 16.7294, |
| "mean_token_accuracy": 0.7060277953743934, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.7574995810289928, |
| "grad_norm": 1.4038984775543213, |
| "learning_rate": 2.549435028248588e-05, |
| "loss": 16.5925, |
| "mean_token_accuracy": 0.7068064086139202, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.7608513490866432, |
| "grad_norm": 1.7987641096115112, |
| "learning_rate": 2.514124293785311e-05, |
| "loss": 16.6834, |
| "mean_token_accuracy": 0.7070130936801433, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.7642031171442936, |
| "grad_norm": 1.5423444509506226, |
| "learning_rate": 2.478813559322034e-05, |
| "loss": 16.4551, |
| "mean_token_accuracy": 0.7121224895119667, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.767554885201944, |
| "grad_norm": 1.7546942234039307, |
| "learning_rate": 2.443502824858757e-05, |
| "loss": 16.9741, |
| "mean_token_accuracy": 0.7010989025235176, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.7709066532595944, |
| "grad_norm": 1.8481935262680054, |
| "learning_rate": 2.4081920903954803e-05, |
| "loss": 16.6323, |
| "mean_token_accuracy": 0.7058765202760696, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.7742584213172449, |
| "grad_norm": 1.6855909824371338, |
| "learning_rate": 2.3728813559322036e-05, |
| "loss": 16.6844, |
| "mean_token_accuracy": 0.7119428858160972, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.7776101893748952, |
| "grad_norm": 1.9828130006790161, |
| "learning_rate": 2.3375706214689266e-05, |
| "loss": 16.866, |
| "mean_token_accuracy": 0.7036800056695938, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.7809619574325457, |
| "grad_norm": 1.5005120038986206, |
| "learning_rate": 2.30225988700565e-05, |
| "loss": 16.3539, |
| "mean_token_accuracy": 0.711839384585619, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.7843137254901961, |
| "grad_norm": 2.262735366821289, |
| "learning_rate": 2.266949152542373e-05, |
| "loss": 16.4102, |
| "mean_token_accuracy": 0.7110463745892048, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.7876654935478465, |
| "grad_norm": 1.6699568033218384, |
| "learning_rate": 2.231638418079096e-05, |
| "loss": 17.1027, |
| "mean_token_accuracy": 0.7031991191208362, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.7910172616054969, |
| "grad_norm": 1.6248890161514282, |
| "learning_rate": 2.196327683615819e-05, |
| "loss": 16.3399, |
| "mean_token_accuracy": 0.7143234215676785, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.7943690296631473, |
| "grad_norm": 1.7570775747299194, |
| "learning_rate": 2.1610169491525427e-05, |
| "loss": 16.2255, |
| "mean_token_accuracy": 0.7123358778655529, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.7977207977207977, |
| "grad_norm": 1.9391677379608154, |
| "learning_rate": 2.1257062146892657e-05, |
| "loss": 16.3472, |
| "mean_token_accuracy": 0.711616413295269, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.8010725657784481, |
| "grad_norm": 1.8997981548309326, |
| "learning_rate": 2.0903954802259886e-05, |
| "loss": 16.5601, |
| "mean_token_accuracy": 0.7071553356945515, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.8044243338360986, |
| "grad_norm": 1.6094359159469604, |
| "learning_rate": 2.055084745762712e-05, |
| "loss": 16.622, |
| "mean_token_accuracy": 0.7043877936899662, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8077761018937489, |
| "grad_norm": 1.7940973043441772, |
| "learning_rate": 2.0197740112994352e-05, |
| "loss": 16.6535, |
| "mean_token_accuracy": 0.705554535984993, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.8111278699513994, |
| "grad_norm": 1.6890041828155518, |
| "learning_rate": 1.984463276836158e-05, |
| "loss": 17.2328, |
| "mean_token_accuracy": 0.6988375537097454, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.8144796380090498, |
| "grad_norm": 1.5568735599517822, |
| "learning_rate": 1.9491525423728814e-05, |
| "loss": 16.9753, |
| "mean_token_accuracy": 0.7015632651746273, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.8178314060667002, |
| "grad_norm": 1.7157835960388184, |
| "learning_rate": 1.9138418079096047e-05, |
| "loss": 16.3668, |
| "mean_token_accuracy": 0.7098449252545833, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.8211831741243506, |
| "grad_norm": 1.7175644636154175, |
| "learning_rate": 1.8785310734463277e-05, |
| "loss": 16.8061, |
| "mean_token_accuracy": 0.7032932281494141, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.824534942182001, |
| "grad_norm": 1.7225829362869263, |
| "learning_rate": 1.843220338983051e-05, |
| "loss": 16.5716, |
| "mean_token_accuracy": 0.7074852548539639, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.8278867102396514, |
| "grad_norm": 1.8654727935791016, |
| "learning_rate": 1.8079096045197743e-05, |
| "loss": 16.8172, |
| "mean_token_accuracy": 0.7035241700708866, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.8312384782973018, |
| "grad_norm": 1.9604694843292236, |
| "learning_rate": 1.7725988700564972e-05, |
| "loss": 16.2992, |
| "mean_token_accuracy": 0.714275274425745, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.8345902463549523, |
| "grad_norm": 1.7569185495376587, |
| "learning_rate": 1.7372881355932205e-05, |
| "loss": 16.6269, |
| "mean_token_accuracy": 0.7052666112780571, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.8379420144126026, |
| "grad_norm": 1.6537069082260132, |
| "learning_rate": 1.7019774011299435e-05, |
| "loss": 16.5978, |
| "mean_token_accuracy": 0.708269502967596, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.8412937824702531, |
| "grad_norm": 1.8623359203338623, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 16.1831, |
| "mean_token_accuracy": 0.7164609245955944, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.8446455505279035, |
| "grad_norm": 1.7004101276397705, |
| "learning_rate": 1.63135593220339e-05, |
| "loss": 16.9611, |
| "mean_token_accuracy": 0.7057129152119159, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.8479973185855538, |
| "grad_norm": 1.8294973373413086, |
| "learning_rate": 1.596045197740113e-05, |
| "loss": 16.8036, |
| "mean_token_accuracy": 0.7046464517712593, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.8513490866432043, |
| "grad_norm": 1.7992702722549438, |
| "learning_rate": 1.5607344632768363e-05, |
| "loss": 16.139, |
| "mean_token_accuracy": 0.7126708298921585, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.8547008547008547, |
| "grad_norm": 2.033846855163574, |
| "learning_rate": 1.5254237288135596e-05, |
| "loss": 16.49, |
| "mean_token_accuracy": 0.707030464708805, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.8580526227585051, |
| "grad_norm": 1.690617561340332, |
| "learning_rate": 1.4901129943502825e-05, |
| "loss": 16.7829, |
| "mean_token_accuracy": 0.7026272863149643, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.8614043908161555, |
| "grad_norm": 1.7161706686019897, |
| "learning_rate": 1.4548022598870056e-05, |
| "loss": 16.4907, |
| "mean_token_accuracy": 0.7054763376712799, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.864756158873806, |
| "grad_norm": 1.5910500288009644, |
| "learning_rate": 1.419491525423729e-05, |
| "loss": 16.3073, |
| "mean_token_accuracy": 0.7165283918380737, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.8681079269314563, |
| "grad_norm": 1.5939749479293823, |
| "learning_rate": 1.384180790960452e-05, |
| "loss": 16.6524, |
| "mean_token_accuracy": 0.705347529053688, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.8714596949891068, |
| "grad_norm": 1.7478996515274048, |
| "learning_rate": 1.3488700564971752e-05, |
| "loss": 17.1832, |
| "mean_token_accuracy": 0.6956523738801479, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.8748114630467572, |
| "grad_norm": 1.6442205905914307, |
| "learning_rate": 1.3135593220338985e-05, |
| "loss": 16.3978, |
| "mean_token_accuracy": 0.7132278561592102, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.8781632311044075, |
| "grad_norm": 1.7201565504074097, |
| "learning_rate": 1.2782485875706216e-05, |
| "loss": 16.3159, |
| "mean_token_accuracy": 0.711051919311285, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.881514999162058, |
| "grad_norm": 1.829209327697754, |
| "learning_rate": 1.2429378531073447e-05, |
| "loss": 16.7987, |
| "mean_token_accuracy": 0.7058401651680469, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.8848667672197084, |
| "grad_norm": 1.4660886526107788, |
| "learning_rate": 1.2076271186440678e-05, |
| "loss": 16.7297, |
| "mean_token_accuracy": 0.7092804253101349, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.8882185352773588, |
| "grad_norm": 1.4927663803100586, |
| "learning_rate": 1.172316384180791e-05, |
| "loss": 15.9333, |
| "mean_token_accuracy": 0.7158772744238376, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.8915703033350092, |
| "grad_norm": 1.6522186994552612, |
| "learning_rate": 1.137005649717514e-05, |
| "loss": 16.4156, |
| "mean_token_accuracy": 0.7134528748691082, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.8949220713926597, |
| "grad_norm": 1.7809523344039917, |
| "learning_rate": 1.1016949152542374e-05, |
| "loss": 16.2625, |
| "mean_token_accuracy": 0.7148336976766586, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.89827383945031, |
| "grad_norm": 1.8860619068145752, |
| "learning_rate": 1.0663841807909605e-05, |
| "loss": 16.6187, |
| "mean_token_accuracy": 0.7087382405996323, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.9016256075079605, |
| "grad_norm": 1.854195475578308, |
| "learning_rate": 1.0310734463276836e-05, |
| "loss": 16.5843, |
| "mean_token_accuracy": 0.7144103929400444, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.9049773755656109, |
| "grad_norm": 1.7052239179611206, |
| "learning_rate": 9.957627118644067e-06, |
| "loss": 16.3345, |
| "mean_token_accuracy": 0.7125584341585636, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.9083291436232612, |
| "grad_norm": 1.5887420177459717, |
| "learning_rate": 9.6045197740113e-06, |
| "loss": 16.2409, |
| "mean_token_accuracy": 0.7080107174813748, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.9116809116809117, |
| "grad_norm": 1.6052732467651367, |
| "learning_rate": 9.251412429378532e-06, |
| "loss": 16.2373, |
| "mean_token_accuracy": 0.7137157171964645, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.9150326797385621, |
| "grad_norm": 1.7612617015838623, |
| "learning_rate": 8.898305084745763e-06, |
| "loss": 16.0292, |
| "mean_token_accuracy": 0.7181592255830764, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.9183844477962125, |
| "grad_norm": 1.8271749019622803, |
| "learning_rate": 8.545197740112996e-06, |
| "loss": 16.8757, |
| "mean_token_accuracy": 0.701992305368185, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.9217362158538629, |
| "grad_norm": 1.6350926160812378, |
| "learning_rate": 8.192090395480225e-06, |
| "loss": 16.6061, |
| "mean_token_accuracy": 0.7089238859713077, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.9250879839115134, |
| "grad_norm": 1.7321621179580688, |
| "learning_rate": 7.838983050847458e-06, |
| "loss": 16.2532, |
| "mean_token_accuracy": 0.7115737572312355, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.9284397519691637, |
| "grad_norm": 1.8958040475845337, |
| "learning_rate": 7.48587570621469e-06, |
| "loss": 16.5068, |
| "mean_token_accuracy": 0.7108790181577206, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.9317915200268141, |
| "grad_norm": 1.629992127418518, |
| "learning_rate": 7.1327683615819206e-06, |
| "loss": 16.2367, |
| "mean_token_accuracy": 0.7134776934981346, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.9351432880844646, |
| "grad_norm": 1.904123067855835, |
| "learning_rate": 6.779661016949153e-06, |
| "loss": 16.3444, |
| "mean_token_accuracy": 0.7045241884887219, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.9384950561421149, |
| "grad_norm": 1.6319600343704224, |
| "learning_rate": 6.426553672316385e-06, |
| "loss": 16.3, |
| "mean_token_accuracy": 0.7118948072195053, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.9418468241997654, |
| "grad_norm": 1.6921709775924683, |
| "learning_rate": 6.073446327683617e-06, |
| "loss": 16.5816, |
| "mean_token_accuracy": 0.7079687170684338, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.9451985922574158, |
| "grad_norm": 1.636551856994629, |
| "learning_rate": 5.720338983050848e-06, |
| "loss": 16.785, |
| "mean_token_accuracy": 0.7054948009550571, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.9485503603150662, |
| "grad_norm": 1.6171858310699463, |
| "learning_rate": 5.367231638418079e-06, |
| "loss": 16.6877, |
| "mean_token_accuracy": 0.7033485405147075, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.9519021283727166, |
| "grad_norm": 1.6833641529083252, |
| "learning_rate": 5.014124293785311e-06, |
| "loss": 16.5803, |
| "mean_token_accuracy": 0.706027402728796, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.9552538964303671, |
| "grad_norm": 2.0238494873046875, |
| "learning_rate": 4.6610169491525425e-06, |
| "loss": 16.4305, |
| "mean_token_accuracy": 0.7110757566988468, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.9586056644880174, |
| "grad_norm": 1.5262683629989624, |
| "learning_rate": 4.307909604519774e-06, |
| "loss": 16.105, |
| "mean_token_accuracy": 0.7173994883894921, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.9619574325456678, |
| "grad_norm": 1.6822128295898438, |
| "learning_rate": 3.954802259887006e-06, |
| "loss": 17.0064, |
| "mean_token_accuracy": 0.7033144362270832, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.9653092006033183, |
| "grad_norm": 2.1382946968078613, |
| "learning_rate": 3.6016949152542374e-06, |
| "loss": 16.6567, |
| "mean_token_accuracy": 0.7085098147392273, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.9686609686609686, |
| "grad_norm": 1.6137080192565918, |
| "learning_rate": 3.248587570621469e-06, |
| "loss": 16.4193, |
| "mean_token_accuracy": 0.7077061600983143, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.9720127367186191, |
| "grad_norm": 1.6318018436431885, |
| "learning_rate": 2.8954802259887007e-06, |
| "loss": 16.5904, |
| "mean_token_accuracy": 0.7037704810500145, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.9753645047762695, |
| "grad_norm": 1.6723519563674927, |
| "learning_rate": 2.5423728813559323e-06, |
| "loss": 16.351, |
| "mean_token_accuracy": 0.715372896194458, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.9787162728339199, |
| "grad_norm": 2.6915719509124756, |
| "learning_rate": 2.189265536723164e-06, |
| "loss": 16.5627, |
| "mean_token_accuracy": 0.706637478619814, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.9820680408915703, |
| "grad_norm": 1.9349390268325806, |
| "learning_rate": 1.8361581920903956e-06, |
| "loss": 16.7821, |
| "mean_token_accuracy": 0.7010103747248649, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.9854198089492208, |
| "grad_norm": 1.6685172319412231, |
| "learning_rate": 1.4830508474576273e-06, |
| "loss": 16.7016, |
| "mean_token_accuracy": 0.7086931586265564, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.9887715770068711, |
| "grad_norm": 1.7148998975753784, |
| "learning_rate": 1.129943502824859e-06, |
| "loss": 16.4809, |
| "mean_token_accuracy": 0.7131018862128258, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.9921233450645215, |
| "grad_norm": 1.8873836994171143, |
| "learning_rate": 7.768361581920904e-07, |
| "loss": 16.5183, |
| "mean_token_accuracy": 0.7111847102642059, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.995475113122172, |
| "grad_norm": 1.8390552997589111, |
| "learning_rate": 4.2372881355932204e-07, |
| "loss": 16.1742, |
| "mean_token_accuracy": 0.7128683432936669, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.9988268811798223, |
| "grad_norm": 1.8799461126327515, |
| "learning_rate": 7.062146892655368e-08, |
| "loss": 17.1633, |
| "mean_token_accuracy": 0.6963419988751411, |
| "step": 1490 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1491, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 750, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.5012213304045076e+19, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|