{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999074845036544, "eval_steps": 100, "global_step": 1351, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003700619853825516, "grad_norm": 6.406365314676644, "learning_rate": 7.352941176470589e-07, "loss": 1.3566, "mean_token_accuracy": 0.6660232350230217, "step": 5 }, { "epoch": 0.007401239707651032, "grad_norm": 6.123712399933804, "learning_rate": 1.4705882352941177e-06, "loss": 1.3913, "mean_token_accuracy": 0.6558953940868377, "step": 10 }, { "epoch": 0.011101859561476548, "grad_norm": 5.275448607027431, "learning_rate": 2.2058823529411767e-06, "loss": 1.3404, "mean_token_accuracy": 0.6668361470103263, "step": 15 }, { "epoch": 0.014802479415302064, "grad_norm": 3.222168252214971, "learning_rate": 2.9411764705882355e-06, "loss": 1.2979, "mean_token_accuracy": 0.6731562301516533, "step": 20 }, { "epoch": 0.01850309926912758, "grad_norm": 2.304245354229246, "learning_rate": 3.6764705882352946e-06, "loss": 1.2317, "mean_token_accuracy": 0.6820817664265633, "step": 25 }, { "epoch": 0.022203719122953096, "grad_norm": 2.2635687771667734, "learning_rate": 4.411764705882353e-06, "loss": 1.19, "mean_token_accuracy": 0.6874127045273781, "step": 30 }, { "epoch": 0.025904338976778612, "grad_norm": 1.6106011837583105, "learning_rate": 5.147058823529411e-06, "loss": 1.1346, "mean_token_accuracy": 0.6967306867241859, "step": 35 }, { "epoch": 0.029604958830604128, "grad_norm": 1.2780962221463628, "learning_rate": 5.882352941176471e-06, "loss": 1.0694, "mean_token_accuracy": 0.7095885664224625, "step": 40 }, { "epoch": 0.03330557868442964, "grad_norm": 1.637550942271707, "learning_rate": 6.61764705882353e-06, "loss": 1.056, "mean_token_accuracy": 0.7120530039072037, "step": 45 }, { "epoch": 0.03700619853825516, "grad_norm": 1.2301760518659035, "learning_rate": 7.352941176470589e-06, "loss": 1.03, "mean_token_accuracy": 0.7162090480327606, "step": 50 }, { "epoch": 0.04070681839208067, "grad_norm": 0.9170716667796568, "learning_rate": 8.088235294117648e-06, "loss": 1.0283, "mean_token_accuracy": 0.7158895432949066, "step": 55 }, { "epoch": 0.04440743824590619, "grad_norm": 1.0543932075765083, "learning_rate": 8.823529411764707e-06, "loss": 0.9877, "mean_token_accuracy": 0.7251855596899986, "step": 60 }, { "epoch": 0.048108058099731704, "grad_norm": 0.9843413417015394, "learning_rate": 9.558823529411766e-06, "loss": 0.9974, "mean_token_accuracy": 0.7208766683936119, "step": 65 }, { "epoch": 0.051808677953557224, "grad_norm": 0.9511465656581665, "learning_rate": 1.0294117647058823e-05, "loss": 0.9924, "mean_token_accuracy": 0.7231188163161277, "step": 70 }, { "epoch": 0.055509297807382736, "grad_norm": 2.825049696114851, "learning_rate": 1.1029411764705885e-05, "loss": 0.9782, "mean_token_accuracy": 0.7259459629654884, "step": 75 }, { "epoch": 0.059209917661208256, "grad_norm": 0.938362593121529, "learning_rate": 1.1764705882352942e-05, "loss": 0.9686, "mean_token_accuracy": 0.7275517180562019, "step": 80 }, { "epoch": 0.06291053751503377, "grad_norm": 0.9202800308554298, "learning_rate": 1.25e-05, "loss": 0.9794, "mean_token_accuracy": 0.7241677865386009, "step": 85 }, { "epoch": 0.06661115736885928, "grad_norm": 0.9332127502056355, "learning_rate": 1.323529411764706e-05, "loss": 0.9497, "mean_token_accuracy": 0.731817427277565, "step": 90 }, { "epoch": 0.0703117772226848, "grad_norm": 1.0837119699847717, "learning_rate": 1.3970588235294118e-05, "loss": 0.9852, "mean_token_accuracy": 0.7222246199846267, "step": 95 }, { "epoch": 0.07401239707651032, "grad_norm": 0.8948953129317817, "learning_rate": 1.4705882352941179e-05, "loss": 0.9463, "mean_token_accuracy": 0.7313005596399307, "step": 100 }, { "epoch": 0.07401239707651032, "eval_loss": 0.9788710474967957, "eval_mean_token_accuracy": 0.7260306134368434, "eval_runtime": 24.5669, "eval_samples_per_second": 20.882, "eval_steps_per_second": 1.343, "step": 100 }, { "epoch": 0.07771301693033583, "grad_norm": 0.9653033723452836, "learning_rate": 1.5441176470588237e-05, "loss": 0.9696, "mean_token_accuracy": 0.7255482524633408, "step": 105 }, { "epoch": 0.08141363678416134, "grad_norm": 0.9349548842470216, "learning_rate": 1.6176470588235296e-05, "loss": 0.9722, "mean_token_accuracy": 0.7248646602034569, "step": 110 }, { "epoch": 0.08511425663798686, "grad_norm": 0.8956374335067367, "learning_rate": 1.6911764705882355e-05, "loss": 0.9591, "mean_token_accuracy": 0.7272942900657654, "step": 115 }, { "epoch": 0.08881487649181238, "grad_norm": 0.9788467585120308, "learning_rate": 1.7647058823529414e-05, "loss": 0.9363, "mean_token_accuracy": 0.7335584476590157, "step": 120 }, { "epoch": 0.0925154963456379, "grad_norm": 0.9134626099519884, "learning_rate": 1.8382352941176472e-05, "loss": 0.9276, "mean_token_accuracy": 0.7360474243760109, "step": 125 }, { "epoch": 0.09621611619946341, "grad_norm": 0.9753561569488689, "learning_rate": 1.911764705882353e-05, "loss": 0.9263, "mean_token_accuracy": 0.7367036134004593, "step": 130 }, { "epoch": 0.09991673605328892, "grad_norm": 0.9447239853385726, "learning_rate": 1.985294117647059e-05, "loss": 0.9358, "mean_token_accuracy": 0.7343964487314224, "step": 135 }, { "epoch": 0.10361735590711445, "grad_norm": 0.9093049891627352, "learning_rate": 1.9999465148392906e-05, "loss": 0.9199, "mean_token_accuracy": 0.7370649874210358, "step": 140 }, { "epoch": 0.10731797576093996, "grad_norm": 0.9478337858776447, "learning_rate": 1.999729241179462e-05, "loss": 0.9378, "mean_token_accuracy": 0.7336751863360405, "step": 145 }, { "epoch": 0.11101859561476547, "grad_norm": 0.8569290211164649, "learning_rate": 1.999344872485215e-05, "loss": 0.9038, "mean_token_accuracy": 0.7422620430588722, "step": 150 }, { "epoch": 0.11471921546859098, "grad_norm": 1.036972518186118, "learning_rate": 1.9987934730000457e-05, "loss": 0.9031, "mean_token_accuracy": 0.741059948503971, "step": 155 }, { "epoch": 0.11841983532241651, "grad_norm": 0.951663289147615, "learning_rate": 1.998075134885022e-05, "loss": 0.9171, "mean_token_accuracy": 0.7384189561009407, "step": 160 }, { "epoch": 0.12212045517624202, "grad_norm": 0.8919119572868265, "learning_rate": 1.9971899782033853e-05, "loss": 0.9378, "mean_token_accuracy": 0.7339910715818405, "step": 165 }, { "epoch": 0.12582107503006754, "grad_norm": 0.9627820867919544, "learning_rate": 1.9961381509004785e-05, "loss": 0.9548, "mean_token_accuracy": 0.7276575401425361, "step": 170 }, { "epoch": 0.12952169488389306, "grad_norm": 1.0867703495663577, "learning_rate": 1.9949198287790215e-05, "loss": 0.8957, "mean_token_accuracy": 0.7430507227778435, "step": 175 }, { "epoch": 0.13322231473771856, "grad_norm": 0.9434565750923781, "learning_rate": 1.9935352154697257e-05, "loss": 0.9326, "mean_token_accuracy": 0.7323492169380188, "step": 180 }, { "epoch": 0.1369229345915441, "grad_norm": 0.9338703985952416, "learning_rate": 1.9919845423972603e-05, "loss": 0.9152, "mean_token_accuracy": 0.7383304908871651, "step": 185 }, { "epoch": 0.1406235544453696, "grad_norm": 0.8449751394948155, "learning_rate": 1.9902680687415704e-05, "loss": 0.9115, "mean_token_accuracy": 0.7386730879545211, "step": 190 }, { "epoch": 0.1443241742991951, "grad_norm": 0.9634477921697367, "learning_rate": 1.9883860813945596e-05, "loss": 0.9472, "mean_token_accuracy": 0.7278202414512634, "step": 195 }, { "epoch": 0.14802479415302064, "grad_norm": 0.8797771752554515, "learning_rate": 1.986338894912137e-05, "loss": 0.9133, "mean_token_accuracy": 0.7370981857180595, "step": 200 }, { "epoch": 0.14802479415302064, "eval_loss": 0.9444138407707214, "eval_mean_token_accuracy": 0.7318792343139648, "eval_runtime": 24.471, "eval_samples_per_second": 20.964, "eval_steps_per_second": 1.349, "step": 200 }, { "epoch": 0.15172541400684614, "grad_norm": 0.9730130557080766, "learning_rate": 1.9841268514616434e-05, "loss": 0.9072, "mean_token_accuracy": 0.7391636997461319, "step": 205 }, { "epoch": 0.15542603386067166, "grad_norm": 0.9307244607734099, "learning_rate": 1.9817503207646606e-05, "loss": 0.9083, "mean_token_accuracy": 0.7380757689476013, "step": 210 }, { "epoch": 0.1591266537144972, "grad_norm": 0.8802319735889644, "learning_rate": 1.979209700035216e-05, "loss": 0.9354, "mean_token_accuracy": 0.7329269647598267, "step": 215 }, { "epoch": 0.1628272735683227, "grad_norm": 0.8725363780112383, "learning_rate": 1.976505413913393e-05, "loss": 0.9128, "mean_token_accuracy": 0.7383672297000885, "step": 220 }, { "epoch": 0.16652789342214822, "grad_norm": 0.903336658421562, "learning_rate": 1.9736379143943565e-05, "loss": 0.9385, "mean_token_accuracy": 0.7308298230171204, "step": 225 }, { "epoch": 0.17022851327597371, "grad_norm": 0.8621415714119711, "learning_rate": 1.9706076807528044e-05, "loss": 0.9239, "mean_token_accuracy": 0.7339740186929703, "step": 230 }, { "epoch": 0.17392913312979924, "grad_norm": 0.9079088453730583, "learning_rate": 1.967415219462864e-05, "loss": 0.9217, "mean_token_accuracy": 0.7344419181346893, "step": 235 }, { "epoch": 0.17762975298362477, "grad_norm": 0.8253363960767214, "learning_rate": 1.9640610641134383e-05, "loss": 0.9103, "mean_token_accuracy": 0.7382244557142258, "step": 240 }, { "epoch": 0.18133037283745027, "grad_norm": 0.934084069747635, "learning_rate": 1.9605457753190224e-05, "loss": 0.9152, "mean_token_accuracy": 0.7362594842910767, "step": 245 }, { "epoch": 0.1850309926912758, "grad_norm": 0.8995132563871583, "learning_rate": 1.9568699406260016e-05, "loss": 0.931, "mean_token_accuracy": 0.7331584557890892, "step": 250 }, { "epoch": 0.18873161254510132, "grad_norm": 0.8420380152677727, "learning_rate": 1.953034174414449e-05, "loss": 0.8976, "mean_token_accuracy": 0.7409126415848732, "step": 255 }, { "epoch": 0.19243223239892682, "grad_norm": 0.8794392938382097, "learning_rate": 1.9490391177954383e-05, "loss": 0.8971, "mean_token_accuracy": 0.7426333785057068, "step": 260 }, { "epoch": 0.19613285225275234, "grad_norm": 0.871108034639782, "learning_rate": 1.944885438503888e-05, "loss": 0.9153, "mean_token_accuracy": 0.7366316050291062, "step": 265 }, { "epoch": 0.19983347210657784, "grad_norm": 0.8826333710088322, "learning_rate": 1.9405738307869565e-05, "loss": 0.9093, "mean_token_accuracy": 0.7374404519796371, "step": 270 }, { "epoch": 0.20353409196040337, "grad_norm": 0.9186513628105667, "learning_rate": 1.936105015288003e-05, "loss": 0.9206, "mean_token_accuracy": 0.7328536227345467, "step": 275 }, { "epoch": 0.2072347118142289, "grad_norm": 0.892096099101671, "learning_rate": 1.9314797389261426e-05, "loss": 0.9193, "mean_token_accuracy": 0.7348861888051033, "step": 280 }, { "epoch": 0.2109353316680544, "grad_norm": 0.8240507326046963, "learning_rate": 1.9266987747714036e-05, "loss": 0.897, "mean_token_accuracy": 0.7411905080080032, "step": 285 }, { "epoch": 0.21463595152187992, "grad_norm": 0.867466389731865, "learning_rate": 1.9217629219155172e-05, "loss": 0.9087, "mean_token_accuracy": 0.7377493485808373, "step": 290 }, { "epoch": 0.21833657137570542, "grad_norm": 0.8533932703930674, "learning_rate": 1.916673005338357e-05, "loss": 0.8899, "mean_token_accuracy": 0.7420720800757408, "step": 295 }, { "epoch": 0.22203719122953094, "grad_norm": 0.8385314803046641, "learning_rate": 1.9114298757700508e-05, "loss": 0.9295, "mean_token_accuracy": 0.7315363213419914, "step": 300 }, { "epoch": 0.22203719122953094, "eval_loss": 0.9261447191238403, "eval_mean_token_accuracy": 0.7356406341899525, "eval_runtime": 24.4594, "eval_samples_per_second": 20.974, "eval_steps_per_second": 1.349, "step": 300 }, { "epoch": 0.22573781108335647, "grad_norm": 0.8344167353514693, "learning_rate": 1.9060344095487916e-05, "loss": 0.9021, "mean_token_accuracy": 0.7388954728841781, "step": 305 }, { "epoch": 0.22943843093718197, "grad_norm": 0.8166938367608658, "learning_rate": 1.9004875084743624e-05, "loss": 0.8745, "mean_token_accuracy": 0.7464573740959167, "step": 310 }, { "epoch": 0.2331390507910075, "grad_norm": 0.8583610850262193, "learning_rate": 1.8947900996574133e-05, "loss": 0.8906, "mean_token_accuracy": 0.7421576008200645, "step": 315 }, { "epoch": 0.23683967064483302, "grad_norm": 0.85619616919622, "learning_rate": 1.8889431353645004e-05, "loss": 0.9083, "mean_token_accuracy": 0.7389416947960854, "step": 320 }, { "epoch": 0.24054029049865852, "grad_norm": 0.95321268460484, "learning_rate": 1.8829475928589272e-05, "loss": 0.9037, "mean_token_accuracy": 0.7408745899796486, "step": 325 }, { "epoch": 0.24424091035248405, "grad_norm": 0.9192097687099984, "learning_rate": 1.8768044742374008e-05, "loss": 0.8844, "mean_token_accuracy": 0.7437578573822975, "step": 330 }, { "epoch": 0.24794153020630955, "grad_norm": 0.8224814882376746, "learning_rate": 1.870514806262544e-05, "loss": 0.8793, "mean_token_accuracy": 0.7451973140239716, "step": 335 }, { "epoch": 0.2516421500601351, "grad_norm": 0.8277417287111005, "learning_rate": 1.8640796401912805e-05, "loss": 0.8937, "mean_token_accuracy": 0.7406307712197304, "step": 340 }, { "epoch": 0.2553427699139606, "grad_norm": 0.8459764358981483, "learning_rate": 1.8575000515991283e-05, "loss": 0.8979, "mean_token_accuracy": 0.7395652890205383, "step": 345 }, { "epoch": 0.2590433897677861, "grad_norm": 0.8033711981121063, "learning_rate": 1.850777140200427e-05, "loss": 0.8813, "mean_token_accuracy": 0.7449199944734574, "step": 350 }, { "epoch": 0.2627440096216116, "grad_norm": 0.7910864449390781, "learning_rate": 1.843912029664531e-05, "loss": 0.8965, "mean_token_accuracy": 0.7398232161998749, "step": 355 }, { "epoch": 0.2664446294754371, "grad_norm": 0.8451406694855826, "learning_rate": 1.8369058674280004e-05, "loss": 0.8943, "mean_token_accuracy": 0.7411910384893418, "step": 360 }, { "epoch": 0.27014524932926265, "grad_norm": 0.8333682523549819, "learning_rate": 1.8297598245028173e-05, "loss": 0.9132, "mean_token_accuracy": 0.7367936804890632, "step": 365 }, { "epoch": 0.2738458691830882, "grad_norm": 0.8175542933467003, "learning_rate": 1.8224750952806626e-05, "loss": 0.8769, "mean_token_accuracy": 0.7455912932753563, "step": 370 }, { "epoch": 0.2775464890369137, "grad_norm": 0.8434576136182329, "learning_rate": 1.815052897333284e-05, "loss": 0.8808, "mean_token_accuracy": 0.7459388568997383, "step": 375 }, { "epoch": 0.2812471088907392, "grad_norm": 0.8936045983724704, "learning_rate": 1.8074944712089925e-05, "loss": 0.9024, "mean_token_accuracy": 0.7386106207966805, "step": 380 }, { "epoch": 0.2849477287445647, "grad_norm": 0.8151663680460399, "learning_rate": 1.799801080225316e-05, "loss": 0.8912, "mean_token_accuracy": 0.7416763469576836, "step": 385 }, { "epoch": 0.2886483485983902, "grad_norm": 0.8482431323316972, "learning_rate": 1.7919740102578482e-05, "loss": 0.8887, "mean_token_accuracy": 0.7428235754370689, "step": 390 }, { "epoch": 0.29234896845221575, "grad_norm": 0.8424168395876551, "learning_rate": 1.7840145695253258e-05, "loss": 0.8769, "mean_token_accuracy": 0.744986218214035, "step": 395 }, { "epoch": 0.2960495883060413, "grad_norm": 0.7880509490552058, "learning_rate": 1.7759240883709745e-05, "loss": 0.8706, "mean_token_accuracy": 0.74755879342556, "step": 400 }, { "epoch": 0.2960495883060413, "eval_loss": 0.9140655398368835, "eval_mean_token_accuracy": 0.7376509543621179, "eval_runtime": 24.4423, "eval_samples_per_second": 20.988, "eval_steps_per_second": 1.35, "step": 400 }, { "epoch": 0.2997502081598668, "grad_norm": 0.9325819637971632, "learning_rate": 1.7677039190401538e-05, "loss": 0.8778, "mean_token_accuracy": 0.7454720690846444, "step": 405 }, { "epoch": 0.3034508280136923, "grad_norm": 0.855702135031522, "learning_rate": 1.759355435454342e-05, "loss": 0.8825, "mean_token_accuracy": 0.7436973512172699, "step": 410 }, { "epoch": 0.3071514478675178, "grad_norm": 0.8190150096476719, "learning_rate": 1.7508800329814993e-05, "loss": 0.8801, "mean_token_accuracy": 0.7447807624936104, "step": 415 }, { "epoch": 0.31085206772134333, "grad_norm": 0.7879901787308187, "learning_rate": 1.7422791282028457e-05, "loss": 0.8926, "mean_token_accuracy": 0.7424971207976341, "step": 420 }, { "epoch": 0.31455268757516885, "grad_norm": 0.8418635735026896, "learning_rate": 1.7335541586760928e-05, "loss": 0.8962, "mean_token_accuracy": 0.740935817360878, "step": 425 }, { "epoch": 0.3182533074289944, "grad_norm": 0.8832514235130526, "learning_rate": 1.7247065826951694e-05, "loss": 0.8843, "mean_token_accuracy": 0.7440172478556633, "step": 430 }, { "epoch": 0.32195392728281985, "grad_norm": 0.9664717868103249, "learning_rate": 1.715737879046483e-05, "loss": 0.8955, "mean_token_accuracy": 0.7396286860108375, "step": 435 }, { "epoch": 0.3256545471366454, "grad_norm": 0.8907608106819015, "learning_rate": 1.7066495467617552e-05, "loss": 0.891, "mean_token_accuracy": 0.7419100046157837, "step": 440 }, { "epoch": 0.3293551669904709, "grad_norm": 0.7921927517011862, "learning_rate": 1.6974431048674714e-05, "loss": 0.8823, "mean_token_accuracy": 0.7444690898060798, "step": 445 }, { "epoch": 0.33305578684429643, "grad_norm": 0.8845253263563585, "learning_rate": 1.6881200921309914e-05, "loss": 0.8908, "mean_token_accuracy": 0.7414721488952637, "step": 450 }, { "epoch": 0.33675640669812196, "grad_norm": 0.9038197553584879, "learning_rate": 1.6786820668033596e-05, "loss": 0.8902, "mean_token_accuracy": 0.7424716472625732, "step": 455 }, { "epoch": 0.34045702655194743, "grad_norm": 0.845478277089973, "learning_rate": 1.6691306063588583e-05, "loss": 0.8862, "mean_token_accuracy": 0.7440178290009498, "step": 460 }, { "epoch": 0.34415764640577295, "grad_norm": 0.7780737971349384, "learning_rate": 1.6594673072313478e-05, "loss": 0.8683, "mean_token_accuracy": 0.7474384486675263, "step": 465 }, { "epoch": 0.3478582662595985, "grad_norm": 0.7883315377481923, "learning_rate": 1.6496937845474375e-05, "loss": 0.852, "mean_token_accuracy": 0.7510827273130417, "step": 470 }, { "epoch": 0.351558886113424, "grad_norm": 0.8522289082869288, "learning_rate": 1.639811671856535e-05, "loss": 0.8999, "mean_token_accuracy": 0.7387788712978363, "step": 475 }, { "epoch": 0.35525950596724953, "grad_norm": 0.8084630790546208, "learning_rate": 1.6298226208578127e-05, "loss": 0.8818, "mean_token_accuracy": 0.7436941042542458, "step": 480 }, { "epoch": 0.358960125821075, "grad_norm": 0.9229148133761365, "learning_rate": 1.6197283011241423e-05, "loss": 0.8909, "mean_token_accuracy": 0.7398700997233391, "step": 485 }, { "epoch": 0.36266074567490053, "grad_norm": 0.7838660795136883, "learning_rate": 1.6095303998230432e-05, "loss": 0.8747, "mean_token_accuracy": 0.7453865185379982, "step": 490 }, { "epoch": 0.36636136552872606, "grad_norm": 0.8988778984216769, "learning_rate": 1.599230621434687e-05, "loss": 0.8798, "mean_token_accuracy": 0.7436122760176659, "step": 495 }, { "epoch": 0.3700619853825516, "grad_norm": 0.8288040124260576, "learning_rate": 1.5888306874670112e-05, "loss": 0.8839, "mean_token_accuracy": 0.7431044474244117, "step": 500 }, { "epoch": 0.3700619853825516, "eval_loss": 0.90340656042099, "eval_mean_token_accuracy": 0.7403560316923893, "eval_runtime": 24.4427, "eval_samples_per_second": 20.988, "eval_steps_per_second": 1.35, "step": 500 }, { "epoch": 0.3737626052363771, "grad_norm": 0.8197782426055139, "learning_rate": 1.5783323361679865e-05, "loss": 0.8799, "mean_token_accuracy": 0.7442870959639549, "step": 505 }, { "epoch": 0.37746322509020264, "grad_norm": 0.7912056501798941, "learning_rate": 1.567737322235084e-05, "loss": 0.8661, "mean_token_accuracy": 0.7487552657723426, "step": 510 }, { "epoch": 0.3811638449440281, "grad_norm": 0.8680442053546329, "learning_rate": 1.557047416521996e-05, "loss": 0.8446, "mean_token_accuracy": 0.7539586842060089, "step": 515 }, { "epoch": 0.38486446479785363, "grad_norm": 0.7966248473049342, "learning_rate": 1.546264405742654e-05, "loss": 0.8894, "mean_token_accuracy": 0.7421392247080802, "step": 520 }, { "epoch": 0.38856508465167916, "grad_norm": 0.7832801079547608, "learning_rate": 1.535390092172597e-05, "loss": 0.8401, "mean_token_accuracy": 0.7548262551426888, "step": 525 }, { "epoch": 0.3922657045055047, "grad_norm": 0.8617978523223222, "learning_rate": 1.5244262933477401e-05, "loss": 0.8676, "mean_token_accuracy": 0.7473892971873284, "step": 530 }, { "epoch": 0.3959663243593302, "grad_norm": 0.8638721458324728, "learning_rate": 1.5133748417605878e-05, "loss": 0.8734, "mean_token_accuracy": 0.7463245391845703, "step": 535 }, { "epoch": 0.3996669442131557, "grad_norm": 0.8732616764981507, "learning_rate": 1.5022375845539537e-05, "loss": 0.8742, "mean_token_accuracy": 0.74578056037426, "step": 540 }, { "epoch": 0.4033675640669812, "grad_norm": 0.8680491958485625, "learning_rate": 1.4910163832122278e-05, "loss": 0.9111, "mean_token_accuracy": 0.7350529983639718, "step": 545 }, { "epoch": 0.40706818392080674, "grad_norm": 0.845161443541315, "learning_rate": 1.4797131132502464e-05, "loss": 0.8543, "mean_token_accuracy": 0.7507802724838257, "step": 550 }, { "epoch": 0.41076880377463226, "grad_norm": 0.7949620736755997, "learning_rate": 1.4683296638998192e-05, "loss": 0.8959, "mean_token_accuracy": 0.7393335103988647, "step": 555 }, { "epoch": 0.4144694236284578, "grad_norm": 0.8137916315549841, "learning_rate": 1.4568679377939619e-05, "loss": 0.8599, "mean_token_accuracy": 0.7493755236268044, "step": 560 }, { "epoch": 0.41817004348228326, "grad_norm": 0.8553447099213015, "learning_rate": 1.4453298506488896e-05, "loss": 0.8562, "mean_token_accuracy": 0.7500303864479065, "step": 565 }, { "epoch": 0.4218706633361088, "grad_norm": 0.78741790468558, "learning_rate": 1.4337173309438236e-05, "loss": 0.8513, "mean_token_accuracy": 0.751435661315918, "step": 570 }, { "epoch": 0.4255712831899343, "grad_norm": 0.8198917515370167, "learning_rate": 1.4220323195986649e-05, "loss": 0.8892, "mean_token_accuracy": 0.740348969399929, "step": 575 }, { "epoch": 0.42927190304375984, "grad_norm": 0.8907770425857867, "learning_rate": 1.4102767696495885e-05, "loss": 0.87, "mean_token_accuracy": 0.7466455265879631, "step": 580 }, { "epoch": 0.43297252289758537, "grad_norm": 0.7746865966025234, "learning_rate": 1.398452645922611e-05, "loss": 0.8543, "mean_token_accuracy": 0.7507255434989929, "step": 585 }, { "epoch": 0.43667314275141084, "grad_norm": 0.8081107449852742, "learning_rate": 1.3865619247051916e-05, "loss": 0.8942, "mean_token_accuracy": 0.7397695079445838, "step": 590 }, { "epoch": 0.44037376260523636, "grad_norm": 0.7928994894049232, "learning_rate": 1.3746065934159123e-05, "loss": 0.8711, "mean_token_accuracy": 0.7459681868553162, "step": 595 }, { "epoch": 0.4440743824590619, "grad_norm": 0.7457840735614479, "learning_rate": 1.3625886502723008e-05, "loss": 0.8838, "mean_token_accuracy": 0.7431347534060478, "step": 600 }, { "epoch": 0.4440743824590619, "eval_loss": 0.8946976065635681, "eval_mean_token_accuracy": 0.7420943668394377, "eval_runtime": 24.4316, "eval_samples_per_second": 20.997, "eval_steps_per_second": 1.351, "step": 600 }, { "epoch": 0.4477750023128874, "grad_norm": 0.7926368650744795, "learning_rate": 1.3505101039568494e-05, "loss": 0.8728, "mean_token_accuracy": 0.7459641486406327, "step": 605 }, { "epoch": 0.45147562216671294, "grad_norm": 0.8000063636466884, "learning_rate": 1.3383729732812814e-05, "loss": 0.8669, "mean_token_accuracy": 0.7467781469225884, "step": 610 }, { "epoch": 0.4551762420205384, "grad_norm": 0.8077945425677387, "learning_rate": 1.3261792868491267e-05, "loss": 0.8805, "mean_token_accuracy": 0.7430326372385025, "step": 615 }, { "epoch": 0.45887686187436394, "grad_norm": 0.7941374586541651, "learning_rate": 1.3139310827166613e-05, "loss": 0.859, "mean_token_accuracy": 0.7493226900696754, "step": 620 }, { "epoch": 0.46257748172818947, "grad_norm": 0.8015163522302382, "learning_rate": 1.3016304080522657e-05, "loss": 0.8573, "mean_token_accuracy": 0.7498888701200486, "step": 625 }, { "epoch": 0.466278101582015, "grad_norm": 0.8987066171816293, "learning_rate": 1.2892793187942588e-05, "loss": 0.8589, "mean_token_accuracy": 0.7495828256011009, "step": 630 }, { "epoch": 0.4699787214358405, "grad_norm": 0.8210878729300097, "learning_rate": 1.2768798793072708e-05, "loss": 0.8397, "mean_token_accuracy": 0.7545604810118676, "step": 635 }, { "epoch": 0.47367934128966604, "grad_norm": 0.8134721151372588, "learning_rate": 1.2644341620372025e-05, "loss": 0.9003, "mean_token_accuracy": 0.7380212768912315, "step": 640 }, { "epoch": 0.4773799611434915, "grad_norm": 0.7617180215752571, "learning_rate": 1.2519442471648364e-05, "loss": 0.8451, "mean_token_accuracy": 0.7525444984436035, "step": 645 }, { "epoch": 0.48108058099731704, "grad_norm": 0.7862729854353447, "learning_rate": 1.2394122222581557e-05, "loss": 0.844, "mean_token_accuracy": 0.7545543164014816, "step": 650 }, { "epoch": 0.48478120085114257, "grad_norm": 0.7602815222283319, "learning_rate": 1.226840181923427e-05, "loss": 0.8628, "mean_token_accuracy": 0.7499380096793175, "step": 655 }, { "epoch": 0.4884818207049681, "grad_norm": 0.7942344985162466, "learning_rate": 1.214230227455106e-05, "loss": 0.8903, "mean_token_accuracy": 0.7406057506799698, "step": 660 }, { "epoch": 0.4921824405587936, "grad_norm": 0.827163943781085, "learning_rate": 1.201584466484629e-05, "loss": 0.8781, "mean_token_accuracy": 0.7433477059006691, "step": 665 }, { "epoch": 0.4958830604126191, "grad_norm": 0.8927178947512965, "learning_rate": 1.1889050126281405e-05, "loss": 0.8762, "mean_token_accuracy": 0.7436618626117706, "step": 670 }, { "epoch": 0.4995836802664446, "grad_norm": 0.893577761513805, "learning_rate": 1.1761939851332241e-05, "loss": 0.8583, "mean_token_accuracy": 0.7491728380322457, "step": 675 }, { "epoch": 0.5032843001202701, "grad_norm": 0.8911659991233541, "learning_rate": 1.1634535085246903e-05, "loss": 0.8347, "mean_token_accuracy": 0.7567671954631805, "step": 680 }, { "epoch": 0.5069849199740957, "grad_norm": 0.7822165331384006, "learning_rate": 1.1506857122494832e-05, "loss": 0.8404, "mean_token_accuracy": 0.754469695687294, "step": 685 }, { "epoch": 0.5106855398279212, "grad_norm": 0.7759748923454998, "learning_rate": 1.1378927303207637e-05, "loss": 0.8741, "mean_token_accuracy": 0.7437764629721642, "step": 690 }, { "epoch": 0.5143861596817467, "grad_norm": 0.8502606153370593, "learning_rate": 1.12507670096123e-05, "loss": 0.8582, "mean_token_accuracy": 0.7506067097187042, "step": 695 }, { "epoch": 0.5180867795355723, "grad_norm": 0.7749617835044338, "learning_rate": 1.1122397662457352e-05, "loss": 0.8538, "mean_token_accuracy": 0.7503936603665352, "step": 700 }, { "epoch": 0.5180867795355723, "eval_loss": 0.888458788394928, "eval_mean_token_accuracy": 0.7433922489484152, "eval_runtime": 24.5105, "eval_samples_per_second": 20.93, "eval_steps_per_second": 1.346, "step": 700 }, { "epoch": 0.5217873993893978, "grad_norm": 0.7706407219978753, "learning_rate": 1.0993840717432582e-05, "loss": 0.8553, "mean_token_accuracy": 0.7487444669008255, "step": 705 }, { "epoch": 0.5254880192432232, "grad_norm": 0.7705189840242923, "learning_rate": 1.0865117661582958e-05, "loss": 0.8858, "mean_token_accuracy": 0.7415965974330903, "step": 710 }, { "epoch": 0.5291886390970487, "grad_norm": 0.8411896195744224, "learning_rate": 1.0736250009717249e-05, "loss": 0.8884, "mean_token_accuracy": 0.7412499740719796, "step": 715 }, { "epoch": 0.5328892589508742, "grad_norm": 0.8230171145609085, "learning_rate": 1.0607259300812047e-05, "loss": 0.8705, "mean_token_accuracy": 0.745503506064415, "step": 720 }, { "epoch": 0.5365898788046998, "grad_norm": 0.8401376931438745, "learning_rate": 1.0478167094411733e-05, "loss": 0.8707, "mean_token_accuracy": 0.7456582605838775, "step": 725 }, { "epoch": 0.5402904986585253, "grad_norm": 0.7558836285060193, "learning_rate": 1.0348994967025012e-05, "loss": 0.8545, "mean_token_accuracy": 0.7487017199397087, "step": 730 }, { "epoch": 0.5439911185123508, "grad_norm": 0.7642061571005638, "learning_rate": 1.0219764508518595e-05, "loss": 0.8654, "mean_token_accuracy": 0.7462582185864448, "step": 735 }, { "epoch": 0.5476917383661764, "grad_norm": 0.8147049407956067, "learning_rate": 1.0090497318508687e-05, "loss": 0.8424, "mean_token_accuracy": 0.75324746966362, "step": 740 }, { "epoch": 0.5513923582200019, "grad_norm": 0.7571808100586339, "learning_rate": 9.961215002750799e-06, "loss": 0.8588, "mean_token_accuracy": 0.7491571202874183, "step": 745 }, { "epoch": 0.5550929780738274, "grad_norm": 0.8038741472628308, "learning_rate": 9.831939169528565e-06, "loss": 0.8582, "mean_token_accuracy": 0.7493733122944832, "step": 750 }, { "epoch": 0.5587935979276529, "grad_norm": 0.848259052134148, "learning_rate": 9.702691426042124e-06, "loss": 0.8526, "mean_token_accuracy": 0.7506043568253518, "step": 755 }, { "epoch": 0.5624942177814783, "grad_norm": 0.7842186156617634, "learning_rate": 9.573493374796694e-06, "loss": 0.8369, "mean_token_accuracy": 0.755862507224083, "step": 760 }, { "epoch": 0.5661948376353039, "grad_norm": 0.7485121651837016, "learning_rate": 9.444366609991916e-06, "loss": 0.8729, "mean_token_accuracy": 0.7438469439744949, "step": 765 }, { "epoch": 0.5698954574891294, "grad_norm": 0.9176291688747804, "learning_rate": 9.315332713912593e-06, "loss": 0.8815, "mean_token_accuracy": 0.7436545789241791, "step": 770 }, { "epoch": 0.5735960773429549, "grad_norm": 0.8416476293024356, "learning_rate": 9.18641325332142e-06, "loss": 0.8579, "mean_token_accuracy": 0.7489444330334664, "step": 775 }, { "epoch": 0.5772966971967805, "grad_norm": 0.8665126504141849, "learning_rate": 9.057629775854314e-06, "loss": 0.8691, "mean_token_accuracy": 0.7452979102730751, "step": 780 }, { "epoch": 0.580997317050606, "grad_norm": 0.7873121719067726, "learning_rate": 8.929003806418934e-06, "loss": 0.8609, "mean_token_accuracy": 0.7471121177077293, "step": 785 }, { "epoch": 0.5846979369044315, "grad_norm": 0.7603699249259593, "learning_rate": 8.800556843597002e-06, "loss": 0.8303, "mean_token_accuracy": 0.7559542834758759, "step": 790 }, { "epoch": 0.588398556758257, "grad_norm": 0.8208200680668915, "learning_rate": 8.672310356051023e-06, "loss": 0.8755, "mean_token_accuracy": 0.742991179227829, "step": 795 }, { "epoch": 0.5920991766120826, "grad_norm": 0.7539353236493428, "learning_rate": 8.544285778936004e-06, "loss": 0.8604, "mean_token_accuracy": 0.7469533935189248, "step": 800 }, { "epoch": 0.5920991766120826, "eval_loss": 0.882485568523407, "eval_mean_token_accuracy": 0.7446909926154397, "eval_runtime": 24.5244, "eval_samples_per_second": 20.918, "eval_steps_per_second": 1.346, "step": 800 }, { "epoch": 0.5957997964659081, "grad_norm": 0.7747168848469933, "learning_rate": 8.416504510316774e-06, "loss": 0.8687, "mean_token_accuracy": 0.7457119628787041, "step": 805 }, { "epoch": 0.5995004163197336, "grad_norm": 0.7965745611460714, "learning_rate": 8.28898790759152e-06, "loss": 0.8719, "mean_token_accuracy": 0.7440481051802635, "step": 810 }, { "epoch": 0.603201036173559, "grad_norm": 0.7287090635465163, "learning_rate": 8.161757283922084e-06, "loss": 0.8259, "mean_token_accuracy": 0.7569043532013893, "step": 815 }, { "epoch": 0.6069016560273846, "grad_norm": 0.7849161686448317, "learning_rate": 8.034833904671698e-06, "loss": 0.8725, "mean_token_accuracy": 0.7442520692944526, "step": 820 }, { "epoch": 0.6106022758812101, "grad_norm": 0.7891027028253116, "learning_rate": 7.908238983850666e-06, "loss": 0.8365, "mean_token_accuracy": 0.75490812510252, "step": 825 }, { "epoch": 0.6143028957350356, "grad_norm": 0.7870957116065473, "learning_rate": 7.781993680570656e-06, "loss": 0.8576, "mean_token_accuracy": 0.7477666437625885, "step": 830 }, { "epoch": 0.6180035155888611, "grad_norm": 0.8664448499276786, "learning_rate": 7.656119095508155e-06, "loss": 0.8332, "mean_token_accuracy": 0.7553980827331543, "step": 835 }, { "epoch": 0.6217041354426867, "grad_norm": 0.8003046870817029, "learning_rate": 7.530636267377706e-06, "loss": 0.8559, "mean_token_accuracy": 0.7494190320372581, "step": 840 }, { "epoch": 0.6254047552965122, "grad_norm": 0.8000920295001714, "learning_rate": 7.405566169415481e-06, "loss": 0.8869, "mean_token_accuracy": 0.7401010394096375, "step": 845 }, { "epoch": 0.6291053751503377, "grad_norm": 0.7895598832562634, "learning_rate": 7.280929705873818e-06, "loss": 0.8648, "mean_token_accuracy": 0.7470688298344612, "step": 850 }, { "epoch": 0.6328059950041632, "grad_norm": 0.7756015770435069, "learning_rate": 7.15674770852727e-06, "loss": 0.8294, "mean_token_accuracy": 0.7561267375946045, "step": 855 }, { "epoch": 0.6365066148579888, "grad_norm": 0.7508734070910927, "learning_rate": 7.033040933190776e-06, "loss": 0.8367, "mean_token_accuracy": 0.7520585179328918, "step": 860 }, { "epoch": 0.6402072347118142, "grad_norm": 0.7569428687497424, "learning_rate": 6.909830056250527e-06, "loss": 0.8498, "mean_token_accuracy": 0.7501048058271408, "step": 865 }, { "epoch": 0.6439078545656397, "grad_norm": 0.7984119233647183, "learning_rate": 6.787135671208126e-06, "loss": 0.8571, "mean_token_accuracy": 0.7490562096238136, "step": 870 }, { "epoch": 0.6476084744194652, "grad_norm": 0.8159593622616607, "learning_rate": 6.6649782852385554e-06, "loss": 0.8686, "mean_token_accuracy": 0.7469749033451081, "step": 875 }, { "epoch": 0.6513090942732908, "grad_norm": 0.7631802495882793, "learning_rate": 6.543378315762634e-06, "loss": 0.8657, "mean_token_accuracy": 0.7458467945456505, "step": 880 }, { "epoch": 0.6550097141271163, "grad_norm": 0.812760399325826, "learning_rate": 6.42235608703441e-06, "loss": 0.8635, "mean_token_accuracy": 0.7484910488128662, "step": 885 }, { "epoch": 0.6587103339809418, "grad_norm": 0.8219546330443505, "learning_rate": 6.301931826744189e-06, "loss": 0.8363, "mean_token_accuracy": 0.7542862921953202, "step": 890 }, { "epoch": 0.6624109538347673, "grad_norm": 0.7882602270918212, "learning_rate": 6.18212566263765e-06, "loss": 0.864, "mean_token_accuracy": 0.7470616161823272, "step": 895 }, { "epoch": 0.6661115736885929, "grad_norm": 0.8408472925032482, "learning_rate": 6.0629576191517035e-06, "loss": 0.8619, "mean_token_accuracy": 0.74820506721735, "step": 900 }, { "epoch": 0.6661115736885929, "eval_loss": 0.8769035935401917, "eval_mean_token_accuracy": 0.7459533539685336, "eval_runtime": 24.4702, "eval_samples_per_second": 20.964, "eval_steps_per_second": 1.349, "step": 900 }, { "epoch": 0.6698121935424184, "grad_norm": 0.7754763488387619, "learning_rate": 5.944447614067588e-06, "loss": 0.868, "mean_token_accuracy": 0.7466130316257477, "step": 905 }, { "epoch": 0.6735128133962439, "grad_norm": 0.7533565990898803, "learning_rate": 5.8266154551818225e-06, "loss": 0.8561, "mean_token_accuracy": 0.7474694743752479, "step": 910 }, { "epoch": 0.6772134332500694, "grad_norm": 0.848613374257325, "learning_rate": 5.709480836995509e-06, "loss": 0.867, "mean_token_accuracy": 0.7461282700300217, "step": 915 }, { "epoch": 0.6809140531038949, "grad_norm": 0.7588179424646748, "learning_rate": 5.593063337422595e-06, "loss": 0.8525, "mean_token_accuracy": 0.7497700050473213, "step": 920 }, { "epoch": 0.6846146729577204, "grad_norm": 0.7363445259820587, "learning_rate": 5.477382414517625e-06, "loss": 0.8616, "mean_token_accuracy": 0.7481978580355644, "step": 925 }, { "epoch": 0.6883152928115459, "grad_norm": 0.7698222134303955, "learning_rate": 5.362457403223495e-06, "loss": 0.8553, "mean_token_accuracy": 0.749049125611782, "step": 930 }, { "epoch": 0.6920159126653714, "grad_norm": 0.7718707701904592, "learning_rate": 5.248307512139818e-06, "loss": 0.8616, "mean_token_accuracy": 0.7483461976051331, "step": 935 }, { "epoch": 0.695716532519197, "grad_norm": 0.785721659768036, "learning_rate": 5.134951820312402e-06, "loss": 0.8557, "mean_token_accuracy": 0.7493070676922798, "step": 940 }, { "epoch": 0.6994171523730225, "grad_norm": 0.7715074813141185, "learning_rate": 5.022409274044346e-06, "loss": 0.8493, "mean_token_accuracy": 0.7499154567718506, "step": 945 }, { "epoch": 0.703117772226848, "grad_norm": 0.8335206414183822, "learning_rate": 4.910698683729371e-06, "loss": 0.8373, "mean_token_accuracy": 0.7533765882253647, "step": 950 }, { "epoch": 0.7068183920806735, "grad_norm": 0.7505821456653344, "learning_rate": 4.799838720707847e-06, "loss": 0.8675, "mean_token_accuracy": 0.745770500600338, "step": 955 }, { "epoch": 0.7105190119344991, "grad_norm": 0.7594914324330723, "learning_rate": 4.6898479141460415e-06, "loss": 0.8295, "mean_token_accuracy": 0.756369736790657, "step": 960 }, { "epoch": 0.7142196317883246, "grad_norm": 0.7204641559045669, "learning_rate": 4.580744647939163e-06, "loss": 0.8624, "mean_token_accuracy": 0.7481252133846283, "step": 965 }, { "epoch": 0.71792025164215, "grad_norm": 0.8174852819097401, "learning_rate": 4.472547157638674e-06, "loss": 0.8417, "mean_token_accuracy": 0.7523833066225052, "step": 970 }, { "epoch": 0.7216208714959755, "grad_norm": 0.862526259850387, "learning_rate": 4.365273527404384e-06, "loss": 0.8423, "mean_token_accuracy": 0.754362627863884, "step": 975 }, { "epoch": 0.7253214913498011, "grad_norm": 0.802739929512475, "learning_rate": 4.258941686981864e-06, "loss": 0.8469, "mean_token_accuracy": 0.7522266939282417, "step": 980 }, { "epoch": 0.7290221112036266, "grad_norm": 0.7931469636274437, "learning_rate": 4.15356940870567e-06, "loss": 0.8305, "mean_token_accuracy": 0.7555211216211319, "step": 985 }, { "epoch": 0.7327227310574521, "grad_norm": 0.7884974421740458, "learning_rate": 4.049174304528857e-06, "loss": 0.8327, "mean_token_accuracy": 0.7560925737023354, "step": 990 }, { "epoch": 0.7364233509112776, "grad_norm": 0.7868742780753382, "learning_rate": 3.945773823079315e-06, "loss": 0.8491, "mean_token_accuracy": 0.7494197428226471, "step": 995 }, { "epoch": 0.7401239707651032, "grad_norm": 0.7556900381775965, "learning_rate": 3.8433852467434175e-06, "loss": 0.8337, "mean_token_accuracy": 0.7557049483060837, "step": 1000 }, { "epoch": 0.7401239707651032, "eval_loss": 0.872385561466217, "eval_mean_token_accuracy": 0.7468928351546779, "eval_runtime": 24.5427, "eval_samples_per_second": 20.902, "eval_steps_per_second": 1.345, "step": 1000 }, { "epoch": 0.7438245906189287, "grad_norm": 0.7642811057752702, "learning_rate": 3.742025688777413e-06, "loss": 0.8082, "mean_token_accuracy": 0.7626117318868637, "step": 1005 }, { "epoch": 0.7475252104727542, "grad_norm": 0.7878330759901441, "learning_rate": 3.641712090447125e-06, "loss": 0.8584, "mean_token_accuracy": 0.7494427710771561, "step": 1010 }, { "epoch": 0.7512258303265797, "grad_norm": 0.7817001972699995, "learning_rate": 3.542461218196379e-06, "loss": 0.8596, "mean_token_accuracy": 0.7491939187049865, "step": 1015 }, { "epoch": 0.7549264501804053, "grad_norm": 0.7922908897094753, "learning_rate": 3.444289660844665e-06, "loss": 0.837, "mean_token_accuracy": 0.7543808072805405, "step": 1020 }, { "epoch": 0.7586270700342307, "grad_norm": 0.7945985767364506, "learning_rate": 3.347213826814456e-06, "loss": 0.8662, "mean_token_accuracy": 0.7463411048054696, "step": 1025 }, { "epoch": 0.7623276898880562, "grad_norm": 0.7549974443621008, "learning_rate": 3.2512499413887255e-06, "loss": 0.8303, "mean_token_accuracy": 0.7554700002074242, "step": 1030 }, { "epoch": 0.7660283097418817, "grad_norm": 0.8236761629232633, "learning_rate": 3.1564140439990256e-06, "loss": 0.8406, "mean_token_accuracy": 0.751871857047081, "step": 1035 }, { "epoch": 0.7697289295957073, "grad_norm": 0.7267400739764023, "learning_rate": 3.0627219855446667e-06, "loss": 0.8287, "mean_token_accuracy": 0.7558587804436684, "step": 1040 }, { "epoch": 0.7734295494495328, "grad_norm": 0.776879654823643, "learning_rate": 2.970189425743383e-06, "loss": 0.8361, "mean_token_accuracy": 0.7547884792089462, "step": 1045 }, { "epoch": 0.7771301693033583, "grad_norm": 0.7552506340173256, "learning_rate": 2.8788318305139808e-06, "loss": 0.8286, "mean_token_accuracy": 0.7568780824542045, "step": 1050 }, { "epoch": 0.7808307891571838, "grad_norm": 0.7580209757202828, "learning_rate": 2.7886644693913333e-06, "loss": 0.8459, "mean_token_accuracy": 0.750422203540802, "step": 1055 }, { "epoch": 0.7845314090110094, "grad_norm": 0.7719039642796498, "learning_rate": 2.6997024129742544e-06, "loss": 0.8404, "mean_token_accuracy": 0.7537851154804229, "step": 1060 }, { "epoch": 0.7882320288648349, "grad_norm": 0.7810831799894009, "learning_rate": 2.611960530406572e-06, "loss": 0.8273, "mean_token_accuracy": 0.7569017544388771, "step": 1065 }, { "epoch": 0.7919326487186604, "grad_norm": 0.8590214475734921, "learning_rate": 2.5254534868919077e-06, "loss": 0.8299, "mean_token_accuracy": 0.7555602207779885, "step": 1070 }, { "epoch": 0.7956332685724858, "grad_norm": 0.7451834304748566, "learning_rate": 2.4401957412425213e-06, "loss": 0.847, "mean_token_accuracy": 0.7509198769927025, "step": 1075 }, { "epoch": 0.7993338884263114, "grad_norm": 0.7394748846818531, "learning_rate": 2.3562015434626784e-06, "loss": 0.8478, "mean_token_accuracy": 0.7510868698358536, "step": 1080 }, { "epoch": 0.8030345082801369, "grad_norm": 0.7653067572984463, "learning_rate": 2.273484932366874e-06, "loss": 0.847, "mean_token_accuracy": 0.7503063544631005, "step": 1085 }, { "epoch": 0.8067351281339624, "grad_norm": 0.7743151365038076, "learning_rate": 2.192059733233408e-06, "loss": 0.8472, "mean_token_accuracy": 0.7531051859259605, "step": 1090 }, { "epoch": 0.810435747987788, "grad_norm": 0.7474033991103015, "learning_rate": 2.111939555493603e-06, "loss": 0.8459, "mean_token_accuracy": 0.7517727881669998, "step": 1095 }, { "epoch": 0.8141363678416135, "grad_norm": 0.779312232265544, "learning_rate": 2.0331377904571303e-06, "loss": 0.8276, "mean_token_accuracy": 0.7547013550996781, "step": 1100 }, { "epoch": 0.8141363678416135, "eval_loss": 0.8695138692855835, "eval_mean_token_accuracy": 0.7477564071163987, "eval_runtime": 24.4317, "eval_samples_per_second": 20.997, "eval_steps_per_second": 1.351, "step": 1100 }, { "epoch": 0.817836987695439, "grad_norm": 0.8034562413677603, "learning_rate": 1.9556676090737803e-06, "loss": 0.8317, "mean_token_accuracy": 0.7545135840773582, "step": 1105 }, { "epoch": 0.8215376075492645, "grad_norm": 0.7281049735394789, "learning_rate": 1.879541959732072e-06, "loss": 0.853, "mean_token_accuracy": 0.749828140437603, "step": 1110 }, { "epoch": 0.82523822740309, "grad_norm": 0.7645464251699309, "learning_rate": 1.8047735660950427e-06, "loss": 0.8095, "mean_token_accuracy": 0.7608827918767929, "step": 1115 }, { "epoch": 0.8289388472569156, "grad_norm": 0.7946888766498504, "learning_rate": 1.7313749249736266e-06, "loss": 0.8157, "mean_token_accuracy": 0.760844600200653, "step": 1120 }, { "epoch": 0.832639467110741, "grad_norm": 0.7522474191468512, "learning_rate": 1.6593583042379192e-06, "loss": 0.8347, "mean_token_accuracy": 0.7559689804911613, "step": 1125 }, { "epoch": 0.8363400869645665, "grad_norm": 0.7489873580556787, "learning_rate": 1.5887357407667314e-06, "loss": 0.8605, "mean_token_accuracy": 0.7474079817533493, "step": 1130 }, { "epoch": 0.840040706818392, "grad_norm": 0.7641134150797313, "learning_rate": 1.5195190384357405e-06, "loss": 0.8618, "mean_token_accuracy": 0.7480388507246971, "step": 1135 }, { "epoch": 0.8437413266722176, "grad_norm": 0.7204342341589023, "learning_rate": 1.4517197661445893e-06, "loss": 0.8529, "mean_token_accuracy": 0.7483357265591621, "step": 1140 }, { "epoch": 0.8474419465260431, "grad_norm": 0.7681754372477057, "learning_rate": 1.3853492558832472e-06, "loss": 0.8306, "mean_token_accuracy": 0.7552460536360741, "step": 1145 }, { "epoch": 0.8511425663798686, "grad_norm": 0.7429470084922081, "learning_rate": 1.3204186008379926e-06, "loss": 0.8389, "mean_token_accuracy": 0.7531604886054992, "step": 1150 }, { "epoch": 0.8548431862336942, "grad_norm": 0.6873318445062786, "learning_rate": 1.2569386535372807e-06, "loss": 0.8286, "mean_token_accuracy": 0.7552321195602417, "step": 1155 }, { "epoch": 0.8585438060875197, "grad_norm": 0.7640481385504215, "learning_rate": 1.1949200240378577e-06, "loss": 0.8255, "mean_token_accuracy": 0.7569879427552223, "step": 1160 }, { "epoch": 0.8622444259413452, "grad_norm": 0.7638091289655545, "learning_rate": 1.1343730781513896e-06, "loss": 0.8558, "mean_token_accuracy": 0.749303475022316, "step": 1165 }, { "epoch": 0.8659450457951707, "grad_norm": 0.7685982859289542, "learning_rate": 1.0753079357119134e-06, "loss": 0.8273, "mean_token_accuracy": 0.7565306261181831, "step": 1170 }, { "epoch": 0.8696456656489963, "grad_norm": 0.7114001929021425, "learning_rate": 1.017734468884417e-06, "loss": 0.8473, "mean_token_accuracy": 0.7502565905451775, "step": 1175 }, { "epoch": 0.8733462855028217, "grad_norm": 0.7348011760240892, "learning_rate": 9.616623005147952e-07, "loss": 0.8286, "mean_token_accuracy": 0.7560262143611908, "step": 1180 }, { "epoch": 0.8770469053566472, "grad_norm": 0.7429654371816976, "learning_rate": 9.071008025214767e-07, "loss": 0.8158, "mean_token_accuracy": 0.760336747765541, "step": 1185 }, { "epoch": 0.8807475252104727, "grad_norm": 0.8100937899257782, "learning_rate": 8.540590943290128e-07, "loss": 0.8328, "mean_token_accuracy": 0.7545322120189667, "step": 1190 }, { "epoch": 0.8844481450642983, "grad_norm": 0.7556302951610571, "learning_rate": 8.025460413438457e-07, "loss": 0.842, "mean_token_accuracy": 0.7518000155687332, "step": 1195 }, { "epoch": 0.8881487649181238, "grad_norm": 0.7214919634823889, "learning_rate": 7.525702534725443e-07, "loss": 0.8485, "mean_token_accuracy": 0.7501253366470337, "step": 1200 }, { "epoch": 0.8881487649181238, "eval_loss": 0.8681026101112366, "eval_mean_token_accuracy": 0.7479201501066034, "eval_runtime": 24.4445, "eval_samples_per_second": 20.986, "eval_steps_per_second": 1.35, "step": 1200 }, { "epoch": 0.8918493847719493, "grad_norm": 0.7363787692810877, "learning_rate": 7.041400836827439e-07, "loss": 0.8446, "mean_token_accuracy": 0.7505712598562241, "step": 1205 }, { "epoch": 0.8955500046257748, "grad_norm": 0.7760798741879421, "learning_rate": 6.572636266070265e-07, "loss": 0.8515, "mean_token_accuracy": 0.7495753020048141, "step": 1210 }, { "epoch": 0.8992506244796004, "grad_norm": 0.7375925847527678, "learning_rate": 6.119487171899807e-07, "loss": 0.8151, "mean_token_accuracy": 0.7605884402990342, "step": 1215 }, { "epoch": 0.9029512443334259, "grad_norm": 0.8483447358597658, "learning_rate": 5.682029293786673e-07, "loss": 0.8508, "mean_token_accuracy": 0.7498715907335282, "step": 1220 }, { "epoch": 0.9066518641872514, "grad_norm": 0.7427906597662893, "learning_rate": 5.26033574856708e-07, "loss": 0.8254, "mean_token_accuracy": 0.7563312321901321, "step": 1225 }, { "epoch": 0.9103524840410768, "grad_norm": 0.720458409183581, "learning_rate": 4.854477018222103e-07, "loss": 0.8374, "mean_token_accuracy": 0.7546701580286026, "step": 1230 }, { "epoch": 0.9140531038949024, "grad_norm": 0.7702820261310465, "learning_rate": 4.464520938097294e-07, "loss": 0.8397, "mean_token_accuracy": 0.7535283699631691, "step": 1235 }, { "epoch": 0.9177537237487279, "grad_norm": 0.8128830297797902, "learning_rate": 4.0905326855646186e-07, "loss": 0.855, "mean_token_accuracy": 0.7464580446481704, "step": 1240 }, { "epoch": 0.9214543436025534, "grad_norm": 0.7719201359932438, "learning_rate": 3.732574769128738e-07, "loss": 0.8585, "mean_token_accuracy": 0.7479515969753265, "step": 1245 }, { "epoch": 0.9251549634563789, "grad_norm": 0.767822410168291, "learning_rate": 3.390707017979311e-07, "loss": 0.8417, "mean_token_accuracy": 0.7507120683789253, "step": 1250 }, { "epoch": 0.9288555833102045, "grad_norm": 0.7931837924998084, "learning_rate": 3.06498657199108e-07, "loss": 0.8423, "mean_token_accuracy": 0.7526604071259498, "step": 1255 }, { "epoch": 0.93255620316403, "grad_norm": 0.7402614770859547, "learning_rate": 2.7554678721735675e-07, "loss": 0.831, "mean_token_accuracy": 0.7547580033540726, "step": 1260 }, { "epoch": 0.9362568230178555, "grad_norm": 0.7265080682796597, "learning_rate": 2.4622026515717654e-07, "loss": 0.8577, "mean_token_accuracy": 0.7485264018177986, "step": 1265 }, { "epoch": 0.939957442871681, "grad_norm": 0.7994816530184113, "learning_rate": 2.1852399266194312e-07, "loss": 0.8384, "mean_token_accuracy": 0.7530582755804062, "step": 1270 }, { "epoch": 0.9436580627255066, "grad_norm": 0.7408942586414351, "learning_rate": 1.9246259889464935e-07, "loss": 0.8395, "mean_token_accuracy": 0.7526318833231926, "step": 1275 }, { "epoch": 0.9473586825793321, "grad_norm": 0.7531498605702176, "learning_rate": 1.6804043976418438e-07, "loss": 0.8384, "mean_token_accuracy": 0.7517712652683258, "step": 1280 }, { "epoch": 0.9510593024331575, "grad_norm": 0.7640371632548638, "learning_rate": 1.4526159719728595e-07, "loss": 0.843, "mean_token_accuracy": 0.7513806536793709, "step": 1285 }, { "epoch": 0.954759922286983, "grad_norm": 0.7800995207503227, "learning_rate": 1.24129878456285e-07, "loss": 0.8482, "mean_token_accuracy": 0.750167365372181, "step": 1290 }, { "epoch": 0.9584605421408086, "grad_norm": 0.7781230447019286, "learning_rate": 1.0464881550276362e-07, "loss": 0.8571, "mean_token_accuracy": 0.7482250303030014, "step": 1295 }, { "epoch": 0.9621611619946341, "grad_norm": 0.7814221898628717, "learning_rate": 8.682166440721729e-08, "loss": 0.8554, "mean_token_accuracy": 0.7490108326077461, "step": 1300 }, { "epoch": 0.9621611619946341, "eval_loss": 0.8676031827926636, "eval_mean_token_accuracy": 0.7480372389157613, "eval_runtime": 24.4444, "eval_samples_per_second": 20.986, "eval_steps_per_second": 1.35, "step": 1300 }, { "epoch": 0.9658617818484596, "grad_norm": 0.7864179680038271, "learning_rate": 7.065140480483235e-08, "loss": 0.8612, "mean_token_accuracy": 0.7473657980561257, "step": 1305 }, { "epoch": 0.9695624017022851, "grad_norm": 0.7508526784486548, "learning_rate": 5.6140739397474445e-08, "loss": 0.8365, "mean_token_accuracy": 0.7531405627727509, "step": 1310 }, { "epoch": 0.9732630215561107, "grad_norm": 0.7164581752617882, "learning_rate": 4.329209350195651e-08, "loss": 0.8443, "mean_token_accuracy": 0.7516303405165672, "step": 1315 }, { "epoch": 0.9769636414099362, "grad_norm": 0.8498247341964699, "learning_rate": 3.210761464466639e-08, "loss": 0.8374, "mean_token_accuracy": 0.7550172284245491, "step": 1320 }, { "epoch": 0.9806642612637617, "grad_norm": 0.7153929981028929, "learning_rate": 2.2589172202635014e-08, "loss": 0.7977, "mean_token_accuracy": 0.7646528780460358, "step": 1325 }, { "epoch": 0.9843648811175872, "grad_norm": 0.7908203411439174, "learning_rate": 1.4738357091084177e-08, "loss": 0.8371, "mean_token_accuracy": 0.753947702050209, "step": 1330 }, { "epoch": 0.9880655009714127, "grad_norm": 0.7547450484335274, "learning_rate": 8.556481497521418e-09, "loss": 0.8498, "mean_token_accuracy": 0.7504909783601761, "step": 1335 }, { "epoch": 0.9917661208252382, "grad_norm": 0.7442071956341998, "learning_rate": 4.044578662419918e-09, "loss": 0.8265, "mean_token_accuracy": 0.7562764957547188, "step": 1340 }, { "epoch": 0.9954667406790637, "grad_norm": 0.7540578819730456, "learning_rate": 1.203402706525525e-09, "loss": 0.8332, "mean_token_accuracy": 0.7546069085597992, "step": 1345 }, { "epoch": 0.9991673605328892, "grad_norm": 0.7649196329052427, "learning_rate": 3.342850480869686e-11, "loss": 0.805, "mean_token_accuracy": 0.7623659715056419, "step": 1350 }, { "epoch": 0.9999074845036544, "mean_token_accuracy": 0.7686276957392693, "step": 1351, "total_flos": 76959195168768.0, "train_loss": 0.8854525579424984, "train_runtime": 18393.1053, "train_samples_per_second": 4.701, "train_steps_per_second": 0.073 } ], "logging_steps": 5, "max_steps": 1351, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 76959195168768.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }