| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9999074845036544, |
| "eval_steps": 100, |
| "global_step": 1351, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.003700619853825516, |
| "grad_norm": 6.406365314676644, |
| "learning_rate": 7.352941176470589e-07, |
| "loss": 1.3566, |
| "mean_token_accuracy": 0.6660232350230217, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.007401239707651032, |
| "grad_norm": 6.123712399933804, |
| "learning_rate": 1.4705882352941177e-06, |
| "loss": 1.3913, |
| "mean_token_accuracy": 0.6558953940868377, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.011101859561476548, |
| "grad_norm": 5.275448607027431, |
| "learning_rate": 2.2058823529411767e-06, |
| "loss": 1.3404, |
| "mean_token_accuracy": 0.6668361470103263, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.014802479415302064, |
| "grad_norm": 3.222168252214971, |
| "learning_rate": 2.9411764705882355e-06, |
| "loss": 1.2979, |
| "mean_token_accuracy": 0.6731562301516533, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.01850309926912758, |
| "grad_norm": 2.304245354229246, |
| "learning_rate": 3.6764705882352946e-06, |
| "loss": 1.2317, |
| "mean_token_accuracy": 0.6820817664265633, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.022203719122953096, |
| "grad_norm": 2.2635687771667734, |
| "learning_rate": 4.411764705882353e-06, |
| "loss": 1.19, |
| "mean_token_accuracy": 0.6874127045273781, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.025904338976778612, |
| "grad_norm": 1.6106011837583105, |
| "learning_rate": 5.147058823529411e-06, |
| "loss": 1.1346, |
| "mean_token_accuracy": 0.6967306867241859, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.029604958830604128, |
| "grad_norm": 1.2780962221463628, |
| "learning_rate": 5.882352941176471e-06, |
| "loss": 1.0694, |
| "mean_token_accuracy": 0.7095885664224625, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.03330557868442964, |
| "grad_norm": 1.637550942271707, |
| "learning_rate": 6.61764705882353e-06, |
| "loss": 1.056, |
| "mean_token_accuracy": 0.7120530039072037, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.03700619853825516, |
| "grad_norm": 1.2301760518659035, |
| "learning_rate": 7.352941176470589e-06, |
| "loss": 1.03, |
| "mean_token_accuracy": 0.7162090480327606, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.04070681839208067, |
| "grad_norm": 0.9170716667796568, |
| "learning_rate": 8.088235294117648e-06, |
| "loss": 1.0283, |
| "mean_token_accuracy": 0.7158895432949066, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.04440743824590619, |
| "grad_norm": 1.0543932075765083, |
| "learning_rate": 8.823529411764707e-06, |
| "loss": 0.9877, |
| "mean_token_accuracy": 0.7251855596899986, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.048108058099731704, |
| "grad_norm": 0.9843413417015394, |
| "learning_rate": 9.558823529411766e-06, |
| "loss": 0.9974, |
| "mean_token_accuracy": 0.7208766683936119, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.051808677953557224, |
| "grad_norm": 0.9511465656581665, |
| "learning_rate": 1.0294117647058823e-05, |
| "loss": 0.9924, |
| "mean_token_accuracy": 0.7231188163161277, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.055509297807382736, |
| "grad_norm": 2.825049696114851, |
| "learning_rate": 1.1029411764705885e-05, |
| "loss": 0.9782, |
| "mean_token_accuracy": 0.7259459629654884, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.059209917661208256, |
| "grad_norm": 0.938362593121529, |
| "learning_rate": 1.1764705882352942e-05, |
| "loss": 0.9686, |
| "mean_token_accuracy": 0.7275517180562019, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.06291053751503377, |
| "grad_norm": 0.9202800308554298, |
| "learning_rate": 1.25e-05, |
| "loss": 0.9794, |
| "mean_token_accuracy": 0.7241677865386009, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.06661115736885928, |
| "grad_norm": 0.9332127502056355, |
| "learning_rate": 1.323529411764706e-05, |
| "loss": 0.9497, |
| "mean_token_accuracy": 0.731817427277565, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0703117772226848, |
| "grad_norm": 1.0837119699847717, |
| "learning_rate": 1.3970588235294118e-05, |
| "loss": 0.9852, |
| "mean_token_accuracy": 0.7222246199846267, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.07401239707651032, |
| "grad_norm": 0.8948953129317817, |
| "learning_rate": 1.4705882352941179e-05, |
| "loss": 0.9463, |
| "mean_token_accuracy": 0.7313005596399307, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07401239707651032, |
| "eval_loss": 0.9788710474967957, |
| "eval_mean_token_accuracy": 0.7260306134368434, |
| "eval_runtime": 24.5669, |
| "eval_samples_per_second": 20.882, |
| "eval_steps_per_second": 1.343, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07771301693033583, |
| "grad_norm": 0.9653033723452836, |
| "learning_rate": 1.5441176470588237e-05, |
| "loss": 0.9696, |
| "mean_token_accuracy": 0.7255482524633408, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.08141363678416134, |
| "grad_norm": 0.9349548842470216, |
| "learning_rate": 1.6176470588235296e-05, |
| "loss": 0.9722, |
| "mean_token_accuracy": 0.7248646602034569, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.08511425663798686, |
| "grad_norm": 0.8956374335067367, |
| "learning_rate": 1.6911764705882355e-05, |
| "loss": 0.9591, |
| "mean_token_accuracy": 0.7272942900657654, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.08881487649181238, |
| "grad_norm": 0.9788467585120308, |
| "learning_rate": 1.7647058823529414e-05, |
| "loss": 0.9363, |
| "mean_token_accuracy": 0.7335584476590157, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.0925154963456379, |
| "grad_norm": 0.9134626099519884, |
| "learning_rate": 1.8382352941176472e-05, |
| "loss": 0.9276, |
| "mean_token_accuracy": 0.7360474243760109, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.09621611619946341, |
| "grad_norm": 0.9753561569488689, |
| "learning_rate": 1.911764705882353e-05, |
| "loss": 0.9263, |
| "mean_token_accuracy": 0.7367036134004593, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.09991673605328892, |
| "grad_norm": 0.9447239853385726, |
| "learning_rate": 1.985294117647059e-05, |
| "loss": 0.9358, |
| "mean_token_accuracy": 0.7343964487314224, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.10361735590711445, |
| "grad_norm": 0.9093049891627352, |
| "learning_rate": 1.9999465148392906e-05, |
| "loss": 0.9199, |
| "mean_token_accuracy": 0.7370649874210358, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.10731797576093996, |
| "grad_norm": 0.9478337858776447, |
| "learning_rate": 1.999729241179462e-05, |
| "loss": 0.9378, |
| "mean_token_accuracy": 0.7336751863360405, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.11101859561476547, |
| "grad_norm": 0.8569290211164649, |
| "learning_rate": 1.999344872485215e-05, |
| "loss": 0.9038, |
| "mean_token_accuracy": 0.7422620430588722, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.11471921546859098, |
| "grad_norm": 1.036972518186118, |
| "learning_rate": 1.9987934730000457e-05, |
| "loss": 0.9031, |
| "mean_token_accuracy": 0.741059948503971, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.11841983532241651, |
| "grad_norm": 0.951663289147615, |
| "learning_rate": 1.998075134885022e-05, |
| "loss": 0.9171, |
| "mean_token_accuracy": 0.7384189561009407, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.12212045517624202, |
| "grad_norm": 0.8919119572868265, |
| "learning_rate": 1.9971899782033853e-05, |
| "loss": 0.9378, |
| "mean_token_accuracy": 0.7339910715818405, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.12582107503006754, |
| "grad_norm": 0.9627820867919544, |
| "learning_rate": 1.9961381509004785e-05, |
| "loss": 0.9548, |
| "mean_token_accuracy": 0.7276575401425361, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.12952169488389306, |
| "grad_norm": 1.0867703495663577, |
| "learning_rate": 1.9949198287790215e-05, |
| "loss": 0.8957, |
| "mean_token_accuracy": 0.7430507227778435, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.13322231473771856, |
| "grad_norm": 0.9434565750923781, |
| "learning_rate": 1.9935352154697257e-05, |
| "loss": 0.9326, |
| "mean_token_accuracy": 0.7323492169380188, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.1369229345915441, |
| "grad_norm": 0.9338703985952416, |
| "learning_rate": 1.9919845423972603e-05, |
| "loss": 0.9152, |
| "mean_token_accuracy": 0.7383304908871651, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.1406235544453696, |
| "grad_norm": 0.8449751394948155, |
| "learning_rate": 1.9902680687415704e-05, |
| "loss": 0.9115, |
| "mean_token_accuracy": 0.7386730879545211, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.1443241742991951, |
| "grad_norm": 0.9634477921697367, |
| "learning_rate": 1.9883860813945596e-05, |
| "loss": 0.9472, |
| "mean_token_accuracy": 0.7278202414512634, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.14802479415302064, |
| "grad_norm": 0.8797771752554515, |
| "learning_rate": 1.986338894912137e-05, |
| "loss": 0.9133, |
| "mean_token_accuracy": 0.7370981857180595, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.14802479415302064, |
| "eval_loss": 0.9444138407707214, |
| "eval_mean_token_accuracy": 0.7318792343139648, |
| "eval_runtime": 24.471, |
| "eval_samples_per_second": 20.964, |
| "eval_steps_per_second": 1.349, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.15172541400684614, |
| "grad_norm": 0.9730130557080766, |
| "learning_rate": 1.9841268514616434e-05, |
| "loss": 0.9072, |
| "mean_token_accuracy": 0.7391636997461319, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.15542603386067166, |
| "grad_norm": 0.9307244607734099, |
| "learning_rate": 1.9817503207646606e-05, |
| "loss": 0.9083, |
| "mean_token_accuracy": 0.7380757689476013, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.1591266537144972, |
| "grad_norm": 0.8802319735889644, |
| "learning_rate": 1.979209700035216e-05, |
| "loss": 0.9354, |
| "mean_token_accuracy": 0.7329269647598267, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.1628272735683227, |
| "grad_norm": 0.8725363780112383, |
| "learning_rate": 1.976505413913393e-05, |
| "loss": 0.9128, |
| "mean_token_accuracy": 0.7383672297000885, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.16652789342214822, |
| "grad_norm": 0.903336658421562, |
| "learning_rate": 1.9736379143943565e-05, |
| "loss": 0.9385, |
| "mean_token_accuracy": 0.7308298230171204, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.17022851327597371, |
| "grad_norm": 0.8621415714119711, |
| "learning_rate": 1.9706076807528044e-05, |
| "loss": 0.9239, |
| "mean_token_accuracy": 0.7339740186929703, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.17392913312979924, |
| "grad_norm": 0.9079088453730583, |
| "learning_rate": 1.967415219462864e-05, |
| "loss": 0.9217, |
| "mean_token_accuracy": 0.7344419181346893, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.17762975298362477, |
| "grad_norm": 0.8253363960767214, |
| "learning_rate": 1.9640610641134383e-05, |
| "loss": 0.9103, |
| "mean_token_accuracy": 0.7382244557142258, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.18133037283745027, |
| "grad_norm": 0.934084069747635, |
| "learning_rate": 1.9605457753190224e-05, |
| "loss": 0.9152, |
| "mean_token_accuracy": 0.7362594842910767, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.1850309926912758, |
| "grad_norm": 0.8995132563871583, |
| "learning_rate": 1.9568699406260016e-05, |
| "loss": 0.931, |
| "mean_token_accuracy": 0.7331584557890892, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.18873161254510132, |
| "grad_norm": 0.8420380152677727, |
| "learning_rate": 1.953034174414449e-05, |
| "loss": 0.8976, |
| "mean_token_accuracy": 0.7409126415848732, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.19243223239892682, |
| "grad_norm": 0.8794392938382097, |
| "learning_rate": 1.9490391177954383e-05, |
| "loss": 0.8971, |
| "mean_token_accuracy": 0.7426333785057068, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.19613285225275234, |
| "grad_norm": 0.871108034639782, |
| "learning_rate": 1.944885438503888e-05, |
| "loss": 0.9153, |
| "mean_token_accuracy": 0.7366316050291062, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.19983347210657784, |
| "grad_norm": 0.8826333710088322, |
| "learning_rate": 1.9405738307869565e-05, |
| "loss": 0.9093, |
| "mean_token_accuracy": 0.7374404519796371, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.20353409196040337, |
| "grad_norm": 0.9186513628105667, |
| "learning_rate": 1.936105015288003e-05, |
| "loss": 0.9206, |
| "mean_token_accuracy": 0.7328536227345467, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.2072347118142289, |
| "grad_norm": 0.892096099101671, |
| "learning_rate": 1.9314797389261426e-05, |
| "loss": 0.9193, |
| "mean_token_accuracy": 0.7348861888051033, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.2109353316680544, |
| "grad_norm": 0.8240507326046963, |
| "learning_rate": 1.9266987747714036e-05, |
| "loss": 0.897, |
| "mean_token_accuracy": 0.7411905080080032, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.21463595152187992, |
| "grad_norm": 0.867466389731865, |
| "learning_rate": 1.9217629219155172e-05, |
| "loss": 0.9087, |
| "mean_token_accuracy": 0.7377493485808373, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.21833657137570542, |
| "grad_norm": 0.8533932703930674, |
| "learning_rate": 1.916673005338357e-05, |
| "loss": 0.8899, |
| "mean_token_accuracy": 0.7420720800757408, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.22203719122953094, |
| "grad_norm": 0.8385314803046641, |
| "learning_rate": 1.9114298757700508e-05, |
| "loss": 0.9295, |
| "mean_token_accuracy": 0.7315363213419914, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.22203719122953094, |
| "eval_loss": 0.9261447191238403, |
| "eval_mean_token_accuracy": 0.7356406341899525, |
| "eval_runtime": 24.4594, |
| "eval_samples_per_second": 20.974, |
| "eval_steps_per_second": 1.349, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.22573781108335647, |
| "grad_norm": 0.8344167353514693, |
| "learning_rate": 1.9060344095487916e-05, |
| "loss": 0.9021, |
| "mean_token_accuracy": 0.7388954728841781, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.22943843093718197, |
| "grad_norm": 0.8166938367608658, |
| "learning_rate": 1.9004875084743624e-05, |
| "loss": 0.8745, |
| "mean_token_accuracy": 0.7464573740959167, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.2331390507910075, |
| "grad_norm": 0.8583610850262193, |
| "learning_rate": 1.8947900996574133e-05, |
| "loss": 0.8906, |
| "mean_token_accuracy": 0.7421576008200645, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.23683967064483302, |
| "grad_norm": 0.85619616919622, |
| "learning_rate": 1.8889431353645004e-05, |
| "loss": 0.9083, |
| "mean_token_accuracy": 0.7389416947960854, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.24054029049865852, |
| "grad_norm": 0.95321268460484, |
| "learning_rate": 1.8829475928589272e-05, |
| "loss": 0.9037, |
| "mean_token_accuracy": 0.7408745899796486, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.24424091035248405, |
| "grad_norm": 0.9192097687099984, |
| "learning_rate": 1.8768044742374008e-05, |
| "loss": 0.8844, |
| "mean_token_accuracy": 0.7437578573822975, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.24794153020630955, |
| "grad_norm": 0.8224814882376746, |
| "learning_rate": 1.870514806262544e-05, |
| "loss": 0.8793, |
| "mean_token_accuracy": 0.7451973140239716, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.2516421500601351, |
| "grad_norm": 0.8277417287111005, |
| "learning_rate": 1.8640796401912805e-05, |
| "loss": 0.8937, |
| "mean_token_accuracy": 0.7406307712197304, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.2553427699139606, |
| "grad_norm": 0.8459764358981483, |
| "learning_rate": 1.8575000515991283e-05, |
| "loss": 0.8979, |
| "mean_token_accuracy": 0.7395652890205383, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.2590433897677861, |
| "grad_norm": 0.8033711981121063, |
| "learning_rate": 1.850777140200427e-05, |
| "loss": 0.8813, |
| "mean_token_accuracy": 0.7449199944734574, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.2627440096216116, |
| "grad_norm": 0.7910864449390781, |
| "learning_rate": 1.843912029664531e-05, |
| "loss": 0.8965, |
| "mean_token_accuracy": 0.7398232161998749, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.2664446294754371, |
| "grad_norm": 0.8451406694855826, |
| "learning_rate": 1.8369058674280004e-05, |
| "loss": 0.8943, |
| "mean_token_accuracy": 0.7411910384893418, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.27014524932926265, |
| "grad_norm": 0.8333682523549819, |
| "learning_rate": 1.8297598245028173e-05, |
| "loss": 0.9132, |
| "mean_token_accuracy": 0.7367936804890632, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.2738458691830882, |
| "grad_norm": 0.8175542933467003, |
| "learning_rate": 1.8224750952806626e-05, |
| "loss": 0.8769, |
| "mean_token_accuracy": 0.7455912932753563, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.2775464890369137, |
| "grad_norm": 0.8434576136182329, |
| "learning_rate": 1.815052897333284e-05, |
| "loss": 0.8808, |
| "mean_token_accuracy": 0.7459388568997383, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.2812471088907392, |
| "grad_norm": 0.8936045983724704, |
| "learning_rate": 1.8074944712089925e-05, |
| "loss": 0.9024, |
| "mean_token_accuracy": 0.7386106207966805, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.2849477287445647, |
| "grad_norm": 0.8151663680460399, |
| "learning_rate": 1.799801080225316e-05, |
| "loss": 0.8912, |
| "mean_token_accuracy": 0.7416763469576836, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.2886483485983902, |
| "grad_norm": 0.8482431323316972, |
| "learning_rate": 1.7919740102578482e-05, |
| "loss": 0.8887, |
| "mean_token_accuracy": 0.7428235754370689, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.29234896845221575, |
| "grad_norm": 0.8424168395876551, |
| "learning_rate": 1.7840145695253258e-05, |
| "loss": 0.8769, |
| "mean_token_accuracy": 0.744986218214035, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.2960495883060413, |
| "grad_norm": 0.7880509490552058, |
| "learning_rate": 1.7759240883709745e-05, |
| "loss": 0.8706, |
| "mean_token_accuracy": 0.74755879342556, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2960495883060413, |
| "eval_loss": 0.9140655398368835, |
| "eval_mean_token_accuracy": 0.7376509543621179, |
| "eval_runtime": 24.4423, |
| "eval_samples_per_second": 20.988, |
| "eval_steps_per_second": 1.35, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2997502081598668, |
| "grad_norm": 0.9325819637971632, |
| "learning_rate": 1.7677039190401538e-05, |
| "loss": 0.8778, |
| "mean_token_accuracy": 0.7454720690846444, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.3034508280136923, |
| "grad_norm": 0.855702135031522, |
| "learning_rate": 1.759355435454342e-05, |
| "loss": 0.8825, |
| "mean_token_accuracy": 0.7436973512172699, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.3071514478675178, |
| "grad_norm": 0.8190150096476719, |
| "learning_rate": 1.7508800329814993e-05, |
| "loss": 0.8801, |
| "mean_token_accuracy": 0.7447807624936104, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.31085206772134333, |
| "grad_norm": 0.7879901787308187, |
| "learning_rate": 1.7422791282028457e-05, |
| "loss": 0.8926, |
| "mean_token_accuracy": 0.7424971207976341, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.31455268757516885, |
| "grad_norm": 0.8418635735026896, |
| "learning_rate": 1.7335541586760928e-05, |
| "loss": 0.8962, |
| "mean_token_accuracy": 0.740935817360878, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.3182533074289944, |
| "grad_norm": 0.8832514235130526, |
| "learning_rate": 1.7247065826951694e-05, |
| "loss": 0.8843, |
| "mean_token_accuracy": 0.7440172478556633, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.32195392728281985, |
| "grad_norm": 0.9664717868103249, |
| "learning_rate": 1.715737879046483e-05, |
| "loss": 0.8955, |
| "mean_token_accuracy": 0.7396286860108375, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.3256545471366454, |
| "grad_norm": 0.8907608106819015, |
| "learning_rate": 1.7066495467617552e-05, |
| "loss": 0.891, |
| "mean_token_accuracy": 0.7419100046157837, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.3293551669904709, |
| "grad_norm": 0.7921927517011862, |
| "learning_rate": 1.6974431048674714e-05, |
| "loss": 0.8823, |
| "mean_token_accuracy": 0.7444690898060798, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.33305578684429643, |
| "grad_norm": 0.8845253263563585, |
| "learning_rate": 1.6881200921309914e-05, |
| "loss": 0.8908, |
| "mean_token_accuracy": 0.7414721488952637, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.33675640669812196, |
| "grad_norm": 0.9038197553584879, |
| "learning_rate": 1.6786820668033596e-05, |
| "loss": 0.8902, |
| "mean_token_accuracy": 0.7424716472625732, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.34045702655194743, |
| "grad_norm": 0.845478277089973, |
| "learning_rate": 1.6691306063588583e-05, |
| "loss": 0.8862, |
| "mean_token_accuracy": 0.7440178290009498, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.34415764640577295, |
| "grad_norm": 0.7780737971349384, |
| "learning_rate": 1.6594673072313478e-05, |
| "loss": 0.8683, |
| "mean_token_accuracy": 0.7474384486675263, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.3478582662595985, |
| "grad_norm": 0.7883315377481923, |
| "learning_rate": 1.6496937845474375e-05, |
| "loss": 0.852, |
| "mean_token_accuracy": 0.7510827273130417, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.351558886113424, |
| "grad_norm": 0.8522289082869288, |
| "learning_rate": 1.639811671856535e-05, |
| "loss": 0.8999, |
| "mean_token_accuracy": 0.7387788712978363, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.35525950596724953, |
| "grad_norm": 0.8084630790546208, |
| "learning_rate": 1.6298226208578127e-05, |
| "loss": 0.8818, |
| "mean_token_accuracy": 0.7436941042542458, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.358960125821075, |
| "grad_norm": 0.9229148133761365, |
| "learning_rate": 1.6197283011241423e-05, |
| "loss": 0.8909, |
| "mean_token_accuracy": 0.7398700997233391, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.36266074567490053, |
| "grad_norm": 0.7838660795136883, |
| "learning_rate": 1.6095303998230432e-05, |
| "loss": 0.8747, |
| "mean_token_accuracy": 0.7453865185379982, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.36636136552872606, |
| "grad_norm": 0.8988778984216769, |
| "learning_rate": 1.599230621434687e-05, |
| "loss": 0.8798, |
| "mean_token_accuracy": 0.7436122760176659, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.3700619853825516, |
| "grad_norm": 0.8288040124260576, |
| "learning_rate": 1.5888306874670112e-05, |
| "loss": 0.8839, |
| "mean_token_accuracy": 0.7431044474244117, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3700619853825516, |
| "eval_loss": 0.90340656042099, |
| "eval_mean_token_accuracy": 0.7403560316923893, |
| "eval_runtime": 24.4427, |
| "eval_samples_per_second": 20.988, |
| "eval_steps_per_second": 1.35, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3737626052363771, |
| "grad_norm": 0.8197782426055139, |
| "learning_rate": 1.5783323361679865e-05, |
| "loss": 0.8799, |
| "mean_token_accuracy": 0.7442870959639549, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.37746322509020264, |
| "grad_norm": 0.7912056501798941, |
| "learning_rate": 1.567737322235084e-05, |
| "loss": 0.8661, |
| "mean_token_accuracy": 0.7487552657723426, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.3811638449440281, |
| "grad_norm": 0.8680442053546329, |
| "learning_rate": 1.557047416521996e-05, |
| "loss": 0.8446, |
| "mean_token_accuracy": 0.7539586842060089, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.38486446479785363, |
| "grad_norm": 0.7966248473049342, |
| "learning_rate": 1.546264405742654e-05, |
| "loss": 0.8894, |
| "mean_token_accuracy": 0.7421392247080802, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.38856508465167916, |
| "grad_norm": 0.7832801079547608, |
| "learning_rate": 1.535390092172597e-05, |
| "loss": 0.8401, |
| "mean_token_accuracy": 0.7548262551426888, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.3922657045055047, |
| "grad_norm": 0.8617978523223222, |
| "learning_rate": 1.5244262933477401e-05, |
| "loss": 0.8676, |
| "mean_token_accuracy": 0.7473892971873284, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.3959663243593302, |
| "grad_norm": 0.8638721458324728, |
| "learning_rate": 1.5133748417605878e-05, |
| "loss": 0.8734, |
| "mean_token_accuracy": 0.7463245391845703, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.3996669442131557, |
| "grad_norm": 0.8732616764981507, |
| "learning_rate": 1.5022375845539537e-05, |
| "loss": 0.8742, |
| "mean_token_accuracy": 0.74578056037426, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.4033675640669812, |
| "grad_norm": 0.8680491958485625, |
| "learning_rate": 1.4910163832122278e-05, |
| "loss": 0.9111, |
| "mean_token_accuracy": 0.7350529983639718, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.40706818392080674, |
| "grad_norm": 0.845161443541315, |
| "learning_rate": 1.4797131132502464e-05, |
| "loss": 0.8543, |
| "mean_token_accuracy": 0.7507802724838257, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.41076880377463226, |
| "grad_norm": 0.7949620736755997, |
| "learning_rate": 1.4683296638998192e-05, |
| "loss": 0.8959, |
| "mean_token_accuracy": 0.7393335103988647, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.4144694236284578, |
| "grad_norm": 0.8137916315549841, |
| "learning_rate": 1.4568679377939619e-05, |
| "loss": 0.8599, |
| "mean_token_accuracy": 0.7493755236268044, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.41817004348228326, |
| "grad_norm": 0.8553447099213015, |
| "learning_rate": 1.4453298506488896e-05, |
| "loss": 0.8562, |
| "mean_token_accuracy": 0.7500303864479065, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.4218706633361088, |
| "grad_norm": 0.78741790468558, |
| "learning_rate": 1.4337173309438236e-05, |
| "loss": 0.8513, |
| "mean_token_accuracy": 0.751435661315918, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.4255712831899343, |
| "grad_norm": 0.8198917515370167, |
| "learning_rate": 1.4220323195986649e-05, |
| "loss": 0.8892, |
| "mean_token_accuracy": 0.740348969399929, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.42927190304375984, |
| "grad_norm": 0.8907770425857867, |
| "learning_rate": 1.4102767696495885e-05, |
| "loss": 0.87, |
| "mean_token_accuracy": 0.7466455265879631, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.43297252289758537, |
| "grad_norm": 0.7746865966025234, |
| "learning_rate": 1.398452645922611e-05, |
| "loss": 0.8543, |
| "mean_token_accuracy": 0.7507255434989929, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.43667314275141084, |
| "grad_norm": 0.8081107449852742, |
| "learning_rate": 1.3865619247051916e-05, |
| "loss": 0.8942, |
| "mean_token_accuracy": 0.7397695079445838, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.44037376260523636, |
| "grad_norm": 0.7928994894049232, |
| "learning_rate": 1.3746065934159123e-05, |
| "loss": 0.8711, |
| "mean_token_accuracy": 0.7459681868553162, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.4440743824590619, |
| "grad_norm": 0.7457840735614479, |
| "learning_rate": 1.3625886502723008e-05, |
| "loss": 0.8838, |
| "mean_token_accuracy": 0.7431347534060478, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.4440743824590619, |
| "eval_loss": 0.8946976065635681, |
| "eval_mean_token_accuracy": 0.7420943668394377, |
| "eval_runtime": 24.4316, |
| "eval_samples_per_second": 20.997, |
| "eval_steps_per_second": 1.351, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.4477750023128874, |
| "grad_norm": 0.7926368650744795, |
| "learning_rate": 1.3505101039568494e-05, |
| "loss": 0.8728, |
| "mean_token_accuracy": 0.7459641486406327, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.45147562216671294, |
| "grad_norm": 0.8000063636466884, |
| "learning_rate": 1.3383729732812814e-05, |
| "loss": 0.8669, |
| "mean_token_accuracy": 0.7467781469225884, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.4551762420205384, |
| "grad_norm": 0.8077945425677387, |
| "learning_rate": 1.3261792868491267e-05, |
| "loss": 0.8805, |
| "mean_token_accuracy": 0.7430326372385025, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.45887686187436394, |
| "grad_norm": 0.7941374586541651, |
| "learning_rate": 1.3139310827166613e-05, |
| "loss": 0.859, |
| "mean_token_accuracy": 0.7493226900696754, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.46257748172818947, |
| "grad_norm": 0.8015163522302382, |
| "learning_rate": 1.3016304080522657e-05, |
| "loss": 0.8573, |
| "mean_token_accuracy": 0.7498888701200486, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.466278101582015, |
| "grad_norm": 0.8987066171816293, |
| "learning_rate": 1.2892793187942588e-05, |
| "loss": 0.8589, |
| "mean_token_accuracy": 0.7495828256011009, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.4699787214358405, |
| "grad_norm": 0.8210878729300097, |
| "learning_rate": 1.2768798793072708e-05, |
| "loss": 0.8397, |
| "mean_token_accuracy": 0.7545604810118676, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.47367934128966604, |
| "grad_norm": 0.8134721151372588, |
| "learning_rate": 1.2644341620372025e-05, |
| "loss": 0.9003, |
| "mean_token_accuracy": 0.7380212768912315, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.4773799611434915, |
| "grad_norm": 0.7617180215752571, |
| "learning_rate": 1.2519442471648364e-05, |
| "loss": 0.8451, |
| "mean_token_accuracy": 0.7525444984436035, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.48108058099731704, |
| "grad_norm": 0.7862729854353447, |
| "learning_rate": 1.2394122222581557e-05, |
| "loss": 0.844, |
| "mean_token_accuracy": 0.7545543164014816, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.48478120085114257, |
| "grad_norm": 0.7602815222283319, |
| "learning_rate": 1.226840181923427e-05, |
| "loss": 0.8628, |
| "mean_token_accuracy": 0.7499380096793175, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.4884818207049681, |
| "grad_norm": 0.7942344985162466, |
| "learning_rate": 1.214230227455106e-05, |
| "loss": 0.8903, |
| "mean_token_accuracy": 0.7406057506799698, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.4921824405587936, |
| "grad_norm": 0.827163943781085, |
| "learning_rate": 1.201584466484629e-05, |
| "loss": 0.8781, |
| "mean_token_accuracy": 0.7433477059006691, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.4958830604126191, |
| "grad_norm": 0.8927178947512965, |
| "learning_rate": 1.1889050126281405e-05, |
| "loss": 0.8762, |
| "mean_token_accuracy": 0.7436618626117706, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.4995836802664446, |
| "grad_norm": 0.893577761513805, |
| "learning_rate": 1.1761939851332241e-05, |
| "loss": 0.8583, |
| "mean_token_accuracy": 0.7491728380322457, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.5032843001202701, |
| "grad_norm": 0.8911659991233541, |
| "learning_rate": 1.1634535085246903e-05, |
| "loss": 0.8347, |
| "mean_token_accuracy": 0.7567671954631805, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.5069849199740957, |
| "grad_norm": 0.7822165331384006, |
| "learning_rate": 1.1506857122494832e-05, |
| "loss": 0.8404, |
| "mean_token_accuracy": 0.754469695687294, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.5106855398279212, |
| "grad_norm": 0.7759748923454998, |
| "learning_rate": 1.1378927303207637e-05, |
| "loss": 0.8741, |
| "mean_token_accuracy": 0.7437764629721642, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.5143861596817467, |
| "grad_norm": 0.8502606153370593, |
| "learning_rate": 1.12507670096123e-05, |
| "loss": 0.8582, |
| "mean_token_accuracy": 0.7506067097187042, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.5180867795355723, |
| "grad_norm": 0.7749617835044338, |
| "learning_rate": 1.1122397662457352e-05, |
| "loss": 0.8538, |
| "mean_token_accuracy": 0.7503936603665352, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.5180867795355723, |
| "eval_loss": 0.888458788394928, |
| "eval_mean_token_accuracy": 0.7433922489484152, |
| "eval_runtime": 24.5105, |
| "eval_samples_per_second": 20.93, |
| "eval_steps_per_second": 1.346, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.5217873993893978, |
| "grad_norm": 0.7706407219978753, |
| "learning_rate": 1.0993840717432582e-05, |
| "loss": 0.8553, |
| "mean_token_accuracy": 0.7487444669008255, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.5254880192432232, |
| "grad_norm": 0.7705189840242923, |
| "learning_rate": 1.0865117661582958e-05, |
| "loss": 0.8858, |
| "mean_token_accuracy": 0.7415965974330903, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.5291886390970487, |
| "grad_norm": 0.8411896195744224, |
| "learning_rate": 1.0736250009717249e-05, |
| "loss": 0.8884, |
| "mean_token_accuracy": 0.7412499740719796, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.5328892589508742, |
| "grad_norm": 0.8230171145609085, |
| "learning_rate": 1.0607259300812047e-05, |
| "loss": 0.8705, |
| "mean_token_accuracy": 0.745503506064415, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.5365898788046998, |
| "grad_norm": 0.8401376931438745, |
| "learning_rate": 1.0478167094411733e-05, |
| "loss": 0.8707, |
| "mean_token_accuracy": 0.7456582605838775, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.5402904986585253, |
| "grad_norm": 0.7558836285060193, |
| "learning_rate": 1.0348994967025012e-05, |
| "loss": 0.8545, |
| "mean_token_accuracy": 0.7487017199397087, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.5439911185123508, |
| "grad_norm": 0.7642061571005638, |
| "learning_rate": 1.0219764508518595e-05, |
| "loss": 0.8654, |
| "mean_token_accuracy": 0.7462582185864448, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.5476917383661764, |
| "grad_norm": 0.8147049407956067, |
| "learning_rate": 1.0090497318508687e-05, |
| "loss": 0.8424, |
| "mean_token_accuracy": 0.75324746966362, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.5513923582200019, |
| "grad_norm": 0.7571808100586339, |
| "learning_rate": 9.961215002750799e-06, |
| "loss": 0.8588, |
| "mean_token_accuracy": 0.7491571202874183, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.5550929780738274, |
| "grad_norm": 0.8038741472628308, |
| "learning_rate": 9.831939169528565e-06, |
| "loss": 0.8582, |
| "mean_token_accuracy": 0.7493733122944832, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.5587935979276529, |
| "grad_norm": 0.848259052134148, |
| "learning_rate": 9.702691426042124e-06, |
| "loss": 0.8526, |
| "mean_token_accuracy": 0.7506043568253518, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.5624942177814783, |
| "grad_norm": 0.7842186156617634, |
| "learning_rate": 9.573493374796694e-06, |
| "loss": 0.8369, |
| "mean_token_accuracy": 0.755862507224083, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.5661948376353039, |
| "grad_norm": 0.7485121651837016, |
| "learning_rate": 9.444366609991916e-06, |
| "loss": 0.8729, |
| "mean_token_accuracy": 0.7438469439744949, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.5698954574891294, |
| "grad_norm": 0.9176291688747804, |
| "learning_rate": 9.315332713912593e-06, |
| "loss": 0.8815, |
| "mean_token_accuracy": 0.7436545789241791, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.5735960773429549, |
| "grad_norm": 0.8416476293024356, |
| "learning_rate": 9.18641325332142e-06, |
| "loss": 0.8579, |
| "mean_token_accuracy": 0.7489444330334664, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.5772966971967805, |
| "grad_norm": 0.8665126504141849, |
| "learning_rate": 9.057629775854314e-06, |
| "loss": 0.8691, |
| "mean_token_accuracy": 0.7452979102730751, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.580997317050606, |
| "grad_norm": 0.7873121719067726, |
| "learning_rate": 8.929003806418934e-06, |
| "loss": 0.8609, |
| "mean_token_accuracy": 0.7471121177077293, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.5846979369044315, |
| "grad_norm": 0.7603699249259593, |
| "learning_rate": 8.800556843597002e-06, |
| "loss": 0.8303, |
| "mean_token_accuracy": 0.7559542834758759, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.588398556758257, |
| "grad_norm": 0.8208200680668915, |
| "learning_rate": 8.672310356051023e-06, |
| "loss": 0.8755, |
| "mean_token_accuracy": 0.742991179227829, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.5920991766120826, |
| "grad_norm": 0.7539353236493428, |
| "learning_rate": 8.544285778936004e-06, |
| "loss": 0.8604, |
| "mean_token_accuracy": 0.7469533935189248, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5920991766120826, |
| "eval_loss": 0.882485568523407, |
| "eval_mean_token_accuracy": 0.7446909926154397, |
| "eval_runtime": 24.5244, |
| "eval_samples_per_second": 20.918, |
| "eval_steps_per_second": 1.346, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5957997964659081, |
| "grad_norm": 0.7747168848469933, |
| "learning_rate": 8.416504510316774e-06, |
| "loss": 0.8687, |
| "mean_token_accuracy": 0.7457119628787041, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.5995004163197336, |
| "grad_norm": 0.7965745611460714, |
| "learning_rate": 8.28898790759152e-06, |
| "loss": 0.8719, |
| "mean_token_accuracy": 0.7440481051802635, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.603201036173559, |
| "grad_norm": 0.7287090635465163, |
| "learning_rate": 8.161757283922084e-06, |
| "loss": 0.8259, |
| "mean_token_accuracy": 0.7569043532013893, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.6069016560273846, |
| "grad_norm": 0.7849161686448317, |
| "learning_rate": 8.034833904671698e-06, |
| "loss": 0.8725, |
| "mean_token_accuracy": 0.7442520692944526, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.6106022758812101, |
| "grad_norm": 0.7891027028253116, |
| "learning_rate": 7.908238983850666e-06, |
| "loss": 0.8365, |
| "mean_token_accuracy": 0.75490812510252, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.6143028957350356, |
| "grad_norm": 0.7870957116065473, |
| "learning_rate": 7.781993680570656e-06, |
| "loss": 0.8576, |
| "mean_token_accuracy": 0.7477666437625885, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.6180035155888611, |
| "grad_norm": 0.8664448499276786, |
| "learning_rate": 7.656119095508155e-06, |
| "loss": 0.8332, |
| "mean_token_accuracy": 0.7553980827331543, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.6217041354426867, |
| "grad_norm": 0.8003046870817029, |
| "learning_rate": 7.530636267377706e-06, |
| "loss": 0.8559, |
| "mean_token_accuracy": 0.7494190320372581, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.6254047552965122, |
| "grad_norm": 0.8000920295001714, |
| "learning_rate": 7.405566169415481e-06, |
| "loss": 0.8869, |
| "mean_token_accuracy": 0.7401010394096375, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.6291053751503377, |
| "grad_norm": 0.7895598832562634, |
| "learning_rate": 7.280929705873818e-06, |
| "loss": 0.8648, |
| "mean_token_accuracy": 0.7470688298344612, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.6328059950041632, |
| "grad_norm": 0.7756015770435069, |
| "learning_rate": 7.15674770852727e-06, |
| "loss": 0.8294, |
| "mean_token_accuracy": 0.7561267375946045, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.6365066148579888, |
| "grad_norm": 0.7508734070910927, |
| "learning_rate": 7.033040933190776e-06, |
| "loss": 0.8367, |
| "mean_token_accuracy": 0.7520585179328918, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.6402072347118142, |
| "grad_norm": 0.7569428687497424, |
| "learning_rate": 6.909830056250527e-06, |
| "loss": 0.8498, |
| "mean_token_accuracy": 0.7501048058271408, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.6439078545656397, |
| "grad_norm": 0.7984119233647183, |
| "learning_rate": 6.787135671208126e-06, |
| "loss": 0.8571, |
| "mean_token_accuracy": 0.7490562096238136, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.6476084744194652, |
| "grad_norm": 0.8159593622616607, |
| "learning_rate": 6.6649782852385554e-06, |
| "loss": 0.8686, |
| "mean_token_accuracy": 0.7469749033451081, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.6513090942732908, |
| "grad_norm": 0.7631802495882793, |
| "learning_rate": 6.543378315762634e-06, |
| "loss": 0.8657, |
| "mean_token_accuracy": 0.7458467945456505, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.6550097141271163, |
| "grad_norm": 0.812760399325826, |
| "learning_rate": 6.42235608703441e-06, |
| "loss": 0.8635, |
| "mean_token_accuracy": 0.7484910488128662, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.6587103339809418, |
| "grad_norm": 0.8219546330443505, |
| "learning_rate": 6.301931826744189e-06, |
| "loss": 0.8363, |
| "mean_token_accuracy": 0.7542862921953202, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.6624109538347673, |
| "grad_norm": 0.7882602270918212, |
| "learning_rate": 6.18212566263765e-06, |
| "loss": 0.864, |
| "mean_token_accuracy": 0.7470616161823272, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.6661115736885929, |
| "grad_norm": 0.8408472925032482, |
| "learning_rate": 6.0629576191517035e-06, |
| "loss": 0.8619, |
| "mean_token_accuracy": 0.74820506721735, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6661115736885929, |
| "eval_loss": 0.8769035935401917, |
| "eval_mean_token_accuracy": 0.7459533539685336, |
| "eval_runtime": 24.4702, |
| "eval_samples_per_second": 20.964, |
| "eval_steps_per_second": 1.349, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6698121935424184, |
| "grad_norm": 0.7754763488387619, |
| "learning_rate": 5.944447614067588e-06, |
| "loss": 0.868, |
| "mean_token_accuracy": 0.7466130316257477, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.6735128133962439, |
| "grad_norm": 0.7533565990898803, |
| "learning_rate": 5.8266154551818225e-06, |
| "loss": 0.8561, |
| "mean_token_accuracy": 0.7474694743752479, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.6772134332500694, |
| "grad_norm": 0.848613374257325, |
| "learning_rate": 5.709480836995509e-06, |
| "loss": 0.867, |
| "mean_token_accuracy": 0.7461282700300217, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.6809140531038949, |
| "grad_norm": 0.7588179424646748, |
| "learning_rate": 5.593063337422595e-06, |
| "loss": 0.8525, |
| "mean_token_accuracy": 0.7497700050473213, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.6846146729577204, |
| "grad_norm": 0.7363445259820587, |
| "learning_rate": 5.477382414517625e-06, |
| "loss": 0.8616, |
| "mean_token_accuracy": 0.7481978580355644, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.6883152928115459, |
| "grad_norm": 0.7698222134303955, |
| "learning_rate": 5.362457403223495e-06, |
| "loss": 0.8553, |
| "mean_token_accuracy": 0.749049125611782, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.6920159126653714, |
| "grad_norm": 0.7718707701904592, |
| "learning_rate": 5.248307512139818e-06, |
| "loss": 0.8616, |
| "mean_token_accuracy": 0.7483461976051331, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.695716532519197, |
| "grad_norm": 0.785721659768036, |
| "learning_rate": 5.134951820312402e-06, |
| "loss": 0.8557, |
| "mean_token_accuracy": 0.7493070676922798, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.6994171523730225, |
| "grad_norm": 0.7715074813141185, |
| "learning_rate": 5.022409274044346e-06, |
| "loss": 0.8493, |
| "mean_token_accuracy": 0.7499154567718506, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.703117772226848, |
| "grad_norm": 0.8335206414183822, |
| "learning_rate": 4.910698683729371e-06, |
| "loss": 0.8373, |
| "mean_token_accuracy": 0.7533765882253647, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.7068183920806735, |
| "grad_norm": 0.7505821456653344, |
| "learning_rate": 4.799838720707847e-06, |
| "loss": 0.8675, |
| "mean_token_accuracy": 0.745770500600338, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.7105190119344991, |
| "grad_norm": 0.7594914324330723, |
| "learning_rate": 4.6898479141460415e-06, |
| "loss": 0.8295, |
| "mean_token_accuracy": 0.756369736790657, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.7142196317883246, |
| "grad_norm": 0.7204641559045669, |
| "learning_rate": 4.580744647939163e-06, |
| "loss": 0.8624, |
| "mean_token_accuracy": 0.7481252133846283, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.71792025164215, |
| "grad_norm": 0.8174852819097401, |
| "learning_rate": 4.472547157638674e-06, |
| "loss": 0.8417, |
| "mean_token_accuracy": 0.7523833066225052, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.7216208714959755, |
| "grad_norm": 0.862526259850387, |
| "learning_rate": 4.365273527404384e-06, |
| "loss": 0.8423, |
| "mean_token_accuracy": 0.754362627863884, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.7253214913498011, |
| "grad_norm": 0.802739929512475, |
| "learning_rate": 4.258941686981864e-06, |
| "loss": 0.8469, |
| "mean_token_accuracy": 0.7522266939282417, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.7290221112036266, |
| "grad_norm": 0.7931469636274437, |
| "learning_rate": 4.15356940870567e-06, |
| "loss": 0.8305, |
| "mean_token_accuracy": 0.7555211216211319, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.7327227310574521, |
| "grad_norm": 0.7884974421740458, |
| "learning_rate": 4.049174304528857e-06, |
| "loss": 0.8327, |
| "mean_token_accuracy": 0.7560925737023354, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.7364233509112776, |
| "grad_norm": 0.7868742780753382, |
| "learning_rate": 3.945773823079315e-06, |
| "loss": 0.8491, |
| "mean_token_accuracy": 0.7494197428226471, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.7401239707651032, |
| "grad_norm": 0.7556900381775965, |
| "learning_rate": 3.8433852467434175e-06, |
| "loss": 0.8337, |
| "mean_token_accuracy": 0.7557049483060837, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.7401239707651032, |
| "eval_loss": 0.872385561466217, |
| "eval_mean_token_accuracy": 0.7468928351546779, |
| "eval_runtime": 24.5427, |
| "eval_samples_per_second": 20.902, |
| "eval_steps_per_second": 1.345, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.7438245906189287, |
| "grad_norm": 0.7642811057752702, |
| "learning_rate": 3.742025688777413e-06, |
| "loss": 0.8082, |
| "mean_token_accuracy": 0.7626117318868637, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.7475252104727542, |
| "grad_norm": 0.7878330759901441, |
| "learning_rate": 3.641712090447125e-06, |
| "loss": 0.8584, |
| "mean_token_accuracy": 0.7494427710771561, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.7512258303265797, |
| "grad_norm": 0.7817001972699995, |
| "learning_rate": 3.542461218196379e-06, |
| "loss": 0.8596, |
| "mean_token_accuracy": 0.7491939187049865, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.7549264501804053, |
| "grad_norm": 0.7922908897094753, |
| "learning_rate": 3.444289660844665e-06, |
| "loss": 0.837, |
| "mean_token_accuracy": 0.7543808072805405, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.7586270700342307, |
| "grad_norm": 0.7945985767364506, |
| "learning_rate": 3.347213826814456e-06, |
| "loss": 0.8662, |
| "mean_token_accuracy": 0.7463411048054696, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.7623276898880562, |
| "grad_norm": 0.7549974443621008, |
| "learning_rate": 3.2512499413887255e-06, |
| "loss": 0.8303, |
| "mean_token_accuracy": 0.7554700002074242, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.7660283097418817, |
| "grad_norm": 0.8236761629232633, |
| "learning_rate": 3.1564140439990256e-06, |
| "loss": 0.8406, |
| "mean_token_accuracy": 0.751871857047081, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.7697289295957073, |
| "grad_norm": 0.7267400739764023, |
| "learning_rate": 3.0627219855446667e-06, |
| "loss": 0.8287, |
| "mean_token_accuracy": 0.7558587804436684, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.7734295494495328, |
| "grad_norm": 0.776879654823643, |
| "learning_rate": 2.970189425743383e-06, |
| "loss": 0.8361, |
| "mean_token_accuracy": 0.7547884792089462, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.7771301693033583, |
| "grad_norm": 0.7552506340173256, |
| "learning_rate": 2.8788318305139808e-06, |
| "loss": 0.8286, |
| "mean_token_accuracy": 0.7568780824542045, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.7808307891571838, |
| "grad_norm": 0.7580209757202828, |
| "learning_rate": 2.7886644693913333e-06, |
| "loss": 0.8459, |
| "mean_token_accuracy": 0.750422203540802, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.7845314090110094, |
| "grad_norm": 0.7719039642796498, |
| "learning_rate": 2.6997024129742544e-06, |
| "loss": 0.8404, |
| "mean_token_accuracy": 0.7537851154804229, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.7882320288648349, |
| "grad_norm": 0.7810831799894009, |
| "learning_rate": 2.611960530406572e-06, |
| "loss": 0.8273, |
| "mean_token_accuracy": 0.7569017544388771, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.7919326487186604, |
| "grad_norm": 0.8590214475734921, |
| "learning_rate": 2.5254534868919077e-06, |
| "loss": 0.8299, |
| "mean_token_accuracy": 0.7555602207779885, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.7956332685724858, |
| "grad_norm": 0.7451834304748566, |
| "learning_rate": 2.4401957412425213e-06, |
| "loss": 0.847, |
| "mean_token_accuracy": 0.7509198769927025, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.7993338884263114, |
| "grad_norm": 0.7394748846818531, |
| "learning_rate": 2.3562015434626784e-06, |
| "loss": 0.8478, |
| "mean_token_accuracy": 0.7510868698358536, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.8030345082801369, |
| "grad_norm": 0.7653067572984463, |
| "learning_rate": 2.273484932366874e-06, |
| "loss": 0.847, |
| "mean_token_accuracy": 0.7503063544631005, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.8067351281339624, |
| "grad_norm": 0.7743151365038076, |
| "learning_rate": 2.192059733233408e-06, |
| "loss": 0.8472, |
| "mean_token_accuracy": 0.7531051859259605, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.810435747987788, |
| "grad_norm": 0.7474033991103015, |
| "learning_rate": 2.111939555493603e-06, |
| "loss": 0.8459, |
| "mean_token_accuracy": 0.7517727881669998, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.8141363678416135, |
| "grad_norm": 0.779312232265544, |
| "learning_rate": 2.0331377904571303e-06, |
| "loss": 0.8276, |
| "mean_token_accuracy": 0.7547013550996781, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.8141363678416135, |
| "eval_loss": 0.8695138692855835, |
| "eval_mean_token_accuracy": 0.7477564071163987, |
| "eval_runtime": 24.4317, |
| "eval_samples_per_second": 20.997, |
| "eval_steps_per_second": 1.351, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.817836987695439, |
| "grad_norm": 0.8034562413677603, |
| "learning_rate": 1.9556676090737803e-06, |
| "loss": 0.8317, |
| "mean_token_accuracy": 0.7545135840773582, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.8215376075492645, |
| "grad_norm": 0.7281049735394789, |
| "learning_rate": 1.879541959732072e-06, |
| "loss": 0.853, |
| "mean_token_accuracy": 0.749828140437603, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.82523822740309, |
| "grad_norm": 0.7645464251699309, |
| "learning_rate": 1.8047735660950427e-06, |
| "loss": 0.8095, |
| "mean_token_accuracy": 0.7608827918767929, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.8289388472569156, |
| "grad_norm": 0.7946888766498504, |
| "learning_rate": 1.7313749249736266e-06, |
| "loss": 0.8157, |
| "mean_token_accuracy": 0.760844600200653, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.832639467110741, |
| "grad_norm": 0.7522474191468512, |
| "learning_rate": 1.6593583042379192e-06, |
| "loss": 0.8347, |
| "mean_token_accuracy": 0.7559689804911613, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.8363400869645665, |
| "grad_norm": 0.7489873580556787, |
| "learning_rate": 1.5887357407667314e-06, |
| "loss": 0.8605, |
| "mean_token_accuracy": 0.7474079817533493, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.840040706818392, |
| "grad_norm": 0.7641134150797313, |
| "learning_rate": 1.5195190384357405e-06, |
| "loss": 0.8618, |
| "mean_token_accuracy": 0.7480388507246971, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.8437413266722176, |
| "grad_norm": 0.7204342341589023, |
| "learning_rate": 1.4517197661445893e-06, |
| "loss": 0.8529, |
| "mean_token_accuracy": 0.7483357265591621, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.8474419465260431, |
| "grad_norm": 0.7681754372477057, |
| "learning_rate": 1.3853492558832472e-06, |
| "loss": 0.8306, |
| "mean_token_accuracy": 0.7552460536360741, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.8511425663798686, |
| "grad_norm": 0.7429470084922081, |
| "learning_rate": 1.3204186008379926e-06, |
| "loss": 0.8389, |
| "mean_token_accuracy": 0.7531604886054992, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.8548431862336942, |
| "grad_norm": 0.6873318445062786, |
| "learning_rate": 1.2569386535372807e-06, |
| "loss": 0.8286, |
| "mean_token_accuracy": 0.7552321195602417, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.8585438060875197, |
| "grad_norm": 0.7640481385504215, |
| "learning_rate": 1.1949200240378577e-06, |
| "loss": 0.8255, |
| "mean_token_accuracy": 0.7569879427552223, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.8622444259413452, |
| "grad_norm": 0.7638091289655545, |
| "learning_rate": 1.1343730781513896e-06, |
| "loss": 0.8558, |
| "mean_token_accuracy": 0.749303475022316, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.8659450457951707, |
| "grad_norm": 0.7685982859289542, |
| "learning_rate": 1.0753079357119134e-06, |
| "loss": 0.8273, |
| "mean_token_accuracy": 0.7565306261181831, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.8696456656489963, |
| "grad_norm": 0.7114001929021425, |
| "learning_rate": 1.017734468884417e-06, |
| "loss": 0.8473, |
| "mean_token_accuracy": 0.7502565905451775, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.8733462855028217, |
| "grad_norm": 0.7348011760240892, |
| "learning_rate": 9.616623005147952e-07, |
| "loss": 0.8286, |
| "mean_token_accuracy": 0.7560262143611908, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.8770469053566472, |
| "grad_norm": 0.7429654371816976, |
| "learning_rate": 9.071008025214767e-07, |
| "loss": 0.8158, |
| "mean_token_accuracy": 0.760336747765541, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.8807475252104727, |
| "grad_norm": 0.8100937899257782, |
| "learning_rate": 8.540590943290128e-07, |
| "loss": 0.8328, |
| "mean_token_accuracy": 0.7545322120189667, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.8844481450642983, |
| "grad_norm": 0.7556302951610571, |
| "learning_rate": 8.025460413438457e-07, |
| "loss": 0.842, |
| "mean_token_accuracy": 0.7518000155687332, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.8881487649181238, |
| "grad_norm": 0.7214919634823889, |
| "learning_rate": 7.525702534725443e-07, |
| "loss": 0.8485, |
| "mean_token_accuracy": 0.7501253366470337, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8881487649181238, |
| "eval_loss": 0.8681026101112366, |
| "eval_mean_token_accuracy": 0.7479201501066034, |
| "eval_runtime": 24.4445, |
| "eval_samples_per_second": 20.986, |
| "eval_steps_per_second": 1.35, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8918493847719493, |
| "grad_norm": 0.7363787692810877, |
| "learning_rate": 7.041400836827439e-07, |
| "loss": 0.8446, |
| "mean_token_accuracy": 0.7505712598562241, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.8955500046257748, |
| "grad_norm": 0.7760798741879421, |
| "learning_rate": 6.572636266070265e-07, |
| "loss": 0.8515, |
| "mean_token_accuracy": 0.7495753020048141, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.8992506244796004, |
| "grad_norm": 0.7375925847527678, |
| "learning_rate": 6.119487171899807e-07, |
| "loss": 0.8151, |
| "mean_token_accuracy": 0.7605884402990342, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.9029512443334259, |
| "grad_norm": 0.8483447358597658, |
| "learning_rate": 5.682029293786673e-07, |
| "loss": 0.8508, |
| "mean_token_accuracy": 0.7498715907335282, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.9066518641872514, |
| "grad_norm": 0.7427906597662893, |
| "learning_rate": 5.26033574856708e-07, |
| "loss": 0.8254, |
| "mean_token_accuracy": 0.7563312321901321, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.9103524840410768, |
| "grad_norm": 0.720458409183581, |
| "learning_rate": 4.854477018222103e-07, |
| "loss": 0.8374, |
| "mean_token_accuracy": 0.7546701580286026, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.9140531038949024, |
| "grad_norm": 0.7702820261310465, |
| "learning_rate": 4.464520938097294e-07, |
| "loss": 0.8397, |
| "mean_token_accuracy": 0.7535283699631691, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.9177537237487279, |
| "grad_norm": 0.8128830297797902, |
| "learning_rate": 4.0905326855646186e-07, |
| "loss": 0.855, |
| "mean_token_accuracy": 0.7464580446481704, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.9214543436025534, |
| "grad_norm": 0.7719201359932438, |
| "learning_rate": 3.732574769128738e-07, |
| "loss": 0.8585, |
| "mean_token_accuracy": 0.7479515969753265, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.9251549634563789, |
| "grad_norm": 0.767822410168291, |
| "learning_rate": 3.390707017979311e-07, |
| "loss": 0.8417, |
| "mean_token_accuracy": 0.7507120683789253, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.9288555833102045, |
| "grad_norm": 0.7931837924998084, |
| "learning_rate": 3.06498657199108e-07, |
| "loss": 0.8423, |
| "mean_token_accuracy": 0.7526604071259498, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.93255620316403, |
| "grad_norm": 0.7402614770859547, |
| "learning_rate": 2.7554678721735675e-07, |
| "loss": 0.831, |
| "mean_token_accuracy": 0.7547580033540726, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.9362568230178555, |
| "grad_norm": 0.7265080682796597, |
| "learning_rate": 2.4622026515717654e-07, |
| "loss": 0.8577, |
| "mean_token_accuracy": 0.7485264018177986, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.939957442871681, |
| "grad_norm": 0.7994816530184113, |
| "learning_rate": 2.1852399266194312e-07, |
| "loss": 0.8384, |
| "mean_token_accuracy": 0.7530582755804062, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.9436580627255066, |
| "grad_norm": 0.7408942586414351, |
| "learning_rate": 1.9246259889464935e-07, |
| "loss": 0.8395, |
| "mean_token_accuracy": 0.7526318833231926, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.9473586825793321, |
| "grad_norm": 0.7531498605702176, |
| "learning_rate": 1.6804043976418438e-07, |
| "loss": 0.8384, |
| "mean_token_accuracy": 0.7517712652683258, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.9510593024331575, |
| "grad_norm": 0.7640371632548638, |
| "learning_rate": 1.4526159719728595e-07, |
| "loss": 0.843, |
| "mean_token_accuracy": 0.7513806536793709, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.954759922286983, |
| "grad_norm": 0.7800995207503227, |
| "learning_rate": 1.24129878456285e-07, |
| "loss": 0.8482, |
| "mean_token_accuracy": 0.750167365372181, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.9584605421408086, |
| "grad_norm": 0.7781230447019286, |
| "learning_rate": 1.0464881550276362e-07, |
| "loss": 0.8571, |
| "mean_token_accuracy": 0.7482250303030014, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.9621611619946341, |
| "grad_norm": 0.7814221898628717, |
| "learning_rate": 8.682166440721729e-08, |
| "loss": 0.8554, |
| "mean_token_accuracy": 0.7490108326077461, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.9621611619946341, |
| "eval_loss": 0.8676031827926636, |
| "eval_mean_token_accuracy": 0.7480372389157613, |
| "eval_runtime": 24.4444, |
| "eval_samples_per_second": 20.986, |
| "eval_steps_per_second": 1.35, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.9658617818484596, |
| "grad_norm": 0.7864179680038271, |
| "learning_rate": 7.065140480483235e-08, |
| "loss": 0.8612, |
| "mean_token_accuracy": 0.7473657980561257, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.9695624017022851, |
| "grad_norm": 0.7508526784486548, |
| "learning_rate": 5.6140739397474445e-08, |
| "loss": 0.8365, |
| "mean_token_accuracy": 0.7531405627727509, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.9732630215561107, |
| "grad_norm": 0.7164581752617882, |
| "learning_rate": 4.329209350195651e-08, |
| "loss": 0.8443, |
| "mean_token_accuracy": 0.7516303405165672, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.9769636414099362, |
| "grad_norm": 0.8498247341964699, |
| "learning_rate": 3.210761464466639e-08, |
| "loss": 0.8374, |
| "mean_token_accuracy": 0.7550172284245491, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.9806642612637617, |
| "grad_norm": 0.7153929981028929, |
| "learning_rate": 2.2589172202635014e-08, |
| "loss": 0.7977, |
| "mean_token_accuracy": 0.7646528780460358, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.9843648811175872, |
| "grad_norm": 0.7908203411439174, |
| "learning_rate": 1.4738357091084177e-08, |
| "loss": 0.8371, |
| "mean_token_accuracy": 0.753947702050209, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.9880655009714127, |
| "grad_norm": 0.7547450484335274, |
| "learning_rate": 8.556481497521418e-09, |
| "loss": 0.8498, |
| "mean_token_accuracy": 0.7504909783601761, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.9917661208252382, |
| "grad_norm": 0.7442071956341998, |
| "learning_rate": 4.044578662419918e-09, |
| "loss": 0.8265, |
| "mean_token_accuracy": 0.7562764957547188, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.9954667406790637, |
| "grad_norm": 0.7540578819730456, |
| "learning_rate": 1.203402706525525e-09, |
| "loss": 0.8332, |
| "mean_token_accuracy": 0.7546069085597992, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.9991673605328892, |
| "grad_norm": 0.7649196329052427, |
| "learning_rate": 3.342850480869686e-11, |
| "loss": 0.805, |
| "mean_token_accuracy": 0.7623659715056419, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.9999074845036544, |
| "mean_token_accuracy": 0.7686276957392693, |
| "step": 1351, |
| "total_flos": 76959195168768.0, |
| "train_loss": 0.8854525579424984, |
| "train_runtime": 18393.1053, |
| "train_samples_per_second": 4.701, |
| "train_steps_per_second": 0.073 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1351, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 76959195168768.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|