diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4606 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998149861239592, + "eval_steps": 100, + "global_step": 2702, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0018501387604070306, + "grad_norm": 3.2532336927102463, + "learning_rate": 3.690036900369004e-07, + "loss": 1.2018, + "mean_token_accuracy": 0.6923797665033772, + "step": 5 + }, + { + "epoch": 0.0037002775208140612, + "grad_norm": 3.379992411847049, + "learning_rate": 7.380073800738008e-07, + "loss": 1.2137, + "mean_token_accuracy": 0.6883296987641394, + "step": 10 + }, + { + "epoch": 0.005550416281221091, + "grad_norm": 2.9610658289223575, + "learning_rate": 1.1070110701107011e-06, + "loss": 1.2189, + "mean_token_accuracy": 0.6868113883938572, + "step": 15 + }, + { + "epoch": 0.0074005550416281225, + "grad_norm": 2.6845510494996785, + "learning_rate": 1.4760147601476015e-06, + "loss": 1.1845, + "mean_token_accuracy": 0.6935479866876155, + "step": 20 + }, + { + "epoch": 0.009250693802035153, + "grad_norm": 2.2784714874592367, + "learning_rate": 1.845018450184502e-06, + "loss": 1.1136, + "mean_token_accuracy": 0.7081797044381053, + "step": 25 + }, + { + "epoch": 0.011100832562442183, + "grad_norm": 1.9537461867620272, + "learning_rate": 2.2140221402214023e-06, + "loss": 1.0814, + "mean_token_accuracy": 0.7156016722513929, + "step": 30 + }, + { + "epoch": 0.012950971322849213, + "grad_norm": 1.7176130666890619, + "learning_rate": 2.5830258302583027e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.7048473122025217, + "step": 35 + }, + { + "epoch": 0.014801110083256245, + "grad_norm": 1.4993539774408045, + "learning_rate": 2.952029520295203e-06, + "loss": 1.032, + "mean_token_accuracy": 0.7210633318347527, + "step": 40 + }, + { + "epoch": 0.016651248843663275, + "grad_norm": 1.4500321540078585, + "learning_rate": 3.3210332103321034e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.7186722136425703, + "step": 45 + }, + { + "epoch": 0.018501387604070305, + "grad_norm": 1.4404635003121444, + "learning_rate": 3.690036900369004e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.7206336944162057, + "step": 50 + }, + { + "epoch": 0.020351526364477335, + "grad_norm": 1.2886852170129013, + "learning_rate": 4.059040590405905e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7232619650484746, + "step": 55 + }, + { + "epoch": 0.022201665124884366, + "grad_norm": 1.2390095863180097, + "learning_rate": 4.428044280442805e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.732270486390202, + "step": 60 + }, + { + "epoch": 0.024051803885291396, + "grad_norm": 1.4145442757057711, + "learning_rate": 4.797047970479705e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.731922585326102, + "step": 65 + }, + { + "epoch": 0.025901942645698426, + "grad_norm": 1.176692131883375, + "learning_rate": 5.166051660516605e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7308641073189992, + "step": 70 + }, + { + "epoch": 0.027752081406105456, + "grad_norm": 1.380434303930621, + "learning_rate": 5.535055350553506e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7388678618040926, + "step": 75 + }, + { + "epoch": 0.02960222016651249, + "grad_norm": 1.501779374944776, + "learning_rate": 5.904059040590406e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7435695529508822, + "step": 80 + }, + { + "epoch": 0.03145235892691952, + "grad_norm": 1.2052131397546584, + "learning_rate": 6.273062730627307e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7343740396412048, + "step": 85 + }, + { + "epoch": 0.03330249768732655, + "grad_norm": 1.2021335569617972, + "learning_rate": 6.642066420664207e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7335170827504196, + "step": 90 + }, + { + "epoch": 0.03515263644773358, + "grad_norm": 1.2616157740543787, + "learning_rate": 7.011070110701108e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7432377627483165, + "step": 95 + }, + { + "epoch": 0.03700277520814061, + "grad_norm": 1.1949277138921748, + "learning_rate": 7.380073800738008e-06, + "loss": 0.902, + "mean_token_accuracy": 0.742309200377307, + "step": 100 + }, + { + "epoch": 0.03700277520814061, + "eval_loss": 0.930350661277771, + "eval_mean_token_accuracy": 0.7317191439676337, + "eval_runtime": 14.7437, + "eval_samples_per_second": 17.431, + "eval_steps_per_second": 2.238, + "step": 100 + }, + { + "epoch": 0.03885291396854764, + "grad_norm": 1.3142989320113128, + "learning_rate": 7.749077490774908e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7479802912956741, + "step": 105 + }, + { + "epoch": 0.04070305272895467, + "grad_norm": 1.1970909118149438, + "learning_rate": 8.11808118081181e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7492966782864648, + "step": 110 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 1.1911219026023072, + "learning_rate": 8.48708487084871e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7513036077770368, + "step": 115 + }, + { + "epoch": 0.04440333024976873, + "grad_norm": 1.3099669643096605, + "learning_rate": 8.85608856088561e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7459777929168893, + "step": 120 + }, + { + "epoch": 0.04625346901017576, + "grad_norm": 1.1828627490407997, + "learning_rate": 9.22509225092251e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7388647905467014, + "step": 125 + }, + { + "epoch": 0.04810360777058279, + "grad_norm": 1.4170862708131036, + "learning_rate": 9.59409594095941e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7433845900156016, + "step": 130 + }, + { + "epoch": 0.04995374653098982, + "grad_norm": 1.2951155924958395, + "learning_rate": 9.963099630996312e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.74894087449994, + "step": 135 + }, + { + "epoch": 0.05180388529139685, + "grad_norm": 1.3599749533747763, + "learning_rate": 1.033210332103321e-05, + "loss": 0.8646, + "mean_token_accuracy": 0.7500771248521956, + "step": 140 + }, + { + "epoch": 0.05365402405180388, + "grad_norm": 1.3238238986045388, + "learning_rate": 1.0701107011070112e-05, + "loss": 0.9153, + "mean_token_accuracy": 0.7376646864163943, + "step": 145 + }, + { + "epoch": 0.05550416281221091, + "grad_norm": 1.2496868074133076, + "learning_rate": 1.1070110701107012e-05, + "loss": 0.9378, + "mean_token_accuracy": 0.7308473726558337, + "step": 150 + }, + { + "epoch": 0.05735430157261795, + "grad_norm": 1.2602983752253039, + "learning_rate": 1.1439114391143913e-05, + "loss": 0.9041, + "mean_token_accuracy": 0.7385226694279824, + "step": 155 + }, + { + "epoch": 0.05920444033302498, + "grad_norm": 1.235243719232178, + "learning_rate": 1.1808118081180812e-05, + "loss": 0.8634, + "mean_token_accuracy": 0.7505981898415235, + "step": 160 + }, + { + "epoch": 0.06105457909343201, + "grad_norm": 1.2598071389114238, + "learning_rate": 1.2177121771217713e-05, + "loss": 0.9261, + "mean_token_accuracy": 0.732949450979312, + "step": 165 + }, + { + "epoch": 0.06290471785383904, + "grad_norm": 1.1887408646799231, + "learning_rate": 1.2546125461254614e-05, + "loss": 0.8842, + "mean_token_accuracy": 0.7439330308418153, + "step": 170 + }, + { + "epoch": 0.06475485661424607, + "grad_norm": 1.510246566517054, + "learning_rate": 1.2915129151291515e-05, + "loss": 0.825, + "mean_token_accuracy": 0.7592509836076112, + "step": 175 + }, + { + "epoch": 0.0666049953746531, + "grad_norm": 1.2548910980870829, + "learning_rate": 1.3284132841328414e-05, + "loss": 0.8941, + "mean_token_accuracy": 0.7424632041813165, + "step": 180 + }, + { + "epoch": 0.06845513413506013, + "grad_norm": 1.1972240365355018, + "learning_rate": 1.3653136531365315e-05, + "loss": 0.8508, + "mean_token_accuracy": 0.7548654512507282, + "step": 185 + }, + { + "epoch": 0.07030527289546716, + "grad_norm": 1.3171902381038973, + "learning_rate": 1.4022140221402215e-05, + "loss": 0.9037, + "mean_token_accuracy": 0.7393820434296943, + "step": 190 + }, + { + "epoch": 0.07215541165587419, + "grad_norm": 1.317305476924177, + "learning_rate": 1.4391143911439116e-05, + "loss": 0.8991, + "mean_token_accuracy": 0.7396839093218253, + "step": 195 + }, + { + "epoch": 0.07400555041628122, + "grad_norm": 1.455213680501952, + "learning_rate": 1.4760147601476015e-05, + "loss": 0.8853, + "mean_token_accuracy": 0.7436791519056946, + "step": 200 + }, + { + "epoch": 0.07400555041628122, + "eval_loss": 0.8933929800987244, + "eval_mean_token_accuracy": 0.7380906954165021, + "eval_runtime": 14.5377, + "eval_samples_per_second": 17.678, + "eval_steps_per_second": 2.27, + "step": 200 + }, + { + "epoch": 0.07585568917668825, + "grad_norm": 1.231661886849989, + "learning_rate": 1.5129151291512916e-05, + "loss": 0.9105, + "mean_token_accuracy": 0.7366560460602605, + "step": 205 + }, + { + "epoch": 0.07770582793709528, + "grad_norm": 1.4183192399418325, + "learning_rate": 1.5498154981549817e-05, + "loss": 0.9121, + "mean_token_accuracy": 0.7371633196216058, + "step": 210 + }, + { + "epoch": 0.07955596669750231, + "grad_norm": 1.2787558292034593, + "learning_rate": 1.5867158671586716e-05, + "loss": 0.859, + "mean_token_accuracy": 0.7509506876334168, + "step": 215 + }, + { + "epoch": 0.08140610545790934, + "grad_norm": 1.2860685858780978, + "learning_rate": 1.623616236162362e-05, + "loss": 0.8605, + "mean_token_accuracy": 0.75086302172552, + "step": 220 + }, + { + "epoch": 0.08325624421831637, + "grad_norm": 1.2409055030181146, + "learning_rate": 1.6605166051660518e-05, + "loss": 0.8336, + "mean_token_accuracy": 0.7556560737934468, + "step": 225 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 1.2742674931988862, + "learning_rate": 1.697416974169742e-05, + "loss": 0.8654, + "mean_token_accuracy": 0.747921209333451, + "step": 230 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 1.1562860186513046, + "learning_rate": 1.734317343173432e-05, + "loss": 0.7991, + "mean_token_accuracy": 0.7676916174352729, + "step": 235 + }, + { + "epoch": 0.08880666049953746, + "grad_norm": 1.5422466200645668, + "learning_rate": 1.771217712177122e-05, + "loss": 0.7993, + "mean_token_accuracy": 0.7664292069306221, + "step": 240 + }, + { + "epoch": 0.09065679925994449, + "grad_norm": 1.184064562637193, + "learning_rate": 1.8081180811808117e-05, + "loss": 0.82, + "mean_token_accuracy": 0.7607034408583769, + "step": 245 + }, + { + "epoch": 0.09250693802035152, + "grad_norm": 1.3082964113038948, + "learning_rate": 1.845018450184502e-05, + "loss": 0.8544, + "mean_token_accuracy": 0.7510686877715592, + "step": 250 + }, + { + "epoch": 0.09435707678075855, + "grad_norm": 1.2309733600713384, + "learning_rate": 1.8819188191881922e-05, + "loss": 0.8603, + "mean_token_accuracy": 0.751500751990504, + "step": 255 + }, + { + "epoch": 0.09620721554116558, + "grad_norm": 1.3119074503742738, + "learning_rate": 1.918819188191882e-05, + "loss": 0.9105, + "mean_token_accuracy": 0.7338920653379262, + "step": 260 + }, + { + "epoch": 0.09805735430157261, + "grad_norm": 1.2298287964353394, + "learning_rate": 1.955719557195572e-05, + "loss": 0.8746, + "mean_token_accuracy": 0.7461544297267619, + "step": 265 + }, + { + "epoch": 0.09990749306197964, + "grad_norm": 1.3425181059480484, + "learning_rate": 1.9926199261992623e-05, + "loss": 0.8847, + "mean_token_accuracy": 0.7423909244004632, + "step": 270 + }, + { + "epoch": 0.10175763182238667, + "grad_norm": 1.2389896364604396, + "learning_rate": 1.9999866396188624e-05, + "loss": 0.8888, + "mean_token_accuracy": 0.7405871479076017, + "step": 275 + }, + { + "epoch": 0.1036077705827937, + "grad_norm": 1.4616162987710688, + "learning_rate": 1.9999323636823398e-05, + "loss": 0.8787, + "mean_token_accuracy": 0.7443759621097477, + "step": 280 + }, + { + "epoch": 0.10545790934320073, + "grad_norm": 1.2068041539063639, + "learning_rate": 1.9998363394309497e-05, + "loss": 0.8533, + "mean_token_accuracy": 0.7519421916644042, + "step": 285 + }, + { + "epoch": 0.10730804810360776, + "grad_norm": 1.2411499051633939, + "learning_rate": 1.9996985708738146e-05, + "loss": 0.8588, + "mean_token_accuracy": 0.747695961536352, + "step": 290 + }, + { + "epoch": 0.1091581868640148, + "grad_norm": 1.2290950827493876, + "learning_rate": 1.999519063762928e-05, + "loss": 0.8943, + "mean_token_accuracy": 0.7404911998632147, + "step": 295 + }, + { + "epoch": 0.11100832562442182, + "grad_norm": 1.223732094320234, + "learning_rate": 1.9992978255929168e-05, + "loss": 0.8773, + "mean_token_accuracy": 0.7456867221997816, + "step": 300 + }, + { + "epoch": 0.11100832562442182, + "eval_loss": 0.8770559430122375, + "eval_mean_token_accuracy": 0.7410944533415076, + "eval_runtime": 14.4907, + "eval_samples_per_second": 17.736, + "eval_steps_per_second": 2.277, + "step": 300 + }, + { + "epoch": 0.11285846438482887, + "grad_norm": 1.2725694708390742, + "learning_rate": 1.999034865600726e-05, + "loss": 0.8495, + "mean_token_accuracy": 0.7521531690611065, + "step": 305 + }, + { + "epoch": 0.1147086031452359, + "grad_norm": 1.3269153367451316, + "learning_rate": 1.9987301947652354e-05, + "loss": 0.8343, + "mean_token_accuracy": 0.754720666938604, + "step": 310 + }, + { + "epoch": 0.11655874190564293, + "grad_norm": 1.3377086888038716, + "learning_rate": 1.998383825806799e-05, + "loss": 0.8419, + "mean_token_accuracy": 0.7534353201161064, + "step": 315 + }, + { + "epoch": 0.11840888066604996, + "grad_norm": 1.1549501990719349, + "learning_rate": 1.9979957731867143e-05, + "loss": 0.8947, + "mean_token_accuracy": 0.7394911334931107, + "step": 320 + }, + { + "epoch": 0.12025901942645699, + "grad_norm": 1.177951727618787, + "learning_rate": 1.9975660531066215e-05, + "loss": 0.8895, + "mean_token_accuracy": 0.7420869420108108, + "step": 325 + }, + { + "epoch": 0.12210915818686402, + "grad_norm": 1.3195452917167947, + "learning_rate": 1.9970946835078227e-05, + "loss": 0.8287, + "mean_token_accuracy": 0.7560502346125844, + "step": 330 + }, + { + "epoch": 0.12395929694727105, + "grad_norm": 1.251697352317517, + "learning_rate": 1.9965816840705355e-05, + "loss": 0.8877, + "mean_token_accuracy": 0.7420431117418635, + "step": 335 + }, + { + "epoch": 0.12580943570767808, + "grad_norm": 1.1727111734635083, + "learning_rate": 1.9960270762130705e-05, + "loss": 0.8497, + "mean_token_accuracy": 0.7506212598720976, + "step": 340 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 1.2936725031420457, + "learning_rate": 1.9954308830909372e-05, + "loss": 0.8463, + "mean_token_accuracy": 0.7530356983625646, + "step": 345 + }, + { + "epoch": 0.12950971322849214, + "grad_norm": 1.3430143619354924, + "learning_rate": 1.9947931295958778e-05, + "loss": 0.8469, + "mean_token_accuracy": 0.7518346344426988, + "step": 350 + }, + { + "epoch": 0.13135985198889916, + "grad_norm": 1.181627140885849, + "learning_rate": 1.9941138423548266e-05, + "loss": 0.863, + "mean_token_accuracy": 0.7461820839147225, + "step": 355 + }, + { + "epoch": 0.1332099907493062, + "grad_norm": 1.1918900962990067, + "learning_rate": 1.9933930497287996e-05, + "loss": 0.795, + "mean_token_accuracy": 0.7665149077428811, + "step": 360 + }, + { + "epoch": 0.13506012950971322, + "grad_norm": 1.1413275418075441, + "learning_rate": 1.9926307818117098e-05, + "loss": 0.8887, + "mean_token_accuracy": 0.7413485882397599, + "step": 365 + }, + { + "epoch": 0.13691026827012026, + "grad_norm": 1.2639080601730168, + "learning_rate": 1.9918270704291104e-05, + "loss": 0.8561, + "mean_token_accuracy": 0.7488176009902908, + "step": 370 + }, + { + "epoch": 0.13876040703052728, + "grad_norm": 1.3816370040142336, + "learning_rate": 1.9909819491368677e-05, + "loss": 0.8817, + "mean_token_accuracy": 0.7412092292027749, + "step": 375 + }, + { + "epoch": 0.14061054579093432, + "grad_norm": 1.1959465219113925, + "learning_rate": 1.990095453219757e-05, + "loss": 0.8257, + "mean_token_accuracy": 0.758226129008771, + "step": 380 + }, + { + "epoch": 0.14246068455134134, + "grad_norm": 1.2203724676608037, + "learning_rate": 1.989167619689993e-05, + "loss": 0.8504, + "mean_token_accuracy": 0.7506189533285046, + "step": 385 + }, + { + "epoch": 0.14431082331174838, + "grad_norm": 1.2071546200373575, + "learning_rate": 1.988198487285682e-05, + "loss": 0.882, + "mean_token_accuracy": 0.740789811102496, + "step": 390 + }, + { + "epoch": 0.1461609620721554, + "grad_norm": 1.1467698453178914, + "learning_rate": 1.9871880964692055e-05, + "loss": 0.7932, + "mean_token_accuracy": 0.7652316368897984, + "step": 395 + }, + { + "epoch": 0.14801110083256244, + "grad_norm": 1.1633432268718258, + "learning_rate": 1.9861364894255306e-05, + "loss": 0.8976, + "mean_token_accuracy": 0.7390804072687527, + "step": 400 + }, + { + "epoch": 0.14801110083256244, + "eval_loss": 0.8677666783332825, + "eval_mean_token_accuracy": 0.7428819660117091, + "eval_runtime": 14.4867, + "eval_samples_per_second": 17.74, + "eval_steps_per_second": 2.278, + "step": 400 + }, + { + "epoch": 0.1498612395929695, + "grad_norm": 1.2545670816814254, + "learning_rate": 1.985043710060449e-05, + "loss": 0.8293, + "mean_token_accuracy": 0.7566429056284478, + "step": 405 + }, + { + "epoch": 0.1517113783533765, + "grad_norm": 1.4236585256470373, + "learning_rate": 1.9839098039987435e-05, + "loss": 0.8268, + "mean_token_accuracy": 0.7558708051270291, + "step": 410 + }, + { + "epoch": 0.15356151711378355, + "grad_norm": 1.235944415828364, + "learning_rate": 1.9827348185822834e-05, + "loss": 0.8766, + "mean_token_accuracy": 0.7406190979245154, + "step": 415 + }, + { + "epoch": 0.15541165587419056, + "grad_norm": 1.3462777558428065, + "learning_rate": 1.981518802868048e-05, + "loss": 0.838, + "mean_token_accuracy": 0.7540694923610572, + "step": 420 + }, + { + "epoch": 0.1572617946345976, + "grad_norm": 1.2264185469644386, + "learning_rate": 1.9802618076260784e-05, + "loss": 0.8413, + "mean_token_accuracy": 0.7535229434697794, + "step": 425 + }, + { + "epoch": 0.15911193339500462, + "grad_norm": 1.2521783180781278, + "learning_rate": 1.9789638853373563e-05, + "loss": 0.811, + "mean_token_accuracy": 0.7617623917274129, + "step": 430 + }, + { + "epoch": 0.16096207215541167, + "grad_norm": 1.1028517750748723, + "learning_rate": 1.9776250901916168e-05, + "loss": 0.8305, + "mean_token_accuracy": 0.7555923133412319, + "step": 435 + }, + { + "epoch": 0.16281221091581868, + "grad_norm": 1.0709641886003023, + "learning_rate": 1.9762454780850807e-05, + "loss": 0.8401, + "mean_token_accuracy": 0.7530252251091509, + "step": 440 + }, + { + "epoch": 0.16466234967622573, + "grad_norm": 1.3269590712449801, + "learning_rate": 1.9748251066181247e-05, + "loss": 0.8303, + "mean_token_accuracy": 0.7548565874230693, + "step": 445 + }, + { + "epoch": 0.16651248843663274, + "grad_norm": 1.1377488686621455, + "learning_rate": 1.973364035092875e-05, + "loss": 0.8158, + "mean_token_accuracy": 0.7596406279601805, + "step": 450 + }, + { + "epoch": 0.1683626271970398, + "grad_norm": 1.2415292588608726, + "learning_rate": 1.971862324510732e-05, + "loss": 0.861, + "mean_token_accuracy": 0.7470450076733876, + "step": 455 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 1.1257187715343289, + "learning_rate": 1.9703200375698223e-05, + "loss": 0.8406, + "mean_token_accuracy": 0.7539170608283539, + "step": 460 + }, + { + "epoch": 0.17206290471785385, + "grad_norm": 1.20885923425794, + "learning_rate": 1.968737238662382e-05, + "loss": 0.8851, + "mean_token_accuracy": 0.7381369211473118, + "step": 465 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 1.0897954092903257, + "learning_rate": 1.9671139938720678e-05, + "loss": 0.8538, + "mean_token_accuracy": 0.7475365920692013, + "step": 470 + }, + { + "epoch": 0.1757631822386679, + "grad_norm": 1.3239296874571833, + "learning_rate": 1.9654503709711984e-05, + "loss": 0.8714, + "mean_token_accuracy": 0.7443980710430347, + "step": 475 + }, + { + "epoch": 0.17761332099907493, + "grad_norm": 1.181253082545939, + "learning_rate": 1.963746439417924e-05, + "loss": 0.8423, + "mean_token_accuracy": 0.7491222650473818, + "step": 480 + }, + { + "epoch": 0.17946345975948197, + "grad_norm": 1.2075410117293621, + "learning_rate": 1.962002270353328e-05, + "loss": 0.836, + "mean_token_accuracy": 0.7541004122612339, + "step": 485 + }, + { + "epoch": 0.18131359851988899, + "grad_norm": 1.1123580737034326, + "learning_rate": 1.960217936598454e-05, + "loss": 0.8384, + "mean_token_accuracy": 0.7528749033444468, + "step": 490 + }, + { + "epoch": 0.18316373728029603, + "grad_norm": 1.1086330605728787, + "learning_rate": 1.958393512651269e-05, + "loss": 0.7862, + "mean_token_accuracy": 0.7672767396835, + "step": 495 + }, + { + "epoch": 0.18501387604070305, + "grad_norm": 1.1088806213557527, + "learning_rate": 1.956529074683551e-05, + "loss": 0.8315, + "mean_token_accuracy": 0.7566864246037507, + "step": 500 + }, + { + "epoch": 0.18501387604070305, + "eval_loss": 0.8583042621612549, + "eval_mean_token_accuracy": 0.7445706510181059, + "eval_runtime": 14.493, + "eval_samples_per_second": 17.733, + "eval_steps_per_second": 2.277, + "step": 500 + }, + { + "epoch": 0.1868640148011101, + "grad_norm": 1.1979511682994708, + "learning_rate": 1.9546247005377065e-05, + "loss": 0.8704, + "mean_token_accuracy": 0.7443155876298173, + "step": 505 + }, + { + "epoch": 0.1887141535615171, + "grad_norm": 1.1370005092700617, + "learning_rate": 1.952680469723526e-05, + "loss": 0.8194, + "mean_token_accuracy": 0.7590315333634016, + "step": 510 + }, + { + "epoch": 0.19056429232192415, + "grad_norm": 1.029838285293326, + "learning_rate": 1.9506964634148597e-05, + "loss": 0.8506, + "mean_token_accuracy": 0.7514516107155556, + "step": 515 + }, + { + "epoch": 0.19241443108233117, + "grad_norm": 1.1431244734534254, + "learning_rate": 1.9486727644462306e-05, + "loss": 0.8311, + "mean_token_accuracy": 0.7570342372874708, + "step": 520 + }, + { + "epoch": 0.1942645698427382, + "grad_norm": 1.0877921648714206, + "learning_rate": 1.9466094573093744e-05, + "loss": 0.7972, + "mean_token_accuracy": 0.7632878855001356, + "step": 525 + }, + { + "epoch": 0.19611470860314523, + "grad_norm": 1.281569069976695, + "learning_rate": 1.9445066281497144e-05, + "loss": 0.823, + "mean_token_accuracy": 0.7555574522183981, + "step": 530 + }, + { + "epoch": 0.19796484736355227, + "grad_norm": 1.1606151275686378, + "learning_rate": 1.9423643647627625e-05, + "loss": 0.8019, + "mean_token_accuracy": 0.7627089286583049, + "step": 535 + }, + { + "epoch": 0.1998149861239593, + "grad_norm": 1.1557704874136578, + "learning_rate": 1.940182756590454e-05, + "loss": 0.8572, + "mean_token_accuracy": 0.7480846276484803, + "step": 540 + }, + { + "epoch": 0.20166512488436633, + "grad_norm": 1.167989428719569, + "learning_rate": 1.9379618947174155e-05, + "loss": 0.8327, + "mean_token_accuracy": 0.7561393432912055, + "step": 545 + }, + { + "epoch": 0.20351526364477335, + "grad_norm": 1.136818143773904, + "learning_rate": 1.935701871867158e-05, + "loss": 0.827, + "mean_token_accuracy": 0.7559568705035835, + "step": 550 + }, + { + "epoch": 0.2053654024051804, + "grad_norm": 1.0957982707394502, + "learning_rate": 1.9334027823982103e-05, + "loss": 0.7988, + "mean_token_accuracy": 0.7646880490511769, + "step": 555 + }, + { + "epoch": 0.2072155411655874, + "grad_norm": 1.1349454190950488, + "learning_rate": 1.9310647223001752e-05, + "loss": 0.8296, + "mean_token_accuracy": 0.7558369460588465, + "step": 560 + }, + { + "epoch": 0.20906567992599445, + "grad_norm": 1.2400602116306871, + "learning_rate": 1.9286877891897244e-05, + "loss": 0.8766, + "mean_token_accuracy": 0.7421277471875168, + "step": 565 + }, + { + "epoch": 0.21091581868640147, + "grad_norm": 1.1527099330731156, + "learning_rate": 1.9262720823065217e-05, + "loss": 0.8405, + "mean_token_accuracy": 0.7511390431924815, + "step": 570 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 1.1656004920710705, + "learning_rate": 1.923817702509081e-05, + "loss": 0.7989, + "mean_token_accuracy": 0.7620851334416682, + "step": 575 + }, + { + "epoch": 0.21461609620721553, + "grad_norm": 1.1302639136004313, + "learning_rate": 1.9213247522705532e-05, + "loss": 0.8323, + "mean_token_accuracy": 0.7538103953914914, + "step": 580 + }, + { + "epoch": 0.21646623496762257, + "grad_norm": 1.1473116616115981, + "learning_rate": 1.9187933356744504e-05, + "loss": 0.7858, + "mean_token_accuracy": 0.7652065401173526, + "step": 585 + }, + { + "epoch": 0.2183163737280296, + "grad_norm": 1.111694527569314, + "learning_rate": 1.9162235584102973e-05, + "loss": 0.8305, + "mean_token_accuracy": 0.7552945019380595, + "step": 590 + }, + { + "epoch": 0.22016651248843663, + "grad_norm": 1.2115727394267062, + "learning_rate": 1.9136155277692215e-05, + "loss": 0.8444, + "mean_token_accuracy": 0.74922453354857, + "step": 595 + }, + { + "epoch": 0.22201665124884365, + "grad_norm": 1.0946986364398767, + "learning_rate": 1.9109693526394722e-05, + "loss": 0.8541, + "mean_token_accuracy": 0.7468319261340056, + "step": 600 + }, + { + "epoch": 0.22201665124884365, + "eval_loss": 0.8494171500205994, + "eval_mean_token_accuracy": 0.7464725129577453, + "eval_runtime": 14.4831, + "eval_samples_per_second": 17.745, + "eval_steps_per_second": 2.279, + "step": 600 + }, + { + "epoch": 0.2238667900092507, + "grad_norm": 1.301315090065332, + "learning_rate": 1.9082851435018743e-05, + "loss": 0.8243, + "mean_token_accuracy": 0.7567519831905292, + "step": 605 + }, + { + "epoch": 0.22571692876965774, + "grad_norm": 1.189250316886271, + "learning_rate": 1.905563012425216e-05, + "loss": 0.8104, + "mean_token_accuracy": 0.7594515212509205, + "step": 610 + }, + { + "epoch": 0.22756706753006475, + "grad_norm": 1.0349907740506334, + "learning_rate": 1.9028030730615696e-05, + "loss": 0.8134, + "mean_token_accuracy": 0.7596982782916754, + "step": 615 + }, + { + "epoch": 0.2294172062904718, + "grad_norm": 1.2064328901840204, + "learning_rate": 1.9000054406415467e-05, + "loss": 0.8524, + "mean_token_accuracy": 0.7479771611645823, + "step": 620 + }, + { + "epoch": 0.23126734505087881, + "grad_norm": 1.2058575230432762, + "learning_rate": 1.897170231969486e-05, + "loss": 0.818, + "mean_token_accuracy": 0.7577475880707399, + "step": 625 + }, + { + "epoch": 0.23311748381128586, + "grad_norm": 1.1313674999003076, + "learning_rate": 1.8942975654185788e-05, + "loss": 0.8302, + "mean_token_accuracy": 0.7557635747228226, + "step": 630 + }, + { + "epoch": 0.23496762257169287, + "grad_norm": 1.1121912038265513, + "learning_rate": 1.8913875609259246e-05, + "loss": 0.8151, + "mean_token_accuracy": 0.7610647342258481, + "step": 635 + }, + { + "epoch": 0.23681776133209992, + "grad_norm": 1.1239981969678805, + "learning_rate": 1.8884403399875252e-05, + "loss": 0.8846, + "mean_token_accuracy": 0.7379610579576807, + "step": 640 + }, + { + "epoch": 0.23866790009250693, + "grad_norm": 1.2787795509643216, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.825, + "mean_token_accuracy": 0.7555822426137004, + "step": 645 + }, + { + "epoch": 0.24051803885291398, + "grad_norm": 1.2452331373504155, + "learning_rate": 1.8824347425215016e-05, + "loss": 0.8327, + "mean_token_accuracy": 0.7526088818619012, + "step": 650 + }, + { + "epoch": 0.242368177613321, + "grad_norm": 1.0587530432586114, + "learning_rate": 1.8793766167344115e-05, + "loss": 0.7981, + "mean_token_accuracy": 0.7635344090597403, + "step": 655 + }, + { + "epoch": 0.24421831637372804, + "grad_norm": 1.2042170188555503, + "learning_rate": 1.8762817759721735e-05, + "loss": 0.8362, + "mean_token_accuracy": 0.7525178428805248, + "step": 660 + }, + { + "epoch": 0.24606845513413506, + "grad_norm": 1.1879647658019326, + "learning_rate": 1.8731503494479132e-05, + "loss": 0.8089, + "mean_token_accuracy": 0.7601846070612515, + "step": 665 + }, + { + "epoch": 0.2479185938945421, + "grad_norm": 1.0856332430936149, + "learning_rate": 1.869982467902255e-05, + "loss": 0.8648, + "mean_token_accuracy": 0.7458757818982498, + "step": 670 + }, + { + "epoch": 0.24976873265494912, + "grad_norm": 1.132716719503818, + "learning_rate": 1.8667782635978597e-05, + "loss": 0.8403, + "mean_token_accuracy": 0.7523611774406956, + "step": 675 + }, + { + "epoch": 0.25161887141535616, + "grad_norm": 1.1700203990762743, + "learning_rate": 1.8635378703139066e-05, + "loss": 0.8393, + "mean_token_accuracy": 0.7534375797690551, + "step": 680 + }, + { + "epoch": 0.2534690101757632, + "grad_norm": 1.1106707517815726, + "learning_rate": 1.8602614233405047e-05, + "loss": 0.8132, + "mean_token_accuracy": 0.7590260462120785, + "step": 685 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 1.249201824692084, + "learning_rate": 1.8569490594730474e-05, + "loss": 0.8663, + "mean_token_accuracy": 0.7448480139248458, + "step": 690 + }, + { + "epoch": 0.25716928769657726, + "grad_norm": 1.1664831810001122, + "learning_rate": 1.853600917006497e-05, + "loss": 0.8364, + "mean_token_accuracy": 0.7538129225103697, + "step": 695 + }, + { + "epoch": 0.2590194264569843, + "grad_norm": 1.1754186180694628, + "learning_rate": 1.8502171357296144e-05, + "loss": 0.8171, + "mean_token_accuracy": 0.7586180490819495, + "step": 700 + }, + { + "epoch": 0.2590194264569843, + "eval_loss": 0.8445517420768738, + "eval_mean_token_accuracy": 0.7477923524190879, + "eval_runtime": 14.4937, + "eval_samples_per_second": 17.732, + "eval_steps_per_second": 2.277, + "step": 700 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 1.0059517029120144, + "learning_rate": 1.8467978569191216e-05, + "loss": 0.7745, + "mean_token_accuracy": 0.7693093908416766, + "step": 705 + }, + { + "epoch": 0.2627197039777983, + "grad_norm": 1.066472424068687, + "learning_rate": 1.8433432233338027e-05, + "loss": 0.7999, + "mean_token_accuracy": 0.7639893067037716, + "step": 710 + }, + { + "epoch": 0.2645698427382054, + "grad_norm": 1.1777350494806929, + "learning_rate": 1.8398533792085436e-05, + "loss": 0.8679, + "mean_token_accuracy": 0.7435399262724282, + "step": 715 + }, + { + "epoch": 0.2664199814986124, + "grad_norm": 1.0252701238189184, + "learning_rate": 1.8363284702483106e-05, + "loss": 0.7986, + "mean_token_accuracy": 0.7635025648840151, + "step": 720 + }, + { + "epoch": 0.2682701202590194, + "grad_norm": 1.0096812160230473, + "learning_rate": 1.832768643622067e-05, + "loss": 0.8127, + "mean_token_accuracy": 0.7600012314771563, + "step": 725 + }, + { + "epoch": 0.27012025901942643, + "grad_norm": 1.1741002088083832, + "learning_rate": 1.8291740479566286e-05, + "loss": 0.8005, + "mean_token_accuracy": 0.7614172160753303, + "step": 730 + }, + { + "epoch": 0.2719703977798335, + "grad_norm": 1.1040784192875175, + "learning_rate": 1.825544833330457e-05, + "loss": 0.8115, + "mean_token_accuracy": 0.7578784534983467, + "step": 735 + }, + { + "epoch": 0.2738205365402405, + "grad_norm": 1.0413211175226615, + "learning_rate": 1.8218811512673958e-05, + "loss": 0.8113, + "mean_token_accuracy": 0.7606541996746442, + "step": 740 + }, + { + "epoch": 0.27567067530064754, + "grad_norm": 1.1609696598988892, + "learning_rate": 1.818183154730344e-05, + "loss": 0.7965, + "mean_token_accuracy": 0.7633177082129581, + "step": 745 + }, + { + "epoch": 0.27752081406105455, + "grad_norm": 1.1231883821956807, + "learning_rate": 1.8144509981148675e-05, + "loss": 0.8187, + "mean_token_accuracy": 0.7578092533745984, + "step": 750 + }, + { + "epoch": 0.2793709528214616, + "grad_norm": 1.0747097312950582, + "learning_rate": 1.810684837242755e-05, + "loss": 0.8213, + "mean_token_accuracy": 0.7564159457293526, + "step": 755 + }, + { + "epoch": 0.28122109158186864, + "grad_norm": 1.1455565509851278, + "learning_rate": 1.8068848293555118e-05, + "loss": 0.8075, + "mean_token_accuracy": 0.7581692930522959, + "step": 760 + }, + { + "epoch": 0.28307123034227566, + "grad_norm": 1.0916124717106248, + "learning_rate": 1.8030511331077945e-05, + "loss": 0.7871, + "mean_token_accuracy": 0.7662962471037837, + "step": 765 + }, + { + "epoch": 0.2849213691026827, + "grad_norm": 1.0952664770299478, + "learning_rate": 1.799183908560787e-05, + "loss": 0.8132, + "mean_token_accuracy": 0.7605826243454008, + "step": 770 + }, + { + "epoch": 0.28677150786308975, + "grad_norm": 1.1071728010819362, + "learning_rate": 1.795283317175518e-05, + "loss": 0.8291, + "mean_token_accuracy": 0.7556897365917684, + "step": 775 + }, + { + "epoch": 0.28862164662349676, + "grad_norm": 1.134813517240112, + "learning_rate": 1.7913495218061202e-05, + "loss": 0.8354, + "mean_token_accuracy": 0.7521808714837601, + "step": 780 + }, + { + "epoch": 0.2904717853839038, + "grad_norm": 1.1019705155450266, + "learning_rate": 1.787382686693029e-05, + "loss": 0.8263, + "mean_token_accuracy": 0.7557652512329776, + "step": 785 + }, + { + "epoch": 0.2923219241443108, + "grad_norm": 1.1177939830161576, + "learning_rate": 1.783382977456128e-05, + "loss": 0.8342, + "mean_token_accuracy": 0.7530222967026039, + "step": 790 + }, + { + "epoch": 0.29417206290471787, + "grad_norm": 1.08025597991412, + "learning_rate": 1.779350561087833e-05, + "loss": 0.816, + "mean_token_accuracy": 0.7578065532385052, + "step": 795 + }, + { + "epoch": 0.2960222016651249, + "grad_norm": 1.080769128674541, + "learning_rate": 1.775285605946119e-05, + "loss": 0.8069, + "mean_token_accuracy": 0.7607617175780756, + "step": 800 + }, + { + "epoch": 0.2960222016651249, + "eval_loss": 0.838986337184906, + "eval_mean_token_accuracy": 0.7486298954049546, + "eval_runtime": 14.4806, + "eval_samples_per_second": 17.748, + "eval_steps_per_second": 2.279, + "step": 800 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 1.1023984964101772, + "learning_rate": 1.7711882817474922e-05, + "loss": 0.7844, + "mean_token_accuracy": 0.7683067978686096, + "step": 805 + }, + { + "epoch": 0.299722479185939, + "grad_norm": 1.0645123182011096, + "learning_rate": 1.7670587595599034e-05, + "loss": 0.8219, + "mean_token_accuracy": 0.7553624026227321, + "step": 810 + }, + { + "epoch": 0.301572617946346, + "grad_norm": 1.1201671845735144, + "learning_rate": 1.762897211795607e-05, + "loss": 0.7867, + "mean_token_accuracy": 0.7651262051167445, + "step": 815 + }, + { + "epoch": 0.303422756706753, + "grad_norm": 1.101865356005345, + "learning_rate": 1.758703812203961e-05, + "loss": 0.83, + "mean_token_accuracy": 0.7538541743928582, + "step": 820 + }, + { + "epoch": 0.30527289546716, + "grad_norm": 1.1897323943007336, + "learning_rate": 1.7544787358641735e-05, + "loss": 0.7851, + "mean_token_accuracy": 0.7680501468244437, + "step": 825 + }, + { + "epoch": 0.3071230342275671, + "grad_norm": 1.1199528427337382, + "learning_rate": 1.7502221591779932e-05, + "loss": 0.855, + "mean_token_accuracy": 0.7455190507520973, + "step": 830 + }, + { + "epoch": 0.3089731729879741, + "grad_norm": 1.1217544861851525, + "learning_rate": 1.7459342598623438e-05, + "loss": 0.8493, + "mean_token_accuracy": 0.750803485779012, + "step": 835 + }, + { + "epoch": 0.3108233117483811, + "grad_norm": 1.2082456384375413, + "learning_rate": 1.741615216941905e-05, + "loss": 0.8048, + "mean_token_accuracy": 0.7613722719486978, + "step": 840 + }, + { + "epoch": 0.31267345050878814, + "grad_norm": 1.1139734065035607, + "learning_rate": 1.7372652107416364e-05, + "loss": 0.8275, + "mean_token_accuracy": 0.7531808182011362, + "step": 845 + }, + { + "epoch": 0.3145235892691952, + "grad_norm": 1.0616818983145788, + "learning_rate": 1.7328844228792513e-05, + "loss": 0.795, + "mean_token_accuracy": 0.7628368017568732, + "step": 850 + }, + { + "epoch": 0.31637372802960223, + "grad_norm": 1.118378211316983, + "learning_rate": 1.7284730362576308e-05, + "loss": 0.8389, + "mean_token_accuracy": 0.749431517717695, + "step": 855 + }, + { + "epoch": 0.31822386679000925, + "grad_norm": 1.2147168769997703, + "learning_rate": 1.7240312350571905e-05, + "loss": 0.8457, + "mean_token_accuracy": 0.7498333040568483, + "step": 860 + }, + { + "epoch": 0.32007400555041626, + "grad_norm": 1.1514501221057487, + "learning_rate": 1.719559204728188e-05, + "loss": 0.8319, + "mean_token_accuracy": 0.7527944368243418, + "step": 865 + }, + { + "epoch": 0.32192414431082333, + "grad_norm": 1.1312696718695203, + "learning_rate": 1.715057131982983e-05, + "loss": 0.8272, + "mean_token_accuracy": 0.7545487021076067, + "step": 870 + }, + { + "epoch": 0.32377428307123035, + "grad_norm": 1.0441104385923894, + "learning_rate": 1.710525204788239e-05, + "loss": 0.7819, + "mean_token_accuracy": 0.7667960094189669, + "step": 875 + }, + { + "epoch": 0.32562442183163737, + "grad_norm": 1.04728122581024, + "learning_rate": 1.7059636123570767e-05, + "loss": 0.7762, + "mean_token_accuracy": 0.7683722808754585, + "step": 880 + }, + { + "epoch": 0.3274745605920444, + "grad_norm": 1.0276690700893418, + "learning_rate": 1.7013725451411757e-05, + "loss": 0.8034, + "mean_token_accuracy": 0.7601873594915107, + "step": 885 + }, + { + "epoch": 0.32932469935245146, + "grad_norm": 1.0999823202287056, + "learning_rate": 1.696752194822819e-05, + "loss": 0.8193, + "mean_token_accuracy": 0.7582367853568146, + "step": 890 + }, + { + "epoch": 0.33117483811285847, + "grad_norm": 1.0318057239413145, + "learning_rate": 1.692102754306895e-05, + "loss": 0.8079, + "mean_token_accuracy": 0.760669015194457, + "step": 895 + }, + { + "epoch": 0.3330249768732655, + "grad_norm": 1.066040702612707, + "learning_rate": 1.6874244177128395e-05, + "loss": 0.8428, + "mean_token_accuracy": 0.7511325668416905, + "step": 900 + }, + { + "epoch": 0.3330249768732655, + "eval_loss": 0.8337099552154541, + "eval_mean_token_accuracy": 0.7493726764189136, + "eval_runtime": 14.4963, + "eval_samples_per_second": 17.729, + "eval_steps_per_second": 2.276, + "step": 900 + }, + { + "epoch": 0.3348751156336725, + "grad_norm": 1.2073660630270728, + "learning_rate": 1.6827173803665328e-05, + "loss": 0.8192, + "mean_token_accuracy": 0.7558010505833733, + "step": 905 + }, + { + "epoch": 0.3367252543940796, + "grad_norm": 1.0066724260580413, + "learning_rate": 1.677981838792144e-05, + "loss": 0.7509, + "mean_token_accuracy": 0.7760858333288295, + "step": 910 + }, + { + "epoch": 0.3385753931544866, + "grad_norm": 1.0873672703647934, + "learning_rate": 1.6732179907039266e-05, + "loss": 0.8023, + "mean_token_accuracy": 0.7624104628372333, + "step": 915 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 1.0927982564093497, + "learning_rate": 1.6684260349979637e-05, + "loss": 0.8102, + "mean_token_accuracy": 0.7596703278938007, + "step": 920 + }, + { + "epoch": 0.3422756706753006, + "grad_norm": 1.0438601571925779, + "learning_rate": 1.6636061717438626e-05, + "loss": 0.8444, + "mean_token_accuracy": 0.7500523920314406, + "step": 925 + }, + { + "epoch": 0.3441258094357077, + "grad_norm": 1.03761659220563, + "learning_rate": 1.6587586021764022e-05, + "loss": 0.8146, + "mean_token_accuracy": 0.7571756383493733, + "step": 930 + }, + { + "epoch": 0.3459759481961147, + "grad_norm": 1.149891645832732, + "learning_rate": 1.653883528687133e-05, + "loss": 0.8237, + "mean_token_accuracy": 0.7542222854402241, + "step": 935 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 1.0403056302346276, + "learning_rate": 1.6489811548159245e-05, + "loss": 0.8256, + "mean_token_accuracy": 0.7557833181176377, + "step": 940 + }, + { + "epoch": 0.34967622571692875, + "grad_norm": 1.122265255005708, + "learning_rate": 1.6440516852424678e-05, + "loss": 0.8364, + "mean_token_accuracy": 0.7510874485776355, + "step": 945 + }, + { + "epoch": 0.3515263644773358, + "grad_norm": 1.045513608651206, + "learning_rate": 1.6390953257777324e-05, + "loss": 0.8127, + "mean_token_accuracy": 0.7589744589577777, + "step": 950 + }, + { + "epoch": 0.35337650323774283, + "grad_norm": 1.009257026361363, + "learning_rate": 1.634112283355369e-05, + "loss": 0.7776, + "mean_token_accuracy": 0.7676538111386327, + "step": 955 + }, + { + "epoch": 0.35522664199814985, + "grad_norm": 1.0720659597187137, + "learning_rate": 1.6291027660230735e-05, + "loss": 0.7853, + "mean_token_accuracy": 0.7667755959414551, + "step": 960 + }, + { + "epoch": 0.35707678075855687, + "grad_norm": 1.053524622146603, + "learning_rate": 1.6240669829338992e-05, + "loss": 0.7963, + "mean_token_accuracy": 0.7637186102490118, + "step": 965 + }, + { + "epoch": 0.35892691951896394, + "grad_norm": 1.0174159552298219, + "learning_rate": 1.6190051443375248e-05, + "loss": 0.8358, + "mean_token_accuracy": 0.7525469047361402, + "step": 970 + }, + { + "epoch": 0.36077705827937095, + "grad_norm": 1.0466584691710625, + "learning_rate": 1.6139174615714753e-05, + "loss": 0.8067, + "mean_token_accuracy": 0.7588450382789459, + "step": 975 + }, + { + "epoch": 0.36262719703977797, + "grad_norm": 1.1939869387435516, + "learning_rate": 1.6088041470523005e-05, + "loss": 0.7868, + "mean_token_accuracy": 0.7671158680263706, + "step": 980 + }, + { + "epoch": 0.364477335800185, + "grad_norm": 1.0140874943434712, + "learning_rate": 1.6036654142667043e-05, + "loss": 0.8287, + "mean_token_accuracy": 0.7542665733067967, + "step": 985 + }, + { + "epoch": 0.36632747456059206, + "grad_norm": 1.1052963609589759, + "learning_rate": 1.598501477762632e-05, + "loss": 0.7945, + "mean_token_accuracy": 0.762442251347988, + "step": 990 + }, + { + "epoch": 0.3681776133209991, + "grad_norm": 1.1225438733147488, + "learning_rate": 1.5933125531403135e-05, + "loss": 0.7976, + "mean_token_accuracy": 0.7595720683997148, + "step": 995 + }, + { + "epoch": 0.3700277520814061, + "grad_norm": 1.1468640757000927, + "learning_rate": 1.5880988570432603e-05, + "loss": 0.8051, + "mean_token_accuracy": 0.7600218442700781, + "step": 1000 + }, + { + "epoch": 0.3700277520814061, + "eval_loss": 0.828618049621582, + "eval_mean_token_accuracy": 0.7514673880622127, + "eval_runtime": 14.4755, + "eval_samples_per_second": 17.754, + "eval_steps_per_second": 2.28, + "step": 1000 + }, + { + "epoch": 0.37187789084181316, + "grad_norm": 1.00080177397972, + "learning_rate": 1.582860607149222e-05, + "loss": 0.8066, + "mean_token_accuracy": 0.760187943447982, + "step": 1005 + }, + { + "epoch": 0.3737280296022202, + "grad_norm": 1.2270098294318985, + "learning_rate": 1.5775980221610966e-05, + "loss": 0.8275, + "mean_token_accuracy": 0.7535905693809746, + "step": 1010 + }, + { + "epoch": 0.3755781683626272, + "grad_norm": 1.0596082781963458, + "learning_rate": 1.5723113217978e-05, + "loss": 0.8523, + "mean_token_accuracy": 0.7485610934984221, + "step": 1015 + }, + { + "epoch": 0.3774283071230342, + "grad_norm": 1.1517077091048888, + "learning_rate": 1.567000726785093e-05, + "loss": 0.7439, + "mean_token_accuracy": 0.7757078628646628, + "step": 1020 + }, + { + "epoch": 0.3792784458834413, + "grad_norm": 1.066020218096283, + "learning_rate": 1.561666458846365e-05, + "loss": 0.823, + "mean_token_accuracy": 0.7543368295786522, + "step": 1025 + }, + { + "epoch": 0.3811285846438483, + "grad_norm": 1.078122677532476, + "learning_rate": 1.5563087406933762e-05, + "loss": 0.7801, + "mean_token_accuracy": 0.7680241706166504, + "step": 1030 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 1.181831244610961, + "learning_rate": 1.550927796016961e-05, + "loss": 0.8363, + "mean_token_accuracy": 0.7506894750806388, + "step": 1035 + }, + { + "epoch": 0.38482886216466233, + "grad_norm": 1.1407270376899543, + "learning_rate": 1.5455238494776876e-05, + "loss": 0.7975, + "mean_token_accuracy": 0.7636928417835203, + "step": 1040 + }, + { + "epoch": 0.3866790009250694, + "grad_norm": 1.0646834300432384, + "learning_rate": 1.5400971266964772e-05, + "loss": 0.8461, + "mean_token_accuracy": 0.746257889771093, + "step": 1045 + }, + { + "epoch": 0.3885291396854764, + "grad_norm": 1.2297587305238908, + "learning_rate": 1.5346478542451862e-05, + "loss": 0.8214, + "mean_token_accuracy": 0.7562584430078917, + "step": 1050 + }, + { + "epoch": 0.39037927844588344, + "grad_norm": 1.066922606928583, + "learning_rate": 1.529176259637145e-05, + "loss": 0.825, + "mean_token_accuracy": 0.7546342425711803, + "step": 1055 + }, + { + "epoch": 0.39222941720629045, + "grad_norm": 1.118470210724078, + "learning_rate": 1.5236825713176584e-05, + "loss": 0.7984, + "mean_token_accuracy": 0.7601037488835896, + "step": 1060 + }, + { + "epoch": 0.3940795559666975, + "grad_norm": 1.0710868354595562, + "learning_rate": 1.5181670186544706e-05, + "loss": 0.8138, + "mean_token_accuracy": 0.7581725142503944, + "step": 1065 + }, + { + "epoch": 0.39592969472710454, + "grad_norm": 1.0417039842592677, + "learning_rate": 1.5126298319281859e-05, + "loss": 0.7961, + "mean_token_accuracy": 0.7620123514408658, + "step": 1070 + }, + { + "epoch": 0.39777983348751156, + "grad_norm": 0.9792946688556422, + "learning_rate": 1.5070712423226552e-05, + "loss": 0.8076, + "mean_token_accuracy": 0.7576958564017893, + "step": 1075 + }, + { + "epoch": 0.3996299722479186, + "grad_norm": 1.151799495892101, + "learning_rate": 1.5014914819153252e-05, + "loss": 0.8785, + "mean_token_accuracy": 0.7392656154989696, + "step": 1080 + }, + { + "epoch": 0.40148011100832565, + "grad_norm": 1.0262886584793083, + "learning_rate": 1.4958907836675467e-05, + "loss": 0.84, + "mean_token_accuracy": 0.7505150443521262, + "step": 1085 + }, + { + "epoch": 0.40333024976873266, + "grad_norm": 1.0571774017386317, + "learning_rate": 1.490269381414849e-05, + "loss": 0.8005, + "mean_token_accuracy": 0.759138607473643, + "step": 1090 + }, + { + "epoch": 0.4051803885291397, + "grad_norm": 1.1562321316775137, + "learning_rate": 1.484627509857178e-05, + "loss": 0.7521, + "mean_token_accuracy": 0.7756223951675719, + "step": 1095 + }, + { + "epoch": 0.4070305272895467, + "grad_norm": 1.0059260254443942, + "learning_rate": 1.4789654045490957e-05, + "loss": 0.7926, + "mean_token_accuracy": 0.7648292854766792, + "step": 1100 + }, + { + "epoch": 0.4070305272895467, + "eval_loss": 0.826316773891449, + "eval_mean_token_accuracy": 0.7514545443756653, + "eval_runtime": 14.4858, + "eval_samples_per_second": 17.742, + "eval_steps_per_second": 2.278, + "step": 1100 + }, + { + "epoch": 0.40888066604995377, + "grad_norm": 1.1281079812673853, + "learning_rate": 1.4732833018899468e-05, + "loss": 0.7852, + "mean_token_accuracy": 0.7658304995600786, + "step": 1105 + }, + { + "epoch": 0.4107308048103608, + "grad_norm": 1.0509757723036814, + "learning_rate": 1.4675814391139875e-05, + "loss": 0.8119, + "mean_token_accuracy": 0.7600977820297594, + "step": 1110 + }, + { + "epoch": 0.4125809435707678, + "grad_norm": 1.0254027575377513, + "learning_rate": 1.4618600542804819e-05, + "loss": 0.7763, + "mean_token_accuracy": 0.7672811100974183, + "step": 1115 + }, + { + "epoch": 0.4144310823311748, + "grad_norm": 1.0746090703485092, + "learning_rate": 1.4561193862637621e-05, + "loss": 0.7846, + "mean_token_accuracy": 0.7671591115012933, + "step": 1120 + }, + { + "epoch": 0.4162812210915819, + "grad_norm": 1.0784001106094996, + "learning_rate": 1.4503596747432554e-05, + "loss": 0.7737, + "mean_token_accuracy": 0.7694953031634812, + "step": 1125 + }, + { + "epoch": 0.4181313598519889, + "grad_norm": 1.1144769441312155, + "learning_rate": 1.4445811601934763e-05, + "loss": 0.8121, + "mean_token_accuracy": 0.757591062790796, + "step": 1130 + }, + { + "epoch": 0.4199814986123959, + "grad_norm": 1.1495017827697938, + "learning_rate": 1.4387840838739875e-05, + "loss": 0.7679, + "mean_token_accuracy": 0.7707191227713955, + "step": 1135 + }, + { + "epoch": 0.42183163737280294, + "grad_norm": 1.0970458958738152, + "learning_rate": 1.4329686878193271e-05, + "loss": 0.7921, + "mean_token_accuracy": 0.7618300710496058, + "step": 1140 + }, + { + "epoch": 0.42368177613321, + "grad_norm": 1.125944103520406, + "learning_rate": 1.4271352148289025e-05, + "loss": 0.7915, + "mean_token_accuracy": 0.7624915671213047, + "step": 1145 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 1.0996089635549322, + "learning_rate": 1.421283908456854e-05, + "loss": 0.8106, + "mean_token_accuracy": 0.7572764643945054, + "step": 1150 + }, + { + "epoch": 0.42738205365402404, + "grad_norm": 1.0323320095671162, + "learning_rate": 1.4154150130018867e-05, + "loss": 0.8419, + "mean_token_accuracy": 0.7464537236125296, + "step": 1155 + }, + { + "epoch": 0.42923219241443106, + "grad_norm": 1.0451207752486222, + "learning_rate": 1.4095287734970678e-05, + "loss": 0.8162, + "mean_token_accuracy": 0.7566291627238348, + "step": 1160 + }, + { + "epoch": 0.43108233117483813, + "grad_norm": 1.1112325914861976, + "learning_rate": 1.4036254356996004e-05, + "loss": 0.8129, + "mean_token_accuracy": 0.7578010478041998, + "step": 1165 + }, + { + "epoch": 0.43293246993524515, + "grad_norm": 1.0755915207347257, + "learning_rate": 1.3977052460805597e-05, + "loss": 0.7514, + "mean_token_accuracy": 0.7767243268309277, + "step": 1170 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 1.1378919985608114, + "learning_rate": 1.3917684518146044e-05, + "loss": 0.8039, + "mean_token_accuracy": 0.7589061366555783, + "step": 1175 + }, + { + "epoch": 0.4366327474560592, + "grad_norm": 1.1093355007515011, + "learning_rate": 1.3858153007696552e-05, + "loss": 0.7856, + "mean_token_accuracy": 0.7651410472546954, + "step": 1180 + }, + { + "epoch": 0.43848288621646625, + "grad_norm": 1.0204708102703368, + "learning_rate": 1.3798460414965475e-05, + "loss": 0.8205, + "mean_token_accuracy": 0.7556113719187835, + "step": 1185 + }, + { + "epoch": 0.44033302497687327, + "grad_norm": 1.055824604206192, + "learning_rate": 1.3738609232186537e-05, + "loss": 0.7651, + "mean_token_accuracy": 0.7711720594349845, + "step": 1190 + }, + { + "epoch": 0.4421831637372803, + "grad_norm": 1.0002106504419832, + "learning_rate": 1.3678601958214779e-05, + "loss": 0.7594, + "mean_token_accuracy": 0.7735433606997186, + "step": 1195 + }, + { + "epoch": 0.4440333024976873, + "grad_norm": 1.0484148074536186, + "learning_rate": 1.3618441098422215e-05, + "loss": 0.7556, + "mean_token_accuracy": 0.7730822776646882, + "step": 1200 + }, + { + "epoch": 0.4440333024976873, + "eval_loss": 0.8204125761985779, + "eval_mean_token_accuracy": 0.7529794813990031, + "eval_runtime": 14.4918, + "eval_samples_per_second": 17.734, + "eval_steps_per_second": 2.277, + "step": 1200 + }, + { + "epoch": 0.44588344125809437, + "grad_norm": 1.025854901712246, + "learning_rate": 1.3558129164593256e-05, + "loss": 0.8021, + "mean_token_accuracy": 0.7578941696386275, + "step": 1205 + }, + { + "epoch": 0.4477335800185014, + "grad_norm": 1.0709353094940375, + "learning_rate": 1.349766867481982e-05, + "loss": 0.8531, + "mean_token_accuracy": 0.7470531638847778, + "step": 1210 + }, + { + "epoch": 0.4495837187789084, + "grad_norm": 1.0414762326940254, + "learning_rate": 1.3437062153396201e-05, + "loss": 0.8071, + "mean_token_accuracy": 0.7569709613904478, + "step": 1215 + }, + { + "epoch": 0.4514338575393155, + "grad_norm": 1.0214885126272337, + "learning_rate": 1.337631213071369e-05, + "loss": 0.764, + "mean_token_accuracy": 0.7723288900666085, + "step": 1220 + }, + { + "epoch": 0.4532839962997225, + "grad_norm": 1.07135136034917, + "learning_rate": 1.331542114315491e-05, + "loss": 0.7859, + "mean_token_accuracy": 0.7645849297209407, + "step": 1225 + }, + { + "epoch": 0.4551341350601295, + "grad_norm": 1.0549125503994212, + "learning_rate": 1.325439173298793e-05, + "loss": 0.7958, + "mean_token_accuracy": 0.7617891256843489, + "step": 1230 + }, + { + "epoch": 0.4569842738205365, + "grad_norm": 1.0843729859179476, + "learning_rate": 1.3193226448260128e-05, + "loss": 0.7725, + "mean_token_accuracy": 0.7679210242707134, + "step": 1235 + }, + { + "epoch": 0.4588344125809436, + "grad_norm": 1.0822901228618558, + "learning_rate": 1.3131927842691793e-05, + "loss": 0.7665, + "mean_token_accuracy": 0.7692661007270873, + "step": 1240 + }, + { + "epoch": 0.4606845513413506, + "grad_norm": 1.0067497554215914, + "learning_rate": 1.3070498475569507e-05, + "loss": 0.7642, + "mean_token_accuracy": 0.7691396007651154, + "step": 1245 + }, + { + "epoch": 0.46253469010175763, + "grad_norm": 0.9591591299731624, + "learning_rate": 1.3008940911639302e-05, + "loss": 0.7777, + "mean_token_accuracy": 0.7671482742714102, + "step": 1250 + }, + { + "epoch": 0.46438482886216464, + "grad_norm": 1.0616514614712778, + "learning_rate": 1.2947257720999577e-05, + "loss": 0.7949, + "mean_token_accuracy": 0.7624789293043532, + "step": 1255 + }, + { + "epoch": 0.4662349676225717, + "grad_norm": 1.0716580298720833, + "learning_rate": 1.2885451478993777e-05, + "loss": 0.7267, + "mean_token_accuracy": 0.7817810725827413, + "step": 1260 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 1.0913923316264977, + "learning_rate": 1.282352476610289e-05, + "loss": 0.7925, + "mean_token_accuracy": 0.7640803986874114, + "step": 1265 + }, + { + "epoch": 0.46993524514338575, + "grad_norm": 1.047733693205379, + "learning_rate": 1.2761480167837705e-05, + "loss": 0.7726, + "mean_token_accuracy": 0.7683362140882071, + "step": 1270 + }, + { + "epoch": 0.47178538390379277, + "grad_norm": 1.0871343657719972, + "learning_rate": 1.2699320274630847e-05, + "loss": 0.7722, + "mean_token_accuracy": 0.7672574330806243, + "step": 1275 + }, + { + "epoch": 0.47363552266419984, + "grad_norm": 0.9979897501587407, + "learning_rate": 1.263704768172864e-05, + "loss": 0.8175, + "mean_token_accuracy": 0.7557377432661949, + "step": 1280 + }, + { + "epoch": 0.47548566142460685, + "grad_norm": 1.035364791356174, + "learning_rate": 1.257466498908276e-05, + "loss": 0.8248, + "mean_token_accuracy": 0.7542804408382418, + "step": 1285 + }, + { + "epoch": 0.47733580018501387, + "grad_norm": 1.0852810997244706, + "learning_rate": 1.2512174801241657e-05, + "loss": 0.7599, + "mean_token_accuracy": 0.772593514843307, + "step": 1290 + }, + { + "epoch": 0.4791859389454209, + "grad_norm": 1.0204188573963087, + "learning_rate": 1.2449579727241834e-05, + "loss": 0.796, + "mean_token_accuracy": 0.7628381193423197, + "step": 1295 + }, + { + "epoch": 0.48103607770582796, + "grad_norm": 1.03685174943182, + "learning_rate": 1.2386882380498918e-05, + "loss": 0.7809, + "mean_token_accuracy": 0.7664953916089738, + "step": 1300 + }, + { + "epoch": 0.48103607770582796, + "eval_loss": 0.8149722814559937, + "eval_mean_token_accuracy": 0.7544001447993829, + "eval_runtime": 14.4881, + "eval_samples_per_second": 17.739, + "eval_steps_per_second": 2.278, + "step": 1300 + }, + { + "epoch": 0.482886216466235, + "grad_norm": 1.0267696469451673, + "learning_rate": 1.2324085378698529e-05, + "loss": 0.7787, + "mean_token_accuracy": 0.7656609239113119, + "step": 1305 + }, + { + "epoch": 0.484736355226642, + "grad_norm": 1.0524566424082218, + "learning_rate": 1.2261191343687e-05, + "loss": 0.7514, + "mean_token_accuracy": 0.7748276821387221, + "step": 1310 + }, + { + "epoch": 0.486586493987049, + "grad_norm": 1.1116838244568499, + "learning_rate": 1.219820290136192e-05, + "loss": 0.827, + "mean_token_accuracy": 0.7533423385611829, + "step": 1315 + }, + { + "epoch": 0.4884366327474561, + "grad_norm": 1.032214961442017, + "learning_rate": 1.2135122681562481e-05, + "loss": 0.7533, + "mean_token_accuracy": 0.7746545480291259, + "step": 1320 + }, + { + "epoch": 0.4902867715078631, + "grad_norm": 1.124005211518299, + "learning_rate": 1.2071953317959692e-05, + "loss": 0.8094, + "mean_token_accuracy": 0.7577481981855726, + "step": 1325 + }, + { + "epoch": 0.4921369102682701, + "grad_norm": 1.1191287250616673, + "learning_rate": 1.2008697447946421e-05, + "loss": 0.7923, + "mean_token_accuracy": 0.7603309763381085, + "step": 1330 + }, + { + "epoch": 0.4939870490286771, + "grad_norm": 1.0786256452269447, + "learning_rate": 1.1945357712527273e-05, + "loss": 0.7913, + "mean_token_accuracy": 0.761239890997093, + "step": 1335 + }, + { + "epoch": 0.4958371877890842, + "grad_norm": 1.0040727041593884, + "learning_rate": 1.1881936756208329e-05, + "loss": 0.7845, + "mean_token_accuracy": 0.7647461866808782, + "step": 1340 + }, + { + "epoch": 0.4976873265494912, + "grad_norm": 1.0691850596908117, + "learning_rate": 1.1818437226886738e-05, + "loss": 0.7567, + "mean_token_accuracy": 0.7705560374150549, + "step": 1345 + }, + { + "epoch": 0.49953746530989823, + "grad_norm": 1.1751805649138247, + "learning_rate": 1.1754861775740163e-05, + "loss": 0.777, + "mean_token_accuracy": 0.7657378225766951, + "step": 1350 + }, + { + "epoch": 0.5013876040703052, + "grad_norm": 1.0792717001597631, + "learning_rate": 1.1691213057116082e-05, + "loss": 0.8062, + "mean_token_accuracy": 0.759409702403176, + "step": 1355 + }, + { + "epoch": 0.5032377428307123, + "grad_norm": 1.016376817901578, + "learning_rate": 1.1627493728420978e-05, + "loss": 0.769, + "mean_token_accuracy": 0.7668364587211491, + "step": 1360 + }, + { + "epoch": 0.5050878815911193, + "grad_norm": 1.1776703603507872, + "learning_rate": 1.1563706450009391e-05, + "loss": 0.7556, + "mean_token_accuracy": 0.7745420812091514, + "step": 1365 + }, + { + "epoch": 0.5069380203515264, + "grad_norm": 1.0228564368804778, + "learning_rate": 1.1499853885072827e-05, + "loss": 0.8494, + "mean_token_accuracy": 0.7474684151475192, + "step": 1370 + }, + { + "epoch": 0.5087881591119334, + "grad_norm": 0.9892765969730934, + "learning_rate": 1.1435938699528586e-05, + "loss": 0.7703, + "mean_token_accuracy": 0.7691140797000647, + "step": 1375 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 1.1123282748747376, + "learning_rate": 1.137196356190845e-05, + "loss": 0.7801, + "mean_token_accuracy": 0.7656304388228795, + "step": 1380 + }, + { + "epoch": 0.5124884366327475, + "grad_norm": 1.0163434815001926, + "learning_rate": 1.1307931143247268e-05, + "loss": 0.7803, + "mean_token_accuracy": 0.7622797743389912, + "step": 1385 + }, + { + "epoch": 0.5143385753931545, + "grad_norm": 1.022862711724792, + "learning_rate": 1.1243844116971433e-05, + "loss": 0.7861, + "mean_token_accuracy": 0.7635373929602338, + "step": 1390 + }, + { + "epoch": 0.5161887141535615, + "grad_norm": 1.1006515999892494, + "learning_rate": 1.1179705158787276e-05, + "loss": 0.8005, + "mean_token_accuracy": 0.7602463102192736, + "step": 1395 + }, + { + "epoch": 0.5180388529139686, + "grad_norm": 1.0908918809370063, + "learning_rate": 1.1115516946569333e-05, + "loss": 0.7609, + "mean_token_accuracy": 0.7711215724879116, + "step": 1400 + }, + { + "epoch": 0.5180388529139686, + "eval_loss": 0.811427891254425, + "eval_mean_token_accuracy": 0.7555099883424168, + "eval_runtime": 14.4895, + "eval_samples_per_second": 17.737, + "eval_steps_per_second": 2.278, + "step": 1400 + }, + { + "epoch": 0.5198889916743756, + "grad_norm": 0.9960952624740553, + "learning_rate": 1.105128216024857e-05, + "loss": 0.7531, + "mean_token_accuracy": 0.7742651560917937, + "step": 1405 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 1.0712181884851713, + "learning_rate": 1.0987003481700456e-05, + "loss": 0.8263, + "mean_token_accuracy": 0.7548726084948635, + "step": 1410 + }, + { + "epoch": 0.5235892691951897, + "grad_norm": 0.962587858503466, + "learning_rate": 1.092268359463302e-05, + "loss": 0.7802, + "mean_token_accuracy": 0.7654979070176253, + "step": 1415 + }, + { + "epoch": 0.5254394079555966, + "grad_norm": 1.0314955342754693, + "learning_rate": 1.0858325184474796e-05, + "loss": 0.7972, + "mean_token_accuracy": 0.760485619264745, + "step": 1420 + }, + { + "epoch": 0.5272895467160037, + "grad_norm": 1.1346549310208, + "learning_rate": 1.0793930938262689e-05, + "loss": 0.7696, + "mean_token_accuracy": 0.7693947644435266, + "step": 1425 + }, + { + "epoch": 0.5291396854764108, + "grad_norm": 1.0216019550710123, + "learning_rate": 1.0729503544529814e-05, + "loss": 0.7727, + "mean_token_accuracy": 0.768932178121201, + "step": 1430 + }, + { + "epoch": 0.5309898242368177, + "grad_norm": 0.9472129400506908, + "learning_rate": 1.0665045693193226e-05, + "loss": 0.7671, + "mean_token_accuracy": 0.7700756794971265, + "step": 1435 + }, + { + "epoch": 0.5328399629972248, + "grad_norm": 1.1032367818548314, + "learning_rate": 1.0600560075441617e-05, + "loss": 0.8192, + "mean_token_accuracy": 0.7533493995313415, + "step": 1440 + }, + { + "epoch": 0.5346901017576319, + "grad_norm": 1.0279974465073964, + "learning_rate": 1.0536049383622966e-05, + "loss": 0.7394, + "mean_token_accuracy": 0.7777864795666072, + "step": 1445 + }, + { + "epoch": 0.5365402405180388, + "grad_norm": 1.1091512699477888, + "learning_rate": 1.047151631113212e-05, + "loss": 0.8159, + "mean_token_accuracy": 0.7563010582513655, + "step": 1450 + }, + { + "epoch": 0.5383903792784459, + "grad_norm": 0.982874751224346, + "learning_rate": 1.0406963552298332e-05, + "loss": 0.7687, + "mean_token_accuracy": 0.7672807195200553, + "step": 1455 + }, + { + "epoch": 0.5402405180388529, + "grad_norm": 1.011002173538521, + "learning_rate": 1.034239380227281e-05, + "loss": 0.8038, + "mean_token_accuracy": 0.757725049607814, + "step": 1460 + }, + { + "epoch": 0.5420906567992599, + "grad_norm": 1.068879088124628, + "learning_rate": 1.0277809756916134e-05, + "loss": 0.8491, + "mean_token_accuracy": 0.7446647560185832, + "step": 1465 + }, + { + "epoch": 0.543940795559667, + "grad_norm": 1.080205359496898, + "learning_rate": 1.0213214112685747e-05, + "loss": 0.7689, + "mean_token_accuracy": 0.7677916620774836, + "step": 1470 + }, + { + "epoch": 0.545790934320074, + "grad_norm": 1.1451803451542424, + "learning_rate": 1.0148609566523358e-05, + "loss": 0.7677, + "mean_token_accuracy": 0.7704958522545351, + "step": 1475 + }, + { + "epoch": 0.547641073080481, + "grad_norm": 1.057942901385822, + "learning_rate": 1.0083998815742335e-05, + "loss": 0.83, + "mean_token_accuracy": 0.7525314188953106, + "step": 1480 + }, + { + "epoch": 0.5494912118408881, + "grad_norm": 1.066590261657824, + "learning_rate": 1.0019384557915099e-05, + "loss": 0.7675, + "mean_token_accuracy": 0.7684333982108851, + "step": 1485 + }, + { + "epoch": 0.5513413506012951, + "grad_norm": 1.0460321794792906, + "learning_rate": 9.9547694907605e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7660815749985537, + "step": 1490 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 1.1261446087683922, + "learning_rate": 9.890156312031165e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7692461806199885, + "step": 1495 + }, + { + "epoch": 0.5550416281221091, + "grad_norm": 0.9986543574992506, + "learning_rate": 9.825547719400889e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7622542095855191, + "step": 1500 + }, + { + "epoch": 0.5550416281221091, + "eval_loss": 0.8071566820144653, + "eval_mean_token_accuracy": 0.7552991469731356, + "eval_runtime": 14.4774, + "eval_samples_per_second": 17.752, + "eval_steps_per_second": 2.279, + "step": 1500 + }, + { + "epoch": 0.5568917668825162, + "grad_norm": 1.0151608706490083, + "learning_rate": 9.760946410351988e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7580960402877259, + "step": 1505 + }, + { + "epoch": 0.5587419056429233, + "grad_norm": 1.1431246375821682, + "learning_rate": 9.696355082062679e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7582158059106736, + "step": 1510 + }, + { + "epoch": 0.5605920444033302, + "grad_norm": 1.1366756392969894, + "learning_rate": 9.631776431294475e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7604761415631306, + "step": 1515 + }, + { + "epoch": 0.5624421831637373, + "grad_norm": 1.064426561745209, + "learning_rate": 9.567213154279582e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7658801850249978, + "step": 1520 + }, + { + "epoch": 0.5642923219241444, + "grad_norm": 1.2206940490140434, + "learning_rate": 9.502667946608332e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7693527271718168, + "step": 1525 + }, + { + "epoch": 0.5661424606845513, + "grad_norm": 1.061867256987294, + "learning_rate": 9.43814350311666e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7639054903302552, + "step": 1530 + }, + { + "epoch": 0.5679925994449584, + "grad_norm": 1.0773596589419776, + "learning_rate": 9.37364251777355e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.762305142002752, + "step": 1535 + }, + { + "epoch": 0.5698427382053654, + "grad_norm": 0.9305860721255377, + "learning_rate": 9.309167683568597e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7730082687058873, + "step": 1540 + }, + { + "epoch": 0.5716928769657724, + "grad_norm": 1.0438568368975347, + "learning_rate": 9.244721692399545e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7685756377740635, + "step": 1545 + }, + { + "epoch": 0.5735430157261795, + "grad_norm": 1.0417179823764866, + "learning_rate": 9.180307234959918e-06, + "loss": 0.7809, + "mean_token_accuracy": 0.7646893641926666, + "step": 1550 + }, + { + "epoch": 0.5753931544865865, + "grad_norm": 1.2021420430572476, + "learning_rate": 9.115927000626665e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7579319671140221, + "step": 1555 + }, + { + "epoch": 0.5772432932469935, + "grad_norm": 0.9807147679471476, + "learning_rate": 9.051583677347879e-06, + "loss": 0.7582, + "mean_token_accuracy": 0.7715240766674182, + "step": 1560 + }, + { + "epoch": 0.5790934320074006, + "grad_norm": 1.112301651376973, + "learning_rate": 8.987279951530586e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7612853963155312, + "step": 1565 + }, + { + "epoch": 0.5809435707678076, + "grad_norm": 0.9953893253844242, + "learning_rate": 8.923018507928564e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7674813105116938, + "step": 1570 + }, + { + "epoch": 0.5827937095282146, + "grad_norm": 1.2330863209065388, + "learning_rate": 8.85880202953026e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7580528204819157, + "step": 1575 + }, + { + "epoch": 0.5846438482886216, + "grad_norm": 0.9732935412122807, + "learning_rate": 8.79463319744677e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7667603675581345, + "step": 1580 + }, + { + "epoch": 0.5864939870490287, + "grad_norm": 1.1106291605099863, + "learning_rate": 8.730514690799916e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7612764600013756, + "step": 1585 + }, + { + "epoch": 0.5883441258094357, + "grad_norm": 1.0738935746625857, + "learning_rate": 8.666449186610353e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7642549505626682, + "step": 1590 + }, + { + "epoch": 0.5901942645698427, + "grad_norm": 1.048630302618336, + "learning_rate": 8.60243935968585e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7461853124183249, + "step": 1595 + }, + { + "epoch": 0.5920444033302498, + "grad_norm": 1.0343551648098699, + "learning_rate": 8.538487882509568e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7662510960595321, + "step": 1600 + }, + { + "epoch": 0.5920444033302498, + "eval_loss": 0.8040695190429688, + "eval_mean_token_accuracy": 0.7570943422819563, + "eval_runtime": 14.4778, + "eval_samples_per_second": 17.751, + "eval_steps_per_second": 2.279, + "step": 1600 + }, + { + "epoch": 0.5938945420906568, + "grad_norm": 1.0961002826991237, + "learning_rate": 8.474597425128501e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7858780701540196, + "step": 1605 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 1.0240683040483451, + "learning_rate": 8.410770655042003e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.7792364544911828, + "step": 1610 + }, + { + "epoch": 0.5975948196114709, + "grad_norm": 1.0349492153500044, + "learning_rate": 8.347010237090408e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7691388439187106, + "step": 1615 + }, + { + "epoch": 0.599444958371878, + "grad_norm": 1.0054589625531773, + "learning_rate": 8.283318833343773e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7662995637411906, + "step": 1620 + }, + { + "epoch": 0.6012950971322849, + "grad_norm": 1.1881406299373063, + "learning_rate": 8.219699102990735e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7612431169828326, + "step": 1625 + }, + { + "epoch": 0.603145235892692, + "grad_norm": 1.0560921557847984, + "learning_rate": 8.156153702227484e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7761245925311822, + "step": 1630 + }, + { + "epoch": 0.6049953746530989, + "grad_norm": 1.034438128134003, + "learning_rate": 8.092685284146865e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7680173066432896, + "step": 1635 + }, + { + "epoch": 0.606845513413506, + "grad_norm": 1.0572450811760434, + "learning_rate": 8.029296498627608e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7613668976017041, + "step": 1640 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 1.0584211857192012, + "learning_rate": 7.965989992223693e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7641754527609304, + "step": 1645 + }, + { + "epoch": 0.61054579093432, + "grad_norm": 1.1010535461787674, + "learning_rate": 7.90276840805385e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7607485404059696, + "step": 1650 + }, + { + "epoch": 0.6123959296947271, + "grad_norm": 0.9641280597909674, + "learning_rate": 7.839634385691214e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7601345706529836, + "step": 1655 + }, + { + "epoch": 0.6142460684551342, + "grad_norm": 1.0258649015427124, + "learning_rate": 7.776590561053117e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7634307977874382, + "step": 1660 + }, + { + "epoch": 0.6160962072155411, + "grad_norm": 0.8915240330728268, + "learning_rate": 7.713639566291028e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.7752575802558226, + "step": 1665 + }, + { + "epoch": 0.6179463459759482, + "grad_norm": 1.0798487335720899, + "learning_rate": 7.650784029680662e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7706145475575775, + "step": 1670 + }, + { + "epoch": 0.6197964847363552, + "grad_norm": 1.066207862211954, + "learning_rate": 7.58802657551225e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7657687551987788, + "step": 1675 + }, + { + "epoch": 0.6216466234967623, + "grad_norm": 1.0729408640073785, + "learning_rate": 7.52536982398097e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7756252876469019, + "step": 1680 + }, + { + "epoch": 0.6234967622571693, + "grad_norm": 0.9892530959877567, + "learning_rate": 7.46281639107755e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7692424630264253, + "step": 1685 + }, + { + "epoch": 0.6253469010175763, + "grad_norm": 1.0863365296832108, + "learning_rate": 7.400368888479048e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7710561072765569, + "step": 1690 + }, + { + "epoch": 0.6271970397779834, + "grad_norm": 1.0219625056615904, + "learning_rate": 7.3380299234398076e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.7619297248431736, + "step": 1695 + }, + { + "epoch": 0.6290471785383904, + "grad_norm": 1.059974746495433, + "learning_rate": 7.275802098682612e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7626931188688311, + "step": 1700 + }, + { + "epoch": 0.6290471785383904, + "eval_loss": 0.8005915880203247, + "eval_mean_token_accuracy": 0.7581766535594516, + "eval_runtime": 14.4898, + "eval_samples_per_second": 17.737, + "eval_steps_per_second": 2.277, + "step": 1700 + }, + { + "epoch": 0.6308973172987974, + "grad_norm": 0.9799431584372903, + "learning_rate": 7.213688012290004e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7774764007386381, + "step": 1705 + }, + { + "epoch": 0.6327474560592045, + "grad_norm": 1.0000745192700642, + "learning_rate": 7.151690257595826e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.780328525900085, + "step": 1710 + }, + { + "epoch": 0.6345975948196114, + "grad_norm": 1.0126846740497386, + "learning_rate": 7.089811423076936e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7583421822590737, + "step": 1715 + }, + { + "epoch": 0.6364477335800185, + "grad_norm": 1.0000203186518826, + "learning_rate": 7.028054092245134e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7628886754547157, + "step": 1720 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 0.9650765115575336, + "learning_rate": 6.966420843539321e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7618325160379136, + "step": 1725 + }, + { + "epoch": 0.6401480111008325, + "grad_norm": 1.1005970343406948, + "learning_rate": 6.90491425021781e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7678248186075521, + "step": 1730 + }, + { + "epoch": 0.6419981498612396, + "grad_norm": 1.0505455182882948, + "learning_rate": 6.843536880250914e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7689772447519609, + "step": 1735 + }, + { + "epoch": 0.6438482886216467, + "grad_norm": 0.9898877515789932, + "learning_rate": 6.7822912962137225e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7783401565473735, + "step": 1740 + }, + { + "epoch": 0.6456984273820536, + "grad_norm": 1.0307728197070343, + "learning_rate": 6.721180055179113e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7834734571304314, + "step": 1745 + }, + { + "epoch": 0.6475485661424607, + "grad_norm": 1.0217053947622021, + "learning_rate": 6.660205708610987e-06, + "loss": 0.8081, + "mean_token_accuracy": 0.7579255638466, + "step": 1750 + }, + { + "epoch": 0.6493987049028677, + "grad_norm": 1.2605689757287866, + "learning_rate": 6.599370802257755e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.7777790478140751, + "step": 1755 + }, + { + "epoch": 0.6512488436632747, + "grad_norm": 1.01099031383984, + "learning_rate": 6.5386778760460316e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7716764250341648, + "step": 1760 + }, + { + "epoch": 0.6530989824236818, + "grad_norm": 1.2715805643074796, + "learning_rate": 6.478129463974598e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.7558262398936096, + "step": 1765 + }, + { + "epoch": 0.6549491211840888, + "grad_norm": 0.9662190840752783, + "learning_rate": 6.417728094008613e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7687993838083231, + "step": 1770 + }, + { + "epoch": 0.6567992599444958, + "grad_norm": 1.010012007940517, + "learning_rate": 6.357476287974051e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7674187685491771, + "step": 1775 + }, + { + "epoch": 0.6586493987049029, + "grad_norm": 0.989995878607444, + "learning_rate": 6.297376561452428e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7707717495376883, + "step": 1780 + }, + { + "epoch": 0.6604995374653099, + "grad_norm": 1.0965967826776033, + "learning_rate": 6.237431423675764e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7600581895253644, + "step": 1785 + }, + { + "epoch": 0.6623496762257169, + "grad_norm": 1.0944956142895659, + "learning_rate": 6.177643377421827e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7690723528400425, + "step": 1790 + }, + { + "epoch": 0.6641998149861239, + "grad_norm": 0.9484250663966232, + "learning_rate": 6.118014918909633e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7624083755642939, + "step": 1795 + }, + { + "epoch": 0.666049953746531, + "grad_norm": 1.04260067541159, + "learning_rate": 6.058548537695225e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7818991059910552, + "step": 1800 + }, + { + "epoch": 0.666049953746531, + "eval_loss": 0.7974567413330078, + "eval_mean_token_accuracy": 0.758648402989306, + "eval_runtime": 14.493, + "eval_samples_per_second": 17.733, + "eval_steps_per_second": 2.277, + "step": 1800 + }, + { + "epoch": 0.667900092506938, + "grad_norm": 1.041018059945966, + "learning_rate": 5.999246716567737e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7686837663567166, + "step": 1805 + }, + { + "epoch": 0.669750231267345, + "grad_norm": 1.041455209455465, + "learning_rate": 5.940111931445731e-06, + "loss": 0.739, + "mean_token_accuracy": 0.776196068889061, + "step": 1810 + }, + { + "epoch": 0.6716003700277521, + "grad_norm": 1.0276645002232094, + "learning_rate": 5.881146651273825e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.776803086081887, + "step": 1815 + }, + { + "epoch": 0.6734505087881592, + "grad_norm": 0.9890391264194114, + "learning_rate": 5.822353337919616e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7800533235185135, + "step": 1820 + }, + { + "epoch": 0.6753006475485661, + "grad_norm": 0.9915118541663397, + "learning_rate": 5.763734446070892e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7703118187370247, + "step": 1825 + }, + { + "epoch": 0.6771507863089732, + "grad_norm": 1.0111652906412603, + "learning_rate": 5.705292423133133e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7715821180476097, + "step": 1830 + }, + { + "epoch": 0.6790009250693803, + "grad_norm": 1.0250668165136037, + "learning_rate": 5.647029709127355e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7660108322883035, + "step": 1835 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 1.0048116407382357, + "learning_rate": 5.5889487365882065e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7675672052488259, + "step": 1840 + }, + { + "epoch": 0.6827012025901943, + "grad_norm": 1.0720723116253543, + "learning_rate": 5.531051930462437e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.761526189556413, + "step": 1845 + }, + { + "epoch": 0.6845513413506013, + "grad_norm": 1.2109050334319285, + "learning_rate": 5.4733417080076325e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7607614872634044, + "step": 1850 + }, + { + "epoch": 0.6864014801110083, + "grad_norm": 1.0066129717884702, + "learning_rate": 5.415820478691301e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7695679105630997, + "step": 1855 + }, + { + "epoch": 0.6882516188714154, + "grad_norm": 1.051999189563777, + "learning_rate": 5.358490644090263e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7590051863461479, + "step": 1860 + }, + { + "epoch": 0.6901017576318224, + "grad_norm": 1.0329924873076963, + "learning_rate": 5.3013545977904005e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7648981731466844, + "step": 1865 + }, + { + "epoch": 0.6919518963922294, + "grad_norm": 0.9946398062632457, + "learning_rate": 5.244414725286717e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7679705836852264, + "step": 1870 + }, + { + "epoch": 0.6938020351526365, + "grad_norm": 1.0125237652940526, + "learning_rate": 5.187673403883721e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7728285361093897, + "step": 1875 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 1.020579186254837, + "learning_rate": 5.131133002596199e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7709160057009022, + "step": 1880 + }, + { + "epoch": 0.6975023126734505, + "grad_norm": 1.0196295499189978, + "learning_rate": 5.074795882050293e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7631482519837886, + "step": 1885 + }, + { + "epoch": 0.6993524514338575, + "grad_norm": 1.0612455040071458, + "learning_rate": 5.018664394384942e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7663977047281036, + "step": 1890 + }, + { + "epoch": 0.7012025901942646, + "grad_norm": 0.9685884293761735, + "learning_rate": 4.9627408831536705e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7722706761571061, + "step": 1895 + }, + { + "epoch": 0.7030527289546716, + "grad_norm": 1.0928373482050855, + "learning_rate": 4.907027683226761e-06, + "loss": 0.7578, + "mean_token_accuracy": 0.7715491047241985, + "step": 1900 + }, + { + "epoch": 0.7030527289546716, + "eval_loss": 0.7944484353065491, + "eval_mean_token_accuracy": 0.7586305880034768, + "eval_runtime": 14.4938, + "eval_samples_per_second": 17.732, + "eval_steps_per_second": 2.277, + "step": 1900 + }, + { + "epoch": 0.7049028677150786, + "grad_norm": 1.0859465932930563, + "learning_rate": 4.85152712069375e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7549385137308637, + "step": 1905 + }, + { + "epoch": 0.7067530064754857, + "grad_norm": 1.049513938085134, + "learning_rate": 4.7962415127663265e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.780447195112969, + "step": 1910 + }, + { + "epoch": 0.7086031452358927, + "grad_norm": 1.117548059960726, + "learning_rate": 4.74117316768158e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7721869332481018, + "step": 1915 + }, + { + "epoch": 0.7104532839962997, + "grad_norm": 1.11422278792922, + "learning_rate": 4.686324384605629e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7730883016038614, + "step": 1920 + }, + { + "epoch": 0.7123034227567068, + "grad_norm": 1.0027647488893963, + "learning_rate": 4.631697453537623e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7813702584856604, + "step": 1925 + }, + { + "epoch": 0.7141535615171137, + "grad_norm": 1.0253284112435188, + "learning_rate": 4.577294655214144e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7609428914317682, + "step": 1930 + }, + { + "epoch": 0.7160037002775208, + "grad_norm": 1.0086636261634596, + "learning_rate": 4.523118261013969e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.767739248153237, + "step": 1935 + }, + { + "epoch": 0.7178538390379279, + "grad_norm": 1.0039426377703728, + "learning_rate": 4.469170532863254e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7692549651896964, + "step": 1940 + }, + { + "epoch": 0.7197039777983348, + "grad_norm": 1.0467718283259275, + "learning_rate": 4.415453723141081e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.766137714726382, + "step": 1945 + }, + { + "epoch": 0.7215541165587419, + "grad_norm": 1.0204963769550388, + "learning_rate": 4.361970074585426e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7785238995989615, + "step": 1950 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 1.2478881992057826, + "learning_rate": 4.308721820199529e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.773089613662054, + "step": 1955 + }, + { + "epoch": 0.7252543940795559, + "grad_norm": 1.0157944912100592, + "learning_rate": 4.255711183158635e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7741391603914481, + "step": 1960 + }, + { + "epoch": 0.727104532839963, + "grad_norm": 1.1145800466349667, + "learning_rate": 4.2029403767172175e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7712003935996773, + "step": 1965 + }, + { + "epoch": 0.72895467160037, + "grad_norm": 1.0555285599879347, + "learning_rate": 4.150411604116531e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.7661195204415396, + "step": 1970 + }, + { + "epoch": 0.730804810360777, + "grad_norm": 1.039666306825263, + "learning_rate": 4.098127058492652e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.762705448864735, + "step": 1975 + }, + { + "epoch": 0.7326549491211841, + "grad_norm": 1.0884275336693787, + "learning_rate": 4.0460889227849e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.757057582191561, + "step": 1980 + }, + { + "epoch": 0.7345050878815911, + "grad_norm": 1.0878124608265338, + "learning_rate": 3.9942993696447045e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7696510981203281, + "step": 1985 + }, + { + "epoch": 0.7363552266419982, + "grad_norm": 1.0551187039104941, + "learning_rate": 3.942760561344877e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7746019033217356, + "step": 1990 + }, + { + "epoch": 0.7382053654024052, + "grad_norm": 1.0569964699563308, + "learning_rate": 3.891474649689362e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7723389486944285, + "step": 1995 + }, + { + "epoch": 0.7400555041628122, + "grad_norm": 1.06797998826191, + "learning_rate": 3.840443775923365e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.76490327037364, + "step": 2000 + }, + { + "epoch": 0.7400555041628122, + "eval_loss": 0.7914577126502991, + "eval_mean_token_accuracy": 0.759882789130325, + "eval_runtime": 14.4892, + "eval_samples_per_second": 17.737, + "eval_steps_per_second": 2.278, + "step": 2000 + }, + { + "epoch": 0.7419056429232193, + "grad_norm": 1.0858077301242162, + "learning_rate": 3.7896700706439826e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7624325181582925, + "step": 2005 + }, + { + "epoch": 0.7437557816836263, + "grad_norm": 1.0750538773831415, + "learning_rate": 3.7391556537112282e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7578645451713619, + "step": 2010 + }, + { + "epoch": 0.7456059204440333, + "grad_norm": 1.027208255024697, + "learning_rate": 3.6889026341595378e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7620649916785809, + "step": 2015 + }, + { + "epoch": 0.7474560592044404, + "grad_norm": 1.0246182177676837, + "learning_rate": 3.6389131101096953e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.777594919047721, + "step": 2020 + }, + { + "epoch": 0.7493061979648473, + "grad_norm": 1.2659873412226397, + "learning_rate": 3.5891891686812597e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7837638744388433, + "step": 2025 + }, + { + "epoch": 0.7511563367252544, + "grad_norm": 1.0688167050893478, + "learning_rate": 3.5397328859054138e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.779197990567462, + "step": 2030 + }, + { + "epoch": 0.7530064754856615, + "grad_norm": 1.0915733116492814, + "learning_rate": 3.490546326638273e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7582757575526667, + "step": 2035 + }, + { + "epoch": 0.7548566142460684, + "grad_norm": 1.0438375949270606, + "learning_rate": 3.441631544474705e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7618316070751303, + "step": 2040 + }, + { + "epoch": 0.7567067530064755, + "grad_norm": 1.0237992046246145, + "learning_rate": 3.3929905816625653e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.773186823181064, + "step": 2045 + }, + { + "epoch": 0.7585568917668826, + "grad_norm": 1.0256768895360044, + "learning_rate": 3.344625469017445e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7737045922742098, + "step": 2050 + }, + { + "epoch": 0.7604070305272895, + "grad_norm": 1.043374364393721, + "learning_rate": 3.2965382258378674e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7734598187393918, + "step": 2055 + }, + { + "epoch": 0.7622571692876966, + "grad_norm": 0.9804528106242374, + "learning_rate": 3.248730859821002e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.776697671440589, + "step": 2060 + }, + { + "epoch": 0.7641073080481036, + "grad_norm": 0.974395081360433, + "learning_rate": 3.2012053669788136e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7701240185117962, + "step": 2065 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 1.0363184652960642, + "learning_rate": 3.1539637315547524e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7762055215347891, + "step": 2070 + }, + { + "epoch": 0.7678075855689177, + "grad_norm": 0.98386995648262, + "learning_rate": 3.1070079259408934e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7652368736182762, + "step": 2075 + }, + { + "epoch": 0.7696577243293247, + "grad_norm": 1.0743727238303387, + "learning_rate": 3.0603399105955966e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7741191045729113, + "step": 2080 + }, + { + "epoch": 0.7715078630897317, + "grad_norm": 0.9698667856409672, + "learning_rate": 3.0139616339616394e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7750052290641862, + "step": 2085 + }, + { + "epoch": 0.7733580018501388, + "grad_norm": 1.006836944818379, + "learning_rate": 2.9678750323848893e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.771547904489535, + "step": 2090 + }, + { + "epoch": 0.7752081406105458, + "grad_norm": 1.0158248195838104, + "learning_rate": 2.922082030033446e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7739714202760842, + "step": 2095 + }, + { + "epoch": 0.7770582793709528, + "grad_norm": 1.0639645532739403, + "learning_rate": 2.8765845388172955e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7675081240734547, + "step": 2100 + }, + { + "epoch": 0.7770582793709528, + "eval_loss": 0.7892476320266724, + "eval_mean_token_accuracy": 0.7602525156843513, + "eval_runtime": 14.4946, + "eval_samples_per_second": 17.731, + "eval_steps_per_second": 2.277, + "step": 2100 + }, + { + "epoch": 0.7789084181313598, + "grad_norm": 1.0772175688563412, + "learning_rate": 2.831384458308518e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7666759174113675, + "step": 2105 + }, + { + "epoch": 0.7807585568917669, + "grad_norm": 0.9821594377540279, + "learning_rate": 2.7864836756619407e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7722864174809689, + "step": 2110 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 1.0706616998380452, + "learning_rate": 2.741884065536373e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7732205631957721, + "step": 2115 + }, + { + "epoch": 0.7844588344125809, + "grad_norm": 1.1395051421573679, + "learning_rate": 2.6975874900163223e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7609839531511644, + "step": 2120 + }, + { + "epoch": 0.786308973172988, + "grad_norm": 1.0617330782151873, + "learning_rate": 2.6535957985342653e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7740794699356759, + "step": 2125 + }, + { + "epoch": 0.788159111933395, + "grad_norm": 1.0170725813049966, + "learning_rate": 2.6099108277934105e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7698504790001218, + "step": 2130 + }, + { + "epoch": 0.790009250693802, + "grad_norm": 0.99005471423073, + "learning_rate": 2.5665344016910367e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7805393061106906, + "step": 2135 + }, + { + "epoch": 0.7918593894542091, + "grad_norm": 1.0209391952521114, + "learning_rate": 2.523468331242329e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.775106477758667, + "step": 2140 + }, + { + "epoch": 0.793709528214616, + "grad_norm": 0.9885771051658575, + "learning_rate": 2.4807144145047734e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7733637946562394, + "step": 2145 + }, + { + "epoch": 0.7955596669750231, + "grad_norm": 1.0754493810134182, + "learning_rate": 2.438274436503074e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7694866322167656, + "step": 2150 + }, + { + "epoch": 0.7974098057354302, + "grad_norm": 1.0498610421419157, + "learning_rate": 2.396150169154644e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7797983225533884, + "step": 2155 + }, + { + "epoch": 0.7992599444958371, + "grad_norm": 1.1169640190240655, + "learning_rate": 2.3543433711956197e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7754534172004726, + "step": 2160 + }, + { + "epoch": 0.8011100832562442, + "grad_norm": 1.0665839477262065, + "learning_rate": 2.3128557881074153e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7582482785725089, + "step": 2165 + }, + { + "epoch": 0.8029602220166513, + "grad_norm": 1.094468319473981, + "learning_rate": 2.271689152043873e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7621698155375765, + "step": 2170 + }, + { + "epoch": 0.8048103607770583, + "grad_norm": 1.0122297016704693, + "learning_rate": 2.230845181758928e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7664906481324298, + "step": 2175 + }, + { + "epoch": 0.8066604995374653, + "grad_norm": 0.9777139475173185, + "learning_rate": 2.1903255825348533e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7574765314021219, + "step": 2180 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 0.9962044531332499, + "learning_rate": 2.150132046111054e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7648124824177949, + "step": 2185 + }, + { + "epoch": 0.8103607770582794, + "grad_norm": 1.0509474901378228, + "learning_rate": 2.1102662506134506e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7704075349125244, + "step": 2190 + }, + { + "epoch": 0.8122109158186864, + "grad_norm": 1.0372725962020504, + "learning_rate": 2.0707298604843964e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7750629134489836, + "step": 2195 + }, + { + "epoch": 0.8140610545790934, + "grad_norm": 1.0656046493126763, + "learning_rate": 2.03152452641321e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7687188507342828, + "step": 2200 + }, + { + "epoch": 0.8140610545790934, + "eval_loss": 0.7875065207481384, + "eval_mean_token_accuracy": 0.7605228454614659, + "eval_runtime": 14.4851, + "eval_samples_per_second": 17.742, + "eval_steps_per_second": 2.278, + "step": 2200 + }, + { + "epoch": 0.8159111933395005, + "grad_norm": 0.9383804875481359, + "learning_rate": 1.9926518852672294e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7826616789995635, + "step": 2205 + }, + { + "epoch": 0.8177613320999075, + "grad_norm": 1.0531634024785752, + "learning_rate": 1.9541135600234917e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.7550138127754585, + "step": 2210 + }, + { + "epoch": 0.8196114708603145, + "grad_norm": 1.045243955730307, + "learning_rate": 1.9159111597009584e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.770212609280218, + "step": 2215 + }, + { + "epoch": 0.8214616096207216, + "grad_norm": 1.0405653171665286, + "learning_rate": 1.8780462792933473e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.767269228422944, + "step": 2220 + }, + { + "epoch": 0.8233117483811286, + "grad_norm": 1.0714416047042246, + "learning_rate": 1.8405204997025394e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7625043712111882, + "step": 2225 + }, + { + "epoch": 0.8251618871415356, + "grad_norm": 1.0358923918877083, + "learning_rate": 1.8033353876725578e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7688595431996983, + "step": 2230 + }, + { + "epoch": 0.8270120259019427, + "grad_norm": 1.019883136075587, + "learning_rate": 1.766492495724178e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7667773964962867, + "step": 2235 + }, + { + "epoch": 0.8288621646623496, + "grad_norm": 1.0460123665428231, + "learning_rate": 1.7299933620900945e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7729793590568864, + "step": 2240 + }, + { + "epoch": 0.8307123034227567, + "grad_norm": 1.1462416303495035, + "learning_rate": 1.6938395106507034e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7685725295799959, + "step": 2245 + }, + { + "epoch": 0.8325624421831638, + "grad_norm": 1.0353673556479475, + "learning_rate": 1.658032450870467e-06, + "loss": 0.77, + "mean_token_accuracy": 0.765378813232025, + "step": 2250 + }, + { + "epoch": 0.8344125809435707, + "grad_norm": 1.0450521447932095, + "learning_rate": 1.622573677734911e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7687499025709724, + "step": 2255 + }, + { + "epoch": 0.8362627197039778, + "grad_norm": 0.9719217219316074, + "learning_rate": 1.587464671688187e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7719135274258382, + "step": 2260 + }, + { + "epoch": 0.8381128584643849, + "grad_norm": 1.0161143824737915, + "learning_rate": 1.552706898571288e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7710960983212521, + "step": 2265 + }, + { + "epoch": 0.8399629972247918, + "grad_norm": 0.9953359793876579, + "learning_rate": 1.5183018095608138e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.7722743713679878, + "step": 2270 + }, + { + "epoch": 0.8418131359851989, + "grad_norm": 1.0279238774997308, + "learning_rate": 1.4842508411084145e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7592500544438198, + "step": 2275 + }, + { + "epoch": 0.8436632747456059, + "grad_norm": 1.041907930976154, + "learning_rate": 1.4505554148807954e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7543533069155867, + "step": 2280 + }, + { + "epoch": 0.845513413506013, + "grad_norm": 1.049466075813878, + "learning_rate": 1.4172169377003775e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7610983553855026, + "step": 2285 + }, + { + "epoch": 0.84736355226642, + "grad_norm": 0.9844681012566066, + "learning_rate": 1.3842368014865414e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.761373152582306, + "step": 2290 + }, + { + "epoch": 0.849213691026827, + "grad_norm": 0.9554974166436959, + "learning_rate": 1.3516163831975337e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7762473770772875, + "step": 2295 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 1.0788307387037936, + "learning_rate": 1.3193570447729642e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7737391528509198, + "step": 2300 + }, + { + "epoch": 0.851063829787234, + "eval_loss": 0.786279559135437, + "eval_mean_token_accuracy": 0.7609007886976703, + "eval_runtime": 14.4822, + "eval_samples_per_second": 17.746, + "eval_steps_per_second": 2.279, + "step": 2300 + }, + { + "epoch": 0.8529139685476411, + "grad_norm": 0.9496030655797715, + "learning_rate": 1.2874601330769488e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7729458755416608, + "step": 2305 + }, + { + "epoch": 0.8547641073080481, + "grad_norm": 1.0540165831902506, + "learning_rate": 1.255926979841876e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7739465308411136, + "step": 2310 + }, + { + "epoch": 0.8566142460684552, + "grad_norm": 1.0590440494105375, + "learning_rate": 1.224758901612796e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7670089847984997, + "step": 2315 + }, + { + "epoch": 0.8584643848288621, + "grad_norm": 1.0429337534318288, + "learning_rate": 1.1939571996924738e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7746519685787489, + "step": 2320 + }, + { + "epoch": 0.8603145235892692, + "grad_norm": 0.9828231148374663, + "learning_rate": 1.1635231600870334e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7791841223784906, + "step": 2325 + }, + { + "epoch": 0.8621646623496763, + "grad_norm": 0.9881991333824852, + "learning_rate": 1.1334580534522932e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7731870067732876, + "step": 2330 + }, + { + "epoch": 0.8640148011100832, + "grad_norm": 0.9422103377253418, + "learning_rate": 1.1037631350406874e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7747367954907216, + "step": 2335 + }, + { + "epoch": 0.8658649398704903, + "grad_norm": 1.0201931984649346, + "learning_rate": 1.0744396446488781e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.759772846423205, + "step": 2340 + }, + { + "epoch": 0.8677150786308974, + "grad_norm": 0.9812688426997548, + "learning_rate": 1.0454888065659775e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7756920132959128, + "step": 2345 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 1.0363730698488482, + "learning_rate": 1.0169118295224488e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.788842175974254, + "step": 2350 + }, + { + "epoch": 0.8714153561517114, + "grad_norm": 1.0697488384549165, + "learning_rate": 9.887099066396178e-07, + "loss": 0.7412, + "mean_token_accuracy": 0.7755874472438374, + "step": 2355 + }, + { + "epoch": 0.8732654949121184, + "grad_norm": 0.9449397856166111, + "learning_rate": 9.608842153798903e-07, + "loss": 0.7631, + "mean_token_accuracy": 0.770625945449995, + "step": 2360 + }, + { + "epoch": 0.8751156336725254, + "grad_norm": 1.1054468669780388, + "learning_rate": 9.33435917497556e-07, + "loss": 0.7587, + "mean_token_accuracy": 0.7747619995397745, + "step": 2365 + }, + { + "epoch": 0.8769657724329325, + "grad_norm": 1.169374546075903, + "learning_rate": 9.063661589903116e-07, + "loss": 0.8126, + "mean_token_accuracy": 0.7561669915486283, + "step": 2370 + }, + { + "epoch": 0.8788159111933395, + "grad_norm": 1.0018702365915755, + "learning_rate": 8.796760700513984e-07, + "loss": 0.7629, + "mean_token_accuracy": 0.7702714833424923, + "step": 2375 + }, + { + "epoch": 0.8806660499537465, + "grad_norm": 0.9690645637665382, + "learning_rate": 8.533667650224253e-07, + "loss": 0.7335, + "mean_token_accuracy": 0.7779256861906803, + "step": 2380 + }, + { + "epoch": 0.8825161887141536, + "grad_norm": 1.0558707914057475, + "learning_rate": 8.274393423468385e-07, + "loss": 0.8192, + "mean_token_accuracy": 0.7535938075696429, + "step": 2385 + }, + { + "epoch": 0.8843663274745606, + "grad_norm": 1.1196200478508793, + "learning_rate": 8.018948845240538e-07, + "loss": 0.7822, + "mean_token_accuracy": 0.7635259209833902, + "step": 2390 + }, + { + "epoch": 0.8862164662349676, + "grad_norm": 0.9895391633901707, + "learning_rate": 7.767344580642821e-07, + "loss": 0.7517, + "mean_token_accuracy": 0.7721421582525065, + "step": 2395 + }, + { + "epoch": 0.8880666049953746, + "grad_norm": 1.001217042444044, + "learning_rate": 7.519591134439753e-07, + "loss": 0.7353, + "mean_token_accuracy": 0.7765837436804288, + "step": 2400 + }, + { + "epoch": 0.8880666049953746, + "eval_loss": 0.7853637337684631, + "eval_mean_token_accuracy": 0.7611377265923632, + "eval_runtime": 14.4906, + "eval_samples_per_second": 17.736, + "eval_steps_per_second": 2.277, + "step": 2400 + }, + { + "epoch": 0.8899167437557817, + "grad_norm": 1.0252282529989225, + "learning_rate": 7.275698850619861e-07, + "loss": 0.7608, + "mean_token_accuracy": 0.7706756195265745, + "step": 2405 + }, + { + "epoch": 0.8917668825161887, + "grad_norm": 1.0489611217005643, + "learning_rate": 7.035677911963712e-07, + "loss": 0.7287, + "mean_token_accuracy": 0.7818988370989893, + "step": 2410 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 0.9632480520763403, + "learning_rate": 6.799538339618838e-07, + "loss": 0.7991, + "mean_token_accuracy": 0.7591850094088031, + "step": 2415 + }, + { + "epoch": 0.8954671600370028, + "grad_norm": 1.0227614202252435, + "learning_rate": 6.567289992681258e-07, + "loss": 0.7225, + "mean_token_accuracy": 0.7820670674240945, + "step": 2420 + }, + { + "epoch": 0.8973172987974098, + "grad_norm": 1.0427872949729369, + "learning_rate": 6.33894256778399e-07, + "loss": 0.7578, + "mean_token_accuracy": 0.770660680936283, + "step": 2425 + }, + { + "epoch": 0.8991674375578168, + "grad_norm": 0.978179909101089, + "learning_rate": 6.114505598692011e-07, + "loss": 0.7446, + "mean_token_accuracy": 0.7736356496562514, + "step": 2430 + }, + { + "epoch": 0.9010175763182239, + "grad_norm": 1.0599673999615982, + "learning_rate": 5.893988455904387e-07, + "loss": 0.7751, + "mean_token_accuracy": 0.7662434293080997, + "step": 2435 + }, + { + "epoch": 0.902867715078631, + "grad_norm": 1.0174154417223824, + "learning_rate": 5.677400346262918e-07, + "loss": 0.7931, + "mean_token_accuracy": 0.7629375605413742, + "step": 2440 + }, + { + "epoch": 0.9047178538390379, + "grad_norm": 0.9709016227578701, + "learning_rate": 5.464750312567835e-07, + "loss": 0.7715, + "mean_token_accuracy": 0.7673094289599657, + "step": 2445 + }, + { + "epoch": 0.906567992599445, + "grad_norm": 0.9946658696929017, + "learning_rate": 5.256047233200201e-07, + "loss": 0.7445, + "mean_token_accuracy": 0.7759183383070225, + "step": 2450 + }, + { + "epoch": 0.9084181313598519, + "grad_norm": 1.040396747175872, + "learning_rate": 5.051299821751254e-07, + "loss": 0.7747, + "mean_token_accuracy": 0.7681567411545749, + "step": 2455 + }, + { + "epoch": 0.910268270120259, + "grad_norm": 1.047875088044362, + "learning_rate": 4.850516626658585e-07, + "loss": 0.8075, + "mean_token_accuracy": 0.7572996835337105, + "step": 2460 + }, + { + "epoch": 0.9121184088806661, + "grad_norm": 1.0206254437948445, + "learning_rate": 4.653706030849214e-07, + "loss": 0.7964, + "mean_token_accuracy": 0.7601130625153626, + "step": 2465 + }, + { + "epoch": 0.913968547641073, + "grad_norm": 1.0080413383470523, + "learning_rate": 4.4608762513896455e-07, + "loss": 0.7417, + "mean_token_accuracy": 0.7754506330915906, + "step": 2470 + }, + { + "epoch": 0.9158186864014801, + "grad_norm": 0.9808393522971406, + "learning_rate": 4.2720353391427547e-07, + "loss": 0.7572, + "mean_token_accuracy": 0.7703957100508998, + "step": 2475 + }, + { + "epoch": 0.9176688251618872, + "grad_norm": 1.0192407246776254, + "learning_rate": 4.087191178431682e-07, + "loss": 0.7389, + "mean_token_accuracy": 0.7775585693030911, + "step": 2480 + }, + { + "epoch": 0.9195189639222942, + "grad_norm": 0.9439881468215712, + "learning_rate": 3.9063514867105914e-07, + "loss": 0.755, + "mean_token_accuracy": 0.7702673384466353, + "step": 2485 + }, + { + "epoch": 0.9213691026827012, + "grad_norm": 0.9754955523951211, + "learning_rate": 3.729523814242608e-07, + "loss": 0.759, + "mean_token_accuracy": 0.7722613161293765, + "step": 2490 + }, + { + "epoch": 0.9232192414431082, + "grad_norm": 1.010055772171327, + "learning_rate": 3.5567155437843725e-07, + "loss": 0.7383, + "mean_token_accuracy": 0.7762423133196154, + "step": 2495 + }, + { + "epoch": 0.9250693802035153, + "grad_norm": 1.0814553575890316, + "learning_rate": 3.3879338902779945e-07, + "loss": 0.7346, + "mean_token_accuracy": 0.7772283457999138, + "step": 2500 + }, + { + "epoch": 0.9250693802035153, + "eval_loss": 0.7848615050315857, + "eval_mean_token_accuracy": 0.7614065082708322, + "eval_runtime": 14.49, + "eval_samples_per_second": 17.736, + "eval_steps_per_second": 2.277, + "step": 2500 + }, + { + "epoch": 0.9269195189639223, + "grad_norm": 0.9684129516769776, + "learning_rate": 3.223185900549686e-07, + "loss": 0.7665, + "mean_token_accuracy": 0.7701124291962537, + "step": 2505 + }, + { + "epoch": 0.9287696577243293, + "grad_norm": 1.150171862104356, + "learning_rate": 3.0624784530156384e-07, + "loss": 0.7642, + "mean_token_accuracy": 0.7688744420479343, + "step": 2510 + }, + { + "epoch": 0.9306197964847364, + "grad_norm": 0.9492928908241531, + "learning_rate": 2.905818257394799e-07, + "loss": 0.7614, + "mean_token_accuracy": 0.766935504925393, + "step": 2515 + }, + { + "epoch": 0.9324699352451434, + "grad_norm": 1.0370729751691576, + "learning_rate": 2.753211854428728e-07, + "loss": 0.7777, + "mean_token_accuracy": 0.7641760171586149, + "step": 2520 + }, + { + "epoch": 0.9343200740055504, + "grad_norm": 1.0135796606721166, + "learning_rate": 2.604665615608526e-07, + "loss": 0.7897, + "mean_token_accuracy": 0.7614986125780634, + "step": 2525 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.9799977644627964, + "learning_rate": 2.460185742908816e-07, + "loss": 0.8043, + "mean_token_accuracy": 0.758147690134667, + "step": 2530 + }, + { + "epoch": 0.9380203515263644, + "grad_norm": 0.9767990150036658, + "learning_rate": 2.3197782685288385e-07, + "loss": 0.7773, + "mean_token_accuracy": 0.7650184388101581, + "step": 2535 + }, + { + "epoch": 0.9398704902867715, + "grad_norm": 0.9750790861478491, + "learning_rate": 2.1834490546405186e-07, + "loss": 0.7608, + "mean_token_accuracy": 0.7687009631669187, + "step": 2540 + }, + { + "epoch": 0.9417206290471786, + "grad_norm": 1.022732313156545, + "learning_rate": 2.0512037931437855e-07, + "loss": 0.7937, + "mean_token_accuracy": 0.7603656820388162, + "step": 2545 + }, + { + "epoch": 0.9435707678075855, + "grad_norm": 0.9855425389677217, + "learning_rate": 1.9230480054288958e-07, + "loss": 0.7439, + "mean_token_accuracy": 0.7760858998801707, + "step": 2550 + }, + { + "epoch": 0.9454209065679926, + "grad_norm": 0.9337760849128578, + "learning_rate": 1.7989870421459498e-07, + "loss": 0.7482, + "mean_token_accuracy": 0.7745016372562605, + "step": 2555 + }, + { + "epoch": 0.9472710453283997, + "grad_norm": 0.9836546027372839, + "learning_rate": 1.6790260829814053e-07, + "loss": 0.774, + "mean_token_accuracy": 0.7672312632782938, + "step": 2560 + }, + { + "epoch": 0.9491211840888066, + "grad_norm": 0.9936808945709743, + "learning_rate": 1.5631701364419492e-07, + "loss": 0.7766, + "mean_token_accuracy": 0.7658642650010227, + "step": 2565 + }, + { + "epoch": 0.9509713228492137, + "grad_norm": 1.0191286758955544, + "learning_rate": 1.4514240396452438e-07, + "loss": 0.7601, + "mean_token_accuracy": 0.7708545977410293, + "step": 2570 + }, + { + "epoch": 0.9528214616096207, + "grad_norm": 0.9999825388420481, + "learning_rate": 1.3437924581181205e-07, + "loss": 0.7894, + "mean_token_accuracy": 0.7644351630911664, + "step": 2575 + }, + { + "epoch": 0.9546716003700277, + "grad_norm": 1.0157083629158576, + "learning_rate": 1.2402798856016474e-07, + "loss": 0.8051, + "mean_token_accuracy": 0.7594893254401365, + "step": 2580 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 1.1327318894167577, + "learning_rate": 1.1408906438636236e-07, + "loss": 0.7898, + "mean_token_accuracy": 0.7622680369859076, + "step": 2585 + }, + { + "epoch": 0.9583718778908418, + "grad_norm": 1.021819944028649, + "learning_rate": 1.045628882518046e-07, + "loss": 0.7282, + "mean_token_accuracy": 0.7775804046072962, + "step": 2590 + }, + { + "epoch": 0.9602220166512488, + "grad_norm": 1.0034217793974385, + "learning_rate": 9.544985788519589e-08, + "loss": 0.7503, + "mean_token_accuracy": 0.77198131109448, + "step": 2595 + }, + { + "epoch": 0.9620721554116559, + "grad_norm": 1.0344424553167828, + "learning_rate": 8.675035376593088e-08, + "loss": 0.746, + "mean_token_accuracy": 0.7754697265847252, + "step": 2600 + }, + { + "epoch": 0.9620721554116559, + "eval_loss": 0.784595787525177, + "eval_mean_token_accuracy": 0.7615490216632195, + "eval_runtime": 14.4871, + "eval_samples_per_second": 17.74, + "eval_steps_per_second": 2.278, + "step": 2600 + }, + { + "epoch": 0.9639222941720629, + "grad_norm": 1.066302103013376, + "learning_rate": 7.846473910821162e-08, + "loss": 0.8012, + "mean_token_accuracy": 0.7589824884508426, + "step": 2605 + }, + { + "epoch": 0.96577243293247, + "grad_norm": 1.0439775703442844, + "learning_rate": 7.059335984588634e-08, + "loss": 0.7714, + "mean_token_accuracy": 0.7685618026808643, + "step": 2610 + }, + { + "epoch": 0.967622571692877, + "grad_norm": 0.9979568716403987, + "learning_rate": 6.313654461800322e-08, + "loss": 0.7691, + "mean_token_accuracy": 0.7692392163861215, + "step": 2615 + }, + { + "epoch": 0.969472710453284, + "grad_norm": 1.0462609907918055, + "learning_rate": 5.609460475509032e-08, + "loss": 0.8118, + "mean_token_accuracy": 0.7550502699957827, + "step": 2620 + }, + { + "epoch": 0.971322849213691, + "grad_norm": 0.9468653120899511, + "learning_rate": 4.9467834266154756e-08, + "loss": 0.7415, + "mean_token_accuracy": 0.7774647236663202, + "step": 2625 + }, + { + "epoch": 0.973172987974098, + "grad_norm": 0.9393352507405182, + "learning_rate": 4.325650982641039e-08, + "loss": 0.7775, + "mean_token_accuracy": 0.7653114104869969, + "step": 2630 + }, + { + "epoch": 0.9750231267345051, + "grad_norm": 0.9759398836029194, + "learning_rate": 3.746089076572701e-08, + "loss": 0.7693, + "mean_token_accuracy": 0.7675998725554745, + "step": 2635 + }, + { + "epoch": 0.9768732654949122, + "grad_norm": 1.017627330808071, + "learning_rate": 3.208121905779904e-08, + "loss": 0.7506, + "mean_token_accuracy": 0.7735333802043229, + "step": 2640 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 1.0255166411692331, + "learning_rate": 2.711771931004692e-08, + "loss": 0.786, + "mean_token_accuracy": 0.7631780300946223, + "step": 2645 + }, + { + "epoch": 0.9805735430157262, + "grad_norm": 1.0338569675309728, + "learning_rate": 2.257059875423795e-08, + "loss": 0.7517, + "mean_token_accuracy": 0.772640776817001, + "step": 2650 + }, + { + "epoch": 0.9824236817761333, + "grad_norm": 1.0308143426134002, + "learning_rate": 1.8440047237832105e-08, + "loss": 0.76, + "mean_token_accuracy": 0.7701472438917961, + "step": 2655 + }, + { + "epoch": 0.9842738205365402, + "grad_norm": 1.0058292615563849, + "learning_rate": 1.472623721606059e-08, + "loss": 0.7579, + "mean_token_accuracy": 0.7706978101988982, + "step": 2660 + }, + { + "epoch": 0.9861239592969473, + "grad_norm": 1.0090757117370701, + "learning_rate": 1.1429323744720499e-08, + "loss": 0.7332, + "mean_token_accuracy": 0.7793816957245315, + "step": 2665 + }, + { + "epoch": 0.9879740980573543, + "grad_norm": 1.140399977966575, + "learning_rate": 8.549444473702207e-09, + "loss": 0.7798, + "mean_token_accuracy": 0.7659991946659387, + "step": 2670 + }, + { + "epoch": 0.9898242368177613, + "grad_norm": 1.0222090295721236, + "learning_rate": 6.086719641246186e-09, + "loss": 0.8039, + "mean_token_accuracy": 0.7583265951424814, + "step": 2675 + }, + { + "epoch": 0.9916743755781684, + "grad_norm": 1.115581827026461, + "learning_rate": 4.041252068918145e-09, + "loss": 0.7483, + "mean_token_accuracy": 0.7735649345949459, + "step": 2680 + }, + { + "epoch": 0.9935245143385754, + "grad_norm": 0.9356178934535633, + "learning_rate": 2.4131271573191172e-09, + "loss": 0.7597, + "mean_token_accuracy": 0.770809537902344, + "step": 2685 + }, + { + "epoch": 0.9953746530989824, + "grad_norm": 0.9999781646028101, + "learning_rate": 1.2024128825172121e-09, + "loss": 0.7681, + "mean_token_accuracy": 0.7680575958454982, + "step": 2690 + }, + { + "epoch": 0.9972247918593895, + "grad_norm": 1.11640561965384, + "learning_rate": 4.0915979321320967e-10, + "loss": 0.7955, + "mean_token_accuracy": 0.7621472228807514, + "step": 2695 + }, + { + "epoch": 0.9990749306197965, + "grad_norm": 1.0569074447134446, + "learning_rate": 3.3401008625588706e-11, + "loss": 0.7555, + "mean_token_accuracy": 0.7726517307618869, + "step": 2700 + }, + { + "epoch": 0.9990749306197965, + "eval_loss": 0.7846623659133911, + "eval_mean_token_accuracy": 0.7613747534964659, + "eval_runtime": 14.4979, + "eval_samples_per_second": 17.727, + "eval_steps_per_second": 2.276, + "step": 2700 + }, + { + "epoch": 0.9998149861239592, + "mean_token_accuracy": 0.7608276848992214, + "step": 2702, + "total_flos": 76966677970944.0, + "train_loss": 0.8051455439592978, + "train_runtime": 7868.3356, + "train_samples_per_second": 5.495, + "train_steps_per_second": 0.343 + } + ], + "logging_steps": 5, + "max_steps": 2702, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 76966677970944.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}