{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 7644, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013082155939298796, "grad_norm": 7.290712591801908, "learning_rate": 4.9999999999999996e-05, "loss": 5.7851, "step": 10 }, { "epoch": 0.0026164311878597592, "grad_norm": 3.67299584724583, "learning_rate": 6.505149978319905e-05, "loss": 3.1259, "step": 20 }, { "epoch": 0.003924646781789639, "grad_norm": 2.4764513799723513, "learning_rate": 7.385606273598311e-05, "loss": 1.6786, "step": 30 }, { "epoch": 0.0052328623757195184, "grad_norm": 1.7953600265789333, "learning_rate": 8.01029995663981e-05, "loss": 1.2211, "step": 40 }, { "epoch": 0.006541077969649398, "grad_norm": 1.8771167917018612, "learning_rate": 8.494850021680092e-05, "loss": 1.2585, "step": 50 }, { "epoch": 0.007849293563579277, "grad_norm": 1.2473543190033471, "learning_rate": 8.890756251918216e-05, "loss": 0.9283, "step": 60 }, { "epoch": 0.009157509157509158, "grad_norm": 2.618558273245568, "learning_rate": 9.225490200071284e-05, "loss": 1.1733, "step": 70 }, { "epoch": 0.010465724751439037, "grad_norm": 1.2184582707007738, "learning_rate": 9.515449934959716e-05, "loss": 0.9113, "step": 80 }, { "epoch": 0.011773940345368918, "grad_norm": 1.1266167128317874, "learning_rate": 9.771212547196623e-05, "loss": 1.0926, "step": 90 }, { "epoch": 0.013082155939298797, "grad_norm": 1.2759202461423789, "learning_rate": 9.999999999999999e-05, "loss": 0.9088, "step": 100 }, { "epoch": 0.014390371533228676, "grad_norm": 1.1320718200823117, "learning_rate": 9.988069989395547e-05, "loss": 1.0652, "step": 110 }, { "epoch": 0.015698587127158554, "grad_norm": 1.253166703044972, "learning_rate": 9.974814422057265e-05, "loss": 0.8037, "step": 120 }, { "epoch": 0.017006802721088437, "grad_norm": 0.8736521465598114, "learning_rate": 9.961558854718983e-05, "loss": 1.0371, "step": 130 }, { "epoch": 0.018315018315018316, "grad_norm": 0.9331897722352183, "learning_rate": 9.9483032873807e-05, "loss": 0.8399, "step": 140 }, { "epoch": 0.019623233908948195, "grad_norm": 1.16561958480625, "learning_rate": 9.935047720042418e-05, "loss": 1.0586, "step": 150 }, { "epoch": 0.020931449502878074, "grad_norm": 0.8921867195796972, "learning_rate": 9.921792152704136e-05, "loss": 0.7718, "step": 160 }, { "epoch": 0.022239665096807953, "grad_norm": 0.7140587209081476, "learning_rate": 9.908536585365854e-05, "loss": 1.009, "step": 170 }, { "epoch": 0.023547880690737835, "grad_norm": 0.8309946634200358, "learning_rate": 9.895281018027573e-05, "loss": 0.7839, "step": 180 }, { "epoch": 0.024856096284667714, "grad_norm": 0.7342060793171521, "learning_rate": 9.88202545068929e-05, "loss": 0.9935, "step": 190 }, { "epoch": 0.026164311878597593, "grad_norm": 0.9847538624284081, "learning_rate": 9.868769883351007e-05, "loss": 0.7705, "step": 200 }, { "epoch": 0.027472527472527472, "grad_norm": 0.8944456756402039, "learning_rate": 9.855514316012726e-05, "loss": 1.0197, "step": 210 }, { "epoch": 0.02878074306645735, "grad_norm": 0.7463212583397272, "learning_rate": 9.842258748674443e-05, "loss": 0.7562, "step": 220 }, { "epoch": 0.030088958660387233, "grad_norm": 0.7842188750961813, "learning_rate": 9.829003181336162e-05, "loss": 0.9935, "step": 230 }, { "epoch": 0.03139717425431711, "grad_norm": 0.8675590356137016, "learning_rate": 9.815747613997879e-05, "loss": 0.7518, "step": 240 }, { "epoch": 0.03270538984824699, "grad_norm": 0.5923018779242124, "learning_rate": 9.802492046659596e-05, "loss": 0.9582, "step": 250 }, { "epoch": 0.034013605442176874, "grad_norm": 0.6928464778679819, "learning_rate": 9.789236479321315e-05, "loss": 0.747, "step": 260 }, { "epoch": 0.03532182103610675, "grad_norm": 0.7113730083361403, "learning_rate": 9.775980911983034e-05, "loss": 0.9576, "step": 270 }, { "epoch": 0.03663003663003663, "grad_norm": 0.8679078003827115, "learning_rate": 9.762725344644751e-05, "loss": 0.7286, "step": 280 }, { "epoch": 0.03793825222396651, "grad_norm": 0.600907373361144, "learning_rate": 9.74946977730647e-05, "loss": 0.9778, "step": 290 }, { "epoch": 0.03924646781789639, "grad_norm": 0.8848902266623525, "learning_rate": 9.736214209968187e-05, "loss": 0.7405, "step": 300 }, { "epoch": 0.04055468341182627, "grad_norm": 0.9589691469495706, "learning_rate": 9.722958642629904e-05, "loss": 0.9579, "step": 310 }, { "epoch": 0.04186289900575615, "grad_norm": 0.7099733591624596, "learning_rate": 9.709703075291623e-05, "loss": 0.7529, "step": 320 }, { "epoch": 0.04317111459968603, "grad_norm": 0.6510293809764843, "learning_rate": 9.696447507953341e-05, "loss": 0.979, "step": 330 }, { "epoch": 0.044479330193615906, "grad_norm": 1.1641190214279045, "learning_rate": 9.683191940615059e-05, "loss": 0.7132, "step": 340 }, { "epoch": 0.045787545787545784, "grad_norm": 0.9673000795752291, "learning_rate": 9.669936373276777e-05, "loss": 0.9253, "step": 350 }, { "epoch": 0.04709576138147567, "grad_norm": 0.7190353586884389, "learning_rate": 9.656680805938494e-05, "loss": 0.726, "step": 360 }, { "epoch": 0.04840397697540555, "grad_norm": 0.7201601219088483, "learning_rate": 9.643425238600212e-05, "loss": 0.9286, "step": 370 }, { "epoch": 0.04971219256933543, "grad_norm": 0.884374342504821, "learning_rate": 9.63016967126193e-05, "loss": 0.7203, "step": 380 }, { "epoch": 0.05102040816326531, "grad_norm": 0.7049160941807705, "learning_rate": 9.616914103923649e-05, "loss": 0.9509, "step": 390 }, { "epoch": 0.052328623757195186, "grad_norm": 0.9056185856880267, "learning_rate": 9.603658536585366e-05, "loss": 0.7115, "step": 400 }, { "epoch": 0.053636839351125065, "grad_norm": 0.6053280737552994, "learning_rate": 9.590402969247085e-05, "loss": 0.949, "step": 410 }, { "epoch": 0.054945054945054944, "grad_norm": 0.669203867860999, "learning_rate": 9.577147401908802e-05, "loss": 0.7155, "step": 420 }, { "epoch": 0.05625327053898482, "grad_norm": 0.7049507997603442, "learning_rate": 9.56389183457052e-05, "loss": 0.9633, "step": 430 }, { "epoch": 0.0575614861329147, "grad_norm": 0.72413054526148, "learning_rate": 9.550636267232238e-05, "loss": 0.6704, "step": 440 }, { "epoch": 0.05886970172684458, "grad_norm": 0.9434997463945898, "learning_rate": 9.537380699893957e-05, "loss": 0.9329, "step": 450 }, { "epoch": 0.06017791732077447, "grad_norm": 0.9425009249090048, "learning_rate": 9.524125132555674e-05, "loss": 0.6726, "step": 460 }, { "epoch": 0.061486132914704346, "grad_norm": 0.529629216676613, "learning_rate": 9.510869565217391e-05, "loss": 0.9172, "step": 470 }, { "epoch": 0.06279434850863422, "grad_norm": 0.694713736738518, "learning_rate": 9.49761399787911e-05, "loss": 0.7293, "step": 480 }, { "epoch": 0.0641025641025641, "grad_norm": 0.6230312130051202, "learning_rate": 9.484358430540827e-05, "loss": 0.903, "step": 490 }, { "epoch": 0.06541077969649398, "grad_norm": 0.6370052650163481, "learning_rate": 9.471102863202546e-05, "loss": 0.7062, "step": 500 }, { "epoch": 0.06671899529042387, "grad_norm": 0.7394610544442655, "learning_rate": 9.457847295864264e-05, "loss": 0.9301, "step": 510 }, { "epoch": 0.06802721088435375, "grad_norm": 0.5284256055090087, "learning_rate": 9.444591728525982e-05, "loss": 0.6504, "step": 520 }, { "epoch": 0.06933542647828363, "grad_norm": 0.8769802465059457, "learning_rate": 9.431336161187699e-05, "loss": 0.9392, "step": 530 }, { "epoch": 0.0706436420722135, "grad_norm": 0.7788506408432221, "learning_rate": 9.418080593849417e-05, "loss": 0.7188, "step": 540 }, { "epoch": 0.07195185766614338, "grad_norm": 0.6258384386257697, "learning_rate": 9.404825026511135e-05, "loss": 0.8991, "step": 550 }, { "epoch": 0.07326007326007326, "grad_norm": 0.771128895833298, "learning_rate": 9.391569459172853e-05, "loss": 0.6951, "step": 560 }, { "epoch": 0.07456828885400314, "grad_norm": 0.8070532606977017, "learning_rate": 9.378313891834572e-05, "loss": 0.9198, "step": 570 }, { "epoch": 0.07587650444793302, "grad_norm": 0.7040977734805667, "learning_rate": 9.365058324496289e-05, "loss": 0.7285, "step": 580 }, { "epoch": 0.0771847200418629, "grad_norm": 0.5877418285296147, "learning_rate": 9.351802757158006e-05, "loss": 0.8823, "step": 590 }, { "epoch": 0.07849293563579278, "grad_norm": 0.43162998979605355, "learning_rate": 9.338547189819725e-05, "loss": 0.6631, "step": 600 }, { "epoch": 0.07980115122972266, "grad_norm": 0.5655884401483549, "learning_rate": 9.325291622481442e-05, "loss": 0.9211, "step": 610 }, { "epoch": 0.08110936682365254, "grad_norm": 0.6606829717694761, "learning_rate": 9.312036055143161e-05, "loss": 0.6835, "step": 620 }, { "epoch": 0.08241758241758242, "grad_norm": 0.4866281049362381, "learning_rate": 9.29878048780488e-05, "loss": 0.8975, "step": 630 }, { "epoch": 0.0837257980115123, "grad_norm": 1.1102292165370264, "learning_rate": 9.285524920466597e-05, "loss": 0.6914, "step": 640 }, { "epoch": 0.08503401360544217, "grad_norm": 0.5006402275523141, "learning_rate": 9.272269353128314e-05, "loss": 0.9399, "step": 650 }, { "epoch": 0.08634222919937205, "grad_norm": 0.7796348119914489, "learning_rate": 9.259013785790033e-05, "loss": 0.7019, "step": 660 }, { "epoch": 0.08765044479330193, "grad_norm": 0.7168780685110666, "learning_rate": 9.24575821845175e-05, "loss": 0.8866, "step": 670 }, { "epoch": 0.08895866038723181, "grad_norm": 0.8634136089366375, "learning_rate": 9.232502651113469e-05, "loss": 0.6761, "step": 680 }, { "epoch": 0.09026687598116169, "grad_norm": 0.596915980244832, "learning_rate": 9.219247083775187e-05, "loss": 0.9005, "step": 690 }, { "epoch": 0.09157509157509157, "grad_norm": 0.6529475469297664, "learning_rate": 9.205991516436903e-05, "loss": 0.6601, "step": 700 }, { "epoch": 0.09288330716902145, "grad_norm": 0.5753441026258548, "learning_rate": 9.192735949098622e-05, "loss": 0.9205, "step": 710 }, { "epoch": 0.09419152276295134, "grad_norm": 0.7907931238031155, "learning_rate": 9.17948038176034e-05, "loss": 0.6816, "step": 720 }, { "epoch": 0.09549973835688122, "grad_norm": 0.5399094070955297, "learning_rate": 9.166224814422058e-05, "loss": 0.9374, "step": 730 }, { "epoch": 0.0968079539508111, "grad_norm": 0.7177139918496634, "learning_rate": 9.152969247083776e-05, "loss": 0.6665, "step": 740 }, { "epoch": 0.09811616954474098, "grad_norm": 0.5696746776689743, "learning_rate": 9.139713679745493e-05, "loss": 0.9129, "step": 750 }, { "epoch": 0.09942438513867086, "grad_norm": 0.638093758359057, "learning_rate": 9.126458112407211e-05, "loss": 0.6675, "step": 760 }, { "epoch": 0.10073260073260074, "grad_norm": 0.6020538420505003, "learning_rate": 9.11320254506893e-05, "loss": 0.8901, "step": 770 }, { "epoch": 0.10204081632653061, "grad_norm": 0.6120466557768905, "learning_rate": 9.099946977730648e-05, "loss": 0.6796, "step": 780 }, { "epoch": 0.1033490319204605, "grad_norm": 0.6210967554289628, "learning_rate": 9.086691410392365e-05, "loss": 0.8869, "step": 790 }, { "epoch": 0.10465724751439037, "grad_norm": 0.8091885327796373, "learning_rate": 9.073435843054084e-05, "loss": 0.6625, "step": 800 }, { "epoch": 0.10596546310832025, "grad_norm": 0.5779837898387246, "learning_rate": 9.060180275715801e-05, "loss": 0.8876, "step": 810 }, { "epoch": 0.10727367870225013, "grad_norm": 0.7299991372030511, "learning_rate": 9.046924708377518e-05, "loss": 0.696, "step": 820 }, { "epoch": 0.10858189429618001, "grad_norm": 0.6620337610636755, "learning_rate": 9.033669141039237e-05, "loss": 0.9008, "step": 830 }, { "epoch": 0.10989010989010989, "grad_norm": 0.6010699007449223, "learning_rate": 9.020413573700954e-05, "loss": 0.6712, "step": 840 }, { "epoch": 0.11119832548403977, "grad_norm": 0.5427558713650651, "learning_rate": 9.007158006362673e-05, "loss": 0.8753, "step": 850 }, { "epoch": 0.11250654107796965, "grad_norm": 0.6742371579613256, "learning_rate": 8.993902439024391e-05, "loss": 0.6737, "step": 860 }, { "epoch": 0.11381475667189953, "grad_norm": 0.7058859567453811, "learning_rate": 8.980646871686109e-05, "loss": 0.8956, "step": 870 }, { "epoch": 0.1151229722658294, "grad_norm": 0.7684505962300139, "learning_rate": 8.967391304347826e-05, "loss": 0.6851, "step": 880 }, { "epoch": 0.11643118785975928, "grad_norm": 0.5221308693647347, "learning_rate": 8.954135737009545e-05, "loss": 0.8774, "step": 890 }, { "epoch": 0.11773940345368916, "grad_norm": 0.6747737190536728, "learning_rate": 8.940880169671262e-05, "loss": 0.6816, "step": 900 }, { "epoch": 0.11904761904761904, "grad_norm": 0.46024093702145724, "learning_rate": 8.92762460233298e-05, "loss": 0.8785, "step": 910 }, { "epoch": 0.12035583464154893, "grad_norm": 0.5572757908727249, "learning_rate": 8.914369034994699e-05, "loss": 0.6749, "step": 920 }, { "epoch": 0.12166405023547881, "grad_norm": 0.5755409211612259, "learning_rate": 8.901113467656415e-05, "loss": 0.8996, "step": 930 }, { "epoch": 0.12297226582940869, "grad_norm": 0.44743009853908355, "learning_rate": 8.887857900318134e-05, "loss": 0.6812, "step": 940 }, { "epoch": 0.12428048142333857, "grad_norm": 0.5356391814605695, "learning_rate": 8.874602332979852e-05, "loss": 0.8962, "step": 950 }, { "epoch": 0.12558869701726844, "grad_norm": 0.9142027438043182, "learning_rate": 8.86134676564157e-05, "loss": 0.6923, "step": 960 }, { "epoch": 0.12689691261119831, "grad_norm": 0.5787045277844781, "learning_rate": 8.848091198303288e-05, "loss": 0.8958, "step": 970 }, { "epoch": 0.1282051282051282, "grad_norm": 0.7928771640327954, "learning_rate": 8.834835630965005e-05, "loss": 0.6926, "step": 980 }, { "epoch": 0.12951334379905807, "grad_norm": 0.6614005586526391, "learning_rate": 8.821580063626723e-05, "loss": 0.8844, "step": 990 }, { "epoch": 0.13082155939298795, "grad_norm": 0.49917342117601304, "learning_rate": 8.808324496288441e-05, "loss": 0.6839, "step": 1000 }, { "epoch": 0.13212977498691783, "grad_norm": 0.6870942412562521, "learning_rate": 8.79506892895016e-05, "loss": 0.937, "step": 1010 }, { "epoch": 0.13343799058084774, "grad_norm": 0.48623360323222364, "learning_rate": 8.781813361611877e-05, "loss": 0.6424, "step": 1020 }, { "epoch": 0.13474620617477762, "grad_norm": 0.522869801801631, "learning_rate": 8.768557794273596e-05, "loss": 0.9139, "step": 1030 }, { "epoch": 0.1360544217687075, "grad_norm": 0.8763988251800717, "learning_rate": 8.755302226935313e-05, "loss": 0.6746, "step": 1040 }, { "epoch": 0.13736263736263737, "grad_norm": 0.7101195150803217, "learning_rate": 8.74204665959703e-05, "loss": 0.8867, "step": 1050 }, { "epoch": 0.13867085295656725, "grad_norm": 0.6262246747509773, "learning_rate": 8.728791092258749e-05, "loss": 0.6833, "step": 1060 }, { "epoch": 0.13997906855049713, "grad_norm": 0.5315808280206341, "learning_rate": 8.715535524920468e-05, "loss": 0.9028, "step": 1070 }, { "epoch": 0.141287284144427, "grad_norm": 0.5625387450672273, "learning_rate": 8.702279957582185e-05, "loss": 0.6623, "step": 1080 }, { "epoch": 0.1425954997383569, "grad_norm": 0.6106698449027703, "learning_rate": 8.689024390243903e-05, "loss": 0.9054, "step": 1090 }, { "epoch": 0.14390371533228677, "grad_norm": 0.785333814713217, "learning_rate": 8.67576882290562e-05, "loss": 0.6621, "step": 1100 }, { "epoch": 0.14521193092621665, "grad_norm": 0.5341457759006656, "learning_rate": 8.662513255567338e-05, "loss": 0.8977, "step": 1110 }, { "epoch": 0.14652014652014653, "grad_norm": 0.4836909763150667, "learning_rate": 8.649257688229057e-05, "loss": 0.643, "step": 1120 }, { "epoch": 0.1478283621140764, "grad_norm": 0.6214661415425415, "learning_rate": 8.636002120890775e-05, "loss": 0.8619, "step": 1130 }, { "epoch": 0.14913657770800628, "grad_norm": 0.44754299062781455, "learning_rate": 8.622746553552492e-05, "loss": 0.6545, "step": 1140 }, { "epoch": 0.15044479330193616, "grad_norm": 0.654698416147961, "learning_rate": 8.609490986214211e-05, "loss": 0.8687, "step": 1150 }, { "epoch": 0.15175300889586604, "grad_norm": 0.7796186456327326, "learning_rate": 8.596235418875928e-05, "loss": 0.6516, "step": 1160 }, { "epoch": 0.15306122448979592, "grad_norm": 0.5619622188020412, "learning_rate": 8.582979851537646e-05, "loss": 0.9019, "step": 1170 }, { "epoch": 0.1543694400837258, "grad_norm": 0.589108674850363, "learning_rate": 8.569724284199364e-05, "loss": 0.6514, "step": 1180 }, { "epoch": 0.15567765567765568, "grad_norm": 0.9533885730603633, "learning_rate": 8.556468716861083e-05, "loss": 0.8537, "step": 1190 }, { "epoch": 0.15698587127158556, "grad_norm": 0.5479885592424896, "learning_rate": 8.5432131495228e-05, "loss": 0.6615, "step": 1200 }, { "epoch": 0.15829408686551544, "grad_norm": 0.6153890373628342, "learning_rate": 8.529957582184517e-05, "loss": 0.8631, "step": 1210 }, { "epoch": 0.15960230245944532, "grad_norm": 0.7917177530306803, "learning_rate": 8.516702014846236e-05, "loss": 0.6616, "step": 1220 }, { "epoch": 0.1609105180533752, "grad_norm": 0.656469547745639, "learning_rate": 8.503446447507953e-05, "loss": 0.8929, "step": 1230 }, { "epoch": 0.16221873364730507, "grad_norm": 0.5569826880804676, "learning_rate": 8.490190880169672e-05, "loss": 0.6477, "step": 1240 }, { "epoch": 0.16352694924123495, "grad_norm": 0.6145021073052068, "learning_rate": 8.47693531283139e-05, "loss": 0.8835, "step": 1250 }, { "epoch": 0.16483516483516483, "grad_norm": 0.6285934467054461, "learning_rate": 8.463679745493108e-05, "loss": 0.6549, "step": 1260 }, { "epoch": 0.1661433804290947, "grad_norm": 0.4969597278981036, "learning_rate": 8.450424178154825e-05, "loss": 0.8846, "step": 1270 }, { "epoch": 0.1674515960230246, "grad_norm": 0.6062848536416026, "learning_rate": 8.437168610816544e-05, "loss": 0.6269, "step": 1280 }, { "epoch": 0.16875981161695447, "grad_norm": 0.5265730484032111, "learning_rate": 8.423913043478261e-05, "loss": 0.8592, "step": 1290 }, { "epoch": 0.17006802721088435, "grad_norm": 0.5811110234076874, "learning_rate": 8.41065747613998e-05, "loss": 0.6544, "step": 1300 }, { "epoch": 0.17137624280481423, "grad_norm": 0.5639694934132476, "learning_rate": 8.397401908801698e-05, "loss": 0.8732, "step": 1310 }, { "epoch": 0.1726844583987441, "grad_norm": 0.7531411828367692, "learning_rate": 8.384146341463415e-05, "loss": 0.6683, "step": 1320 }, { "epoch": 0.17399267399267399, "grad_norm": 0.5146605068810605, "learning_rate": 8.370890774125133e-05, "loss": 0.8911, "step": 1330 }, { "epoch": 0.17530088958660386, "grad_norm": 0.5881044587524927, "learning_rate": 8.357635206786851e-05, "loss": 0.6957, "step": 1340 }, { "epoch": 0.17660910518053374, "grad_norm": 0.6108606112713066, "learning_rate": 8.344379639448568e-05, "loss": 0.864, "step": 1350 }, { "epoch": 0.17791732077446362, "grad_norm": 0.6838348363870184, "learning_rate": 8.331124072110287e-05, "loss": 0.6382, "step": 1360 }, { "epoch": 0.1792255363683935, "grad_norm": 0.5844899134885503, "learning_rate": 8.317868504772006e-05, "loss": 0.895, "step": 1370 }, { "epoch": 0.18053375196232338, "grad_norm": 0.40337593852276243, "learning_rate": 8.304612937433723e-05, "loss": 0.649, "step": 1380 }, { "epoch": 0.18184196755625326, "grad_norm": 0.49730732309328707, "learning_rate": 8.29135737009544e-05, "loss": 0.8442, "step": 1390 }, { "epoch": 0.18315018315018314, "grad_norm": 0.5590991907664666, "learning_rate": 8.278101802757159e-05, "loss": 0.6445, "step": 1400 }, { "epoch": 0.18445839874411302, "grad_norm": 0.6588503605001691, "learning_rate": 8.264846235418876e-05, "loss": 0.864, "step": 1410 }, { "epoch": 0.1857666143380429, "grad_norm": 0.6197416228060506, "learning_rate": 8.251590668080595e-05, "loss": 0.6094, "step": 1420 }, { "epoch": 0.1870748299319728, "grad_norm": 0.5482300336211388, "learning_rate": 8.238335100742312e-05, "loss": 0.8619, "step": 1430 }, { "epoch": 0.18838304552590268, "grad_norm": 0.72709681776675, "learning_rate": 8.225079533404029e-05, "loss": 0.6456, "step": 1440 }, { "epoch": 0.18969126111983256, "grad_norm": 0.5600056919125233, "learning_rate": 8.211823966065748e-05, "loss": 0.86, "step": 1450 }, { "epoch": 0.19099947671376244, "grad_norm": 0.7289799556624317, "learning_rate": 8.198568398727466e-05, "loss": 0.6271, "step": 1460 }, { "epoch": 0.19230769230769232, "grad_norm": 0.4590489049870012, "learning_rate": 8.185312831389184e-05, "loss": 0.8897, "step": 1470 }, { "epoch": 0.1936159079016222, "grad_norm": 0.8036211881560831, "learning_rate": 8.172057264050902e-05, "loss": 0.6434, "step": 1480 }, { "epoch": 0.19492412349555208, "grad_norm": 0.49768694148703807, "learning_rate": 8.15880169671262e-05, "loss": 0.8417, "step": 1490 }, { "epoch": 0.19623233908948196, "grad_norm": 0.771940987579212, "learning_rate": 8.145546129374337e-05, "loss": 0.626, "step": 1500 }, { "epoch": 0.19754055468341183, "grad_norm": 0.5487861196155561, "learning_rate": 8.132290562036055e-05, "loss": 0.863, "step": 1510 }, { "epoch": 0.1988487702773417, "grad_norm": 0.5767745735777565, "learning_rate": 8.119034994697774e-05, "loss": 0.6327, "step": 1520 }, { "epoch": 0.2001569858712716, "grad_norm": 0.5740160328527427, "learning_rate": 8.105779427359491e-05, "loss": 0.8293, "step": 1530 }, { "epoch": 0.20146520146520147, "grad_norm": 0.6248485702307536, "learning_rate": 8.09252386002121e-05, "loss": 0.6525, "step": 1540 }, { "epoch": 0.20277341705913135, "grad_norm": 0.5959014412308178, "learning_rate": 8.079268292682927e-05, "loss": 0.8793, "step": 1550 }, { "epoch": 0.20408163265306123, "grad_norm": 0.5523669620436882, "learning_rate": 8.066012725344644e-05, "loss": 0.6549, "step": 1560 }, { "epoch": 0.2053898482469911, "grad_norm": 0.6456441196706465, "learning_rate": 8.052757158006363e-05, "loss": 0.8688, "step": 1570 }, { "epoch": 0.206698063840921, "grad_norm": 0.5659881168480183, "learning_rate": 8.039501590668082e-05, "loss": 0.6565, "step": 1580 }, { "epoch": 0.20800627943485087, "grad_norm": 0.5616175050073812, "learning_rate": 8.026246023329799e-05, "loss": 0.8418, "step": 1590 }, { "epoch": 0.20931449502878074, "grad_norm": 0.6028672819947086, "learning_rate": 8.012990455991518e-05, "loss": 0.6231, "step": 1600 }, { "epoch": 0.21062271062271062, "grad_norm": 0.546703449007772, "learning_rate": 7.999734888653235e-05, "loss": 0.8631, "step": 1610 }, { "epoch": 0.2119309262166405, "grad_norm": 0.3849996641772154, "learning_rate": 7.986479321314952e-05, "loss": 0.6246, "step": 1620 }, { "epoch": 0.21323914181057038, "grad_norm": 0.4435598672137561, "learning_rate": 7.973223753976671e-05, "loss": 0.886, "step": 1630 }, { "epoch": 0.21454735740450026, "grad_norm": 0.6111533721181235, "learning_rate": 7.95996818663839e-05, "loss": 0.6494, "step": 1640 }, { "epoch": 0.21585557299843014, "grad_norm": 0.5729934681943539, "learning_rate": 7.946712619300107e-05, "loss": 0.8618, "step": 1650 }, { "epoch": 0.21716378859236002, "grad_norm": 0.6355561878934224, "learning_rate": 7.933457051961824e-05, "loss": 0.6303, "step": 1660 }, { "epoch": 0.2184720041862899, "grad_norm": 0.4669924162265557, "learning_rate": 7.920201484623541e-05, "loss": 0.8654, "step": 1670 }, { "epoch": 0.21978021978021978, "grad_norm": 0.6815024166926259, "learning_rate": 7.90694591728526e-05, "loss": 0.6299, "step": 1680 }, { "epoch": 0.22108843537414966, "grad_norm": 0.5542650967967168, "learning_rate": 7.893690349946978e-05, "loss": 0.8661, "step": 1690 }, { "epoch": 0.22239665096807953, "grad_norm": 0.5596504228795459, "learning_rate": 7.880434782608696e-05, "loss": 0.6462, "step": 1700 }, { "epoch": 0.2237048665620094, "grad_norm": 0.6616295642250447, "learning_rate": 7.867179215270414e-05, "loss": 0.8638, "step": 1710 }, { "epoch": 0.2250130821559393, "grad_norm": 0.42204578619386185, "learning_rate": 7.853923647932132e-05, "loss": 0.6493, "step": 1720 }, { "epoch": 0.22632129774986917, "grad_norm": 0.6146957520525497, "learning_rate": 7.840668080593849e-05, "loss": 0.8621, "step": 1730 }, { "epoch": 0.22762951334379905, "grad_norm": 0.595798638618577, "learning_rate": 7.827412513255567e-05, "loss": 0.623, "step": 1740 }, { "epoch": 0.22893772893772893, "grad_norm": 0.624548132547215, "learning_rate": 7.814156945917286e-05, "loss": 0.8582, "step": 1750 }, { "epoch": 0.2302459445316588, "grad_norm": 0.6305314992926004, "learning_rate": 7.800901378579003e-05, "loss": 0.6336, "step": 1760 }, { "epoch": 0.2315541601255887, "grad_norm": 0.7759096340841495, "learning_rate": 7.787645811240722e-05, "loss": 0.8387, "step": 1770 }, { "epoch": 0.23286237571951857, "grad_norm": 0.5604953018516682, "learning_rate": 7.774390243902439e-05, "loss": 0.6283, "step": 1780 }, { "epoch": 0.23417059131344845, "grad_norm": 0.7099271936900229, "learning_rate": 7.761134676564156e-05, "loss": 0.8628, "step": 1790 }, { "epoch": 0.23547880690737832, "grad_norm": 0.474479805099593, "learning_rate": 7.747879109225875e-05, "loss": 0.6259, "step": 1800 }, { "epoch": 0.2367870225013082, "grad_norm": 0.5416834886960876, "learning_rate": 7.734623541887594e-05, "loss": 0.8567, "step": 1810 }, { "epoch": 0.23809523809523808, "grad_norm": 0.5379729696776889, "learning_rate": 7.721367974549311e-05, "loss": 0.6096, "step": 1820 }, { "epoch": 0.239403453689168, "grad_norm": 0.6138823092852339, "learning_rate": 7.70811240721103e-05, "loss": 0.8454, "step": 1830 }, { "epoch": 0.24071166928309787, "grad_norm": 0.7163967325753281, "learning_rate": 7.694856839872747e-05, "loss": 0.6751, "step": 1840 }, { "epoch": 0.24201988487702775, "grad_norm": 0.7446327434350999, "learning_rate": 7.681601272534464e-05, "loss": 0.8933, "step": 1850 }, { "epoch": 0.24332810047095763, "grad_norm": 0.674203813956785, "learning_rate": 7.668345705196183e-05, "loss": 0.6436, "step": 1860 }, { "epoch": 0.2446363160648875, "grad_norm": 0.5298303453138061, "learning_rate": 7.655090137857901e-05, "loss": 0.8383, "step": 1870 }, { "epoch": 0.24594453165881738, "grad_norm": 0.46309042220240854, "learning_rate": 7.641834570519619e-05, "loss": 0.624, "step": 1880 }, { "epoch": 0.24725274725274726, "grad_norm": 0.527675840331917, "learning_rate": 7.628579003181336e-05, "loss": 0.8423, "step": 1890 }, { "epoch": 0.24856096284667714, "grad_norm": 0.6656931192324065, "learning_rate": 7.615323435843054e-05, "loss": 0.66, "step": 1900 }, { "epoch": 0.24986917844060702, "grad_norm": 0.5865447145073008, "learning_rate": 7.602067868504772e-05, "loss": 0.8659, "step": 1910 }, { "epoch": 0.25117739403453687, "grad_norm": 0.5349083325210562, "learning_rate": 7.58881230116649e-05, "loss": 0.6439, "step": 1920 }, { "epoch": 0.2524856096284668, "grad_norm": 0.5557018540060792, "learning_rate": 7.575556733828209e-05, "loss": 0.846, "step": 1930 }, { "epoch": 0.25379382522239663, "grad_norm": 0.5858709719889754, "learning_rate": 7.562301166489926e-05, "loss": 0.6473, "step": 1940 }, { "epoch": 0.25510204081632654, "grad_norm": 0.5461126801060399, "learning_rate": 7.549045599151643e-05, "loss": 0.874, "step": 1950 }, { "epoch": 0.2564102564102564, "grad_norm": 0.6905067688723441, "learning_rate": 7.535790031813362e-05, "loss": 0.6091, "step": 1960 }, { "epoch": 0.2577184720041863, "grad_norm": 0.7656408539875517, "learning_rate": 7.522534464475079e-05, "loss": 0.8679, "step": 1970 }, { "epoch": 0.25902668759811615, "grad_norm": 0.7233211006267372, "learning_rate": 7.509278897136798e-05, "loss": 0.6749, "step": 1980 }, { "epoch": 0.26033490319204605, "grad_norm": 0.5989700856780242, "learning_rate": 7.496023329798517e-05, "loss": 0.833, "step": 1990 }, { "epoch": 0.2616431187859759, "grad_norm": 0.5435529702312377, "learning_rate": 7.482767762460234e-05, "loss": 0.658, "step": 2000 }, { "epoch": 0.2629513343799058, "grad_norm": 0.5335997393071716, "learning_rate": 7.469512195121951e-05, "loss": 0.8399, "step": 2010 }, { "epoch": 0.26425954997383566, "grad_norm": 0.9150436835320093, "learning_rate": 7.45625662778367e-05, "loss": 0.6114, "step": 2020 }, { "epoch": 0.26556776556776557, "grad_norm": 0.5384709854955332, "learning_rate": 7.443001060445387e-05, "loss": 0.8413, "step": 2030 }, { "epoch": 0.2668759811616955, "grad_norm": 0.8677435387982771, "learning_rate": 7.429745493107106e-05, "loss": 0.6475, "step": 2040 }, { "epoch": 0.2681841967556253, "grad_norm": 0.5123690892694776, "learning_rate": 7.416489925768824e-05, "loss": 0.849, "step": 2050 }, { "epoch": 0.26949241234955523, "grad_norm": 0.5800543468099533, "learning_rate": 7.403234358430541e-05, "loss": 0.6127, "step": 2060 }, { "epoch": 0.2708006279434851, "grad_norm": 0.8015793490826957, "learning_rate": 7.389978791092259e-05, "loss": 0.8682, "step": 2070 }, { "epoch": 0.272108843537415, "grad_norm": 0.5876567022243202, "learning_rate": 7.376723223753977e-05, "loss": 0.6164, "step": 2080 }, { "epoch": 0.27341705913134484, "grad_norm": 0.5542805867826196, "learning_rate": 7.363467656415695e-05, "loss": 0.8808, "step": 2090 }, { "epoch": 0.27472527472527475, "grad_norm": 0.5170888187943017, "learning_rate": 7.350212089077413e-05, "loss": 0.6375, "step": 2100 }, { "epoch": 0.2760334903192046, "grad_norm": 0.622686247940597, "learning_rate": 7.336956521739132e-05, "loss": 0.8605, "step": 2110 }, { "epoch": 0.2773417059131345, "grad_norm": 0.5587125235931543, "learning_rate": 7.323700954400848e-05, "loss": 0.6237, "step": 2120 }, { "epoch": 0.27864992150706436, "grad_norm": 0.6015065825515082, "learning_rate": 7.310445387062566e-05, "loss": 0.8276, "step": 2130 }, { "epoch": 0.27995813710099426, "grad_norm": 0.5122297183116, "learning_rate": 7.297189819724285e-05, "loss": 0.6289, "step": 2140 }, { "epoch": 0.2812663526949241, "grad_norm": 0.5663980177757836, "learning_rate": 7.283934252386002e-05, "loss": 0.861, "step": 2150 }, { "epoch": 0.282574568288854, "grad_norm": 0.7939853395114802, "learning_rate": 7.270678685047721e-05, "loss": 0.6551, "step": 2160 }, { "epoch": 0.2838827838827839, "grad_norm": 0.5287178325117134, "learning_rate": 7.257423117709438e-05, "loss": 0.8747, "step": 2170 }, { "epoch": 0.2851909994767138, "grad_norm": 0.562616836311441, "learning_rate": 7.244167550371155e-05, "loss": 0.6357, "step": 2180 }, { "epoch": 0.28649921507064363, "grad_norm": 0.5117823698698972, "learning_rate": 7.230911983032874e-05, "loss": 0.8377, "step": 2190 }, { "epoch": 0.28780743066457354, "grad_norm": 0.6453579912049506, "learning_rate": 7.217656415694593e-05, "loss": 0.6476, "step": 2200 }, { "epoch": 0.2891156462585034, "grad_norm": 0.8731364069825441, "learning_rate": 7.20440084835631e-05, "loss": 0.8773, "step": 2210 }, { "epoch": 0.2904238618524333, "grad_norm": 0.6010315625749808, "learning_rate": 7.191145281018028e-05, "loss": 0.6083, "step": 2220 }, { "epoch": 0.29173207744636315, "grad_norm": 0.5201002760771769, "learning_rate": 7.177889713679746e-05, "loss": 0.8496, "step": 2230 }, { "epoch": 0.29304029304029305, "grad_norm": 0.6001877343124563, "learning_rate": 7.164634146341463e-05, "loss": 0.6389, "step": 2240 }, { "epoch": 0.2943485086342229, "grad_norm": 0.4371089670254897, "learning_rate": 7.151378579003182e-05, "loss": 0.8706, "step": 2250 }, { "epoch": 0.2956567242281528, "grad_norm": 0.5778087926720157, "learning_rate": 7.1381230116649e-05, "loss": 0.5939, "step": 2260 }, { "epoch": 0.29696493982208266, "grad_norm": 0.5241829955540505, "learning_rate": 7.124867444326617e-05, "loss": 0.8461, "step": 2270 }, { "epoch": 0.29827315541601257, "grad_norm": 0.5396461429281756, "learning_rate": 7.111611876988336e-05, "loss": 0.645, "step": 2280 }, { "epoch": 0.2995813710099424, "grad_norm": 0.7287884001398448, "learning_rate": 7.098356309650053e-05, "loss": 0.8349, "step": 2290 }, { "epoch": 0.3008895866038723, "grad_norm": 0.43985568295051974, "learning_rate": 7.08510074231177e-05, "loss": 0.6023, "step": 2300 }, { "epoch": 0.3021978021978022, "grad_norm": 0.512789679014569, "learning_rate": 7.071845174973489e-05, "loss": 0.8592, "step": 2310 }, { "epoch": 0.3035060177917321, "grad_norm": 0.5758747078766807, "learning_rate": 7.058589607635208e-05, "loss": 0.6302, "step": 2320 }, { "epoch": 0.30481423338566194, "grad_norm": 0.605959590741427, "learning_rate": 7.045334040296925e-05, "loss": 0.8437, "step": 2330 }, { "epoch": 0.30612244897959184, "grad_norm": 0.5064367889657759, "learning_rate": 7.032078472958644e-05, "loss": 0.6228, "step": 2340 }, { "epoch": 0.3074306645735217, "grad_norm": 0.4968490853247213, "learning_rate": 7.018822905620361e-05, "loss": 0.8556, "step": 2350 }, { "epoch": 0.3087388801674516, "grad_norm": 0.5772837050103059, "learning_rate": 7.005567338282078e-05, "loss": 0.6234, "step": 2360 }, { "epoch": 0.31004709576138145, "grad_norm": 0.6941306253378574, "learning_rate": 6.992311770943797e-05, "loss": 0.8362, "step": 2370 }, { "epoch": 0.31135531135531136, "grad_norm": 0.39822917719900375, "learning_rate": 6.979056203605516e-05, "loss": 0.6162, "step": 2380 }, { "epoch": 0.3126635269492412, "grad_norm": 0.4661807662020318, "learning_rate": 6.965800636267233e-05, "loss": 0.8414, "step": 2390 }, { "epoch": 0.3139717425431711, "grad_norm": 0.45291786679307866, "learning_rate": 6.95254506892895e-05, "loss": 0.6433, "step": 2400 }, { "epoch": 0.31527995813710097, "grad_norm": 0.6300442868544786, "learning_rate": 6.939289501590669e-05, "loss": 0.8564, "step": 2410 }, { "epoch": 0.3165881737310309, "grad_norm": 0.5144882452050621, "learning_rate": 6.926033934252386e-05, "loss": 0.6018, "step": 2420 }, { "epoch": 0.3178963893249607, "grad_norm": 0.5886230291498049, "learning_rate": 6.912778366914105e-05, "loss": 0.8861, "step": 2430 }, { "epoch": 0.31920460491889063, "grad_norm": 0.6836197118690605, "learning_rate": 6.899522799575823e-05, "loss": 0.6267, "step": 2440 }, { "epoch": 0.32051282051282054, "grad_norm": 0.4951991615110591, "learning_rate": 6.88626723223754e-05, "loss": 0.8387, "step": 2450 }, { "epoch": 0.3218210361067504, "grad_norm": 0.6412063103894238, "learning_rate": 6.873011664899258e-05, "loss": 0.5927, "step": 2460 }, { "epoch": 0.3231292517006803, "grad_norm": 0.5782953097169173, "learning_rate": 6.859756097560976e-05, "loss": 0.8418, "step": 2470 }, { "epoch": 0.32443746729461015, "grad_norm": 0.5200166820109174, "learning_rate": 6.846500530222694e-05, "loss": 0.6324, "step": 2480 }, { "epoch": 0.32574568288854006, "grad_norm": 0.4922169436686345, "learning_rate": 6.833244962884412e-05, "loss": 0.8298, "step": 2490 }, { "epoch": 0.3270538984824699, "grad_norm": 0.4912804994081646, "learning_rate": 6.819989395546131e-05, "loss": 0.6123, "step": 2500 }, { "epoch": 0.3283621140763998, "grad_norm": 0.5502723124138919, "learning_rate": 6.806733828207848e-05, "loss": 0.8522, "step": 2510 }, { "epoch": 0.32967032967032966, "grad_norm": 0.5875399056209475, "learning_rate": 6.793478260869565e-05, "loss": 0.6144, "step": 2520 }, { "epoch": 0.33097854526425957, "grad_norm": 0.5609437523704139, "learning_rate": 6.780222693531283e-05, "loss": 0.8347, "step": 2530 }, { "epoch": 0.3322867608581894, "grad_norm": 0.338386078229902, "learning_rate": 6.766967126193001e-05, "loss": 0.587, "step": 2540 }, { "epoch": 0.33359497645211933, "grad_norm": 0.48756512910329014, "learning_rate": 6.75371155885472e-05, "loss": 0.8327, "step": 2550 }, { "epoch": 0.3349031920460492, "grad_norm": 0.5143776858886028, "learning_rate": 6.740455991516437e-05, "loss": 0.6092, "step": 2560 }, { "epoch": 0.3362114076399791, "grad_norm": 0.5332261671554006, "learning_rate": 6.727200424178154e-05, "loss": 0.8486, "step": 2570 }, { "epoch": 0.33751962323390894, "grad_norm": 0.6594969470024875, "learning_rate": 6.713944856839873e-05, "loss": 0.6256, "step": 2580 }, { "epoch": 0.33882783882783885, "grad_norm": 0.4553927815832327, "learning_rate": 6.70068928950159e-05, "loss": 0.8357, "step": 2590 }, { "epoch": 0.3401360544217687, "grad_norm": 1.1222853735995233, "learning_rate": 6.687433722163309e-05, "loss": 0.6007, "step": 2600 }, { "epoch": 0.3414442700156986, "grad_norm": 0.528439009952554, "learning_rate": 6.674178154825027e-05, "loss": 0.856, "step": 2610 }, { "epoch": 0.34275248560962845, "grad_norm": 0.5933307919214319, "learning_rate": 6.660922587486745e-05, "loss": 0.6421, "step": 2620 }, { "epoch": 0.34406070120355836, "grad_norm": 0.6607714283195972, "learning_rate": 6.647667020148462e-05, "loss": 0.8567, "step": 2630 }, { "epoch": 0.3453689167974882, "grad_norm": 0.7007503714365677, "learning_rate": 6.63441145281018e-05, "loss": 0.6207, "step": 2640 }, { "epoch": 0.3466771323914181, "grad_norm": 0.5135207638483642, "learning_rate": 6.621155885471898e-05, "loss": 0.8288, "step": 2650 }, { "epoch": 0.34798534798534797, "grad_norm": 0.6140077846492299, "learning_rate": 6.607900318133616e-05, "loss": 0.6015, "step": 2660 }, { "epoch": 0.3492935635792779, "grad_norm": 0.5100090518445459, "learning_rate": 6.594644750795335e-05, "loss": 0.8348, "step": 2670 }, { "epoch": 0.35060177917320773, "grad_norm": 0.5113632712765585, "learning_rate": 6.581389183457052e-05, "loss": 0.5924, "step": 2680 }, { "epoch": 0.35190999476713763, "grad_norm": 0.5039720207488532, "learning_rate": 6.56813361611877e-05, "loss": 0.8184, "step": 2690 }, { "epoch": 0.3532182103610675, "grad_norm": 0.49806965997978137, "learning_rate": 6.554878048780488e-05, "loss": 0.6046, "step": 2700 }, { "epoch": 0.3545264259549974, "grad_norm": 0.5365120734516775, "learning_rate": 6.541622481442205e-05, "loss": 0.8514, "step": 2710 }, { "epoch": 0.35583464154892724, "grad_norm": 0.4007231339986853, "learning_rate": 6.528366914103924e-05, "loss": 0.5871, "step": 2720 }, { "epoch": 0.35714285714285715, "grad_norm": 0.46254442147732966, "learning_rate": 6.515111346765643e-05, "loss": 0.8411, "step": 2730 }, { "epoch": 0.358451072736787, "grad_norm": 0.6576167821737804, "learning_rate": 6.50185577942736e-05, "loss": 0.6293, "step": 2740 }, { "epoch": 0.3597592883307169, "grad_norm": 0.5967952480358752, "learning_rate": 6.488600212089077e-05, "loss": 0.8511, "step": 2750 }, { "epoch": 0.36106750392464676, "grad_norm": 0.43508005533871247, "learning_rate": 6.475344644750796e-05, "loss": 0.621, "step": 2760 }, { "epoch": 0.36237571951857667, "grad_norm": 0.48108090717243124, "learning_rate": 6.462089077412513e-05, "loss": 0.8263, "step": 2770 }, { "epoch": 0.3636839351125065, "grad_norm": 0.568823321053749, "learning_rate": 6.448833510074232e-05, "loss": 0.5941, "step": 2780 }, { "epoch": 0.3649921507064364, "grad_norm": 0.5222642833529956, "learning_rate": 6.43557794273595e-05, "loss": 0.8525, "step": 2790 }, { "epoch": 0.3663003663003663, "grad_norm": 0.5919546377210694, "learning_rate": 6.422322375397666e-05, "loss": 0.5901, "step": 2800 }, { "epoch": 0.3676085818942962, "grad_norm": 0.46817017239205494, "learning_rate": 6.409066808059385e-05, "loss": 0.8678, "step": 2810 }, { "epoch": 0.36891679748822603, "grad_norm": 0.5530202729949584, "learning_rate": 6.395811240721103e-05, "loss": 0.6401, "step": 2820 }, { "epoch": 0.37022501308215594, "grad_norm": 0.6007081120859754, "learning_rate": 6.382555673382821e-05, "loss": 0.8291, "step": 2830 }, { "epoch": 0.3715332286760858, "grad_norm": 0.39683924232406637, "learning_rate": 6.36930010604454e-05, "loss": 0.6127, "step": 2840 }, { "epoch": 0.3728414442700157, "grad_norm": 0.5452479168330785, "learning_rate": 6.356044538706257e-05, "loss": 0.8495, "step": 2850 }, { "epoch": 0.3741496598639456, "grad_norm": 0.8980580036344767, "learning_rate": 6.342788971367974e-05, "loss": 0.6471, "step": 2860 }, { "epoch": 0.37545787545787546, "grad_norm": 0.48885709974761143, "learning_rate": 6.329533404029692e-05, "loss": 0.8626, "step": 2870 }, { "epoch": 0.37676609105180536, "grad_norm": 0.5684144901718525, "learning_rate": 6.316277836691411e-05, "loss": 0.5869, "step": 2880 }, { "epoch": 0.3780743066457352, "grad_norm": 0.5836928349425793, "learning_rate": 6.303022269353128e-05, "loss": 0.8205, "step": 2890 }, { "epoch": 0.3793825222396651, "grad_norm": 0.5698475129755676, "learning_rate": 6.289766702014847e-05, "loss": 0.5985, "step": 2900 }, { "epoch": 0.38069073783359497, "grad_norm": 0.5257222435975014, "learning_rate": 6.276511134676564e-05, "loss": 0.8572, "step": 2910 }, { "epoch": 0.3819989534275249, "grad_norm": 0.524065233271633, "learning_rate": 6.263255567338282e-05, "loss": 0.5917, "step": 2920 }, { "epoch": 0.38330716902145473, "grad_norm": 0.5635081193305911, "learning_rate": 6.25e-05, "loss": 0.8333, "step": 2930 }, { "epoch": 0.38461538461538464, "grad_norm": 0.7155336191069814, "learning_rate": 6.236744432661719e-05, "loss": 0.6211, "step": 2940 }, { "epoch": 0.3859236002093145, "grad_norm": 0.5087934741189605, "learning_rate": 6.223488865323436e-05, "loss": 0.8531, "step": 2950 }, { "epoch": 0.3872318158032444, "grad_norm": 0.5635575417178986, "learning_rate": 6.210233297985155e-05, "loss": 0.6134, "step": 2960 }, { "epoch": 0.38854003139717425, "grad_norm": 0.5136035425363329, "learning_rate": 6.196977730646872e-05, "loss": 0.8142, "step": 2970 }, { "epoch": 0.38984824699110415, "grad_norm": 0.6741458077201186, "learning_rate": 6.183722163308589e-05, "loss": 0.6388, "step": 2980 }, { "epoch": 0.391156462585034, "grad_norm": 0.49349088755882425, "learning_rate": 6.170466595970308e-05, "loss": 0.8764, "step": 2990 }, { "epoch": 0.3924646781789639, "grad_norm": 0.5383608654756946, "learning_rate": 6.157211028632026e-05, "loss": 0.6271, "step": 3000 }, { "epoch": 0.39377289377289376, "grad_norm": 0.5194331296491325, "learning_rate": 6.143955461293744e-05, "loss": 0.8087, "step": 3010 }, { "epoch": 0.39508110936682367, "grad_norm": 1.1199076383208129, "learning_rate": 6.130699893955462e-05, "loss": 0.5948, "step": 3020 }, { "epoch": 0.3963893249607535, "grad_norm": 0.5119305052018042, "learning_rate": 6.11744432661718e-05, "loss": 0.7975, "step": 3030 }, { "epoch": 0.3976975405546834, "grad_norm": 0.6365196412735817, "learning_rate": 6.104188759278897e-05, "loss": 0.614, "step": 3040 }, { "epoch": 0.3990057561486133, "grad_norm": 0.4736074626159983, "learning_rate": 6.0909331919406154e-05, "loss": 0.8416, "step": 3050 }, { "epoch": 0.4003139717425432, "grad_norm": 0.8101929076329764, "learning_rate": 6.077677624602334e-05, "loss": 0.6173, "step": 3060 }, { "epoch": 0.40162218733647304, "grad_norm": 0.6168972276400505, "learning_rate": 6.0644220572640506e-05, "loss": 0.8407, "step": 3070 }, { "epoch": 0.40293040293040294, "grad_norm": 0.7038053233260652, "learning_rate": 6.051166489925769e-05, "loss": 0.6247, "step": 3080 }, { "epoch": 0.4042386185243328, "grad_norm": 0.457467448213452, "learning_rate": 6.037910922587487e-05, "loss": 0.8322, "step": 3090 }, { "epoch": 0.4055468341182627, "grad_norm": 0.49357888449301257, "learning_rate": 6.0246553552492044e-05, "loss": 0.5908, "step": 3100 }, { "epoch": 0.40685504971219255, "grad_norm": 0.5808455738349774, "learning_rate": 6.011399787910923e-05, "loss": 0.8208, "step": 3110 }, { "epoch": 0.40816326530612246, "grad_norm": 0.7022945988432154, "learning_rate": 5.998144220572641e-05, "loss": 0.6518, "step": 3120 }, { "epoch": 0.4094714809000523, "grad_norm": 0.6078075113609221, "learning_rate": 5.984888653234358e-05, "loss": 0.8636, "step": 3130 }, { "epoch": 0.4107796964939822, "grad_norm": 0.6199267742106505, "learning_rate": 5.971633085896077e-05, "loss": 0.5869, "step": 3140 }, { "epoch": 0.41208791208791207, "grad_norm": 0.6063216826492921, "learning_rate": 5.958377518557795e-05, "loss": 0.8341, "step": 3150 }, { "epoch": 0.413396127681842, "grad_norm": 0.3985769574080392, "learning_rate": 5.945121951219512e-05, "loss": 0.6207, "step": 3160 }, { "epoch": 0.4147043432757718, "grad_norm": 0.5360752659857189, "learning_rate": 5.931866383881231e-05, "loss": 0.8621, "step": 3170 }, { "epoch": 0.41601255886970173, "grad_norm": 0.7664757845756865, "learning_rate": 5.9186108165429486e-05, "loss": 0.6172, "step": 3180 }, { "epoch": 0.4173207744636316, "grad_norm": 0.5679551885466096, "learning_rate": 5.905355249204666e-05, "loss": 0.8324, "step": 3190 }, { "epoch": 0.4186289900575615, "grad_norm": 0.5745648833281791, "learning_rate": 5.8920996818663845e-05, "loss": 0.5986, "step": 3200 }, { "epoch": 0.41993720565149134, "grad_norm": 0.5395040305019896, "learning_rate": 5.8788441145281024e-05, "loss": 0.8278, "step": 3210 }, { "epoch": 0.42124542124542125, "grad_norm": 0.5945122594045797, "learning_rate": 5.86558854718982e-05, "loss": 0.6128, "step": 3220 }, { "epoch": 0.4225536368393511, "grad_norm": 0.4545401837270316, "learning_rate": 5.852332979851538e-05, "loss": 0.8388, "step": 3230 }, { "epoch": 0.423861852433281, "grad_norm": 0.5949653779281652, "learning_rate": 5.839077412513256e-05, "loss": 0.6167, "step": 3240 }, { "epoch": 0.42517006802721086, "grad_norm": 0.5067052935095019, "learning_rate": 5.8258218451749735e-05, "loss": 0.8489, "step": 3250 }, { "epoch": 0.42647828362114076, "grad_norm": 0.5191225717265471, "learning_rate": 5.812566277836692e-05, "loss": 0.6059, "step": 3260 }, { "epoch": 0.42778649921507067, "grad_norm": 0.4969767064651156, "learning_rate": 5.79931071049841e-05, "loss": 0.8627, "step": 3270 }, { "epoch": 0.4290947148090005, "grad_norm": 0.5785476329537975, "learning_rate": 5.786055143160127e-05, "loss": 0.622, "step": 3280 }, { "epoch": 0.43040293040293043, "grad_norm": 0.5598487117314234, "learning_rate": 5.772799575821846e-05, "loss": 0.807, "step": 3290 }, { "epoch": 0.4317111459968603, "grad_norm": 0.5427120253597403, "learning_rate": 5.759544008483564e-05, "loss": 0.6085, "step": 3300 }, { "epoch": 0.4330193615907902, "grad_norm": 0.5672042251019751, "learning_rate": 5.746288441145281e-05, "loss": 0.822, "step": 3310 }, { "epoch": 0.43432757718472004, "grad_norm": 0.2894379338449793, "learning_rate": 5.733032873806999e-05, "loss": 0.5945, "step": 3320 }, { "epoch": 0.43563579277864994, "grad_norm": 0.677558336208755, "learning_rate": 5.719777306468718e-05, "loss": 0.8413, "step": 3330 }, { "epoch": 0.4369440083725798, "grad_norm": 0.723454845173246, "learning_rate": 5.706521739130435e-05, "loss": 0.6216, "step": 3340 }, { "epoch": 0.4382522239665097, "grad_norm": 0.4482745953779932, "learning_rate": 5.693266171792153e-05, "loss": 0.8383, "step": 3350 }, { "epoch": 0.43956043956043955, "grad_norm": 0.5812694343372896, "learning_rate": 5.68001060445387e-05, "loss": 0.6193, "step": 3360 }, { "epoch": 0.44086865515436946, "grad_norm": 0.6007437160420993, "learning_rate": 5.666755037115589e-05, "loss": 0.8203, "step": 3370 }, { "epoch": 0.4421768707482993, "grad_norm": 0.6580604949252575, "learning_rate": 5.653499469777307e-05, "loss": 0.6043, "step": 3380 }, { "epoch": 0.4434850863422292, "grad_norm": 0.6046064236672993, "learning_rate": 5.640243902439024e-05, "loss": 0.8204, "step": 3390 }, { "epoch": 0.44479330193615907, "grad_norm": 0.47437375038207613, "learning_rate": 5.6269883351007426e-05, "loss": 0.6205, "step": 3400 }, { "epoch": 0.446101517530089, "grad_norm": 0.4644396135020371, "learning_rate": 5.6137327677624605e-05, "loss": 0.8304, "step": 3410 }, { "epoch": 0.4474097331240188, "grad_norm": 0.6080652395243531, "learning_rate": 5.600477200424178e-05, "loss": 0.5971, "step": 3420 }, { "epoch": 0.44871794871794873, "grad_norm": 0.4969712368034865, "learning_rate": 5.5872216330858964e-05, "loss": 0.824, "step": 3430 }, { "epoch": 0.4500261643118786, "grad_norm": 0.7769636104632821, "learning_rate": 5.5739660657476144e-05, "loss": 0.6071, "step": 3440 }, { "epoch": 0.4513343799058085, "grad_norm": 0.5343464867649641, "learning_rate": 5.5607104984093316e-05, "loss": 0.8439, "step": 3450 }, { "epoch": 0.45264259549973834, "grad_norm": 0.4670043204824149, "learning_rate": 5.54745493107105e-05, "loss": 0.5924, "step": 3460 }, { "epoch": 0.45395081109366825, "grad_norm": 0.5128069264337731, "learning_rate": 5.534199363732768e-05, "loss": 0.8318, "step": 3470 }, { "epoch": 0.4552590266875981, "grad_norm": 0.5401925527479321, "learning_rate": 5.5209437963944854e-05, "loss": 0.5857, "step": 3480 }, { "epoch": 0.456567242281528, "grad_norm": 0.5099962477533319, "learning_rate": 5.507688229056204e-05, "loss": 0.8202, "step": 3490 }, { "epoch": 0.45787545787545786, "grad_norm": 0.5003587460481527, "learning_rate": 5.494432661717922e-05, "loss": 0.58, "step": 3500 }, { "epoch": 0.45918367346938777, "grad_norm": 0.5134647272370393, "learning_rate": 5.481177094379639e-05, "loss": 0.8078, "step": 3510 }, { "epoch": 0.4604918890633176, "grad_norm": 0.5469814680290468, "learning_rate": 5.467921527041357e-05, "loss": 0.6337, "step": 3520 }, { "epoch": 0.4618001046572475, "grad_norm": 0.592648552852463, "learning_rate": 5.454665959703076e-05, "loss": 0.8411, "step": 3530 }, { "epoch": 0.4631083202511774, "grad_norm": 0.5181504434630264, "learning_rate": 5.441410392364793e-05, "loss": 0.6083, "step": 3540 }, { "epoch": 0.4644165358451073, "grad_norm": 0.5499418553622303, "learning_rate": 5.428154825026511e-05, "loss": 0.849, "step": 3550 }, { "epoch": 0.46572475143903713, "grad_norm": 0.6574654424945503, "learning_rate": 5.4148992576882296e-05, "loss": 0.608, "step": 3560 }, { "epoch": 0.46703296703296704, "grad_norm": 0.6187475128909983, "learning_rate": 5.401643690349947e-05, "loss": 0.8159, "step": 3570 }, { "epoch": 0.4683411826268969, "grad_norm": 0.534345893303697, "learning_rate": 5.388388123011665e-05, "loss": 0.6117, "step": 3580 }, { "epoch": 0.4696493982208268, "grad_norm": 0.4564109249983669, "learning_rate": 5.3751325556733834e-05, "loss": 0.8049, "step": 3590 }, { "epoch": 0.47095761381475665, "grad_norm": 0.5578958162524437, "learning_rate": 5.361876988335101e-05, "loss": 0.6157, "step": 3600 }, { "epoch": 0.47226582940868655, "grad_norm": 0.500181472769301, "learning_rate": 5.3486214209968186e-05, "loss": 0.8502, "step": 3610 }, { "epoch": 0.4735740450026164, "grad_norm": 0.6141834045693259, "learning_rate": 5.335365853658537e-05, "loss": 0.6031, "step": 3620 }, { "epoch": 0.4748822605965463, "grad_norm": 0.4961433938028621, "learning_rate": 5.3221102863202545e-05, "loss": 0.8375, "step": 3630 }, { "epoch": 0.47619047619047616, "grad_norm": 0.520091439534862, "learning_rate": 5.3088547189819725e-05, "loss": 0.5814, "step": 3640 }, { "epoch": 0.47749869178440607, "grad_norm": 0.6206247789922092, "learning_rate": 5.295599151643691e-05, "loss": 0.8411, "step": 3650 }, { "epoch": 0.478806907378336, "grad_norm": 0.5343995283355165, "learning_rate": 5.282343584305408e-05, "loss": 0.5814, "step": 3660 }, { "epoch": 0.48011512297226583, "grad_norm": 0.6876308319219042, "learning_rate": 5.269088016967126e-05, "loss": 0.8112, "step": 3670 }, { "epoch": 0.48142333856619574, "grad_norm": 0.8088277743851824, "learning_rate": 5.255832449628845e-05, "loss": 0.5852, "step": 3680 }, { "epoch": 0.4827315541601256, "grad_norm": 0.5226035766833458, "learning_rate": 5.242576882290562e-05, "loss": 0.8115, "step": 3690 }, { "epoch": 0.4840397697540555, "grad_norm": 0.711443652596892, "learning_rate": 5.22932131495228e-05, "loss": 0.6331, "step": 3700 }, { "epoch": 0.48534798534798534, "grad_norm": 0.44695965081239714, "learning_rate": 5.216065747613999e-05, "loss": 0.8312, "step": 3710 }, { "epoch": 0.48665620094191525, "grad_norm": 0.43172836208765664, "learning_rate": 5.202810180275716e-05, "loss": 0.6138, "step": 3720 }, { "epoch": 0.4879644165358451, "grad_norm": 0.48852761265154354, "learning_rate": 5.189554612937434e-05, "loss": 0.8535, "step": 3730 }, { "epoch": 0.489272632129775, "grad_norm": 0.98510305137075, "learning_rate": 5.1762990455991525e-05, "loss": 0.6202, "step": 3740 }, { "epoch": 0.49058084772370486, "grad_norm": 0.48915174823336055, "learning_rate": 5.163043478260869e-05, "loss": 0.8519, "step": 3750 }, { "epoch": 0.49188906331763477, "grad_norm": 0.5904577427313913, "learning_rate": 5.149787910922588e-05, "loss": 0.6166, "step": 3760 }, { "epoch": 0.4931972789115646, "grad_norm": 0.5128070606528945, "learning_rate": 5.1365323435843063e-05, "loss": 0.8471, "step": 3770 }, { "epoch": 0.4945054945054945, "grad_norm": 0.5725171611100351, "learning_rate": 5.123276776246023e-05, "loss": 0.6287, "step": 3780 }, { "epoch": 0.4958137100994244, "grad_norm": 0.539320741587729, "learning_rate": 5.1100212089077415e-05, "loss": 0.824, "step": 3790 }, { "epoch": 0.4971219256933543, "grad_norm": 0.4028412015043289, "learning_rate": 5.0967656415694595e-05, "loss": 0.5806, "step": 3800 }, { "epoch": 0.49843014128728413, "grad_norm": 0.5389621790607799, "learning_rate": 5.083510074231177e-05, "loss": 0.8308, "step": 3810 }, { "epoch": 0.49973835688121404, "grad_norm": 0.6011860291653554, "learning_rate": 5.0702545068928954e-05, "loss": 0.6003, "step": 3820 }, { "epoch": 0.501046572475144, "grad_norm": 0.5389080519884727, "learning_rate": 5.056998939554613e-05, "loss": 0.8276, "step": 3830 }, { "epoch": 0.5023547880690737, "grad_norm": 0.6088518124424828, "learning_rate": 5.0437433722163306e-05, "loss": 0.6195, "step": 3840 }, { "epoch": 0.5036630036630036, "grad_norm": 0.6515108951670922, "learning_rate": 5.030487804878049e-05, "loss": 0.8051, "step": 3850 }, { "epoch": 0.5049712192569336, "grad_norm": 0.4313425003620371, "learning_rate": 5.017232237539767e-05, "loss": 0.6071, "step": 3860 }, { "epoch": 0.5062794348508635, "grad_norm": 0.5264707278305082, "learning_rate": 5.0039766702014844e-05, "loss": 0.8364, "step": 3870 }, { "epoch": 0.5075876504447933, "grad_norm": 0.7048581680415387, "learning_rate": 4.990721102863203e-05, "loss": 0.6049, "step": 3880 }, { "epoch": 0.5088958660387232, "grad_norm": 0.5055653926619902, "learning_rate": 4.97746553552492e-05, "loss": 0.8285, "step": 3890 }, { "epoch": 0.5102040816326531, "grad_norm": 0.5437489221882642, "learning_rate": 4.964209968186639e-05, "loss": 0.5851, "step": 3900 }, { "epoch": 0.511512297226583, "grad_norm": 0.5503188602842589, "learning_rate": 4.950954400848357e-05, "loss": 0.8251, "step": 3910 }, { "epoch": 0.5128205128205128, "grad_norm": 0.5699382425924778, "learning_rate": 4.937698833510074e-05, "loss": 0.6065, "step": 3920 }, { "epoch": 0.5141287284144427, "grad_norm": 0.5601404210013031, "learning_rate": 4.924443266171793e-05, "loss": 0.8332, "step": 3930 }, { "epoch": 0.5154369440083726, "grad_norm": 0.6107160876812969, "learning_rate": 4.9111876988335106e-05, "loss": 0.6407, "step": 3940 }, { "epoch": 0.5167451596023025, "grad_norm": 0.5627086115947509, "learning_rate": 4.897932131495228e-05, "loss": 0.8524, "step": 3950 }, { "epoch": 0.5180533751962323, "grad_norm": 0.47974680429027505, "learning_rate": 4.8846765641569465e-05, "loss": 0.5891, "step": 3960 }, { "epoch": 0.5193615907901622, "grad_norm": 0.6078144910339062, "learning_rate": 4.8714209968186645e-05, "loss": 0.8207, "step": 3970 }, { "epoch": 0.5206698063840921, "grad_norm": 0.6828182233693575, "learning_rate": 4.858165429480382e-05, "loss": 0.6234, "step": 3980 }, { "epoch": 0.521978021978022, "grad_norm": 0.5078577809608877, "learning_rate": 4.8449098621421e-05, "loss": 0.8293, "step": 3990 }, { "epoch": 0.5232862375719518, "grad_norm": 0.501493452942641, "learning_rate": 4.8316542948038176e-05, "loss": 0.5924, "step": 4000 }, { "epoch": 0.5245944531658817, "grad_norm": 0.5914460419911263, "learning_rate": 4.8183987274655355e-05, "loss": 0.8204, "step": 4010 }, { "epoch": 0.5259026687598116, "grad_norm": 0.5108518700290716, "learning_rate": 4.8051431601272535e-05, "loss": 0.5938, "step": 4020 }, { "epoch": 0.5272108843537415, "grad_norm": 0.5931433550467333, "learning_rate": 4.7918875927889714e-05, "loss": 0.8405, "step": 4030 }, { "epoch": 0.5285190999476713, "grad_norm": 0.6193186586877497, "learning_rate": 4.7786320254506894e-05, "loss": 0.5813, "step": 4040 }, { "epoch": 0.5298273155416012, "grad_norm": 0.5380628082146289, "learning_rate": 4.765376458112407e-05, "loss": 0.8669, "step": 4050 }, { "epoch": 0.5311355311355311, "grad_norm": 0.5763128333912579, "learning_rate": 4.752120890774125e-05, "loss": 0.6076, "step": 4060 }, { "epoch": 0.532443746729461, "grad_norm": 0.5145862547581488, "learning_rate": 4.738865323435843e-05, "loss": 0.8424, "step": 4070 }, { "epoch": 0.533751962323391, "grad_norm": 0.5664958788683205, "learning_rate": 4.725609756097561e-05, "loss": 0.6293, "step": 4080 }, { "epoch": 0.5350601779173207, "grad_norm": 0.5292223052367038, "learning_rate": 4.712354188759279e-05, "loss": 0.8173, "step": 4090 }, { "epoch": 0.5363683935112507, "grad_norm": 0.58673619277203, "learning_rate": 4.699098621420997e-05, "loss": 0.5763, "step": 4100 }, { "epoch": 0.5376766091051806, "grad_norm": 0.5594174176545202, "learning_rate": 4.685843054082715e-05, "loss": 0.8028, "step": 4110 }, { "epoch": 0.5389848246991105, "grad_norm": 0.6339559869110518, "learning_rate": 4.672587486744433e-05, "loss": 0.6053, "step": 4120 }, { "epoch": 0.5402930402930403, "grad_norm": 0.671405553815309, "learning_rate": 4.659331919406151e-05, "loss": 0.8398, "step": 4130 }, { "epoch": 0.5416012558869702, "grad_norm": 0.5445580424726075, "learning_rate": 4.646076352067869e-05, "loss": 0.621, "step": 4140 }, { "epoch": 0.5429094714809001, "grad_norm": 0.5938642782111705, "learning_rate": 4.632820784729587e-05, "loss": 0.8295, "step": 4150 }, { "epoch": 0.54421768707483, "grad_norm": 0.6435152678972964, "learning_rate": 4.6195652173913046e-05, "loss": 0.618, "step": 4160 }, { "epoch": 0.5455259026687598, "grad_norm": 0.5089883798718602, "learning_rate": 4.6063096500530226e-05, "loss": 0.8135, "step": 4170 }, { "epoch": 0.5468341182626897, "grad_norm": 0.5714660790898384, "learning_rate": 4.5930540827147405e-05, "loss": 0.622, "step": 4180 }, { "epoch": 0.5481423338566196, "grad_norm": 0.5423779003965784, "learning_rate": 4.5797985153764584e-05, "loss": 0.8011, "step": 4190 }, { "epoch": 0.5494505494505495, "grad_norm": 0.5305546579845897, "learning_rate": 4.5665429480381764e-05, "loss": 0.5792, "step": 4200 }, { "epoch": 0.5507587650444793, "grad_norm": 0.5057565979180018, "learning_rate": 4.553287380699894e-05, "loss": 0.8326, "step": 4210 }, { "epoch": 0.5520669806384092, "grad_norm": 0.5108055603767078, "learning_rate": 4.540031813361612e-05, "loss": 0.6014, "step": 4220 }, { "epoch": 0.5533751962323391, "grad_norm": 0.5358448167728354, "learning_rate": 4.5267762460233295e-05, "loss": 0.8152, "step": 4230 }, { "epoch": 0.554683411826269, "grad_norm": 0.7719427855271279, "learning_rate": 4.513520678685048e-05, "loss": 0.5967, "step": 4240 }, { "epoch": 0.5559916274201988, "grad_norm": 0.6513813945282019, "learning_rate": 4.500265111346766e-05, "loss": 0.8133, "step": 4250 }, { "epoch": 0.5572998430141287, "grad_norm": 0.611360841284405, "learning_rate": 4.487009544008483e-05, "loss": 0.6109, "step": 4260 }, { "epoch": 0.5586080586080586, "grad_norm": 0.5122207195541725, "learning_rate": 4.473753976670202e-05, "loss": 0.8237, "step": 4270 }, { "epoch": 0.5599162742019885, "grad_norm": 0.47024973003128884, "learning_rate": 4.46049840933192e-05, "loss": 0.6308, "step": 4280 }, { "epoch": 0.5612244897959183, "grad_norm": 0.5605094944984179, "learning_rate": 4.447242841993637e-05, "loss": 0.8664, "step": 4290 }, { "epoch": 0.5625327053898482, "grad_norm": 0.6555414150243584, "learning_rate": 4.433987274655356e-05, "loss": 0.6108, "step": 4300 }, { "epoch": 0.5638409209837781, "grad_norm": 0.4937828454850159, "learning_rate": 4.420731707317074e-05, "loss": 0.8288, "step": 4310 }, { "epoch": 0.565149136577708, "grad_norm": 0.5949648171644456, "learning_rate": 4.407476139978791e-05, "loss": 0.5755, "step": 4320 }, { "epoch": 0.5664573521716378, "grad_norm": 0.6150493489271065, "learning_rate": 4.3942205726405096e-05, "loss": 0.8562, "step": 4330 }, { "epoch": 0.5677655677655677, "grad_norm": 0.5168321021587649, "learning_rate": 4.3809650053022275e-05, "loss": 0.5859, "step": 4340 }, { "epoch": 0.5690737833594977, "grad_norm": 0.5149672746596766, "learning_rate": 4.367709437963945e-05, "loss": 0.8035, "step": 4350 }, { "epoch": 0.5703819989534276, "grad_norm": 0.6254032996036644, "learning_rate": 4.3544538706256634e-05, "loss": 0.6081, "step": 4360 }, { "epoch": 0.5716902145473574, "grad_norm": 0.5439410227795257, "learning_rate": 4.341198303287381e-05, "loss": 0.8486, "step": 4370 }, { "epoch": 0.5729984301412873, "grad_norm": 0.6989019848301199, "learning_rate": 4.3279427359490986e-05, "loss": 0.5977, "step": 4380 }, { "epoch": 0.5743066457352172, "grad_norm": 0.5253254730813908, "learning_rate": 4.314687168610817e-05, "loss": 0.8166, "step": 4390 }, { "epoch": 0.5756148613291471, "grad_norm": 0.6306757564007922, "learning_rate": 4.3014316012725345e-05, "loss": 0.6037, "step": 4400 }, { "epoch": 0.5769230769230769, "grad_norm": 0.457131073575816, "learning_rate": 4.2881760339342524e-05, "loss": 0.8058, "step": 4410 }, { "epoch": 0.5782312925170068, "grad_norm": 0.6651032212615784, "learning_rate": 4.274920466595971e-05, "loss": 0.577, "step": 4420 }, { "epoch": 0.5795395081109367, "grad_norm": 0.5052314086043747, "learning_rate": 4.261664899257688e-05, "loss": 0.8133, "step": 4430 }, { "epoch": 0.5808477237048666, "grad_norm": 0.6345651754805625, "learning_rate": 4.248409331919406e-05, "loss": 0.5862, "step": 4440 }, { "epoch": 0.5821559392987964, "grad_norm": 0.5470901361744871, "learning_rate": 4.235153764581124e-05, "loss": 0.82, "step": 4450 }, { "epoch": 0.5834641548927263, "grad_norm": 0.6148840090122306, "learning_rate": 4.221898197242842e-05, "loss": 0.5963, "step": 4460 }, { "epoch": 0.5847723704866562, "grad_norm": 0.5654198812120623, "learning_rate": 4.20864262990456e-05, "loss": 0.8098, "step": 4470 }, { "epoch": 0.5860805860805861, "grad_norm": 0.8759288696982239, "learning_rate": 4.195387062566278e-05, "loss": 0.6222, "step": 4480 }, { "epoch": 0.587388801674516, "grad_norm": 0.479216838198247, "learning_rate": 4.182131495227996e-05, "loss": 0.8338, "step": 4490 }, { "epoch": 0.5886970172684458, "grad_norm": 0.7698717669406883, "learning_rate": 4.168875927889714e-05, "loss": 0.6278, "step": 4500 }, { "epoch": 0.5900052328623757, "grad_norm": 0.48674948133579515, "learning_rate": 4.155620360551432e-05, "loss": 0.7887, "step": 4510 }, { "epoch": 0.5913134484563056, "grad_norm": 0.5786002565311555, "learning_rate": 4.14236479321315e-05, "loss": 0.5891, "step": 4520 }, { "epoch": 0.5926216640502355, "grad_norm": 0.5370485226818142, "learning_rate": 4.129109225874868e-05, "loss": 0.8276, "step": 4530 }, { "epoch": 0.5939298796441653, "grad_norm": 0.43625907655336543, "learning_rate": 4.1158536585365856e-05, "loss": 0.5856, "step": 4540 }, { "epoch": 0.5952380952380952, "grad_norm": 0.5266268760596537, "learning_rate": 4.1025980911983036e-05, "loss": 0.7964, "step": 4550 }, { "epoch": 0.5965463108320251, "grad_norm": 0.5771002724293784, "learning_rate": 4.0893425238600215e-05, "loss": 0.5854, "step": 4560 }, { "epoch": 0.597854526425955, "grad_norm": 0.5919954827087538, "learning_rate": 4.076086956521739e-05, "loss": 0.8034, "step": 4570 }, { "epoch": 0.5991627420198848, "grad_norm": 0.5680838313723001, "learning_rate": 4.0628313891834574e-05, "loss": 0.5926, "step": 4580 }, { "epoch": 0.6004709576138147, "grad_norm": 0.6715456913252982, "learning_rate": 4.049575821845175e-05, "loss": 0.8389, "step": 4590 }, { "epoch": 0.6017791732077447, "grad_norm": 0.5499015722260664, "learning_rate": 4.0363202545068926e-05, "loss": 0.6022, "step": 4600 }, { "epoch": 0.6030873888016746, "grad_norm": 0.7560936310398005, "learning_rate": 4.023064687168611e-05, "loss": 0.8014, "step": 4610 }, { "epoch": 0.6043956043956044, "grad_norm": 0.5816663839566236, "learning_rate": 4.009809119830329e-05, "loss": 0.6037, "step": 4620 }, { "epoch": 0.6057038199895343, "grad_norm": 0.4954065888150648, "learning_rate": 3.9965535524920464e-05, "loss": 0.8123, "step": 4630 }, { "epoch": 0.6070120355834642, "grad_norm": 0.6634722345039996, "learning_rate": 3.983297985153765e-05, "loss": 0.6093, "step": 4640 }, { "epoch": 0.6083202511773941, "grad_norm": 0.5448340011567387, "learning_rate": 3.970042417815483e-05, "loss": 0.8391, "step": 4650 }, { "epoch": 0.6096284667713239, "grad_norm": 0.7234298679122729, "learning_rate": 3.9567868504772e-05, "loss": 0.6123, "step": 4660 }, { "epoch": 0.6109366823652538, "grad_norm": 0.7113516275751068, "learning_rate": 3.943531283138919e-05, "loss": 0.8188, "step": 4670 }, { "epoch": 0.6122448979591837, "grad_norm": 0.5343485382635467, "learning_rate": 3.930275715800637e-05, "loss": 0.6025, "step": 4680 }, { "epoch": 0.6135531135531136, "grad_norm": 0.574688006845476, "learning_rate": 3.917020148462354e-05, "loss": 0.8244, "step": 4690 }, { "epoch": 0.6148613291470434, "grad_norm": 0.5954437693210181, "learning_rate": 3.9037645811240727e-05, "loss": 0.5976, "step": 4700 }, { "epoch": 0.6161695447409733, "grad_norm": 0.6359916685979564, "learning_rate": 3.89050901378579e-05, "loss": 0.804, "step": 4710 }, { "epoch": 0.6174777603349032, "grad_norm": 0.37864818768935987, "learning_rate": 3.877253446447508e-05, "loss": 0.5819, "step": 4720 }, { "epoch": 0.6187859759288331, "grad_norm": 0.6317553124676056, "learning_rate": 3.8639978791092265e-05, "loss": 0.8275, "step": 4730 }, { "epoch": 0.6200941915227629, "grad_norm": 0.37899782241778535, "learning_rate": 3.850742311770944e-05, "loss": 0.5939, "step": 4740 }, { "epoch": 0.6214024071166928, "grad_norm": 0.5096002461412386, "learning_rate": 3.837486744432662e-05, "loss": 0.8152, "step": 4750 }, { "epoch": 0.6227106227106227, "grad_norm": 0.5858186125506725, "learning_rate": 3.82423117709438e-05, "loss": 0.5971, "step": 4760 }, { "epoch": 0.6240188383045526, "grad_norm": 0.5258202103911215, "learning_rate": 3.8109756097560976e-05, "loss": 0.8146, "step": 4770 }, { "epoch": 0.6253270538984824, "grad_norm": 0.59652515926618, "learning_rate": 3.7977200424178155e-05, "loss": 0.6034, "step": 4780 }, { "epoch": 0.6266352694924123, "grad_norm": 0.692241221647863, "learning_rate": 3.784464475079534e-05, "loss": 0.817, "step": 4790 }, { "epoch": 0.6279434850863422, "grad_norm": 0.5751964849585177, "learning_rate": 3.7712089077412514e-05, "loss": 0.5948, "step": 4800 }, { "epoch": 0.6292517006802721, "grad_norm": 0.4867932739502536, "learning_rate": 3.757953340402969e-05, "loss": 0.7895, "step": 4810 }, { "epoch": 0.6305599162742019, "grad_norm": 0.6527891870524758, "learning_rate": 3.744697773064688e-05, "loss": 0.5869, "step": 4820 }, { "epoch": 0.6318681318681318, "grad_norm": 0.542994791536692, "learning_rate": 3.731442205726405e-05, "loss": 0.8016, "step": 4830 }, { "epoch": 0.6331763474620618, "grad_norm": 0.49091636854896203, "learning_rate": 3.718186638388123e-05, "loss": 0.591, "step": 4840 }, { "epoch": 0.6344845630559917, "grad_norm": 0.4808333120684155, "learning_rate": 3.704931071049841e-05, "loss": 0.8374, "step": 4850 }, { "epoch": 0.6357927786499215, "grad_norm": 0.609783391275745, "learning_rate": 3.691675503711559e-05, "loss": 0.5727, "step": 4860 }, { "epoch": 0.6371009942438514, "grad_norm": 0.4901032831365679, "learning_rate": 3.678419936373277e-05, "loss": 0.8326, "step": 4870 }, { "epoch": 0.6384092098377813, "grad_norm": 0.5535614493168636, "learning_rate": 3.665164369034995e-05, "loss": 0.587, "step": 4880 }, { "epoch": 0.6397174254317112, "grad_norm": 0.5661579063549987, "learning_rate": 3.651908801696713e-05, "loss": 0.8006, "step": 4890 }, { "epoch": 0.6410256410256411, "grad_norm": 0.5755165016330849, "learning_rate": 3.638653234358431e-05, "loss": 0.5548, "step": 4900 }, { "epoch": 0.6423338566195709, "grad_norm": 0.5195028269557372, "learning_rate": 3.625397667020149e-05, "loss": 0.8411, "step": 4910 }, { "epoch": 0.6436420722135008, "grad_norm": 0.595652162086749, "learning_rate": 3.6121420996818666e-05, "loss": 0.5966, "step": 4920 }, { "epoch": 0.6449502878074307, "grad_norm": 0.5243990905265108, "learning_rate": 3.5988865323435846e-05, "loss": 0.8298, "step": 4930 }, { "epoch": 0.6462585034013606, "grad_norm": 0.7940136105161147, "learning_rate": 3.585630965005302e-05, "loss": 0.6309, "step": 4940 }, { "epoch": 0.6475667189952904, "grad_norm": 0.4909267588948854, "learning_rate": 3.5723753976670205e-05, "loss": 0.7967, "step": 4950 }, { "epoch": 0.6488749345892203, "grad_norm": 0.6021461164458334, "learning_rate": 3.5591198303287384e-05, "loss": 0.5946, "step": 4960 }, { "epoch": 0.6501831501831502, "grad_norm": 0.5082946197773784, "learning_rate": 3.545864262990456e-05, "loss": 0.7959, "step": 4970 }, { "epoch": 0.6514913657770801, "grad_norm": 0.6887923783738013, "learning_rate": 3.532608695652174e-05, "loss": 0.6178, "step": 4980 }, { "epoch": 0.6527995813710099, "grad_norm": 0.4863608457352857, "learning_rate": 3.519353128313892e-05, "loss": 0.8218, "step": 4990 }, { "epoch": 0.6541077969649398, "grad_norm": 0.6370508452362942, "learning_rate": 3.5060975609756095e-05, "loss": 0.5726, "step": 5000 }, { "epoch": 0.6554160125588697, "grad_norm": 0.5626726082078685, "learning_rate": 3.492841993637328e-05, "loss": 0.8388, "step": 5010 }, { "epoch": 0.6567242281527996, "grad_norm": 0.6058799396796493, "learning_rate": 3.479586426299046e-05, "loss": 0.6345, "step": 5020 }, { "epoch": 0.6580324437467294, "grad_norm": 0.5486653470890362, "learning_rate": 3.466330858960763e-05, "loss": 0.8304, "step": 5030 }, { "epoch": 0.6593406593406593, "grad_norm": 0.7402614360383322, "learning_rate": 3.453075291622482e-05, "loss": 0.5972, "step": 5040 }, { "epoch": 0.6606488749345892, "grad_norm": 0.5065684112904657, "learning_rate": 3.4398197242842e-05, "loss": 0.8049, "step": 5050 }, { "epoch": 0.6619570905285191, "grad_norm": 0.4487165281422432, "learning_rate": 3.426564156945917e-05, "loss": 0.5813, "step": 5060 }, { "epoch": 0.6632653061224489, "grad_norm": 0.5576418936555791, "learning_rate": 3.413308589607636e-05, "loss": 0.85, "step": 5070 }, { "epoch": 0.6645735217163788, "grad_norm": 0.5629552965222027, "learning_rate": 3.400053022269353e-05, "loss": 0.6144, "step": 5080 }, { "epoch": 0.6658817373103088, "grad_norm": 0.5340051545534295, "learning_rate": 3.386797454931071e-05, "loss": 0.8256, "step": 5090 }, { "epoch": 0.6671899529042387, "grad_norm": 0.6481929473760202, "learning_rate": 3.3735418875927896e-05, "loss": 0.5934, "step": 5100 }, { "epoch": 0.6684981684981685, "grad_norm": 0.586068083609296, "learning_rate": 3.360286320254507e-05, "loss": 0.8065, "step": 5110 }, { "epoch": 0.6698063840920984, "grad_norm": 0.5898674035419238, "learning_rate": 3.347030752916225e-05, "loss": 0.6332, "step": 5120 }, { "epoch": 0.6711145996860283, "grad_norm": 0.5271996892541019, "learning_rate": 3.3337751855779434e-05, "loss": 0.8407, "step": 5130 }, { "epoch": 0.6724228152799582, "grad_norm": 0.7209460794111061, "learning_rate": 3.3205196182396606e-05, "loss": 0.5954, "step": 5140 }, { "epoch": 0.673731030873888, "grad_norm": 0.5419491953310692, "learning_rate": 3.3072640509013786e-05, "loss": 0.8186, "step": 5150 }, { "epoch": 0.6750392464678179, "grad_norm": 0.6363503952339683, "learning_rate": 3.294008483563097e-05, "loss": 0.6252, "step": 5160 }, { "epoch": 0.6763474620617478, "grad_norm": 0.5479539766686561, "learning_rate": 3.2807529162248144e-05, "loss": 0.8248, "step": 5170 }, { "epoch": 0.6776556776556777, "grad_norm": 0.5535849243715827, "learning_rate": 3.2674973488865324e-05, "loss": 0.5717, "step": 5180 }, { "epoch": 0.6789638932496075, "grad_norm": 0.5091172962824471, "learning_rate": 3.25424178154825e-05, "loss": 0.8203, "step": 5190 }, { "epoch": 0.6802721088435374, "grad_norm": 0.562594433405121, "learning_rate": 3.240986214209968e-05, "loss": 0.6089, "step": 5200 }, { "epoch": 0.6815803244374673, "grad_norm": 0.5517130562661668, "learning_rate": 3.227730646871686e-05, "loss": 0.8446, "step": 5210 }, { "epoch": 0.6828885400313972, "grad_norm": 0.4047148337025719, "learning_rate": 3.214475079533404e-05, "loss": 0.5894, "step": 5220 }, { "epoch": 0.684196755625327, "grad_norm": 0.5239322456435999, "learning_rate": 3.201219512195122e-05, "loss": 0.8321, "step": 5230 }, { "epoch": 0.6855049712192569, "grad_norm": 0.5498585626284401, "learning_rate": 3.18796394485684e-05, "loss": 0.6279, "step": 5240 }, { "epoch": 0.6868131868131868, "grad_norm": 0.48743593460492207, "learning_rate": 3.174708377518558e-05, "loss": 0.8242, "step": 5250 }, { "epoch": 0.6881214024071167, "grad_norm": 0.5651556275625538, "learning_rate": 3.161452810180276e-05, "loss": 0.6286, "step": 5260 }, { "epoch": 0.6894296180010465, "grad_norm": 0.5754174898482404, "learning_rate": 3.148197242841994e-05, "loss": 0.8177, "step": 5270 }, { "epoch": 0.6907378335949764, "grad_norm": 0.6605607744635831, "learning_rate": 3.134941675503712e-05, "loss": 0.597, "step": 5280 }, { "epoch": 0.6920460491889063, "grad_norm": 0.5228692951705382, "learning_rate": 3.12168610816543e-05, "loss": 0.7984, "step": 5290 }, { "epoch": 0.6933542647828362, "grad_norm": 0.4480430717512152, "learning_rate": 3.1084305408271477e-05, "loss": 0.6067, "step": 5300 }, { "epoch": 0.6946624803767661, "grad_norm": 0.5638626523175552, "learning_rate": 3.095174973488865e-05, "loss": 0.8093, "step": 5310 }, { "epoch": 0.6959706959706959, "grad_norm": 0.5946989663976191, "learning_rate": 3.0819194061505835e-05, "loss": 0.59, "step": 5320 }, { "epoch": 0.6972789115646258, "grad_norm": 0.5671263430143716, "learning_rate": 3.0686638388123015e-05, "loss": 0.8009, "step": 5330 }, { "epoch": 0.6985871271585558, "grad_norm": 0.589042697181555, "learning_rate": 3.055408271474019e-05, "loss": 0.5782, "step": 5340 }, { "epoch": 0.6998953427524857, "grad_norm": 0.7073787068267711, "learning_rate": 3.0421527041357374e-05, "loss": 0.7989, "step": 5350 }, { "epoch": 0.7012035583464155, "grad_norm": 0.5352877950074024, "learning_rate": 3.028897136797455e-05, "loss": 0.6068, "step": 5360 }, { "epoch": 0.7025117739403454, "grad_norm": 0.5346918859139157, "learning_rate": 3.015641569459173e-05, "loss": 0.8129, "step": 5370 }, { "epoch": 0.7038199895342753, "grad_norm": 0.48063163414005916, "learning_rate": 3.0023860021208912e-05, "loss": 0.561, "step": 5380 }, { "epoch": 0.7051282051282052, "grad_norm": 0.5431033944364678, "learning_rate": 2.9891304347826088e-05, "loss": 0.8314, "step": 5390 }, { "epoch": 0.706436420722135, "grad_norm": 0.6251701992093956, "learning_rate": 2.9758748674443264e-05, "loss": 0.6043, "step": 5400 }, { "epoch": 0.7077446363160649, "grad_norm": 0.538107678456505, "learning_rate": 2.962619300106045e-05, "loss": 0.8345, "step": 5410 }, { "epoch": 0.7090528519099948, "grad_norm": 0.5596293446673235, "learning_rate": 2.9493637327677626e-05, "loss": 0.5767, "step": 5420 }, { "epoch": 0.7103610675039247, "grad_norm": 0.6302036834487577, "learning_rate": 2.9361081654294802e-05, "loss": 0.817, "step": 5430 }, { "epoch": 0.7116692830978545, "grad_norm": 0.6442132298338353, "learning_rate": 2.9228525980911985e-05, "loss": 0.5956, "step": 5440 }, { "epoch": 0.7129774986917844, "grad_norm": 0.5294060275695159, "learning_rate": 2.9095970307529164e-05, "loss": 0.8086, "step": 5450 }, { "epoch": 0.7142857142857143, "grad_norm": 0.5780709508327503, "learning_rate": 2.896341463414634e-05, "loss": 0.599, "step": 5460 }, { "epoch": 0.7155939298796442, "grad_norm": 0.5178595179064079, "learning_rate": 2.8830858960763523e-05, "loss": 0.8023, "step": 5470 }, { "epoch": 0.716902145473574, "grad_norm": 0.4908347612839904, "learning_rate": 2.8698303287380702e-05, "loss": 0.5906, "step": 5480 }, { "epoch": 0.7182103610675039, "grad_norm": 0.5745266376669839, "learning_rate": 2.8565747613997878e-05, "loss": 0.8017, "step": 5490 }, { "epoch": 0.7195185766614338, "grad_norm": 0.7091252655798257, "learning_rate": 2.843319194061506e-05, "loss": 0.6165, "step": 5500 }, { "epoch": 0.7208267922553637, "grad_norm": 0.5082645638178946, "learning_rate": 2.830063626723224e-05, "loss": 0.7916, "step": 5510 }, { "epoch": 0.7221350078492935, "grad_norm": 0.8135604062724642, "learning_rate": 2.8168080593849416e-05, "loss": 0.5766, "step": 5520 }, { "epoch": 0.7234432234432234, "grad_norm": 0.5344352881317546, "learning_rate": 2.80355249204666e-05, "loss": 0.7945, "step": 5530 }, { "epoch": 0.7247514390371533, "grad_norm": 0.6213640188906101, "learning_rate": 2.7902969247083775e-05, "loss": 0.5739, "step": 5540 }, { "epoch": 0.7260596546310832, "grad_norm": 0.5133786932833476, "learning_rate": 2.7770413573700955e-05, "loss": 0.8175, "step": 5550 }, { "epoch": 0.727367870225013, "grad_norm": 0.5687616103525199, "learning_rate": 2.7637857900318137e-05, "loss": 0.5946, "step": 5560 }, { "epoch": 0.7286760858189429, "grad_norm": 0.5361603157753395, "learning_rate": 2.7505302226935313e-05, "loss": 0.8331, "step": 5570 }, { "epoch": 0.7299843014128728, "grad_norm": 0.4846643967669185, "learning_rate": 2.7372746553552493e-05, "loss": 0.5846, "step": 5580 }, { "epoch": 0.7312925170068028, "grad_norm": 0.5264633004571062, "learning_rate": 2.7240190880169676e-05, "loss": 0.7929, "step": 5590 }, { "epoch": 0.7326007326007326, "grad_norm": 0.5169934066982514, "learning_rate": 2.710763520678685e-05, "loss": 0.5812, "step": 5600 }, { "epoch": 0.7339089481946625, "grad_norm": 0.49516725051064175, "learning_rate": 2.697507953340403e-05, "loss": 0.8268, "step": 5610 }, { "epoch": 0.7352171637885924, "grad_norm": 0.5545634117837589, "learning_rate": 2.6842523860021214e-05, "loss": 0.6293, "step": 5620 }, { "epoch": 0.7365253793825223, "grad_norm": 0.5230373403077291, "learning_rate": 2.670996818663839e-05, "loss": 0.8166, "step": 5630 }, { "epoch": 0.7378335949764521, "grad_norm": 0.599453616763764, "learning_rate": 2.6577412513255566e-05, "loss": 0.5906, "step": 5640 }, { "epoch": 0.739141810570382, "grad_norm": 0.5973016920910486, "learning_rate": 2.6444856839872752e-05, "loss": 0.8012, "step": 5650 }, { "epoch": 0.7404500261643119, "grad_norm": 0.6449099450136865, "learning_rate": 2.6312301166489928e-05, "loss": 0.5705, "step": 5660 }, { "epoch": 0.7417582417582418, "grad_norm": 0.553182572295564, "learning_rate": 2.6179745493107104e-05, "loss": 0.8191, "step": 5670 }, { "epoch": 0.7430664573521716, "grad_norm": 0.4476262335389087, "learning_rate": 2.6047189819724287e-05, "loss": 0.5958, "step": 5680 }, { "epoch": 0.7443746729461015, "grad_norm": 0.4834161167587533, "learning_rate": 2.5914634146341466e-05, "loss": 0.8144, "step": 5690 }, { "epoch": 0.7456828885400314, "grad_norm": 0.5286677625170841, "learning_rate": 2.5782078472958642e-05, "loss": 0.5816, "step": 5700 }, { "epoch": 0.7469911041339613, "grad_norm": 0.5413842703568794, "learning_rate": 2.5649522799575825e-05, "loss": 0.8312, "step": 5710 }, { "epoch": 0.7482993197278912, "grad_norm": 0.6786517057597097, "learning_rate": 2.5516967126193004e-05, "loss": 0.6086, "step": 5720 }, { "epoch": 0.749607535321821, "grad_norm": 0.49742040117830627, "learning_rate": 2.538441145281018e-05, "loss": 0.7967, "step": 5730 }, { "epoch": 0.7509157509157509, "grad_norm": 0.49256559696291485, "learning_rate": 2.525185577942736e-05, "loss": 0.5724, "step": 5740 }, { "epoch": 0.7522239665096808, "grad_norm": 0.5263192680418415, "learning_rate": 2.5119300106044542e-05, "loss": 0.8011, "step": 5750 }, { "epoch": 0.7535321821036107, "grad_norm": 0.6716690763876726, "learning_rate": 2.498674443266172e-05, "loss": 0.601, "step": 5760 }, { "epoch": 0.7548403976975405, "grad_norm": 0.5268203857733461, "learning_rate": 2.4854188759278898e-05, "loss": 0.8224, "step": 5770 }, { "epoch": 0.7561486132914704, "grad_norm": 0.6545344595309585, "learning_rate": 2.4721633085896077e-05, "loss": 0.5883, "step": 5780 }, { "epoch": 0.7574568288854003, "grad_norm": 0.5192641904709395, "learning_rate": 2.4589077412513257e-05, "loss": 0.8223, "step": 5790 }, { "epoch": 0.7587650444793302, "grad_norm": 0.5569056177071494, "learning_rate": 2.4456521739130436e-05, "loss": 0.5882, "step": 5800 }, { "epoch": 0.76007326007326, "grad_norm": 0.652217104391276, "learning_rate": 2.4323966065747615e-05, "loss": 0.8111, "step": 5810 }, { "epoch": 0.7613814756671899, "grad_norm": 0.715865728522808, "learning_rate": 2.4191410392364795e-05, "loss": 0.5839, "step": 5820 }, { "epoch": 0.7626896912611199, "grad_norm": 0.4622828984795863, "learning_rate": 2.4058854718981974e-05, "loss": 0.8152, "step": 5830 }, { "epoch": 0.7639979068550498, "grad_norm": 0.7105275925193085, "learning_rate": 2.392629904559915e-05, "loss": 0.5727, "step": 5840 }, { "epoch": 0.7653061224489796, "grad_norm": 0.5593573119711108, "learning_rate": 2.3793743372216333e-05, "loss": 0.787, "step": 5850 }, { "epoch": 0.7666143380429095, "grad_norm": 0.6976481535500858, "learning_rate": 2.3661187698833512e-05, "loss": 0.5828, "step": 5860 }, { "epoch": 0.7679225536368394, "grad_norm": 0.5629390717962797, "learning_rate": 2.352863202545069e-05, "loss": 0.8119, "step": 5870 }, { "epoch": 0.7692307692307693, "grad_norm": 0.8329984172410997, "learning_rate": 2.3396076352067868e-05, "loss": 0.5953, "step": 5880 }, { "epoch": 0.7705389848246991, "grad_norm": 0.5054513797033992, "learning_rate": 2.326352067868505e-05, "loss": 0.7897, "step": 5890 }, { "epoch": 0.771847200418629, "grad_norm": 0.8042478184879194, "learning_rate": 2.3130965005302227e-05, "loss": 0.601, "step": 5900 }, { "epoch": 0.7731554160125589, "grad_norm": 0.5134604404253917, "learning_rate": 2.2998409331919406e-05, "loss": 0.825, "step": 5910 }, { "epoch": 0.7744636316064888, "grad_norm": 0.6307033662226441, "learning_rate": 2.286585365853659e-05, "loss": 0.559, "step": 5920 }, { "epoch": 0.7757718472004186, "grad_norm": 0.5923006697127559, "learning_rate": 2.2733297985153765e-05, "loss": 0.7833, "step": 5930 }, { "epoch": 0.7770800627943485, "grad_norm": 0.7423403377088905, "learning_rate": 2.2600742311770944e-05, "loss": 0.5916, "step": 5940 }, { "epoch": 0.7783882783882784, "grad_norm": 0.5275344149548817, "learning_rate": 2.2468186638388124e-05, "loss": 0.7857, "step": 5950 }, { "epoch": 0.7796964939822083, "grad_norm": 0.4485788605700081, "learning_rate": 2.2335630965005303e-05, "loss": 0.5938, "step": 5960 }, { "epoch": 0.7810047095761381, "grad_norm": 0.45581986333641783, "learning_rate": 2.2203075291622482e-05, "loss": 0.789, "step": 5970 }, { "epoch": 0.782312925170068, "grad_norm": 0.7077436921839176, "learning_rate": 2.2070519618239662e-05, "loss": 0.5639, "step": 5980 }, { "epoch": 0.7836211407639979, "grad_norm": 0.485078935511288, "learning_rate": 2.193796394485684e-05, "loss": 0.7974, "step": 5990 }, { "epoch": 0.7849293563579278, "grad_norm": 0.5415931380682013, "learning_rate": 2.180540827147402e-05, "loss": 0.5513, "step": 6000 }, { "epoch": 0.7862375719518576, "grad_norm": 0.6028769503669652, "learning_rate": 2.16728525980912e-05, "loss": 0.7859, "step": 6010 }, { "epoch": 0.7875457875457875, "grad_norm": 0.43775607621380236, "learning_rate": 2.154029692470838e-05, "loss": 0.586, "step": 6020 }, { "epoch": 0.7888540031397174, "grad_norm": 0.5129027729272807, "learning_rate": 2.140774125132556e-05, "loss": 0.8006, "step": 6030 }, { "epoch": 0.7901622187336473, "grad_norm": 0.7986746080445549, "learning_rate": 2.1275185577942735e-05, "loss": 0.6124, "step": 6040 }, { "epoch": 0.7914704343275771, "grad_norm": 0.5495341720621897, "learning_rate": 2.1142629904559917e-05, "loss": 0.8162, "step": 6050 }, { "epoch": 0.792778649921507, "grad_norm": 0.6565347700150155, "learning_rate": 2.1010074231177097e-05, "loss": 0.5797, "step": 6060 }, { "epoch": 0.794086865515437, "grad_norm": 0.49231844511035533, "learning_rate": 2.0877518557794273e-05, "loss": 0.815, "step": 6070 }, { "epoch": 0.7953950811093669, "grad_norm": 0.487405981388951, "learning_rate": 2.0744962884411452e-05, "loss": 0.6098, "step": 6080 }, { "epoch": 0.7967032967032966, "grad_norm": 0.5688986137956366, "learning_rate": 2.0612407211028635e-05, "loss": 0.8328, "step": 6090 }, { "epoch": 0.7980115122972266, "grad_norm": 0.553490629602128, "learning_rate": 2.047985153764581e-05, "loss": 0.5689, "step": 6100 }, { "epoch": 0.7993197278911565, "grad_norm": 0.5247897399616922, "learning_rate": 2.034729586426299e-05, "loss": 0.809, "step": 6110 }, { "epoch": 0.8006279434850864, "grad_norm": 0.6208953939424018, "learning_rate": 2.021474019088017e-05, "loss": 0.5615, "step": 6120 }, { "epoch": 0.8019361590790163, "grad_norm": 0.5020903361827662, "learning_rate": 2.008218451749735e-05, "loss": 0.8156, "step": 6130 }, { "epoch": 0.8032443746729461, "grad_norm": 0.5291771623927373, "learning_rate": 1.994962884411453e-05, "loss": 0.5884, "step": 6140 }, { "epoch": 0.804552590266876, "grad_norm": 0.5285998915355191, "learning_rate": 1.9817073170731708e-05, "loss": 0.8051, "step": 6150 }, { "epoch": 0.8058608058608059, "grad_norm": 0.8178935383618982, "learning_rate": 1.9684517497348887e-05, "loss": 0.6138, "step": 6160 }, { "epoch": 0.8071690214547358, "grad_norm": 0.4987221022793129, "learning_rate": 1.9551961823966067e-05, "loss": 0.8294, "step": 6170 }, { "epoch": 0.8084772370486656, "grad_norm": 0.6163654350885334, "learning_rate": 1.9419406150583246e-05, "loss": 0.5636, "step": 6180 }, { "epoch": 0.8097854526425955, "grad_norm": 0.555469302539477, "learning_rate": 1.9286850477200426e-05, "loss": 0.8178, "step": 6190 }, { "epoch": 0.8110936682365254, "grad_norm": 0.6123412415543169, "learning_rate": 1.9154294803817605e-05, "loss": 0.5704, "step": 6200 }, { "epoch": 0.8124018838304553, "grad_norm": 0.5124297425086758, "learning_rate": 1.9021739130434784e-05, "loss": 0.8193, "step": 6210 }, { "epoch": 0.8137100994243851, "grad_norm": 0.8183903363373162, "learning_rate": 1.8889183457051964e-05, "loss": 0.6082, "step": 6220 }, { "epoch": 0.815018315018315, "grad_norm": 0.5990002289487968, "learning_rate": 1.8756627783669143e-05, "loss": 0.8463, "step": 6230 }, { "epoch": 0.8163265306122449, "grad_norm": 0.5158204377847465, "learning_rate": 1.8624072110286323e-05, "loss": 0.5905, "step": 6240 }, { "epoch": 0.8176347462061748, "grad_norm": 0.5071733107919829, "learning_rate": 1.84915164369035e-05, "loss": 0.8052, "step": 6250 }, { "epoch": 0.8189429618001046, "grad_norm": 0.7067873685301963, "learning_rate": 1.835896076352068e-05, "loss": 0.6175, "step": 6260 }, { "epoch": 0.8202511773940345, "grad_norm": 0.49736879675215423, "learning_rate": 1.8226405090137857e-05, "loss": 0.8187, "step": 6270 }, { "epoch": 0.8215593929879644, "grad_norm": 0.6169326476344377, "learning_rate": 1.8093849416755037e-05, "loss": 0.5969, "step": 6280 }, { "epoch": 0.8228676085818943, "grad_norm": 0.5358771287992227, "learning_rate": 1.796129374337222e-05, "loss": 0.8084, "step": 6290 }, { "epoch": 0.8241758241758241, "grad_norm": 0.6155654512041941, "learning_rate": 1.7828738069989395e-05, "loss": 0.5751, "step": 6300 }, { "epoch": 0.825484039769754, "grad_norm": 0.5694555500789946, "learning_rate": 1.7696182396606575e-05, "loss": 0.8114, "step": 6310 }, { "epoch": 0.826792255363684, "grad_norm": 0.5299599349376566, "learning_rate": 1.7563626723223754e-05, "loss": 0.5815, "step": 6320 }, { "epoch": 0.8281004709576139, "grad_norm": 0.5002411114343089, "learning_rate": 1.7431071049840934e-05, "loss": 0.8116, "step": 6330 }, { "epoch": 0.8294086865515437, "grad_norm": 0.6844907765956427, "learning_rate": 1.7298515376458113e-05, "loss": 0.5998, "step": 6340 }, { "epoch": 0.8307169021454736, "grad_norm": 0.5784470770219757, "learning_rate": 1.7165959703075292e-05, "loss": 0.8211, "step": 6350 }, { "epoch": 0.8320251177394035, "grad_norm": 0.6402598629047223, "learning_rate": 1.7033404029692472e-05, "loss": 0.5686, "step": 6360 }, { "epoch": 0.8333333333333334, "grad_norm": 0.5486156319155983, "learning_rate": 1.690084835630965e-05, "loss": 0.819, "step": 6370 }, { "epoch": 0.8346415489272632, "grad_norm": 0.552758307022236, "learning_rate": 1.676829268292683e-05, "loss": 0.5744, "step": 6380 }, { "epoch": 0.8359497645211931, "grad_norm": 0.5129974867018121, "learning_rate": 1.663573700954401e-05, "loss": 0.802, "step": 6390 }, { "epoch": 0.837257980115123, "grad_norm": 0.6477174203830427, "learning_rate": 1.650318133616119e-05, "loss": 0.604, "step": 6400 }, { "epoch": 0.8385661957090529, "grad_norm": 0.5721470905692986, "learning_rate": 1.637062566277837e-05, "loss": 0.7867, "step": 6410 }, { "epoch": 0.8398744113029827, "grad_norm": 0.5613277299496854, "learning_rate": 1.6238069989395545e-05, "loss": 0.5995, "step": 6420 }, { "epoch": 0.8411826268969126, "grad_norm": 0.4873586998049457, "learning_rate": 1.6105514316012728e-05, "loss": 0.7987, "step": 6430 }, { "epoch": 0.8424908424908425, "grad_norm": 0.6328797394647059, "learning_rate": 1.5972958642629907e-05, "loss": 0.5875, "step": 6440 }, { "epoch": 0.8437990580847724, "grad_norm": 0.5065229266101028, "learning_rate": 1.5840402969247083e-05, "loss": 0.7832, "step": 6450 }, { "epoch": 0.8451072736787022, "grad_norm": 0.7363086761942835, "learning_rate": 1.5707847295864266e-05, "loss": 0.5797, "step": 6460 }, { "epoch": 0.8464154892726321, "grad_norm": 0.5910783809750648, "learning_rate": 1.5575291622481442e-05, "loss": 0.8484, "step": 6470 }, { "epoch": 0.847723704866562, "grad_norm": 0.46343031391913037, "learning_rate": 1.544273594909862e-05, "loss": 0.5783, "step": 6480 }, { "epoch": 0.8490319204604919, "grad_norm": 0.5168004042862498, "learning_rate": 1.53101802757158e-05, "loss": 0.7856, "step": 6490 }, { "epoch": 0.8503401360544217, "grad_norm": 0.7961735878117324, "learning_rate": 1.517762460233298e-05, "loss": 0.5835, "step": 6500 }, { "epoch": 0.8516483516483516, "grad_norm": 0.4869428795315839, "learning_rate": 1.504506892895016e-05, "loss": 0.7959, "step": 6510 }, { "epoch": 0.8529565672422815, "grad_norm": 0.6051259113895093, "learning_rate": 1.491251325556734e-05, "loss": 0.5812, "step": 6520 }, { "epoch": 0.8542647828362114, "grad_norm": 0.48181809369845346, "learning_rate": 1.4779957582184516e-05, "loss": 0.7691, "step": 6530 }, { "epoch": 0.8555729984301413, "grad_norm": 0.6165360359930039, "learning_rate": 1.4647401908801697e-05, "loss": 0.5906, "step": 6540 }, { "epoch": 0.8568812140240711, "grad_norm": 0.49455764845266237, "learning_rate": 1.4514846235418877e-05, "loss": 0.7854, "step": 6550 }, { "epoch": 0.858189429618001, "grad_norm": 0.5284557358740105, "learning_rate": 1.4382290562036055e-05, "loss": 0.588, "step": 6560 }, { "epoch": 0.859497645211931, "grad_norm": 0.5292637656987533, "learning_rate": 1.4249734888653236e-05, "loss": 0.8016, "step": 6570 }, { "epoch": 0.8608058608058609, "grad_norm": 0.8513431344553715, "learning_rate": 1.4117179215270415e-05, "loss": 0.6167, "step": 6580 }, { "epoch": 0.8621140763997907, "grad_norm": 0.579288662692426, "learning_rate": 1.3984623541887593e-05, "loss": 0.8134, "step": 6590 }, { "epoch": 0.8634222919937206, "grad_norm": 0.6785159865651157, "learning_rate": 1.3852067868504772e-05, "loss": 0.5753, "step": 6600 }, { "epoch": 0.8647305075876505, "grad_norm": 0.5143467597796619, "learning_rate": 1.3719512195121953e-05, "loss": 0.8117, "step": 6610 }, { "epoch": 0.8660387231815804, "grad_norm": 0.5958544884291407, "learning_rate": 1.3586956521739131e-05, "loss": 0.5694, "step": 6620 }, { "epoch": 0.8673469387755102, "grad_norm": 0.6204168418491064, "learning_rate": 1.345440084835631e-05, "loss": 0.7969, "step": 6630 }, { "epoch": 0.8686551543694401, "grad_norm": 0.5741612534922768, "learning_rate": 1.3321845174973491e-05, "loss": 0.5918, "step": 6640 }, { "epoch": 0.86996336996337, "grad_norm": 0.5784688853516818, "learning_rate": 1.3189289501590667e-05, "loss": 0.8035, "step": 6650 }, { "epoch": 0.8712715855572999, "grad_norm": 0.5603176409725421, "learning_rate": 1.3056733828207849e-05, "loss": 0.5805, "step": 6660 }, { "epoch": 0.8725798011512297, "grad_norm": 0.5498601994735227, "learning_rate": 1.2924178154825028e-05, "loss": 0.7983, "step": 6670 }, { "epoch": 0.8738880167451596, "grad_norm": 0.46586893720865835, "learning_rate": 1.2791622481442206e-05, "loss": 0.5847, "step": 6680 }, { "epoch": 0.8751962323390895, "grad_norm": 0.4854338950174297, "learning_rate": 1.2659066808059387e-05, "loss": 0.7692, "step": 6690 }, { "epoch": 0.8765044479330194, "grad_norm": 0.5304541776380965, "learning_rate": 1.2526511134676563e-05, "loss": 0.5896, "step": 6700 }, { "epoch": 0.8778126635269492, "grad_norm": 0.5909426804833323, "learning_rate": 1.2393955461293744e-05, "loss": 0.7792, "step": 6710 }, { "epoch": 0.8791208791208791, "grad_norm": 0.4671763867260705, "learning_rate": 1.2261399787910923e-05, "loss": 0.5638, "step": 6720 }, { "epoch": 0.880429094714809, "grad_norm": 0.5359237947603994, "learning_rate": 1.2128844114528103e-05, "loss": 0.8022, "step": 6730 }, { "epoch": 0.8817373103087389, "grad_norm": 0.6960066070363433, "learning_rate": 1.1996288441145282e-05, "loss": 0.5957, "step": 6740 }, { "epoch": 0.8830455259026687, "grad_norm": 0.4855729622009359, "learning_rate": 1.186373276776246e-05, "loss": 0.8295, "step": 6750 }, { "epoch": 0.8843537414965986, "grad_norm": 0.9533835028532404, "learning_rate": 1.173117709437964e-05, "loss": 0.5855, "step": 6760 }, { "epoch": 0.8856619570905285, "grad_norm": 0.5386665177217399, "learning_rate": 1.1598621420996818e-05, "loss": 0.7768, "step": 6770 }, { "epoch": 0.8869701726844584, "grad_norm": 0.5851585267921338, "learning_rate": 1.1466065747613998e-05, "loss": 0.5735, "step": 6780 }, { "epoch": 0.8882783882783882, "grad_norm": 0.6320477723686851, "learning_rate": 1.1333510074231179e-05, "loss": 0.8027, "step": 6790 }, { "epoch": 0.8895866038723181, "grad_norm": 0.6634243839335632, "learning_rate": 1.1200954400848357e-05, "loss": 0.5573, "step": 6800 }, { "epoch": 0.890894819466248, "grad_norm": 0.5535217711779135, "learning_rate": 1.1068398727465536e-05, "loss": 0.7966, "step": 6810 }, { "epoch": 0.892203035060178, "grad_norm": 0.7832734038273745, "learning_rate": 1.0935843054082715e-05, "loss": 0.6074, "step": 6820 }, { "epoch": 0.8935112506541077, "grad_norm": 0.520725853685351, "learning_rate": 1.0803287380699895e-05, "loss": 0.7948, "step": 6830 }, { "epoch": 0.8948194662480377, "grad_norm": 0.5140360510057409, "learning_rate": 1.0670731707317074e-05, "loss": 0.5672, "step": 6840 }, { "epoch": 0.8961276818419676, "grad_norm": 0.5480237760678021, "learning_rate": 1.0538176033934252e-05, "loss": 0.8193, "step": 6850 }, { "epoch": 0.8974358974358975, "grad_norm": 0.5379154678692974, "learning_rate": 1.0405620360551433e-05, "loss": 0.5615, "step": 6860 }, { "epoch": 0.8987441130298273, "grad_norm": 0.5748341162685962, "learning_rate": 1.027306468716861e-05, "loss": 0.7761, "step": 6870 }, { "epoch": 0.9000523286237572, "grad_norm": 0.354346750572681, "learning_rate": 1.014050901378579e-05, "loss": 0.5742, "step": 6880 }, { "epoch": 0.9013605442176871, "grad_norm": 0.5556016455622972, "learning_rate": 1.000795334040297e-05, "loss": 0.8124, "step": 6890 }, { "epoch": 0.902668759811617, "grad_norm": 0.616134681196408, "learning_rate": 9.875397667020149e-06, "loss": 0.5799, "step": 6900 }, { "epoch": 0.9039769754055468, "grad_norm": 0.5351793977307407, "learning_rate": 9.742841993637328e-06, "loss": 0.8111, "step": 6910 }, { "epoch": 0.9052851909994767, "grad_norm": 0.4796711570776462, "learning_rate": 9.610286320254508e-06, "loss": 0.5897, "step": 6920 }, { "epoch": 0.9065934065934066, "grad_norm": 0.5333607429667859, "learning_rate": 9.477730646871687e-06, "loss": 0.7895, "step": 6930 }, { "epoch": 0.9079016221873365, "grad_norm": 0.7210385096597725, "learning_rate": 9.345174973488865e-06, "loss": 0.5814, "step": 6940 }, { "epoch": 0.9092098377812664, "grad_norm": 0.4921959597022122, "learning_rate": 9.212619300106044e-06, "loss": 0.7722, "step": 6950 }, { "epoch": 0.9105180533751962, "grad_norm": 0.582597864482659, "learning_rate": 9.080063626723225e-06, "loss": 0.5718, "step": 6960 }, { "epoch": 0.9118262689691261, "grad_norm": 0.5378533355374352, "learning_rate": 8.947507953340403e-06, "loss": 0.7841, "step": 6970 }, { "epoch": 0.913134484563056, "grad_norm": 0.6935491033828649, "learning_rate": 8.814952279957582e-06, "loss": 0.5882, "step": 6980 }, { "epoch": 0.9144427001569859, "grad_norm": 0.5318156050528525, "learning_rate": 8.682396606574762e-06, "loss": 0.7961, "step": 6990 }, { "epoch": 0.9157509157509157, "grad_norm": 0.6218540662399403, "learning_rate": 8.549840933191941e-06, "loss": 0.5646, "step": 7000 }, { "epoch": 0.9170591313448456, "grad_norm": 0.5580021318568493, "learning_rate": 8.41728525980912e-06, "loss": 0.8223, "step": 7010 }, { "epoch": 0.9183673469387755, "grad_norm": 0.7372202219508209, "learning_rate": 8.2847295864263e-06, "loss": 0.5732, "step": 7020 }, { "epoch": 0.9196755625327054, "grad_norm": 0.5531047720727633, "learning_rate": 8.15217391304348e-06, "loss": 0.8233, "step": 7030 }, { "epoch": 0.9209837781266352, "grad_norm": 0.9657658989896405, "learning_rate": 8.019618239660657e-06, "loss": 0.5559, "step": 7040 }, { "epoch": 0.9222919937205651, "grad_norm": 0.5772618182869884, "learning_rate": 7.887062566277838e-06, "loss": 0.7868, "step": 7050 }, { "epoch": 0.923600209314495, "grad_norm": 0.4581262580386774, "learning_rate": 7.754506892895016e-06, "loss": 0.5821, "step": 7060 }, { "epoch": 0.924908424908425, "grad_norm": 0.6112521566283506, "learning_rate": 7.621951219512195e-06, "loss": 0.7813, "step": 7070 }, { "epoch": 0.9262166405023547, "grad_norm": 0.7318140554967631, "learning_rate": 7.4893955461293745e-06, "loss": 0.5652, "step": 7080 }, { "epoch": 0.9275248560962847, "grad_norm": 0.55917265698163, "learning_rate": 7.356839872746554e-06, "loss": 0.8161, "step": 7090 }, { "epoch": 0.9288330716902146, "grad_norm": 0.7037568341200027, "learning_rate": 7.224284199363733e-06, "loss": 0.586, "step": 7100 }, { "epoch": 0.9301412872841445, "grad_norm": 0.5346184273438803, "learning_rate": 7.091728525980912e-06, "loss": 0.7951, "step": 7110 }, { "epoch": 0.9314495028780743, "grad_norm": 0.4150830640122542, "learning_rate": 6.959172852598092e-06, "loss": 0.5949, "step": 7120 }, { "epoch": 0.9327577184720042, "grad_norm": 0.5114818023098201, "learning_rate": 6.826617179215271e-06, "loss": 0.7904, "step": 7130 }, { "epoch": 0.9340659340659341, "grad_norm": 0.5908812058211556, "learning_rate": 6.69406150583245e-06, "loss": 0.5674, "step": 7140 }, { "epoch": 0.935374149659864, "grad_norm": 0.5343311980537179, "learning_rate": 6.561505832449629e-06, "loss": 0.7942, "step": 7150 }, { "epoch": 0.9366823652537938, "grad_norm": 0.5903444129390092, "learning_rate": 6.428950159066809e-06, "loss": 0.5679, "step": 7160 }, { "epoch": 0.9379905808477237, "grad_norm": 0.5453396700244478, "learning_rate": 6.296394485683987e-06, "loss": 0.8118, "step": 7170 }, { "epoch": 0.9392987964416536, "grad_norm": 0.48350350014595894, "learning_rate": 6.163838812301167e-06, "loss": 0.5974, "step": 7180 }, { "epoch": 0.9406070120355835, "grad_norm": 0.5664756102574485, "learning_rate": 6.031283138918345e-06, "loss": 0.7948, "step": 7190 }, { "epoch": 0.9419152276295133, "grad_norm": 0.5142501404418801, "learning_rate": 5.8987274655355255e-06, "loss": 0.606, "step": 7200 }, { "epoch": 0.9432234432234432, "grad_norm": 0.54569469097517, "learning_rate": 5.766171792152705e-06, "loss": 0.8303, "step": 7210 }, { "epoch": 0.9445316588173731, "grad_norm": 0.7008918499549309, "learning_rate": 5.6336161187698835e-06, "loss": 0.5698, "step": 7220 }, { "epoch": 0.945839874411303, "grad_norm": 0.4720187827903435, "learning_rate": 5.501060445387063e-06, "loss": 0.813, "step": 7230 }, { "epoch": 0.9471480900052328, "grad_norm": 0.6243694714554565, "learning_rate": 5.368504772004242e-06, "loss": 0.5858, "step": 7240 }, { "epoch": 0.9484563055991627, "grad_norm": 0.6260033107066734, "learning_rate": 5.235949098621421e-06, "loss": 0.8116, "step": 7250 }, { "epoch": 0.9497645211930926, "grad_norm": 0.6435112573826539, "learning_rate": 5.103393425238601e-06, "loss": 0.5544, "step": 7260 }, { "epoch": 0.9510727367870225, "grad_norm": 0.47784017687891694, "learning_rate": 4.9708377518557796e-06, "loss": 0.7778, "step": 7270 }, { "epoch": 0.9523809523809523, "grad_norm": 0.5688080296868022, "learning_rate": 4.838282078472959e-06, "loss": 0.6016, "step": 7280 }, { "epoch": 0.9536891679748822, "grad_norm": 0.5213924382658889, "learning_rate": 4.705726405090138e-06, "loss": 0.7974, "step": 7290 }, { "epoch": 0.9549973835688121, "grad_norm": 0.45047173757415426, "learning_rate": 4.573170731707317e-06, "loss": 0.5249, "step": 7300 }, { "epoch": 0.956305599162742, "grad_norm": 0.5437903519326854, "learning_rate": 4.440615058324496e-06, "loss": 0.8166, "step": 7310 }, { "epoch": 0.957613814756672, "grad_norm": 0.48264576838040973, "learning_rate": 4.308059384941676e-06, "loss": 0.5814, "step": 7320 }, { "epoch": 0.9589220303506018, "grad_norm": 0.545526763976128, "learning_rate": 4.175503711558855e-06, "loss": 0.8006, "step": 7330 }, { "epoch": 0.9602302459445317, "grad_norm": 0.38539225021424495, "learning_rate": 4.0429480381760345e-06, "loss": 0.5745, "step": 7340 }, { "epoch": 0.9615384615384616, "grad_norm": 0.5798773104196537, "learning_rate": 3.910392364793213e-06, "loss": 0.8073, "step": 7350 }, { "epoch": 0.9628466771323915, "grad_norm": 0.6878789040909351, "learning_rate": 3.777836691410393e-06, "loss": 0.5763, "step": 7360 }, { "epoch": 0.9641548927263213, "grad_norm": 0.5289835887969982, "learning_rate": 3.6452810180275714e-06, "loss": 0.7766, "step": 7370 }, { "epoch": 0.9654631083202512, "grad_norm": 0.40667843587961816, "learning_rate": 3.5127253446447508e-06, "loss": 0.558, "step": 7380 }, { "epoch": 0.9667713239141811, "grad_norm": 0.5840192236729715, "learning_rate": 3.3801696712619306e-06, "loss": 0.8129, "step": 7390 }, { "epoch": 0.968079539508111, "grad_norm": 0.3178170420497473, "learning_rate": 3.247613997879109e-06, "loss": 0.5787, "step": 7400 }, { "epoch": 0.9693877551020408, "grad_norm": 0.54996002509364, "learning_rate": 3.115058324496289e-06, "loss": 0.8326, "step": 7410 }, { "epoch": 0.9706959706959707, "grad_norm": 0.5534777218559572, "learning_rate": 2.982502651113468e-06, "loss": 0.5821, "step": 7420 }, { "epoch": 0.9720041862899006, "grad_norm": 0.5230629562997223, "learning_rate": 2.849946977730647e-06, "loss": 0.7779, "step": 7430 }, { "epoch": 0.9733124018838305, "grad_norm": 0.875422492824055, "learning_rate": 2.7173913043478263e-06, "loss": 0.592, "step": 7440 }, { "epoch": 0.9746206174777603, "grad_norm": 0.5569915281293889, "learning_rate": 2.5848356309650052e-06, "loss": 0.8067, "step": 7450 }, { "epoch": 0.9759288330716902, "grad_norm": 0.6183408664255449, "learning_rate": 2.4522799575821846e-06, "loss": 0.5875, "step": 7460 }, { "epoch": 0.9772370486656201, "grad_norm": 0.49698428419163243, "learning_rate": 2.319724284199364e-06, "loss": 0.8197, "step": 7470 }, { "epoch": 0.97854526425955, "grad_norm": 0.6395403482510305, "learning_rate": 2.187168610816543e-06, "loss": 0.5956, "step": 7480 }, { "epoch": 0.9798534798534798, "grad_norm": 0.5141508620993104, "learning_rate": 2.054612937433722e-06, "loss": 0.8213, "step": 7490 }, { "epoch": 0.9811616954474097, "grad_norm": 0.7297722231605804, "learning_rate": 1.9220572640509014e-06, "loss": 0.5724, "step": 7500 }, { "epoch": 0.9824699110413396, "grad_norm": 0.6139671735977023, "learning_rate": 1.7895015906680807e-06, "loss": 0.8038, "step": 7510 }, { "epoch": 0.9837781266352695, "grad_norm": 0.6348684038508452, "learning_rate": 1.65694591728526e-06, "loss": 0.5891, "step": 7520 }, { "epoch": 0.9850863422291993, "grad_norm": 0.5570560567015977, "learning_rate": 1.5243902439024391e-06, "loss": 0.8008, "step": 7530 }, { "epoch": 0.9863945578231292, "grad_norm": 0.5421909673109165, "learning_rate": 1.3918345705196183e-06, "loss": 0.5792, "step": 7540 }, { "epoch": 0.9877027734170591, "grad_norm": 0.5728614761702414, "learning_rate": 1.2592788971367975e-06, "loss": 0.7941, "step": 7550 }, { "epoch": 0.989010989010989, "grad_norm": 0.6317238615567622, "learning_rate": 1.1267232237539766e-06, "loss": 0.6009, "step": 7560 }, { "epoch": 0.9903192046049188, "grad_norm": 0.5160377073279534, "learning_rate": 9.94167550371156e-07, "loss": 0.7883, "step": 7570 }, { "epoch": 0.9916274201988488, "grad_norm": 0.690634120523154, "learning_rate": 8.616118769883351e-07, "loss": 0.5766, "step": 7580 }, { "epoch": 0.9929356357927787, "grad_norm": 0.5284248764034778, "learning_rate": 7.290562036055143e-07, "loss": 0.8011, "step": 7590 }, { "epoch": 0.9942438513867086, "grad_norm": 0.5999593891013711, "learning_rate": 5.965005302226936e-07, "loss": 0.5956, "step": 7600 }, { "epoch": 0.9955520669806384, "grad_norm": 0.48546714136377134, "learning_rate": 4.6394485683987276e-07, "loss": 0.7916, "step": 7610 }, { "epoch": 0.9968602825745683, "grad_norm": 0.3062976732646243, "learning_rate": 3.31389183457052e-07, "loss": 0.5679, "step": 7620 }, { "epoch": 0.9981684981684982, "grad_norm": 0.5269723896971663, "learning_rate": 1.9883351007423118e-07, "loss": 0.8292, "step": 7630 }, { "epoch": 0.9994767137624281, "grad_norm": 0.49183779024992025, "learning_rate": 6.62778366914104e-08, "loss": 0.5841, "step": 7640 } ], "logging_steps": 10, "max_steps": 7644, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }