PEFT
Safetensors
deu05232's picture
Upload folder using huggingface_hub
d9dcc38 verified
raw
history blame
134 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 7644,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013082155939298796,
"grad_norm": 7.290712591801908,
"learning_rate": 4.9999999999999996e-05,
"loss": 5.7851,
"step": 10
},
{
"epoch": 0.0026164311878597592,
"grad_norm": 3.67299584724583,
"learning_rate": 6.505149978319905e-05,
"loss": 3.1259,
"step": 20
},
{
"epoch": 0.003924646781789639,
"grad_norm": 2.4764513799723513,
"learning_rate": 7.385606273598311e-05,
"loss": 1.6786,
"step": 30
},
{
"epoch": 0.0052328623757195184,
"grad_norm": 1.7953600265789333,
"learning_rate": 8.01029995663981e-05,
"loss": 1.2211,
"step": 40
},
{
"epoch": 0.006541077969649398,
"grad_norm": 1.8771167917018612,
"learning_rate": 8.494850021680092e-05,
"loss": 1.2585,
"step": 50
},
{
"epoch": 0.007849293563579277,
"grad_norm": 1.2473543190033471,
"learning_rate": 8.890756251918216e-05,
"loss": 0.9283,
"step": 60
},
{
"epoch": 0.009157509157509158,
"grad_norm": 2.618558273245568,
"learning_rate": 9.225490200071284e-05,
"loss": 1.1733,
"step": 70
},
{
"epoch": 0.010465724751439037,
"grad_norm": 1.2184582707007738,
"learning_rate": 9.515449934959716e-05,
"loss": 0.9113,
"step": 80
},
{
"epoch": 0.011773940345368918,
"grad_norm": 1.1266167128317874,
"learning_rate": 9.771212547196623e-05,
"loss": 1.0926,
"step": 90
},
{
"epoch": 0.013082155939298797,
"grad_norm": 1.2759202461423789,
"learning_rate": 9.999999999999999e-05,
"loss": 0.9088,
"step": 100
},
{
"epoch": 0.014390371533228676,
"grad_norm": 1.1320718200823117,
"learning_rate": 9.988069989395547e-05,
"loss": 1.0652,
"step": 110
},
{
"epoch": 0.015698587127158554,
"grad_norm": 1.253166703044972,
"learning_rate": 9.974814422057265e-05,
"loss": 0.8037,
"step": 120
},
{
"epoch": 0.017006802721088437,
"grad_norm": 0.8736521465598114,
"learning_rate": 9.961558854718983e-05,
"loss": 1.0371,
"step": 130
},
{
"epoch": 0.018315018315018316,
"grad_norm": 0.9331897722352183,
"learning_rate": 9.9483032873807e-05,
"loss": 0.8399,
"step": 140
},
{
"epoch": 0.019623233908948195,
"grad_norm": 1.16561958480625,
"learning_rate": 9.935047720042418e-05,
"loss": 1.0586,
"step": 150
},
{
"epoch": 0.020931449502878074,
"grad_norm": 0.8921867195796972,
"learning_rate": 9.921792152704136e-05,
"loss": 0.7718,
"step": 160
},
{
"epoch": 0.022239665096807953,
"grad_norm": 0.7140587209081476,
"learning_rate": 9.908536585365854e-05,
"loss": 1.009,
"step": 170
},
{
"epoch": 0.023547880690737835,
"grad_norm": 0.8309946634200358,
"learning_rate": 9.895281018027573e-05,
"loss": 0.7839,
"step": 180
},
{
"epoch": 0.024856096284667714,
"grad_norm": 0.7342060793171521,
"learning_rate": 9.88202545068929e-05,
"loss": 0.9935,
"step": 190
},
{
"epoch": 0.026164311878597593,
"grad_norm": 0.9847538624284081,
"learning_rate": 9.868769883351007e-05,
"loss": 0.7705,
"step": 200
},
{
"epoch": 0.027472527472527472,
"grad_norm": 0.8944456756402039,
"learning_rate": 9.855514316012726e-05,
"loss": 1.0197,
"step": 210
},
{
"epoch": 0.02878074306645735,
"grad_norm": 0.7463212583397272,
"learning_rate": 9.842258748674443e-05,
"loss": 0.7562,
"step": 220
},
{
"epoch": 0.030088958660387233,
"grad_norm": 0.7842188750961813,
"learning_rate": 9.829003181336162e-05,
"loss": 0.9935,
"step": 230
},
{
"epoch": 0.03139717425431711,
"grad_norm": 0.8675590356137016,
"learning_rate": 9.815747613997879e-05,
"loss": 0.7518,
"step": 240
},
{
"epoch": 0.03270538984824699,
"grad_norm": 0.5923018779242124,
"learning_rate": 9.802492046659596e-05,
"loss": 0.9582,
"step": 250
},
{
"epoch": 0.034013605442176874,
"grad_norm": 0.6928464778679819,
"learning_rate": 9.789236479321315e-05,
"loss": 0.747,
"step": 260
},
{
"epoch": 0.03532182103610675,
"grad_norm": 0.7113730083361403,
"learning_rate": 9.775980911983034e-05,
"loss": 0.9576,
"step": 270
},
{
"epoch": 0.03663003663003663,
"grad_norm": 0.8679078003827115,
"learning_rate": 9.762725344644751e-05,
"loss": 0.7286,
"step": 280
},
{
"epoch": 0.03793825222396651,
"grad_norm": 0.600907373361144,
"learning_rate": 9.74946977730647e-05,
"loss": 0.9778,
"step": 290
},
{
"epoch": 0.03924646781789639,
"grad_norm": 0.8848902266623525,
"learning_rate": 9.736214209968187e-05,
"loss": 0.7405,
"step": 300
},
{
"epoch": 0.04055468341182627,
"grad_norm": 0.9589691469495706,
"learning_rate": 9.722958642629904e-05,
"loss": 0.9579,
"step": 310
},
{
"epoch": 0.04186289900575615,
"grad_norm": 0.7099733591624596,
"learning_rate": 9.709703075291623e-05,
"loss": 0.7529,
"step": 320
},
{
"epoch": 0.04317111459968603,
"grad_norm": 0.6510293809764843,
"learning_rate": 9.696447507953341e-05,
"loss": 0.979,
"step": 330
},
{
"epoch": 0.044479330193615906,
"grad_norm": 1.1641190214279045,
"learning_rate": 9.683191940615059e-05,
"loss": 0.7132,
"step": 340
},
{
"epoch": 0.045787545787545784,
"grad_norm": 0.9673000795752291,
"learning_rate": 9.669936373276777e-05,
"loss": 0.9253,
"step": 350
},
{
"epoch": 0.04709576138147567,
"grad_norm": 0.7190353586884389,
"learning_rate": 9.656680805938494e-05,
"loss": 0.726,
"step": 360
},
{
"epoch": 0.04840397697540555,
"grad_norm": 0.7201601219088483,
"learning_rate": 9.643425238600212e-05,
"loss": 0.9286,
"step": 370
},
{
"epoch": 0.04971219256933543,
"grad_norm": 0.884374342504821,
"learning_rate": 9.63016967126193e-05,
"loss": 0.7203,
"step": 380
},
{
"epoch": 0.05102040816326531,
"grad_norm": 0.7049160941807705,
"learning_rate": 9.616914103923649e-05,
"loss": 0.9509,
"step": 390
},
{
"epoch": 0.052328623757195186,
"grad_norm": 0.9056185856880267,
"learning_rate": 9.603658536585366e-05,
"loss": 0.7115,
"step": 400
},
{
"epoch": 0.053636839351125065,
"grad_norm": 0.6053280737552994,
"learning_rate": 9.590402969247085e-05,
"loss": 0.949,
"step": 410
},
{
"epoch": 0.054945054945054944,
"grad_norm": 0.669203867860999,
"learning_rate": 9.577147401908802e-05,
"loss": 0.7155,
"step": 420
},
{
"epoch": 0.05625327053898482,
"grad_norm": 0.7049507997603442,
"learning_rate": 9.56389183457052e-05,
"loss": 0.9633,
"step": 430
},
{
"epoch": 0.0575614861329147,
"grad_norm": 0.72413054526148,
"learning_rate": 9.550636267232238e-05,
"loss": 0.6704,
"step": 440
},
{
"epoch": 0.05886970172684458,
"grad_norm": 0.9434997463945898,
"learning_rate": 9.537380699893957e-05,
"loss": 0.9329,
"step": 450
},
{
"epoch": 0.06017791732077447,
"grad_norm": 0.9425009249090048,
"learning_rate": 9.524125132555674e-05,
"loss": 0.6726,
"step": 460
},
{
"epoch": 0.061486132914704346,
"grad_norm": 0.529629216676613,
"learning_rate": 9.510869565217391e-05,
"loss": 0.9172,
"step": 470
},
{
"epoch": 0.06279434850863422,
"grad_norm": 0.694713736738518,
"learning_rate": 9.49761399787911e-05,
"loss": 0.7293,
"step": 480
},
{
"epoch": 0.0641025641025641,
"grad_norm": 0.6230312130051202,
"learning_rate": 9.484358430540827e-05,
"loss": 0.903,
"step": 490
},
{
"epoch": 0.06541077969649398,
"grad_norm": 0.6370052650163481,
"learning_rate": 9.471102863202546e-05,
"loss": 0.7062,
"step": 500
},
{
"epoch": 0.06671899529042387,
"grad_norm": 0.7394610544442655,
"learning_rate": 9.457847295864264e-05,
"loss": 0.9301,
"step": 510
},
{
"epoch": 0.06802721088435375,
"grad_norm": 0.5284256055090087,
"learning_rate": 9.444591728525982e-05,
"loss": 0.6504,
"step": 520
},
{
"epoch": 0.06933542647828363,
"grad_norm": 0.8769802465059457,
"learning_rate": 9.431336161187699e-05,
"loss": 0.9392,
"step": 530
},
{
"epoch": 0.0706436420722135,
"grad_norm": 0.7788506408432221,
"learning_rate": 9.418080593849417e-05,
"loss": 0.7188,
"step": 540
},
{
"epoch": 0.07195185766614338,
"grad_norm": 0.6258384386257697,
"learning_rate": 9.404825026511135e-05,
"loss": 0.8991,
"step": 550
},
{
"epoch": 0.07326007326007326,
"grad_norm": 0.771128895833298,
"learning_rate": 9.391569459172853e-05,
"loss": 0.6951,
"step": 560
},
{
"epoch": 0.07456828885400314,
"grad_norm": 0.8070532606977017,
"learning_rate": 9.378313891834572e-05,
"loss": 0.9198,
"step": 570
},
{
"epoch": 0.07587650444793302,
"grad_norm": 0.7040977734805667,
"learning_rate": 9.365058324496289e-05,
"loss": 0.7285,
"step": 580
},
{
"epoch": 0.0771847200418629,
"grad_norm": 0.5877418285296147,
"learning_rate": 9.351802757158006e-05,
"loss": 0.8823,
"step": 590
},
{
"epoch": 0.07849293563579278,
"grad_norm": 0.43162998979605355,
"learning_rate": 9.338547189819725e-05,
"loss": 0.6631,
"step": 600
},
{
"epoch": 0.07980115122972266,
"grad_norm": 0.5655884401483549,
"learning_rate": 9.325291622481442e-05,
"loss": 0.9211,
"step": 610
},
{
"epoch": 0.08110936682365254,
"grad_norm": 0.6606829717694761,
"learning_rate": 9.312036055143161e-05,
"loss": 0.6835,
"step": 620
},
{
"epoch": 0.08241758241758242,
"grad_norm": 0.4866281049362381,
"learning_rate": 9.29878048780488e-05,
"loss": 0.8975,
"step": 630
},
{
"epoch": 0.0837257980115123,
"grad_norm": 1.1102292165370264,
"learning_rate": 9.285524920466597e-05,
"loss": 0.6914,
"step": 640
},
{
"epoch": 0.08503401360544217,
"grad_norm": 0.5006402275523141,
"learning_rate": 9.272269353128314e-05,
"loss": 0.9399,
"step": 650
},
{
"epoch": 0.08634222919937205,
"grad_norm": 0.7796348119914489,
"learning_rate": 9.259013785790033e-05,
"loss": 0.7019,
"step": 660
},
{
"epoch": 0.08765044479330193,
"grad_norm": 0.7168780685110666,
"learning_rate": 9.24575821845175e-05,
"loss": 0.8866,
"step": 670
},
{
"epoch": 0.08895866038723181,
"grad_norm": 0.8634136089366375,
"learning_rate": 9.232502651113469e-05,
"loss": 0.6761,
"step": 680
},
{
"epoch": 0.09026687598116169,
"grad_norm": 0.596915980244832,
"learning_rate": 9.219247083775187e-05,
"loss": 0.9005,
"step": 690
},
{
"epoch": 0.09157509157509157,
"grad_norm": 0.6529475469297664,
"learning_rate": 9.205991516436903e-05,
"loss": 0.6601,
"step": 700
},
{
"epoch": 0.09288330716902145,
"grad_norm": 0.5753441026258548,
"learning_rate": 9.192735949098622e-05,
"loss": 0.9205,
"step": 710
},
{
"epoch": 0.09419152276295134,
"grad_norm": 0.7907931238031155,
"learning_rate": 9.17948038176034e-05,
"loss": 0.6816,
"step": 720
},
{
"epoch": 0.09549973835688122,
"grad_norm": 0.5399094070955297,
"learning_rate": 9.166224814422058e-05,
"loss": 0.9374,
"step": 730
},
{
"epoch": 0.0968079539508111,
"grad_norm": 0.7177139918496634,
"learning_rate": 9.152969247083776e-05,
"loss": 0.6665,
"step": 740
},
{
"epoch": 0.09811616954474098,
"grad_norm": 0.5696746776689743,
"learning_rate": 9.139713679745493e-05,
"loss": 0.9129,
"step": 750
},
{
"epoch": 0.09942438513867086,
"grad_norm": 0.638093758359057,
"learning_rate": 9.126458112407211e-05,
"loss": 0.6675,
"step": 760
},
{
"epoch": 0.10073260073260074,
"grad_norm": 0.6020538420505003,
"learning_rate": 9.11320254506893e-05,
"loss": 0.8901,
"step": 770
},
{
"epoch": 0.10204081632653061,
"grad_norm": 0.6120466557768905,
"learning_rate": 9.099946977730648e-05,
"loss": 0.6796,
"step": 780
},
{
"epoch": 0.1033490319204605,
"grad_norm": 0.6210967554289628,
"learning_rate": 9.086691410392365e-05,
"loss": 0.8869,
"step": 790
},
{
"epoch": 0.10465724751439037,
"grad_norm": 0.8091885327796373,
"learning_rate": 9.073435843054084e-05,
"loss": 0.6625,
"step": 800
},
{
"epoch": 0.10596546310832025,
"grad_norm": 0.5779837898387246,
"learning_rate": 9.060180275715801e-05,
"loss": 0.8876,
"step": 810
},
{
"epoch": 0.10727367870225013,
"grad_norm": 0.7299991372030511,
"learning_rate": 9.046924708377518e-05,
"loss": 0.696,
"step": 820
},
{
"epoch": 0.10858189429618001,
"grad_norm": 0.6620337610636755,
"learning_rate": 9.033669141039237e-05,
"loss": 0.9008,
"step": 830
},
{
"epoch": 0.10989010989010989,
"grad_norm": 0.6010699007449223,
"learning_rate": 9.020413573700954e-05,
"loss": 0.6712,
"step": 840
},
{
"epoch": 0.11119832548403977,
"grad_norm": 0.5427558713650651,
"learning_rate": 9.007158006362673e-05,
"loss": 0.8753,
"step": 850
},
{
"epoch": 0.11250654107796965,
"grad_norm": 0.6742371579613256,
"learning_rate": 8.993902439024391e-05,
"loss": 0.6737,
"step": 860
},
{
"epoch": 0.11381475667189953,
"grad_norm": 0.7058859567453811,
"learning_rate": 8.980646871686109e-05,
"loss": 0.8956,
"step": 870
},
{
"epoch": 0.1151229722658294,
"grad_norm": 0.7684505962300139,
"learning_rate": 8.967391304347826e-05,
"loss": 0.6851,
"step": 880
},
{
"epoch": 0.11643118785975928,
"grad_norm": 0.5221308693647347,
"learning_rate": 8.954135737009545e-05,
"loss": 0.8774,
"step": 890
},
{
"epoch": 0.11773940345368916,
"grad_norm": 0.6747737190536728,
"learning_rate": 8.940880169671262e-05,
"loss": 0.6816,
"step": 900
},
{
"epoch": 0.11904761904761904,
"grad_norm": 0.46024093702145724,
"learning_rate": 8.92762460233298e-05,
"loss": 0.8785,
"step": 910
},
{
"epoch": 0.12035583464154893,
"grad_norm": 0.5572757908727249,
"learning_rate": 8.914369034994699e-05,
"loss": 0.6749,
"step": 920
},
{
"epoch": 0.12166405023547881,
"grad_norm": 0.5755409211612259,
"learning_rate": 8.901113467656415e-05,
"loss": 0.8996,
"step": 930
},
{
"epoch": 0.12297226582940869,
"grad_norm": 0.44743009853908355,
"learning_rate": 8.887857900318134e-05,
"loss": 0.6812,
"step": 940
},
{
"epoch": 0.12428048142333857,
"grad_norm": 0.5356391814605695,
"learning_rate": 8.874602332979852e-05,
"loss": 0.8962,
"step": 950
},
{
"epoch": 0.12558869701726844,
"grad_norm": 0.9142027438043182,
"learning_rate": 8.86134676564157e-05,
"loss": 0.6923,
"step": 960
},
{
"epoch": 0.12689691261119831,
"grad_norm": 0.5787045277844781,
"learning_rate": 8.848091198303288e-05,
"loss": 0.8958,
"step": 970
},
{
"epoch": 0.1282051282051282,
"grad_norm": 0.7928771640327954,
"learning_rate": 8.834835630965005e-05,
"loss": 0.6926,
"step": 980
},
{
"epoch": 0.12951334379905807,
"grad_norm": 0.6614005586526391,
"learning_rate": 8.821580063626723e-05,
"loss": 0.8844,
"step": 990
},
{
"epoch": 0.13082155939298795,
"grad_norm": 0.49917342117601304,
"learning_rate": 8.808324496288441e-05,
"loss": 0.6839,
"step": 1000
},
{
"epoch": 0.13212977498691783,
"grad_norm": 0.6870942412562521,
"learning_rate": 8.79506892895016e-05,
"loss": 0.937,
"step": 1010
},
{
"epoch": 0.13343799058084774,
"grad_norm": 0.48623360323222364,
"learning_rate": 8.781813361611877e-05,
"loss": 0.6424,
"step": 1020
},
{
"epoch": 0.13474620617477762,
"grad_norm": 0.522869801801631,
"learning_rate": 8.768557794273596e-05,
"loss": 0.9139,
"step": 1030
},
{
"epoch": 0.1360544217687075,
"grad_norm": 0.8763988251800717,
"learning_rate": 8.755302226935313e-05,
"loss": 0.6746,
"step": 1040
},
{
"epoch": 0.13736263736263737,
"grad_norm": 0.7101195150803217,
"learning_rate": 8.74204665959703e-05,
"loss": 0.8867,
"step": 1050
},
{
"epoch": 0.13867085295656725,
"grad_norm": 0.6262246747509773,
"learning_rate": 8.728791092258749e-05,
"loss": 0.6833,
"step": 1060
},
{
"epoch": 0.13997906855049713,
"grad_norm": 0.5315808280206341,
"learning_rate": 8.715535524920468e-05,
"loss": 0.9028,
"step": 1070
},
{
"epoch": 0.141287284144427,
"grad_norm": 0.5625387450672273,
"learning_rate": 8.702279957582185e-05,
"loss": 0.6623,
"step": 1080
},
{
"epoch": 0.1425954997383569,
"grad_norm": 0.6106698449027703,
"learning_rate": 8.689024390243903e-05,
"loss": 0.9054,
"step": 1090
},
{
"epoch": 0.14390371533228677,
"grad_norm": 0.785333814713217,
"learning_rate": 8.67576882290562e-05,
"loss": 0.6621,
"step": 1100
},
{
"epoch": 0.14521193092621665,
"grad_norm": 0.5341457759006656,
"learning_rate": 8.662513255567338e-05,
"loss": 0.8977,
"step": 1110
},
{
"epoch": 0.14652014652014653,
"grad_norm": 0.4836909763150667,
"learning_rate": 8.649257688229057e-05,
"loss": 0.643,
"step": 1120
},
{
"epoch": 0.1478283621140764,
"grad_norm": 0.6214661415425415,
"learning_rate": 8.636002120890775e-05,
"loss": 0.8619,
"step": 1130
},
{
"epoch": 0.14913657770800628,
"grad_norm": 0.44754299062781455,
"learning_rate": 8.622746553552492e-05,
"loss": 0.6545,
"step": 1140
},
{
"epoch": 0.15044479330193616,
"grad_norm": 0.654698416147961,
"learning_rate": 8.609490986214211e-05,
"loss": 0.8687,
"step": 1150
},
{
"epoch": 0.15175300889586604,
"grad_norm": 0.7796186456327326,
"learning_rate": 8.596235418875928e-05,
"loss": 0.6516,
"step": 1160
},
{
"epoch": 0.15306122448979592,
"grad_norm": 0.5619622188020412,
"learning_rate": 8.582979851537646e-05,
"loss": 0.9019,
"step": 1170
},
{
"epoch": 0.1543694400837258,
"grad_norm": 0.589108674850363,
"learning_rate": 8.569724284199364e-05,
"loss": 0.6514,
"step": 1180
},
{
"epoch": 0.15567765567765568,
"grad_norm": 0.9533885730603633,
"learning_rate": 8.556468716861083e-05,
"loss": 0.8537,
"step": 1190
},
{
"epoch": 0.15698587127158556,
"grad_norm": 0.5479885592424896,
"learning_rate": 8.5432131495228e-05,
"loss": 0.6615,
"step": 1200
},
{
"epoch": 0.15829408686551544,
"grad_norm": 0.6153890373628342,
"learning_rate": 8.529957582184517e-05,
"loss": 0.8631,
"step": 1210
},
{
"epoch": 0.15960230245944532,
"grad_norm": 0.7917177530306803,
"learning_rate": 8.516702014846236e-05,
"loss": 0.6616,
"step": 1220
},
{
"epoch": 0.1609105180533752,
"grad_norm": 0.656469547745639,
"learning_rate": 8.503446447507953e-05,
"loss": 0.8929,
"step": 1230
},
{
"epoch": 0.16221873364730507,
"grad_norm": 0.5569826880804676,
"learning_rate": 8.490190880169672e-05,
"loss": 0.6477,
"step": 1240
},
{
"epoch": 0.16352694924123495,
"grad_norm": 0.6145021073052068,
"learning_rate": 8.47693531283139e-05,
"loss": 0.8835,
"step": 1250
},
{
"epoch": 0.16483516483516483,
"grad_norm": 0.6285934467054461,
"learning_rate": 8.463679745493108e-05,
"loss": 0.6549,
"step": 1260
},
{
"epoch": 0.1661433804290947,
"grad_norm": 0.4969597278981036,
"learning_rate": 8.450424178154825e-05,
"loss": 0.8846,
"step": 1270
},
{
"epoch": 0.1674515960230246,
"grad_norm": 0.6062848536416026,
"learning_rate": 8.437168610816544e-05,
"loss": 0.6269,
"step": 1280
},
{
"epoch": 0.16875981161695447,
"grad_norm": 0.5265730484032111,
"learning_rate": 8.423913043478261e-05,
"loss": 0.8592,
"step": 1290
},
{
"epoch": 0.17006802721088435,
"grad_norm": 0.5811110234076874,
"learning_rate": 8.41065747613998e-05,
"loss": 0.6544,
"step": 1300
},
{
"epoch": 0.17137624280481423,
"grad_norm": 0.5639694934132476,
"learning_rate": 8.397401908801698e-05,
"loss": 0.8732,
"step": 1310
},
{
"epoch": 0.1726844583987441,
"grad_norm": 0.7531411828367692,
"learning_rate": 8.384146341463415e-05,
"loss": 0.6683,
"step": 1320
},
{
"epoch": 0.17399267399267399,
"grad_norm": 0.5146605068810605,
"learning_rate": 8.370890774125133e-05,
"loss": 0.8911,
"step": 1330
},
{
"epoch": 0.17530088958660386,
"grad_norm": 0.5881044587524927,
"learning_rate": 8.357635206786851e-05,
"loss": 0.6957,
"step": 1340
},
{
"epoch": 0.17660910518053374,
"grad_norm": 0.6108606112713066,
"learning_rate": 8.344379639448568e-05,
"loss": 0.864,
"step": 1350
},
{
"epoch": 0.17791732077446362,
"grad_norm": 0.6838348363870184,
"learning_rate": 8.331124072110287e-05,
"loss": 0.6382,
"step": 1360
},
{
"epoch": 0.1792255363683935,
"grad_norm": 0.5844899134885503,
"learning_rate": 8.317868504772006e-05,
"loss": 0.895,
"step": 1370
},
{
"epoch": 0.18053375196232338,
"grad_norm": 0.40337593852276243,
"learning_rate": 8.304612937433723e-05,
"loss": 0.649,
"step": 1380
},
{
"epoch": 0.18184196755625326,
"grad_norm": 0.49730732309328707,
"learning_rate": 8.29135737009544e-05,
"loss": 0.8442,
"step": 1390
},
{
"epoch": 0.18315018315018314,
"grad_norm": 0.5590991907664666,
"learning_rate": 8.278101802757159e-05,
"loss": 0.6445,
"step": 1400
},
{
"epoch": 0.18445839874411302,
"grad_norm": 0.6588503605001691,
"learning_rate": 8.264846235418876e-05,
"loss": 0.864,
"step": 1410
},
{
"epoch": 0.1857666143380429,
"grad_norm": 0.6197416228060506,
"learning_rate": 8.251590668080595e-05,
"loss": 0.6094,
"step": 1420
},
{
"epoch": 0.1870748299319728,
"grad_norm": 0.5482300336211388,
"learning_rate": 8.238335100742312e-05,
"loss": 0.8619,
"step": 1430
},
{
"epoch": 0.18838304552590268,
"grad_norm": 0.72709681776675,
"learning_rate": 8.225079533404029e-05,
"loss": 0.6456,
"step": 1440
},
{
"epoch": 0.18969126111983256,
"grad_norm": 0.5600056919125233,
"learning_rate": 8.211823966065748e-05,
"loss": 0.86,
"step": 1450
},
{
"epoch": 0.19099947671376244,
"grad_norm": 0.7289799556624317,
"learning_rate": 8.198568398727466e-05,
"loss": 0.6271,
"step": 1460
},
{
"epoch": 0.19230769230769232,
"grad_norm": 0.4590489049870012,
"learning_rate": 8.185312831389184e-05,
"loss": 0.8897,
"step": 1470
},
{
"epoch": 0.1936159079016222,
"grad_norm": 0.8036211881560831,
"learning_rate": 8.172057264050902e-05,
"loss": 0.6434,
"step": 1480
},
{
"epoch": 0.19492412349555208,
"grad_norm": 0.49768694148703807,
"learning_rate": 8.15880169671262e-05,
"loss": 0.8417,
"step": 1490
},
{
"epoch": 0.19623233908948196,
"grad_norm": 0.771940987579212,
"learning_rate": 8.145546129374337e-05,
"loss": 0.626,
"step": 1500
},
{
"epoch": 0.19754055468341183,
"grad_norm": 0.5487861196155561,
"learning_rate": 8.132290562036055e-05,
"loss": 0.863,
"step": 1510
},
{
"epoch": 0.1988487702773417,
"grad_norm": 0.5767745735777565,
"learning_rate": 8.119034994697774e-05,
"loss": 0.6327,
"step": 1520
},
{
"epoch": 0.2001569858712716,
"grad_norm": 0.5740160328527427,
"learning_rate": 8.105779427359491e-05,
"loss": 0.8293,
"step": 1530
},
{
"epoch": 0.20146520146520147,
"grad_norm": 0.6248485702307536,
"learning_rate": 8.09252386002121e-05,
"loss": 0.6525,
"step": 1540
},
{
"epoch": 0.20277341705913135,
"grad_norm": 0.5959014412308178,
"learning_rate": 8.079268292682927e-05,
"loss": 0.8793,
"step": 1550
},
{
"epoch": 0.20408163265306123,
"grad_norm": 0.5523669620436882,
"learning_rate": 8.066012725344644e-05,
"loss": 0.6549,
"step": 1560
},
{
"epoch": 0.2053898482469911,
"grad_norm": 0.6456441196706465,
"learning_rate": 8.052757158006363e-05,
"loss": 0.8688,
"step": 1570
},
{
"epoch": 0.206698063840921,
"grad_norm": 0.5659881168480183,
"learning_rate": 8.039501590668082e-05,
"loss": 0.6565,
"step": 1580
},
{
"epoch": 0.20800627943485087,
"grad_norm": 0.5616175050073812,
"learning_rate": 8.026246023329799e-05,
"loss": 0.8418,
"step": 1590
},
{
"epoch": 0.20931449502878074,
"grad_norm": 0.6028672819947086,
"learning_rate": 8.012990455991518e-05,
"loss": 0.6231,
"step": 1600
},
{
"epoch": 0.21062271062271062,
"grad_norm": 0.546703449007772,
"learning_rate": 7.999734888653235e-05,
"loss": 0.8631,
"step": 1610
},
{
"epoch": 0.2119309262166405,
"grad_norm": 0.3849996641772154,
"learning_rate": 7.986479321314952e-05,
"loss": 0.6246,
"step": 1620
},
{
"epoch": 0.21323914181057038,
"grad_norm": 0.4435598672137561,
"learning_rate": 7.973223753976671e-05,
"loss": 0.886,
"step": 1630
},
{
"epoch": 0.21454735740450026,
"grad_norm": 0.6111533721181235,
"learning_rate": 7.95996818663839e-05,
"loss": 0.6494,
"step": 1640
},
{
"epoch": 0.21585557299843014,
"grad_norm": 0.5729934681943539,
"learning_rate": 7.946712619300107e-05,
"loss": 0.8618,
"step": 1650
},
{
"epoch": 0.21716378859236002,
"grad_norm": 0.6355561878934224,
"learning_rate": 7.933457051961824e-05,
"loss": 0.6303,
"step": 1660
},
{
"epoch": 0.2184720041862899,
"grad_norm": 0.4669924162265557,
"learning_rate": 7.920201484623541e-05,
"loss": 0.8654,
"step": 1670
},
{
"epoch": 0.21978021978021978,
"grad_norm": 0.6815024166926259,
"learning_rate": 7.90694591728526e-05,
"loss": 0.6299,
"step": 1680
},
{
"epoch": 0.22108843537414966,
"grad_norm": 0.5542650967967168,
"learning_rate": 7.893690349946978e-05,
"loss": 0.8661,
"step": 1690
},
{
"epoch": 0.22239665096807953,
"grad_norm": 0.5596504228795459,
"learning_rate": 7.880434782608696e-05,
"loss": 0.6462,
"step": 1700
},
{
"epoch": 0.2237048665620094,
"grad_norm": 0.6616295642250447,
"learning_rate": 7.867179215270414e-05,
"loss": 0.8638,
"step": 1710
},
{
"epoch": 0.2250130821559393,
"grad_norm": 0.42204578619386185,
"learning_rate": 7.853923647932132e-05,
"loss": 0.6493,
"step": 1720
},
{
"epoch": 0.22632129774986917,
"grad_norm": 0.6146957520525497,
"learning_rate": 7.840668080593849e-05,
"loss": 0.8621,
"step": 1730
},
{
"epoch": 0.22762951334379905,
"grad_norm": 0.595798638618577,
"learning_rate": 7.827412513255567e-05,
"loss": 0.623,
"step": 1740
},
{
"epoch": 0.22893772893772893,
"grad_norm": 0.624548132547215,
"learning_rate": 7.814156945917286e-05,
"loss": 0.8582,
"step": 1750
},
{
"epoch": 0.2302459445316588,
"grad_norm": 0.6305314992926004,
"learning_rate": 7.800901378579003e-05,
"loss": 0.6336,
"step": 1760
},
{
"epoch": 0.2315541601255887,
"grad_norm": 0.7759096340841495,
"learning_rate": 7.787645811240722e-05,
"loss": 0.8387,
"step": 1770
},
{
"epoch": 0.23286237571951857,
"grad_norm": 0.5604953018516682,
"learning_rate": 7.774390243902439e-05,
"loss": 0.6283,
"step": 1780
},
{
"epoch": 0.23417059131344845,
"grad_norm": 0.7099271936900229,
"learning_rate": 7.761134676564156e-05,
"loss": 0.8628,
"step": 1790
},
{
"epoch": 0.23547880690737832,
"grad_norm": 0.474479805099593,
"learning_rate": 7.747879109225875e-05,
"loss": 0.6259,
"step": 1800
},
{
"epoch": 0.2367870225013082,
"grad_norm": 0.5416834886960876,
"learning_rate": 7.734623541887594e-05,
"loss": 0.8567,
"step": 1810
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.5379729696776889,
"learning_rate": 7.721367974549311e-05,
"loss": 0.6096,
"step": 1820
},
{
"epoch": 0.239403453689168,
"grad_norm": 0.6138823092852339,
"learning_rate": 7.70811240721103e-05,
"loss": 0.8454,
"step": 1830
},
{
"epoch": 0.24071166928309787,
"grad_norm": 0.7163967325753281,
"learning_rate": 7.694856839872747e-05,
"loss": 0.6751,
"step": 1840
},
{
"epoch": 0.24201988487702775,
"grad_norm": 0.7446327434350999,
"learning_rate": 7.681601272534464e-05,
"loss": 0.8933,
"step": 1850
},
{
"epoch": 0.24332810047095763,
"grad_norm": 0.674203813956785,
"learning_rate": 7.668345705196183e-05,
"loss": 0.6436,
"step": 1860
},
{
"epoch": 0.2446363160648875,
"grad_norm": 0.5298303453138061,
"learning_rate": 7.655090137857901e-05,
"loss": 0.8383,
"step": 1870
},
{
"epoch": 0.24594453165881738,
"grad_norm": 0.46309042220240854,
"learning_rate": 7.641834570519619e-05,
"loss": 0.624,
"step": 1880
},
{
"epoch": 0.24725274725274726,
"grad_norm": 0.527675840331917,
"learning_rate": 7.628579003181336e-05,
"loss": 0.8423,
"step": 1890
},
{
"epoch": 0.24856096284667714,
"grad_norm": 0.6656931192324065,
"learning_rate": 7.615323435843054e-05,
"loss": 0.66,
"step": 1900
},
{
"epoch": 0.24986917844060702,
"grad_norm": 0.5865447145073008,
"learning_rate": 7.602067868504772e-05,
"loss": 0.8659,
"step": 1910
},
{
"epoch": 0.25117739403453687,
"grad_norm": 0.5349083325210562,
"learning_rate": 7.58881230116649e-05,
"loss": 0.6439,
"step": 1920
},
{
"epoch": 0.2524856096284668,
"grad_norm": 0.5557018540060792,
"learning_rate": 7.575556733828209e-05,
"loss": 0.846,
"step": 1930
},
{
"epoch": 0.25379382522239663,
"grad_norm": 0.5858709719889754,
"learning_rate": 7.562301166489926e-05,
"loss": 0.6473,
"step": 1940
},
{
"epoch": 0.25510204081632654,
"grad_norm": 0.5461126801060399,
"learning_rate": 7.549045599151643e-05,
"loss": 0.874,
"step": 1950
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.6905067688723441,
"learning_rate": 7.535790031813362e-05,
"loss": 0.6091,
"step": 1960
},
{
"epoch": 0.2577184720041863,
"grad_norm": 0.7656408539875517,
"learning_rate": 7.522534464475079e-05,
"loss": 0.8679,
"step": 1970
},
{
"epoch": 0.25902668759811615,
"grad_norm": 0.7233211006267372,
"learning_rate": 7.509278897136798e-05,
"loss": 0.6749,
"step": 1980
},
{
"epoch": 0.26033490319204605,
"grad_norm": 0.5989700856780242,
"learning_rate": 7.496023329798517e-05,
"loss": 0.833,
"step": 1990
},
{
"epoch": 0.2616431187859759,
"grad_norm": 0.5435529702312377,
"learning_rate": 7.482767762460234e-05,
"loss": 0.658,
"step": 2000
},
{
"epoch": 0.2629513343799058,
"grad_norm": 0.5335997393071716,
"learning_rate": 7.469512195121951e-05,
"loss": 0.8399,
"step": 2010
},
{
"epoch": 0.26425954997383566,
"grad_norm": 0.9150436835320093,
"learning_rate": 7.45625662778367e-05,
"loss": 0.6114,
"step": 2020
},
{
"epoch": 0.26556776556776557,
"grad_norm": 0.5384709854955332,
"learning_rate": 7.443001060445387e-05,
"loss": 0.8413,
"step": 2030
},
{
"epoch": 0.2668759811616955,
"grad_norm": 0.8677435387982771,
"learning_rate": 7.429745493107106e-05,
"loss": 0.6475,
"step": 2040
},
{
"epoch": 0.2681841967556253,
"grad_norm": 0.5123690892694776,
"learning_rate": 7.416489925768824e-05,
"loss": 0.849,
"step": 2050
},
{
"epoch": 0.26949241234955523,
"grad_norm": 0.5800543468099533,
"learning_rate": 7.403234358430541e-05,
"loss": 0.6127,
"step": 2060
},
{
"epoch": 0.2708006279434851,
"grad_norm": 0.8015793490826957,
"learning_rate": 7.389978791092259e-05,
"loss": 0.8682,
"step": 2070
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.5876567022243202,
"learning_rate": 7.376723223753977e-05,
"loss": 0.6164,
"step": 2080
},
{
"epoch": 0.27341705913134484,
"grad_norm": 0.5542805867826196,
"learning_rate": 7.363467656415695e-05,
"loss": 0.8808,
"step": 2090
},
{
"epoch": 0.27472527472527475,
"grad_norm": 0.5170888187943017,
"learning_rate": 7.350212089077413e-05,
"loss": 0.6375,
"step": 2100
},
{
"epoch": 0.2760334903192046,
"grad_norm": 0.622686247940597,
"learning_rate": 7.336956521739132e-05,
"loss": 0.8605,
"step": 2110
},
{
"epoch": 0.2773417059131345,
"grad_norm": 0.5587125235931543,
"learning_rate": 7.323700954400848e-05,
"loss": 0.6237,
"step": 2120
},
{
"epoch": 0.27864992150706436,
"grad_norm": 0.6015065825515082,
"learning_rate": 7.310445387062566e-05,
"loss": 0.8276,
"step": 2130
},
{
"epoch": 0.27995813710099426,
"grad_norm": 0.5122297183116,
"learning_rate": 7.297189819724285e-05,
"loss": 0.6289,
"step": 2140
},
{
"epoch": 0.2812663526949241,
"grad_norm": 0.5663980177757836,
"learning_rate": 7.283934252386002e-05,
"loss": 0.861,
"step": 2150
},
{
"epoch": 0.282574568288854,
"grad_norm": 0.7939853395114802,
"learning_rate": 7.270678685047721e-05,
"loss": 0.6551,
"step": 2160
},
{
"epoch": 0.2838827838827839,
"grad_norm": 0.5287178325117134,
"learning_rate": 7.257423117709438e-05,
"loss": 0.8747,
"step": 2170
},
{
"epoch": 0.2851909994767138,
"grad_norm": 0.562616836311441,
"learning_rate": 7.244167550371155e-05,
"loss": 0.6357,
"step": 2180
},
{
"epoch": 0.28649921507064363,
"grad_norm": 0.5117823698698972,
"learning_rate": 7.230911983032874e-05,
"loss": 0.8377,
"step": 2190
},
{
"epoch": 0.28780743066457354,
"grad_norm": 0.6453579912049506,
"learning_rate": 7.217656415694593e-05,
"loss": 0.6476,
"step": 2200
},
{
"epoch": 0.2891156462585034,
"grad_norm": 0.8731364069825441,
"learning_rate": 7.20440084835631e-05,
"loss": 0.8773,
"step": 2210
},
{
"epoch": 0.2904238618524333,
"grad_norm": 0.6010315625749808,
"learning_rate": 7.191145281018028e-05,
"loss": 0.6083,
"step": 2220
},
{
"epoch": 0.29173207744636315,
"grad_norm": 0.5201002760771769,
"learning_rate": 7.177889713679746e-05,
"loss": 0.8496,
"step": 2230
},
{
"epoch": 0.29304029304029305,
"grad_norm": 0.6001877343124563,
"learning_rate": 7.164634146341463e-05,
"loss": 0.6389,
"step": 2240
},
{
"epoch": 0.2943485086342229,
"grad_norm": 0.4371089670254897,
"learning_rate": 7.151378579003182e-05,
"loss": 0.8706,
"step": 2250
},
{
"epoch": 0.2956567242281528,
"grad_norm": 0.5778087926720157,
"learning_rate": 7.1381230116649e-05,
"loss": 0.5939,
"step": 2260
},
{
"epoch": 0.29696493982208266,
"grad_norm": 0.5241829955540505,
"learning_rate": 7.124867444326617e-05,
"loss": 0.8461,
"step": 2270
},
{
"epoch": 0.29827315541601257,
"grad_norm": 0.5396461429281756,
"learning_rate": 7.111611876988336e-05,
"loss": 0.645,
"step": 2280
},
{
"epoch": 0.2995813710099424,
"grad_norm": 0.7287884001398448,
"learning_rate": 7.098356309650053e-05,
"loss": 0.8349,
"step": 2290
},
{
"epoch": 0.3008895866038723,
"grad_norm": 0.43985568295051974,
"learning_rate": 7.08510074231177e-05,
"loss": 0.6023,
"step": 2300
},
{
"epoch": 0.3021978021978022,
"grad_norm": 0.512789679014569,
"learning_rate": 7.071845174973489e-05,
"loss": 0.8592,
"step": 2310
},
{
"epoch": 0.3035060177917321,
"grad_norm": 0.5758747078766807,
"learning_rate": 7.058589607635208e-05,
"loss": 0.6302,
"step": 2320
},
{
"epoch": 0.30481423338566194,
"grad_norm": 0.605959590741427,
"learning_rate": 7.045334040296925e-05,
"loss": 0.8437,
"step": 2330
},
{
"epoch": 0.30612244897959184,
"grad_norm": 0.5064367889657759,
"learning_rate": 7.032078472958644e-05,
"loss": 0.6228,
"step": 2340
},
{
"epoch": 0.3074306645735217,
"grad_norm": 0.4968490853247213,
"learning_rate": 7.018822905620361e-05,
"loss": 0.8556,
"step": 2350
},
{
"epoch": 0.3087388801674516,
"grad_norm": 0.5772837050103059,
"learning_rate": 7.005567338282078e-05,
"loss": 0.6234,
"step": 2360
},
{
"epoch": 0.31004709576138145,
"grad_norm": 0.6941306253378574,
"learning_rate": 6.992311770943797e-05,
"loss": 0.8362,
"step": 2370
},
{
"epoch": 0.31135531135531136,
"grad_norm": 0.39822917719900375,
"learning_rate": 6.979056203605516e-05,
"loss": 0.6162,
"step": 2380
},
{
"epoch": 0.3126635269492412,
"grad_norm": 0.4661807662020318,
"learning_rate": 6.965800636267233e-05,
"loss": 0.8414,
"step": 2390
},
{
"epoch": 0.3139717425431711,
"grad_norm": 0.45291786679307866,
"learning_rate": 6.95254506892895e-05,
"loss": 0.6433,
"step": 2400
},
{
"epoch": 0.31527995813710097,
"grad_norm": 0.6300442868544786,
"learning_rate": 6.939289501590669e-05,
"loss": 0.8564,
"step": 2410
},
{
"epoch": 0.3165881737310309,
"grad_norm": 0.5144882452050621,
"learning_rate": 6.926033934252386e-05,
"loss": 0.6018,
"step": 2420
},
{
"epoch": 0.3178963893249607,
"grad_norm": 0.5886230291498049,
"learning_rate": 6.912778366914105e-05,
"loss": 0.8861,
"step": 2430
},
{
"epoch": 0.31920460491889063,
"grad_norm": 0.6836197118690605,
"learning_rate": 6.899522799575823e-05,
"loss": 0.6267,
"step": 2440
},
{
"epoch": 0.32051282051282054,
"grad_norm": 0.4951991615110591,
"learning_rate": 6.88626723223754e-05,
"loss": 0.8387,
"step": 2450
},
{
"epoch": 0.3218210361067504,
"grad_norm": 0.6412063103894238,
"learning_rate": 6.873011664899258e-05,
"loss": 0.5927,
"step": 2460
},
{
"epoch": 0.3231292517006803,
"grad_norm": 0.5782953097169173,
"learning_rate": 6.859756097560976e-05,
"loss": 0.8418,
"step": 2470
},
{
"epoch": 0.32443746729461015,
"grad_norm": 0.5200166820109174,
"learning_rate": 6.846500530222694e-05,
"loss": 0.6324,
"step": 2480
},
{
"epoch": 0.32574568288854006,
"grad_norm": 0.4922169436686345,
"learning_rate": 6.833244962884412e-05,
"loss": 0.8298,
"step": 2490
},
{
"epoch": 0.3270538984824699,
"grad_norm": 0.4912804994081646,
"learning_rate": 6.819989395546131e-05,
"loss": 0.6123,
"step": 2500
},
{
"epoch": 0.3283621140763998,
"grad_norm": 0.5502723124138919,
"learning_rate": 6.806733828207848e-05,
"loss": 0.8522,
"step": 2510
},
{
"epoch": 0.32967032967032966,
"grad_norm": 0.5875399056209475,
"learning_rate": 6.793478260869565e-05,
"loss": 0.6144,
"step": 2520
},
{
"epoch": 0.33097854526425957,
"grad_norm": 0.5609437523704139,
"learning_rate": 6.780222693531283e-05,
"loss": 0.8347,
"step": 2530
},
{
"epoch": 0.3322867608581894,
"grad_norm": 0.338386078229902,
"learning_rate": 6.766967126193001e-05,
"loss": 0.587,
"step": 2540
},
{
"epoch": 0.33359497645211933,
"grad_norm": 0.48756512910329014,
"learning_rate": 6.75371155885472e-05,
"loss": 0.8327,
"step": 2550
},
{
"epoch": 0.3349031920460492,
"grad_norm": 0.5143776858886028,
"learning_rate": 6.740455991516437e-05,
"loss": 0.6092,
"step": 2560
},
{
"epoch": 0.3362114076399791,
"grad_norm": 0.5332261671554006,
"learning_rate": 6.727200424178154e-05,
"loss": 0.8486,
"step": 2570
},
{
"epoch": 0.33751962323390894,
"grad_norm": 0.6594969470024875,
"learning_rate": 6.713944856839873e-05,
"loss": 0.6256,
"step": 2580
},
{
"epoch": 0.33882783882783885,
"grad_norm": 0.4553927815832327,
"learning_rate": 6.70068928950159e-05,
"loss": 0.8357,
"step": 2590
},
{
"epoch": 0.3401360544217687,
"grad_norm": 1.1222853735995233,
"learning_rate": 6.687433722163309e-05,
"loss": 0.6007,
"step": 2600
},
{
"epoch": 0.3414442700156986,
"grad_norm": 0.528439009952554,
"learning_rate": 6.674178154825027e-05,
"loss": 0.856,
"step": 2610
},
{
"epoch": 0.34275248560962845,
"grad_norm": 0.5933307919214319,
"learning_rate": 6.660922587486745e-05,
"loss": 0.6421,
"step": 2620
},
{
"epoch": 0.34406070120355836,
"grad_norm": 0.6607714283195972,
"learning_rate": 6.647667020148462e-05,
"loss": 0.8567,
"step": 2630
},
{
"epoch": 0.3453689167974882,
"grad_norm": 0.7007503714365677,
"learning_rate": 6.63441145281018e-05,
"loss": 0.6207,
"step": 2640
},
{
"epoch": 0.3466771323914181,
"grad_norm": 0.5135207638483642,
"learning_rate": 6.621155885471898e-05,
"loss": 0.8288,
"step": 2650
},
{
"epoch": 0.34798534798534797,
"grad_norm": 0.6140077846492299,
"learning_rate": 6.607900318133616e-05,
"loss": 0.6015,
"step": 2660
},
{
"epoch": 0.3492935635792779,
"grad_norm": 0.5100090518445459,
"learning_rate": 6.594644750795335e-05,
"loss": 0.8348,
"step": 2670
},
{
"epoch": 0.35060177917320773,
"grad_norm": 0.5113632712765585,
"learning_rate": 6.581389183457052e-05,
"loss": 0.5924,
"step": 2680
},
{
"epoch": 0.35190999476713763,
"grad_norm": 0.5039720207488532,
"learning_rate": 6.56813361611877e-05,
"loss": 0.8184,
"step": 2690
},
{
"epoch": 0.3532182103610675,
"grad_norm": 0.49806965997978137,
"learning_rate": 6.554878048780488e-05,
"loss": 0.6046,
"step": 2700
},
{
"epoch": 0.3545264259549974,
"grad_norm": 0.5365120734516775,
"learning_rate": 6.541622481442205e-05,
"loss": 0.8514,
"step": 2710
},
{
"epoch": 0.35583464154892724,
"grad_norm": 0.4007231339986853,
"learning_rate": 6.528366914103924e-05,
"loss": 0.5871,
"step": 2720
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.46254442147732966,
"learning_rate": 6.515111346765643e-05,
"loss": 0.8411,
"step": 2730
},
{
"epoch": 0.358451072736787,
"grad_norm": 0.6576167821737804,
"learning_rate": 6.50185577942736e-05,
"loss": 0.6293,
"step": 2740
},
{
"epoch": 0.3597592883307169,
"grad_norm": 0.5967952480358752,
"learning_rate": 6.488600212089077e-05,
"loss": 0.8511,
"step": 2750
},
{
"epoch": 0.36106750392464676,
"grad_norm": 0.43508005533871247,
"learning_rate": 6.475344644750796e-05,
"loss": 0.621,
"step": 2760
},
{
"epoch": 0.36237571951857667,
"grad_norm": 0.48108090717243124,
"learning_rate": 6.462089077412513e-05,
"loss": 0.8263,
"step": 2770
},
{
"epoch": 0.3636839351125065,
"grad_norm": 0.568823321053749,
"learning_rate": 6.448833510074232e-05,
"loss": 0.5941,
"step": 2780
},
{
"epoch": 0.3649921507064364,
"grad_norm": 0.5222642833529956,
"learning_rate": 6.43557794273595e-05,
"loss": 0.8525,
"step": 2790
},
{
"epoch": 0.3663003663003663,
"grad_norm": 0.5919546377210694,
"learning_rate": 6.422322375397666e-05,
"loss": 0.5901,
"step": 2800
},
{
"epoch": 0.3676085818942962,
"grad_norm": 0.46817017239205494,
"learning_rate": 6.409066808059385e-05,
"loss": 0.8678,
"step": 2810
},
{
"epoch": 0.36891679748822603,
"grad_norm": 0.5530202729949584,
"learning_rate": 6.395811240721103e-05,
"loss": 0.6401,
"step": 2820
},
{
"epoch": 0.37022501308215594,
"grad_norm": 0.6007081120859754,
"learning_rate": 6.382555673382821e-05,
"loss": 0.8291,
"step": 2830
},
{
"epoch": 0.3715332286760858,
"grad_norm": 0.39683924232406637,
"learning_rate": 6.36930010604454e-05,
"loss": 0.6127,
"step": 2840
},
{
"epoch": 0.3728414442700157,
"grad_norm": 0.5452479168330785,
"learning_rate": 6.356044538706257e-05,
"loss": 0.8495,
"step": 2850
},
{
"epoch": 0.3741496598639456,
"grad_norm": 0.8980580036344767,
"learning_rate": 6.342788971367974e-05,
"loss": 0.6471,
"step": 2860
},
{
"epoch": 0.37545787545787546,
"grad_norm": 0.48885709974761143,
"learning_rate": 6.329533404029692e-05,
"loss": 0.8626,
"step": 2870
},
{
"epoch": 0.37676609105180536,
"grad_norm": 0.5684144901718525,
"learning_rate": 6.316277836691411e-05,
"loss": 0.5869,
"step": 2880
},
{
"epoch": 0.3780743066457352,
"grad_norm": 0.5836928349425793,
"learning_rate": 6.303022269353128e-05,
"loss": 0.8205,
"step": 2890
},
{
"epoch": 0.3793825222396651,
"grad_norm": 0.5698475129755676,
"learning_rate": 6.289766702014847e-05,
"loss": 0.5985,
"step": 2900
},
{
"epoch": 0.38069073783359497,
"grad_norm": 0.5257222435975014,
"learning_rate": 6.276511134676564e-05,
"loss": 0.8572,
"step": 2910
},
{
"epoch": 0.3819989534275249,
"grad_norm": 0.524065233271633,
"learning_rate": 6.263255567338282e-05,
"loss": 0.5917,
"step": 2920
},
{
"epoch": 0.38330716902145473,
"grad_norm": 0.5635081193305911,
"learning_rate": 6.25e-05,
"loss": 0.8333,
"step": 2930
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.7155336191069814,
"learning_rate": 6.236744432661719e-05,
"loss": 0.6211,
"step": 2940
},
{
"epoch": 0.3859236002093145,
"grad_norm": 0.5087934741189605,
"learning_rate": 6.223488865323436e-05,
"loss": 0.8531,
"step": 2950
},
{
"epoch": 0.3872318158032444,
"grad_norm": 0.5635575417178986,
"learning_rate": 6.210233297985155e-05,
"loss": 0.6134,
"step": 2960
},
{
"epoch": 0.38854003139717425,
"grad_norm": 0.5136035425363329,
"learning_rate": 6.196977730646872e-05,
"loss": 0.8142,
"step": 2970
},
{
"epoch": 0.38984824699110415,
"grad_norm": 0.6741458077201186,
"learning_rate": 6.183722163308589e-05,
"loss": 0.6388,
"step": 2980
},
{
"epoch": 0.391156462585034,
"grad_norm": 0.49349088755882425,
"learning_rate": 6.170466595970308e-05,
"loss": 0.8764,
"step": 2990
},
{
"epoch": 0.3924646781789639,
"grad_norm": 0.5383608654756946,
"learning_rate": 6.157211028632026e-05,
"loss": 0.6271,
"step": 3000
},
{
"epoch": 0.39377289377289376,
"grad_norm": 0.5194331296491325,
"learning_rate": 6.143955461293744e-05,
"loss": 0.8087,
"step": 3010
},
{
"epoch": 0.39508110936682367,
"grad_norm": 1.1199076383208129,
"learning_rate": 6.130699893955462e-05,
"loss": 0.5948,
"step": 3020
},
{
"epoch": 0.3963893249607535,
"grad_norm": 0.5119305052018042,
"learning_rate": 6.11744432661718e-05,
"loss": 0.7975,
"step": 3030
},
{
"epoch": 0.3976975405546834,
"grad_norm": 0.6365196412735817,
"learning_rate": 6.104188759278897e-05,
"loss": 0.614,
"step": 3040
},
{
"epoch": 0.3990057561486133,
"grad_norm": 0.4736074626159983,
"learning_rate": 6.0909331919406154e-05,
"loss": 0.8416,
"step": 3050
},
{
"epoch": 0.4003139717425432,
"grad_norm": 0.8101929076329764,
"learning_rate": 6.077677624602334e-05,
"loss": 0.6173,
"step": 3060
},
{
"epoch": 0.40162218733647304,
"grad_norm": 0.6168972276400505,
"learning_rate": 6.0644220572640506e-05,
"loss": 0.8407,
"step": 3070
},
{
"epoch": 0.40293040293040294,
"grad_norm": 0.7038053233260652,
"learning_rate": 6.051166489925769e-05,
"loss": 0.6247,
"step": 3080
},
{
"epoch": 0.4042386185243328,
"grad_norm": 0.457467448213452,
"learning_rate": 6.037910922587487e-05,
"loss": 0.8322,
"step": 3090
},
{
"epoch": 0.4055468341182627,
"grad_norm": 0.49357888449301257,
"learning_rate": 6.0246553552492044e-05,
"loss": 0.5908,
"step": 3100
},
{
"epoch": 0.40685504971219255,
"grad_norm": 0.5808455738349774,
"learning_rate": 6.011399787910923e-05,
"loss": 0.8208,
"step": 3110
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.7022945988432154,
"learning_rate": 5.998144220572641e-05,
"loss": 0.6518,
"step": 3120
},
{
"epoch": 0.4094714809000523,
"grad_norm": 0.6078075113609221,
"learning_rate": 5.984888653234358e-05,
"loss": 0.8636,
"step": 3130
},
{
"epoch": 0.4107796964939822,
"grad_norm": 0.6199267742106505,
"learning_rate": 5.971633085896077e-05,
"loss": 0.5869,
"step": 3140
},
{
"epoch": 0.41208791208791207,
"grad_norm": 0.6063216826492921,
"learning_rate": 5.958377518557795e-05,
"loss": 0.8341,
"step": 3150
},
{
"epoch": 0.413396127681842,
"grad_norm": 0.3985769574080392,
"learning_rate": 5.945121951219512e-05,
"loss": 0.6207,
"step": 3160
},
{
"epoch": 0.4147043432757718,
"grad_norm": 0.5360752659857189,
"learning_rate": 5.931866383881231e-05,
"loss": 0.8621,
"step": 3170
},
{
"epoch": 0.41601255886970173,
"grad_norm": 0.7664757845756865,
"learning_rate": 5.9186108165429486e-05,
"loss": 0.6172,
"step": 3180
},
{
"epoch": 0.4173207744636316,
"grad_norm": 0.5679551885466096,
"learning_rate": 5.905355249204666e-05,
"loss": 0.8324,
"step": 3190
},
{
"epoch": 0.4186289900575615,
"grad_norm": 0.5745648833281791,
"learning_rate": 5.8920996818663845e-05,
"loss": 0.5986,
"step": 3200
},
{
"epoch": 0.41993720565149134,
"grad_norm": 0.5395040305019896,
"learning_rate": 5.8788441145281024e-05,
"loss": 0.8278,
"step": 3210
},
{
"epoch": 0.42124542124542125,
"grad_norm": 0.5945122594045797,
"learning_rate": 5.86558854718982e-05,
"loss": 0.6128,
"step": 3220
},
{
"epoch": 0.4225536368393511,
"grad_norm": 0.4545401837270316,
"learning_rate": 5.852332979851538e-05,
"loss": 0.8388,
"step": 3230
},
{
"epoch": 0.423861852433281,
"grad_norm": 0.5949653779281652,
"learning_rate": 5.839077412513256e-05,
"loss": 0.6167,
"step": 3240
},
{
"epoch": 0.42517006802721086,
"grad_norm": 0.5067052935095019,
"learning_rate": 5.8258218451749735e-05,
"loss": 0.8489,
"step": 3250
},
{
"epoch": 0.42647828362114076,
"grad_norm": 0.5191225717265471,
"learning_rate": 5.812566277836692e-05,
"loss": 0.6059,
"step": 3260
},
{
"epoch": 0.42778649921507067,
"grad_norm": 0.4969767064651156,
"learning_rate": 5.79931071049841e-05,
"loss": 0.8627,
"step": 3270
},
{
"epoch": 0.4290947148090005,
"grad_norm": 0.5785476329537975,
"learning_rate": 5.786055143160127e-05,
"loss": 0.622,
"step": 3280
},
{
"epoch": 0.43040293040293043,
"grad_norm": 0.5598487117314234,
"learning_rate": 5.772799575821846e-05,
"loss": 0.807,
"step": 3290
},
{
"epoch": 0.4317111459968603,
"grad_norm": 0.5427120253597403,
"learning_rate": 5.759544008483564e-05,
"loss": 0.6085,
"step": 3300
},
{
"epoch": 0.4330193615907902,
"grad_norm": 0.5672042251019751,
"learning_rate": 5.746288441145281e-05,
"loss": 0.822,
"step": 3310
},
{
"epoch": 0.43432757718472004,
"grad_norm": 0.2894379338449793,
"learning_rate": 5.733032873806999e-05,
"loss": 0.5945,
"step": 3320
},
{
"epoch": 0.43563579277864994,
"grad_norm": 0.677558336208755,
"learning_rate": 5.719777306468718e-05,
"loss": 0.8413,
"step": 3330
},
{
"epoch": 0.4369440083725798,
"grad_norm": 0.723454845173246,
"learning_rate": 5.706521739130435e-05,
"loss": 0.6216,
"step": 3340
},
{
"epoch": 0.4382522239665097,
"grad_norm": 0.4482745953779932,
"learning_rate": 5.693266171792153e-05,
"loss": 0.8383,
"step": 3350
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.5812694343372896,
"learning_rate": 5.68001060445387e-05,
"loss": 0.6193,
"step": 3360
},
{
"epoch": 0.44086865515436946,
"grad_norm": 0.6007437160420993,
"learning_rate": 5.666755037115589e-05,
"loss": 0.8203,
"step": 3370
},
{
"epoch": 0.4421768707482993,
"grad_norm": 0.6580604949252575,
"learning_rate": 5.653499469777307e-05,
"loss": 0.6043,
"step": 3380
},
{
"epoch": 0.4434850863422292,
"grad_norm": 0.6046064236672993,
"learning_rate": 5.640243902439024e-05,
"loss": 0.8204,
"step": 3390
},
{
"epoch": 0.44479330193615907,
"grad_norm": 0.47437375038207613,
"learning_rate": 5.6269883351007426e-05,
"loss": 0.6205,
"step": 3400
},
{
"epoch": 0.446101517530089,
"grad_norm": 0.4644396135020371,
"learning_rate": 5.6137327677624605e-05,
"loss": 0.8304,
"step": 3410
},
{
"epoch": 0.4474097331240188,
"grad_norm": 0.6080652395243531,
"learning_rate": 5.600477200424178e-05,
"loss": 0.5971,
"step": 3420
},
{
"epoch": 0.44871794871794873,
"grad_norm": 0.4969712368034865,
"learning_rate": 5.5872216330858964e-05,
"loss": 0.824,
"step": 3430
},
{
"epoch": 0.4500261643118786,
"grad_norm": 0.7769636104632821,
"learning_rate": 5.5739660657476144e-05,
"loss": 0.6071,
"step": 3440
},
{
"epoch": 0.4513343799058085,
"grad_norm": 0.5343464867649641,
"learning_rate": 5.5607104984093316e-05,
"loss": 0.8439,
"step": 3450
},
{
"epoch": 0.45264259549973834,
"grad_norm": 0.4670043204824149,
"learning_rate": 5.54745493107105e-05,
"loss": 0.5924,
"step": 3460
},
{
"epoch": 0.45395081109366825,
"grad_norm": 0.5128069264337731,
"learning_rate": 5.534199363732768e-05,
"loss": 0.8318,
"step": 3470
},
{
"epoch": 0.4552590266875981,
"grad_norm": 0.5401925527479321,
"learning_rate": 5.5209437963944854e-05,
"loss": 0.5857,
"step": 3480
},
{
"epoch": 0.456567242281528,
"grad_norm": 0.5099962477533319,
"learning_rate": 5.507688229056204e-05,
"loss": 0.8202,
"step": 3490
},
{
"epoch": 0.45787545787545786,
"grad_norm": 0.5003587460481527,
"learning_rate": 5.494432661717922e-05,
"loss": 0.58,
"step": 3500
},
{
"epoch": 0.45918367346938777,
"grad_norm": 0.5134647272370393,
"learning_rate": 5.481177094379639e-05,
"loss": 0.8078,
"step": 3510
},
{
"epoch": 0.4604918890633176,
"grad_norm": 0.5469814680290468,
"learning_rate": 5.467921527041357e-05,
"loss": 0.6337,
"step": 3520
},
{
"epoch": 0.4618001046572475,
"grad_norm": 0.592648552852463,
"learning_rate": 5.454665959703076e-05,
"loss": 0.8411,
"step": 3530
},
{
"epoch": 0.4631083202511774,
"grad_norm": 0.5181504434630264,
"learning_rate": 5.441410392364793e-05,
"loss": 0.6083,
"step": 3540
},
{
"epoch": 0.4644165358451073,
"grad_norm": 0.5499418553622303,
"learning_rate": 5.428154825026511e-05,
"loss": 0.849,
"step": 3550
},
{
"epoch": 0.46572475143903713,
"grad_norm": 0.6574654424945503,
"learning_rate": 5.4148992576882296e-05,
"loss": 0.608,
"step": 3560
},
{
"epoch": 0.46703296703296704,
"grad_norm": 0.6187475128909983,
"learning_rate": 5.401643690349947e-05,
"loss": 0.8159,
"step": 3570
},
{
"epoch": 0.4683411826268969,
"grad_norm": 0.534345893303697,
"learning_rate": 5.388388123011665e-05,
"loss": 0.6117,
"step": 3580
},
{
"epoch": 0.4696493982208268,
"grad_norm": 0.4564109249983669,
"learning_rate": 5.3751325556733834e-05,
"loss": 0.8049,
"step": 3590
},
{
"epoch": 0.47095761381475665,
"grad_norm": 0.5578958162524437,
"learning_rate": 5.361876988335101e-05,
"loss": 0.6157,
"step": 3600
},
{
"epoch": 0.47226582940868655,
"grad_norm": 0.500181472769301,
"learning_rate": 5.3486214209968186e-05,
"loss": 0.8502,
"step": 3610
},
{
"epoch": 0.4735740450026164,
"grad_norm": 0.6141834045693259,
"learning_rate": 5.335365853658537e-05,
"loss": 0.6031,
"step": 3620
},
{
"epoch": 0.4748822605965463,
"grad_norm": 0.4961433938028621,
"learning_rate": 5.3221102863202545e-05,
"loss": 0.8375,
"step": 3630
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.520091439534862,
"learning_rate": 5.3088547189819725e-05,
"loss": 0.5814,
"step": 3640
},
{
"epoch": 0.47749869178440607,
"grad_norm": 0.6206247789922092,
"learning_rate": 5.295599151643691e-05,
"loss": 0.8411,
"step": 3650
},
{
"epoch": 0.478806907378336,
"grad_norm": 0.5343995283355165,
"learning_rate": 5.282343584305408e-05,
"loss": 0.5814,
"step": 3660
},
{
"epoch": 0.48011512297226583,
"grad_norm": 0.6876308319219042,
"learning_rate": 5.269088016967126e-05,
"loss": 0.8112,
"step": 3670
},
{
"epoch": 0.48142333856619574,
"grad_norm": 0.8088277743851824,
"learning_rate": 5.255832449628845e-05,
"loss": 0.5852,
"step": 3680
},
{
"epoch": 0.4827315541601256,
"grad_norm": 0.5226035766833458,
"learning_rate": 5.242576882290562e-05,
"loss": 0.8115,
"step": 3690
},
{
"epoch": 0.4840397697540555,
"grad_norm": 0.711443652596892,
"learning_rate": 5.22932131495228e-05,
"loss": 0.6331,
"step": 3700
},
{
"epoch": 0.48534798534798534,
"grad_norm": 0.44695965081239714,
"learning_rate": 5.216065747613999e-05,
"loss": 0.8312,
"step": 3710
},
{
"epoch": 0.48665620094191525,
"grad_norm": 0.43172836208765664,
"learning_rate": 5.202810180275716e-05,
"loss": 0.6138,
"step": 3720
},
{
"epoch": 0.4879644165358451,
"grad_norm": 0.48852761265154354,
"learning_rate": 5.189554612937434e-05,
"loss": 0.8535,
"step": 3730
},
{
"epoch": 0.489272632129775,
"grad_norm": 0.98510305137075,
"learning_rate": 5.1762990455991525e-05,
"loss": 0.6202,
"step": 3740
},
{
"epoch": 0.49058084772370486,
"grad_norm": 0.48915174823336055,
"learning_rate": 5.163043478260869e-05,
"loss": 0.8519,
"step": 3750
},
{
"epoch": 0.49188906331763477,
"grad_norm": 0.5904577427313913,
"learning_rate": 5.149787910922588e-05,
"loss": 0.6166,
"step": 3760
},
{
"epoch": 0.4931972789115646,
"grad_norm": 0.5128070606528945,
"learning_rate": 5.1365323435843063e-05,
"loss": 0.8471,
"step": 3770
},
{
"epoch": 0.4945054945054945,
"grad_norm": 0.5725171611100351,
"learning_rate": 5.123276776246023e-05,
"loss": 0.6287,
"step": 3780
},
{
"epoch": 0.4958137100994244,
"grad_norm": 0.539320741587729,
"learning_rate": 5.1100212089077415e-05,
"loss": 0.824,
"step": 3790
},
{
"epoch": 0.4971219256933543,
"grad_norm": 0.4028412015043289,
"learning_rate": 5.0967656415694595e-05,
"loss": 0.5806,
"step": 3800
},
{
"epoch": 0.49843014128728413,
"grad_norm": 0.5389621790607799,
"learning_rate": 5.083510074231177e-05,
"loss": 0.8308,
"step": 3810
},
{
"epoch": 0.49973835688121404,
"grad_norm": 0.6011860291653554,
"learning_rate": 5.0702545068928954e-05,
"loss": 0.6003,
"step": 3820
},
{
"epoch": 0.501046572475144,
"grad_norm": 0.5389080519884727,
"learning_rate": 5.056998939554613e-05,
"loss": 0.8276,
"step": 3830
},
{
"epoch": 0.5023547880690737,
"grad_norm": 0.6088518124424828,
"learning_rate": 5.0437433722163306e-05,
"loss": 0.6195,
"step": 3840
},
{
"epoch": 0.5036630036630036,
"grad_norm": 0.6515108951670922,
"learning_rate": 5.030487804878049e-05,
"loss": 0.8051,
"step": 3850
},
{
"epoch": 0.5049712192569336,
"grad_norm": 0.4313425003620371,
"learning_rate": 5.017232237539767e-05,
"loss": 0.6071,
"step": 3860
},
{
"epoch": 0.5062794348508635,
"grad_norm": 0.5264707278305082,
"learning_rate": 5.0039766702014844e-05,
"loss": 0.8364,
"step": 3870
},
{
"epoch": 0.5075876504447933,
"grad_norm": 0.7048581680415387,
"learning_rate": 4.990721102863203e-05,
"loss": 0.6049,
"step": 3880
},
{
"epoch": 0.5088958660387232,
"grad_norm": 0.5055653926619902,
"learning_rate": 4.97746553552492e-05,
"loss": 0.8285,
"step": 3890
},
{
"epoch": 0.5102040816326531,
"grad_norm": 0.5437489221882642,
"learning_rate": 4.964209968186639e-05,
"loss": 0.5851,
"step": 3900
},
{
"epoch": 0.511512297226583,
"grad_norm": 0.5503188602842589,
"learning_rate": 4.950954400848357e-05,
"loss": 0.8251,
"step": 3910
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.5699382425924778,
"learning_rate": 4.937698833510074e-05,
"loss": 0.6065,
"step": 3920
},
{
"epoch": 0.5141287284144427,
"grad_norm": 0.5601404210013031,
"learning_rate": 4.924443266171793e-05,
"loss": 0.8332,
"step": 3930
},
{
"epoch": 0.5154369440083726,
"grad_norm": 0.6107160876812969,
"learning_rate": 4.9111876988335106e-05,
"loss": 0.6407,
"step": 3940
},
{
"epoch": 0.5167451596023025,
"grad_norm": 0.5627086115947509,
"learning_rate": 4.897932131495228e-05,
"loss": 0.8524,
"step": 3950
},
{
"epoch": 0.5180533751962323,
"grad_norm": 0.47974680429027505,
"learning_rate": 4.8846765641569465e-05,
"loss": 0.5891,
"step": 3960
},
{
"epoch": 0.5193615907901622,
"grad_norm": 0.6078144910339062,
"learning_rate": 4.8714209968186645e-05,
"loss": 0.8207,
"step": 3970
},
{
"epoch": 0.5206698063840921,
"grad_norm": 0.6828182233693575,
"learning_rate": 4.858165429480382e-05,
"loss": 0.6234,
"step": 3980
},
{
"epoch": 0.521978021978022,
"grad_norm": 0.5078577809608877,
"learning_rate": 4.8449098621421e-05,
"loss": 0.8293,
"step": 3990
},
{
"epoch": 0.5232862375719518,
"grad_norm": 0.501493452942641,
"learning_rate": 4.8316542948038176e-05,
"loss": 0.5924,
"step": 4000
},
{
"epoch": 0.5245944531658817,
"grad_norm": 0.5914460419911263,
"learning_rate": 4.8183987274655355e-05,
"loss": 0.8204,
"step": 4010
},
{
"epoch": 0.5259026687598116,
"grad_norm": 0.5108518700290716,
"learning_rate": 4.8051431601272535e-05,
"loss": 0.5938,
"step": 4020
},
{
"epoch": 0.5272108843537415,
"grad_norm": 0.5931433550467333,
"learning_rate": 4.7918875927889714e-05,
"loss": 0.8405,
"step": 4030
},
{
"epoch": 0.5285190999476713,
"grad_norm": 0.6193186586877497,
"learning_rate": 4.7786320254506894e-05,
"loss": 0.5813,
"step": 4040
},
{
"epoch": 0.5298273155416012,
"grad_norm": 0.5380628082146289,
"learning_rate": 4.765376458112407e-05,
"loss": 0.8669,
"step": 4050
},
{
"epoch": 0.5311355311355311,
"grad_norm": 0.5763128333912579,
"learning_rate": 4.752120890774125e-05,
"loss": 0.6076,
"step": 4060
},
{
"epoch": 0.532443746729461,
"grad_norm": 0.5145862547581488,
"learning_rate": 4.738865323435843e-05,
"loss": 0.8424,
"step": 4070
},
{
"epoch": 0.533751962323391,
"grad_norm": 0.5664958788683205,
"learning_rate": 4.725609756097561e-05,
"loss": 0.6293,
"step": 4080
},
{
"epoch": 0.5350601779173207,
"grad_norm": 0.5292223052367038,
"learning_rate": 4.712354188759279e-05,
"loss": 0.8173,
"step": 4090
},
{
"epoch": 0.5363683935112507,
"grad_norm": 0.58673619277203,
"learning_rate": 4.699098621420997e-05,
"loss": 0.5763,
"step": 4100
},
{
"epoch": 0.5376766091051806,
"grad_norm": 0.5594174176545202,
"learning_rate": 4.685843054082715e-05,
"loss": 0.8028,
"step": 4110
},
{
"epoch": 0.5389848246991105,
"grad_norm": 0.6339559869110518,
"learning_rate": 4.672587486744433e-05,
"loss": 0.6053,
"step": 4120
},
{
"epoch": 0.5402930402930403,
"grad_norm": 0.671405553815309,
"learning_rate": 4.659331919406151e-05,
"loss": 0.8398,
"step": 4130
},
{
"epoch": 0.5416012558869702,
"grad_norm": 0.5445580424726075,
"learning_rate": 4.646076352067869e-05,
"loss": 0.621,
"step": 4140
},
{
"epoch": 0.5429094714809001,
"grad_norm": 0.5938642782111705,
"learning_rate": 4.632820784729587e-05,
"loss": 0.8295,
"step": 4150
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.6435152678972964,
"learning_rate": 4.6195652173913046e-05,
"loss": 0.618,
"step": 4160
},
{
"epoch": 0.5455259026687598,
"grad_norm": 0.5089883798718602,
"learning_rate": 4.6063096500530226e-05,
"loss": 0.8135,
"step": 4170
},
{
"epoch": 0.5468341182626897,
"grad_norm": 0.5714660790898384,
"learning_rate": 4.5930540827147405e-05,
"loss": 0.622,
"step": 4180
},
{
"epoch": 0.5481423338566196,
"grad_norm": 0.5423779003965784,
"learning_rate": 4.5797985153764584e-05,
"loss": 0.8011,
"step": 4190
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.5305546579845897,
"learning_rate": 4.5665429480381764e-05,
"loss": 0.5792,
"step": 4200
},
{
"epoch": 0.5507587650444793,
"grad_norm": 0.5057565979180018,
"learning_rate": 4.553287380699894e-05,
"loss": 0.8326,
"step": 4210
},
{
"epoch": 0.5520669806384092,
"grad_norm": 0.5108055603767078,
"learning_rate": 4.540031813361612e-05,
"loss": 0.6014,
"step": 4220
},
{
"epoch": 0.5533751962323391,
"grad_norm": 0.5358448167728354,
"learning_rate": 4.5267762460233295e-05,
"loss": 0.8152,
"step": 4230
},
{
"epoch": 0.554683411826269,
"grad_norm": 0.7719427855271279,
"learning_rate": 4.513520678685048e-05,
"loss": 0.5967,
"step": 4240
},
{
"epoch": 0.5559916274201988,
"grad_norm": 0.6513813945282019,
"learning_rate": 4.500265111346766e-05,
"loss": 0.8133,
"step": 4250
},
{
"epoch": 0.5572998430141287,
"grad_norm": 0.611360841284405,
"learning_rate": 4.487009544008483e-05,
"loss": 0.6109,
"step": 4260
},
{
"epoch": 0.5586080586080586,
"grad_norm": 0.5122207195541725,
"learning_rate": 4.473753976670202e-05,
"loss": 0.8237,
"step": 4270
},
{
"epoch": 0.5599162742019885,
"grad_norm": 0.47024973003128884,
"learning_rate": 4.46049840933192e-05,
"loss": 0.6308,
"step": 4280
},
{
"epoch": 0.5612244897959183,
"grad_norm": 0.5605094944984179,
"learning_rate": 4.447242841993637e-05,
"loss": 0.8664,
"step": 4290
},
{
"epoch": 0.5625327053898482,
"grad_norm": 0.6555414150243584,
"learning_rate": 4.433987274655356e-05,
"loss": 0.6108,
"step": 4300
},
{
"epoch": 0.5638409209837781,
"grad_norm": 0.4937828454850159,
"learning_rate": 4.420731707317074e-05,
"loss": 0.8288,
"step": 4310
},
{
"epoch": 0.565149136577708,
"grad_norm": 0.5949648171644456,
"learning_rate": 4.407476139978791e-05,
"loss": 0.5755,
"step": 4320
},
{
"epoch": 0.5664573521716378,
"grad_norm": 0.6150493489271065,
"learning_rate": 4.3942205726405096e-05,
"loss": 0.8562,
"step": 4330
},
{
"epoch": 0.5677655677655677,
"grad_norm": 0.5168321021587649,
"learning_rate": 4.3809650053022275e-05,
"loss": 0.5859,
"step": 4340
},
{
"epoch": 0.5690737833594977,
"grad_norm": 0.5149672746596766,
"learning_rate": 4.367709437963945e-05,
"loss": 0.8035,
"step": 4350
},
{
"epoch": 0.5703819989534276,
"grad_norm": 0.6254032996036644,
"learning_rate": 4.3544538706256634e-05,
"loss": 0.6081,
"step": 4360
},
{
"epoch": 0.5716902145473574,
"grad_norm": 0.5439410227795257,
"learning_rate": 4.341198303287381e-05,
"loss": 0.8486,
"step": 4370
},
{
"epoch": 0.5729984301412873,
"grad_norm": 0.6989019848301199,
"learning_rate": 4.3279427359490986e-05,
"loss": 0.5977,
"step": 4380
},
{
"epoch": 0.5743066457352172,
"grad_norm": 0.5253254730813908,
"learning_rate": 4.314687168610817e-05,
"loss": 0.8166,
"step": 4390
},
{
"epoch": 0.5756148613291471,
"grad_norm": 0.6306757564007922,
"learning_rate": 4.3014316012725345e-05,
"loss": 0.6037,
"step": 4400
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.457131073575816,
"learning_rate": 4.2881760339342524e-05,
"loss": 0.8058,
"step": 4410
},
{
"epoch": 0.5782312925170068,
"grad_norm": 0.6651032212615784,
"learning_rate": 4.274920466595971e-05,
"loss": 0.577,
"step": 4420
},
{
"epoch": 0.5795395081109367,
"grad_norm": 0.5052314086043747,
"learning_rate": 4.261664899257688e-05,
"loss": 0.8133,
"step": 4430
},
{
"epoch": 0.5808477237048666,
"grad_norm": 0.6345651754805625,
"learning_rate": 4.248409331919406e-05,
"loss": 0.5862,
"step": 4440
},
{
"epoch": 0.5821559392987964,
"grad_norm": 0.5470901361744871,
"learning_rate": 4.235153764581124e-05,
"loss": 0.82,
"step": 4450
},
{
"epoch": 0.5834641548927263,
"grad_norm": 0.6148840090122306,
"learning_rate": 4.221898197242842e-05,
"loss": 0.5963,
"step": 4460
},
{
"epoch": 0.5847723704866562,
"grad_norm": 0.5654198812120623,
"learning_rate": 4.20864262990456e-05,
"loss": 0.8098,
"step": 4470
},
{
"epoch": 0.5860805860805861,
"grad_norm": 0.8759288696982239,
"learning_rate": 4.195387062566278e-05,
"loss": 0.6222,
"step": 4480
},
{
"epoch": 0.587388801674516,
"grad_norm": 0.479216838198247,
"learning_rate": 4.182131495227996e-05,
"loss": 0.8338,
"step": 4490
},
{
"epoch": 0.5886970172684458,
"grad_norm": 0.7698717669406883,
"learning_rate": 4.168875927889714e-05,
"loss": 0.6278,
"step": 4500
},
{
"epoch": 0.5900052328623757,
"grad_norm": 0.48674948133579515,
"learning_rate": 4.155620360551432e-05,
"loss": 0.7887,
"step": 4510
},
{
"epoch": 0.5913134484563056,
"grad_norm": 0.5786002565311555,
"learning_rate": 4.14236479321315e-05,
"loss": 0.5891,
"step": 4520
},
{
"epoch": 0.5926216640502355,
"grad_norm": 0.5370485226818142,
"learning_rate": 4.129109225874868e-05,
"loss": 0.8276,
"step": 4530
},
{
"epoch": 0.5939298796441653,
"grad_norm": 0.43625907655336543,
"learning_rate": 4.1158536585365856e-05,
"loss": 0.5856,
"step": 4540
},
{
"epoch": 0.5952380952380952,
"grad_norm": 0.5266268760596537,
"learning_rate": 4.1025980911983036e-05,
"loss": 0.7964,
"step": 4550
},
{
"epoch": 0.5965463108320251,
"grad_norm": 0.5771002724293784,
"learning_rate": 4.0893425238600215e-05,
"loss": 0.5854,
"step": 4560
},
{
"epoch": 0.597854526425955,
"grad_norm": 0.5919954827087538,
"learning_rate": 4.076086956521739e-05,
"loss": 0.8034,
"step": 4570
},
{
"epoch": 0.5991627420198848,
"grad_norm": 0.5680838313723001,
"learning_rate": 4.0628313891834574e-05,
"loss": 0.5926,
"step": 4580
},
{
"epoch": 0.6004709576138147,
"grad_norm": 0.6715456913252982,
"learning_rate": 4.049575821845175e-05,
"loss": 0.8389,
"step": 4590
},
{
"epoch": 0.6017791732077447,
"grad_norm": 0.5499015722260664,
"learning_rate": 4.0363202545068926e-05,
"loss": 0.6022,
"step": 4600
},
{
"epoch": 0.6030873888016746,
"grad_norm": 0.7560936310398005,
"learning_rate": 4.023064687168611e-05,
"loss": 0.8014,
"step": 4610
},
{
"epoch": 0.6043956043956044,
"grad_norm": 0.5816663839566236,
"learning_rate": 4.009809119830329e-05,
"loss": 0.6037,
"step": 4620
},
{
"epoch": 0.6057038199895343,
"grad_norm": 0.4954065888150648,
"learning_rate": 3.9965535524920464e-05,
"loss": 0.8123,
"step": 4630
},
{
"epoch": 0.6070120355834642,
"grad_norm": 0.6634722345039996,
"learning_rate": 3.983297985153765e-05,
"loss": 0.6093,
"step": 4640
},
{
"epoch": 0.6083202511773941,
"grad_norm": 0.5448340011567387,
"learning_rate": 3.970042417815483e-05,
"loss": 0.8391,
"step": 4650
},
{
"epoch": 0.6096284667713239,
"grad_norm": 0.7234298679122729,
"learning_rate": 3.9567868504772e-05,
"loss": 0.6123,
"step": 4660
},
{
"epoch": 0.6109366823652538,
"grad_norm": 0.7113516275751068,
"learning_rate": 3.943531283138919e-05,
"loss": 0.8188,
"step": 4670
},
{
"epoch": 0.6122448979591837,
"grad_norm": 0.5343485382635467,
"learning_rate": 3.930275715800637e-05,
"loss": 0.6025,
"step": 4680
},
{
"epoch": 0.6135531135531136,
"grad_norm": 0.574688006845476,
"learning_rate": 3.917020148462354e-05,
"loss": 0.8244,
"step": 4690
},
{
"epoch": 0.6148613291470434,
"grad_norm": 0.5954437693210181,
"learning_rate": 3.9037645811240727e-05,
"loss": 0.5976,
"step": 4700
},
{
"epoch": 0.6161695447409733,
"grad_norm": 0.6359916685979564,
"learning_rate": 3.89050901378579e-05,
"loss": 0.804,
"step": 4710
},
{
"epoch": 0.6174777603349032,
"grad_norm": 0.37864818768935987,
"learning_rate": 3.877253446447508e-05,
"loss": 0.5819,
"step": 4720
},
{
"epoch": 0.6187859759288331,
"grad_norm": 0.6317553124676056,
"learning_rate": 3.8639978791092265e-05,
"loss": 0.8275,
"step": 4730
},
{
"epoch": 0.6200941915227629,
"grad_norm": 0.37899782241778535,
"learning_rate": 3.850742311770944e-05,
"loss": 0.5939,
"step": 4740
},
{
"epoch": 0.6214024071166928,
"grad_norm": 0.5096002461412386,
"learning_rate": 3.837486744432662e-05,
"loss": 0.8152,
"step": 4750
},
{
"epoch": 0.6227106227106227,
"grad_norm": 0.5858186125506725,
"learning_rate": 3.82423117709438e-05,
"loss": 0.5971,
"step": 4760
},
{
"epoch": 0.6240188383045526,
"grad_norm": 0.5258202103911215,
"learning_rate": 3.8109756097560976e-05,
"loss": 0.8146,
"step": 4770
},
{
"epoch": 0.6253270538984824,
"grad_norm": 0.59652515926618,
"learning_rate": 3.7977200424178155e-05,
"loss": 0.6034,
"step": 4780
},
{
"epoch": 0.6266352694924123,
"grad_norm": 0.692241221647863,
"learning_rate": 3.784464475079534e-05,
"loss": 0.817,
"step": 4790
},
{
"epoch": 0.6279434850863422,
"grad_norm": 0.5751964849585177,
"learning_rate": 3.7712089077412514e-05,
"loss": 0.5948,
"step": 4800
},
{
"epoch": 0.6292517006802721,
"grad_norm": 0.4867932739502536,
"learning_rate": 3.757953340402969e-05,
"loss": 0.7895,
"step": 4810
},
{
"epoch": 0.6305599162742019,
"grad_norm": 0.6527891870524758,
"learning_rate": 3.744697773064688e-05,
"loss": 0.5869,
"step": 4820
},
{
"epoch": 0.6318681318681318,
"grad_norm": 0.542994791536692,
"learning_rate": 3.731442205726405e-05,
"loss": 0.8016,
"step": 4830
},
{
"epoch": 0.6331763474620618,
"grad_norm": 0.49091636854896203,
"learning_rate": 3.718186638388123e-05,
"loss": 0.591,
"step": 4840
},
{
"epoch": 0.6344845630559917,
"grad_norm": 0.4808333120684155,
"learning_rate": 3.704931071049841e-05,
"loss": 0.8374,
"step": 4850
},
{
"epoch": 0.6357927786499215,
"grad_norm": 0.609783391275745,
"learning_rate": 3.691675503711559e-05,
"loss": 0.5727,
"step": 4860
},
{
"epoch": 0.6371009942438514,
"grad_norm": 0.4901032831365679,
"learning_rate": 3.678419936373277e-05,
"loss": 0.8326,
"step": 4870
},
{
"epoch": 0.6384092098377813,
"grad_norm": 0.5535614493168636,
"learning_rate": 3.665164369034995e-05,
"loss": 0.587,
"step": 4880
},
{
"epoch": 0.6397174254317112,
"grad_norm": 0.5661579063549987,
"learning_rate": 3.651908801696713e-05,
"loss": 0.8006,
"step": 4890
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.5755165016330849,
"learning_rate": 3.638653234358431e-05,
"loss": 0.5548,
"step": 4900
},
{
"epoch": 0.6423338566195709,
"grad_norm": 0.5195028269557372,
"learning_rate": 3.625397667020149e-05,
"loss": 0.8411,
"step": 4910
},
{
"epoch": 0.6436420722135008,
"grad_norm": 0.595652162086749,
"learning_rate": 3.6121420996818666e-05,
"loss": 0.5966,
"step": 4920
},
{
"epoch": 0.6449502878074307,
"grad_norm": 0.5243990905265108,
"learning_rate": 3.5988865323435846e-05,
"loss": 0.8298,
"step": 4930
},
{
"epoch": 0.6462585034013606,
"grad_norm": 0.7940136105161147,
"learning_rate": 3.585630965005302e-05,
"loss": 0.6309,
"step": 4940
},
{
"epoch": 0.6475667189952904,
"grad_norm": 0.4909267588948854,
"learning_rate": 3.5723753976670205e-05,
"loss": 0.7967,
"step": 4950
},
{
"epoch": 0.6488749345892203,
"grad_norm": 0.6021461164458334,
"learning_rate": 3.5591198303287384e-05,
"loss": 0.5946,
"step": 4960
},
{
"epoch": 0.6501831501831502,
"grad_norm": 0.5082946197773784,
"learning_rate": 3.545864262990456e-05,
"loss": 0.7959,
"step": 4970
},
{
"epoch": 0.6514913657770801,
"grad_norm": 0.6887923783738013,
"learning_rate": 3.532608695652174e-05,
"loss": 0.6178,
"step": 4980
},
{
"epoch": 0.6527995813710099,
"grad_norm": 0.4863608457352857,
"learning_rate": 3.519353128313892e-05,
"loss": 0.8218,
"step": 4990
},
{
"epoch": 0.6541077969649398,
"grad_norm": 0.6370508452362942,
"learning_rate": 3.5060975609756095e-05,
"loss": 0.5726,
"step": 5000
},
{
"epoch": 0.6554160125588697,
"grad_norm": 0.5626726082078685,
"learning_rate": 3.492841993637328e-05,
"loss": 0.8388,
"step": 5010
},
{
"epoch": 0.6567242281527996,
"grad_norm": 0.6058799396796493,
"learning_rate": 3.479586426299046e-05,
"loss": 0.6345,
"step": 5020
},
{
"epoch": 0.6580324437467294,
"grad_norm": 0.5486653470890362,
"learning_rate": 3.466330858960763e-05,
"loss": 0.8304,
"step": 5030
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.7402614360383322,
"learning_rate": 3.453075291622482e-05,
"loss": 0.5972,
"step": 5040
},
{
"epoch": 0.6606488749345892,
"grad_norm": 0.5065684112904657,
"learning_rate": 3.4398197242842e-05,
"loss": 0.8049,
"step": 5050
},
{
"epoch": 0.6619570905285191,
"grad_norm": 0.4487165281422432,
"learning_rate": 3.426564156945917e-05,
"loss": 0.5813,
"step": 5060
},
{
"epoch": 0.6632653061224489,
"grad_norm": 0.5576418936555791,
"learning_rate": 3.413308589607636e-05,
"loss": 0.85,
"step": 5070
},
{
"epoch": 0.6645735217163788,
"grad_norm": 0.5629552965222027,
"learning_rate": 3.400053022269353e-05,
"loss": 0.6144,
"step": 5080
},
{
"epoch": 0.6658817373103088,
"grad_norm": 0.5340051545534295,
"learning_rate": 3.386797454931071e-05,
"loss": 0.8256,
"step": 5090
},
{
"epoch": 0.6671899529042387,
"grad_norm": 0.6481929473760202,
"learning_rate": 3.3735418875927896e-05,
"loss": 0.5934,
"step": 5100
},
{
"epoch": 0.6684981684981685,
"grad_norm": 0.586068083609296,
"learning_rate": 3.360286320254507e-05,
"loss": 0.8065,
"step": 5110
},
{
"epoch": 0.6698063840920984,
"grad_norm": 0.5898674035419238,
"learning_rate": 3.347030752916225e-05,
"loss": 0.6332,
"step": 5120
},
{
"epoch": 0.6711145996860283,
"grad_norm": 0.5271996892541019,
"learning_rate": 3.3337751855779434e-05,
"loss": 0.8407,
"step": 5130
},
{
"epoch": 0.6724228152799582,
"grad_norm": 0.7209460794111061,
"learning_rate": 3.3205196182396606e-05,
"loss": 0.5954,
"step": 5140
},
{
"epoch": 0.673731030873888,
"grad_norm": 0.5419491953310692,
"learning_rate": 3.3072640509013786e-05,
"loss": 0.8186,
"step": 5150
},
{
"epoch": 0.6750392464678179,
"grad_norm": 0.6363503952339683,
"learning_rate": 3.294008483563097e-05,
"loss": 0.6252,
"step": 5160
},
{
"epoch": 0.6763474620617478,
"grad_norm": 0.5479539766686561,
"learning_rate": 3.2807529162248144e-05,
"loss": 0.8248,
"step": 5170
},
{
"epoch": 0.6776556776556777,
"grad_norm": 0.5535849243715827,
"learning_rate": 3.2674973488865324e-05,
"loss": 0.5717,
"step": 5180
},
{
"epoch": 0.6789638932496075,
"grad_norm": 0.5091172962824471,
"learning_rate": 3.25424178154825e-05,
"loss": 0.8203,
"step": 5190
},
{
"epoch": 0.6802721088435374,
"grad_norm": 0.562594433405121,
"learning_rate": 3.240986214209968e-05,
"loss": 0.6089,
"step": 5200
},
{
"epoch": 0.6815803244374673,
"grad_norm": 0.5517130562661668,
"learning_rate": 3.227730646871686e-05,
"loss": 0.8446,
"step": 5210
},
{
"epoch": 0.6828885400313972,
"grad_norm": 0.4047148337025719,
"learning_rate": 3.214475079533404e-05,
"loss": 0.5894,
"step": 5220
},
{
"epoch": 0.684196755625327,
"grad_norm": 0.5239322456435999,
"learning_rate": 3.201219512195122e-05,
"loss": 0.8321,
"step": 5230
},
{
"epoch": 0.6855049712192569,
"grad_norm": 0.5498585626284401,
"learning_rate": 3.18796394485684e-05,
"loss": 0.6279,
"step": 5240
},
{
"epoch": 0.6868131868131868,
"grad_norm": 0.48743593460492207,
"learning_rate": 3.174708377518558e-05,
"loss": 0.8242,
"step": 5250
},
{
"epoch": 0.6881214024071167,
"grad_norm": 0.5651556275625538,
"learning_rate": 3.161452810180276e-05,
"loss": 0.6286,
"step": 5260
},
{
"epoch": 0.6894296180010465,
"grad_norm": 0.5754174898482404,
"learning_rate": 3.148197242841994e-05,
"loss": 0.8177,
"step": 5270
},
{
"epoch": 0.6907378335949764,
"grad_norm": 0.6605607744635831,
"learning_rate": 3.134941675503712e-05,
"loss": 0.597,
"step": 5280
},
{
"epoch": 0.6920460491889063,
"grad_norm": 0.5228692951705382,
"learning_rate": 3.12168610816543e-05,
"loss": 0.7984,
"step": 5290
},
{
"epoch": 0.6933542647828362,
"grad_norm": 0.4480430717512152,
"learning_rate": 3.1084305408271477e-05,
"loss": 0.6067,
"step": 5300
},
{
"epoch": 0.6946624803767661,
"grad_norm": 0.5638626523175552,
"learning_rate": 3.095174973488865e-05,
"loss": 0.8093,
"step": 5310
},
{
"epoch": 0.6959706959706959,
"grad_norm": 0.5946989663976191,
"learning_rate": 3.0819194061505835e-05,
"loss": 0.59,
"step": 5320
},
{
"epoch": 0.6972789115646258,
"grad_norm": 0.5671263430143716,
"learning_rate": 3.0686638388123015e-05,
"loss": 0.8009,
"step": 5330
},
{
"epoch": 0.6985871271585558,
"grad_norm": 0.589042697181555,
"learning_rate": 3.055408271474019e-05,
"loss": 0.5782,
"step": 5340
},
{
"epoch": 0.6998953427524857,
"grad_norm": 0.7073787068267711,
"learning_rate": 3.0421527041357374e-05,
"loss": 0.7989,
"step": 5350
},
{
"epoch": 0.7012035583464155,
"grad_norm": 0.5352877950074024,
"learning_rate": 3.028897136797455e-05,
"loss": 0.6068,
"step": 5360
},
{
"epoch": 0.7025117739403454,
"grad_norm": 0.5346918859139157,
"learning_rate": 3.015641569459173e-05,
"loss": 0.8129,
"step": 5370
},
{
"epoch": 0.7038199895342753,
"grad_norm": 0.48063163414005916,
"learning_rate": 3.0023860021208912e-05,
"loss": 0.561,
"step": 5380
},
{
"epoch": 0.7051282051282052,
"grad_norm": 0.5431033944364678,
"learning_rate": 2.9891304347826088e-05,
"loss": 0.8314,
"step": 5390
},
{
"epoch": 0.706436420722135,
"grad_norm": 0.6251701992093956,
"learning_rate": 2.9758748674443264e-05,
"loss": 0.6043,
"step": 5400
},
{
"epoch": 0.7077446363160649,
"grad_norm": 0.538107678456505,
"learning_rate": 2.962619300106045e-05,
"loss": 0.8345,
"step": 5410
},
{
"epoch": 0.7090528519099948,
"grad_norm": 0.5596293446673235,
"learning_rate": 2.9493637327677626e-05,
"loss": 0.5767,
"step": 5420
},
{
"epoch": 0.7103610675039247,
"grad_norm": 0.6302036834487577,
"learning_rate": 2.9361081654294802e-05,
"loss": 0.817,
"step": 5430
},
{
"epoch": 0.7116692830978545,
"grad_norm": 0.6442132298338353,
"learning_rate": 2.9228525980911985e-05,
"loss": 0.5956,
"step": 5440
},
{
"epoch": 0.7129774986917844,
"grad_norm": 0.5294060275695159,
"learning_rate": 2.9095970307529164e-05,
"loss": 0.8086,
"step": 5450
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.5780709508327503,
"learning_rate": 2.896341463414634e-05,
"loss": 0.599,
"step": 5460
},
{
"epoch": 0.7155939298796442,
"grad_norm": 0.5178595179064079,
"learning_rate": 2.8830858960763523e-05,
"loss": 0.8023,
"step": 5470
},
{
"epoch": 0.716902145473574,
"grad_norm": 0.4908347612839904,
"learning_rate": 2.8698303287380702e-05,
"loss": 0.5906,
"step": 5480
},
{
"epoch": 0.7182103610675039,
"grad_norm": 0.5745266376669839,
"learning_rate": 2.8565747613997878e-05,
"loss": 0.8017,
"step": 5490
},
{
"epoch": 0.7195185766614338,
"grad_norm": 0.7091252655798257,
"learning_rate": 2.843319194061506e-05,
"loss": 0.6165,
"step": 5500
},
{
"epoch": 0.7208267922553637,
"grad_norm": 0.5082645638178946,
"learning_rate": 2.830063626723224e-05,
"loss": 0.7916,
"step": 5510
},
{
"epoch": 0.7221350078492935,
"grad_norm": 0.8135604062724642,
"learning_rate": 2.8168080593849416e-05,
"loss": 0.5766,
"step": 5520
},
{
"epoch": 0.7234432234432234,
"grad_norm": 0.5344352881317546,
"learning_rate": 2.80355249204666e-05,
"loss": 0.7945,
"step": 5530
},
{
"epoch": 0.7247514390371533,
"grad_norm": 0.6213640188906101,
"learning_rate": 2.7902969247083775e-05,
"loss": 0.5739,
"step": 5540
},
{
"epoch": 0.7260596546310832,
"grad_norm": 0.5133786932833476,
"learning_rate": 2.7770413573700955e-05,
"loss": 0.8175,
"step": 5550
},
{
"epoch": 0.727367870225013,
"grad_norm": 0.5687616103525199,
"learning_rate": 2.7637857900318137e-05,
"loss": 0.5946,
"step": 5560
},
{
"epoch": 0.7286760858189429,
"grad_norm": 0.5361603157753395,
"learning_rate": 2.7505302226935313e-05,
"loss": 0.8331,
"step": 5570
},
{
"epoch": 0.7299843014128728,
"grad_norm": 0.4846643967669185,
"learning_rate": 2.7372746553552493e-05,
"loss": 0.5846,
"step": 5580
},
{
"epoch": 0.7312925170068028,
"grad_norm": 0.5264633004571062,
"learning_rate": 2.7240190880169676e-05,
"loss": 0.7929,
"step": 5590
},
{
"epoch": 0.7326007326007326,
"grad_norm": 0.5169934066982514,
"learning_rate": 2.710763520678685e-05,
"loss": 0.5812,
"step": 5600
},
{
"epoch": 0.7339089481946625,
"grad_norm": 0.49516725051064175,
"learning_rate": 2.697507953340403e-05,
"loss": 0.8268,
"step": 5610
},
{
"epoch": 0.7352171637885924,
"grad_norm": 0.5545634117837589,
"learning_rate": 2.6842523860021214e-05,
"loss": 0.6293,
"step": 5620
},
{
"epoch": 0.7365253793825223,
"grad_norm": 0.5230373403077291,
"learning_rate": 2.670996818663839e-05,
"loss": 0.8166,
"step": 5630
},
{
"epoch": 0.7378335949764521,
"grad_norm": 0.599453616763764,
"learning_rate": 2.6577412513255566e-05,
"loss": 0.5906,
"step": 5640
},
{
"epoch": 0.739141810570382,
"grad_norm": 0.5973016920910486,
"learning_rate": 2.6444856839872752e-05,
"loss": 0.8012,
"step": 5650
},
{
"epoch": 0.7404500261643119,
"grad_norm": 0.6449099450136865,
"learning_rate": 2.6312301166489928e-05,
"loss": 0.5705,
"step": 5660
},
{
"epoch": 0.7417582417582418,
"grad_norm": 0.553182572295564,
"learning_rate": 2.6179745493107104e-05,
"loss": 0.8191,
"step": 5670
},
{
"epoch": 0.7430664573521716,
"grad_norm": 0.4476262335389087,
"learning_rate": 2.6047189819724287e-05,
"loss": 0.5958,
"step": 5680
},
{
"epoch": 0.7443746729461015,
"grad_norm": 0.4834161167587533,
"learning_rate": 2.5914634146341466e-05,
"loss": 0.8144,
"step": 5690
},
{
"epoch": 0.7456828885400314,
"grad_norm": 0.5286677625170841,
"learning_rate": 2.5782078472958642e-05,
"loss": 0.5816,
"step": 5700
},
{
"epoch": 0.7469911041339613,
"grad_norm": 0.5413842703568794,
"learning_rate": 2.5649522799575825e-05,
"loss": 0.8312,
"step": 5710
},
{
"epoch": 0.7482993197278912,
"grad_norm": 0.6786517057597097,
"learning_rate": 2.5516967126193004e-05,
"loss": 0.6086,
"step": 5720
},
{
"epoch": 0.749607535321821,
"grad_norm": 0.49742040117830627,
"learning_rate": 2.538441145281018e-05,
"loss": 0.7967,
"step": 5730
},
{
"epoch": 0.7509157509157509,
"grad_norm": 0.49256559696291485,
"learning_rate": 2.525185577942736e-05,
"loss": 0.5724,
"step": 5740
},
{
"epoch": 0.7522239665096808,
"grad_norm": 0.5263192680418415,
"learning_rate": 2.5119300106044542e-05,
"loss": 0.8011,
"step": 5750
},
{
"epoch": 0.7535321821036107,
"grad_norm": 0.6716690763876726,
"learning_rate": 2.498674443266172e-05,
"loss": 0.601,
"step": 5760
},
{
"epoch": 0.7548403976975405,
"grad_norm": 0.5268203857733461,
"learning_rate": 2.4854188759278898e-05,
"loss": 0.8224,
"step": 5770
},
{
"epoch": 0.7561486132914704,
"grad_norm": 0.6545344595309585,
"learning_rate": 2.4721633085896077e-05,
"loss": 0.5883,
"step": 5780
},
{
"epoch": 0.7574568288854003,
"grad_norm": 0.5192641904709395,
"learning_rate": 2.4589077412513257e-05,
"loss": 0.8223,
"step": 5790
},
{
"epoch": 0.7587650444793302,
"grad_norm": 0.5569056177071494,
"learning_rate": 2.4456521739130436e-05,
"loss": 0.5882,
"step": 5800
},
{
"epoch": 0.76007326007326,
"grad_norm": 0.652217104391276,
"learning_rate": 2.4323966065747615e-05,
"loss": 0.8111,
"step": 5810
},
{
"epoch": 0.7613814756671899,
"grad_norm": 0.715865728522808,
"learning_rate": 2.4191410392364795e-05,
"loss": 0.5839,
"step": 5820
},
{
"epoch": 0.7626896912611199,
"grad_norm": 0.4622828984795863,
"learning_rate": 2.4058854718981974e-05,
"loss": 0.8152,
"step": 5830
},
{
"epoch": 0.7639979068550498,
"grad_norm": 0.7105275925193085,
"learning_rate": 2.392629904559915e-05,
"loss": 0.5727,
"step": 5840
},
{
"epoch": 0.7653061224489796,
"grad_norm": 0.5593573119711108,
"learning_rate": 2.3793743372216333e-05,
"loss": 0.787,
"step": 5850
},
{
"epoch": 0.7666143380429095,
"grad_norm": 0.6976481535500858,
"learning_rate": 2.3661187698833512e-05,
"loss": 0.5828,
"step": 5860
},
{
"epoch": 0.7679225536368394,
"grad_norm": 0.5629390717962797,
"learning_rate": 2.352863202545069e-05,
"loss": 0.8119,
"step": 5870
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.8329984172410997,
"learning_rate": 2.3396076352067868e-05,
"loss": 0.5953,
"step": 5880
},
{
"epoch": 0.7705389848246991,
"grad_norm": 0.5054513797033992,
"learning_rate": 2.326352067868505e-05,
"loss": 0.7897,
"step": 5890
},
{
"epoch": 0.771847200418629,
"grad_norm": 0.8042478184879194,
"learning_rate": 2.3130965005302227e-05,
"loss": 0.601,
"step": 5900
},
{
"epoch": 0.7731554160125589,
"grad_norm": 0.5134604404253917,
"learning_rate": 2.2998409331919406e-05,
"loss": 0.825,
"step": 5910
},
{
"epoch": 0.7744636316064888,
"grad_norm": 0.6307033662226441,
"learning_rate": 2.286585365853659e-05,
"loss": 0.559,
"step": 5920
},
{
"epoch": 0.7757718472004186,
"grad_norm": 0.5923006697127559,
"learning_rate": 2.2733297985153765e-05,
"loss": 0.7833,
"step": 5930
},
{
"epoch": 0.7770800627943485,
"grad_norm": 0.7423403377088905,
"learning_rate": 2.2600742311770944e-05,
"loss": 0.5916,
"step": 5940
},
{
"epoch": 0.7783882783882784,
"grad_norm": 0.5275344149548817,
"learning_rate": 2.2468186638388124e-05,
"loss": 0.7857,
"step": 5950
},
{
"epoch": 0.7796964939822083,
"grad_norm": 0.4485788605700081,
"learning_rate": 2.2335630965005303e-05,
"loss": 0.5938,
"step": 5960
},
{
"epoch": 0.7810047095761381,
"grad_norm": 0.45581986333641783,
"learning_rate": 2.2203075291622482e-05,
"loss": 0.789,
"step": 5970
},
{
"epoch": 0.782312925170068,
"grad_norm": 0.7077436921839176,
"learning_rate": 2.2070519618239662e-05,
"loss": 0.5639,
"step": 5980
},
{
"epoch": 0.7836211407639979,
"grad_norm": 0.485078935511288,
"learning_rate": 2.193796394485684e-05,
"loss": 0.7974,
"step": 5990
},
{
"epoch": 0.7849293563579278,
"grad_norm": 0.5415931380682013,
"learning_rate": 2.180540827147402e-05,
"loss": 0.5513,
"step": 6000
},
{
"epoch": 0.7862375719518576,
"grad_norm": 0.6028769503669652,
"learning_rate": 2.16728525980912e-05,
"loss": 0.7859,
"step": 6010
},
{
"epoch": 0.7875457875457875,
"grad_norm": 0.43775607621380236,
"learning_rate": 2.154029692470838e-05,
"loss": 0.586,
"step": 6020
},
{
"epoch": 0.7888540031397174,
"grad_norm": 0.5129027729272807,
"learning_rate": 2.140774125132556e-05,
"loss": 0.8006,
"step": 6030
},
{
"epoch": 0.7901622187336473,
"grad_norm": 0.7986746080445549,
"learning_rate": 2.1275185577942735e-05,
"loss": 0.6124,
"step": 6040
},
{
"epoch": 0.7914704343275771,
"grad_norm": 0.5495341720621897,
"learning_rate": 2.1142629904559917e-05,
"loss": 0.8162,
"step": 6050
},
{
"epoch": 0.792778649921507,
"grad_norm": 0.6565347700150155,
"learning_rate": 2.1010074231177097e-05,
"loss": 0.5797,
"step": 6060
},
{
"epoch": 0.794086865515437,
"grad_norm": 0.49231844511035533,
"learning_rate": 2.0877518557794273e-05,
"loss": 0.815,
"step": 6070
},
{
"epoch": 0.7953950811093669,
"grad_norm": 0.487405981388951,
"learning_rate": 2.0744962884411452e-05,
"loss": 0.6098,
"step": 6080
},
{
"epoch": 0.7967032967032966,
"grad_norm": 0.5688986137956366,
"learning_rate": 2.0612407211028635e-05,
"loss": 0.8328,
"step": 6090
},
{
"epoch": 0.7980115122972266,
"grad_norm": 0.553490629602128,
"learning_rate": 2.047985153764581e-05,
"loss": 0.5689,
"step": 6100
},
{
"epoch": 0.7993197278911565,
"grad_norm": 0.5247897399616922,
"learning_rate": 2.034729586426299e-05,
"loss": 0.809,
"step": 6110
},
{
"epoch": 0.8006279434850864,
"grad_norm": 0.6208953939424018,
"learning_rate": 2.021474019088017e-05,
"loss": 0.5615,
"step": 6120
},
{
"epoch": 0.8019361590790163,
"grad_norm": 0.5020903361827662,
"learning_rate": 2.008218451749735e-05,
"loss": 0.8156,
"step": 6130
},
{
"epoch": 0.8032443746729461,
"grad_norm": 0.5291771623927373,
"learning_rate": 1.994962884411453e-05,
"loss": 0.5884,
"step": 6140
},
{
"epoch": 0.804552590266876,
"grad_norm": 0.5285998915355191,
"learning_rate": 1.9817073170731708e-05,
"loss": 0.8051,
"step": 6150
},
{
"epoch": 0.8058608058608059,
"grad_norm": 0.8178935383618982,
"learning_rate": 1.9684517497348887e-05,
"loss": 0.6138,
"step": 6160
},
{
"epoch": 0.8071690214547358,
"grad_norm": 0.4987221022793129,
"learning_rate": 1.9551961823966067e-05,
"loss": 0.8294,
"step": 6170
},
{
"epoch": 0.8084772370486656,
"grad_norm": 0.6163654350885334,
"learning_rate": 1.9419406150583246e-05,
"loss": 0.5636,
"step": 6180
},
{
"epoch": 0.8097854526425955,
"grad_norm": 0.555469302539477,
"learning_rate": 1.9286850477200426e-05,
"loss": 0.8178,
"step": 6190
},
{
"epoch": 0.8110936682365254,
"grad_norm": 0.6123412415543169,
"learning_rate": 1.9154294803817605e-05,
"loss": 0.5704,
"step": 6200
},
{
"epoch": 0.8124018838304553,
"grad_norm": 0.5124297425086758,
"learning_rate": 1.9021739130434784e-05,
"loss": 0.8193,
"step": 6210
},
{
"epoch": 0.8137100994243851,
"grad_norm": 0.8183903363373162,
"learning_rate": 1.8889183457051964e-05,
"loss": 0.6082,
"step": 6220
},
{
"epoch": 0.815018315018315,
"grad_norm": 0.5990002289487968,
"learning_rate": 1.8756627783669143e-05,
"loss": 0.8463,
"step": 6230
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.5158204377847465,
"learning_rate": 1.8624072110286323e-05,
"loss": 0.5905,
"step": 6240
},
{
"epoch": 0.8176347462061748,
"grad_norm": 0.5071733107919829,
"learning_rate": 1.84915164369035e-05,
"loss": 0.8052,
"step": 6250
},
{
"epoch": 0.8189429618001046,
"grad_norm": 0.7067873685301963,
"learning_rate": 1.835896076352068e-05,
"loss": 0.6175,
"step": 6260
},
{
"epoch": 0.8202511773940345,
"grad_norm": 0.49736879675215423,
"learning_rate": 1.8226405090137857e-05,
"loss": 0.8187,
"step": 6270
},
{
"epoch": 0.8215593929879644,
"grad_norm": 0.6169326476344377,
"learning_rate": 1.8093849416755037e-05,
"loss": 0.5969,
"step": 6280
},
{
"epoch": 0.8228676085818943,
"grad_norm": 0.5358771287992227,
"learning_rate": 1.796129374337222e-05,
"loss": 0.8084,
"step": 6290
},
{
"epoch": 0.8241758241758241,
"grad_norm": 0.6155654512041941,
"learning_rate": 1.7828738069989395e-05,
"loss": 0.5751,
"step": 6300
},
{
"epoch": 0.825484039769754,
"grad_norm": 0.5694555500789946,
"learning_rate": 1.7696182396606575e-05,
"loss": 0.8114,
"step": 6310
},
{
"epoch": 0.826792255363684,
"grad_norm": 0.5299599349376566,
"learning_rate": 1.7563626723223754e-05,
"loss": 0.5815,
"step": 6320
},
{
"epoch": 0.8281004709576139,
"grad_norm": 0.5002411114343089,
"learning_rate": 1.7431071049840934e-05,
"loss": 0.8116,
"step": 6330
},
{
"epoch": 0.8294086865515437,
"grad_norm": 0.6844907765956427,
"learning_rate": 1.7298515376458113e-05,
"loss": 0.5998,
"step": 6340
},
{
"epoch": 0.8307169021454736,
"grad_norm": 0.5784470770219757,
"learning_rate": 1.7165959703075292e-05,
"loss": 0.8211,
"step": 6350
},
{
"epoch": 0.8320251177394035,
"grad_norm": 0.6402598629047223,
"learning_rate": 1.7033404029692472e-05,
"loss": 0.5686,
"step": 6360
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.5486156319155983,
"learning_rate": 1.690084835630965e-05,
"loss": 0.819,
"step": 6370
},
{
"epoch": 0.8346415489272632,
"grad_norm": 0.552758307022236,
"learning_rate": 1.676829268292683e-05,
"loss": 0.5744,
"step": 6380
},
{
"epoch": 0.8359497645211931,
"grad_norm": 0.5129974867018121,
"learning_rate": 1.663573700954401e-05,
"loss": 0.802,
"step": 6390
},
{
"epoch": 0.837257980115123,
"grad_norm": 0.6477174203830427,
"learning_rate": 1.650318133616119e-05,
"loss": 0.604,
"step": 6400
},
{
"epoch": 0.8385661957090529,
"grad_norm": 0.5721470905692986,
"learning_rate": 1.637062566277837e-05,
"loss": 0.7867,
"step": 6410
},
{
"epoch": 0.8398744113029827,
"grad_norm": 0.5613277299496854,
"learning_rate": 1.6238069989395545e-05,
"loss": 0.5995,
"step": 6420
},
{
"epoch": 0.8411826268969126,
"grad_norm": 0.4873586998049457,
"learning_rate": 1.6105514316012728e-05,
"loss": 0.7987,
"step": 6430
},
{
"epoch": 0.8424908424908425,
"grad_norm": 0.6328797394647059,
"learning_rate": 1.5972958642629907e-05,
"loss": 0.5875,
"step": 6440
},
{
"epoch": 0.8437990580847724,
"grad_norm": 0.5065229266101028,
"learning_rate": 1.5840402969247083e-05,
"loss": 0.7832,
"step": 6450
},
{
"epoch": 0.8451072736787022,
"grad_norm": 0.7363086761942835,
"learning_rate": 1.5707847295864266e-05,
"loss": 0.5797,
"step": 6460
},
{
"epoch": 0.8464154892726321,
"grad_norm": 0.5910783809750648,
"learning_rate": 1.5575291622481442e-05,
"loss": 0.8484,
"step": 6470
},
{
"epoch": 0.847723704866562,
"grad_norm": 0.46343031391913037,
"learning_rate": 1.544273594909862e-05,
"loss": 0.5783,
"step": 6480
},
{
"epoch": 0.8490319204604919,
"grad_norm": 0.5168004042862498,
"learning_rate": 1.53101802757158e-05,
"loss": 0.7856,
"step": 6490
},
{
"epoch": 0.8503401360544217,
"grad_norm": 0.7961735878117324,
"learning_rate": 1.517762460233298e-05,
"loss": 0.5835,
"step": 6500
},
{
"epoch": 0.8516483516483516,
"grad_norm": 0.4869428795315839,
"learning_rate": 1.504506892895016e-05,
"loss": 0.7959,
"step": 6510
},
{
"epoch": 0.8529565672422815,
"grad_norm": 0.6051259113895093,
"learning_rate": 1.491251325556734e-05,
"loss": 0.5812,
"step": 6520
},
{
"epoch": 0.8542647828362114,
"grad_norm": 0.48181809369845346,
"learning_rate": 1.4779957582184516e-05,
"loss": 0.7691,
"step": 6530
},
{
"epoch": 0.8555729984301413,
"grad_norm": 0.6165360359930039,
"learning_rate": 1.4647401908801697e-05,
"loss": 0.5906,
"step": 6540
},
{
"epoch": 0.8568812140240711,
"grad_norm": 0.49455764845266237,
"learning_rate": 1.4514846235418877e-05,
"loss": 0.7854,
"step": 6550
},
{
"epoch": 0.858189429618001,
"grad_norm": 0.5284557358740105,
"learning_rate": 1.4382290562036055e-05,
"loss": 0.588,
"step": 6560
},
{
"epoch": 0.859497645211931,
"grad_norm": 0.5292637656987533,
"learning_rate": 1.4249734888653236e-05,
"loss": 0.8016,
"step": 6570
},
{
"epoch": 0.8608058608058609,
"grad_norm": 0.8513431344553715,
"learning_rate": 1.4117179215270415e-05,
"loss": 0.6167,
"step": 6580
},
{
"epoch": 0.8621140763997907,
"grad_norm": 0.579288662692426,
"learning_rate": 1.3984623541887593e-05,
"loss": 0.8134,
"step": 6590
},
{
"epoch": 0.8634222919937206,
"grad_norm": 0.6785159865651157,
"learning_rate": 1.3852067868504772e-05,
"loss": 0.5753,
"step": 6600
},
{
"epoch": 0.8647305075876505,
"grad_norm": 0.5143467597796619,
"learning_rate": 1.3719512195121953e-05,
"loss": 0.8117,
"step": 6610
},
{
"epoch": 0.8660387231815804,
"grad_norm": 0.5958544884291407,
"learning_rate": 1.3586956521739131e-05,
"loss": 0.5694,
"step": 6620
},
{
"epoch": 0.8673469387755102,
"grad_norm": 0.6204168418491064,
"learning_rate": 1.345440084835631e-05,
"loss": 0.7969,
"step": 6630
},
{
"epoch": 0.8686551543694401,
"grad_norm": 0.5741612534922768,
"learning_rate": 1.3321845174973491e-05,
"loss": 0.5918,
"step": 6640
},
{
"epoch": 0.86996336996337,
"grad_norm": 0.5784688853516818,
"learning_rate": 1.3189289501590667e-05,
"loss": 0.8035,
"step": 6650
},
{
"epoch": 0.8712715855572999,
"grad_norm": 0.5603176409725421,
"learning_rate": 1.3056733828207849e-05,
"loss": 0.5805,
"step": 6660
},
{
"epoch": 0.8725798011512297,
"grad_norm": 0.5498601994735227,
"learning_rate": 1.2924178154825028e-05,
"loss": 0.7983,
"step": 6670
},
{
"epoch": 0.8738880167451596,
"grad_norm": 0.46586893720865835,
"learning_rate": 1.2791622481442206e-05,
"loss": 0.5847,
"step": 6680
},
{
"epoch": 0.8751962323390895,
"grad_norm": 0.4854338950174297,
"learning_rate": 1.2659066808059387e-05,
"loss": 0.7692,
"step": 6690
},
{
"epoch": 0.8765044479330194,
"grad_norm": 0.5304541776380965,
"learning_rate": 1.2526511134676563e-05,
"loss": 0.5896,
"step": 6700
},
{
"epoch": 0.8778126635269492,
"grad_norm": 0.5909426804833323,
"learning_rate": 1.2393955461293744e-05,
"loss": 0.7792,
"step": 6710
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.4671763867260705,
"learning_rate": 1.2261399787910923e-05,
"loss": 0.5638,
"step": 6720
},
{
"epoch": 0.880429094714809,
"grad_norm": 0.5359237947603994,
"learning_rate": 1.2128844114528103e-05,
"loss": 0.8022,
"step": 6730
},
{
"epoch": 0.8817373103087389,
"grad_norm": 0.6960066070363433,
"learning_rate": 1.1996288441145282e-05,
"loss": 0.5957,
"step": 6740
},
{
"epoch": 0.8830455259026687,
"grad_norm": 0.4855729622009359,
"learning_rate": 1.186373276776246e-05,
"loss": 0.8295,
"step": 6750
},
{
"epoch": 0.8843537414965986,
"grad_norm": 0.9533835028532404,
"learning_rate": 1.173117709437964e-05,
"loss": 0.5855,
"step": 6760
},
{
"epoch": 0.8856619570905285,
"grad_norm": 0.5386665177217399,
"learning_rate": 1.1598621420996818e-05,
"loss": 0.7768,
"step": 6770
},
{
"epoch": 0.8869701726844584,
"grad_norm": 0.5851585267921338,
"learning_rate": 1.1466065747613998e-05,
"loss": 0.5735,
"step": 6780
},
{
"epoch": 0.8882783882783882,
"grad_norm": 0.6320477723686851,
"learning_rate": 1.1333510074231179e-05,
"loss": 0.8027,
"step": 6790
},
{
"epoch": 0.8895866038723181,
"grad_norm": 0.6634243839335632,
"learning_rate": 1.1200954400848357e-05,
"loss": 0.5573,
"step": 6800
},
{
"epoch": 0.890894819466248,
"grad_norm": 0.5535217711779135,
"learning_rate": 1.1068398727465536e-05,
"loss": 0.7966,
"step": 6810
},
{
"epoch": 0.892203035060178,
"grad_norm": 0.7832734038273745,
"learning_rate": 1.0935843054082715e-05,
"loss": 0.6074,
"step": 6820
},
{
"epoch": 0.8935112506541077,
"grad_norm": 0.520725853685351,
"learning_rate": 1.0803287380699895e-05,
"loss": 0.7948,
"step": 6830
},
{
"epoch": 0.8948194662480377,
"grad_norm": 0.5140360510057409,
"learning_rate": 1.0670731707317074e-05,
"loss": 0.5672,
"step": 6840
},
{
"epoch": 0.8961276818419676,
"grad_norm": 0.5480237760678021,
"learning_rate": 1.0538176033934252e-05,
"loss": 0.8193,
"step": 6850
},
{
"epoch": 0.8974358974358975,
"grad_norm": 0.5379154678692974,
"learning_rate": 1.0405620360551433e-05,
"loss": 0.5615,
"step": 6860
},
{
"epoch": 0.8987441130298273,
"grad_norm": 0.5748341162685962,
"learning_rate": 1.027306468716861e-05,
"loss": 0.7761,
"step": 6870
},
{
"epoch": 0.9000523286237572,
"grad_norm": 0.354346750572681,
"learning_rate": 1.014050901378579e-05,
"loss": 0.5742,
"step": 6880
},
{
"epoch": 0.9013605442176871,
"grad_norm": 0.5556016455622972,
"learning_rate": 1.000795334040297e-05,
"loss": 0.8124,
"step": 6890
},
{
"epoch": 0.902668759811617,
"grad_norm": 0.616134681196408,
"learning_rate": 9.875397667020149e-06,
"loss": 0.5799,
"step": 6900
},
{
"epoch": 0.9039769754055468,
"grad_norm": 0.5351793977307407,
"learning_rate": 9.742841993637328e-06,
"loss": 0.8111,
"step": 6910
},
{
"epoch": 0.9052851909994767,
"grad_norm": 0.4796711570776462,
"learning_rate": 9.610286320254508e-06,
"loss": 0.5897,
"step": 6920
},
{
"epoch": 0.9065934065934066,
"grad_norm": 0.5333607429667859,
"learning_rate": 9.477730646871687e-06,
"loss": 0.7895,
"step": 6930
},
{
"epoch": 0.9079016221873365,
"grad_norm": 0.7210385096597725,
"learning_rate": 9.345174973488865e-06,
"loss": 0.5814,
"step": 6940
},
{
"epoch": 0.9092098377812664,
"grad_norm": 0.4921959597022122,
"learning_rate": 9.212619300106044e-06,
"loss": 0.7722,
"step": 6950
},
{
"epoch": 0.9105180533751962,
"grad_norm": 0.582597864482659,
"learning_rate": 9.080063626723225e-06,
"loss": 0.5718,
"step": 6960
},
{
"epoch": 0.9118262689691261,
"grad_norm": 0.5378533355374352,
"learning_rate": 8.947507953340403e-06,
"loss": 0.7841,
"step": 6970
},
{
"epoch": 0.913134484563056,
"grad_norm": 0.6935491033828649,
"learning_rate": 8.814952279957582e-06,
"loss": 0.5882,
"step": 6980
},
{
"epoch": 0.9144427001569859,
"grad_norm": 0.5318156050528525,
"learning_rate": 8.682396606574762e-06,
"loss": 0.7961,
"step": 6990
},
{
"epoch": 0.9157509157509157,
"grad_norm": 0.6218540662399403,
"learning_rate": 8.549840933191941e-06,
"loss": 0.5646,
"step": 7000
},
{
"epoch": 0.9170591313448456,
"grad_norm": 0.5580021318568493,
"learning_rate": 8.41728525980912e-06,
"loss": 0.8223,
"step": 7010
},
{
"epoch": 0.9183673469387755,
"grad_norm": 0.7372202219508209,
"learning_rate": 8.2847295864263e-06,
"loss": 0.5732,
"step": 7020
},
{
"epoch": 0.9196755625327054,
"grad_norm": 0.5531047720727633,
"learning_rate": 8.15217391304348e-06,
"loss": 0.8233,
"step": 7030
},
{
"epoch": 0.9209837781266352,
"grad_norm": 0.9657658989896405,
"learning_rate": 8.019618239660657e-06,
"loss": 0.5559,
"step": 7040
},
{
"epoch": 0.9222919937205651,
"grad_norm": 0.5772618182869884,
"learning_rate": 7.887062566277838e-06,
"loss": 0.7868,
"step": 7050
},
{
"epoch": 0.923600209314495,
"grad_norm": 0.4581262580386774,
"learning_rate": 7.754506892895016e-06,
"loss": 0.5821,
"step": 7060
},
{
"epoch": 0.924908424908425,
"grad_norm": 0.6112521566283506,
"learning_rate": 7.621951219512195e-06,
"loss": 0.7813,
"step": 7070
},
{
"epoch": 0.9262166405023547,
"grad_norm": 0.7318140554967631,
"learning_rate": 7.4893955461293745e-06,
"loss": 0.5652,
"step": 7080
},
{
"epoch": 0.9275248560962847,
"grad_norm": 0.55917265698163,
"learning_rate": 7.356839872746554e-06,
"loss": 0.8161,
"step": 7090
},
{
"epoch": 0.9288330716902146,
"grad_norm": 0.7037568341200027,
"learning_rate": 7.224284199363733e-06,
"loss": 0.586,
"step": 7100
},
{
"epoch": 0.9301412872841445,
"grad_norm": 0.5346184273438803,
"learning_rate": 7.091728525980912e-06,
"loss": 0.7951,
"step": 7110
},
{
"epoch": 0.9314495028780743,
"grad_norm": 0.4150830640122542,
"learning_rate": 6.959172852598092e-06,
"loss": 0.5949,
"step": 7120
},
{
"epoch": 0.9327577184720042,
"grad_norm": 0.5114818023098201,
"learning_rate": 6.826617179215271e-06,
"loss": 0.7904,
"step": 7130
},
{
"epoch": 0.9340659340659341,
"grad_norm": 0.5908812058211556,
"learning_rate": 6.69406150583245e-06,
"loss": 0.5674,
"step": 7140
},
{
"epoch": 0.935374149659864,
"grad_norm": 0.5343311980537179,
"learning_rate": 6.561505832449629e-06,
"loss": 0.7942,
"step": 7150
},
{
"epoch": 0.9366823652537938,
"grad_norm": 0.5903444129390092,
"learning_rate": 6.428950159066809e-06,
"loss": 0.5679,
"step": 7160
},
{
"epoch": 0.9379905808477237,
"grad_norm": 0.5453396700244478,
"learning_rate": 6.296394485683987e-06,
"loss": 0.8118,
"step": 7170
},
{
"epoch": 0.9392987964416536,
"grad_norm": 0.48350350014595894,
"learning_rate": 6.163838812301167e-06,
"loss": 0.5974,
"step": 7180
},
{
"epoch": 0.9406070120355835,
"grad_norm": 0.5664756102574485,
"learning_rate": 6.031283138918345e-06,
"loss": 0.7948,
"step": 7190
},
{
"epoch": 0.9419152276295133,
"grad_norm": 0.5142501404418801,
"learning_rate": 5.8987274655355255e-06,
"loss": 0.606,
"step": 7200
},
{
"epoch": 0.9432234432234432,
"grad_norm": 0.54569469097517,
"learning_rate": 5.766171792152705e-06,
"loss": 0.8303,
"step": 7210
},
{
"epoch": 0.9445316588173731,
"grad_norm": 0.7008918499549309,
"learning_rate": 5.6336161187698835e-06,
"loss": 0.5698,
"step": 7220
},
{
"epoch": 0.945839874411303,
"grad_norm": 0.4720187827903435,
"learning_rate": 5.501060445387063e-06,
"loss": 0.813,
"step": 7230
},
{
"epoch": 0.9471480900052328,
"grad_norm": 0.6243694714554565,
"learning_rate": 5.368504772004242e-06,
"loss": 0.5858,
"step": 7240
},
{
"epoch": 0.9484563055991627,
"grad_norm": 0.6260033107066734,
"learning_rate": 5.235949098621421e-06,
"loss": 0.8116,
"step": 7250
},
{
"epoch": 0.9497645211930926,
"grad_norm": 0.6435112573826539,
"learning_rate": 5.103393425238601e-06,
"loss": 0.5544,
"step": 7260
},
{
"epoch": 0.9510727367870225,
"grad_norm": 0.47784017687891694,
"learning_rate": 4.9708377518557796e-06,
"loss": 0.7778,
"step": 7270
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.5688080296868022,
"learning_rate": 4.838282078472959e-06,
"loss": 0.6016,
"step": 7280
},
{
"epoch": 0.9536891679748822,
"grad_norm": 0.5213924382658889,
"learning_rate": 4.705726405090138e-06,
"loss": 0.7974,
"step": 7290
},
{
"epoch": 0.9549973835688121,
"grad_norm": 0.45047173757415426,
"learning_rate": 4.573170731707317e-06,
"loss": 0.5249,
"step": 7300
},
{
"epoch": 0.956305599162742,
"grad_norm": 0.5437903519326854,
"learning_rate": 4.440615058324496e-06,
"loss": 0.8166,
"step": 7310
},
{
"epoch": 0.957613814756672,
"grad_norm": 0.48264576838040973,
"learning_rate": 4.308059384941676e-06,
"loss": 0.5814,
"step": 7320
},
{
"epoch": 0.9589220303506018,
"grad_norm": 0.545526763976128,
"learning_rate": 4.175503711558855e-06,
"loss": 0.8006,
"step": 7330
},
{
"epoch": 0.9602302459445317,
"grad_norm": 0.38539225021424495,
"learning_rate": 4.0429480381760345e-06,
"loss": 0.5745,
"step": 7340
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.5798773104196537,
"learning_rate": 3.910392364793213e-06,
"loss": 0.8073,
"step": 7350
},
{
"epoch": 0.9628466771323915,
"grad_norm": 0.6878789040909351,
"learning_rate": 3.777836691410393e-06,
"loss": 0.5763,
"step": 7360
},
{
"epoch": 0.9641548927263213,
"grad_norm": 0.5289835887969982,
"learning_rate": 3.6452810180275714e-06,
"loss": 0.7766,
"step": 7370
},
{
"epoch": 0.9654631083202512,
"grad_norm": 0.40667843587961816,
"learning_rate": 3.5127253446447508e-06,
"loss": 0.558,
"step": 7380
},
{
"epoch": 0.9667713239141811,
"grad_norm": 0.5840192236729715,
"learning_rate": 3.3801696712619306e-06,
"loss": 0.8129,
"step": 7390
},
{
"epoch": 0.968079539508111,
"grad_norm": 0.3178170420497473,
"learning_rate": 3.247613997879109e-06,
"loss": 0.5787,
"step": 7400
},
{
"epoch": 0.9693877551020408,
"grad_norm": 0.54996002509364,
"learning_rate": 3.115058324496289e-06,
"loss": 0.8326,
"step": 7410
},
{
"epoch": 0.9706959706959707,
"grad_norm": 0.5534777218559572,
"learning_rate": 2.982502651113468e-06,
"loss": 0.5821,
"step": 7420
},
{
"epoch": 0.9720041862899006,
"grad_norm": 0.5230629562997223,
"learning_rate": 2.849946977730647e-06,
"loss": 0.7779,
"step": 7430
},
{
"epoch": 0.9733124018838305,
"grad_norm": 0.875422492824055,
"learning_rate": 2.7173913043478263e-06,
"loss": 0.592,
"step": 7440
},
{
"epoch": 0.9746206174777603,
"grad_norm": 0.5569915281293889,
"learning_rate": 2.5848356309650052e-06,
"loss": 0.8067,
"step": 7450
},
{
"epoch": 0.9759288330716902,
"grad_norm": 0.6183408664255449,
"learning_rate": 2.4522799575821846e-06,
"loss": 0.5875,
"step": 7460
},
{
"epoch": 0.9772370486656201,
"grad_norm": 0.49698428419163243,
"learning_rate": 2.319724284199364e-06,
"loss": 0.8197,
"step": 7470
},
{
"epoch": 0.97854526425955,
"grad_norm": 0.6395403482510305,
"learning_rate": 2.187168610816543e-06,
"loss": 0.5956,
"step": 7480
},
{
"epoch": 0.9798534798534798,
"grad_norm": 0.5141508620993104,
"learning_rate": 2.054612937433722e-06,
"loss": 0.8213,
"step": 7490
},
{
"epoch": 0.9811616954474097,
"grad_norm": 0.7297722231605804,
"learning_rate": 1.9220572640509014e-06,
"loss": 0.5724,
"step": 7500
},
{
"epoch": 0.9824699110413396,
"grad_norm": 0.6139671735977023,
"learning_rate": 1.7895015906680807e-06,
"loss": 0.8038,
"step": 7510
},
{
"epoch": 0.9837781266352695,
"grad_norm": 0.6348684038508452,
"learning_rate": 1.65694591728526e-06,
"loss": 0.5891,
"step": 7520
},
{
"epoch": 0.9850863422291993,
"grad_norm": 0.5570560567015977,
"learning_rate": 1.5243902439024391e-06,
"loss": 0.8008,
"step": 7530
},
{
"epoch": 0.9863945578231292,
"grad_norm": 0.5421909673109165,
"learning_rate": 1.3918345705196183e-06,
"loss": 0.5792,
"step": 7540
},
{
"epoch": 0.9877027734170591,
"grad_norm": 0.5728614761702414,
"learning_rate": 1.2592788971367975e-06,
"loss": 0.7941,
"step": 7550
},
{
"epoch": 0.989010989010989,
"grad_norm": 0.6317238615567622,
"learning_rate": 1.1267232237539766e-06,
"loss": 0.6009,
"step": 7560
},
{
"epoch": 0.9903192046049188,
"grad_norm": 0.5160377073279534,
"learning_rate": 9.94167550371156e-07,
"loss": 0.7883,
"step": 7570
},
{
"epoch": 0.9916274201988488,
"grad_norm": 0.690634120523154,
"learning_rate": 8.616118769883351e-07,
"loss": 0.5766,
"step": 7580
},
{
"epoch": 0.9929356357927787,
"grad_norm": 0.5284248764034778,
"learning_rate": 7.290562036055143e-07,
"loss": 0.8011,
"step": 7590
},
{
"epoch": 0.9942438513867086,
"grad_norm": 0.5999593891013711,
"learning_rate": 5.965005302226936e-07,
"loss": 0.5956,
"step": 7600
},
{
"epoch": 0.9955520669806384,
"grad_norm": 0.48546714136377134,
"learning_rate": 4.6394485683987276e-07,
"loss": 0.7916,
"step": 7610
},
{
"epoch": 0.9968602825745683,
"grad_norm": 0.3062976732646243,
"learning_rate": 3.31389183457052e-07,
"loss": 0.5679,
"step": 7620
},
{
"epoch": 0.9981684981684982,
"grad_norm": 0.5269723896971663,
"learning_rate": 1.9883351007423118e-07,
"loss": 0.8292,
"step": 7630
},
{
"epoch": 0.9994767137624281,
"grad_norm": 0.49183779024992025,
"learning_rate": 6.62778366914104e-08,
"loss": 0.5841,
"step": 7640
}
],
"logging_steps": 10,
"max_steps": 7644,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}