Alan96's picture
Upload 48 files
ce1d956 verified
{
"best_metric": 0.6242377758026123,
"best_model_checkpoint": "./model/google/flan-t5-large-train_r_aug-tqa/checkpoint-12000",
"epoch": 1.9913707268503154,
"eval_steps": 1000,
"global_step": 12000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00016594756057085962,
"grad_norm": 4.900241374969482,
"learning_rate": 4.9997234207323826e-05,
"loss": 1.1171,
"step": 1
},
{
"epoch": 0.001659475605708596,
"grad_norm": 3.7945871353149414,
"learning_rate": 4.997234207323819e-05,
"loss": 1.1217,
"step": 10
},
{
"epoch": 0.003318951211417192,
"grad_norm": 3.418729543685913,
"learning_rate": 4.9944684146476384e-05,
"loss": 1.0061,
"step": 20
},
{
"epoch": 0.004978426817125788,
"grad_norm": 20.93488311767578,
"learning_rate": 4.9917026219714574e-05,
"loss": 0.952,
"step": 30
},
{
"epoch": 0.006637902422834384,
"grad_norm": 4.321225643157959,
"learning_rate": 4.9889368292952765e-05,
"loss": 0.984,
"step": 40
},
{
"epoch": 0.00829737802854298,
"grad_norm": 3.9566705226898193,
"learning_rate": 4.986171036619095e-05,
"loss": 0.8963,
"step": 50
},
{
"epoch": 0.009956853634251576,
"grad_norm": 3.582169771194458,
"learning_rate": 4.983405243942914e-05,
"loss": 0.962,
"step": 60
},
{
"epoch": 0.011616329239960173,
"grad_norm": 3.904132127761841,
"learning_rate": 4.980639451266733e-05,
"loss": 0.9226,
"step": 70
},
{
"epoch": 0.013275804845668768,
"grad_norm": 3.31311297416687,
"learning_rate": 4.977873658590552e-05,
"loss": 0.8423,
"step": 80
},
{
"epoch": 0.014935280451377365,
"grad_norm": 3.5518760681152344,
"learning_rate": 4.975107865914371e-05,
"loss": 0.9015,
"step": 90
},
{
"epoch": 0.01659475605708596,
"grad_norm": 3.8338520526885986,
"learning_rate": 4.97234207323819e-05,
"loss": 0.8883,
"step": 100
},
{
"epoch": 0.018254231662794558,
"grad_norm": 4.829561233520508,
"learning_rate": 4.969576280562009e-05,
"loss": 0.8755,
"step": 110
},
{
"epoch": 0.019913707268503153,
"grad_norm": 3.4585771560668945,
"learning_rate": 4.966810487885828e-05,
"loss": 0.8807,
"step": 120
},
{
"epoch": 0.021573182874211748,
"grad_norm": 4.243607997894287,
"learning_rate": 4.9640446952096474e-05,
"loss": 0.9079,
"step": 130
},
{
"epoch": 0.023232658479920346,
"grad_norm": 2.565274238586426,
"learning_rate": 4.961278902533466e-05,
"loss": 0.8284,
"step": 140
},
{
"epoch": 0.02489213408562894,
"grad_norm": 3.1821136474609375,
"learning_rate": 4.958513109857285e-05,
"loss": 0.9609,
"step": 150
},
{
"epoch": 0.026551609691337536,
"grad_norm": 3.768364667892456,
"learning_rate": 4.955747317181104e-05,
"loss": 0.8117,
"step": 160
},
{
"epoch": 0.028211085297046135,
"grad_norm": 3.528536558151245,
"learning_rate": 4.952981524504923e-05,
"loss": 0.8547,
"step": 170
},
{
"epoch": 0.02987056090275473,
"grad_norm": 3.6502795219421387,
"learning_rate": 4.950215731828742e-05,
"loss": 0.8715,
"step": 180
},
{
"epoch": 0.031530036508463324,
"grad_norm": 3.962785482406616,
"learning_rate": 4.947449939152561e-05,
"loss": 0.862,
"step": 190
},
{
"epoch": 0.03318951211417192,
"grad_norm": 3.7627573013305664,
"learning_rate": 4.94468414647638e-05,
"loss": 0.9098,
"step": 200
},
{
"epoch": 0.03484898771988052,
"grad_norm": 2.818209409713745,
"learning_rate": 4.941918353800199e-05,
"loss": 0.8594,
"step": 210
},
{
"epoch": 0.036508463325589116,
"grad_norm": 2.6864259243011475,
"learning_rate": 4.939152561124018e-05,
"loss": 0.8562,
"step": 220
},
{
"epoch": 0.03816793893129771,
"grad_norm": 3.5816614627838135,
"learning_rate": 4.936386768447838e-05,
"loss": 0.8762,
"step": 230
},
{
"epoch": 0.039827414537006306,
"grad_norm": 3.709526777267456,
"learning_rate": 4.9336209757716564e-05,
"loss": 0.8121,
"step": 240
},
{
"epoch": 0.0414868901427149,
"grad_norm": 3.305800676345825,
"learning_rate": 4.9308551830954754e-05,
"loss": 0.8347,
"step": 250
},
{
"epoch": 0.043146365748423496,
"grad_norm": 3.0622973442077637,
"learning_rate": 4.9280893904192945e-05,
"loss": 0.8788,
"step": 260
},
{
"epoch": 0.0448058413541321,
"grad_norm": 3.436626434326172,
"learning_rate": 4.9253235977431136e-05,
"loss": 0.881,
"step": 270
},
{
"epoch": 0.04646531695984069,
"grad_norm": 3.1859350204467773,
"learning_rate": 4.9225578050669326e-05,
"loss": 0.8458,
"step": 280
},
{
"epoch": 0.04812479256554929,
"grad_norm": 4.285528659820557,
"learning_rate": 4.919792012390752e-05,
"loss": 0.8636,
"step": 290
},
{
"epoch": 0.04978426817125788,
"grad_norm": 3.98508882522583,
"learning_rate": 4.917026219714571e-05,
"loss": 0.8134,
"step": 300
},
{
"epoch": 0.05144374377696648,
"grad_norm": 3.6300930976867676,
"learning_rate": 4.91426042703839e-05,
"loss": 0.7973,
"step": 310
},
{
"epoch": 0.05310321938267507,
"grad_norm": 3.743924140930176,
"learning_rate": 4.911494634362209e-05,
"loss": 0.8277,
"step": 320
},
{
"epoch": 0.054762694988383674,
"grad_norm": 2.5988316535949707,
"learning_rate": 4.908728841686027e-05,
"loss": 0.8533,
"step": 330
},
{
"epoch": 0.05642217059409227,
"grad_norm": 3.569610357284546,
"learning_rate": 4.905963049009846e-05,
"loss": 0.8456,
"step": 340
},
{
"epoch": 0.058081646199800864,
"grad_norm": 2.9307737350463867,
"learning_rate": 4.9031972563336654e-05,
"loss": 0.7879,
"step": 350
},
{
"epoch": 0.05974112180550946,
"grad_norm": 3.5210940837860107,
"learning_rate": 4.9004314636574844e-05,
"loss": 0.8394,
"step": 360
},
{
"epoch": 0.061400597411218054,
"grad_norm": 2.749647617340088,
"learning_rate": 4.8976656709813035e-05,
"loss": 0.795,
"step": 370
},
{
"epoch": 0.06306007301692665,
"grad_norm": 4.256681442260742,
"learning_rate": 4.8948998783051226e-05,
"loss": 0.8333,
"step": 380
},
{
"epoch": 0.06471954862263525,
"grad_norm": 3.1975724697113037,
"learning_rate": 4.8921340856289416e-05,
"loss": 0.8099,
"step": 390
},
{
"epoch": 0.06637902422834384,
"grad_norm": 3.0923843383789062,
"learning_rate": 4.889368292952761e-05,
"loss": 0.8211,
"step": 400
},
{
"epoch": 0.06803849983405244,
"grad_norm": 2.9573616981506348,
"learning_rate": 4.88660250027658e-05,
"loss": 0.8438,
"step": 410
},
{
"epoch": 0.06969797543976104,
"grad_norm": 3.519888162612915,
"learning_rate": 4.883836707600398e-05,
"loss": 0.7978,
"step": 420
},
{
"epoch": 0.07135745104546963,
"grad_norm": 4.7146196365356445,
"learning_rate": 4.881070914924217e-05,
"loss": 0.7877,
"step": 430
},
{
"epoch": 0.07301692665117823,
"grad_norm": 2.54521107673645,
"learning_rate": 4.878305122248036e-05,
"loss": 0.7914,
"step": 440
},
{
"epoch": 0.07467640225688682,
"grad_norm": 3.443538188934326,
"learning_rate": 4.875539329571855e-05,
"loss": 0.875,
"step": 450
},
{
"epoch": 0.07633587786259542,
"grad_norm": 3.5744690895080566,
"learning_rate": 4.8727735368956744e-05,
"loss": 0.8347,
"step": 460
},
{
"epoch": 0.07799535346830401,
"grad_norm": 3.129127025604248,
"learning_rate": 4.8700077442194934e-05,
"loss": 0.88,
"step": 470
},
{
"epoch": 0.07965482907401261,
"grad_norm": 3.40054988861084,
"learning_rate": 4.8672419515433125e-05,
"loss": 0.8377,
"step": 480
},
{
"epoch": 0.08131430467972121,
"grad_norm": 4.048141956329346,
"learning_rate": 4.8644761588671316e-05,
"loss": 0.8229,
"step": 490
},
{
"epoch": 0.0829737802854298,
"grad_norm": 3.28328537940979,
"learning_rate": 4.8617103661909506e-05,
"loss": 0.846,
"step": 500
},
{
"epoch": 0.0846332558911384,
"grad_norm": 3.936415672302246,
"learning_rate": 4.85894457351477e-05,
"loss": 0.8217,
"step": 510
},
{
"epoch": 0.08629273149684699,
"grad_norm": 3.3447494506835938,
"learning_rate": 4.856178780838588e-05,
"loss": 0.8553,
"step": 520
},
{
"epoch": 0.0879522071025556,
"grad_norm": 3.550673007965088,
"learning_rate": 4.853412988162407e-05,
"loss": 0.8646,
"step": 530
},
{
"epoch": 0.0896116827082642,
"grad_norm": 2.695237398147583,
"learning_rate": 4.850647195486226e-05,
"loss": 0.7649,
"step": 540
},
{
"epoch": 0.09127115831397278,
"grad_norm": 2.307586193084717,
"learning_rate": 4.847881402810045e-05,
"loss": 0.8461,
"step": 550
},
{
"epoch": 0.09293063391968139,
"grad_norm": 3.99825119972229,
"learning_rate": 4.845115610133864e-05,
"loss": 0.8614,
"step": 560
},
{
"epoch": 0.09459010952538997,
"grad_norm": 2.767484426498413,
"learning_rate": 4.8423498174576834e-05,
"loss": 0.8256,
"step": 570
},
{
"epoch": 0.09624958513109858,
"grad_norm": 3.375134229660034,
"learning_rate": 4.8395840247815024e-05,
"loss": 0.8062,
"step": 580
},
{
"epoch": 0.09790906073680716,
"grad_norm": 3.587320327758789,
"learning_rate": 4.8368182321053215e-05,
"loss": 0.7861,
"step": 590
},
{
"epoch": 0.09956853634251576,
"grad_norm": 3.553729772567749,
"learning_rate": 4.8340524394291406e-05,
"loss": 0.886,
"step": 600
},
{
"epoch": 0.10122801194822437,
"grad_norm": 3.4651665687561035,
"learning_rate": 4.8312866467529596e-05,
"loss": 0.7524,
"step": 610
},
{
"epoch": 0.10288748755393295,
"grad_norm": 2.79295015335083,
"learning_rate": 4.828520854076779e-05,
"loss": 0.7185,
"step": 620
},
{
"epoch": 0.10454696315964156,
"grad_norm": 3.275655508041382,
"learning_rate": 4.825755061400598e-05,
"loss": 0.8519,
"step": 630
},
{
"epoch": 0.10620643876535014,
"grad_norm": 3.79915714263916,
"learning_rate": 4.822989268724417e-05,
"loss": 0.8619,
"step": 640
},
{
"epoch": 0.10786591437105875,
"grad_norm": 3.0836708545684814,
"learning_rate": 4.820223476048236e-05,
"loss": 0.8856,
"step": 650
},
{
"epoch": 0.10952538997676735,
"grad_norm": 3.225219488143921,
"learning_rate": 4.817457683372055e-05,
"loss": 0.8831,
"step": 660
},
{
"epoch": 0.11118486558247594,
"grad_norm": 2.489872932434082,
"learning_rate": 4.814691890695874e-05,
"loss": 0.8049,
"step": 670
},
{
"epoch": 0.11284434118818454,
"grad_norm": 3.352848768234253,
"learning_rate": 4.811926098019693e-05,
"loss": 0.8649,
"step": 680
},
{
"epoch": 0.11450381679389313,
"grad_norm": 5.773054122924805,
"learning_rate": 4.809160305343512e-05,
"loss": 0.7939,
"step": 690
},
{
"epoch": 0.11616329239960173,
"grad_norm": 3.3380932807922363,
"learning_rate": 4.8063945126673305e-05,
"loss": 0.8273,
"step": 700
},
{
"epoch": 0.11782276800531032,
"grad_norm": 3.8309950828552246,
"learning_rate": 4.8036287199911496e-05,
"loss": 0.9265,
"step": 710
},
{
"epoch": 0.11948224361101892,
"grad_norm": 3.8041698932647705,
"learning_rate": 4.8008629273149686e-05,
"loss": 0.8258,
"step": 720
},
{
"epoch": 0.12114171921672752,
"grad_norm": 3.6036689281463623,
"learning_rate": 4.798097134638788e-05,
"loss": 0.7618,
"step": 730
},
{
"epoch": 0.12280119482243611,
"grad_norm": 3.5238475799560547,
"learning_rate": 4.795331341962607e-05,
"loss": 0.7538,
"step": 740
},
{
"epoch": 0.12446067042814471,
"grad_norm": 2.8986926078796387,
"learning_rate": 4.792565549286426e-05,
"loss": 0.7851,
"step": 750
},
{
"epoch": 0.1261201460338533,
"grad_norm": 3.8696155548095703,
"learning_rate": 4.789799756610245e-05,
"loss": 0.8544,
"step": 760
},
{
"epoch": 0.12777962163956189,
"grad_norm": 3.1447415351867676,
"learning_rate": 4.787033963934064e-05,
"loss": 0.7631,
"step": 770
},
{
"epoch": 0.1294390972452705,
"grad_norm": 3.2269225120544434,
"learning_rate": 4.784268171257883e-05,
"loss": 0.8603,
"step": 780
},
{
"epoch": 0.1310985728509791,
"grad_norm": 3.555079698562622,
"learning_rate": 4.7815023785817014e-05,
"loss": 0.8047,
"step": 790
},
{
"epoch": 0.13275804845668768,
"grad_norm": 4.38774299621582,
"learning_rate": 4.7787365859055205e-05,
"loss": 0.8193,
"step": 800
},
{
"epoch": 0.1344175240623963,
"grad_norm": 2.849234104156494,
"learning_rate": 4.7759707932293395e-05,
"loss": 0.8236,
"step": 810
},
{
"epoch": 0.13607699966810488,
"grad_norm": 3.4063913822174072,
"learning_rate": 4.7732050005531586e-05,
"loss": 0.7542,
"step": 820
},
{
"epoch": 0.13773647527381347,
"grad_norm": 4.454982280731201,
"learning_rate": 4.7704392078769776e-05,
"loss": 0.8126,
"step": 830
},
{
"epoch": 0.13939595087952208,
"grad_norm": 3.7919139862060547,
"learning_rate": 4.767673415200797e-05,
"loss": 0.8232,
"step": 840
},
{
"epoch": 0.14105542648523067,
"grad_norm": 2.609391927719116,
"learning_rate": 4.764907622524616e-05,
"loss": 0.748,
"step": 850
},
{
"epoch": 0.14271490209093926,
"grad_norm": 2.9120664596557617,
"learning_rate": 4.762141829848435e-05,
"loss": 0.8619,
"step": 860
},
{
"epoch": 0.14437437769664785,
"grad_norm": 3.4429476261138916,
"learning_rate": 4.759376037172254e-05,
"loss": 0.7827,
"step": 870
},
{
"epoch": 0.14603385330235646,
"grad_norm": 3.6868903636932373,
"learning_rate": 4.756610244496073e-05,
"loss": 0.8245,
"step": 880
},
{
"epoch": 0.14769332890806505,
"grad_norm": 3.6222927570343018,
"learning_rate": 4.753844451819891e-05,
"loss": 0.7452,
"step": 890
},
{
"epoch": 0.14935280451377364,
"grad_norm": 4.014353275299072,
"learning_rate": 4.7510786591437104e-05,
"loss": 0.8375,
"step": 900
},
{
"epoch": 0.15101228011948226,
"grad_norm": 2.4085919857025146,
"learning_rate": 4.7483128664675295e-05,
"loss": 0.8016,
"step": 910
},
{
"epoch": 0.15267175572519084,
"grad_norm": 2.73573637008667,
"learning_rate": 4.7455470737913485e-05,
"loss": 0.8543,
"step": 920
},
{
"epoch": 0.15433123133089943,
"grad_norm": 3.7764389514923096,
"learning_rate": 4.7427812811151676e-05,
"loss": 0.8465,
"step": 930
},
{
"epoch": 0.15599070693660802,
"grad_norm": 3.0908584594726562,
"learning_rate": 4.7400154884389866e-05,
"loss": 0.849,
"step": 940
},
{
"epoch": 0.15765018254231664,
"grad_norm": 2.892361640930176,
"learning_rate": 4.737249695762806e-05,
"loss": 0.7693,
"step": 950
},
{
"epoch": 0.15930965814802522,
"grad_norm": 3.766294479370117,
"learning_rate": 4.734483903086625e-05,
"loss": 0.845,
"step": 960
},
{
"epoch": 0.1609691337537338,
"grad_norm": 3.2067556381225586,
"learning_rate": 4.731718110410444e-05,
"loss": 0.8432,
"step": 970
},
{
"epoch": 0.16262860935944243,
"grad_norm": 3.325576066970825,
"learning_rate": 4.728952317734263e-05,
"loss": 0.7712,
"step": 980
},
{
"epoch": 0.16428808496515102,
"grad_norm": 2.6038808822631836,
"learning_rate": 4.726186525058082e-05,
"loss": 0.758,
"step": 990
},
{
"epoch": 0.1659475605708596,
"grad_norm": 3.717463254928589,
"learning_rate": 4.723420732381901e-05,
"loss": 0.8038,
"step": 1000
},
{
"epoch": 0.1659475605708596,
"eval_gen_len": 44.25406626506024,
"eval_loss": 0.6970572471618652,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1355.4037,
"eval_samples_per_second": 4.89,
"eval_steps_per_second": 0.306,
"step": 1000
},
{
"epoch": 0.1676070361765682,
"grad_norm": 2.7978012561798096,
"learning_rate": 4.72065493970572e-05,
"loss": 0.8413,
"step": 1010
},
{
"epoch": 0.1692665117822768,
"grad_norm": 3.021860361099243,
"learning_rate": 4.717889147029539e-05,
"loss": 0.7488,
"step": 1020
},
{
"epoch": 0.1709259873879854,
"grad_norm": 3.307393789291382,
"learning_rate": 4.715123354353358e-05,
"loss": 0.7884,
"step": 1030
},
{
"epoch": 0.17258546299369398,
"grad_norm": 4.444802761077881,
"learning_rate": 4.712357561677177e-05,
"loss": 0.7777,
"step": 1040
},
{
"epoch": 0.1742449385994026,
"grad_norm": 2.8152804374694824,
"learning_rate": 4.709591769000996e-05,
"loss": 0.8055,
"step": 1050
},
{
"epoch": 0.1759044142051112,
"grad_norm": 2.854592800140381,
"learning_rate": 4.7068259763248154e-05,
"loss": 0.778,
"step": 1060
},
{
"epoch": 0.17756388981081977,
"grad_norm": 3.072824716567993,
"learning_rate": 4.704060183648634e-05,
"loss": 0.776,
"step": 1070
},
{
"epoch": 0.1792233654165284,
"grad_norm": 3.6928513050079346,
"learning_rate": 4.701294390972453e-05,
"loss": 0.7938,
"step": 1080
},
{
"epoch": 0.18088284102223698,
"grad_norm": 2.9358620643615723,
"learning_rate": 4.698528598296272e-05,
"loss": 0.7847,
"step": 1090
},
{
"epoch": 0.18254231662794557,
"grad_norm": 2.9071340560913086,
"learning_rate": 4.695762805620091e-05,
"loss": 0.7455,
"step": 1100
},
{
"epoch": 0.18420179223365415,
"grad_norm": 3.4249751567840576,
"learning_rate": 4.69299701294391e-05,
"loss": 0.8142,
"step": 1110
},
{
"epoch": 0.18586126783936277,
"grad_norm": 3.0051093101501465,
"learning_rate": 4.690231220267729e-05,
"loss": 0.7925,
"step": 1120
},
{
"epoch": 0.18752074344507136,
"grad_norm": 2.9879422187805176,
"learning_rate": 4.687465427591548e-05,
"loss": 0.728,
"step": 1130
},
{
"epoch": 0.18918021905077995,
"grad_norm": 3.4878718852996826,
"learning_rate": 4.684699634915367e-05,
"loss": 0.7735,
"step": 1140
},
{
"epoch": 0.19083969465648856,
"grad_norm": 3.447152853012085,
"learning_rate": 4.681933842239186e-05,
"loss": 0.7562,
"step": 1150
},
{
"epoch": 0.19249917026219715,
"grad_norm": 3.2192983627319336,
"learning_rate": 4.679168049563005e-05,
"loss": 0.8128,
"step": 1160
},
{
"epoch": 0.19415864586790574,
"grad_norm": 3.2137930393218994,
"learning_rate": 4.676402256886824e-05,
"loss": 0.8402,
"step": 1170
},
{
"epoch": 0.19581812147361433,
"grad_norm": 2.711993455886841,
"learning_rate": 4.673636464210643e-05,
"loss": 0.7724,
"step": 1180
},
{
"epoch": 0.19747759707932294,
"grad_norm": 2.9814987182617188,
"learning_rate": 4.670870671534462e-05,
"loss": 0.7809,
"step": 1190
},
{
"epoch": 0.19913707268503153,
"grad_norm": 3.1554079055786133,
"learning_rate": 4.668104878858281e-05,
"loss": 0.756,
"step": 1200
},
{
"epoch": 0.20079654829074012,
"grad_norm": 3.341683864593506,
"learning_rate": 4.6653390861821e-05,
"loss": 0.8088,
"step": 1210
},
{
"epoch": 0.20245602389644873,
"grad_norm": 2.8297119140625,
"learning_rate": 4.662573293505919e-05,
"loss": 0.8249,
"step": 1220
},
{
"epoch": 0.20411549950215732,
"grad_norm": 3.3890442848205566,
"learning_rate": 4.659807500829738e-05,
"loss": 0.8123,
"step": 1230
},
{
"epoch": 0.2057749751078659,
"grad_norm": 3.256871223449707,
"learning_rate": 4.657041708153557e-05,
"loss": 0.8309,
"step": 1240
},
{
"epoch": 0.2074344507135745,
"grad_norm": 3.438433885574341,
"learning_rate": 4.654275915477376e-05,
"loss": 0.8109,
"step": 1250
},
{
"epoch": 0.2090939263192831,
"grad_norm": 3.2116994857788086,
"learning_rate": 4.6515101228011946e-05,
"loss": 0.6768,
"step": 1260
},
{
"epoch": 0.2107534019249917,
"grad_norm": 2.6069250106811523,
"learning_rate": 4.6487443301250137e-05,
"loss": 0.7702,
"step": 1270
},
{
"epoch": 0.2124128775307003,
"grad_norm": 3.114304304122925,
"learning_rate": 4.645978537448833e-05,
"loss": 0.7797,
"step": 1280
},
{
"epoch": 0.2140723531364089,
"grad_norm": 2.907708168029785,
"learning_rate": 4.643212744772652e-05,
"loss": 0.7823,
"step": 1290
},
{
"epoch": 0.2157318287421175,
"grad_norm": 3.989586353302002,
"learning_rate": 4.640446952096471e-05,
"loss": 0.7759,
"step": 1300
},
{
"epoch": 0.21739130434782608,
"grad_norm": 3.164072275161743,
"learning_rate": 4.63768115942029e-05,
"loss": 0.7979,
"step": 1310
},
{
"epoch": 0.2190507799535347,
"grad_norm": 3.060279607772827,
"learning_rate": 4.634915366744109e-05,
"loss": 0.7863,
"step": 1320
},
{
"epoch": 0.22071025555924328,
"grad_norm": 3.268155336380005,
"learning_rate": 4.632149574067928e-05,
"loss": 0.8434,
"step": 1330
},
{
"epoch": 0.22236973116495187,
"grad_norm": 2.992119550704956,
"learning_rate": 4.629383781391747e-05,
"loss": 0.7481,
"step": 1340
},
{
"epoch": 0.22402920677066046,
"grad_norm": 2.692070722579956,
"learning_rate": 4.626617988715566e-05,
"loss": 0.7951,
"step": 1350
},
{
"epoch": 0.22568868237636908,
"grad_norm": 3.0484812259674072,
"learning_rate": 4.623852196039385e-05,
"loss": 0.8139,
"step": 1360
},
{
"epoch": 0.22734815798207766,
"grad_norm": 3.799321174621582,
"learning_rate": 4.621086403363204e-05,
"loss": 0.8022,
"step": 1370
},
{
"epoch": 0.22900763358778625,
"grad_norm": 3.715362548828125,
"learning_rate": 4.618320610687023e-05,
"loss": 0.7153,
"step": 1380
},
{
"epoch": 0.23066710919349487,
"grad_norm": 3.7485156059265137,
"learning_rate": 4.6155548180108424e-05,
"loss": 0.8893,
"step": 1390
},
{
"epoch": 0.23232658479920346,
"grad_norm": 3.6852569580078125,
"learning_rate": 4.6127890253346615e-05,
"loss": 0.7838,
"step": 1400
},
{
"epoch": 0.23398606040491204,
"grad_norm": 3.174116611480713,
"learning_rate": 4.6100232326584805e-05,
"loss": 0.7797,
"step": 1410
},
{
"epoch": 0.23564553601062063,
"grad_norm": 4.355712890625,
"learning_rate": 4.6072574399822996e-05,
"loss": 0.8305,
"step": 1420
},
{
"epoch": 0.23730501161632925,
"grad_norm": 2.906917095184326,
"learning_rate": 4.6044916473061186e-05,
"loss": 0.8066,
"step": 1430
},
{
"epoch": 0.23896448722203784,
"grad_norm": 2.5050249099731445,
"learning_rate": 4.601725854629937e-05,
"loss": 0.8062,
"step": 1440
},
{
"epoch": 0.24062396282774642,
"grad_norm": 3.4250411987304688,
"learning_rate": 4.598960061953756e-05,
"loss": 0.7336,
"step": 1450
},
{
"epoch": 0.24228343843345504,
"grad_norm": 2.7324156761169434,
"learning_rate": 4.596194269277575e-05,
"loss": 0.7701,
"step": 1460
},
{
"epoch": 0.24394291403916363,
"grad_norm": 2.688563585281372,
"learning_rate": 4.593428476601394e-05,
"loss": 0.7636,
"step": 1470
},
{
"epoch": 0.24560238964487222,
"grad_norm": 3.027071952819824,
"learning_rate": 4.590662683925213e-05,
"loss": 0.7496,
"step": 1480
},
{
"epoch": 0.2472618652505808,
"grad_norm": 2.9742684364318848,
"learning_rate": 4.587896891249032e-05,
"loss": 0.8015,
"step": 1490
},
{
"epoch": 0.24892134085628942,
"grad_norm": 3.729691982269287,
"learning_rate": 4.5851310985728514e-05,
"loss": 0.8203,
"step": 1500
},
{
"epoch": 0.25058081646199803,
"grad_norm": 3.2418553829193115,
"learning_rate": 4.5823653058966705e-05,
"loss": 0.8219,
"step": 1510
},
{
"epoch": 0.2522402920677066,
"grad_norm": 2.6074585914611816,
"learning_rate": 4.5795995132204895e-05,
"loss": 0.6838,
"step": 1520
},
{
"epoch": 0.2538997676734152,
"grad_norm": 3.124091148376465,
"learning_rate": 4.5768337205443086e-05,
"loss": 0.7972,
"step": 1530
},
{
"epoch": 0.25555924327912377,
"grad_norm": 3.9640090465545654,
"learning_rate": 4.574067927868127e-05,
"loss": 0.7879,
"step": 1540
},
{
"epoch": 0.2572187188848324,
"grad_norm": 4.680671215057373,
"learning_rate": 4.571302135191946e-05,
"loss": 0.7252,
"step": 1550
},
{
"epoch": 0.258878194490541,
"grad_norm": 4.106893539428711,
"learning_rate": 4.568536342515765e-05,
"loss": 0.8023,
"step": 1560
},
{
"epoch": 0.26053767009624956,
"grad_norm": 3.206587314605713,
"learning_rate": 4.565770549839584e-05,
"loss": 0.8094,
"step": 1570
},
{
"epoch": 0.2621971457019582,
"grad_norm": 3.4632327556610107,
"learning_rate": 4.563004757163403e-05,
"loss": 0.7602,
"step": 1580
},
{
"epoch": 0.2638566213076668,
"grad_norm": 2.723336935043335,
"learning_rate": 4.560238964487222e-05,
"loss": 0.7412,
"step": 1590
},
{
"epoch": 0.26551609691337535,
"grad_norm": 2.829049587249756,
"learning_rate": 4.5574731718110413e-05,
"loss": 0.7328,
"step": 1600
},
{
"epoch": 0.26717557251908397,
"grad_norm": 2.7743582725524902,
"learning_rate": 4.5547073791348604e-05,
"loss": 0.854,
"step": 1610
},
{
"epoch": 0.2688350481247926,
"grad_norm": 2.7201788425445557,
"learning_rate": 4.5519415864586795e-05,
"loss": 0.8023,
"step": 1620
},
{
"epoch": 0.27049452373050115,
"grad_norm": 2.8764491081237793,
"learning_rate": 4.549175793782498e-05,
"loss": 0.8004,
"step": 1630
},
{
"epoch": 0.27215399933620976,
"grad_norm": 2.746384859085083,
"learning_rate": 4.546410001106317e-05,
"loss": 0.7538,
"step": 1640
},
{
"epoch": 0.2738134749419184,
"grad_norm": 3.606780529022217,
"learning_rate": 4.543644208430136e-05,
"loss": 0.796,
"step": 1650
},
{
"epoch": 0.27547295054762694,
"grad_norm": 2.4817562103271484,
"learning_rate": 4.540878415753955e-05,
"loss": 0.8017,
"step": 1660
},
{
"epoch": 0.27713242615333555,
"grad_norm": 3.016995668411255,
"learning_rate": 4.538112623077774e-05,
"loss": 0.833,
"step": 1670
},
{
"epoch": 0.27879190175904417,
"grad_norm": 2.847045421600342,
"learning_rate": 4.535346830401593e-05,
"loss": 0.7387,
"step": 1680
},
{
"epoch": 0.28045137736475273,
"grad_norm": 3.473771333694458,
"learning_rate": 4.532581037725412e-05,
"loss": 0.8778,
"step": 1690
},
{
"epoch": 0.28211085297046135,
"grad_norm": 3.311330795288086,
"learning_rate": 4.529815245049231e-05,
"loss": 0.7255,
"step": 1700
},
{
"epoch": 0.2837703285761699,
"grad_norm": 2.5803961753845215,
"learning_rate": 4.5270494523730503e-05,
"loss": 0.8126,
"step": 1710
},
{
"epoch": 0.2854298041818785,
"grad_norm": 3.2069246768951416,
"learning_rate": 4.5242836596968694e-05,
"loss": 0.7845,
"step": 1720
},
{
"epoch": 0.28708927978758714,
"grad_norm": 2.9690170288085938,
"learning_rate": 4.5215178670206885e-05,
"loss": 0.7635,
"step": 1730
},
{
"epoch": 0.2887487553932957,
"grad_norm": 2.6883442401885986,
"learning_rate": 4.5187520743445075e-05,
"loss": 0.7325,
"step": 1740
},
{
"epoch": 0.2904082309990043,
"grad_norm": 2.671856641769409,
"learning_rate": 4.5159862816683266e-05,
"loss": 0.7755,
"step": 1750
},
{
"epoch": 0.29206770660471293,
"grad_norm": 2.4875473976135254,
"learning_rate": 4.5132204889921457e-05,
"loss": 0.798,
"step": 1760
},
{
"epoch": 0.2937271822104215,
"grad_norm": 3.223682165145874,
"learning_rate": 4.510454696315965e-05,
"loss": 0.813,
"step": 1770
},
{
"epoch": 0.2953866578161301,
"grad_norm": 2.2691056728363037,
"learning_rate": 4.507688903639784e-05,
"loss": 0.805,
"step": 1780
},
{
"epoch": 0.2970461334218387,
"grad_norm": 3.5715551376342773,
"learning_rate": 4.504923110963603e-05,
"loss": 0.7586,
"step": 1790
},
{
"epoch": 0.2987056090275473,
"grad_norm": 2.625098466873169,
"learning_rate": 4.502157318287422e-05,
"loss": 0.8703,
"step": 1800
},
{
"epoch": 0.3003650846332559,
"grad_norm": 3.1126928329467773,
"learning_rate": 4.499391525611241e-05,
"loss": 0.7879,
"step": 1810
},
{
"epoch": 0.3020245602389645,
"grad_norm": 3.1483154296875,
"learning_rate": 4.4966257329350593e-05,
"loss": 0.7849,
"step": 1820
},
{
"epoch": 0.3036840358446731,
"grad_norm": 2.979381799697876,
"learning_rate": 4.4938599402588784e-05,
"loss": 0.7649,
"step": 1830
},
{
"epoch": 0.3053435114503817,
"grad_norm": 3.920473098754883,
"learning_rate": 4.4910941475826975e-05,
"loss": 0.7568,
"step": 1840
},
{
"epoch": 0.30700298705609025,
"grad_norm": 2.9425113201141357,
"learning_rate": 4.4883283549065165e-05,
"loss": 0.7519,
"step": 1850
},
{
"epoch": 0.30866246266179886,
"grad_norm": 3.8821983337402344,
"learning_rate": 4.4855625622303356e-05,
"loss": 0.8094,
"step": 1860
},
{
"epoch": 0.3103219382675075,
"grad_norm": 2.6359121799468994,
"learning_rate": 4.4827967695541547e-05,
"loss": 0.8354,
"step": 1870
},
{
"epoch": 0.31198141387321604,
"grad_norm": 2.5459086894989014,
"learning_rate": 4.480030976877974e-05,
"loss": 0.7459,
"step": 1880
},
{
"epoch": 0.31364088947892466,
"grad_norm": 2.7496984004974365,
"learning_rate": 4.477265184201793e-05,
"loss": 0.7553,
"step": 1890
},
{
"epoch": 0.31530036508463327,
"grad_norm": 6.382673740386963,
"learning_rate": 4.474499391525612e-05,
"loss": 0.8221,
"step": 1900
},
{
"epoch": 0.31695984069034183,
"grad_norm": 2.9733335971832275,
"learning_rate": 4.47173359884943e-05,
"loss": 0.8175,
"step": 1910
},
{
"epoch": 0.31861931629605045,
"grad_norm": 2.1865618228912354,
"learning_rate": 4.468967806173249e-05,
"loss": 0.7811,
"step": 1920
},
{
"epoch": 0.32027879190175906,
"grad_norm": 6.8357648849487305,
"learning_rate": 4.4662020134970684e-05,
"loss": 0.6994,
"step": 1930
},
{
"epoch": 0.3219382675074676,
"grad_norm": 2.662757635116577,
"learning_rate": 4.4634362208208874e-05,
"loss": 0.7632,
"step": 1940
},
{
"epoch": 0.32359774311317624,
"grad_norm": 3.145087480545044,
"learning_rate": 4.4606704281447065e-05,
"loss": 0.8453,
"step": 1950
},
{
"epoch": 0.32525721871888486,
"grad_norm": 3.6768178939819336,
"learning_rate": 4.4579046354685255e-05,
"loss": 0.7155,
"step": 1960
},
{
"epoch": 0.3269166943245934,
"grad_norm": 3.166222333908081,
"learning_rate": 4.4551388427923446e-05,
"loss": 0.7469,
"step": 1970
},
{
"epoch": 0.32857616993030203,
"grad_norm": 2.641578197479248,
"learning_rate": 4.452373050116164e-05,
"loss": 0.8268,
"step": 1980
},
{
"epoch": 0.33023564553601065,
"grad_norm": 3.3563296794891357,
"learning_rate": 4.449607257439983e-05,
"loss": 0.7641,
"step": 1990
},
{
"epoch": 0.3318951211417192,
"grad_norm": 3.222212076187134,
"learning_rate": 4.446841464763801e-05,
"loss": 0.8712,
"step": 2000
},
{
"epoch": 0.3318951211417192,
"eval_gen_len": 47.415361445783134,
"eval_loss": 0.6793270707130432,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1404.8246,
"eval_samples_per_second": 4.718,
"eval_steps_per_second": 0.295,
"step": 2000
},
{
"epoch": 0.3335545967474278,
"grad_norm": 3.183871030807495,
"learning_rate": 4.44407567208762e-05,
"loss": 0.854,
"step": 2010
},
{
"epoch": 0.3352140723531364,
"grad_norm": 3.283364772796631,
"learning_rate": 4.441309879411439e-05,
"loss": 0.7531,
"step": 2020
},
{
"epoch": 0.336873547958845,
"grad_norm": 3.249227523803711,
"learning_rate": 4.438544086735258e-05,
"loss": 0.7456,
"step": 2030
},
{
"epoch": 0.3385330235645536,
"grad_norm": 2.5303399562835693,
"learning_rate": 4.4357782940590774e-05,
"loss": 0.8303,
"step": 2040
},
{
"epoch": 0.3401924991702622,
"grad_norm": 3.1236414909362793,
"learning_rate": 4.4330125013828964e-05,
"loss": 0.7733,
"step": 2050
},
{
"epoch": 0.3418519747759708,
"grad_norm": 3.557269811630249,
"learning_rate": 4.4302467087067155e-05,
"loss": 0.7385,
"step": 2060
},
{
"epoch": 0.3435114503816794,
"grad_norm": 3.093048334121704,
"learning_rate": 4.4274809160305345e-05,
"loss": 0.7486,
"step": 2070
},
{
"epoch": 0.34517092598738797,
"grad_norm": 3.6354196071624756,
"learning_rate": 4.4247151233543536e-05,
"loss": 0.8101,
"step": 2080
},
{
"epoch": 0.3468304015930966,
"grad_norm": 3.0970399379730225,
"learning_rate": 4.421949330678173e-05,
"loss": 0.7527,
"step": 2090
},
{
"epoch": 0.3484898771988052,
"grad_norm": 2.604701280593872,
"learning_rate": 4.419183538001991e-05,
"loss": 0.8143,
"step": 2100
},
{
"epoch": 0.35014935280451376,
"grad_norm": 4.05767822265625,
"learning_rate": 4.41641774532581e-05,
"loss": 0.8129,
"step": 2110
},
{
"epoch": 0.3518088284102224,
"grad_norm": 3.9038188457489014,
"learning_rate": 4.413651952649629e-05,
"loss": 0.7824,
"step": 2120
},
{
"epoch": 0.353468304015931,
"grad_norm": 2.652456760406494,
"learning_rate": 4.410886159973448e-05,
"loss": 0.7678,
"step": 2130
},
{
"epoch": 0.35512777962163955,
"grad_norm": 2.766669511795044,
"learning_rate": 4.408120367297267e-05,
"loss": 0.7936,
"step": 2140
},
{
"epoch": 0.35678725522734817,
"grad_norm": 3.649751663208008,
"learning_rate": 4.4053545746210864e-05,
"loss": 0.7205,
"step": 2150
},
{
"epoch": 0.3584467308330568,
"grad_norm": 2.6961166858673096,
"learning_rate": 4.4025887819449054e-05,
"loss": 0.8331,
"step": 2160
},
{
"epoch": 0.36010620643876534,
"grad_norm": 3.3244972229003906,
"learning_rate": 4.399822989268725e-05,
"loss": 0.8189,
"step": 2170
},
{
"epoch": 0.36176568204447396,
"grad_norm": 3.269043445587158,
"learning_rate": 4.397057196592544e-05,
"loss": 0.8237,
"step": 2180
},
{
"epoch": 0.3634251576501825,
"grad_norm": 3.3199679851531982,
"learning_rate": 4.3942914039163626e-05,
"loss": 0.7835,
"step": 2190
},
{
"epoch": 0.36508463325589113,
"grad_norm": 3.377162218093872,
"learning_rate": 4.391525611240182e-05,
"loss": 0.7065,
"step": 2200
},
{
"epoch": 0.36674410886159975,
"grad_norm": 2.6638193130493164,
"learning_rate": 4.388759818564001e-05,
"loss": 0.7611,
"step": 2210
},
{
"epoch": 0.3684035844673083,
"grad_norm": 2.741482734680176,
"learning_rate": 4.38599402588782e-05,
"loss": 0.7971,
"step": 2220
},
{
"epoch": 0.3700630600730169,
"grad_norm": 4.292590618133545,
"learning_rate": 4.383228233211639e-05,
"loss": 0.735,
"step": 2230
},
{
"epoch": 0.37172253567872554,
"grad_norm": 2.7163352966308594,
"learning_rate": 4.380462440535458e-05,
"loss": 0.742,
"step": 2240
},
{
"epoch": 0.3733820112844341,
"grad_norm": 2.661367654800415,
"learning_rate": 4.377696647859277e-05,
"loss": 0.7493,
"step": 2250
},
{
"epoch": 0.3750414868901427,
"grad_norm": 3.442807674407959,
"learning_rate": 4.374930855183096e-05,
"loss": 0.7258,
"step": 2260
},
{
"epoch": 0.37670096249585133,
"grad_norm": 3.017528772354126,
"learning_rate": 4.372165062506915e-05,
"loss": 0.7717,
"step": 2270
},
{
"epoch": 0.3783604381015599,
"grad_norm": 3.1746342182159424,
"learning_rate": 4.3693992698307335e-05,
"loss": 0.7912,
"step": 2280
},
{
"epoch": 0.3800199137072685,
"grad_norm": 2.567218780517578,
"learning_rate": 4.3666334771545526e-05,
"loss": 0.7329,
"step": 2290
},
{
"epoch": 0.3816793893129771,
"grad_norm": 3.3965744972229004,
"learning_rate": 4.3638676844783716e-05,
"loss": 0.8182,
"step": 2300
},
{
"epoch": 0.3833388649186857,
"grad_norm": 2.220444440841675,
"learning_rate": 4.361101891802191e-05,
"loss": 0.697,
"step": 2310
},
{
"epoch": 0.3849983405243943,
"grad_norm": 2.949594259262085,
"learning_rate": 4.35833609912601e-05,
"loss": 0.8063,
"step": 2320
},
{
"epoch": 0.38665781613010286,
"grad_norm": 3.4351999759674072,
"learning_rate": 4.355570306449829e-05,
"loss": 0.8039,
"step": 2330
},
{
"epoch": 0.3883172917358115,
"grad_norm": 3.2207376956939697,
"learning_rate": 4.352804513773648e-05,
"loss": 0.8298,
"step": 2340
},
{
"epoch": 0.3899767673415201,
"grad_norm": 2.397782802581787,
"learning_rate": 4.350038721097467e-05,
"loss": 0.6659,
"step": 2350
},
{
"epoch": 0.39163624294722865,
"grad_norm": 2.6824631690979004,
"learning_rate": 4.347272928421286e-05,
"loss": 0.7375,
"step": 2360
},
{
"epoch": 0.39329571855293727,
"grad_norm": 2.6002721786499023,
"learning_rate": 4.3445071357451044e-05,
"loss": 0.7758,
"step": 2370
},
{
"epoch": 0.3949551941586459,
"grad_norm": 3.270160675048828,
"learning_rate": 4.3417413430689234e-05,
"loss": 0.8297,
"step": 2380
},
{
"epoch": 0.39661466976435444,
"grad_norm": 3.2965505123138428,
"learning_rate": 4.3389755503927425e-05,
"loss": 0.76,
"step": 2390
},
{
"epoch": 0.39827414537006306,
"grad_norm": 3.4657270908355713,
"learning_rate": 4.3362097577165616e-05,
"loss": 0.7997,
"step": 2400
},
{
"epoch": 0.3999336209757717,
"grad_norm": 2.213045120239258,
"learning_rate": 4.3334439650403806e-05,
"loss": 0.7271,
"step": 2410
},
{
"epoch": 0.40159309658148024,
"grad_norm": 2.360948085784912,
"learning_rate": 4.3306781723642e-05,
"loss": 0.7122,
"step": 2420
},
{
"epoch": 0.40325257218718885,
"grad_norm": 2.8001227378845215,
"learning_rate": 4.327912379688019e-05,
"loss": 0.8604,
"step": 2430
},
{
"epoch": 0.40491204779289747,
"grad_norm": 2.840575933456421,
"learning_rate": 4.325146587011838e-05,
"loss": 0.7454,
"step": 2440
},
{
"epoch": 0.406571523398606,
"grad_norm": 3.021378993988037,
"learning_rate": 4.322380794335657e-05,
"loss": 0.7479,
"step": 2450
},
{
"epoch": 0.40823099900431464,
"grad_norm": 3.7414467334747314,
"learning_rate": 4.319615001659476e-05,
"loss": 0.7676,
"step": 2460
},
{
"epoch": 0.40989047461002326,
"grad_norm": 3.384713888168335,
"learning_rate": 4.316849208983294e-05,
"loss": 0.7451,
"step": 2470
},
{
"epoch": 0.4115499502157318,
"grad_norm": 3.215459108352661,
"learning_rate": 4.3140834163071134e-05,
"loss": 0.7401,
"step": 2480
},
{
"epoch": 0.41320942582144043,
"grad_norm": 4.62844705581665,
"learning_rate": 4.3113176236309324e-05,
"loss": 0.8015,
"step": 2490
},
{
"epoch": 0.414868901427149,
"grad_norm": 2.7699246406555176,
"learning_rate": 4.3085518309547515e-05,
"loss": 0.7665,
"step": 2500
},
{
"epoch": 0.4165283770328576,
"grad_norm": 3.4257094860076904,
"learning_rate": 4.3057860382785706e-05,
"loss": 0.8242,
"step": 2510
},
{
"epoch": 0.4181878526385662,
"grad_norm": 2.566210985183716,
"learning_rate": 4.3030202456023896e-05,
"loss": 0.8138,
"step": 2520
},
{
"epoch": 0.4198473282442748,
"grad_norm": 2.644387722015381,
"learning_rate": 4.300254452926209e-05,
"loss": 0.7472,
"step": 2530
},
{
"epoch": 0.4215068038499834,
"grad_norm": 2.5530991554260254,
"learning_rate": 4.297488660250028e-05,
"loss": 0.8139,
"step": 2540
},
{
"epoch": 0.423166279455692,
"grad_norm": 3.8945424556732178,
"learning_rate": 4.294722867573847e-05,
"loss": 0.8397,
"step": 2550
},
{
"epoch": 0.4248257550614006,
"grad_norm": 2.5263960361480713,
"learning_rate": 4.291957074897666e-05,
"loss": 0.7298,
"step": 2560
},
{
"epoch": 0.4264852306671092,
"grad_norm": 2.987938404083252,
"learning_rate": 4.289191282221485e-05,
"loss": 0.7347,
"step": 2570
},
{
"epoch": 0.4281447062728178,
"grad_norm": 2.844803810119629,
"learning_rate": 4.286425489545304e-05,
"loss": 0.7533,
"step": 2580
},
{
"epoch": 0.42980418187852637,
"grad_norm": 2.995476245880127,
"learning_rate": 4.283659696869123e-05,
"loss": 0.847,
"step": 2590
},
{
"epoch": 0.431463657484235,
"grad_norm": 2.791422128677368,
"learning_rate": 4.280893904192942e-05,
"loss": 0.7767,
"step": 2600
},
{
"epoch": 0.4331231330899436,
"grad_norm": 2.7166290283203125,
"learning_rate": 4.278128111516761e-05,
"loss": 0.7942,
"step": 2610
},
{
"epoch": 0.43478260869565216,
"grad_norm": 2.4218597412109375,
"learning_rate": 4.27536231884058e-05,
"loss": 0.7281,
"step": 2620
},
{
"epoch": 0.4364420843013608,
"grad_norm": 2.9908969402313232,
"learning_rate": 4.272596526164399e-05,
"loss": 0.7485,
"step": 2630
},
{
"epoch": 0.4381015599070694,
"grad_norm": 2.891364336013794,
"learning_rate": 4.2698307334882184e-05,
"loss": 0.7798,
"step": 2640
},
{
"epoch": 0.43976103551277795,
"grad_norm": 2.570340394973755,
"learning_rate": 4.267064940812037e-05,
"loss": 0.7695,
"step": 2650
},
{
"epoch": 0.44142051111848657,
"grad_norm": 3.531270742416382,
"learning_rate": 4.264299148135856e-05,
"loss": 0.7381,
"step": 2660
},
{
"epoch": 0.44307998672419513,
"grad_norm": 3.6200950145721436,
"learning_rate": 4.261533355459675e-05,
"loss": 0.8237,
"step": 2670
},
{
"epoch": 0.44473946232990375,
"grad_norm": 2.875049352645874,
"learning_rate": 4.258767562783494e-05,
"loss": 0.7549,
"step": 2680
},
{
"epoch": 0.44639893793561236,
"grad_norm": 2.9184587001800537,
"learning_rate": 4.256001770107313e-05,
"loss": 0.74,
"step": 2690
},
{
"epoch": 0.4480584135413209,
"grad_norm": 3.756166458129883,
"learning_rate": 4.253235977431132e-05,
"loss": 0.7764,
"step": 2700
},
{
"epoch": 0.44971788914702954,
"grad_norm": 2.6574513912200928,
"learning_rate": 4.250470184754951e-05,
"loss": 0.6868,
"step": 2710
},
{
"epoch": 0.45137736475273815,
"grad_norm": 2.8833422660827637,
"learning_rate": 4.24770439207877e-05,
"loss": 0.7902,
"step": 2720
},
{
"epoch": 0.4530368403584467,
"grad_norm": 3.441880464553833,
"learning_rate": 4.244938599402589e-05,
"loss": 0.7375,
"step": 2730
},
{
"epoch": 0.45469631596415533,
"grad_norm": 2.796851396560669,
"learning_rate": 4.242172806726408e-05,
"loss": 0.7475,
"step": 2740
},
{
"epoch": 0.45635579156986394,
"grad_norm": 3.374749183654785,
"learning_rate": 4.239407014050227e-05,
"loss": 0.7325,
"step": 2750
},
{
"epoch": 0.4580152671755725,
"grad_norm": 3.025646209716797,
"learning_rate": 4.236641221374046e-05,
"loss": 0.7562,
"step": 2760
},
{
"epoch": 0.4596747427812811,
"grad_norm": 3.104525566101074,
"learning_rate": 4.233875428697865e-05,
"loss": 0.7907,
"step": 2770
},
{
"epoch": 0.46133421838698974,
"grad_norm": 3.566995143890381,
"learning_rate": 4.231109636021684e-05,
"loss": 0.714,
"step": 2780
},
{
"epoch": 0.4629936939926983,
"grad_norm": 2.225813150405884,
"learning_rate": 4.228343843345503e-05,
"loss": 0.7958,
"step": 2790
},
{
"epoch": 0.4646531695984069,
"grad_norm": 5.6908440589904785,
"learning_rate": 4.225578050669322e-05,
"loss": 0.8055,
"step": 2800
},
{
"epoch": 0.4663126452041155,
"grad_norm": 3.8612444400787354,
"learning_rate": 4.222812257993141e-05,
"loss": 0.772,
"step": 2810
},
{
"epoch": 0.4679721208098241,
"grad_norm": 2.7307820320129395,
"learning_rate": 4.22004646531696e-05,
"loss": 0.7752,
"step": 2820
},
{
"epoch": 0.4696315964155327,
"grad_norm": 2.845541477203369,
"learning_rate": 4.217280672640779e-05,
"loss": 0.7597,
"step": 2830
},
{
"epoch": 0.47129107202124126,
"grad_norm": 2.981630563735962,
"learning_rate": 4.2145148799645976e-05,
"loss": 0.7566,
"step": 2840
},
{
"epoch": 0.4729505476269499,
"grad_norm": 2.511880397796631,
"learning_rate": 4.2117490872884166e-05,
"loss": 0.7161,
"step": 2850
},
{
"epoch": 0.4746100232326585,
"grad_norm": 3.225386142730713,
"learning_rate": 4.208983294612236e-05,
"loss": 0.791,
"step": 2860
},
{
"epoch": 0.47626949883836706,
"grad_norm": 3.897096633911133,
"learning_rate": 4.206217501936055e-05,
"loss": 0.7072,
"step": 2870
},
{
"epoch": 0.47792897444407567,
"grad_norm": 2.640658140182495,
"learning_rate": 4.203451709259874e-05,
"loss": 0.7708,
"step": 2880
},
{
"epoch": 0.4795884500497843,
"grad_norm": 2.9083096981048584,
"learning_rate": 4.200685916583693e-05,
"loss": 0.7547,
"step": 2890
},
{
"epoch": 0.48124792565549285,
"grad_norm": 2.8646459579467773,
"learning_rate": 4.197920123907512e-05,
"loss": 0.766,
"step": 2900
},
{
"epoch": 0.48290740126120146,
"grad_norm": 2.6047298908233643,
"learning_rate": 4.195154331231331e-05,
"loss": 0.7522,
"step": 2910
},
{
"epoch": 0.4845668768669101,
"grad_norm": 3.0007834434509277,
"learning_rate": 4.19238853855515e-05,
"loss": 0.7766,
"step": 2920
},
{
"epoch": 0.48622635247261864,
"grad_norm": 3.063333511352539,
"learning_rate": 4.189622745878969e-05,
"loss": 0.8042,
"step": 2930
},
{
"epoch": 0.48788582807832725,
"grad_norm": 2.5916037559509277,
"learning_rate": 4.186856953202788e-05,
"loss": 0.7362,
"step": 2940
},
{
"epoch": 0.48954530368403587,
"grad_norm": 4.322835922241211,
"learning_rate": 4.184091160526607e-05,
"loss": 0.7463,
"step": 2950
},
{
"epoch": 0.49120477928974443,
"grad_norm": 2.634681463241577,
"learning_rate": 4.181325367850426e-05,
"loss": 0.755,
"step": 2960
},
{
"epoch": 0.49286425489545305,
"grad_norm": 2.652538776397705,
"learning_rate": 4.1785595751742454e-05,
"loss": 0.7799,
"step": 2970
},
{
"epoch": 0.4945237305011616,
"grad_norm": 3.1968343257904053,
"learning_rate": 4.1757937824980644e-05,
"loss": 0.7158,
"step": 2980
},
{
"epoch": 0.4961832061068702,
"grad_norm": 3.14144229888916,
"learning_rate": 4.1730279898218835e-05,
"loss": 0.7382,
"step": 2990
},
{
"epoch": 0.49784268171257884,
"grad_norm": 3.0171921253204346,
"learning_rate": 4.1702621971457026e-05,
"loss": 0.7521,
"step": 3000
},
{
"epoch": 0.49784268171257884,
"eval_gen_len": 41.39789156626506,
"eval_loss": 0.6684303283691406,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1316.8012,
"eval_samples_per_second": 5.033,
"eval_steps_per_second": 0.315,
"step": 3000
},
{
"epoch": 0.4995021573182874,
"grad_norm": 2.768636465072632,
"learning_rate": 4.1674964044695216e-05,
"loss": 0.7066,
"step": 3010
},
{
"epoch": 0.5011616329239961,
"grad_norm": 2.3734288215637207,
"learning_rate": 4.16473061179334e-05,
"loss": 0.7546,
"step": 3020
},
{
"epoch": 0.5028211085297046,
"grad_norm": 2.6482138633728027,
"learning_rate": 4.161964819117159e-05,
"loss": 0.7635,
"step": 3030
},
{
"epoch": 0.5044805841354132,
"grad_norm": 3.832292079925537,
"learning_rate": 4.159199026440978e-05,
"loss": 0.8326,
"step": 3040
},
{
"epoch": 0.5061400597411218,
"grad_norm": 2.807021379470825,
"learning_rate": 4.156433233764797e-05,
"loss": 0.7692,
"step": 3050
},
{
"epoch": 0.5077995353468304,
"grad_norm": 3.430129289627075,
"learning_rate": 4.153667441088616e-05,
"loss": 0.7774,
"step": 3060
},
{
"epoch": 0.509459010952539,
"grad_norm": 2.7762088775634766,
"learning_rate": 4.150901648412435e-05,
"loss": 0.7413,
"step": 3070
},
{
"epoch": 0.5111184865582475,
"grad_norm": 2.8153493404388428,
"learning_rate": 4.1481358557362544e-05,
"loss": 0.8242,
"step": 3080
},
{
"epoch": 0.5127779621639562,
"grad_norm": 2.6910579204559326,
"learning_rate": 4.1453700630600734e-05,
"loss": 0.8018,
"step": 3090
},
{
"epoch": 0.5144374377696648,
"grad_norm": 3.160053014755249,
"learning_rate": 4.1426042703838925e-05,
"loss": 0.8186,
"step": 3100
},
{
"epoch": 0.5160969133753733,
"grad_norm": 2.774655818939209,
"learning_rate": 4.1398384777077116e-05,
"loss": 0.7863,
"step": 3110
},
{
"epoch": 0.517756388981082,
"grad_norm": 3.7984707355499268,
"learning_rate": 4.13707268503153e-05,
"loss": 0.7761,
"step": 3120
},
{
"epoch": 0.5194158645867906,
"grad_norm": 2.766265869140625,
"learning_rate": 4.134306892355349e-05,
"loss": 0.8273,
"step": 3130
},
{
"epoch": 0.5210753401924991,
"grad_norm": 3.027769088745117,
"learning_rate": 4.131541099679168e-05,
"loss": 0.6994,
"step": 3140
},
{
"epoch": 0.5227348157982078,
"grad_norm": 3.2986860275268555,
"learning_rate": 4.128775307002987e-05,
"loss": 0.7505,
"step": 3150
},
{
"epoch": 0.5243942914039164,
"grad_norm": 3.352910041809082,
"learning_rate": 4.126009514326806e-05,
"loss": 0.8348,
"step": 3160
},
{
"epoch": 0.5260537670096249,
"grad_norm": 3.2695400714874268,
"learning_rate": 4.123243721650625e-05,
"loss": 0.7393,
"step": 3170
},
{
"epoch": 0.5277132426153336,
"grad_norm": 2.6088485717773438,
"learning_rate": 4.120477928974444e-05,
"loss": 0.7835,
"step": 3180
},
{
"epoch": 0.5293727182210421,
"grad_norm": 2.8191120624542236,
"learning_rate": 4.1177121362982634e-05,
"loss": 0.7469,
"step": 3190
},
{
"epoch": 0.5310321938267507,
"grad_norm": 6.382346153259277,
"learning_rate": 4.1149463436220824e-05,
"loss": 0.781,
"step": 3200
},
{
"epoch": 0.5326916694324594,
"grad_norm": 3.124753713607788,
"learning_rate": 4.112180550945901e-05,
"loss": 0.7763,
"step": 3210
},
{
"epoch": 0.5343511450381679,
"grad_norm": 3.3809783458709717,
"learning_rate": 4.10941475826972e-05,
"loss": 0.7479,
"step": 3220
},
{
"epoch": 0.5360106206438765,
"grad_norm": 3.1917128562927246,
"learning_rate": 4.106648965593539e-05,
"loss": 0.745,
"step": 3230
},
{
"epoch": 0.5376700962495852,
"grad_norm": 2.9804718494415283,
"learning_rate": 4.103883172917358e-05,
"loss": 0.8186,
"step": 3240
},
{
"epoch": 0.5393295718552937,
"grad_norm": 2.429513692855835,
"learning_rate": 4.101117380241177e-05,
"loss": 0.7253,
"step": 3250
},
{
"epoch": 0.5409890474610023,
"grad_norm": 3.4630186557769775,
"learning_rate": 4.098351587564996e-05,
"loss": 0.8013,
"step": 3260
},
{
"epoch": 0.542648523066711,
"grad_norm": 3.3614048957824707,
"learning_rate": 4.095585794888815e-05,
"loss": 0.7448,
"step": 3270
},
{
"epoch": 0.5443079986724195,
"grad_norm": 3.4482579231262207,
"learning_rate": 4.092820002212634e-05,
"loss": 0.7889,
"step": 3280
},
{
"epoch": 0.5459674742781281,
"grad_norm": 3.538914442062378,
"learning_rate": 4.090054209536453e-05,
"loss": 0.7433,
"step": 3290
},
{
"epoch": 0.5476269498838368,
"grad_norm": 2.6519882678985596,
"learning_rate": 4.0872884168602724e-05,
"loss": 0.7425,
"step": 3300
},
{
"epoch": 0.5492864254895453,
"grad_norm": 3.908871650695801,
"learning_rate": 4.0845226241840914e-05,
"loss": 0.7687,
"step": 3310
},
{
"epoch": 0.5509459010952539,
"grad_norm": 2.4761886596679688,
"learning_rate": 4.0817568315079105e-05,
"loss": 0.7559,
"step": 3320
},
{
"epoch": 0.5526053767009625,
"grad_norm": 2.936110496520996,
"learning_rate": 4.0789910388317296e-05,
"loss": 0.7407,
"step": 3330
},
{
"epoch": 0.5542648523066711,
"grad_norm": 2.414314031600952,
"learning_rate": 4.0762252461555486e-05,
"loss": 0.7345,
"step": 3340
},
{
"epoch": 0.5559243279123797,
"grad_norm": 2.5228681564331055,
"learning_rate": 4.073459453479368e-05,
"loss": 0.7433,
"step": 3350
},
{
"epoch": 0.5575838035180883,
"grad_norm": 2.4649956226348877,
"learning_rate": 4.070693660803187e-05,
"loss": 0.7801,
"step": 3360
},
{
"epoch": 0.5592432791237969,
"grad_norm": 3.223370313644409,
"learning_rate": 4.067927868127006e-05,
"loss": 0.6799,
"step": 3370
},
{
"epoch": 0.5609027547295055,
"grad_norm": 3.04349422454834,
"learning_rate": 4.065162075450825e-05,
"loss": 0.7712,
"step": 3380
},
{
"epoch": 0.562562230335214,
"grad_norm": 3.191512107849121,
"learning_rate": 4.062396282774644e-05,
"loss": 0.8188,
"step": 3390
},
{
"epoch": 0.5642217059409227,
"grad_norm": 2.470961332321167,
"learning_rate": 4.059630490098462e-05,
"loss": 0.7609,
"step": 3400
},
{
"epoch": 0.5658811815466313,
"grad_norm": 3.0786991119384766,
"learning_rate": 4.0568646974222814e-05,
"loss": 0.8628,
"step": 3410
},
{
"epoch": 0.5675406571523398,
"grad_norm": 3.9537196159362793,
"learning_rate": 4.0540989047461005e-05,
"loss": 0.7777,
"step": 3420
},
{
"epoch": 0.5692001327580485,
"grad_norm": 3.1566216945648193,
"learning_rate": 4.0513331120699195e-05,
"loss": 0.819,
"step": 3430
},
{
"epoch": 0.570859608363757,
"grad_norm": 3.2270612716674805,
"learning_rate": 4.0485673193937386e-05,
"loss": 0.7715,
"step": 3440
},
{
"epoch": 0.5725190839694656,
"grad_norm": 3.0721094608306885,
"learning_rate": 4.0458015267175576e-05,
"loss": 0.7747,
"step": 3450
},
{
"epoch": 0.5741785595751743,
"grad_norm": 3.0017573833465576,
"learning_rate": 4.043035734041377e-05,
"loss": 0.6807,
"step": 3460
},
{
"epoch": 0.5758380351808828,
"grad_norm": 3.0706708431243896,
"learning_rate": 4.040269941365196e-05,
"loss": 0.8453,
"step": 3470
},
{
"epoch": 0.5774975107865914,
"grad_norm": 2.895575761795044,
"learning_rate": 4.037504148689015e-05,
"loss": 0.8199,
"step": 3480
},
{
"epoch": 0.5791569863923001,
"grad_norm": 2.690824270248413,
"learning_rate": 4.034738356012833e-05,
"loss": 0.7427,
"step": 3490
},
{
"epoch": 0.5808164619980086,
"grad_norm": 3.1231939792633057,
"learning_rate": 4.031972563336652e-05,
"loss": 0.7237,
"step": 3500
},
{
"epoch": 0.5824759376037172,
"grad_norm": 3.78774356842041,
"learning_rate": 4.029206770660471e-05,
"loss": 0.7579,
"step": 3510
},
{
"epoch": 0.5841354132094259,
"grad_norm": 3.0420353412628174,
"learning_rate": 4.0264409779842904e-05,
"loss": 0.7719,
"step": 3520
},
{
"epoch": 0.5857948888151344,
"grad_norm": 2.9405903816223145,
"learning_rate": 4.0236751853081095e-05,
"loss": 0.7119,
"step": 3530
},
{
"epoch": 0.587454364420843,
"grad_norm": 3.3033666610717773,
"learning_rate": 4.0209093926319285e-05,
"loss": 0.7153,
"step": 3540
},
{
"epoch": 0.5891138400265516,
"grad_norm": 2.730015993118286,
"learning_rate": 4.0181435999557476e-05,
"loss": 0.8199,
"step": 3550
},
{
"epoch": 0.5907733156322602,
"grad_norm": 4.007871150970459,
"learning_rate": 4.0153778072795666e-05,
"loss": 0.7864,
"step": 3560
},
{
"epoch": 0.5924327912379688,
"grad_norm": 5.673678398132324,
"learning_rate": 4.012612014603386e-05,
"loss": 0.7674,
"step": 3570
},
{
"epoch": 0.5940922668436774,
"grad_norm": 3.6139519214630127,
"learning_rate": 4.009846221927204e-05,
"loss": 0.7064,
"step": 3580
},
{
"epoch": 0.595751742449386,
"grad_norm": 2.455223321914673,
"learning_rate": 4.007080429251023e-05,
"loss": 0.7463,
"step": 3590
},
{
"epoch": 0.5974112180550946,
"grad_norm": 2.7342236042022705,
"learning_rate": 4.004314636574842e-05,
"loss": 0.702,
"step": 3600
},
{
"epoch": 0.5990706936608032,
"grad_norm": 3.3068127632141113,
"learning_rate": 4.001548843898661e-05,
"loss": 0.7635,
"step": 3610
},
{
"epoch": 0.6007301692665118,
"grad_norm": 3.509694814682007,
"learning_rate": 3.99878305122248e-05,
"loss": 0.6797,
"step": 3620
},
{
"epoch": 0.6023896448722204,
"grad_norm": 3.3854920864105225,
"learning_rate": 3.9960172585462994e-05,
"loss": 0.7737,
"step": 3630
},
{
"epoch": 0.604049120477929,
"grad_norm": 3.1327128410339355,
"learning_rate": 3.9932514658701185e-05,
"loss": 0.7498,
"step": 3640
},
{
"epoch": 0.6057085960836376,
"grad_norm": 2.878110408782959,
"learning_rate": 3.9904856731939375e-05,
"loss": 0.7231,
"step": 3650
},
{
"epoch": 0.6073680716893461,
"grad_norm": 2.6065754890441895,
"learning_rate": 3.9877198805177566e-05,
"loss": 0.7698,
"step": 3660
},
{
"epoch": 0.6090275472950548,
"grad_norm": 4.012957572937012,
"learning_rate": 3.9849540878415756e-05,
"loss": 0.7458,
"step": 3670
},
{
"epoch": 0.6106870229007634,
"grad_norm": 4.345855712890625,
"learning_rate": 3.982188295165395e-05,
"loss": 0.7561,
"step": 3680
},
{
"epoch": 0.6123464985064719,
"grad_norm": 3.0930395126342773,
"learning_rate": 3.979422502489214e-05,
"loss": 0.811,
"step": 3690
},
{
"epoch": 0.6140059741121805,
"grad_norm": 2.519012212753296,
"learning_rate": 3.976656709813033e-05,
"loss": 0.7081,
"step": 3700
},
{
"epoch": 0.6156654497178892,
"grad_norm": 3.1722018718719482,
"learning_rate": 3.973890917136852e-05,
"loss": 0.7695,
"step": 3710
},
{
"epoch": 0.6173249253235977,
"grad_norm": 2.899458885192871,
"learning_rate": 3.971125124460671e-05,
"loss": 0.7741,
"step": 3720
},
{
"epoch": 0.6189844009293063,
"grad_norm": 2.858637809753418,
"learning_rate": 3.96835933178449e-05,
"loss": 0.7594,
"step": 3730
},
{
"epoch": 0.620643876535015,
"grad_norm": 3.2127928733825684,
"learning_rate": 3.965593539108309e-05,
"loss": 0.7823,
"step": 3740
},
{
"epoch": 0.6223033521407235,
"grad_norm": 2.691950798034668,
"learning_rate": 3.962827746432128e-05,
"loss": 0.7618,
"step": 3750
},
{
"epoch": 0.6239628277464321,
"grad_norm": 2.7668871879577637,
"learning_rate": 3.960061953755947e-05,
"loss": 0.7169,
"step": 3760
},
{
"epoch": 0.6256223033521408,
"grad_norm": 3.0835137367248535,
"learning_rate": 3.9572961610797656e-05,
"loss": 0.7301,
"step": 3770
},
{
"epoch": 0.6272817789578493,
"grad_norm": 2.561507225036621,
"learning_rate": 3.9545303684035846e-05,
"loss": 0.7917,
"step": 3780
},
{
"epoch": 0.6289412545635579,
"grad_norm": 3.317300796508789,
"learning_rate": 3.951764575727404e-05,
"loss": 0.7741,
"step": 3790
},
{
"epoch": 0.6306007301692665,
"grad_norm": 2.73250412940979,
"learning_rate": 3.948998783051223e-05,
"loss": 0.7402,
"step": 3800
},
{
"epoch": 0.6322602057749751,
"grad_norm": 3.5862343311309814,
"learning_rate": 3.946232990375042e-05,
"loss": 0.7247,
"step": 3810
},
{
"epoch": 0.6339196813806837,
"grad_norm": 3.3748645782470703,
"learning_rate": 3.943467197698861e-05,
"loss": 0.7192,
"step": 3820
},
{
"epoch": 0.6355791569863923,
"grad_norm": 3.1063249111175537,
"learning_rate": 3.94070140502268e-05,
"loss": 0.7557,
"step": 3830
},
{
"epoch": 0.6372386325921009,
"grad_norm": 2.6087396144866943,
"learning_rate": 3.937935612346499e-05,
"loss": 0.7523,
"step": 3840
},
{
"epoch": 0.6388981081978095,
"grad_norm": 2.9586808681488037,
"learning_rate": 3.935169819670318e-05,
"loss": 0.7762,
"step": 3850
},
{
"epoch": 0.6405575838035181,
"grad_norm": 2.84586501121521,
"learning_rate": 3.9324040269941365e-05,
"loss": 0.722,
"step": 3860
},
{
"epoch": 0.6422170594092267,
"grad_norm": 3.2668354511260986,
"learning_rate": 3.9296382343179555e-05,
"loss": 0.7091,
"step": 3870
},
{
"epoch": 0.6438765350149352,
"grad_norm": 2.432651996612549,
"learning_rate": 3.9268724416417746e-05,
"loss": 0.761,
"step": 3880
},
{
"epoch": 0.6455360106206439,
"grad_norm": 2.915759801864624,
"learning_rate": 3.9241066489655937e-05,
"loss": 0.7052,
"step": 3890
},
{
"epoch": 0.6471954862263525,
"grad_norm": 3.8678624629974365,
"learning_rate": 3.921340856289413e-05,
"loss": 0.7719,
"step": 3900
},
{
"epoch": 0.648854961832061,
"grad_norm": 2.6558640003204346,
"learning_rate": 3.918575063613232e-05,
"loss": 0.8168,
"step": 3910
},
{
"epoch": 0.6505144374377697,
"grad_norm": 2.678056478500366,
"learning_rate": 3.915809270937051e-05,
"loss": 0.7295,
"step": 3920
},
{
"epoch": 0.6521739130434783,
"grad_norm": 2.953716993331909,
"learning_rate": 3.91304347826087e-05,
"loss": 0.7825,
"step": 3930
},
{
"epoch": 0.6538333886491868,
"grad_norm": 3.628190755844116,
"learning_rate": 3.910277685584689e-05,
"loss": 0.7077,
"step": 3940
},
{
"epoch": 0.6554928642548955,
"grad_norm": 4.070046424865723,
"learning_rate": 3.9075118929085073e-05,
"loss": 0.7921,
"step": 3950
},
{
"epoch": 0.6571523398606041,
"grad_norm": 3.1164205074310303,
"learning_rate": 3.9047461002323264e-05,
"loss": 0.7921,
"step": 3960
},
{
"epoch": 0.6588118154663126,
"grad_norm": 4.493014812469482,
"learning_rate": 3.9019803075561455e-05,
"loss": 0.7053,
"step": 3970
},
{
"epoch": 0.6604712910720213,
"grad_norm": 3.5576398372650146,
"learning_rate": 3.8992145148799645e-05,
"loss": 0.7438,
"step": 3980
},
{
"epoch": 0.6621307666777299,
"grad_norm": 2.7487661838531494,
"learning_rate": 3.8964487222037836e-05,
"loss": 0.7851,
"step": 3990
},
{
"epoch": 0.6637902422834384,
"grad_norm": 3.2422597408294678,
"learning_rate": 3.8936829295276027e-05,
"loss": 0.6859,
"step": 4000
},
{
"epoch": 0.6637902422834384,
"eval_gen_len": 45.130421686746985,
"eval_loss": 0.6545064449310303,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1363.3071,
"eval_samples_per_second": 4.862,
"eval_steps_per_second": 0.304,
"step": 4000
},
{
"epoch": 0.665449717889147,
"grad_norm": 3.739856481552124,
"learning_rate": 3.890917136851422e-05,
"loss": 0.7867,
"step": 4010
},
{
"epoch": 0.6671091934948556,
"grad_norm": 2.2884652614593506,
"learning_rate": 3.888151344175241e-05,
"loss": 0.7778,
"step": 4020
},
{
"epoch": 0.6687686691005642,
"grad_norm": 2.648066759109497,
"learning_rate": 3.88538555149906e-05,
"loss": 0.7422,
"step": 4030
},
{
"epoch": 0.6704281447062728,
"grad_norm": 3.048558235168457,
"learning_rate": 3.882619758822879e-05,
"loss": 0.7105,
"step": 4040
},
{
"epoch": 0.6720876203119814,
"grad_norm": 2.565505027770996,
"learning_rate": 3.879853966146697e-05,
"loss": 0.7514,
"step": 4050
},
{
"epoch": 0.67374709591769,
"grad_norm": 2.495495557785034,
"learning_rate": 3.8770881734705164e-05,
"loss": 0.7649,
"step": 4060
},
{
"epoch": 0.6754065715233986,
"grad_norm": 2.873587131500244,
"learning_rate": 3.8743223807943354e-05,
"loss": 0.789,
"step": 4070
},
{
"epoch": 0.6770660471291072,
"grad_norm": 3.2935984134674072,
"learning_rate": 3.8715565881181545e-05,
"loss": 0.7066,
"step": 4080
},
{
"epoch": 0.6787255227348158,
"grad_norm": 2.862403392791748,
"learning_rate": 3.8687907954419735e-05,
"loss": 0.6715,
"step": 4090
},
{
"epoch": 0.6803849983405243,
"grad_norm": 2.6522021293640137,
"learning_rate": 3.8660250027657926e-05,
"loss": 0.705,
"step": 4100
},
{
"epoch": 0.682044473946233,
"grad_norm": 2.5446276664733887,
"learning_rate": 3.863259210089612e-05,
"loss": 0.8133,
"step": 4110
},
{
"epoch": 0.6837039495519416,
"grad_norm": 4.17519998550415,
"learning_rate": 3.8604934174134314e-05,
"loss": 0.6572,
"step": 4120
},
{
"epoch": 0.6853634251576501,
"grad_norm": 2.3463404178619385,
"learning_rate": 3.8577276247372505e-05,
"loss": 0.7737,
"step": 4130
},
{
"epoch": 0.6870229007633588,
"grad_norm": 2.4097015857696533,
"learning_rate": 3.854961832061069e-05,
"loss": 0.7391,
"step": 4140
},
{
"epoch": 0.6886823763690674,
"grad_norm": 3.627779483795166,
"learning_rate": 3.852196039384888e-05,
"loss": 0.7328,
"step": 4150
},
{
"epoch": 0.6903418519747759,
"grad_norm": 2.6210310459136963,
"learning_rate": 3.849430246708707e-05,
"loss": 0.7873,
"step": 4160
},
{
"epoch": 0.6920013275804846,
"grad_norm": 2.494967460632324,
"learning_rate": 3.846664454032526e-05,
"loss": 0.7616,
"step": 4170
},
{
"epoch": 0.6936608031861932,
"grad_norm": 3.345163106918335,
"learning_rate": 3.843898661356345e-05,
"loss": 0.7826,
"step": 4180
},
{
"epoch": 0.6953202787919017,
"grad_norm": 2.6801059246063232,
"learning_rate": 3.841132868680164e-05,
"loss": 0.7355,
"step": 4190
},
{
"epoch": 0.6969797543976104,
"grad_norm": 2.771024227142334,
"learning_rate": 3.838367076003983e-05,
"loss": 0.794,
"step": 4200
},
{
"epoch": 0.698639230003319,
"grad_norm": 3.4726293087005615,
"learning_rate": 3.835601283327802e-05,
"loss": 0.8092,
"step": 4210
},
{
"epoch": 0.7002987056090275,
"grad_norm": 3.0892086029052734,
"learning_rate": 3.8328354906516213e-05,
"loss": 0.8002,
"step": 4220
},
{
"epoch": 0.7019581812147362,
"grad_norm": 2.4271984100341797,
"learning_rate": 3.83006969797544e-05,
"loss": 0.7059,
"step": 4230
},
{
"epoch": 0.7036176568204447,
"grad_norm": 2.875136613845825,
"learning_rate": 3.827303905299259e-05,
"loss": 0.7054,
"step": 4240
},
{
"epoch": 0.7052771324261533,
"grad_norm": 2.9489753246307373,
"learning_rate": 3.824538112623078e-05,
"loss": 0.8017,
"step": 4250
},
{
"epoch": 0.706936608031862,
"grad_norm": 2.9833528995513916,
"learning_rate": 3.821772319946897e-05,
"loss": 0.8047,
"step": 4260
},
{
"epoch": 0.7085960836375705,
"grad_norm": 3.6213622093200684,
"learning_rate": 3.819006527270716e-05,
"loss": 0.6837,
"step": 4270
},
{
"epoch": 0.7102555592432791,
"grad_norm": 2.7520642280578613,
"learning_rate": 3.816240734594535e-05,
"loss": 0.7782,
"step": 4280
},
{
"epoch": 0.7119150348489878,
"grad_norm": 2.3629531860351562,
"learning_rate": 3.813474941918354e-05,
"loss": 0.7936,
"step": 4290
},
{
"epoch": 0.7135745104546963,
"grad_norm": 7.7020158767700195,
"learning_rate": 3.810709149242173e-05,
"loss": 0.8537,
"step": 4300
},
{
"epoch": 0.7152339860604049,
"grad_norm": 2.456869602203369,
"learning_rate": 3.807943356565992e-05,
"loss": 0.7971,
"step": 4310
},
{
"epoch": 0.7168934616661136,
"grad_norm": 3.720423936843872,
"learning_rate": 3.8051775638898106e-05,
"loss": 0.7088,
"step": 4320
},
{
"epoch": 0.7185529372718221,
"grad_norm": 2.5650722980499268,
"learning_rate": 3.80241177121363e-05,
"loss": 0.7349,
"step": 4330
},
{
"epoch": 0.7202124128775307,
"grad_norm": 3.272597551345825,
"learning_rate": 3.799645978537449e-05,
"loss": 0.7849,
"step": 4340
},
{
"epoch": 0.7218718884832392,
"grad_norm": 2.7749786376953125,
"learning_rate": 3.796880185861268e-05,
"loss": 0.7628,
"step": 4350
},
{
"epoch": 0.7235313640889479,
"grad_norm": 3.038445472717285,
"learning_rate": 3.794114393185087e-05,
"loss": 0.8163,
"step": 4360
},
{
"epoch": 0.7251908396946565,
"grad_norm": 2.468409538269043,
"learning_rate": 3.791348600508906e-05,
"loss": 0.8063,
"step": 4370
},
{
"epoch": 0.726850315300365,
"grad_norm": 2.681001901626587,
"learning_rate": 3.788582807832725e-05,
"loss": 0.7814,
"step": 4380
},
{
"epoch": 0.7285097909060737,
"grad_norm": 2.493736743927002,
"learning_rate": 3.785817015156544e-05,
"loss": 0.7732,
"step": 4390
},
{
"epoch": 0.7301692665117823,
"grad_norm": 3.0504839420318604,
"learning_rate": 3.783051222480363e-05,
"loss": 0.7583,
"step": 4400
},
{
"epoch": 0.7318287421174908,
"grad_norm": 3.1474430561065674,
"learning_rate": 3.780285429804182e-05,
"loss": 0.7625,
"step": 4410
},
{
"epoch": 0.7334882177231995,
"grad_norm": 4.236194610595703,
"learning_rate": 3.7775196371280005e-05,
"loss": 0.7585,
"step": 4420
},
{
"epoch": 0.7351476933289081,
"grad_norm": 3.4272358417510986,
"learning_rate": 3.7747538444518196e-05,
"loss": 0.7197,
"step": 4430
},
{
"epoch": 0.7368071689346166,
"grad_norm": 6.535729885101318,
"learning_rate": 3.771988051775639e-05,
"loss": 0.7672,
"step": 4440
},
{
"epoch": 0.7384666445403253,
"grad_norm": 3.000758171081543,
"learning_rate": 3.769222259099458e-05,
"loss": 0.7655,
"step": 4450
},
{
"epoch": 0.7401261201460339,
"grad_norm": 3.097958564758301,
"learning_rate": 3.766456466423277e-05,
"loss": 0.7275,
"step": 4460
},
{
"epoch": 0.7417855957517424,
"grad_norm": 4.476720333099365,
"learning_rate": 3.763690673747096e-05,
"loss": 0.7846,
"step": 4470
},
{
"epoch": 0.7434450713574511,
"grad_norm": 2.7858529090881348,
"learning_rate": 3.760924881070915e-05,
"loss": 0.7956,
"step": 4480
},
{
"epoch": 0.7451045469631596,
"grad_norm": 3.158928155899048,
"learning_rate": 3.758159088394734e-05,
"loss": 0.7277,
"step": 4490
},
{
"epoch": 0.7467640225688682,
"grad_norm": 3.0849194526672363,
"learning_rate": 3.755393295718553e-05,
"loss": 0.7204,
"step": 4500
},
{
"epoch": 0.7484234981745769,
"grad_norm": 2.7116057872772217,
"learning_rate": 3.752627503042372e-05,
"loss": 0.7761,
"step": 4510
},
{
"epoch": 0.7500829737802854,
"grad_norm": 2.537970542907715,
"learning_rate": 3.749861710366191e-05,
"loss": 0.7093,
"step": 4520
},
{
"epoch": 0.751742449385994,
"grad_norm": 2.507575035095215,
"learning_rate": 3.74709591769001e-05,
"loss": 0.7035,
"step": 4530
},
{
"epoch": 0.7534019249917027,
"grad_norm": 2.7225263118743896,
"learning_rate": 3.744330125013829e-05,
"loss": 0.7011,
"step": 4540
},
{
"epoch": 0.7550614005974112,
"grad_norm": 2.6047916412353516,
"learning_rate": 3.7415643323376484e-05,
"loss": 0.7202,
"step": 4550
},
{
"epoch": 0.7567208762031198,
"grad_norm": 3.1778979301452637,
"learning_rate": 3.7387985396614674e-05,
"loss": 0.7007,
"step": 4560
},
{
"epoch": 0.7583803518088285,
"grad_norm": 3.0564684867858887,
"learning_rate": 3.7360327469852865e-05,
"loss": 0.7175,
"step": 4570
},
{
"epoch": 0.760039827414537,
"grad_norm": 2.680342674255371,
"learning_rate": 3.7332669543091055e-05,
"loss": 0.7314,
"step": 4580
},
{
"epoch": 0.7616993030202456,
"grad_norm": 4.322812080383301,
"learning_rate": 3.7305011616329246e-05,
"loss": 0.7302,
"step": 4590
},
{
"epoch": 0.7633587786259542,
"grad_norm": 2.9004077911376953,
"learning_rate": 3.727735368956743e-05,
"loss": 0.7247,
"step": 4600
},
{
"epoch": 0.7650182542316628,
"grad_norm": 2.5869526863098145,
"learning_rate": 3.724969576280562e-05,
"loss": 0.7489,
"step": 4610
},
{
"epoch": 0.7666777298373714,
"grad_norm": 3.1568167209625244,
"learning_rate": 3.722203783604381e-05,
"loss": 0.7502,
"step": 4620
},
{
"epoch": 0.76833720544308,
"grad_norm": 2.8454699516296387,
"learning_rate": 3.7194379909282e-05,
"loss": 0.7598,
"step": 4630
},
{
"epoch": 0.7699966810487886,
"grad_norm": 2.3352134227752686,
"learning_rate": 3.716672198252019e-05,
"loss": 0.7036,
"step": 4640
},
{
"epoch": 0.7716561566544972,
"grad_norm": 2.986163377761841,
"learning_rate": 3.713906405575838e-05,
"loss": 0.679,
"step": 4650
},
{
"epoch": 0.7733156322602057,
"grad_norm": 3.093139886856079,
"learning_rate": 3.7111406128996574e-05,
"loss": 0.7585,
"step": 4660
},
{
"epoch": 0.7749751078659144,
"grad_norm": 2.8010923862457275,
"learning_rate": 3.7083748202234764e-05,
"loss": 0.7965,
"step": 4670
},
{
"epoch": 0.776634583471623,
"grad_norm": 3.4075088500976562,
"learning_rate": 3.7056090275472955e-05,
"loss": 0.6914,
"step": 4680
},
{
"epoch": 0.7782940590773315,
"grad_norm": 3.064790725708008,
"learning_rate": 3.7028432348711145e-05,
"loss": 0.7382,
"step": 4690
},
{
"epoch": 0.7799535346830402,
"grad_norm": 2.642049789428711,
"learning_rate": 3.700077442194933e-05,
"loss": 0.7604,
"step": 4700
},
{
"epoch": 0.7816130102887487,
"grad_norm": 2.7482261657714844,
"learning_rate": 3.697311649518752e-05,
"loss": 0.7906,
"step": 4710
},
{
"epoch": 0.7832724858944573,
"grad_norm": 3.271319627761841,
"learning_rate": 3.694545856842571e-05,
"loss": 0.7139,
"step": 4720
},
{
"epoch": 0.784931961500166,
"grad_norm": 3.490607500076294,
"learning_rate": 3.69178006416639e-05,
"loss": 0.6696,
"step": 4730
},
{
"epoch": 0.7865914371058745,
"grad_norm": 3.562227487564087,
"learning_rate": 3.689014271490209e-05,
"loss": 0.7333,
"step": 4740
},
{
"epoch": 0.7882509127115831,
"grad_norm": 2.2841796875,
"learning_rate": 3.686248478814028e-05,
"loss": 0.7322,
"step": 4750
},
{
"epoch": 0.7899103883172918,
"grad_norm": 2.444272041320801,
"learning_rate": 3.683482686137847e-05,
"loss": 0.7256,
"step": 4760
},
{
"epoch": 0.7915698639230003,
"grad_norm": 2.394650936126709,
"learning_rate": 3.6807168934616664e-05,
"loss": 0.7735,
"step": 4770
},
{
"epoch": 0.7932293395287089,
"grad_norm": 3.035123348236084,
"learning_rate": 3.6779511007854854e-05,
"loss": 0.7702,
"step": 4780
},
{
"epoch": 0.7948888151344176,
"grad_norm": 2.773576259613037,
"learning_rate": 3.675185308109304e-05,
"loss": 0.7024,
"step": 4790
},
{
"epoch": 0.7965482907401261,
"grad_norm": 3.0329270362854004,
"learning_rate": 3.672419515433123e-05,
"loss": 0.7271,
"step": 4800
},
{
"epoch": 0.7982077663458347,
"grad_norm": 3.254540205001831,
"learning_rate": 3.669653722756942e-05,
"loss": 0.7227,
"step": 4810
},
{
"epoch": 0.7998672419515434,
"grad_norm": 3.238571882247925,
"learning_rate": 3.666887930080761e-05,
"loss": 0.8293,
"step": 4820
},
{
"epoch": 0.8015267175572519,
"grad_norm": 5.008415222167969,
"learning_rate": 3.66412213740458e-05,
"loss": 0.7218,
"step": 4830
},
{
"epoch": 0.8031861931629605,
"grad_norm": 2.9953482151031494,
"learning_rate": 3.661356344728399e-05,
"loss": 0.7505,
"step": 4840
},
{
"epoch": 0.8048456687686691,
"grad_norm": 2.52767014503479,
"learning_rate": 3.658590552052218e-05,
"loss": 0.7171,
"step": 4850
},
{
"epoch": 0.8065051443743777,
"grad_norm": 2.4959516525268555,
"learning_rate": 3.655824759376037e-05,
"loss": 0.759,
"step": 4860
},
{
"epoch": 0.8081646199800863,
"grad_norm": 3.0055716037750244,
"learning_rate": 3.653058966699856e-05,
"loss": 0.7182,
"step": 4870
},
{
"epoch": 0.8098240955857949,
"grad_norm": 3.2920753955841064,
"learning_rate": 3.6502931740236754e-05,
"loss": 0.8102,
"step": 4880
},
{
"epoch": 0.8114835711915035,
"grad_norm": 2.4466500282287598,
"learning_rate": 3.6475273813474944e-05,
"loss": 0.7053,
"step": 4890
},
{
"epoch": 0.813143046797212,
"grad_norm": 3.1616554260253906,
"learning_rate": 3.6447615886713135e-05,
"loss": 0.7155,
"step": 4900
},
{
"epoch": 0.8148025224029207,
"grad_norm": 2.9846699237823486,
"learning_rate": 3.6419957959951325e-05,
"loss": 0.7488,
"step": 4910
},
{
"epoch": 0.8164619980086293,
"grad_norm": 3.1745669841766357,
"learning_rate": 3.6392300033189516e-05,
"loss": 0.7309,
"step": 4920
},
{
"epoch": 0.8181214736143378,
"grad_norm": 3.3618462085723877,
"learning_rate": 3.636464210642771e-05,
"loss": 0.7755,
"step": 4930
},
{
"epoch": 0.8197809492200465,
"grad_norm": 2.7471730709075928,
"learning_rate": 3.63369841796659e-05,
"loss": 0.8052,
"step": 4940
},
{
"epoch": 0.8214404248257551,
"grad_norm": 2.917133331298828,
"learning_rate": 3.630932625290409e-05,
"loss": 0.7502,
"step": 4950
},
{
"epoch": 0.8230999004314636,
"grad_norm": 3.5757155418395996,
"learning_rate": 3.628166832614228e-05,
"loss": 0.7106,
"step": 4960
},
{
"epoch": 0.8247593760371722,
"grad_norm": 3.1652987003326416,
"learning_rate": 3.625401039938047e-05,
"loss": 0.7839,
"step": 4970
},
{
"epoch": 0.8264188516428809,
"grad_norm": 3.2558505535125732,
"learning_rate": 3.622635247261865e-05,
"loss": 0.7705,
"step": 4980
},
{
"epoch": 0.8280783272485894,
"grad_norm": 2.7738609313964844,
"learning_rate": 3.6198694545856844e-05,
"loss": 0.7781,
"step": 4990
},
{
"epoch": 0.829737802854298,
"grad_norm": 3.5117483139038086,
"learning_rate": 3.6171036619095034e-05,
"loss": 0.7234,
"step": 5000
},
{
"epoch": 0.829737802854298,
"eval_gen_len": 45.7875,
"eval_loss": 0.6476278901100159,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1347.2209,
"eval_samples_per_second": 4.92,
"eval_steps_per_second": 0.308,
"step": 5000
},
{
"epoch": 0.8313972784600067,
"grad_norm": 2.7426326274871826,
"learning_rate": 3.6143378692333225e-05,
"loss": 0.844,
"step": 5010
},
{
"epoch": 0.8330567540657152,
"grad_norm": 3.0585334300994873,
"learning_rate": 3.6115720765571416e-05,
"loss": 0.7621,
"step": 5020
},
{
"epoch": 0.8347162296714238,
"grad_norm": 2.7952592372894287,
"learning_rate": 3.6088062838809606e-05,
"loss": 0.7114,
"step": 5030
},
{
"epoch": 0.8363757052771325,
"grad_norm": 2.2673919200897217,
"learning_rate": 3.60604049120478e-05,
"loss": 0.7163,
"step": 5040
},
{
"epoch": 0.838035180882841,
"grad_norm": 2.556400775909424,
"learning_rate": 3.603274698528599e-05,
"loss": 0.7641,
"step": 5050
},
{
"epoch": 0.8396946564885496,
"grad_norm": 3.465658187866211,
"learning_rate": 3.600508905852418e-05,
"loss": 0.777,
"step": 5060
},
{
"epoch": 0.8413541320942582,
"grad_norm": 3.9427356719970703,
"learning_rate": 3.597743113176236e-05,
"loss": 0.7304,
"step": 5070
},
{
"epoch": 0.8430136076999668,
"grad_norm": 2.4685842990875244,
"learning_rate": 3.594977320500055e-05,
"loss": 0.7541,
"step": 5080
},
{
"epoch": 0.8446730833056754,
"grad_norm": 2.746155023574829,
"learning_rate": 3.592211527823874e-05,
"loss": 0.7408,
"step": 5090
},
{
"epoch": 0.846332558911384,
"grad_norm": 3.3263399600982666,
"learning_rate": 3.5894457351476934e-05,
"loss": 0.755,
"step": 5100
},
{
"epoch": 0.8479920345170926,
"grad_norm": 2.7520947456359863,
"learning_rate": 3.5866799424715124e-05,
"loss": 0.7239,
"step": 5110
},
{
"epoch": 0.8496515101228012,
"grad_norm": 3.0293684005737305,
"learning_rate": 3.5839141497953315e-05,
"loss": 0.8202,
"step": 5120
},
{
"epoch": 0.8513109857285098,
"grad_norm": 3.1652681827545166,
"learning_rate": 3.5811483571191506e-05,
"loss": 0.7466,
"step": 5130
},
{
"epoch": 0.8529704613342184,
"grad_norm": 3.104422092437744,
"learning_rate": 3.5783825644429696e-05,
"loss": 0.6651,
"step": 5140
},
{
"epoch": 0.854629936939927,
"grad_norm": 2.858640193939209,
"learning_rate": 3.575616771766789e-05,
"loss": 0.7021,
"step": 5150
},
{
"epoch": 0.8562894125456356,
"grad_norm": 2.765066385269165,
"learning_rate": 3.572850979090607e-05,
"loss": 0.768,
"step": 5160
},
{
"epoch": 0.8579488881513442,
"grad_norm": 3.4913389682769775,
"learning_rate": 3.570085186414426e-05,
"loss": 0.6938,
"step": 5170
},
{
"epoch": 0.8596083637570527,
"grad_norm": 2.3819427490234375,
"learning_rate": 3.567319393738245e-05,
"loss": 0.6678,
"step": 5180
},
{
"epoch": 0.8612678393627614,
"grad_norm": 5.098767280578613,
"learning_rate": 3.564553601062064e-05,
"loss": 0.7674,
"step": 5190
},
{
"epoch": 0.86292731496847,
"grad_norm": 2.6126420497894287,
"learning_rate": 3.561787808385883e-05,
"loss": 0.6877,
"step": 5200
},
{
"epoch": 0.8645867905741785,
"grad_norm": 2.502443313598633,
"learning_rate": 3.5590220157097024e-05,
"loss": 0.7378,
"step": 5210
},
{
"epoch": 0.8662462661798872,
"grad_norm": 3.2261953353881836,
"learning_rate": 3.5562562230335214e-05,
"loss": 0.7222,
"step": 5220
},
{
"epoch": 0.8679057417855958,
"grad_norm": 3.1731908321380615,
"learning_rate": 3.5534904303573405e-05,
"loss": 0.8105,
"step": 5230
},
{
"epoch": 0.8695652173913043,
"grad_norm": 3.071484327316284,
"learning_rate": 3.5507246376811596e-05,
"loss": 0.7798,
"step": 5240
},
{
"epoch": 0.871224692997013,
"grad_norm": 3.302419662475586,
"learning_rate": 3.5479588450049786e-05,
"loss": 0.7042,
"step": 5250
},
{
"epoch": 0.8728841686027216,
"grad_norm": 2.4384875297546387,
"learning_rate": 3.545193052328798e-05,
"loss": 0.7471,
"step": 5260
},
{
"epoch": 0.8745436442084301,
"grad_norm": 3.48577880859375,
"learning_rate": 3.542427259652617e-05,
"loss": 0.7472,
"step": 5270
},
{
"epoch": 0.8762031198141388,
"grad_norm": 3.4272027015686035,
"learning_rate": 3.539661466976436e-05,
"loss": 0.7818,
"step": 5280
},
{
"epoch": 0.8778625954198473,
"grad_norm": 2.677224636077881,
"learning_rate": 3.536895674300255e-05,
"loss": 0.7573,
"step": 5290
},
{
"epoch": 0.8795220710255559,
"grad_norm": 4.305156230926514,
"learning_rate": 3.534129881624074e-05,
"loss": 0.8223,
"step": 5300
},
{
"epoch": 0.8811815466312645,
"grad_norm": 3.6015806198120117,
"learning_rate": 3.531364088947893e-05,
"loss": 0.6546,
"step": 5310
},
{
"epoch": 0.8828410222369731,
"grad_norm": 3.4311509132385254,
"learning_rate": 3.528598296271712e-05,
"loss": 0.7541,
"step": 5320
},
{
"epoch": 0.8845004978426817,
"grad_norm": 2.226832866668701,
"learning_rate": 3.525832503595531e-05,
"loss": 0.6735,
"step": 5330
},
{
"epoch": 0.8861599734483903,
"grad_norm": 2.7130308151245117,
"learning_rate": 3.52306671091935e-05,
"loss": 0.738,
"step": 5340
},
{
"epoch": 0.8878194490540989,
"grad_norm": 2.3154845237731934,
"learning_rate": 3.5203009182431686e-05,
"loss": 0.7721,
"step": 5350
},
{
"epoch": 0.8894789246598075,
"grad_norm": 2.7403759956359863,
"learning_rate": 3.5175351255669876e-05,
"loss": 0.7622,
"step": 5360
},
{
"epoch": 0.891138400265516,
"grad_norm": 2.614175796508789,
"learning_rate": 3.514769332890807e-05,
"loss": 0.7583,
"step": 5370
},
{
"epoch": 0.8927978758712247,
"grad_norm": 2.589661121368408,
"learning_rate": 3.512003540214626e-05,
"loss": 0.7198,
"step": 5380
},
{
"epoch": 0.8944573514769333,
"grad_norm": 2.6613898277282715,
"learning_rate": 3.509237747538445e-05,
"loss": 0.7294,
"step": 5390
},
{
"epoch": 0.8961168270826418,
"grad_norm": 2.723780632019043,
"learning_rate": 3.506471954862264e-05,
"loss": 0.7546,
"step": 5400
},
{
"epoch": 0.8977763026883505,
"grad_norm": 2.591231107711792,
"learning_rate": 3.503706162186083e-05,
"loss": 0.7599,
"step": 5410
},
{
"epoch": 0.8994357782940591,
"grad_norm": 3.1421103477478027,
"learning_rate": 3.500940369509902e-05,
"loss": 0.7105,
"step": 5420
},
{
"epoch": 0.9010952538997676,
"grad_norm": 3.057150363922119,
"learning_rate": 3.498174576833721e-05,
"loss": 0.7236,
"step": 5430
},
{
"epoch": 0.9027547295054763,
"grad_norm": 2.2406747341156006,
"learning_rate": 3.4954087841575394e-05,
"loss": 0.8167,
"step": 5440
},
{
"epoch": 0.9044142051111849,
"grad_norm": 2.847642421722412,
"learning_rate": 3.4926429914813585e-05,
"loss": 0.736,
"step": 5450
},
{
"epoch": 0.9060736807168934,
"grad_norm": 2.9997546672821045,
"learning_rate": 3.4898771988051776e-05,
"loss": 0.7859,
"step": 5460
},
{
"epoch": 0.9077331563226021,
"grad_norm": 3.000199556350708,
"learning_rate": 3.4871114061289966e-05,
"loss": 0.8063,
"step": 5470
},
{
"epoch": 0.9093926319283107,
"grad_norm": 3.7363743782043457,
"learning_rate": 3.484345613452816e-05,
"loss": 0.7054,
"step": 5480
},
{
"epoch": 0.9110521075340192,
"grad_norm": 3.2134993076324463,
"learning_rate": 3.481579820776635e-05,
"loss": 0.7257,
"step": 5490
},
{
"epoch": 0.9127115831397279,
"grad_norm": 9.707784652709961,
"learning_rate": 3.478814028100454e-05,
"loss": 0.7097,
"step": 5500
},
{
"epoch": 0.9143710587454364,
"grad_norm": 3.060612201690674,
"learning_rate": 3.476048235424273e-05,
"loss": 0.7491,
"step": 5510
},
{
"epoch": 0.916030534351145,
"grad_norm": 2.9372243881225586,
"learning_rate": 3.473282442748092e-05,
"loss": 0.785,
"step": 5520
},
{
"epoch": 0.9176900099568537,
"grad_norm": 2.6264986991882324,
"learning_rate": 3.47051665007191e-05,
"loss": 0.7403,
"step": 5530
},
{
"epoch": 0.9193494855625622,
"grad_norm": 3.810741901397705,
"learning_rate": 3.4677508573957294e-05,
"loss": 0.782,
"step": 5540
},
{
"epoch": 0.9210089611682708,
"grad_norm": 2.4057788848876953,
"learning_rate": 3.4649850647195484e-05,
"loss": 0.7332,
"step": 5550
},
{
"epoch": 0.9226684367739795,
"grad_norm": 2.5776431560516357,
"learning_rate": 3.4622192720433675e-05,
"loss": 0.7777,
"step": 5560
},
{
"epoch": 0.924327912379688,
"grad_norm": 2.6932153701782227,
"learning_rate": 3.4594534793671866e-05,
"loss": 0.7943,
"step": 5570
},
{
"epoch": 0.9259873879853966,
"grad_norm": 2.4345345497131348,
"learning_rate": 3.4566876866910056e-05,
"loss": 0.7558,
"step": 5580
},
{
"epoch": 0.9276468635911053,
"grad_norm": 4.420886039733887,
"learning_rate": 3.453921894014825e-05,
"loss": 0.7235,
"step": 5590
},
{
"epoch": 0.9293063391968138,
"grad_norm": 3.0717618465423584,
"learning_rate": 3.451156101338644e-05,
"loss": 0.7157,
"step": 5600
},
{
"epoch": 0.9309658148025224,
"grad_norm": 2.4393157958984375,
"learning_rate": 3.448390308662463e-05,
"loss": 0.7524,
"step": 5610
},
{
"epoch": 0.932625290408231,
"grad_norm": 2.8943607807159424,
"learning_rate": 3.445624515986282e-05,
"loss": 0.703,
"step": 5620
},
{
"epoch": 0.9342847660139396,
"grad_norm": 2.4995365142822266,
"learning_rate": 3.442858723310101e-05,
"loss": 0.7619,
"step": 5630
},
{
"epoch": 0.9359442416196482,
"grad_norm": 2.7852730751037598,
"learning_rate": 3.44009293063392e-05,
"loss": 0.7796,
"step": 5640
},
{
"epoch": 0.9376037172253567,
"grad_norm": 2.8426830768585205,
"learning_rate": 3.437327137957739e-05,
"loss": 0.7605,
"step": 5650
},
{
"epoch": 0.9392631928310654,
"grad_norm": 2.4552955627441406,
"learning_rate": 3.434561345281558e-05,
"loss": 0.6601,
"step": 5660
},
{
"epoch": 0.940922668436774,
"grad_norm": 3.545321464538574,
"learning_rate": 3.431795552605377e-05,
"loss": 0.7432,
"step": 5670
},
{
"epoch": 0.9425821440424825,
"grad_norm": 3.482745885848999,
"learning_rate": 3.429029759929196e-05,
"loss": 0.6854,
"step": 5680
},
{
"epoch": 0.9442416196481912,
"grad_norm": 2.753021717071533,
"learning_rate": 3.426263967253015e-05,
"loss": 0.726,
"step": 5690
},
{
"epoch": 0.9459010952538998,
"grad_norm": 2.742332696914673,
"learning_rate": 3.4234981745768344e-05,
"loss": 0.727,
"step": 5700
},
{
"epoch": 0.9475605708596083,
"grad_norm": 3.272718667984009,
"learning_rate": 3.4207323819006534e-05,
"loss": 0.7593,
"step": 5710
},
{
"epoch": 0.949220046465317,
"grad_norm": 2.3225224018096924,
"learning_rate": 3.417966589224472e-05,
"loss": 0.6858,
"step": 5720
},
{
"epoch": 0.9508795220710256,
"grad_norm": 3.2801098823547363,
"learning_rate": 3.415200796548291e-05,
"loss": 0.7554,
"step": 5730
},
{
"epoch": 0.9525389976767341,
"grad_norm": 2.5931944847106934,
"learning_rate": 3.41243500387211e-05,
"loss": 0.7443,
"step": 5740
},
{
"epoch": 0.9541984732824428,
"grad_norm": 2.872978687286377,
"learning_rate": 3.409669211195929e-05,
"loss": 0.673,
"step": 5750
},
{
"epoch": 0.9558579488881513,
"grad_norm": 2.8649654388427734,
"learning_rate": 3.406903418519748e-05,
"loss": 0.7302,
"step": 5760
},
{
"epoch": 0.9575174244938599,
"grad_norm": 3.1845171451568604,
"learning_rate": 3.404137625843567e-05,
"loss": 0.8126,
"step": 5770
},
{
"epoch": 0.9591769000995686,
"grad_norm": 2.9678781032562256,
"learning_rate": 3.401371833167386e-05,
"loss": 0.7463,
"step": 5780
},
{
"epoch": 0.9608363757052771,
"grad_norm": 3.0109095573425293,
"learning_rate": 3.398606040491205e-05,
"loss": 0.7716,
"step": 5790
},
{
"epoch": 0.9624958513109857,
"grad_norm": 3.596421241760254,
"learning_rate": 3.395840247815024e-05,
"loss": 0.7262,
"step": 5800
},
{
"epoch": 0.9641553269166944,
"grad_norm": 2.6864354610443115,
"learning_rate": 3.393074455138843e-05,
"loss": 0.7623,
"step": 5810
},
{
"epoch": 0.9658148025224029,
"grad_norm": 3.0654313564300537,
"learning_rate": 3.390308662462662e-05,
"loss": 0.7788,
"step": 5820
},
{
"epoch": 0.9674742781281115,
"grad_norm": 3.0649118423461914,
"learning_rate": 3.387542869786481e-05,
"loss": 0.7007,
"step": 5830
},
{
"epoch": 0.9691337537338202,
"grad_norm": 2.9800171852111816,
"learning_rate": 3.3847770771103e-05,
"loss": 0.7909,
"step": 5840
},
{
"epoch": 0.9707932293395287,
"grad_norm": 4.103562355041504,
"learning_rate": 3.382011284434119e-05,
"loss": 0.7356,
"step": 5850
},
{
"epoch": 0.9724527049452373,
"grad_norm": 3.1845431327819824,
"learning_rate": 3.379245491757938e-05,
"loss": 0.7547,
"step": 5860
},
{
"epoch": 0.974112180550946,
"grad_norm": 3.618088960647583,
"learning_rate": 3.376479699081757e-05,
"loss": 0.7636,
"step": 5870
},
{
"epoch": 0.9757716561566545,
"grad_norm": 2.9461872577667236,
"learning_rate": 3.373713906405576e-05,
"loss": 0.7425,
"step": 5880
},
{
"epoch": 0.9774311317623631,
"grad_norm": 2.7135982513427734,
"learning_rate": 3.370948113729395e-05,
"loss": 0.7326,
"step": 5890
},
{
"epoch": 0.9790906073680717,
"grad_norm": 2.974194049835205,
"learning_rate": 3.3681823210532136e-05,
"loss": 0.7332,
"step": 5900
},
{
"epoch": 0.9807500829737803,
"grad_norm": 3.9138126373291016,
"learning_rate": 3.3654165283770326e-05,
"loss": 0.6801,
"step": 5910
},
{
"epoch": 0.9824095585794889,
"grad_norm": 3.1881282329559326,
"learning_rate": 3.362650735700852e-05,
"loss": 0.6514,
"step": 5920
},
{
"epoch": 0.9840690341851974,
"grad_norm": 3.0503334999084473,
"learning_rate": 3.359884943024671e-05,
"loss": 0.7232,
"step": 5930
},
{
"epoch": 0.9857285097909061,
"grad_norm": 3.4355130195617676,
"learning_rate": 3.35711915034849e-05,
"loss": 0.7704,
"step": 5940
},
{
"epoch": 0.9873879853966147,
"grad_norm": 2.670332193374634,
"learning_rate": 3.354353357672309e-05,
"loss": 0.7534,
"step": 5950
},
{
"epoch": 0.9890474610023232,
"grad_norm": 3.4164748191833496,
"learning_rate": 3.351587564996128e-05,
"loss": 0.6759,
"step": 5960
},
{
"epoch": 0.9907069366080319,
"grad_norm": 2.7511496543884277,
"learning_rate": 3.348821772319947e-05,
"loss": 0.7524,
"step": 5970
},
{
"epoch": 0.9923664122137404,
"grad_norm": 3.1281206607818604,
"learning_rate": 3.346055979643766e-05,
"loss": 0.7486,
"step": 5980
},
{
"epoch": 0.994025887819449,
"grad_norm": 2.146121025085449,
"learning_rate": 3.343290186967585e-05,
"loss": 0.7709,
"step": 5990
},
{
"epoch": 0.9956853634251577,
"grad_norm": 2.9810049533843994,
"learning_rate": 3.3405243942914035e-05,
"loss": 0.7439,
"step": 6000
},
{
"epoch": 0.9956853634251577,
"eval_gen_len": 40.46566265060241,
"eval_loss": 0.6396129131317139,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1287.1539,
"eval_samples_per_second": 5.149,
"eval_steps_per_second": 0.322,
"step": 6000
},
{
"epoch": 0.9973448390308662,
"grad_norm": 3.1074583530426025,
"learning_rate": 3.3377586016152226e-05,
"loss": 0.7614,
"step": 6010
},
{
"epoch": 0.9990043146365748,
"grad_norm": 2.733332395553589,
"learning_rate": 3.3349928089390416e-05,
"loss": 0.7873,
"step": 6020
},
{
"epoch": 1.0006637902422835,
"grad_norm": 2.6200480461120605,
"learning_rate": 3.332227016262861e-05,
"loss": 0.715,
"step": 6030
},
{
"epoch": 1.0023232658479921,
"grad_norm": 2.853236436843872,
"learning_rate": 3.32946122358668e-05,
"loss": 0.6356,
"step": 6040
},
{
"epoch": 1.0039827414537006,
"grad_norm": 2.4870617389678955,
"learning_rate": 3.3266954309104995e-05,
"loss": 0.735,
"step": 6050
},
{
"epoch": 1.0056422170594093,
"grad_norm": 2.46635365486145,
"learning_rate": 3.3239296382343186e-05,
"loss": 0.6706,
"step": 6060
},
{
"epoch": 1.007301692665118,
"grad_norm": 2.235780954360962,
"learning_rate": 3.3211638455581376e-05,
"loss": 0.6001,
"step": 6070
},
{
"epoch": 1.0089611682708264,
"grad_norm": 2.660212516784668,
"learning_rate": 3.318398052881957e-05,
"loss": 0.6637,
"step": 6080
},
{
"epoch": 1.010620643876535,
"grad_norm": 3.813750743865967,
"learning_rate": 3.315632260205775e-05,
"loss": 0.6834,
"step": 6090
},
{
"epoch": 1.0122801194822435,
"grad_norm": 2.3156824111938477,
"learning_rate": 3.312866467529594e-05,
"loss": 0.619,
"step": 6100
},
{
"epoch": 1.0139395950879522,
"grad_norm": 2.3183748722076416,
"learning_rate": 3.310100674853413e-05,
"loss": 0.6093,
"step": 6110
},
{
"epoch": 1.0155990706936608,
"grad_norm": 3.1354057788848877,
"learning_rate": 3.307334882177232e-05,
"loss": 0.6389,
"step": 6120
},
{
"epoch": 1.0172585462993693,
"grad_norm": 2.536813259124756,
"learning_rate": 3.304569089501051e-05,
"loss": 0.6711,
"step": 6130
},
{
"epoch": 1.018918021905078,
"grad_norm": 2.364082098007202,
"learning_rate": 3.3018032968248704e-05,
"loss": 0.6867,
"step": 6140
},
{
"epoch": 1.0205774975107866,
"grad_norm": 2.6374351978302,
"learning_rate": 3.2990375041486895e-05,
"loss": 0.69,
"step": 6150
},
{
"epoch": 1.022236973116495,
"grad_norm": 3.3656816482543945,
"learning_rate": 3.2962717114725085e-05,
"loss": 0.6982,
"step": 6160
},
{
"epoch": 1.0238964487222038,
"grad_norm": 2.7787158489227295,
"learning_rate": 3.2935059187963276e-05,
"loss": 0.6822,
"step": 6170
},
{
"epoch": 1.0255559243279124,
"grad_norm": 3.168287754058838,
"learning_rate": 3.290740126120146e-05,
"loss": 0.6435,
"step": 6180
},
{
"epoch": 1.0272153999336209,
"grad_norm": 2.751758575439453,
"learning_rate": 3.287974333443965e-05,
"loss": 0.6937,
"step": 6190
},
{
"epoch": 1.0288748755393295,
"grad_norm": 2.5584921836853027,
"learning_rate": 3.285208540767784e-05,
"loss": 0.6076,
"step": 6200
},
{
"epoch": 1.0305343511450382,
"grad_norm": 3.0572123527526855,
"learning_rate": 3.282442748091603e-05,
"loss": 0.648,
"step": 6210
},
{
"epoch": 1.0321938267507467,
"grad_norm": 2.5673274993896484,
"learning_rate": 3.279676955415422e-05,
"loss": 0.5858,
"step": 6220
},
{
"epoch": 1.0338533023564553,
"grad_norm": 2.4913575649261475,
"learning_rate": 3.276911162739241e-05,
"loss": 0.6632,
"step": 6230
},
{
"epoch": 1.035512777962164,
"grad_norm": 2.7290186882019043,
"learning_rate": 3.27414537006306e-05,
"loss": 0.6078,
"step": 6240
},
{
"epoch": 1.0371722535678725,
"grad_norm": 3.055506706237793,
"learning_rate": 3.2713795773868794e-05,
"loss": 0.67,
"step": 6250
},
{
"epoch": 1.0388317291735811,
"grad_norm": 2.851560354232788,
"learning_rate": 3.2686137847106985e-05,
"loss": 0.6333,
"step": 6260
},
{
"epoch": 1.0404912047792898,
"grad_norm": 2.7579662799835205,
"learning_rate": 3.2658479920345175e-05,
"loss": 0.6085,
"step": 6270
},
{
"epoch": 1.0421506803849983,
"grad_norm": 3.7273287773132324,
"learning_rate": 3.263082199358336e-05,
"loss": 0.656,
"step": 6280
},
{
"epoch": 1.043810155990707,
"grad_norm": 2.7276690006256104,
"learning_rate": 3.260316406682155e-05,
"loss": 0.6387,
"step": 6290
},
{
"epoch": 1.0454696315964156,
"grad_norm": 3.238990068435669,
"learning_rate": 3.257550614005974e-05,
"loss": 0.6504,
"step": 6300
},
{
"epoch": 1.047129107202124,
"grad_norm": 3.2276484966278076,
"learning_rate": 3.254784821329793e-05,
"loss": 0.6543,
"step": 6310
},
{
"epoch": 1.0487885828078327,
"grad_norm": 3.0428245067596436,
"learning_rate": 3.252019028653612e-05,
"loss": 0.7151,
"step": 6320
},
{
"epoch": 1.0504480584135414,
"grad_norm": 2.6299469470977783,
"learning_rate": 3.249253235977431e-05,
"loss": 0.6054,
"step": 6330
},
{
"epoch": 1.0521075340192498,
"grad_norm": 2.7120039463043213,
"learning_rate": 3.24648744330125e-05,
"loss": 0.6339,
"step": 6340
},
{
"epoch": 1.0537670096249585,
"grad_norm": 2.739844560623169,
"learning_rate": 3.243721650625069e-05,
"loss": 0.6676,
"step": 6350
},
{
"epoch": 1.0554264852306672,
"grad_norm": 2.740752696990967,
"learning_rate": 3.2409558579488884e-05,
"loss": 0.6693,
"step": 6360
},
{
"epoch": 1.0570859608363756,
"grad_norm": 6.5435051918029785,
"learning_rate": 3.238190065272707e-05,
"loss": 0.6744,
"step": 6370
},
{
"epoch": 1.0587454364420843,
"grad_norm": 3.4088094234466553,
"learning_rate": 3.235424272596526e-05,
"loss": 0.6362,
"step": 6380
},
{
"epoch": 1.060404912047793,
"grad_norm": 4.100635051727295,
"learning_rate": 3.232658479920345e-05,
"loss": 0.6715,
"step": 6390
},
{
"epoch": 1.0620643876535014,
"grad_norm": 2.5293679237365723,
"learning_rate": 3.229892687244164e-05,
"loss": 0.7047,
"step": 6400
},
{
"epoch": 1.06372386325921,
"grad_norm": 3.4982504844665527,
"learning_rate": 3.227126894567983e-05,
"loss": 0.6444,
"step": 6410
},
{
"epoch": 1.0653833388649188,
"grad_norm": 2.6392831802368164,
"learning_rate": 3.224361101891802e-05,
"loss": 0.6596,
"step": 6420
},
{
"epoch": 1.0670428144706272,
"grad_norm": 2.652277708053589,
"learning_rate": 3.221595309215621e-05,
"loss": 0.6365,
"step": 6430
},
{
"epoch": 1.0687022900763359,
"grad_norm": 2.6296143531799316,
"learning_rate": 3.21882951653944e-05,
"loss": 0.684,
"step": 6440
},
{
"epoch": 1.0703617656820446,
"grad_norm": 2.9363362789154053,
"learning_rate": 3.216063723863259e-05,
"loss": 0.6578,
"step": 6450
},
{
"epoch": 1.072021241287753,
"grad_norm": 2.624547004699707,
"learning_rate": 3.2132979311870783e-05,
"loss": 0.6462,
"step": 6460
},
{
"epoch": 1.0736807168934617,
"grad_norm": 2.5561087131500244,
"learning_rate": 3.2105321385108974e-05,
"loss": 0.643,
"step": 6470
},
{
"epoch": 1.0753401924991703,
"grad_norm": 2.7740066051483154,
"learning_rate": 3.2077663458347165e-05,
"loss": 0.6781,
"step": 6480
},
{
"epoch": 1.0769996681048788,
"grad_norm": 2.1783266067504883,
"learning_rate": 3.2050005531585355e-05,
"loss": 0.6169,
"step": 6490
},
{
"epoch": 1.0786591437105875,
"grad_norm": 2.971466541290283,
"learning_rate": 3.2022347604823546e-05,
"loss": 0.6347,
"step": 6500
},
{
"epoch": 1.0803186193162961,
"grad_norm": 3.469334602355957,
"learning_rate": 3.1994689678061737e-05,
"loss": 0.6412,
"step": 6510
},
{
"epoch": 1.0819780949220046,
"grad_norm": 3.781665802001953,
"learning_rate": 3.196703175129993e-05,
"loss": 0.6077,
"step": 6520
},
{
"epoch": 1.0836375705277133,
"grad_norm": 3.0542349815368652,
"learning_rate": 3.193937382453812e-05,
"loss": 0.6172,
"step": 6530
},
{
"epoch": 1.085297046133422,
"grad_norm": 2.6497995853424072,
"learning_rate": 3.191171589777631e-05,
"loss": 0.6114,
"step": 6540
},
{
"epoch": 1.0869565217391304,
"grad_norm": 3.0180935859680176,
"learning_rate": 3.188405797101449e-05,
"loss": 0.6214,
"step": 6550
},
{
"epoch": 1.088615997344839,
"grad_norm": 2.9615893363952637,
"learning_rate": 3.185640004425268e-05,
"loss": 0.6236,
"step": 6560
},
{
"epoch": 1.0902754729505477,
"grad_norm": 2.6770997047424316,
"learning_rate": 3.1828742117490873e-05,
"loss": 0.6799,
"step": 6570
},
{
"epoch": 1.0919349485562562,
"grad_norm": 4.559014320373535,
"learning_rate": 3.1801084190729064e-05,
"loss": 0.6842,
"step": 6580
},
{
"epoch": 1.0935944241619648,
"grad_norm": 2.9613256454467773,
"learning_rate": 3.1773426263967255e-05,
"loss": 0.6931,
"step": 6590
},
{
"epoch": 1.0952538997676735,
"grad_norm": 2.513901472091675,
"learning_rate": 3.1745768337205445e-05,
"loss": 0.6738,
"step": 6600
},
{
"epoch": 1.096913375373382,
"grad_norm": 3.3798329830169678,
"learning_rate": 3.1718110410443636e-05,
"loss": 0.6172,
"step": 6610
},
{
"epoch": 1.0985728509790906,
"grad_norm": 3.9530441761016846,
"learning_rate": 3.1690452483681827e-05,
"loss": 0.677,
"step": 6620
},
{
"epoch": 1.1002323265847993,
"grad_norm": 2.511976957321167,
"learning_rate": 3.166279455692002e-05,
"loss": 0.7343,
"step": 6630
},
{
"epoch": 1.1018918021905078,
"grad_norm": 3.531120538711548,
"learning_rate": 3.163513663015821e-05,
"loss": 0.6117,
"step": 6640
},
{
"epoch": 1.1035512777962164,
"grad_norm": 2.912233829498291,
"learning_rate": 3.160747870339639e-05,
"loss": 0.6477,
"step": 6650
},
{
"epoch": 1.105210753401925,
"grad_norm": 2.7462551593780518,
"learning_rate": 3.157982077663458e-05,
"loss": 0.6137,
"step": 6660
},
{
"epoch": 1.1068702290076335,
"grad_norm": 3.05641770362854,
"learning_rate": 3.155216284987277e-05,
"loss": 0.6083,
"step": 6670
},
{
"epoch": 1.1085297046133422,
"grad_norm": 2.6118903160095215,
"learning_rate": 3.1524504923110963e-05,
"loss": 0.5909,
"step": 6680
},
{
"epoch": 1.1101891802190509,
"grad_norm": 2.862626314163208,
"learning_rate": 3.1496846996349154e-05,
"loss": 0.5916,
"step": 6690
},
{
"epoch": 1.1118486558247593,
"grad_norm": 3.342639923095703,
"learning_rate": 3.1469189069587345e-05,
"loss": 0.7119,
"step": 6700
},
{
"epoch": 1.113508131430468,
"grad_norm": 2.773423910140991,
"learning_rate": 3.1441531142825535e-05,
"loss": 0.6357,
"step": 6710
},
{
"epoch": 1.1151676070361765,
"grad_norm": 2.826077461242676,
"learning_rate": 3.1413873216063726e-05,
"loss": 0.6012,
"step": 6720
},
{
"epoch": 1.1168270826418851,
"grad_norm": 3.7453114986419678,
"learning_rate": 3.1386215289301917e-05,
"loss": 0.6853,
"step": 6730
},
{
"epoch": 1.1184865582475938,
"grad_norm": 2.565749406814575,
"learning_rate": 3.13585573625401e-05,
"loss": 0.6068,
"step": 6740
},
{
"epoch": 1.1201460338533025,
"grad_norm": 3.0959222316741943,
"learning_rate": 3.133089943577829e-05,
"loss": 0.6165,
"step": 6750
},
{
"epoch": 1.121805509459011,
"grad_norm": 2.629734992980957,
"learning_rate": 3.130324150901648e-05,
"loss": 0.6487,
"step": 6760
},
{
"epoch": 1.1234649850647196,
"grad_norm": 3.2976136207580566,
"learning_rate": 3.127558358225467e-05,
"loss": 0.6019,
"step": 6770
},
{
"epoch": 1.125124460670428,
"grad_norm": 2.2839882373809814,
"learning_rate": 3.124792565549286e-05,
"loss": 0.6067,
"step": 6780
},
{
"epoch": 1.1267839362761367,
"grad_norm": 2.334839105606079,
"learning_rate": 3.1220267728731054e-05,
"loss": 0.6528,
"step": 6790
},
{
"epoch": 1.1284434118818454,
"grad_norm": 1.9476408958435059,
"learning_rate": 3.1192609801969244e-05,
"loss": 0.6461,
"step": 6800
},
{
"epoch": 1.130102887487554,
"grad_norm": 2.838207244873047,
"learning_rate": 3.1164951875207435e-05,
"loss": 0.7088,
"step": 6810
},
{
"epoch": 1.1317623630932625,
"grad_norm": 5.253453731536865,
"learning_rate": 3.1137293948445625e-05,
"loss": 0.59,
"step": 6820
},
{
"epoch": 1.1334218386989712,
"grad_norm": 2.9335341453552246,
"learning_rate": 3.1109636021683816e-05,
"loss": 0.6578,
"step": 6830
},
{
"epoch": 1.1350813143046796,
"grad_norm": 3.694380521774292,
"learning_rate": 3.108197809492201e-05,
"loss": 0.6394,
"step": 6840
},
{
"epoch": 1.1367407899103883,
"grad_norm": 2.563654661178589,
"learning_rate": 3.10543201681602e-05,
"loss": 0.6348,
"step": 6850
},
{
"epoch": 1.138400265516097,
"grad_norm": 3.120288610458374,
"learning_rate": 3.102666224139839e-05,
"loss": 0.6605,
"step": 6860
},
{
"epoch": 1.1400597411218054,
"grad_norm": 3.0192902088165283,
"learning_rate": 3.099900431463658e-05,
"loss": 0.6986,
"step": 6870
},
{
"epoch": 1.141719216727514,
"grad_norm": 2.648144483566284,
"learning_rate": 3.097134638787477e-05,
"loss": 0.6621,
"step": 6880
},
{
"epoch": 1.1433786923332228,
"grad_norm": 4.2885541915893555,
"learning_rate": 3.094368846111296e-05,
"loss": 0.638,
"step": 6890
},
{
"epoch": 1.1450381679389312,
"grad_norm": 2.5953876972198486,
"learning_rate": 3.091603053435115e-05,
"loss": 0.671,
"step": 6900
},
{
"epoch": 1.1466976435446399,
"grad_norm": 2.913402557373047,
"learning_rate": 3.088837260758934e-05,
"loss": 0.6305,
"step": 6910
},
{
"epoch": 1.1483571191503485,
"grad_norm": 3.232034206390381,
"learning_rate": 3.086071468082753e-05,
"loss": 0.6411,
"step": 6920
},
{
"epoch": 1.150016594756057,
"grad_norm": 3.1293694972991943,
"learning_rate": 3.0833056754065715e-05,
"loss": 0.6412,
"step": 6930
},
{
"epoch": 1.1516760703617657,
"grad_norm": 2.8033883571624756,
"learning_rate": 3.0805398827303906e-05,
"loss": 0.6943,
"step": 6940
},
{
"epoch": 1.1533355459674743,
"grad_norm": 3.2198326587677,
"learning_rate": 3.07777409005421e-05,
"loss": 0.6954,
"step": 6950
},
{
"epoch": 1.1549950215731828,
"grad_norm": 2.676884174346924,
"learning_rate": 3.075008297378029e-05,
"loss": 0.7078,
"step": 6960
},
{
"epoch": 1.1566544971788915,
"grad_norm": 2.441145658493042,
"learning_rate": 3.072242504701848e-05,
"loss": 0.6885,
"step": 6970
},
{
"epoch": 1.1583139727846001,
"grad_norm": 5.0609612464904785,
"learning_rate": 3.069476712025667e-05,
"loss": 0.6249,
"step": 6980
},
{
"epoch": 1.1599734483903086,
"grad_norm": 3.058180332183838,
"learning_rate": 3.066710919349486e-05,
"loss": 0.5588,
"step": 6990
},
{
"epoch": 1.1616329239960173,
"grad_norm": 3.6752076148986816,
"learning_rate": 3.063945126673305e-05,
"loss": 0.6351,
"step": 7000
},
{
"epoch": 1.1616329239960173,
"eval_gen_len": 42.34563253012048,
"eval_loss": 0.6441066265106201,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1313.5891,
"eval_samples_per_second": 5.046,
"eval_steps_per_second": 0.316,
"step": 7000
},
{
"epoch": 1.163292399601726,
"grad_norm": 2.524378776550293,
"learning_rate": 3.061179333997124e-05,
"loss": 0.6689,
"step": 7010
},
{
"epoch": 1.1649518752074344,
"grad_norm": 3.153264284133911,
"learning_rate": 3.0584135413209424e-05,
"loss": 0.6713,
"step": 7020
},
{
"epoch": 1.166611350813143,
"grad_norm": 2.891984224319458,
"learning_rate": 3.0556477486447615e-05,
"loss": 0.6728,
"step": 7030
},
{
"epoch": 1.1682708264188517,
"grad_norm": 2.646772623062134,
"learning_rate": 3.0528819559685805e-05,
"loss": 0.6418,
"step": 7040
},
{
"epoch": 1.1699303020245602,
"grad_norm": 2.870234489440918,
"learning_rate": 3.0501161632923996e-05,
"loss": 0.6429,
"step": 7050
},
{
"epoch": 1.1715897776302688,
"grad_norm": 2.939676523208618,
"learning_rate": 3.0473503706162187e-05,
"loss": 0.6506,
"step": 7060
},
{
"epoch": 1.1732492532359775,
"grad_norm": 3.037081003189087,
"learning_rate": 3.0445845779400377e-05,
"loss": 0.6933,
"step": 7070
},
{
"epoch": 1.174908728841686,
"grad_norm": 2.871654987335205,
"learning_rate": 3.0418187852638568e-05,
"loss": 0.6712,
"step": 7080
},
{
"epoch": 1.1765682044473946,
"grad_norm": 2.945828914642334,
"learning_rate": 3.039052992587676e-05,
"loss": 0.6474,
"step": 7090
},
{
"epoch": 1.1782276800531033,
"grad_norm": 3.159989595413208,
"learning_rate": 3.036287199911495e-05,
"loss": 0.6564,
"step": 7100
},
{
"epoch": 1.1798871556588117,
"grad_norm": 2.5197629928588867,
"learning_rate": 3.0335214072353136e-05,
"loss": 0.6618,
"step": 7110
},
{
"epoch": 1.1815466312645204,
"grad_norm": 2.656416416168213,
"learning_rate": 3.0307556145591327e-05,
"loss": 0.6439,
"step": 7120
},
{
"epoch": 1.183206106870229,
"grad_norm": 3.2077407836914062,
"learning_rate": 3.0279898218829518e-05,
"loss": 0.659,
"step": 7130
},
{
"epoch": 1.1848655824759375,
"grad_norm": 2.5919651985168457,
"learning_rate": 3.0252240292067708e-05,
"loss": 0.6787,
"step": 7140
},
{
"epoch": 1.1865250580816462,
"grad_norm": 3.7159323692321777,
"learning_rate": 3.02245823653059e-05,
"loss": 0.6452,
"step": 7150
},
{
"epoch": 1.1881845336873549,
"grad_norm": 2.56329607963562,
"learning_rate": 3.019692443854409e-05,
"loss": 0.6842,
"step": 7160
},
{
"epoch": 1.1898440092930633,
"grad_norm": 2.9777672290802,
"learning_rate": 3.016926651178228e-05,
"loss": 0.6809,
"step": 7170
},
{
"epoch": 1.191503484898772,
"grad_norm": 2.5090603828430176,
"learning_rate": 3.014160858502047e-05,
"loss": 0.6442,
"step": 7180
},
{
"epoch": 1.1931629605044807,
"grad_norm": 2.4584901332855225,
"learning_rate": 3.011395065825866e-05,
"loss": 0.6453,
"step": 7190
},
{
"epoch": 1.1948224361101891,
"grad_norm": 3.469609260559082,
"learning_rate": 3.0086292731496845e-05,
"loss": 0.697,
"step": 7200
},
{
"epoch": 1.1964819117158978,
"grad_norm": 3.686086893081665,
"learning_rate": 3.0058634804735036e-05,
"loss": 0.6188,
"step": 7210
},
{
"epoch": 1.1981413873216065,
"grad_norm": 2.244830846786499,
"learning_rate": 3.0030976877973226e-05,
"loss": 0.6651,
"step": 7220
},
{
"epoch": 1.199800862927315,
"grad_norm": 2.945749521255493,
"learning_rate": 3.0003318951211417e-05,
"loss": 0.6997,
"step": 7230
},
{
"epoch": 1.2014603385330236,
"grad_norm": 2.82460880279541,
"learning_rate": 2.9975661024449608e-05,
"loss": 0.5754,
"step": 7240
},
{
"epoch": 1.2031198141387323,
"grad_norm": 3.569021224975586,
"learning_rate": 2.99480030976878e-05,
"loss": 0.6104,
"step": 7250
},
{
"epoch": 1.2047792897444407,
"grad_norm": 3.185415744781494,
"learning_rate": 2.992034517092599e-05,
"loss": 0.6243,
"step": 7260
},
{
"epoch": 1.2064387653501494,
"grad_norm": 2.5919158458709717,
"learning_rate": 2.989268724416418e-05,
"loss": 0.6446,
"step": 7270
},
{
"epoch": 1.208098240955858,
"grad_norm": 2.6875202655792236,
"learning_rate": 2.986502931740237e-05,
"loss": 0.6539,
"step": 7280
},
{
"epoch": 1.2097577165615665,
"grad_norm": 3.1231086254119873,
"learning_rate": 2.983737139064056e-05,
"loss": 0.6445,
"step": 7290
},
{
"epoch": 1.2114171921672752,
"grad_norm": 3.024702787399292,
"learning_rate": 2.9809713463878748e-05,
"loss": 0.5636,
"step": 7300
},
{
"epoch": 1.2130766677729836,
"grad_norm": 2.9127464294433594,
"learning_rate": 2.978205553711694e-05,
"loss": 0.6534,
"step": 7310
},
{
"epoch": 1.2147361433786923,
"grad_norm": 4.460144996643066,
"learning_rate": 2.975439761035513e-05,
"loss": 0.6012,
"step": 7320
},
{
"epoch": 1.216395618984401,
"grad_norm": 2.9597043991088867,
"learning_rate": 2.972673968359332e-05,
"loss": 0.6361,
"step": 7330
},
{
"epoch": 1.2180550945901096,
"grad_norm": 2.4563422203063965,
"learning_rate": 2.969908175683151e-05,
"loss": 0.6568,
"step": 7340
},
{
"epoch": 1.219714570195818,
"grad_norm": 2.4884228706359863,
"learning_rate": 2.96714238300697e-05,
"loss": 0.6493,
"step": 7350
},
{
"epoch": 1.2213740458015268,
"grad_norm": 2.275660514831543,
"learning_rate": 2.9643765903307892e-05,
"loss": 0.6286,
"step": 7360
},
{
"epoch": 1.2230335214072352,
"grad_norm": 2.318924903869629,
"learning_rate": 2.9616107976546082e-05,
"loss": 0.5821,
"step": 7370
},
{
"epoch": 1.2246929970129439,
"grad_norm": 2.770963430404663,
"learning_rate": 2.9588450049784273e-05,
"loss": 0.6369,
"step": 7380
},
{
"epoch": 1.2263524726186525,
"grad_norm": 2.4397408962249756,
"learning_rate": 2.9560792123022457e-05,
"loss": 0.7002,
"step": 7390
},
{
"epoch": 1.2280119482243612,
"grad_norm": 3.2758724689483643,
"learning_rate": 2.9533134196260647e-05,
"loss": 0.6908,
"step": 7400
},
{
"epoch": 1.2296714238300697,
"grad_norm": 2.9591310024261475,
"learning_rate": 2.9505476269498838e-05,
"loss": 0.6101,
"step": 7410
},
{
"epoch": 1.2313308994357783,
"grad_norm": 3.2585608959198,
"learning_rate": 2.947781834273703e-05,
"loss": 0.6751,
"step": 7420
},
{
"epoch": 1.2329903750414868,
"grad_norm": 2.8872411251068115,
"learning_rate": 2.945016041597522e-05,
"loss": 0.672,
"step": 7430
},
{
"epoch": 1.2346498506471955,
"grad_norm": 2.7128067016601562,
"learning_rate": 2.942250248921341e-05,
"loss": 0.6619,
"step": 7440
},
{
"epoch": 1.2363093262529041,
"grad_norm": 2.6857752799987793,
"learning_rate": 2.93948445624516e-05,
"loss": 0.6276,
"step": 7450
},
{
"epoch": 1.2379688018586128,
"grad_norm": 2.4469690322875977,
"learning_rate": 2.936718663568979e-05,
"loss": 0.6659,
"step": 7460
},
{
"epoch": 1.2396282774643212,
"grad_norm": 2.634112596511841,
"learning_rate": 2.9339528708927982e-05,
"loss": 0.6445,
"step": 7470
},
{
"epoch": 1.24128775307003,
"grad_norm": 3.0444681644439697,
"learning_rate": 2.931187078216617e-05,
"loss": 0.6314,
"step": 7480
},
{
"epoch": 1.2429472286757384,
"grad_norm": 3.5143823623657227,
"learning_rate": 2.928421285540436e-05,
"loss": 0.6747,
"step": 7490
},
{
"epoch": 1.244606704281447,
"grad_norm": 2.7041027545928955,
"learning_rate": 2.925655492864255e-05,
"loss": 0.628,
"step": 7500
},
{
"epoch": 1.2462661798871557,
"grad_norm": 2.981811285018921,
"learning_rate": 2.922889700188074e-05,
"loss": 0.6689,
"step": 7510
},
{
"epoch": 1.2479256554928642,
"grad_norm": 2.5438361167907715,
"learning_rate": 2.920123907511893e-05,
"loss": 0.6445,
"step": 7520
},
{
"epoch": 1.2495851310985728,
"grad_norm": 3.3671348094940186,
"learning_rate": 2.9173581148357122e-05,
"loss": 0.7005,
"step": 7530
},
{
"epoch": 1.2512446067042815,
"grad_norm": 2.6247966289520264,
"learning_rate": 2.9145923221595313e-05,
"loss": 0.6512,
"step": 7540
},
{
"epoch": 1.25290408230999,
"grad_norm": 3.0514414310455322,
"learning_rate": 2.9118265294833503e-05,
"loss": 0.6164,
"step": 7550
},
{
"epoch": 1.2545635579156986,
"grad_norm": 2.6410088539123535,
"learning_rate": 2.9090607368071694e-05,
"loss": 0.5988,
"step": 7560
},
{
"epoch": 1.2562230335214073,
"grad_norm": 3.9841790199279785,
"learning_rate": 2.9062949441309885e-05,
"loss": 0.6863,
"step": 7570
},
{
"epoch": 1.257882509127116,
"grad_norm": 2.80208158493042,
"learning_rate": 2.903529151454807e-05,
"loss": 0.6827,
"step": 7580
},
{
"epoch": 1.2595419847328244,
"grad_norm": 3.831223964691162,
"learning_rate": 2.900763358778626e-05,
"loss": 0.6628,
"step": 7590
},
{
"epoch": 1.261201460338533,
"grad_norm": 3.715212821960449,
"learning_rate": 2.897997566102445e-05,
"loss": 0.7214,
"step": 7600
},
{
"epoch": 1.2628609359442415,
"grad_norm": 3.88436222076416,
"learning_rate": 2.895231773426264e-05,
"loss": 0.612,
"step": 7610
},
{
"epoch": 1.2645204115499502,
"grad_norm": 2.4657158851623535,
"learning_rate": 2.892465980750083e-05,
"loss": 0.5995,
"step": 7620
},
{
"epoch": 1.2661798871556589,
"grad_norm": 3.639241933822632,
"learning_rate": 2.889700188073902e-05,
"loss": 0.6866,
"step": 7630
},
{
"epoch": 1.2678393627613673,
"grad_norm": 2.5907673835754395,
"learning_rate": 2.8869343953977212e-05,
"loss": 0.6062,
"step": 7640
},
{
"epoch": 1.269498838367076,
"grad_norm": 3.1435792446136475,
"learning_rate": 2.8841686027215403e-05,
"loss": 0.6667,
"step": 7650
},
{
"epoch": 1.2711583139727847,
"grad_norm": 3.6317081451416016,
"learning_rate": 2.8814028100453593e-05,
"loss": 0.6012,
"step": 7660
},
{
"epoch": 1.2728177895784931,
"grad_norm": 3.563117265701294,
"learning_rate": 2.878637017369178e-05,
"loss": 0.6467,
"step": 7670
},
{
"epoch": 1.2744772651842018,
"grad_norm": 2.753971576690674,
"learning_rate": 2.875871224692997e-05,
"loss": 0.7008,
"step": 7680
},
{
"epoch": 1.2761367407899105,
"grad_norm": 2.5007359981536865,
"learning_rate": 2.8731054320168162e-05,
"loss": 0.6534,
"step": 7690
},
{
"epoch": 1.277796216395619,
"grad_norm": 4.024910926818848,
"learning_rate": 2.8703396393406352e-05,
"loss": 0.6863,
"step": 7700
},
{
"epoch": 1.2794556920013276,
"grad_norm": 2.818535566329956,
"learning_rate": 2.8675738466644543e-05,
"loss": 0.6127,
"step": 7710
},
{
"epoch": 1.2811151676070363,
"grad_norm": 2.7450509071350098,
"learning_rate": 2.8648080539882734e-05,
"loss": 0.6392,
"step": 7720
},
{
"epoch": 1.2827746432127447,
"grad_norm": 3.4333643913269043,
"learning_rate": 2.8620422613120924e-05,
"loss": 0.6657,
"step": 7730
},
{
"epoch": 1.2844341188184534,
"grad_norm": 2.8145976066589355,
"learning_rate": 2.8592764686359115e-05,
"loss": 0.644,
"step": 7740
},
{
"epoch": 1.286093594424162,
"grad_norm": 3.2298789024353027,
"learning_rate": 2.8565106759597306e-05,
"loss": 0.5739,
"step": 7750
},
{
"epoch": 1.2877530700298705,
"grad_norm": 2.691585063934326,
"learning_rate": 2.853744883283549e-05,
"loss": 0.6572,
"step": 7760
},
{
"epoch": 1.2894125456355792,
"grad_norm": 2.9739527702331543,
"learning_rate": 2.850979090607368e-05,
"loss": 0.655,
"step": 7770
},
{
"epoch": 1.2910720212412876,
"grad_norm": 2.2871487140655518,
"learning_rate": 2.848213297931187e-05,
"loss": 0.6159,
"step": 7780
},
{
"epoch": 1.2927314968469963,
"grad_norm": 2.895627975463867,
"learning_rate": 2.845447505255006e-05,
"loss": 0.621,
"step": 7790
},
{
"epoch": 1.294390972452705,
"grad_norm": 2.9289920330047607,
"learning_rate": 2.8426817125788252e-05,
"loss": 0.6461,
"step": 7800
},
{
"epoch": 1.2960504480584136,
"grad_norm": 3.2139761447906494,
"learning_rate": 2.8399159199026443e-05,
"loss": 0.626,
"step": 7810
},
{
"epoch": 1.297709923664122,
"grad_norm": 2.8325042724609375,
"learning_rate": 2.8371501272264633e-05,
"loss": 0.6721,
"step": 7820
},
{
"epoch": 1.2993693992698307,
"grad_norm": 3.305152654647827,
"learning_rate": 2.8343843345502824e-05,
"loss": 0.6762,
"step": 7830
},
{
"epoch": 1.3010288748755392,
"grad_norm": 3.015709161758423,
"learning_rate": 2.8316185418741014e-05,
"loss": 0.7299,
"step": 7840
},
{
"epoch": 1.3026883504812479,
"grad_norm": 2.6467528343200684,
"learning_rate": 2.8288527491979198e-05,
"loss": 0.6014,
"step": 7850
},
{
"epoch": 1.3043478260869565,
"grad_norm": 3.627946138381958,
"learning_rate": 2.826086956521739e-05,
"loss": 0.5908,
"step": 7860
},
{
"epoch": 1.3060073016926652,
"grad_norm": 2.5281457901000977,
"learning_rate": 2.8233211638455583e-05,
"loss": 0.6756,
"step": 7870
},
{
"epoch": 1.3076667772983737,
"grad_norm": 2.6659297943115234,
"learning_rate": 2.8205553711693773e-05,
"loss": 0.7228,
"step": 7880
},
{
"epoch": 1.3093262529040823,
"grad_norm": 3.160283327102661,
"learning_rate": 2.8177895784931964e-05,
"loss": 0.6716,
"step": 7890
},
{
"epoch": 1.3109857285097908,
"grad_norm": 2.502490520477295,
"learning_rate": 2.8150237858170155e-05,
"loss": 0.6461,
"step": 7900
},
{
"epoch": 1.3126452041154995,
"grad_norm": 3.6391186714172363,
"learning_rate": 2.8122579931408345e-05,
"loss": 0.6583,
"step": 7910
},
{
"epoch": 1.3143046797212081,
"grad_norm": 3.105423927307129,
"learning_rate": 2.8094922004646536e-05,
"loss": 0.6507,
"step": 7920
},
{
"epoch": 1.3159641553269168,
"grad_norm": 3.11173415184021,
"learning_rate": 2.8067264077884727e-05,
"loss": 0.6345,
"step": 7930
},
{
"epoch": 1.3176236309326252,
"grad_norm": 2.8621175289154053,
"learning_rate": 2.8039606151122917e-05,
"loss": 0.6265,
"step": 7940
},
{
"epoch": 1.319283106538334,
"grad_norm": 3.6533260345458984,
"learning_rate": 2.80119482243611e-05,
"loss": 0.7305,
"step": 7950
},
{
"epoch": 1.3209425821440424,
"grad_norm": 2.6195852756500244,
"learning_rate": 2.798429029759929e-05,
"loss": 0.63,
"step": 7960
},
{
"epoch": 1.322602057749751,
"grad_norm": 3.0489957332611084,
"learning_rate": 2.7956632370837482e-05,
"loss": 0.7371,
"step": 7970
},
{
"epoch": 1.3242615333554597,
"grad_norm": 2.971059560775757,
"learning_rate": 2.7928974444075673e-05,
"loss": 0.735,
"step": 7980
},
{
"epoch": 1.3259210089611684,
"grad_norm": 2.765115976333618,
"learning_rate": 2.7901316517313863e-05,
"loss": 0.6667,
"step": 7990
},
{
"epoch": 1.3275804845668768,
"grad_norm": 2.631791353225708,
"learning_rate": 2.7873658590552054e-05,
"loss": 0.6851,
"step": 8000
},
{
"epoch": 1.3275804845668768,
"eval_gen_len": 43.86626506024096,
"eval_loss": 0.6383147239685059,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1348.2209,
"eval_samples_per_second": 4.916,
"eval_steps_per_second": 0.308,
"step": 8000
},
{
"epoch": 1.3292399601725855,
"grad_norm": 2.0661141872406006,
"learning_rate": 2.7846000663790245e-05,
"loss": 0.6178,
"step": 8010
},
{
"epoch": 1.330899435778294,
"grad_norm": 2.870048999786377,
"learning_rate": 2.7818342737028435e-05,
"loss": 0.5981,
"step": 8020
},
{
"epoch": 1.3325589113840026,
"grad_norm": 2.73580265045166,
"learning_rate": 2.7790684810266626e-05,
"loss": 0.675,
"step": 8030
},
{
"epoch": 1.3342183869897113,
"grad_norm": 3.086110830307007,
"learning_rate": 2.776302688350481e-05,
"loss": 0.6549,
"step": 8040
},
{
"epoch": 1.33587786259542,
"grad_norm": 2.6239116191864014,
"learning_rate": 2.7735368956743e-05,
"loss": 0.6646,
"step": 8050
},
{
"epoch": 1.3375373382011284,
"grad_norm": 2.4428651332855225,
"learning_rate": 2.770771102998119e-05,
"loss": 0.6159,
"step": 8060
},
{
"epoch": 1.339196813806837,
"grad_norm": 2.848881244659424,
"learning_rate": 2.768005310321938e-05,
"loss": 0.6522,
"step": 8070
},
{
"epoch": 1.3408562894125455,
"grad_norm": 2.495037078857422,
"learning_rate": 2.7652395176457572e-05,
"loss": 0.6597,
"step": 8080
},
{
"epoch": 1.3425157650182542,
"grad_norm": 2.6207704544067383,
"learning_rate": 2.7624737249695763e-05,
"loss": 0.655,
"step": 8090
},
{
"epoch": 1.3441752406239629,
"grad_norm": 4.076655864715576,
"learning_rate": 2.7597079322933957e-05,
"loss": 0.6712,
"step": 8100
},
{
"epoch": 1.3458347162296715,
"grad_norm": 2.6263961791992188,
"learning_rate": 2.7569421396172148e-05,
"loss": 0.6117,
"step": 8110
},
{
"epoch": 1.34749419183538,
"grad_norm": 3.9632880687713623,
"learning_rate": 2.7541763469410338e-05,
"loss": 0.65,
"step": 8120
},
{
"epoch": 1.3491536674410887,
"grad_norm": 2.890467405319214,
"learning_rate": 2.7514105542648522e-05,
"loss": 0.584,
"step": 8130
},
{
"epoch": 1.3508131430467971,
"grad_norm": 2.1744771003723145,
"learning_rate": 2.7486447615886713e-05,
"loss": 0.6357,
"step": 8140
},
{
"epoch": 1.3524726186525058,
"grad_norm": 2.8852360248565674,
"learning_rate": 2.7458789689124903e-05,
"loss": 0.6569,
"step": 8150
},
{
"epoch": 1.3541320942582145,
"grad_norm": 3.050649404525757,
"learning_rate": 2.7431131762363094e-05,
"loss": 0.6305,
"step": 8160
},
{
"epoch": 1.3557915698639231,
"grad_norm": 3.469940423965454,
"learning_rate": 2.7403473835601284e-05,
"loss": 0.6498,
"step": 8170
},
{
"epoch": 1.3574510454696316,
"grad_norm": 2.8733341693878174,
"learning_rate": 2.7375815908839475e-05,
"loss": 0.7025,
"step": 8180
},
{
"epoch": 1.3591105210753402,
"grad_norm": 2.502155065536499,
"learning_rate": 2.7348157982077666e-05,
"loss": 0.6661,
"step": 8190
},
{
"epoch": 1.3607699966810487,
"grad_norm": 3.2295703887939453,
"learning_rate": 2.7320500055315856e-05,
"loss": 0.5898,
"step": 8200
},
{
"epoch": 1.3624294722867574,
"grad_norm": 3.060533046722412,
"learning_rate": 2.7292842128554047e-05,
"loss": 0.5634,
"step": 8210
},
{
"epoch": 1.364088947892466,
"grad_norm": 2.6943938732147217,
"learning_rate": 2.7265184201792238e-05,
"loss": 0.5965,
"step": 8220
},
{
"epoch": 1.3657484234981747,
"grad_norm": 3.3528614044189453,
"learning_rate": 2.723752627503042e-05,
"loss": 0.6374,
"step": 8230
},
{
"epoch": 1.3674078991038832,
"grad_norm": 3.091634511947632,
"learning_rate": 2.7209868348268612e-05,
"loss": 0.6532,
"step": 8240
},
{
"epoch": 1.3690673747095918,
"grad_norm": 2.3139495849609375,
"learning_rate": 2.7182210421506803e-05,
"loss": 0.6079,
"step": 8250
},
{
"epoch": 1.3707268503153003,
"grad_norm": 2.5526602268218994,
"learning_rate": 2.7154552494744993e-05,
"loss": 0.6627,
"step": 8260
},
{
"epoch": 1.372386325921009,
"grad_norm": 2.624354600906372,
"learning_rate": 2.7126894567983184e-05,
"loss": 0.6785,
"step": 8270
},
{
"epoch": 1.3740458015267176,
"grad_norm": 2.9834911823272705,
"learning_rate": 2.7099236641221375e-05,
"loss": 0.6338,
"step": 8280
},
{
"epoch": 1.375705277132426,
"grad_norm": 2.112609624862671,
"learning_rate": 2.7071578714459565e-05,
"loss": 0.5915,
"step": 8290
},
{
"epoch": 1.3773647527381347,
"grad_norm": 3.110013723373413,
"learning_rate": 2.7043920787697756e-05,
"loss": 0.616,
"step": 8300
},
{
"epoch": 1.3790242283438434,
"grad_norm": 3.0879156589508057,
"learning_rate": 2.7016262860935946e-05,
"loss": 0.5853,
"step": 8310
},
{
"epoch": 1.3806837039495519,
"grad_norm": 2.6645970344543457,
"learning_rate": 2.6988604934174134e-05,
"loss": 0.6464,
"step": 8320
},
{
"epoch": 1.3823431795552605,
"grad_norm": 3.700145721435547,
"learning_rate": 2.6960947007412324e-05,
"loss": 0.6499,
"step": 8330
},
{
"epoch": 1.3840026551609692,
"grad_norm": 3.2514538764953613,
"learning_rate": 2.6933289080650515e-05,
"loss": 0.5642,
"step": 8340
},
{
"epoch": 1.3856621307666777,
"grad_norm": 3.3618319034576416,
"learning_rate": 2.6905631153888705e-05,
"loss": 0.6425,
"step": 8350
},
{
"epoch": 1.3873216063723863,
"grad_norm": 2.636019229888916,
"learning_rate": 2.6877973227126896e-05,
"loss": 0.6842,
"step": 8360
},
{
"epoch": 1.388981081978095,
"grad_norm": 3.2628235816955566,
"learning_rate": 2.6850315300365087e-05,
"loss": 0.6271,
"step": 8370
},
{
"epoch": 1.3906405575838034,
"grad_norm": 2.7013912200927734,
"learning_rate": 2.6822657373603277e-05,
"loss": 0.6782,
"step": 8380
},
{
"epoch": 1.3923000331895121,
"grad_norm": 2.627906084060669,
"learning_rate": 2.6794999446841468e-05,
"loss": 0.6403,
"step": 8390
},
{
"epoch": 1.3939595087952208,
"grad_norm": 3.1490073204040527,
"learning_rate": 2.676734152007966e-05,
"loss": 0.6318,
"step": 8400
},
{
"epoch": 1.3956189844009292,
"grad_norm": 3.9242594242095947,
"learning_rate": 2.6739683593317842e-05,
"loss": 0.6493,
"step": 8410
},
{
"epoch": 1.397278460006638,
"grad_norm": 2.508676528930664,
"learning_rate": 2.6712025666556033e-05,
"loss": 0.6678,
"step": 8420
},
{
"epoch": 1.3989379356123464,
"grad_norm": 2.6345014572143555,
"learning_rate": 2.6684367739794224e-05,
"loss": 0.6622,
"step": 8430
},
{
"epoch": 1.400597411218055,
"grad_norm": 3.522874116897583,
"learning_rate": 2.6656709813032414e-05,
"loss": 0.6576,
"step": 8440
},
{
"epoch": 1.4022568868237637,
"grad_norm": 3.2026262283325195,
"learning_rate": 2.6629051886270605e-05,
"loss": 0.6862,
"step": 8450
},
{
"epoch": 1.4039163624294724,
"grad_norm": 3.2341012954711914,
"learning_rate": 2.6601393959508796e-05,
"loss": 0.6862,
"step": 8460
},
{
"epoch": 1.4055758380351808,
"grad_norm": 3.0050671100616455,
"learning_rate": 2.6573736032746986e-05,
"loss": 0.6724,
"step": 8470
},
{
"epoch": 1.4072353136408895,
"grad_norm": 2.9396309852600098,
"learning_rate": 2.6546078105985177e-05,
"loss": 0.6655,
"step": 8480
},
{
"epoch": 1.408894789246598,
"grad_norm": 3.1885645389556885,
"learning_rate": 2.6518420179223367e-05,
"loss": 0.6946,
"step": 8490
},
{
"epoch": 1.4105542648523066,
"grad_norm": 2.589405059814453,
"learning_rate": 2.6490762252461558e-05,
"loss": 0.5516,
"step": 8500
},
{
"epoch": 1.4122137404580153,
"grad_norm": 2.978219747543335,
"learning_rate": 2.6463104325699745e-05,
"loss": 0.6607,
"step": 8510
},
{
"epoch": 1.413873216063724,
"grad_norm": 2.7965850830078125,
"learning_rate": 2.6435446398937936e-05,
"loss": 0.6475,
"step": 8520
},
{
"epoch": 1.4155326916694324,
"grad_norm": 2.3449814319610596,
"learning_rate": 2.6407788472176126e-05,
"loss": 0.5845,
"step": 8530
},
{
"epoch": 1.417192167275141,
"grad_norm": 2.6544861793518066,
"learning_rate": 2.6380130545414317e-05,
"loss": 0.6219,
"step": 8540
},
{
"epoch": 1.4188516428808495,
"grad_norm": 2.6153106689453125,
"learning_rate": 2.6352472618652508e-05,
"loss": 0.5959,
"step": 8550
},
{
"epoch": 1.4205111184865582,
"grad_norm": 2.964139938354492,
"learning_rate": 2.63248146918907e-05,
"loss": 0.606,
"step": 8560
},
{
"epoch": 1.4221705940922669,
"grad_norm": 3.122018814086914,
"learning_rate": 2.629715676512889e-05,
"loss": 0.6384,
"step": 8570
},
{
"epoch": 1.4238300696979755,
"grad_norm": 2.3362247943878174,
"learning_rate": 2.626949883836708e-05,
"loss": 0.6425,
"step": 8580
},
{
"epoch": 1.425489545303684,
"grad_norm": 3.3441717624664307,
"learning_rate": 2.624184091160527e-05,
"loss": 0.6729,
"step": 8590
},
{
"epoch": 1.4271490209093927,
"grad_norm": 3.02563214302063,
"learning_rate": 2.6214182984843454e-05,
"loss": 0.6573,
"step": 8600
},
{
"epoch": 1.4288084965151011,
"grad_norm": 2.8026158809661865,
"learning_rate": 2.6186525058081645e-05,
"loss": 0.6934,
"step": 8610
},
{
"epoch": 1.4304679721208098,
"grad_norm": 2.406740427017212,
"learning_rate": 2.6158867131319835e-05,
"loss": 0.6005,
"step": 8620
},
{
"epoch": 1.4321274477265185,
"grad_norm": 2.6590702533721924,
"learning_rate": 2.6131209204558026e-05,
"loss": 0.6476,
"step": 8630
},
{
"epoch": 1.4337869233322271,
"grad_norm": 2.5404417514801025,
"learning_rate": 2.6103551277796216e-05,
"loss": 0.6904,
"step": 8640
},
{
"epoch": 1.4354463989379356,
"grad_norm": 2.8186190128326416,
"learning_rate": 2.6075893351034407e-05,
"loss": 0.5546,
"step": 8650
},
{
"epoch": 1.4371058745436442,
"grad_norm": 3.4550652503967285,
"learning_rate": 2.6048235424272598e-05,
"loss": 0.6777,
"step": 8660
},
{
"epoch": 1.4387653501493527,
"grad_norm": 2.432072877883911,
"learning_rate": 2.602057749751079e-05,
"loss": 0.6266,
"step": 8670
},
{
"epoch": 1.4404248257550614,
"grad_norm": 2.4931819438934326,
"learning_rate": 2.599291957074898e-05,
"loss": 0.6463,
"step": 8680
},
{
"epoch": 1.44208430136077,
"grad_norm": 3.0838961601257324,
"learning_rate": 2.5965261643987166e-05,
"loss": 0.653,
"step": 8690
},
{
"epoch": 1.4437437769664787,
"grad_norm": 2.035360336303711,
"learning_rate": 2.5937603717225357e-05,
"loss": 0.6876,
"step": 8700
},
{
"epoch": 1.4454032525721872,
"grad_norm": 2.6690499782562256,
"learning_rate": 2.5909945790463547e-05,
"loss": 0.7085,
"step": 8710
},
{
"epoch": 1.4470627281778958,
"grad_norm": 4.042128086090088,
"learning_rate": 2.5882287863701738e-05,
"loss": 0.6389,
"step": 8720
},
{
"epoch": 1.4487222037836043,
"grad_norm": 2.342874526977539,
"learning_rate": 2.585462993693993e-05,
"loss": 0.6777,
"step": 8730
},
{
"epoch": 1.450381679389313,
"grad_norm": 3.238492488861084,
"learning_rate": 2.582697201017812e-05,
"loss": 0.6661,
"step": 8740
},
{
"epoch": 1.4520411549950216,
"grad_norm": 2.1739718914031982,
"learning_rate": 2.579931408341631e-05,
"loss": 0.5928,
"step": 8750
},
{
"epoch": 1.4537006306007303,
"grad_norm": 2.4919285774230957,
"learning_rate": 2.57716561566545e-05,
"loss": 0.5875,
"step": 8760
},
{
"epoch": 1.4553601062064387,
"grad_norm": 3.0230798721313477,
"learning_rate": 2.574399822989269e-05,
"loss": 0.6158,
"step": 8770
},
{
"epoch": 1.4570195818121474,
"grad_norm": 2.7015600204467773,
"learning_rate": 2.5716340303130875e-05,
"loss": 0.6585,
"step": 8780
},
{
"epoch": 1.4586790574178559,
"grad_norm": 2.7721285820007324,
"learning_rate": 2.5688682376369066e-05,
"loss": 0.6449,
"step": 8790
},
{
"epoch": 1.4603385330235645,
"grad_norm": 2.542915105819702,
"learning_rate": 2.5661024449607256e-05,
"loss": 0.6492,
"step": 8800
},
{
"epoch": 1.4619980086292732,
"grad_norm": 2.4796090126037598,
"learning_rate": 2.5633366522845447e-05,
"loss": 0.6353,
"step": 8810
},
{
"epoch": 1.4636574842349819,
"grad_norm": 2.580699920654297,
"learning_rate": 2.5605708596083637e-05,
"loss": 0.6512,
"step": 8820
},
{
"epoch": 1.4653169598406903,
"grad_norm": 2.977383613586426,
"learning_rate": 2.5578050669321828e-05,
"loss": 0.5601,
"step": 8830
},
{
"epoch": 1.466976435446399,
"grad_norm": 2.8172965049743652,
"learning_rate": 2.555039274256002e-05,
"loss": 0.6693,
"step": 8840
},
{
"epoch": 1.4686359110521074,
"grad_norm": 2.785449981689453,
"learning_rate": 2.552273481579821e-05,
"loss": 0.5605,
"step": 8850
},
{
"epoch": 1.4702953866578161,
"grad_norm": 3.623539447784424,
"learning_rate": 2.54950768890364e-05,
"loss": 0.6339,
"step": 8860
},
{
"epoch": 1.4719548622635248,
"grad_norm": 3.051231861114502,
"learning_rate": 2.546741896227459e-05,
"loss": 0.719,
"step": 8870
},
{
"epoch": 1.4736143378692335,
"grad_norm": 3.1508374214172363,
"learning_rate": 2.5439761035512778e-05,
"loss": 0.6446,
"step": 8880
},
{
"epoch": 1.475273813474942,
"grad_norm": 2.500828742980957,
"learning_rate": 2.541210310875097e-05,
"loss": 0.6077,
"step": 8890
},
{
"epoch": 1.4769332890806506,
"grad_norm": 3.5404295921325684,
"learning_rate": 2.538444518198916e-05,
"loss": 0.6684,
"step": 8900
},
{
"epoch": 1.478592764686359,
"grad_norm": 3.6613194942474365,
"learning_rate": 2.535678725522735e-05,
"loss": 0.6374,
"step": 8910
},
{
"epoch": 1.4802522402920677,
"grad_norm": 2.6419968605041504,
"learning_rate": 2.532912932846554e-05,
"loss": 0.6182,
"step": 8920
},
{
"epoch": 1.4819117158977764,
"grad_norm": 3.8098807334899902,
"learning_rate": 2.530147140170373e-05,
"loss": 0.7255,
"step": 8930
},
{
"epoch": 1.4835711915034848,
"grad_norm": 3.2926089763641357,
"learning_rate": 2.527381347494192e-05,
"loss": 0.6579,
"step": 8940
},
{
"epoch": 1.4852306671091935,
"grad_norm": 4.201939582824707,
"learning_rate": 2.5246155548180112e-05,
"loss": 0.65,
"step": 8950
},
{
"epoch": 1.4868901427149022,
"grad_norm": 2.726099967956543,
"learning_rate": 2.5218497621418303e-05,
"loss": 0.5891,
"step": 8960
},
{
"epoch": 1.4885496183206106,
"grad_norm": 2.545718193054199,
"learning_rate": 2.5190839694656487e-05,
"loss": 0.6279,
"step": 8970
},
{
"epoch": 1.4902090939263193,
"grad_norm": 2.973696231842041,
"learning_rate": 2.5163181767894677e-05,
"loss": 0.6443,
"step": 8980
},
{
"epoch": 1.491868569532028,
"grad_norm": 3.0759024620056152,
"learning_rate": 2.5135523841132868e-05,
"loss": 0.6623,
"step": 8990
},
{
"epoch": 1.4935280451377364,
"grad_norm": 3.30869722366333,
"learning_rate": 2.510786591437106e-05,
"loss": 0.6932,
"step": 9000
},
{
"epoch": 1.4935280451377364,
"eval_gen_len": 45.99307228915663,
"eval_loss": 0.6329591274261475,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1393.4765,
"eval_samples_per_second": 4.756,
"eval_steps_per_second": 0.298,
"step": 9000
},
{
"epoch": 1.495187520743445,
"grad_norm": 3.156658172607422,
"learning_rate": 2.508020798760925e-05,
"loss": 0.6035,
"step": 9010
},
{
"epoch": 1.4968469963491537,
"grad_norm": 2.6789326667785645,
"learning_rate": 2.505255006084744e-05,
"loss": 0.6678,
"step": 9020
},
{
"epoch": 1.4985064719548622,
"grad_norm": 2.7142372131347656,
"learning_rate": 2.502489213408563e-05,
"loss": 0.6547,
"step": 9030
},
{
"epoch": 1.5001659475605709,
"grad_norm": 3.6695101261138916,
"learning_rate": 2.4997234207323818e-05,
"loss": 0.6935,
"step": 9040
},
{
"epoch": 1.5018254231662795,
"grad_norm": 2.5122296810150146,
"learning_rate": 2.4969576280562008e-05,
"loss": 0.7043,
"step": 9050
},
{
"epoch": 1.5034848987719882,
"grad_norm": 2.7815310955047607,
"learning_rate": 2.49419183538002e-05,
"loss": 0.6292,
"step": 9060
},
{
"epoch": 1.5051443743776967,
"grad_norm": 2.4146716594696045,
"learning_rate": 2.4914260427038393e-05,
"loss": 0.6641,
"step": 9070
},
{
"epoch": 1.506803849983405,
"grad_norm": 2.420832633972168,
"learning_rate": 2.4886602500276583e-05,
"loss": 0.5655,
"step": 9080
},
{
"epoch": 1.5084633255891138,
"grad_norm": 2.830427885055542,
"learning_rate": 2.485894457351477e-05,
"loss": 0.5935,
"step": 9090
},
{
"epoch": 1.5101228011948225,
"grad_norm": 2.9108097553253174,
"learning_rate": 2.483128664675296e-05,
"loss": 0.6461,
"step": 9100
},
{
"epoch": 1.5117822768005311,
"grad_norm": 4.128769874572754,
"learning_rate": 2.4803628719991152e-05,
"loss": 0.6407,
"step": 9110
},
{
"epoch": 1.5134417524062398,
"grad_norm": 2.5911433696746826,
"learning_rate": 2.4775970793229343e-05,
"loss": 0.663,
"step": 9120
},
{
"epoch": 1.5151012280119482,
"grad_norm": 2.6011362075805664,
"learning_rate": 2.474831286646753e-05,
"loss": 0.6861,
"step": 9130
},
{
"epoch": 1.5167607036176567,
"grad_norm": 3.9745078086853027,
"learning_rate": 2.472065493970572e-05,
"loss": 0.6649,
"step": 9140
},
{
"epoch": 1.5184201792233654,
"grad_norm": 2.8070755004882812,
"learning_rate": 2.469299701294391e-05,
"loss": 0.6806,
"step": 9150
},
{
"epoch": 1.520079654829074,
"grad_norm": 3.158174514770508,
"learning_rate": 2.46653390861821e-05,
"loss": 0.7047,
"step": 9160
},
{
"epoch": 1.5217391304347827,
"grad_norm": 2.933155059814453,
"learning_rate": 2.4637681159420292e-05,
"loss": 0.6379,
"step": 9170
},
{
"epoch": 1.5233986060404912,
"grad_norm": 2.6287198066711426,
"learning_rate": 2.461002323265848e-05,
"loss": 0.603,
"step": 9180
},
{
"epoch": 1.5250580816461998,
"grad_norm": 2.4520461559295654,
"learning_rate": 2.458236530589667e-05,
"loss": 0.637,
"step": 9190
},
{
"epoch": 1.5267175572519083,
"grad_norm": 2.695462226867676,
"learning_rate": 2.455470737913486e-05,
"loss": 0.6158,
"step": 9200
},
{
"epoch": 1.528377032857617,
"grad_norm": 2.981191396713257,
"learning_rate": 2.452704945237305e-05,
"loss": 0.7432,
"step": 9210
},
{
"epoch": 1.5300365084633256,
"grad_norm": 2.3489644527435303,
"learning_rate": 2.4499391525611242e-05,
"loss": 0.7302,
"step": 9220
},
{
"epoch": 1.5316959840690343,
"grad_norm": 3.2739570140838623,
"learning_rate": 2.447173359884943e-05,
"loss": 0.6705,
"step": 9230
},
{
"epoch": 1.5333554596747427,
"grad_norm": 3.6367099285125732,
"learning_rate": 2.444407567208762e-05,
"loss": 0.6617,
"step": 9240
},
{
"epoch": 1.5350149352804514,
"grad_norm": 2.5076191425323486,
"learning_rate": 2.441641774532581e-05,
"loss": 0.6446,
"step": 9250
},
{
"epoch": 1.5366744108861599,
"grad_norm": 2.3779613971710205,
"learning_rate": 2.4388759818564e-05,
"loss": 0.6996,
"step": 9260
},
{
"epoch": 1.5383338864918685,
"grad_norm": 2.8726320266723633,
"learning_rate": 2.436110189180219e-05,
"loss": 0.6534,
"step": 9270
},
{
"epoch": 1.5399933620975772,
"grad_norm": 3.3525993824005127,
"learning_rate": 2.4333443965040382e-05,
"loss": 0.6596,
"step": 9280
},
{
"epoch": 1.5416528377032859,
"grad_norm": 2.6728854179382324,
"learning_rate": 2.4305786038278573e-05,
"loss": 0.6563,
"step": 9290
},
{
"epoch": 1.5433123133089943,
"grad_norm": 3.2132279872894287,
"learning_rate": 2.4278128111516763e-05,
"loss": 0.6718,
"step": 9300
},
{
"epoch": 1.544971788914703,
"grad_norm": 2.8587329387664795,
"learning_rate": 2.4250470184754954e-05,
"loss": 0.6499,
"step": 9310
},
{
"epoch": 1.5466312645204114,
"grad_norm": 3.229907751083374,
"learning_rate": 2.422281225799314e-05,
"loss": 0.6458,
"step": 9320
},
{
"epoch": 1.5482907401261201,
"grad_norm": 2.425075054168701,
"learning_rate": 2.4195154331231332e-05,
"loss": 0.6801,
"step": 9330
},
{
"epoch": 1.5499502157318288,
"grad_norm": 2.5436460971832275,
"learning_rate": 2.4167496404469523e-05,
"loss": 0.6251,
"step": 9340
},
{
"epoch": 1.5516096913375375,
"grad_norm": 2.9517149925231934,
"learning_rate": 2.4139838477707713e-05,
"loss": 0.6242,
"step": 9350
},
{
"epoch": 1.553269166943246,
"grad_norm": 3.7397382259368896,
"learning_rate": 2.41121805509459e-05,
"loss": 0.6693,
"step": 9360
},
{
"epoch": 1.5549286425489546,
"grad_norm": 3.662407398223877,
"learning_rate": 2.408452262418409e-05,
"loss": 0.6185,
"step": 9370
},
{
"epoch": 1.556588118154663,
"grad_norm": 3.0817925930023193,
"learning_rate": 2.405686469742228e-05,
"loss": 0.6114,
"step": 9380
},
{
"epoch": 1.5582475937603717,
"grad_norm": 3.0462610721588135,
"learning_rate": 2.4029206770660472e-05,
"loss": 0.6051,
"step": 9390
},
{
"epoch": 1.5599070693660804,
"grad_norm": 2.5288925170898438,
"learning_rate": 2.4001548843898663e-05,
"loss": 0.6199,
"step": 9400
},
{
"epoch": 1.561566544971789,
"grad_norm": 3.114104747772217,
"learning_rate": 2.397389091713685e-05,
"loss": 0.6531,
"step": 9410
},
{
"epoch": 1.5632260205774975,
"grad_norm": 3.1850929260253906,
"learning_rate": 2.394623299037504e-05,
"loss": 0.681,
"step": 9420
},
{
"epoch": 1.5648854961832062,
"grad_norm": 5.208533763885498,
"learning_rate": 2.391857506361323e-05,
"loss": 0.6649,
"step": 9430
},
{
"epoch": 1.5665449717889146,
"grad_norm": 2.8434998989105225,
"learning_rate": 2.3890917136851422e-05,
"loss": 0.6585,
"step": 9440
},
{
"epoch": 1.5682044473946233,
"grad_norm": 2.9898252487182617,
"learning_rate": 2.3863259210089613e-05,
"loss": 0.631,
"step": 9450
},
{
"epoch": 1.569863923000332,
"grad_norm": 2.4391958713531494,
"learning_rate": 2.3835601283327803e-05,
"loss": 0.5867,
"step": 9460
},
{
"epoch": 1.5715233986060406,
"grad_norm": 2.509895086288452,
"learning_rate": 2.3807943356565994e-05,
"loss": 0.6581,
"step": 9470
},
{
"epoch": 1.573182874211749,
"grad_norm": 2.519028425216675,
"learning_rate": 2.3780285429804184e-05,
"loss": 0.6581,
"step": 9480
},
{
"epoch": 1.5748423498174575,
"grad_norm": 2.4313178062438965,
"learning_rate": 2.3752627503042375e-05,
"loss": 0.6034,
"step": 9490
},
{
"epoch": 1.5765018254231662,
"grad_norm": 2.631897449493408,
"learning_rate": 2.3724969576280562e-05,
"loss": 0.6592,
"step": 9500
},
{
"epoch": 1.5781613010288749,
"grad_norm": 2.4608094692230225,
"learning_rate": 2.3697311649518753e-05,
"loss": 0.6637,
"step": 9510
},
{
"epoch": 1.5798207766345835,
"grad_norm": 2.5385336875915527,
"learning_rate": 2.3669653722756944e-05,
"loss": 0.7129,
"step": 9520
},
{
"epoch": 1.5814802522402922,
"grad_norm": 5.761340618133545,
"learning_rate": 2.3641995795995134e-05,
"loss": 0.6218,
"step": 9530
},
{
"epoch": 1.5831397278460007,
"grad_norm": 3.5122549533843994,
"learning_rate": 2.3614337869233325e-05,
"loss": 0.6643,
"step": 9540
},
{
"epoch": 1.584799203451709,
"grad_norm": 3.492122173309326,
"learning_rate": 2.3586679942471512e-05,
"loss": 0.6172,
"step": 9550
},
{
"epoch": 1.5864586790574178,
"grad_norm": 2.2743546962738037,
"learning_rate": 2.3559022015709703e-05,
"loss": 0.6185,
"step": 9560
},
{
"epoch": 1.5881181546631264,
"grad_norm": 2.686427116394043,
"learning_rate": 2.3531364088947893e-05,
"loss": 0.5884,
"step": 9570
},
{
"epoch": 1.5897776302688351,
"grad_norm": 3.168736219406128,
"learning_rate": 2.3503706162186084e-05,
"loss": 0.6482,
"step": 9580
},
{
"epoch": 1.5914371058745438,
"grad_norm": 2.285508632659912,
"learning_rate": 2.3476048235424275e-05,
"loss": 0.6357,
"step": 9590
},
{
"epoch": 1.5930965814802522,
"grad_norm": 2.3977394104003906,
"learning_rate": 2.3448390308662462e-05,
"loss": 0.6018,
"step": 9600
},
{
"epoch": 1.5947560570859607,
"grad_norm": 3.285010814666748,
"learning_rate": 2.3420732381900652e-05,
"loss": 0.6282,
"step": 9610
},
{
"epoch": 1.5964155326916694,
"grad_norm": 2.762423038482666,
"learning_rate": 2.3393074455138843e-05,
"loss": 0.6137,
"step": 9620
},
{
"epoch": 1.598075008297378,
"grad_norm": 2.6022305488586426,
"learning_rate": 2.3365416528377034e-05,
"loss": 0.7065,
"step": 9630
},
{
"epoch": 1.5997344839030867,
"grad_norm": 2.3043951988220215,
"learning_rate": 2.3337758601615224e-05,
"loss": 0.6861,
"step": 9640
},
{
"epoch": 1.6013939595087954,
"grad_norm": 3.265958309173584,
"learning_rate": 2.3310100674853415e-05,
"loss": 0.6301,
"step": 9650
},
{
"epoch": 1.6030534351145038,
"grad_norm": 2.5754103660583496,
"learning_rate": 2.3282442748091605e-05,
"loss": 0.6359,
"step": 9660
},
{
"epoch": 1.6047129107202123,
"grad_norm": 2.7039718627929688,
"learning_rate": 2.3254784821329796e-05,
"loss": 0.5976,
"step": 9670
},
{
"epoch": 1.606372386325921,
"grad_norm": 3.076953887939453,
"learning_rate": 2.3227126894567987e-05,
"loss": 0.5774,
"step": 9680
},
{
"epoch": 1.6080318619316296,
"grad_norm": 2.4393558502197266,
"learning_rate": 2.3199468967806174e-05,
"loss": 0.6095,
"step": 9690
},
{
"epoch": 1.6096913375373383,
"grad_norm": 2.4519779682159424,
"learning_rate": 2.3171811041044365e-05,
"loss": 0.6077,
"step": 9700
},
{
"epoch": 1.611350813143047,
"grad_norm": 2.989640712738037,
"learning_rate": 2.3144153114282555e-05,
"loss": 0.6352,
"step": 9710
},
{
"epoch": 1.6130102887487554,
"grad_norm": 3.529949188232422,
"learning_rate": 2.3116495187520746e-05,
"loss": 0.6709,
"step": 9720
},
{
"epoch": 1.6146697643544639,
"grad_norm": 4.415449619293213,
"learning_rate": 2.3088837260758936e-05,
"loss": 0.6486,
"step": 9730
},
{
"epoch": 1.6163292399601725,
"grad_norm": 3.1440181732177734,
"learning_rate": 2.3061179333997124e-05,
"loss": 0.6455,
"step": 9740
},
{
"epoch": 1.6179887155658812,
"grad_norm": 2.616605043411255,
"learning_rate": 2.3033521407235314e-05,
"loss": 0.642,
"step": 9750
},
{
"epoch": 1.6196481911715899,
"grad_norm": 4.769134521484375,
"learning_rate": 2.3005863480473505e-05,
"loss": 0.6213,
"step": 9760
},
{
"epoch": 1.6213076667772985,
"grad_norm": 2.7411723136901855,
"learning_rate": 2.2978205553711696e-05,
"loss": 0.6623,
"step": 9770
},
{
"epoch": 1.622967142383007,
"grad_norm": 2.7197189331054688,
"learning_rate": 2.2950547626949883e-05,
"loss": 0.6586,
"step": 9780
},
{
"epoch": 1.6246266179887154,
"grad_norm": 3.303685188293457,
"learning_rate": 2.2922889700188073e-05,
"loss": 0.692,
"step": 9790
},
{
"epoch": 1.626286093594424,
"grad_norm": 2.6795287132263184,
"learning_rate": 2.2895231773426264e-05,
"loss": 0.6298,
"step": 9800
},
{
"epoch": 1.6279455692001328,
"grad_norm": 2.594517707824707,
"learning_rate": 2.2867573846664455e-05,
"loss": 0.5813,
"step": 9810
},
{
"epoch": 1.6296050448058415,
"grad_norm": 2.5385282039642334,
"learning_rate": 2.2839915919902645e-05,
"loss": 0.6388,
"step": 9820
},
{
"epoch": 1.63126452041155,
"grad_norm": 3.192117691040039,
"learning_rate": 2.2812257993140836e-05,
"loss": 0.5766,
"step": 9830
},
{
"epoch": 1.6329239960172586,
"grad_norm": 2.816152811050415,
"learning_rate": 2.2784600066379026e-05,
"loss": 0.6354,
"step": 9840
},
{
"epoch": 1.634583471622967,
"grad_norm": 2.7126712799072266,
"learning_rate": 2.2756942139617217e-05,
"loss": 0.6325,
"step": 9850
},
{
"epoch": 1.6362429472286757,
"grad_norm": 2.268371105194092,
"learning_rate": 2.2729284212855408e-05,
"loss": 0.5883,
"step": 9860
},
{
"epoch": 1.6379024228343844,
"grad_norm": 3.135272741317749,
"learning_rate": 2.27016262860936e-05,
"loss": 0.6781,
"step": 9870
},
{
"epoch": 1.639561898440093,
"grad_norm": 3.2926957607269287,
"learning_rate": 2.2673968359331786e-05,
"loss": 0.6279,
"step": 9880
},
{
"epoch": 1.6412213740458015,
"grad_norm": 3.9669864177703857,
"learning_rate": 2.2646310432569976e-05,
"loss": 0.6569,
"step": 9890
},
{
"epoch": 1.6428808496515102,
"grad_norm": 2.7280452251434326,
"learning_rate": 2.2618652505808167e-05,
"loss": 0.6547,
"step": 9900
},
{
"epoch": 1.6445403252572186,
"grad_norm": 3.3885629177093506,
"learning_rate": 2.2590994579046357e-05,
"loss": 0.6048,
"step": 9910
},
{
"epoch": 1.6461998008629273,
"grad_norm": 2.8787283897399902,
"learning_rate": 2.2563336652284545e-05,
"loss": 0.6147,
"step": 9920
},
{
"epoch": 1.647859276468636,
"grad_norm": 2.199803113937378,
"learning_rate": 2.2535678725522735e-05,
"loss": 0.618,
"step": 9930
},
{
"epoch": 1.6495187520743446,
"grad_norm": 3.0264763832092285,
"learning_rate": 2.2508020798760926e-05,
"loss": 0.6366,
"step": 9940
},
{
"epoch": 1.651178227680053,
"grad_norm": 2.1966772079467773,
"learning_rate": 2.2480362871999116e-05,
"loss": 0.6564,
"step": 9950
},
{
"epoch": 1.6528377032857617,
"grad_norm": 2.9565937519073486,
"learning_rate": 2.2452704945237307e-05,
"loss": 0.675,
"step": 9960
},
{
"epoch": 1.6544971788914702,
"grad_norm": 3.1447999477386475,
"learning_rate": 2.2425047018475494e-05,
"loss": 0.6851,
"step": 9970
},
{
"epoch": 1.6561566544971789,
"grad_norm": 2.6409244537353516,
"learning_rate": 2.2397389091713685e-05,
"loss": 0.6022,
"step": 9980
},
{
"epoch": 1.6578161301028875,
"grad_norm": 3.335141181945801,
"learning_rate": 2.2369731164951876e-05,
"loss": 0.6495,
"step": 9990
},
{
"epoch": 1.6594756057085962,
"grad_norm": 3.172175168991089,
"learning_rate": 2.2342073238190066e-05,
"loss": 0.6506,
"step": 10000
},
{
"epoch": 1.6594756057085962,
"eval_gen_len": 42.61882530120482,
"eval_loss": 0.6317981481552124,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1328.2593,
"eval_samples_per_second": 4.99,
"eval_steps_per_second": 0.312,
"step": 10000
},
{
"epoch": 1.6611350813143047,
"grad_norm": 3.085299015045166,
"learning_rate": 2.2314415311428253e-05,
"loss": 0.6738,
"step": 10010
},
{
"epoch": 1.6627945569200133,
"grad_norm": 4.871061325073242,
"learning_rate": 2.2286757384666444e-05,
"loss": 0.612,
"step": 10020
},
{
"epoch": 1.6644540325257218,
"grad_norm": 2.488039493560791,
"learning_rate": 2.2259099457904635e-05,
"loss": 0.612,
"step": 10030
},
{
"epoch": 1.6661135081314304,
"grad_norm": 3.3118488788604736,
"learning_rate": 2.223144153114283e-05,
"loss": 0.6858,
"step": 10040
},
{
"epoch": 1.6677729837371391,
"grad_norm": 2.2576591968536377,
"learning_rate": 2.220378360438102e-05,
"loss": 0.6412,
"step": 10050
},
{
"epoch": 1.6694324593428478,
"grad_norm": 3.7689051628112793,
"learning_rate": 2.2176125677619207e-05,
"loss": 0.7009,
"step": 10060
},
{
"epoch": 1.6710919349485562,
"grad_norm": 3.4723784923553467,
"learning_rate": 2.2148467750857397e-05,
"loss": 0.6825,
"step": 10070
},
{
"epoch": 1.672751410554265,
"grad_norm": 2.3455286026000977,
"learning_rate": 2.2120809824095588e-05,
"loss": 0.584,
"step": 10080
},
{
"epoch": 1.6744108861599734,
"grad_norm": 3.1980183124542236,
"learning_rate": 2.209315189733378e-05,
"loss": 0.6203,
"step": 10090
},
{
"epoch": 1.676070361765682,
"grad_norm": 3.0734760761260986,
"learning_rate": 2.206549397057197e-05,
"loss": 0.6228,
"step": 10100
},
{
"epoch": 1.6777298373713907,
"grad_norm": 2.5724833011627197,
"learning_rate": 2.2037836043810156e-05,
"loss": 0.6572,
"step": 10110
},
{
"epoch": 1.6793893129770994,
"grad_norm": 2.5542571544647217,
"learning_rate": 2.2010178117048347e-05,
"loss": 0.6888,
"step": 10120
},
{
"epoch": 1.6810487885828078,
"grad_norm": 2.8788347244262695,
"learning_rate": 2.1982520190286537e-05,
"loss": 0.6324,
"step": 10130
},
{
"epoch": 1.6827082641885163,
"grad_norm": 2.3643722534179688,
"learning_rate": 2.1954862263524728e-05,
"loss": 0.6487,
"step": 10140
},
{
"epoch": 1.684367739794225,
"grad_norm": 3.1754324436187744,
"learning_rate": 2.1927204336762915e-05,
"loss": 0.6625,
"step": 10150
},
{
"epoch": 1.6860272153999336,
"grad_norm": 2.355295419692993,
"learning_rate": 2.1899546410001106e-05,
"loss": 0.5971,
"step": 10160
},
{
"epoch": 1.6876866910056423,
"grad_norm": 2.548339605331421,
"learning_rate": 2.1871888483239297e-05,
"loss": 0.6521,
"step": 10170
},
{
"epoch": 1.689346166611351,
"grad_norm": 3.4530179500579834,
"learning_rate": 2.1844230556477487e-05,
"loss": 0.685,
"step": 10180
},
{
"epoch": 1.6910056422170594,
"grad_norm": 3.1450254917144775,
"learning_rate": 2.1816572629715678e-05,
"loss": 0.6059,
"step": 10190
},
{
"epoch": 1.6926651178227679,
"grad_norm": 3.041930913925171,
"learning_rate": 2.1788914702953865e-05,
"loss": 0.7124,
"step": 10200
},
{
"epoch": 1.6943245934284765,
"grad_norm": 2.4500343799591064,
"learning_rate": 2.1761256776192056e-05,
"loss": 0.5958,
"step": 10210
},
{
"epoch": 1.6959840690341852,
"grad_norm": 2.8591954708099365,
"learning_rate": 2.1733598849430246e-05,
"loss": 0.668,
"step": 10220
},
{
"epoch": 1.6976435446398939,
"grad_norm": 3.4934329986572266,
"learning_rate": 2.1705940922668437e-05,
"loss": 0.6782,
"step": 10230
},
{
"epoch": 1.6993030202456025,
"grad_norm": 2.7269980907440186,
"learning_rate": 2.1678282995906628e-05,
"loss": 0.6895,
"step": 10240
},
{
"epoch": 1.700962495851311,
"grad_norm": 3.1131813526153564,
"learning_rate": 2.1650625069144818e-05,
"loss": 0.6954,
"step": 10250
},
{
"epoch": 1.7026219714570194,
"grad_norm": 2.731657028198242,
"learning_rate": 2.162296714238301e-05,
"loss": 0.6513,
"step": 10260
},
{
"epoch": 1.704281447062728,
"grad_norm": 3.856985092163086,
"learning_rate": 2.15953092156212e-05,
"loss": 0.6292,
"step": 10270
},
{
"epoch": 1.7059409226684368,
"grad_norm": 3.4055871963500977,
"learning_rate": 2.156765128885939e-05,
"loss": 0.6825,
"step": 10280
},
{
"epoch": 1.7076003982741454,
"grad_norm": 3.0944137573242188,
"learning_rate": 2.1539993362097577e-05,
"loss": 0.6832,
"step": 10290
},
{
"epoch": 1.7092598738798541,
"grad_norm": 3.316096544265747,
"learning_rate": 2.1512335435335768e-05,
"loss": 0.6296,
"step": 10300
},
{
"epoch": 1.7109193494855626,
"grad_norm": 2.8443148136138916,
"learning_rate": 2.148467750857396e-05,
"loss": 0.667,
"step": 10310
},
{
"epoch": 1.712578825091271,
"grad_norm": 4.633572578430176,
"learning_rate": 2.145701958181215e-05,
"loss": 0.6503,
"step": 10320
},
{
"epoch": 1.7142383006969797,
"grad_norm": 2.528799057006836,
"learning_rate": 2.142936165505034e-05,
"loss": 0.6405,
"step": 10330
},
{
"epoch": 1.7158977763026884,
"grad_norm": 2.6630334854125977,
"learning_rate": 2.1401703728288527e-05,
"loss": 0.623,
"step": 10340
},
{
"epoch": 1.717557251908397,
"grad_norm": 3.258363962173462,
"learning_rate": 2.1374045801526718e-05,
"loss": 0.6795,
"step": 10350
},
{
"epoch": 1.7192167275141057,
"grad_norm": 2.7798945903778076,
"learning_rate": 2.1346387874764908e-05,
"loss": 0.6706,
"step": 10360
},
{
"epoch": 1.7208762031198142,
"grad_norm": 2.5304954051971436,
"learning_rate": 2.13187299480031e-05,
"loss": 0.5987,
"step": 10370
},
{
"epoch": 1.7225356787255226,
"grad_norm": 2.4129419326782227,
"learning_rate": 2.129107202124129e-05,
"loss": 0.6772,
"step": 10380
},
{
"epoch": 1.7241951543312313,
"grad_norm": 2.6216280460357666,
"learning_rate": 2.1263414094479477e-05,
"loss": 0.6406,
"step": 10390
},
{
"epoch": 1.72585462993694,
"grad_norm": 2.539292097091675,
"learning_rate": 2.1235756167717667e-05,
"loss": 0.635,
"step": 10400
},
{
"epoch": 1.7275141055426486,
"grad_norm": 2.541496992111206,
"learning_rate": 2.1208098240955858e-05,
"loss": 0.6457,
"step": 10410
},
{
"epoch": 1.729173581148357,
"grad_norm": 2.9823789596557617,
"learning_rate": 2.118044031419405e-05,
"loss": 0.6112,
"step": 10420
},
{
"epoch": 1.7308330567540657,
"grad_norm": 3.312760829925537,
"learning_rate": 2.115278238743224e-05,
"loss": 0.6349,
"step": 10430
},
{
"epoch": 1.7324925323597742,
"grad_norm": 2.683608055114746,
"learning_rate": 2.112512446067043e-05,
"loss": 0.6493,
"step": 10440
},
{
"epoch": 1.7341520079654829,
"grad_norm": 2.764828681945801,
"learning_rate": 2.109746653390862e-05,
"loss": 0.5791,
"step": 10450
},
{
"epoch": 1.7358114835711915,
"grad_norm": 2.8977785110473633,
"learning_rate": 2.106980860714681e-05,
"loss": 0.629,
"step": 10460
},
{
"epoch": 1.7374709591769002,
"grad_norm": 2.4492812156677246,
"learning_rate": 2.1042150680385e-05,
"loss": 0.6265,
"step": 10470
},
{
"epoch": 1.7391304347826086,
"grad_norm": 2.425192356109619,
"learning_rate": 2.101449275362319e-05,
"loss": 0.5907,
"step": 10480
},
{
"epoch": 1.7407899103883173,
"grad_norm": 2.9035537242889404,
"learning_rate": 2.098683482686138e-05,
"loss": 0.6585,
"step": 10490
},
{
"epoch": 1.7424493859940258,
"grad_norm": 2.507382392883301,
"learning_rate": 2.095917690009957e-05,
"loss": 0.6393,
"step": 10500
},
{
"epoch": 1.7441088615997344,
"grad_norm": 3.7915854454040527,
"learning_rate": 2.093151897333776e-05,
"loss": 0.6419,
"step": 10510
},
{
"epoch": 1.745768337205443,
"grad_norm": 2.4732701778411865,
"learning_rate": 2.090386104657595e-05,
"loss": 0.6623,
"step": 10520
},
{
"epoch": 1.7474278128111518,
"grad_norm": 2.7935047149658203,
"learning_rate": 2.087620311981414e-05,
"loss": 0.6817,
"step": 10530
},
{
"epoch": 1.7490872884168602,
"grad_norm": 2.607464075088501,
"learning_rate": 2.084854519305233e-05,
"loss": 0.6561,
"step": 10540
},
{
"epoch": 1.750746764022569,
"grad_norm": 2.660127878189087,
"learning_rate": 2.082088726629052e-05,
"loss": 0.6613,
"step": 10550
},
{
"epoch": 1.7524062396282774,
"grad_norm": 3.2085351943969727,
"learning_rate": 2.079322933952871e-05,
"loss": 0.6748,
"step": 10560
},
{
"epoch": 1.754065715233986,
"grad_norm": 2.7219228744506836,
"learning_rate": 2.0765571412766898e-05,
"loss": 0.6382,
"step": 10570
},
{
"epoch": 1.7557251908396947,
"grad_norm": 3.184359550476074,
"learning_rate": 2.0737913486005088e-05,
"loss": 0.6369,
"step": 10580
},
{
"epoch": 1.7573846664454034,
"grad_norm": 3.1994669437408447,
"learning_rate": 2.071025555924328e-05,
"loss": 0.6546,
"step": 10590
},
{
"epoch": 1.7590441420511118,
"grad_norm": 3.010939359664917,
"learning_rate": 2.068259763248147e-05,
"loss": 0.6864,
"step": 10600
},
{
"epoch": 1.7607036176568205,
"grad_norm": 2.753485918045044,
"learning_rate": 2.065493970571966e-05,
"loss": 0.6881,
"step": 10610
},
{
"epoch": 1.762363093262529,
"grad_norm": 2.8554036617279053,
"learning_rate": 2.062728177895785e-05,
"loss": 0.662,
"step": 10620
},
{
"epoch": 1.7640225688682376,
"grad_norm": 4.670107364654541,
"learning_rate": 2.059962385219604e-05,
"loss": 0.6106,
"step": 10630
},
{
"epoch": 1.7656820444739463,
"grad_norm": 2.309091806411743,
"learning_rate": 2.0571965925434232e-05,
"loss": 0.6692,
"step": 10640
},
{
"epoch": 1.767341520079655,
"grad_norm": 3.542398691177368,
"learning_rate": 2.0544307998672423e-05,
"loss": 0.6851,
"step": 10650
},
{
"epoch": 1.7690009956853634,
"grad_norm": 3.215907573699951,
"learning_rate": 2.0516650071910613e-05,
"loss": 0.6768,
"step": 10660
},
{
"epoch": 1.770660471291072,
"grad_norm": 2.82903790473938,
"learning_rate": 2.04889921451488e-05,
"loss": 0.6761,
"step": 10670
},
{
"epoch": 1.7723199468967805,
"grad_norm": 2.8713226318359375,
"learning_rate": 2.046133421838699e-05,
"loss": 0.6348,
"step": 10680
},
{
"epoch": 1.7739794225024892,
"grad_norm": 2.563518762588501,
"learning_rate": 2.043367629162518e-05,
"loss": 0.6564,
"step": 10690
},
{
"epoch": 1.7756388981081979,
"grad_norm": 2.8617069721221924,
"learning_rate": 2.0406018364863372e-05,
"loss": 0.6244,
"step": 10700
},
{
"epoch": 1.7772983737139065,
"grad_norm": 3.2314953804016113,
"learning_rate": 2.037836043810156e-05,
"loss": 0.6567,
"step": 10710
},
{
"epoch": 1.778957849319615,
"grad_norm": 3.1084470748901367,
"learning_rate": 2.035070251133975e-05,
"loss": 0.6223,
"step": 10720
},
{
"epoch": 1.7806173249253237,
"grad_norm": 3.896014928817749,
"learning_rate": 2.032304458457794e-05,
"loss": 0.6031,
"step": 10730
},
{
"epoch": 1.782276800531032,
"grad_norm": 2.7549264430999756,
"learning_rate": 2.029538665781613e-05,
"loss": 0.6582,
"step": 10740
},
{
"epoch": 1.7839362761367408,
"grad_norm": 3.149277925491333,
"learning_rate": 2.0267728731054322e-05,
"loss": 0.5781,
"step": 10750
},
{
"epoch": 1.7855957517424494,
"grad_norm": 2.9234907627105713,
"learning_rate": 2.024007080429251e-05,
"loss": 0.6705,
"step": 10760
},
{
"epoch": 1.7872552273481581,
"grad_norm": 2.4791808128356934,
"learning_rate": 2.02124128775307e-05,
"loss": 0.6696,
"step": 10770
},
{
"epoch": 1.7889147029538666,
"grad_norm": 2.654337167739868,
"learning_rate": 2.018475495076889e-05,
"loss": 0.6406,
"step": 10780
},
{
"epoch": 1.790574178559575,
"grad_norm": 3.0971858501434326,
"learning_rate": 2.015709702400708e-05,
"loss": 0.6552,
"step": 10790
},
{
"epoch": 1.7922336541652837,
"grad_norm": 2.394855260848999,
"learning_rate": 2.0129439097245272e-05,
"loss": 0.6378,
"step": 10800
},
{
"epoch": 1.7938931297709924,
"grad_norm": 2.530996799468994,
"learning_rate": 2.0101781170483462e-05,
"loss": 0.6484,
"step": 10810
},
{
"epoch": 1.795552605376701,
"grad_norm": 3.274632692337036,
"learning_rate": 2.0074123243721653e-05,
"loss": 0.6434,
"step": 10820
},
{
"epoch": 1.7972120809824097,
"grad_norm": 2.5829977989196777,
"learning_rate": 2.0046465316959844e-05,
"loss": 0.6431,
"step": 10830
},
{
"epoch": 1.7988715565881181,
"grad_norm": 2.7407333850860596,
"learning_rate": 2.0018807390198034e-05,
"loss": 0.659,
"step": 10840
},
{
"epoch": 1.8005310321938266,
"grad_norm": 2.6258132457733154,
"learning_rate": 1.999114946343622e-05,
"loss": 0.653,
"step": 10850
},
{
"epoch": 1.8021905077995353,
"grad_norm": 2.77689266204834,
"learning_rate": 1.9963491536674412e-05,
"loss": 0.6604,
"step": 10860
},
{
"epoch": 1.803849983405244,
"grad_norm": 2.838128089904785,
"learning_rate": 1.9935833609912603e-05,
"loss": 0.607,
"step": 10870
},
{
"epoch": 1.8055094590109526,
"grad_norm": 3.087151050567627,
"learning_rate": 1.9908175683150793e-05,
"loss": 0.6252,
"step": 10880
},
{
"epoch": 1.8071689346166613,
"grad_norm": 2.8684113025665283,
"learning_rate": 1.9880517756388984e-05,
"loss": 0.6945,
"step": 10890
},
{
"epoch": 1.8088284102223697,
"grad_norm": 3.120990037918091,
"learning_rate": 1.985285982962717e-05,
"loss": 0.684,
"step": 10900
},
{
"epoch": 1.8104878858280782,
"grad_norm": 2.8556265830993652,
"learning_rate": 1.9825201902865362e-05,
"loss": 0.6328,
"step": 10910
},
{
"epoch": 1.8121473614337869,
"grad_norm": 2.925619125366211,
"learning_rate": 1.9797543976103552e-05,
"loss": 0.5552,
"step": 10920
},
{
"epoch": 1.8138068370394955,
"grad_norm": 2.4405603408813477,
"learning_rate": 1.9769886049341743e-05,
"loss": 0.6689,
"step": 10930
},
{
"epoch": 1.8154663126452042,
"grad_norm": 3.013485908508301,
"learning_rate": 1.974222812257993e-05,
"loss": 0.6554,
"step": 10940
},
{
"epoch": 1.8171257882509129,
"grad_norm": 2.554626703262329,
"learning_rate": 1.971457019581812e-05,
"loss": 0.667,
"step": 10950
},
{
"epoch": 1.8187852638566213,
"grad_norm": 2.905426263809204,
"learning_rate": 1.968691226905631e-05,
"loss": 0.6502,
"step": 10960
},
{
"epoch": 1.8204447394623298,
"grad_norm": 3.87603497505188,
"learning_rate": 1.9659254342294502e-05,
"loss": 0.7144,
"step": 10970
},
{
"epoch": 1.8221042150680384,
"grad_norm": 2.709789276123047,
"learning_rate": 1.9631596415532693e-05,
"loss": 0.6134,
"step": 10980
},
{
"epoch": 1.823763690673747,
"grad_norm": 2.6182925701141357,
"learning_rate": 1.960393848877088e-05,
"loss": 0.6697,
"step": 10990
},
{
"epoch": 1.8254231662794558,
"grad_norm": 2.8797554969787598,
"learning_rate": 1.957628056200907e-05,
"loss": 0.6577,
"step": 11000
},
{
"epoch": 1.8254231662794558,
"eval_gen_len": 45.56867469879518,
"eval_loss": 0.6276843547821045,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1356.1116,
"eval_samples_per_second": 4.888,
"eval_steps_per_second": 0.306,
"step": 11000
},
{
"epoch": 1.8270826418851644,
"grad_norm": 2.888942241668701,
"learning_rate": 1.9548622635247265e-05,
"loss": 0.6432,
"step": 11010
},
{
"epoch": 1.828742117490873,
"grad_norm": 2.8916501998901367,
"learning_rate": 1.9520964708485455e-05,
"loss": 0.6114,
"step": 11020
},
{
"epoch": 1.8304015930965813,
"grad_norm": 2.345423698425293,
"learning_rate": 1.9493306781723646e-05,
"loss": 0.6203,
"step": 11030
},
{
"epoch": 1.83206106870229,
"grad_norm": 2.7087900638580322,
"learning_rate": 1.9465648854961833e-05,
"loss": 0.6254,
"step": 11040
},
{
"epoch": 1.8337205443079987,
"grad_norm": 3.901052474975586,
"learning_rate": 1.9437990928200024e-05,
"loss": 0.6279,
"step": 11050
},
{
"epoch": 1.8353800199137074,
"grad_norm": 2.765397787094116,
"learning_rate": 1.9410333001438214e-05,
"loss": 0.6609,
"step": 11060
},
{
"epoch": 1.8370394955194158,
"grad_norm": 2.2285208702087402,
"learning_rate": 1.9382675074676405e-05,
"loss": 0.6638,
"step": 11070
},
{
"epoch": 1.8386989711251245,
"grad_norm": 2.972564220428467,
"learning_rate": 1.9355017147914592e-05,
"loss": 0.6289,
"step": 11080
},
{
"epoch": 1.840358446730833,
"grad_norm": 2.643881320953369,
"learning_rate": 1.9327359221152783e-05,
"loss": 0.6638,
"step": 11090
},
{
"epoch": 1.8420179223365416,
"grad_norm": 2.7107179164886475,
"learning_rate": 1.9299701294390973e-05,
"loss": 0.64,
"step": 11100
},
{
"epoch": 1.8436773979422503,
"grad_norm": 2.4541239738464355,
"learning_rate": 1.9272043367629164e-05,
"loss": 0.5943,
"step": 11110
},
{
"epoch": 1.845336873547959,
"grad_norm": 2.520796060562134,
"learning_rate": 1.9244385440867355e-05,
"loss": 0.6804,
"step": 11120
},
{
"epoch": 1.8469963491536674,
"grad_norm": 2.7786107063293457,
"learning_rate": 1.9216727514105542e-05,
"loss": 0.6414,
"step": 11130
},
{
"epoch": 1.848655824759376,
"grad_norm": 3.0973737239837646,
"learning_rate": 1.9189069587343732e-05,
"loss": 0.6855,
"step": 11140
},
{
"epoch": 1.8503153003650845,
"grad_norm": 2.5851657390594482,
"learning_rate": 1.9161411660581923e-05,
"loss": 0.5443,
"step": 11150
},
{
"epoch": 1.8519747759707932,
"grad_norm": 2.8966641426086426,
"learning_rate": 1.9133753733820114e-05,
"loss": 0.5777,
"step": 11160
},
{
"epoch": 1.8536342515765019,
"grad_norm": 3.042029857635498,
"learning_rate": 1.9106095807058304e-05,
"loss": 0.7043,
"step": 11170
},
{
"epoch": 1.8552937271822105,
"grad_norm": 2.6720499992370605,
"learning_rate": 1.907843788029649e-05,
"loss": 0.6793,
"step": 11180
},
{
"epoch": 1.856953202787919,
"grad_norm": 2.850257158279419,
"learning_rate": 1.9050779953534682e-05,
"loss": 0.6232,
"step": 11190
},
{
"epoch": 1.8586126783936276,
"grad_norm": 2.7536768913269043,
"learning_rate": 1.9023122026772873e-05,
"loss": 0.6604,
"step": 11200
},
{
"epoch": 1.860272153999336,
"grad_norm": 2.6577181816101074,
"learning_rate": 1.8995464100011063e-05,
"loss": 0.5952,
"step": 11210
},
{
"epoch": 1.8619316296050448,
"grad_norm": 3.852022171020508,
"learning_rate": 1.8967806173249254e-05,
"loss": 0.6362,
"step": 11220
},
{
"epoch": 1.8635911052107534,
"grad_norm": 2.875685453414917,
"learning_rate": 1.8940148246487445e-05,
"loss": 0.7276,
"step": 11230
},
{
"epoch": 1.8652505808164621,
"grad_norm": 3.211580991744995,
"learning_rate": 1.8912490319725635e-05,
"loss": 0.6773,
"step": 11240
},
{
"epoch": 1.8669100564221706,
"grad_norm": 2.6323704719543457,
"learning_rate": 1.8884832392963826e-05,
"loss": 0.6487,
"step": 11250
},
{
"epoch": 1.8685695320278792,
"grad_norm": 3.9867587089538574,
"learning_rate": 1.8857174466202016e-05,
"loss": 0.6276,
"step": 11260
},
{
"epoch": 1.8702290076335877,
"grad_norm": 2.4788997173309326,
"learning_rate": 1.8829516539440204e-05,
"loss": 0.6239,
"step": 11270
},
{
"epoch": 1.8718884832392964,
"grad_norm": 2.5648040771484375,
"learning_rate": 1.8801858612678394e-05,
"loss": 0.6247,
"step": 11280
},
{
"epoch": 1.873547958845005,
"grad_norm": 2.8344156742095947,
"learning_rate": 1.8774200685916585e-05,
"loss": 0.5787,
"step": 11290
},
{
"epoch": 1.8752074344507137,
"grad_norm": 4.752839088439941,
"learning_rate": 1.8746542759154776e-05,
"loss": 0.6402,
"step": 11300
},
{
"epoch": 1.8768669100564221,
"grad_norm": 3.165907382965088,
"learning_rate": 1.8718884832392966e-05,
"loss": 0.655,
"step": 11310
},
{
"epoch": 1.8785263856621308,
"grad_norm": 2.813150405883789,
"learning_rate": 1.8691226905631153e-05,
"loss": 0.5664,
"step": 11320
},
{
"epoch": 1.8801858612678393,
"grad_norm": 2.6606945991516113,
"learning_rate": 1.8663568978869344e-05,
"loss": 0.6411,
"step": 11330
},
{
"epoch": 1.881845336873548,
"grad_norm": 2.7262566089630127,
"learning_rate": 1.8635911052107535e-05,
"loss": 0.6904,
"step": 11340
},
{
"epoch": 1.8835048124792566,
"grad_norm": 3.206533908843994,
"learning_rate": 1.8608253125345725e-05,
"loss": 0.6105,
"step": 11350
},
{
"epoch": 1.8851642880849653,
"grad_norm": 3.490770101547241,
"learning_rate": 1.8580595198583913e-05,
"loss": 0.6457,
"step": 11360
},
{
"epoch": 1.8868237636906737,
"grad_norm": 2.6691598892211914,
"learning_rate": 1.8552937271822103e-05,
"loss": 0.7151,
"step": 11370
},
{
"epoch": 1.8884832392963822,
"grad_norm": 2.681269884109497,
"learning_rate": 1.8525279345060294e-05,
"loss": 0.6186,
"step": 11380
},
{
"epoch": 1.8901427149020908,
"grad_norm": 2.527893543243408,
"learning_rate": 1.8497621418298484e-05,
"loss": 0.5586,
"step": 11390
},
{
"epoch": 1.8918021905077995,
"grad_norm": 2.677696704864502,
"learning_rate": 1.8469963491536675e-05,
"loss": 0.6233,
"step": 11400
},
{
"epoch": 1.8934616661135082,
"grad_norm": 2.9220569133758545,
"learning_rate": 1.8442305564774866e-05,
"loss": 0.6908,
"step": 11410
},
{
"epoch": 1.8951211417192169,
"grad_norm": 2.458573579788208,
"learning_rate": 1.8414647638013056e-05,
"loss": 0.639,
"step": 11420
},
{
"epoch": 1.8967806173249253,
"grad_norm": 2.7711801528930664,
"learning_rate": 1.8386989711251247e-05,
"loss": 0.5861,
"step": 11430
},
{
"epoch": 1.8984400929306338,
"grad_norm": 2.2654006481170654,
"learning_rate": 1.8359331784489437e-05,
"loss": 0.6429,
"step": 11440
},
{
"epoch": 1.9000995685363424,
"grad_norm": 2.199928045272827,
"learning_rate": 1.8331673857727625e-05,
"loss": 0.6192,
"step": 11450
},
{
"epoch": 1.901759044142051,
"grad_norm": 2.565298080444336,
"learning_rate": 1.8304015930965815e-05,
"loss": 0.6517,
"step": 11460
},
{
"epoch": 1.9034185197477598,
"grad_norm": 2.959390878677368,
"learning_rate": 1.8276358004204006e-05,
"loss": 0.6197,
"step": 11470
},
{
"epoch": 1.9050779953534684,
"grad_norm": 3.2915642261505127,
"learning_rate": 1.8248700077442197e-05,
"loss": 0.5705,
"step": 11480
},
{
"epoch": 1.906737470959177,
"grad_norm": 2.0758042335510254,
"learning_rate": 1.8221042150680387e-05,
"loss": 0.6337,
"step": 11490
},
{
"epoch": 1.9083969465648853,
"grad_norm": 2.862844467163086,
"learning_rate": 1.8193384223918574e-05,
"loss": 0.6384,
"step": 11500
},
{
"epoch": 1.910056422170594,
"grad_norm": 2.489260673522949,
"learning_rate": 1.8165726297156765e-05,
"loss": 0.6693,
"step": 11510
},
{
"epoch": 1.9117158977763027,
"grad_norm": 2.257587194442749,
"learning_rate": 1.8138068370394956e-05,
"loss": 0.5863,
"step": 11520
},
{
"epoch": 1.9133753733820114,
"grad_norm": 3.025855779647827,
"learning_rate": 1.8110410443633146e-05,
"loss": 0.6333,
"step": 11530
},
{
"epoch": 1.91503484898772,
"grad_norm": 2.7348458766937256,
"learning_rate": 1.8082752516871337e-05,
"loss": 0.6725,
"step": 11540
},
{
"epoch": 1.9166943245934285,
"grad_norm": 3.3557896614074707,
"learning_rate": 1.8055094590109524e-05,
"loss": 0.6569,
"step": 11550
},
{
"epoch": 1.918353800199137,
"grad_norm": 3.2904157638549805,
"learning_rate": 1.8027436663347715e-05,
"loss": 0.696,
"step": 11560
},
{
"epoch": 1.9200132758048456,
"grad_norm": 2.3019628524780273,
"learning_rate": 1.7999778736585905e-05,
"loss": 0.5247,
"step": 11570
},
{
"epoch": 1.9216727514105543,
"grad_norm": 2.766451597213745,
"learning_rate": 1.7972120809824096e-05,
"loss": 0.7265,
"step": 11580
},
{
"epoch": 1.923332227016263,
"grad_norm": 2.454838991165161,
"learning_rate": 1.7944462883062287e-05,
"loss": 0.5796,
"step": 11590
},
{
"epoch": 1.9249917026219716,
"grad_norm": 2.8631088733673096,
"learning_rate": 1.7916804956300477e-05,
"loss": 0.6788,
"step": 11600
},
{
"epoch": 1.92665117822768,
"grad_norm": 2.889618158340454,
"learning_rate": 1.7889147029538668e-05,
"loss": 0.6684,
"step": 11610
},
{
"epoch": 1.9283106538333885,
"grad_norm": 2.790698766708374,
"learning_rate": 1.786148910277686e-05,
"loss": 0.6312,
"step": 11620
},
{
"epoch": 1.9299701294390972,
"grad_norm": 2.6731488704681396,
"learning_rate": 1.783383117601505e-05,
"loss": 0.6283,
"step": 11630
},
{
"epoch": 1.9316296050448059,
"grad_norm": 2.563717842102051,
"learning_rate": 1.7806173249253236e-05,
"loss": 0.6463,
"step": 11640
},
{
"epoch": 1.9332890806505145,
"grad_norm": 2.9437766075134277,
"learning_rate": 1.7778515322491427e-05,
"loss": 0.6631,
"step": 11650
},
{
"epoch": 1.9349485562562232,
"grad_norm": 2.956129312515259,
"learning_rate": 1.7750857395729618e-05,
"loss": 0.5983,
"step": 11660
},
{
"epoch": 1.9366080318619316,
"grad_norm": 2.8484396934509277,
"learning_rate": 1.7723199468967808e-05,
"loss": 0.65,
"step": 11670
},
{
"epoch": 1.93826750746764,
"grad_norm": 2.8540737628936768,
"learning_rate": 1.7695541542206e-05,
"loss": 0.6743,
"step": 11680
},
{
"epoch": 1.9399269830733488,
"grad_norm": 2.9417595863342285,
"learning_rate": 1.7667883615444186e-05,
"loss": 0.6692,
"step": 11690
},
{
"epoch": 1.9415864586790574,
"grad_norm": 3.3319525718688965,
"learning_rate": 1.7640225688682377e-05,
"loss": 0.6147,
"step": 11700
},
{
"epoch": 1.943245934284766,
"grad_norm": 2.3488125801086426,
"learning_rate": 1.7612567761920567e-05,
"loss": 0.6646,
"step": 11710
},
{
"epoch": 1.9449054098904746,
"grad_norm": 3.0198192596435547,
"learning_rate": 1.7584909835158758e-05,
"loss": 0.6479,
"step": 11720
},
{
"epoch": 1.9465648854961832,
"grad_norm": 2.935741662979126,
"learning_rate": 1.7557251908396945e-05,
"loss": 0.6406,
"step": 11730
},
{
"epoch": 1.9482243611018917,
"grad_norm": 2.7237465381622314,
"learning_rate": 1.7529593981635136e-05,
"loss": 0.6627,
"step": 11740
},
{
"epoch": 1.9498838367076003,
"grad_norm": 3.8201403617858887,
"learning_rate": 1.7501936054873326e-05,
"loss": 0.6603,
"step": 11750
},
{
"epoch": 1.951543312313309,
"grad_norm": 2.511312484741211,
"learning_rate": 1.7474278128111517e-05,
"loss": 0.6557,
"step": 11760
},
{
"epoch": 1.9532027879190177,
"grad_norm": 3.6143879890441895,
"learning_rate": 1.7446620201349708e-05,
"loss": 0.6626,
"step": 11770
},
{
"epoch": 1.9548622635247261,
"grad_norm": 3.1605286598205566,
"learning_rate": 1.7418962274587898e-05,
"loss": 0.5847,
"step": 11780
},
{
"epoch": 1.9565217391304348,
"grad_norm": 3.041008949279785,
"learning_rate": 1.739130434782609e-05,
"loss": 0.6105,
"step": 11790
},
{
"epoch": 1.9581812147361433,
"grad_norm": 2.9670727252960205,
"learning_rate": 1.736364642106428e-05,
"loss": 0.6107,
"step": 11800
},
{
"epoch": 1.959840690341852,
"grad_norm": 3.077299118041992,
"learning_rate": 1.733598849430247e-05,
"loss": 0.6169,
"step": 11810
},
{
"epoch": 1.9615001659475606,
"grad_norm": 2.9002206325531006,
"learning_rate": 1.730833056754066e-05,
"loss": 0.6763,
"step": 11820
},
{
"epoch": 1.9631596415532693,
"grad_norm": 2.9046597480773926,
"learning_rate": 1.7280672640778848e-05,
"loss": 0.5729,
"step": 11830
},
{
"epoch": 1.9648191171589777,
"grad_norm": 3.2700092792510986,
"learning_rate": 1.725301471401704e-05,
"loss": 0.6393,
"step": 11840
},
{
"epoch": 1.9664785927646864,
"grad_norm": 2.5844552516937256,
"learning_rate": 1.722535678725523e-05,
"loss": 0.6303,
"step": 11850
},
{
"epoch": 1.9681380683703948,
"grad_norm": 3.339615821838379,
"learning_rate": 1.719769886049342e-05,
"loss": 0.6468,
"step": 11860
},
{
"epoch": 1.9697975439761035,
"grad_norm": 3.2334821224212646,
"learning_rate": 1.7170040933731607e-05,
"loss": 0.6765,
"step": 11870
},
{
"epoch": 1.9714570195818122,
"grad_norm": 2.5103602409362793,
"learning_rate": 1.7142383006969798e-05,
"loss": 0.7246,
"step": 11880
},
{
"epoch": 1.9731164951875209,
"grad_norm": 2.6350810527801514,
"learning_rate": 1.7114725080207988e-05,
"loss": 0.6222,
"step": 11890
},
{
"epoch": 1.9747759707932293,
"grad_norm": 3.489544630050659,
"learning_rate": 1.708706715344618e-05,
"loss": 0.6614,
"step": 11900
},
{
"epoch": 1.976435446398938,
"grad_norm": 2.692086935043335,
"learning_rate": 1.705940922668437e-05,
"loss": 0.705,
"step": 11910
},
{
"epoch": 1.9780949220046464,
"grad_norm": 3.1969101428985596,
"learning_rate": 1.7031751299922557e-05,
"loss": 0.6276,
"step": 11920
},
{
"epoch": 1.979754397610355,
"grad_norm": 2.7400033473968506,
"learning_rate": 1.7004093373160747e-05,
"loss": 0.6217,
"step": 11930
},
{
"epoch": 1.9814138732160638,
"grad_norm": 2.7665727138519287,
"learning_rate": 1.6976435446398938e-05,
"loss": 0.6248,
"step": 11940
},
{
"epoch": 1.9830733488217724,
"grad_norm": 3.2178454399108887,
"learning_rate": 1.694877751963713e-05,
"loss": 0.65,
"step": 11950
},
{
"epoch": 1.984732824427481,
"grad_norm": 2.3739986419677734,
"learning_rate": 1.692111959287532e-05,
"loss": 0.6424,
"step": 11960
},
{
"epoch": 1.9863923000331896,
"grad_norm": 3.2119979858398438,
"learning_rate": 1.6893461666113506e-05,
"loss": 0.6361,
"step": 11970
},
{
"epoch": 1.988051775638898,
"grad_norm": 3.085068941116333,
"learning_rate": 1.68658037393517e-05,
"loss": 0.6686,
"step": 11980
},
{
"epoch": 1.9897112512446067,
"grad_norm": 3.0079755783081055,
"learning_rate": 1.683814581258989e-05,
"loss": 0.6811,
"step": 11990
},
{
"epoch": 1.9913707268503154,
"grad_norm": 2.2022628784179688,
"learning_rate": 1.681048788582808e-05,
"loss": 0.6659,
"step": 12000
},
{
"epoch": 1.9913707268503154,
"eval_gen_len": 45.67319277108434,
"eval_loss": 0.6242377758026123,
"eval_model_preparation_time": 0.0137,
"eval_runtime": 1384.1691,
"eval_samples_per_second": 4.788,
"eval_steps_per_second": 0.3,
"step": 12000
}
],
"logging_steps": 10,
"max_steps": 18078,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.641411729138647e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}