cooking-qwen2.5-7b_v2 / trainer_state.json
lingcco's picture
Update
42f60fd verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 2488,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020096463022508037,
"grad_norm": 25.43042279854588,
"learning_rate": 1.6064257028112448e-07,
"loss": 1.5516,
"step": 5
},
{
"epoch": 0.0040192926045016075,
"grad_norm": 32.55468907279309,
"learning_rate": 3.614457831325301e-07,
"loss": 1.6285,
"step": 10
},
{
"epoch": 0.006028938906752411,
"grad_norm": 22.457510712636555,
"learning_rate": 5.622489959839358e-07,
"loss": 1.544,
"step": 15
},
{
"epoch": 0.008038585209003215,
"grad_norm": 12.994153155174507,
"learning_rate": 7.630522088353415e-07,
"loss": 1.4871,
"step": 20
},
{
"epoch": 0.01004823151125402,
"grad_norm": 13.10275994035317,
"learning_rate": 9.638554216867472e-07,
"loss": 1.3155,
"step": 25
},
{
"epoch": 0.012057877813504822,
"grad_norm": 9.886808975385357,
"learning_rate": 1.1646586345381528e-06,
"loss": 1.2181,
"step": 30
},
{
"epoch": 0.014067524115755627,
"grad_norm": 9.765195243051854,
"learning_rate": 1.3654618473895584e-06,
"loss": 1.1915,
"step": 35
},
{
"epoch": 0.01607717041800643,
"grad_norm": 10.076388693447448,
"learning_rate": 1.566265060240964e-06,
"loss": 1.1387,
"step": 40
},
{
"epoch": 0.018086816720257234,
"grad_norm": 9.74974655232714,
"learning_rate": 1.7670682730923696e-06,
"loss": 1.0759,
"step": 45
},
{
"epoch": 0.02009646302250804,
"grad_norm": 8.925268910533704,
"learning_rate": 1.967871485943775e-06,
"loss": 1.0175,
"step": 50
},
{
"epoch": 0.02009646302250804,
"eval_cooking_sharegpt_test_loss": 0.9884688854217529,
"eval_cooking_sharegpt_test_runtime": 29.6069,
"eval_cooking_sharegpt_test_samples_per_second": 6.755,
"eval_cooking_sharegpt_test_steps_per_second": 0.338,
"step": 50
},
{
"epoch": 0.022106109324758844,
"grad_norm": 9.02573766663436,
"learning_rate": 2.168674698795181e-06,
"loss": 0.9542,
"step": 55
},
{
"epoch": 0.024115755627009645,
"grad_norm": 9.819497852533047,
"learning_rate": 2.3694779116465868e-06,
"loss": 0.9784,
"step": 60
},
{
"epoch": 0.02612540192926045,
"grad_norm": 10.82311981416087,
"learning_rate": 2.5702811244979918e-06,
"loss": 1.0231,
"step": 65
},
{
"epoch": 0.028135048231511254,
"grad_norm": 8.964625514542233,
"learning_rate": 2.771084337349398e-06,
"loss": 0.8573,
"step": 70
},
{
"epoch": 0.03014469453376206,
"grad_norm": 8.532301605077798,
"learning_rate": 2.9718875502008034e-06,
"loss": 0.9551,
"step": 75
},
{
"epoch": 0.03215434083601286,
"grad_norm": 9.990828654438014,
"learning_rate": 3.172690763052209e-06,
"loss": 0.9182,
"step": 80
},
{
"epoch": 0.034163987138263664,
"grad_norm": 6.995692527275145,
"learning_rate": 3.3734939759036146e-06,
"loss": 0.8639,
"step": 85
},
{
"epoch": 0.03617363344051447,
"grad_norm": 7.768080152065188,
"learning_rate": 3.5742971887550204e-06,
"loss": 0.8521,
"step": 90
},
{
"epoch": 0.03818327974276527,
"grad_norm": 8.012119974468852,
"learning_rate": 3.7751004016064258e-06,
"loss": 0.8477,
"step": 95
},
{
"epoch": 0.04019292604501608,
"grad_norm": 8.50117811151367,
"learning_rate": 3.975903614457832e-06,
"loss": 0.8473,
"step": 100
},
{
"epoch": 0.04019292604501608,
"eval_cooking_sharegpt_test_loss": 0.8732408285140991,
"eval_cooking_sharegpt_test_runtime": 29.1007,
"eval_cooking_sharegpt_test_samples_per_second": 6.873,
"eval_cooking_sharegpt_test_steps_per_second": 0.344,
"step": 100
},
{
"epoch": 0.04220257234726688,
"grad_norm": 8.560443518575713,
"learning_rate": 4.176706827309237e-06,
"loss": 0.8652,
"step": 105
},
{
"epoch": 0.04421221864951769,
"grad_norm": 9.719112811630923,
"learning_rate": 4.377510040160643e-06,
"loss": 0.8941,
"step": 110
},
{
"epoch": 0.04622186495176849,
"grad_norm": 9.985676476471362,
"learning_rate": 4.578313253012049e-06,
"loss": 0.8859,
"step": 115
},
{
"epoch": 0.04823151125401929,
"grad_norm": 8.414088670486853,
"learning_rate": 4.779116465863454e-06,
"loss": 0.8043,
"step": 120
},
{
"epoch": 0.050241157556270094,
"grad_norm": 8.096501738165966,
"learning_rate": 4.979919678714859e-06,
"loss": 0.8565,
"step": 125
},
{
"epoch": 0.0522508038585209,
"grad_norm": 9.410994081814192,
"learning_rate": 5.180722891566266e-06,
"loss": 0.9358,
"step": 130
},
{
"epoch": 0.0542604501607717,
"grad_norm": 7.421809482089455,
"learning_rate": 5.381526104417672e-06,
"loss": 0.8336,
"step": 135
},
{
"epoch": 0.05627009646302251,
"grad_norm": 8.907995665308611,
"learning_rate": 5.582329317269076e-06,
"loss": 0.8398,
"step": 140
},
{
"epoch": 0.05827974276527331,
"grad_norm": 7.103996712375502,
"learning_rate": 5.783132530120482e-06,
"loss": 0.8702,
"step": 145
},
{
"epoch": 0.06028938906752412,
"grad_norm": 8.485504235075577,
"learning_rate": 5.983935742971888e-06,
"loss": 0.8542,
"step": 150
},
{
"epoch": 0.06028938906752412,
"eval_cooking_sharegpt_test_loss": 0.8469827175140381,
"eval_cooking_sharegpt_test_runtime": 29.1354,
"eval_cooking_sharegpt_test_samples_per_second": 6.864,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 150
},
{
"epoch": 0.06229903536977492,
"grad_norm": 7.565513220130896,
"learning_rate": 6.184738955823294e-06,
"loss": 0.8558,
"step": 155
},
{
"epoch": 0.06430868167202572,
"grad_norm": 9.7278611728726,
"learning_rate": 6.385542168674699e-06,
"loss": 0.9122,
"step": 160
},
{
"epoch": 0.06631832797427653,
"grad_norm": 8.171433737702468,
"learning_rate": 6.586345381526105e-06,
"loss": 0.8225,
"step": 165
},
{
"epoch": 0.06832797427652733,
"grad_norm": 7.424345320287168,
"learning_rate": 6.78714859437751e-06,
"loss": 0.838,
"step": 170
},
{
"epoch": 0.07033762057877814,
"grad_norm": 6.778825035197842,
"learning_rate": 6.987951807228917e-06,
"loss": 0.7698,
"step": 175
},
{
"epoch": 0.07234726688102894,
"grad_norm": 8.160040638569848,
"learning_rate": 7.188755020080321e-06,
"loss": 0.8443,
"step": 180
},
{
"epoch": 0.07435691318327975,
"grad_norm": 7.56314606717551,
"learning_rate": 7.389558232931727e-06,
"loss": 0.7953,
"step": 185
},
{
"epoch": 0.07636655948553055,
"grad_norm": 7.860496549752045,
"learning_rate": 7.590361445783133e-06,
"loss": 0.8839,
"step": 190
},
{
"epoch": 0.07837620578778134,
"grad_norm": 6.887754234302554,
"learning_rate": 7.79116465863454e-06,
"loss": 0.796,
"step": 195
},
{
"epoch": 0.08038585209003216,
"grad_norm": 7.785553062894794,
"learning_rate": 7.991967871485944e-06,
"loss": 0.8336,
"step": 200
},
{
"epoch": 0.08038585209003216,
"eval_cooking_sharegpt_test_loss": 0.8259029984474182,
"eval_cooking_sharegpt_test_runtime": 29.1235,
"eval_cooking_sharegpt_test_samples_per_second": 6.867,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 200
},
{
"epoch": 0.08239549839228295,
"grad_norm": 6.994614576677781,
"learning_rate": 8.19277108433735e-06,
"loss": 0.8181,
"step": 205
},
{
"epoch": 0.08440514469453377,
"grad_norm": 7.772788858949606,
"learning_rate": 8.393574297188756e-06,
"loss": 0.8408,
"step": 210
},
{
"epoch": 0.08641479099678456,
"grad_norm": 7.254386585395993,
"learning_rate": 8.594377510040161e-06,
"loss": 0.9085,
"step": 215
},
{
"epoch": 0.08842443729903537,
"grad_norm": 7.026321018704356,
"learning_rate": 8.795180722891567e-06,
"loss": 0.8782,
"step": 220
},
{
"epoch": 0.09043408360128617,
"grad_norm": 7.051507761624435,
"learning_rate": 8.995983935742972e-06,
"loss": 0.9409,
"step": 225
},
{
"epoch": 0.09244372990353698,
"grad_norm": 7.606219516981015,
"learning_rate": 9.196787148594378e-06,
"loss": 0.8555,
"step": 230
},
{
"epoch": 0.09445337620578778,
"grad_norm": 6.28124366103456,
"learning_rate": 9.397590361445785e-06,
"loss": 0.7534,
"step": 235
},
{
"epoch": 0.09646302250803858,
"grad_norm": 8.394942968125275,
"learning_rate": 9.598393574297188e-06,
"loss": 0.8402,
"step": 240
},
{
"epoch": 0.09847266881028939,
"grad_norm": 7.156441873531409,
"learning_rate": 9.799196787148595e-06,
"loss": 0.8081,
"step": 245
},
{
"epoch": 0.10048231511254019,
"grad_norm": 7.327698916304687,
"learning_rate": 1e-05,
"loss": 0.7723,
"step": 250
},
{
"epoch": 0.10048231511254019,
"eval_cooking_sharegpt_test_loss": 0.8293350338935852,
"eval_cooking_sharegpt_test_runtime": 29.0949,
"eval_cooking_sharegpt_test_samples_per_second": 6.874,
"eval_cooking_sharegpt_test_steps_per_second": 0.344,
"step": 250
},
{
"epoch": 0.102491961414791,
"grad_norm": 7.4235500079351375,
"learning_rate": 9.999876953350016e-06,
"loss": 0.8151,
"step": 255
},
{
"epoch": 0.1045016077170418,
"grad_norm": 7.084895816067402,
"learning_rate": 9.999507819456254e-06,
"loss": 0.7621,
"step": 260
},
{
"epoch": 0.10651125401929261,
"grad_norm": 6.151331764138221,
"learning_rate": 9.998892616486991e-06,
"loss": 0.803,
"step": 265
},
{
"epoch": 0.1085209003215434,
"grad_norm": 7.692005262695669,
"learning_rate": 9.99803137472169e-06,
"loss": 0.7968,
"step": 270
},
{
"epoch": 0.11053054662379422,
"grad_norm": 7.296616213551366,
"learning_rate": 9.996924136549519e-06,
"loss": 0.8934,
"step": 275
},
{
"epoch": 0.11254019292604502,
"grad_norm": 6.78581453909987,
"learning_rate": 9.995570956467257e-06,
"loss": 0.8168,
"step": 280
},
{
"epoch": 0.11454983922829581,
"grad_norm": 6.629376557450756,
"learning_rate": 9.993971901076614e-06,
"loss": 0.8536,
"step": 285
},
{
"epoch": 0.11655948553054662,
"grad_norm": 6.125893044121695,
"learning_rate": 9.992127049080952e-06,
"loss": 0.8304,
"step": 290
},
{
"epoch": 0.11856913183279742,
"grad_norm": 6.3713359597576416,
"learning_rate": 9.990036491281418e-06,
"loss": 0.8069,
"step": 295
},
{
"epoch": 0.12057877813504823,
"grad_norm": 6.375162182116364,
"learning_rate": 9.98770033057246e-06,
"loss": 0.8101,
"step": 300
},
{
"epoch": 0.12057877813504823,
"eval_cooking_sharegpt_test_loss": 0.797538161277771,
"eval_cooking_sharegpt_test_runtime": 29.1177,
"eval_cooking_sharegpt_test_samples_per_second": 6.869,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 300
},
{
"epoch": 0.12258842443729903,
"grad_norm": 7.814959047420182,
"learning_rate": 9.985118681936783e-06,
"loss": 0.8315,
"step": 305
},
{
"epoch": 0.12459807073954984,
"grad_norm": 6.8494733103452194,
"learning_rate": 9.982291672439671e-06,
"loss": 0.7654,
"step": 310
},
{
"epoch": 0.12660771704180065,
"grad_norm": 5.286224086045848,
"learning_rate": 9.979219441222743e-06,
"loss": 0.7776,
"step": 315
},
{
"epoch": 0.12861736334405144,
"grad_norm": 6.103426753817979,
"learning_rate": 9.975902139497105e-06,
"loss": 0.815,
"step": 320
},
{
"epoch": 0.13062700964630225,
"grad_norm": 5.520317998483916,
"learning_rate": 9.972339930535897e-06,
"loss": 0.813,
"step": 325
},
{
"epoch": 0.13263665594855306,
"grad_norm": 5.9863235363550045,
"learning_rate": 9.968532989666277e-06,
"loss": 0.7504,
"step": 330
},
{
"epoch": 0.13464630225080385,
"grad_norm": 7.212782518237145,
"learning_rate": 9.96448150426077e-06,
"loss": 0.8715,
"step": 335
},
{
"epoch": 0.13665594855305466,
"grad_norm": 5.912590773720034,
"learning_rate": 9.96018567372806e-06,
"loss": 0.7558,
"step": 340
},
{
"epoch": 0.13866559485530547,
"grad_norm": 5.77435813010867,
"learning_rate": 9.95564570950317e-06,
"loss": 0.779,
"step": 345
},
{
"epoch": 0.14067524115755628,
"grad_norm": 6.505458706803495,
"learning_rate": 9.950861835037053e-06,
"loss": 0.8514,
"step": 350
},
{
"epoch": 0.14067524115755628,
"eval_cooking_sharegpt_test_loss": 0.7728434205055237,
"eval_cooking_sharegpt_test_runtime": 29.1638,
"eval_cooking_sharegpt_test_samples_per_second": 6.858,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 350
},
{
"epoch": 0.14268488745980706,
"grad_norm": 5.6452446665378115,
"learning_rate": 9.945834285785601e-06,
"loss": 0.6856,
"step": 355
},
{
"epoch": 0.14469453376205788,
"grad_norm": 5.6265260781387365,
"learning_rate": 9.94056330919805e-06,
"loss": 0.8083,
"step": 360
},
{
"epoch": 0.1467041800643087,
"grad_norm": 5.368904478416109,
"learning_rate": 9.935049164704809e-06,
"loss": 0.6928,
"step": 365
},
{
"epoch": 0.1487138263665595,
"grad_norm": 7.881458451902342,
"learning_rate": 9.929292123704677e-06,
"loss": 0.7741,
"step": 370
},
{
"epoch": 0.15072347266881028,
"grad_norm": 6.532061219960651,
"learning_rate": 9.923292469551498e-06,
"loss": 0.8097,
"step": 375
},
{
"epoch": 0.1527331189710611,
"grad_norm": 6.450102699245797,
"learning_rate": 9.91705049754021e-06,
"loss": 0.8684,
"step": 380
},
{
"epoch": 0.1547427652733119,
"grad_norm": 5.574142168212173,
"learning_rate": 9.910566514892311e-06,
"loss": 0.7809,
"step": 385
},
{
"epoch": 0.1567524115755627,
"grad_norm": 6.795118484011259,
"learning_rate": 9.903840840740739e-06,
"loss": 0.8092,
"step": 390
},
{
"epoch": 0.1587620578778135,
"grad_norm": 6.521136743595867,
"learning_rate": 9.896873806114164e-06,
"loss": 0.7888,
"step": 395
},
{
"epoch": 0.1607717041800643,
"grad_norm": 6.632901741168955,
"learning_rate": 9.889665753920693e-06,
"loss": 0.7539,
"step": 400
},
{
"epoch": 0.1607717041800643,
"eval_cooking_sharegpt_test_loss": 0.7521212697029114,
"eval_cooking_sharegpt_test_runtime": 29.1522,
"eval_cooking_sharegpt_test_samples_per_second": 6.861,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 400
},
{
"epoch": 0.16278135048231512,
"grad_norm": 5.843964500525819,
"learning_rate": 9.882217038930996e-06,
"loss": 0.7583,
"step": 405
},
{
"epoch": 0.1647909967845659,
"grad_norm": 5.922756974819583,
"learning_rate": 9.874528027760844e-06,
"loss": 0.7904,
"step": 410
},
{
"epoch": 0.16680064308681672,
"grad_norm": 5.184809873689736,
"learning_rate": 9.866599098853065e-06,
"loss": 0.6878,
"step": 415
},
{
"epoch": 0.16881028938906753,
"grad_norm": 6.0072795392775875,
"learning_rate": 9.858430642458911e-06,
"loss": 0.7625,
"step": 420
},
{
"epoch": 0.17081993569131831,
"grad_norm": 6.3898364197395505,
"learning_rate": 9.850023060618865e-06,
"loss": 0.8075,
"step": 425
},
{
"epoch": 0.17282958199356913,
"grad_norm": 5.4615901982761725,
"learning_rate": 9.841376767142836e-06,
"loss": 0.7334,
"step": 430
},
{
"epoch": 0.17483922829581994,
"grad_norm": 5.072280085120411,
"learning_rate": 9.832492187589803e-06,
"loss": 0.7006,
"step": 435
},
{
"epoch": 0.17684887459807075,
"grad_norm": 4.902329337695089,
"learning_rate": 9.823369759246866e-06,
"loss": 0.7779,
"step": 440
},
{
"epoch": 0.17885852090032153,
"grad_norm": 5.961804769639084,
"learning_rate": 9.814009931107724e-06,
"loss": 0.7983,
"step": 445
},
{
"epoch": 0.18086816720257234,
"grad_norm": 4.8028511364670115,
"learning_rate": 9.804413163850578e-06,
"loss": 0.6964,
"step": 450
},
{
"epoch": 0.18086816720257234,
"eval_cooking_sharegpt_test_loss": 0.7369500994682312,
"eval_cooking_sharegpt_test_runtime": 29.1063,
"eval_cooking_sharegpt_test_samples_per_second": 6.871,
"eval_cooking_sharegpt_test_steps_per_second": 0.344,
"step": 450
},
{
"epoch": 0.18287781350482316,
"grad_norm": 4.8914718117824405,
"learning_rate": 9.79457992981545e-06,
"loss": 0.7367,
"step": 455
},
{
"epoch": 0.18488745980707397,
"grad_norm": 5.402352390784586,
"learning_rate": 9.784510712980944e-06,
"loss": 0.6798,
"step": 460
},
{
"epoch": 0.18689710610932475,
"grad_norm": 5.701737276259646,
"learning_rate": 9.774206008940418e-06,
"loss": 0.7226,
"step": 465
},
{
"epoch": 0.18890675241157556,
"grad_norm": 6.263433603685716,
"learning_rate": 9.7636663248776e-06,
"loss": 0.8274,
"step": 470
},
{
"epoch": 0.19091639871382637,
"grad_norm": 5.312503789863195,
"learning_rate": 9.75289217954161e-06,
"loss": 0.7734,
"step": 475
},
{
"epoch": 0.19292604501607716,
"grad_norm": 5.156337860674348,
"learning_rate": 9.741884103221451e-06,
"loss": 0.7659,
"step": 480
},
{
"epoch": 0.19493569131832797,
"grad_norm": 6.800003297542305,
"learning_rate": 9.730642637719884e-06,
"loss": 0.7985,
"step": 485
},
{
"epoch": 0.19694533762057878,
"grad_norm": 5.800498897893185,
"learning_rate": 9.71916833632678e-06,
"loss": 0.7221,
"step": 490
},
{
"epoch": 0.1989549839228296,
"grad_norm": 4.793819929790139,
"learning_rate": 9.707461763791879e-06,
"loss": 0.715,
"step": 495
},
{
"epoch": 0.20096463022508038,
"grad_norm": 5.1410323292838065,
"learning_rate": 9.69552349629699e-06,
"loss": 0.7628,
"step": 500
},
{
"epoch": 0.20096463022508038,
"eval_cooking_sharegpt_test_loss": 0.7155391573905945,
"eval_cooking_sharegpt_test_runtime": 29.1357,
"eval_cooking_sharegpt_test_samples_per_second": 6.864,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 500
},
{
"epoch": 0.2029742765273312,
"grad_norm": 4.650244956653454,
"learning_rate": 9.683354121427645e-06,
"loss": 0.6865,
"step": 505
},
{
"epoch": 0.204983922829582,
"grad_norm": 5.005060659354274,
"learning_rate": 9.670954238144165e-06,
"loss": 0.7376,
"step": 510
},
{
"epoch": 0.2069935691318328,
"grad_norm": 4.9392891385103495,
"learning_rate": 9.658324456752194e-06,
"loss": 0.6808,
"step": 515
},
{
"epoch": 0.2090032154340836,
"grad_norm": 5.07290134380499,
"learning_rate": 9.645465398872645e-06,
"loss": 0.6335,
"step": 520
},
{
"epoch": 0.2110128617363344,
"grad_norm": 6.372463646337311,
"learning_rate": 9.632377697411114e-06,
"loss": 0.7125,
"step": 525
},
{
"epoch": 0.21302250803858522,
"grad_norm": 4.8784584544005565,
"learning_rate": 9.619061996526735e-06,
"loss": 0.7647,
"step": 530
},
{
"epoch": 0.215032154340836,
"grad_norm": 4.991918572196447,
"learning_rate": 9.605518951600456e-06,
"loss": 0.7159,
"step": 535
},
{
"epoch": 0.2170418006430868,
"grad_norm": 4.7469831719019,
"learning_rate": 9.591749229202805e-06,
"loss": 0.8187,
"step": 540
},
{
"epoch": 0.21905144694533762,
"grad_norm": 5.013038664469992,
"learning_rate": 9.577753507061063e-06,
"loss": 0.7215,
"step": 545
},
{
"epoch": 0.22106109324758844,
"grad_norm": 4.814679760540896,
"learning_rate": 9.563532474025922e-06,
"loss": 0.6789,
"step": 550
},
{
"epoch": 0.22106109324758844,
"eval_cooking_sharegpt_test_loss": 0.706028163433075,
"eval_cooking_sharegpt_test_runtime": 29.1544,
"eval_cooking_sharegpt_test_samples_per_second": 6.86,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 550
},
{
"epoch": 0.22307073954983922,
"grad_norm": 5.071008410167568,
"learning_rate": 9.549086830037573e-06,
"loss": 0.7722,
"step": 555
},
{
"epoch": 0.22508038585209003,
"grad_norm": 4.819320792169161,
"learning_rate": 9.534417286091254e-06,
"loss": 0.6459,
"step": 560
},
{
"epoch": 0.22709003215434084,
"grad_norm": 5.805791958928098,
"learning_rate": 9.519524564202261e-06,
"loss": 0.7018,
"step": 565
},
{
"epoch": 0.22909967845659163,
"grad_norm": 5.261658646649239,
"learning_rate": 9.50440939737041e-06,
"loss": 0.7692,
"step": 570
},
{
"epoch": 0.23110932475884244,
"grad_norm": 6.656814899492019,
"learning_rate": 9.489072529543955e-06,
"loss": 0.8188,
"step": 575
},
{
"epoch": 0.23311897106109325,
"grad_norm": 5.482276123459045,
"learning_rate": 9.473514715582982e-06,
"loss": 0.727,
"step": 580
},
{
"epoch": 0.23512861736334406,
"grad_norm": 5.089480873020227,
"learning_rate": 9.457736721222245e-06,
"loss": 0.7129,
"step": 585
},
{
"epoch": 0.23713826366559485,
"grad_norm": 4.422268494779517,
"learning_rate": 9.441739323033485e-06,
"loss": 0.6732,
"step": 590
},
{
"epoch": 0.23914790996784566,
"grad_norm": 5.561830088062397,
"learning_rate": 9.425523308387203e-06,
"loss": 0.625,
"step": 595
},
{
"epoch": 0.24115755627009647,
"grad_norm": 5.241025479724879,
"learning_rate": 9.409089475413912e-06,
"loss": 0.743,
"step": 600
},
{
"epoch": 0.24115755627009647,
"eval_cooking_sharegpt_test_loss": 0.7122946381568909,
"eval_cooking_sharegpt_test_runtime": 29.1346,
"eval_cooking_sharegpt_test_samples_per_second": 6.865,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 600
},
{
"epoch": 0.24316720257234728,
"grad_norm": 5.9963960390112945,
"learning_rate": 9.392438632964847e-06,
"loss": 0.7522,
"step": 605
},
{
"epoch": 0.24517684887459806,
"grad_norm": 5.407942176340359,
"learning_rate": 9.375571600572165e-06,
"loss": 0.7116,
"step": 610
},
{
"epoch": 0.24718649517684887,
"grad_norm": 5.318941810734261,
"learning_rate": 9.358489208408594e-06,
"loss": 0.7307,
"step": 615
},
{
"epoch": 0.2491961414790997,
"grad_norm": 4.610264529077515,
"learning_rate": 9.341192297246588e-06,
"loss": 0.7274,
"step": 620
},
{
"epoch": 0.2512057877813505,
"grad_norm": 4.432769811479363,
"learning_rate": 9.323681718416937e-06,
"loss": 0.6281,
"step": 625
},
{
"epoch": 0.2532154340836013,
"grad_norm": 4.6735771702678965,
"learning_rate": 9.305958333766867e-06,
"loss": 0.6655,
"step": 630
},
{
"epoch": 0.25522508038585207,
"grad_norm": 6.091308209571449,
"learning_rate": 9.288023015617618e-06,
"loss": 0.7275,
"step": 635
},
{
"epoch": 0.2572347266881029,
"grad_norm": 4.6463773174566,
"learning_rate": 9.269876646721519e-06,
"loss": 0.6827,
"step": 640
},
{
"epoch": 0.2592443729903537,
"grad_norm": 4.452810636438574,
"learning_rate": 9.251520120218528e-06,
"loss": 0.6883,
"step": 645
},
{
"epoch": 0.2612540192926045,
"grad_norm": 6.398787653321348,
"learning_rate": 9.232954339592285e-06,
"loss": 0.7807,
"step": 650
},
{
"epoch": 0.2612540192926045,
"eval_cooking_sharegpt_test_loss": 0.6995799541473389,
"eval_cooking_sharegpt_test_runtime": 29.1239,
"eval_cooking_sharegpt_test_samples_per_second": 6.867,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 650
},
{
"epoch": 0.2632636655948553,
"grad_norm": 5.489052927932185,
"learning_rate": 9.214180218625632e-06,
"loss": 0.6858,
"step": 655
},
{
"epoch": 0.2652733118971061,
"grad_norm": 5.330659230842556,
"learning_rate": 9.195198681355647e-06,
"loss": 0.711,
"step": 660
},
{
"epoch": 0.26728295819935693,
"grad_norm": 5.581942505955761,
"learning_rate": 9.176010662028157e-06,
"loss": 0.6628,
"step": 665
},
{
"epoch": 0.2692926045016077,
"grad_norm": 5.131847371991602,
"learning_rate": 9.156617105051763e-06,
"loss": 0.679,
"step": 670
},
{
"epoch": 0.2713022508038585,
"grad_norm": 4.7770105092526896,
"learning_rate": 9.13701896495135e-06,
"loss": 0.7146,
"step": 675
},
{
"epoch": 0.2733118971061093,
"grad_norm": 4.982534700495619,
"learning_rate": 9.117217206321113e-06,
"loss": 0.7721,
"step": 680
},
{
"epoch": 0.2753215434083601,
"grad_norm": 4.798649936226053,
"learning_rate": 9.09721280377708e-06,
"loss": 0.7748,
"step": 685
},
{
"epoch": 0.27733118971061094,
"grad_norm": 4.601964700694306,
"learning_rate": 9.077006741909133e-06,
"loss": 0.7435,
"step": 690
},
{
"epoch": 0.27934083601286175,
"grad_norm": 5.566494391677513,
"learning_rate": 9.056600015232567e-06,
"loss": 0.6952,
"step": 695
},
{
"epoch": 0.28135048231511256,
"grad_norm": 5.727808059117751,
"learning_rate": 9.035993628139117e-06,
"loss": 0.6711,
"step": 700
},
{
"epoch": 0.28135048231511256,
"eval_cooking_sharegpt_test_loss": 0.6903340220451355,
"eval_cooking_sharegpt_test_runtime": 29.1049,
"eval_cooking_sharegpt_test_samples_per_second": 6.872,
"eval_cooking_sharegpt_test_steps_per_second": 0.344,
"step": 700
},
{
"epoch": 0.28336012861736337,
"grad_norm": 3.8470896481653494,
"learning_rate": 9.01518859484755e-06,
"loss": 0.6729,
"step": 705
},
{
"epoch": 0.2853697749196141,
"grad_norm": 4.87977503518934,
"learning_rate": 8.99418593935372e-06,
"loss": 0.6863,
"step": 710
},
{
"epoch": 0.28737942122186494,
"grad_norm": 4.674016276471173,
"learning_rate": 8.972986695380189e-06,
"loss": 0.6651,
"step": 715
},
{
"epoch": 0.28938906752411575,
"grad_norm": 5.15746212298852,
"learning_rate": 8.95159190632534e-06,
"loss": 0.6642,
"step": 720
},
{
"epoch": 0.29139871382636656,
"grad_norm": 5.180219693756491,
"learning_rate": 8.930002625212018e-06,
"loss": 0.6115,
"step": 725
},
{
"epoch": 0.2934083601286174,
"grad_norm": 4.6310292453141315,
"learning_rate": 8.908219914635711e-06,
"loss": 0.7092,
"step": 730
},
{
"epoch": 0.2954180064308682,
"grad_norm": 5.683343490286694,
"learning_rate": 8.886244846712245e-06,
"loss": 0.7257,
"step": 735
},
{
"epoch": 0.297427652733119,
"grad_norm": 4.9104420352970015,
"learning_rate": 8.864078503025017e-06,
"loss": 0.7523,
"step": 740
},
{
"epoch": 0.29943729903536975,
"grad_norm": 4.821845146152399,
"learning_rate": 8.841721974571758e-06,
"loss": 0.6734,
"step": 745
},
{
"epoch": 0.30144694533762056,
"grad_norm": 4.700309572076156,
"learning_rate": 8.819176361710842e-06,
"loss": 0.6201,
"step": 750
},
{
"epoch": 0.30144694533762056,
"eval_cooking_sharegpt_test_loss": 0.6827989816665649,
"eval_cooking_sharegpt_test_runtime": 29.1653,
"eval_cooking_sharegpt_test_samples_per_second": 6.857,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 750
},
{
"epoch": 0.3034565916398714,
"grad_norm": 4.984810224063278,
"learning_rate": 8.796442774107123e-06,
"loss": 0.7233,
"step": 755
},
{
"epoch": 0.3054662379421222,
"grad_norm": 4.572885347797854,
"learning_rate": 8.77352233067732e-06,
"loss": 0.7791,
"step": 760
},
{
"epoch": 0.307475884244373,
"grad_norm": 4.723286735310018,
"learning_rate": 8.750416159534944e-06,
"loss": 0.692,
"step": 765
},
{
"epoch": 0.3094855305466238,
"grad_norm": 5.060218829161682,
"learning_rate": 8.727125397934777e-06,
"loss": 0.6615,
"step": 770
},
{
"epoch": 0.3114951768488746,
"grad_norm": 4.833251557506912,
"learning_rate": 8.703651192216896e-06,
"loss": 0.7046,
"step": 775
},
{
"epoch": 0.3135048231511254,
"grad_norm": 5.151428527882602,
"learning_rate": 8.67999469775025e-06,
"loss": 0.6859,
"step": 780
},
{
"epoch": 0.3155144694533762,
"grad_norm": 5.403069037330436,
"learning_rate": 8.656157078875794e-06,
"loss": 0.6585,
"step": 785
},
{
"epoch": 0.317524115755627,
"grad_norm": 4.594702329252282,
"learning_rate": 8.632139508849192e-06,
"loss": 0.6662,
"step": 790
},
{
"epoch": 0.3195337620578778,
"grad_norm": 3.82227587899388,
"learning_rate": 8.60794316978305e-06,
"loss": 0.6479,
"step": 795
},
{
"epoch": 0.3215434083601286,
"grad_norm": 4.050592455983538,
"learning_rate": 8.583569252588761e-06,
"loss": 0.6634,
"step": 800
},
{
"epoch": 0.3215434083601286,
"eval_cooking_sharegpt_test_loss": 0.6700084209442139,
"eval_cooking_sharegpt_test_runtime": 29.1558,
"eval_cooking_sharegpt_test_samples_per_second": 6.86,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 800
},
{
"epoch": 0.32355305466237944,
"grad_norm": 4.7546247450248895,
"learning_rate": 8.559018956917864e-06,
"loss": 0.6893,
"step": 805
},
{
"epoch": 0.32556270096463025,
"grad_norm": 4.892180049499131,
"learning_rate": 8.534293491103014e-06,
"loss": 0.7171,
"step": 810
},
{
"epoch": 0.327572347266881,
"grad_norm": 4.593573940130922,
"learning_rate": 8.50939407209851e-06,
"loss": 0.602,
"step": 815
},
{
"epoch": 0.3295819935691318,
"grad_norm": 4.812442397504877,
"learning_rate": 8.484321925420383e-06,
"loss": 0.6965,
"step": 820
},
{
"epoch": 0.3315916398713826,
"grad_norm": 4.995658301324656,
"learning_rate": 8.459078285086103e-06,
"loss": 0.6757,
"step": 825
},
{
"epoch": 0.33360128617363344,
"grad_norm": 4.411896351243907,
"learning_rate": 8.433664393553815e-06,
"loss": 0.6125,
"step": 830
},
{
"epoch": 0.33561093247588425,
"grad_norm": 4.037149487682339,
"learning_rate": 8.40808150166121e-06,
"loss": 0.5841,
"step": 835
},
{
"epoch": 0.33762057877813506,
"grad_norm": 4.6302565097915656,
"learning_rate": 8.382330868563943e-06,
"loss": 0.669,
"step": 840
},
{
"epoch": 0.3396302250803859,
"grad_norm": 5.037886894281096,
"learning_rate": 8.35641376167367e-06,
"loss": 0.6602,
"step": 845
},
{
"epoch": 0.34163987138263663,
"grad_norm": 4.116477405728893,
"learning_rate": 8.330331456595663e-06,
"loss": 0.6318,
"step": 850
},
{
"epoch": 0.34163987138263663,
"eval_cooking_sharegpt_test_loss": 0.6634958386421204,
"eval_cooking_sharegpt_test_runtime": 29.1155,
"eval_cooking_sharegpt_test_samples_per_second": 6.869,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 850
},
{
"epoch": 0.34364951768488744,
"grad_norm": 4.53205416614136,
"learning_rate": 8.304085237066027e-06,
"loss": 0.7296,
"step": 855
},
{
"epoch": 0.34565916398713825,
"grad_norm": 4.562829710605114,
"learning_rate": 8.277676394888518e-06,
"loss": 0.6152,
"step": 860
},
{
"epoch": 0.34766881028938906,
"grad_norm": 4.850911945798656,
"learning_rate": 8.25110622987096e-06,
"loss": 0.6514,
"step": 865
},
{
"epoch": 0.3496784565916399,
"grad_norm": 4.655697540410078,
"learning_rate": 8.22437604976127e-06,
"loss": 0.7196,
"step": 870
},
{
"epoch": 0.3516881028938907,
"grad_norm": 5.125990956860812,
"learning_rate": 8.197487170183092e-06,
"loss": 0.6654,
"step": 875
},
{
"epoch": 0.3536977491961415,
"grad_norm": 4.758879553215313,
"learning_rate": 8.170440914571052e-06,
"loss": 0.6771,
"step": 880
},
{
"epoch": 0.3557073954983923,
"grad_norm": 4.800616804892499,
"learning_rate": 8.143238614105608e-06,
"loss": 0.6825,
"step": 885
},
{
"epoch": 0.35771704180064307,
"grad_norm": 5.081512148323733,
"learning_rate": 8.115881607647538e-06,
"loss": 0.6968,
"step": 890
},
{
"epoch": 0.3597266881028939,
"grad_norm": 4.824963989656846,
"learning_rate": 8.08837124167204e-06,
"loss": 0.6879,
"step": 895
},
{
"epoch": 0.3617363344051447,
"grad_norm": 5.409545786784468,
"learning_rate": 8.060708870202462e-06,
"loss": 0.7033,
"step": 900
},
{
"epoch": 0.3617363344051447,
"eval_cooking_sharegpt_test_loss": 0.6626113653182983,
"eval_cooking_sharegpt_test_runtime": 29.1696,
"eval_cooking_sharegpt_test_samples_per_second": 6.856,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 900
},
{
"epoch": 0.3637459807073955,
"grad_norm": 3.7618538333099982,
"learning_rate": 8.032895854743661e-06,
"loss": 0.6522,
"step": 905
},
{
"epoch": 0.3657556270096463,
"grad_norm": 4.543338033090877,
"learning_rate": 8.004933564214991e-06,
"loss": 0.589,
"step": 910
},
{
"epoch": 0.3677652733118971,
"grad_norm": 4.486029590225905,
"learning_rate": 7.976823374882919e-06,
"loss": 0.6684,
"step": 915
},
{
"epoch": 0.36977491961414793,
"grad_norm": 4.184644901714797,
"learning_rate": 7.948566670293298e-06,
"loss": 0.6203,
"step": 920
},
{
"epoch": 0.3717845659163987,
"grad_norm": 4.232384871040644,
"learning_rate": 7.920164841203262e-06,
"loss": 0.6393,
"step": 925
},
{
"epoch": 0.3737942122186495,
"grad_norm": 4.593579658625747,
"learning_rate": 7.891619285512781e-06,
"loss": 0.7574,
"step": 930
},
{
"epoch": 0.3758038585209003,
"grad_norm": 4.23534793280711,
"learning_rate": 7.862931408195855e-06,
"loss": 0.5811,
"step": 935
},
{
"epoch": 0.3778135048231511,
"grad_norm": 4.805442331593937,
"learning_rate": 7.834102621231364e-06,
"loss": 0.6265,
"step": 940
},
{
"epoch": 0.37982315112540194,
"grad_norm": 4.667120661635427,
"learning_rate": 7.805134343533572e-06,
"loss": 0.6295,
"step": 945
},
{
"epoch": 0.38183279742765275,
"grad_norm": 4.3862142407448905,
"learning_rate": 7.776028000882288e-06,
"loss": 0.6715,
"step": 950
},
{
"epoch": 0.38183279742765275,
"eval_cooking_sharegpt_test_loss": 0.6476317048072815,
"eval_cooking_sharegpt_test_runtime": 29.1486,
"eval_cooking_sharegpt_test_samples_per_second": 6.861,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 950
},
{
"epoch": 0.38384244372990356,
"grad_norm": 4.516234494825295,
"learning_rate": 7.746785025852695e-06,
"loss": 0.6513,
"step": 955
},
{
"epoch": 0.3858520900321543,
"grad_norm": 4.063375240398369,
"learning_rate": 7.717406857744837e-06,
"loss": 0.5945,
"step": 960
},
{
"epoch": 0.3878617363344051,
"grad_norm": 4.280617916285887,
"learning_rate": 7.687894942512786e-06,
"loss": 0.6263,
"step": 965
},
{
"epoch": 0.38987138263665594,
"grad_norm": 4.2306442061102585,
"learning_rate": 7.65825073269346e-06,
"loss": 0.6307,
"step": 970
},
{
"epoch": 0.39188102893890675,
"grad_norm": 3.83492814563052,
"learning_rate": 7.628475687335142e-06,
"loss": 0.6768,
"step": 975
},
{
"epoch": 0.39389067524115756,
"grad_norm": 3.91144567078091,
"learning_rate": 7.598571271925667e-06,
"loss": 0.5288,
"step": 980
},
{
"epoch": 0.3959003215434084,
"grad_norm": 4.8926561756291145,
"learning_rate": 7.568538958320291e-06,
"loss": 0.5691,
"step": 985
},
{
"epoch": 0.3979099678456592,
"grad_norm": 5.343345050879741,
"learning_rate": 7.538380224669244e-06,
"loss": 0.681,
"step": 990
},
{
"epoch": 0.39991961414790994,
"grad_norm": 4.801310495851274,
"learning_rate": 7.5080965553449834e-06,
"loss": 0.6365,
"step": 995
},
{
"epoch": 0.40192926045016075,
"grad_norm": 4.263810617817441,
"learning_rate": 7.477689440869135e-06,
"loss": 0.6511,
"step": 1000
},
{
"epoch": 0.40192926045016075,
"eval_cooking_sharegpt_test_loss": 0.640380859375,
"eval_cooking_sharegpt_test_runtime": 29.1538,
"eval_cooking_sharegpt_test_samples_per_second": 6.86,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1000
},
{
"epoch": 0.40393890675241156,
"grad_norm": 4.608176994643662,
"learning_rate": 7.447160377839125e-06,
"loss": 0.6558,
"step": 1005
},
{
"epoch": 0.4059485530546624,
"grad_norm": 4.184997495410128,
"learning_rate": 7.416510868854529e-06,
"loss": 0.6028,
"step": 1010
},
{
"epoch": 0.4079581993569132,
"grad_norm": 4.14811982617309,
"learning_rate": 7.385742422443108e-06,
"loss": 0.6116,
"step": 1015
},
{
"epoch": 0.409967845659164,
"grad_norm": 4.4179772033730975,
"learning_rate": 7.354856552986563e-06,
"loss": 0.6657,
"step": 1020
},
{
"epoch": 0.4119774919614148,
"grad_norm": 3.872451257996019,
"learning_rate": 7.323854780646002e-06,
"loss": 0.616,
"step": 1025
},
{
"epoch": 0.4139871382636656,
"grad_norm": 4.575816735109395,
"learning_rate": 7.2927386312871185e-06,
"loss": 0.6595,
"step": 1030
},
{
"epoch": 0.4159967845659164,
"grad_norm": 4.636123500937769,
"learning_rate": 7.261509636405087e-06,
"loss": 0.537,
"step": 1035
},
{
"epoch": 0.4180064308681672,
"grad_norm": 5.110397360196885,
"learning_rate": 7.230169333049188e-06,
"loss": 0.6751,
"step": 1040
},
{
"epoch": 0.420016077170418,
"grad_norm": 4.095989085644148,
"learning_rate": 7.198719263747158e-06,
"loss": 0.6638,
"step": 1045
},
{
"epoch": 0.4220257234726688,
"grad_norm": 5.40481290202407,
"learning_rate": 7.167160976429264e-06,
"loss": 0.6804,
"step": 1050
},
{
"epoch": 0.4220257234726688,
"eval_cooking_sharegpt_test_loss": 0.6353456974029541,
"eval_cooking_sharegpt_test_runtime": 29.1174,
"eval_cooking_sharegpt_test_samples_per_second": 6.869,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1050
},
{
"epoch": 0.4240353697749196,
"grad_norm": 4.345481799467673,
"learning_rate": 7.13549602435212e-06,
"loss": 0.6407,
"step": 1055
},
{
"epoch": 0.42604501607717044,
"grad_norm": 4.1689356990951145,
"learning_rate": 7.103725966022233e-06,
"loss": 0.6676,
"step": 1060
},
{
"epoch": 0.42805466237942125,
"grad_norm": 4.124587131944435,
"learning_rate": 7.071852365119306e-06,
"loss": 0.5613,
"step": 1065
},
{
"epoch": 0.430064308681672,
"grad_norm": 5.215102388713326,
"learning_rate": 7.039876790419262e-06,
"loss": 0.6349,
"step": 1070
},
{
"epoch": 0.4320739549839228,
"grad_norm": 4.222772794043426,
"learning_rate": 7.0078008157170415e-06,
"loss": 0.5982,
"step": 1075
},
{
"epoch": 0.4340836012861736,
"grad_norm": 4.044144044404326,
"learning_rate": 6.975626019749137e-06,
"loss": 0.6009,
"step": 1080
},
{
"epoch": 0.43609324758842444,
"grad_norm": 4.504810284400397,
"learning_rate": 6.943353986115893e-06,
"loss": 0.6371,
"step": 1085
},
{
"epoch": 0.43810289389067525,
"grad_norm": 4.200082136051,
"learning_rate": 6.910986303203556e-06,
"loss": 0.6367,
"step": 1090
},
{
"epoch": 0.44011254019292606,
"grad_norm": 4.474891462331243,
"learning_rate": 6.87852456410611e-06,
"loss": 0.6916,
"step": 1095
},
{
"epoch": 0.44212218649517687,
"grad_norm": 5.613000149936893,
"learning_rate": 6.845970366546856e-06,
"loss": 0.6355,
"step": 1100
},
{
"epoch": 0.44212218649517687,
"eval_cooking_sharegpt_test_loss": 0.6258378028869629,
"eval_cooking_sharegpt_test_runtime": 29.1265,
"eval_cooking_sharegpt_test_samples_per_second": 6.867,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1100
},
{
"epoch": 0.44413183279742763,
"grad_norm": 5.123887472129782,
"learning_rate": 6.813325312799769e-06,
"loss": 0.6296,
"step": 1105
},
{
"epoch": 0.44614147909967844,
"grad_norm": 4.4261628776726605,
"learning_rate": 6.7805910096106555e-06,
"loss": 0.5624,
"step": 1110
},
{
"epoch": 0.44815112540192925,
"grad_norm": 4.638630416730711,
"learning_rate": 6.747769068118049e-06,
"loss": 0.6354,
"step": 1115
},
{
"epoch": 0.45016077170418006,
"grad_norm": 4.461694988531885,
"learning_rate": 6.714861103773934e-06,
"loss": 0.5248,
"step": 1120
},
{
"epoch": 0.4521704180064309,
"grad_norm": 5.163765933017904,
"learning_rate": 6.681868736264215e-06,
"loss": 0.6462,
"step": 1125
},
{
"epoch": 0.4541800643086817,
"grad_norm": 4.6310268658457545,
"learning_rate": 6.648793589429011e-06,
"loss": 0.6174,
"step": 1130
},
{
"epoch": 0.4561897106109325,
"grad_norm": 5.126634815687219,
"learning_rate": 6.61563729118273e-06,
"loss": 0.6466,
"step": 1135
},
{
"epoch": 0.45819935691318325,
"grad_norm": 4.319380604514048,
"learning_rate": 6.582401473433941e-06,
"loss": 0.654,
"step": 1140
},
{
"epoch": 0.46020900321543406,
"grad_norm": 4.247314315589029,
"learning_rate": 6.5490877720050574e-06,
"loss": 0.5634,
"step": 1145
},
{
"epoch": 0.4622186495176849,
"grad_norm": 4.906257391887683,
"learning_rate": 6.515697826551822e-06,
"loss": 0.692,
"step": 1150
},
{
"epoch": 0.4622186495176849,
"eval_cooking_sharegpt_test_loss": 0.6236215829849243,
"eval_cooking_sharegpt_test_runtime": 29.1167,
"eval_cooking_sharegpt_test_samples_per_second": 6.869,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1150
},
{
"epoch": 0.4642282958199357,
"grad_norm": 4.819507990509822,
"learning_rate": 6.482233280482608e-06,
"loss": 0.6147,
"step": 1155
},
{
"epoch": 0.4662379421221865,
"grad_norm": 4.478005354409713,
"learning_rate": 6.448695780877532e-06,
"loss": 0.6581,
"step": 1160
},
{
"epoch": 0.4682475884244373,
"grad_norm": 4.193130144327688,
"learning_rate": 6.415086978407382e-06,
"loss": 0.6124,
"step": 1165
},
{
"epoch": 0.4702572347266881,
"grad_norm": 4.1578998395779365,
"learning_rate": 6.381408527252381e-06,
"loss": 0.6238,
"step": 1170
},
{
"epoch": 0.47226688102893893,
"grad_norm": 3.879162148025109,
"learning_rate": 6.347662085020764e-06,
"loss": 0.5786,
"step": 1175
},
{
"epoch": 0.4742765273311897,
"grad_norm": 4.9955060839894765,
"learning_rate": 6.313849312667197e-06,
"loss": 0.6763,
"step": 1180
},
{
"epoch": 0.4762861736334405,
"grad_norm": 4.446633316148537,
"learning_rate": 6.279971874411027e-06,
"loss": 0.6339,
"step": 1185
},
{
"epoch": 0.4782958199356913,
"grad_norm": 4.288694053242222,
"learning_rate": 6.246031437654368e-06,
"loss": 0.616,
"step": 1190
},
{
"epoch": 0.4803054662379421,
"grad_norm": 3.8400933875164087,
"learning_rate": 6.2120296729000395e-06,
"loss": 0.6927,
"step": 1195
},
{
"epoch": 0.48231511254019294,
"grad_norm": 5.8265497485402955,
"learning_rate": 6.177968253669337e-06,
"loss": 0.7054,
"step": 1200
},
{
"epoch": 0.48231511254019294,
"eval_cooking_sharegpt_test_loss": 0.6158734560012817,
"eval_cooking_sharegpt_test_runtime": 29.1595,
"eval_cooking_sharegpt_test_samples_per_second": 6.859,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1200
},
{
"epoch": 0.48432475884244375,
"grad_norm": 4.586416002456061,
"learning_rate": 6.143848856419675e-06,
"loss": 0.6032,
"step": 1205
},
{
"epoch": 0.48633440514469456,
"grad_norm": 4.9377135174761895,
"learning_rate": 6.109673160462063e-06,
"loss": 0.6026,
"step": 1210
},
{
"epoch": 0.4883440514469453,
"grad_norm": 5.323719984452999,
"learning_rate": 6.075442847878463e-06,
"loss": 0.671,
"step": 1215
},
{
"epoch": 0.4903536977491961,
"grad_norm": 4.045561951621483,
"learning_rate": 6.041159603438991e-06,
"loss": 0.5717,
"step": 1220
},
{
"epoch": 0.49236334405144694,
"grad_norm": 3.7625870662514562,
"learning_rate": 6.006825114518998e-06,
"loss": 0.5493,
"step": 1225
},
{
"epoch": 0.49437299035369775,
"grad_norm": 4.58456420076278,
"learning_rate": 5.9724410710160184e-06,
"loss": 0.5905,
"step": 1230
},
{
"epoch": 0.49638263665594856,
"grad_norm": 4.536041072205623,
"learning_rate": 5.938009165266603e-06,
"loss": 0.6284,
"step": 1235
},
{
"epoch": 0.4983922829581994,
"grad_norm": 4.028915918788256,
"learning_rate": 5.903531091963011e-06,
"loss": 0.5853,
"step": 1240
},
{
"epoch": 0.5004019292604501,
"grad_norm": 4.599870175094803,
"learning_rate": 5.8690085480698075e-06,
"loss": 0.5881,
"step": 1245
},
{
"epoch": 0.502411575562701,
"grad_norm": 4.439772860645252,
"learning_rate": 5.834443232740346e-06,
"loss": 0.6095,
"step": 1250
},
{
"epoch": 0.502411575562701,
"eval_cooking_sharegpt_test_loss": 0.6074568033218384,
"eval_cooking_sharegpt_test_runtime": 29.1196,
"eval_cooking_sharegpt_test_samples_per_second": 6.868,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1250
},
{
"epoch": 0.5044212218649518,
"grad_norm": 4.467706371524093,
"learning_rate": 5.799836847233129e-06,
"loss": 0.6264,
"step": 1255
},
{
"epoch": 0.5064308681672026,
"grad_norm": 4.101563851958729,
"learning_rate": 5.765191094828078e-06,
"loss": 0.555,
"step": 1260
},
{
"epoch": 0.5084405144694534,
"grad_norm": 4.466205286827134,
"learning_rate": 5.7305076807426975e-06,
"loss": 0.5756,
"step": 1265
},
{
"epoch": 0.5104501607717041,
"grad_norm": 4.701080377162357,
"learning_rate": 5.695788312048159e-06,
"loss": 0.6317,
"step": 1270
},
{
"epoch": 0.512459807073955,
"grad_norm": 4.433691534657307,
"learning_rate": 5.66103469758526e-06,
"loss": 0.6215,
"step": 1275
},
{
"epoch": 0.5144694533762058,
"grad_norm": 4.755580900859219,
"learning_rate": 5.626248547880337e-06,
"loss": 0.5824,
"step": 1280
},
{
"epoch": 0.5164790996784566,
"grad_norm": 3.679078766710477,
"learning_rate": 5.591431575061064e-06,
"loss": 0.5474,
"step": 1285
},
{
"epoch": 0.5184887459807074,
"grad_norm": 4.484327374671317,
"learning_rate": 5.55658549277219e-06,
"loss": 0.649,
"step": 1290
},
{
"epoch": 0.5204983922829582,
"grad_norm": 4.505852120752897,
"learning_rate": 5.5217120160911886e-06,
"loss": 0.6159,
"step": 1295
},
{
"epoch": 0.522508038585209,
"grad_norm": 5.01215860870057,
"learning_rate": 5.486812861443852e-06,
"loss": 0.6294,
"step": 1300
},
{
"epoch": 0.522508038585209,
"eval_cooking_sharegpt_test_loss": 0.595450758934021,
"eval_cooking_sharegpt_test_runtime": 29.1321,
"eval_cooking_sharegpt_test_samples_per_second": 6.865,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1300
},
{
"epoch": 0.5245176848874598,
"grad_norm": 4.622631736422735,
"learning_rate": 5.45188974651981e-06,
"loss": 0.6114,
"step": 1305
},
{
"epoch": 0.5265273311897106,
"grad_norm": 4.518942479671849,
"learning_rate": 5.416944390187977e-06,
"loss": 0.6818,
"step": 1310
},
{
"epoch": 0.5285369774919614,
"grad_norm": 4.341948028095467,
"learning_rate": 5.381978512411968e-06,
"loss": 0.5809,
"step": 1315
},
{
"epoch": 0.5305466237942122,
"grad_norm": 3.88446157180582,
"learning_rate": 5.346993834165431e-06,
"loss": 0.5869,
"step": 1320
},
{
"epoch": 0.532556270096463,
"grad_norm": 4.430345179234876,
"learning_rate": 5.311992077347351e-06,
"loss": 0.6948,
"step": 1325
},
{
"epoch": 0.5345659163987139,
"grad_norm": 4.164240920296163,
"learning_rate": 5.2769749646972935e-06,
"loss": 0.5607,
"step": 1330
},
{
"epoch": 0.5365755627009646,
"grad_norm": 4.2836756208794515,
"learning_rate": 5.241944219710624e-06,
"loss": 0.6401,
"step": 1335
},
{
"epoch": 0.5385852090032154,
"grad_norm": 4.126109249564745,
"learning_rate": 5.206901566553665e-06,
"loss": 0.5776,
"step": 1340
},
{
"epoch": 0.5405948553054662,
"grad_norm": 4.661804105805685,
"learning_rate": 5.171848729978851e-06,
"loss": 0.6129,
"step": 1345
},
{
"epoch": 0.542604501607717,
"grad_norm": 4.448287794131716,
"learning_rate": 5.136787435239825e-06,
"loss": 0.615,
"step": 1350
},
{
"epoch": 0.542604501607717,
"eval_cooking_sharegpt_test_loss": 0.5893608331680298,
"eval_cooking_sharegpt_test_runtime": 29.1232,
"eval_cooking_sharegpt_test_samples_per_second": 6.867,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1350
},
{
"epoch": 0.5446141479099679,
"grad_norm": 3.831547012944043,
"learning_rate": 5.101719408006534e-06,
"loss": 0.5785,
"step": 1355
},
{
"epoch": 0.5466237942122186,
"grad_norm": 4.528939245986926,
"learning_rate": 5.0666463742802855e-06,
"loss": 0.6062,
"step": 1360
},
{
"epoch": 0.5486334405144695,
"grad_norm": 4.507770528828299,
"learning_rate": 5.031570060308799e-06,
"loss": 0.5992,
"step": 1365
},
{
"epoch": 0.5506430868167203,
"grad_norm": 3.890107991234021,
"learning_rate": 4.996492192501251e-06,
"loss": 0.5942,
"step": 1370
},
{
"epoch": 0.552652733118971,
"grad_norm": 4.065283624907987,
"learning_rate": 4.9614144973432855e-06,
"loss": 0.5971,
"step": 1375
},
{
"epoch": 0.5546623794212219,
"grad_norm": 4.937482234167603,
"learning_rate": 4.926338701312059e-06,
"loss": 0.6404,
"step": 1380
},
{
"epoch": 0.5566720257234726,
"grad_norm": 3.9548565502184325,
"learning_rate": 4.8912665307912435e-06,
"loss": 0.5026,
"step": 1385
},
{
"epoch": 0.5586816720257235,
"grad_norm": 4.423584782540341,
"learning_rate": 4.856199711986082e-06,
"loss": 0.6386,
"step": 1390
},
{
"epoch": 0.5606913183279743,
"grad_norm": 4.303957079156144,
"learning_rate": 4.8211399708384e-06,
"loss": 0.581,
"step": 1395
},
{
"epoch": 0.5627009646302251,
"grad_norm": 4.666098899764007,
"learning_rate": 4.786089032941683e-06,
"loss": 0.6602,
"step": 1400
},
{
"epoch": 0.5627009646302251,
"eval_cooking_sharegpt_test_loss": 0.5806075930595398,
"eval_cooking_sharegpt_test_runtime": 29.1487,
"eval_cooking_sharegpt_test_samples_per_second": 6.861,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1400
},
{
"epoch": 0.5647106109324759,
"grad_norm": 4.509076285193486,
"learning_rate": 4.75104862345612e-06,
"loss": 0.738,
"step": 1405
},
{
"epoch": 0.5667202572347267,
"grad_norm": 4.238722358099732,
"learning_rate": 4.716020467023716e-06,
"loss": 0.564,
"step": 1410
},
{
"epoch": 0.5687299035369775,
"grad_norm": 3.7723995730660347,
"learning_rate": 4.68100628768339e-06,
"loss": 0.584,
"step": 1415
},
{
"epoch": 0.5707395498392283,
"grad_norm": 4.403249450502511,
"learning_rate": 4.646007808786132e-06,
"loss": 0.5753,
"step": 1420
},
{
"epoch": 0.5727491961414791,
"grad_norm": 3.4896736313458927,
"learning_rate": 4.611026752910172e-06,
"loss": 0.4941,
"step": 1425
},
{
"epoch": 0.5747588424437299,
"grad_norm": 4.782921151183437,
"learning_rate": 4.576064841776207e-06,
"loss": 0.5882,
"step": 1430
},
{
"epoch": 0.5767684887459807,
"grad_norm": 4.523388455740361,
"learning_rate": 4.541123796162656e-06,
"loss": 0.6504,
"step": 1435
},
{
"epoch": 0.5787781350482315,
"grad_norm": 4.846919506027111,
"learning_rate": 4.506205335820959e-06,
"loss": 0.6503,
"step": 1440
},
{
"epoch": 0.5807877813504824,
"grad_norm": 4.353820829633878,
"learning_rate": 4.471311179390946e-06,
"loss": 0.5788,
"step": 1445
},
{
"epoch": 0.5827974276527331,
"grad_norm": 4.515430059421715,
"learning_rate": 4.436443044316236e-06,
"loss": 0.6004,
"step": 1450
},
{
"epoch": 0.5827974276527331,
"eval_cooking_sharegpt_test_loss": 0.5744032263755798,
"eval_cooking_sharegpt_test_runtime": 29.1343,
"eval_cooking_sharegpt_test_samples_per_second": 6.865,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1450
},
{
"epoch": 0.5848070739549839,
"grad_norm": 3.80274840853952,
"learning_rate": 4.401602646759717e-06,
"loss": 0.5645,
"step": 1455
},
{
"epoch": 0.5868167202572347,
"grad_norm": 3.626670598918427,
"learning_rate": 4.366791701519065e-06,
"loss": 0.5602,
"step": 1460
},
{
"epoch": 0.5888263665594855,
"grad_norm": 3.9743598170373886,
"learning_rate": 4.332011921942365e-06,
"loss": 0.5964,
"step": 1465
},
{
"epoch": 0.5908360128617364,
"grad_norm": 3.83250555228501,
"learning_rate": 4.297265019843755e-06,
"loss": 0.5535,
"step": 1470
},
{
"epoch": 0.5928456591639871,
"grad_norm": 3.6139874244672194,
"learning_rate": 4.262552705419203e-06,
"loss": 0.5168,
"step": 1475
},
{
"epoch": 0.594855305466238,
"grad_norm": 4.740911098327277,
"learning_rate": 4.227876687162303e-06,
"loss": 0.525,
"step": 1480
},
{
"epoch": 0.5968649517684887,
"grad_norm": 3.7870286568972276,
"learning_rate": 4.193238671780212e-06,
"loss": 0.5515,
"step": 1485
},
{
"epoch": 0.5988745980707395,
"grad_norm": 5.1311308301838086,
"learning_rate": 4.15864036410963e-06,
"loss": 0.6117,
"step": 1490
},
{
"epoch": 0.6008842443729904,
"grad_norm": 4.4089674794586236,
"learning_rate": 4.124083467032902e-06,
"loss": 0.5846,
"step": 1495
},
{
"epoch": 0.6028938906752411,
"grad_norm": 4.051889135358413,
"learning_rate": 4.08956968139419e-06,
"loss": 0.5855,
"step": 1500
},
{
"epoch": 0.6028938906752411,
"eval_cooking_sharegpt_test_loss": 0.5694165229797363,
"eval_cooking_sharegpt_test_runtime": 29.1123,
"eval_cooking_sharegpt_test_samples_per_second": 6.87,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1500
},
{
"epoch": 0.604903536977492,
"grad_norm": 4.1318438937071,
"learning_rate": 4.05510070591578e-06,
"loss": 0.6138,
"step": 1505
},
{
"epoch": 0.6069131832797428,
"grad_norm": 3.601915970098065,
"learning_rate": 4.020678237114451e-06,
"loss": 0.5932,
"step": 1510
},
{
"epoch": 0.6089228295819936,
"grad_norm": 3.9179716478128372,
"learning_rate": 3.986303969217996e-06,
"loss": 0.5754,
"step": 1515
},
{
"epoch": 0.6109324758842444,
"grad_norm": 4.231240281833697,
"learning_rate": 3.951979594081818e-06,
"loss": 0.5833,
"step": 1520
},
{
"epoch": 0.6129421221864951,
"grad_norm": 3.6790737814106462,
"learning_rate": 3.917706801105663e-06,
"loss": 0.5875,
"step": 1525
},
{
"epoch": 0.614951768488746,
"grad_norm": 3.83516561393742,
"learning_rate": 3.883487277150481e-06,
"loss": 0.5629,
"step": 1530
},
{
"epoch": 0.6169614147909968,
"grad_norm": 4.844335590656684,
"learning_rate": 3.849322706455379e-06,
"loss": 0.5862,
"step": 1535
},
{
"epoch": 0.6189710610932476,
"grad_norm": 3.9186087206693996,
"learning_rate": 3.815214770554755e-06,
"loss": 0.5158,
"step": 1540
},
{
"epoch": 0.6209807073954984,
"grad_norm": 3.9221369945811198,
"learning_rate": 3.781165148195501e-06,
"loss": 0.5216,
"step": 1545
},
{
"epoch": 0.6229903536977492,
"grad_norm": 3.3756618558855633,
"learning_rate": 3.74717551525441e-06,
"loss": 0.5138,
"step": 1550
},
{
"epoch": 0.6229903536977492,
"eval_cooking_sharegpt_test_loss": 0.5629362463951111,
"eval_cooking_sharegpt_test_runtime": 29.1527,
"eval_cooking_sharegpt_test_samples_per_second": 6.86,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1550
},
{
"epoch": 0.625,
"grad_norm": 4.312505621340993,
"learning_rate": 3.713247544655663e-06,
"loss": 0.5655,
"step": 1555
},
{
"epoch": 0.6270096463022508,
"grad_norm": 4.442878870711612,
"learning_rate": 3.6793829062885133e-06,
"loss": 0.5324,
"step": 1560
},
{
"epoch": 0.6290192926045016,
"grad_norm": 4.1821188904384785,
"learning_rate": 3.6455832669250798e-06,
"loss": 0.5367,
"step": 1565
},
{
"epoch": 0.6310289389067524,
"grad_norm": 4.16995310964458,
"learning_rate": 3.611850290138322e-06,
"loss": 0.5449,
"step": 1570
},
{
"epoch": 0.6330385852090032,
"grad_norm": 4.103557618161658,
"learning_rate": 3.578185636220154e-06,
"loss": 0.547,
"step": 1575
},
{
"epoch": 0.635048231511254,
"grad_norm": 4.221137750943906,
"learning_rate": 3.5445909620997317e-06,
"loss": 0.6128,
"step": 1580
},
{
"epoch": 0.6370578778135049,
"grad_norm": 4.685243877985315,
"learning_rate": 3.511067921261897e-06,
"loss": 0.5288,
"step": 1585
},
{
"epoch": 0.6390675241157556,
"grad_norm": 3.7485957955570526,
"learning_rate": 3.4776181636658004e-06,
"loss": 0.5361,
"step": 1590
},
{
"epoch": 0.6410771704180064,
"grad_norm": 5.13753736382163,
"learning_rate": 3.444243335663685e-06,
"loss": 0.6099,
"step": 1595
},
{
"epoch": 0.6430868167202572,
"grad_norm": 4.20430212422138,
"learning_rate": 3.4109450799198667e-06,
"loss": 0.5544,
"step": 1600
},
{
"epoch": 0.6430868167202572,
"eval_cooking_sharegpt_test_loss": 0.5597677826881409,
"eval_cooking_sharegpt_test_runtime": 29.1383,
"eval_cooking_sharegpt_test_samples_per_second": 6.864,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1600
},
{
"epoch": 0.645096463022508,
"grad_norm": 4.178198769845903,
"learning_rate": 3.3777250353298725e-06,
"loss": 0.5958,
"step": 1605
},
{
"epoch": 0.6471061093247589,
"grad_norm": 4.042255503484116,
"learning_rate": 3.344584836939777e-06,
"loss": 0.596,
"step": 1610
},
{
"epoch": 0.6491157556270096,
"grad_norm": 4.187189657432103,
"learning_rate": 3.3115261158657443e-06,
"loss": 0.5823,
"step": 1615
},
{
"epoch": 0.6511254019292605,
"grad_norm": 4.893019445921785,
"learning_rate": 3.2785504992137208e-06,
"loss": 0.5981,
"step": 1620
},
{
"epoch": 0.6531350482315113,
"grad_norm": 4.211816172053425,
"learning_rate": 3.2456596099993744e-06,
"loss": 0.6481,
"step": 1625
},
{
"epoch": 0.655144694533762,
"grad_norm": 4.1700590638139925,
"learning_rate": 3.2128550670681946e-06,
"loss": 0.5761,
"step": 1630
},
{
"epoch": 0.6571543408360129,
"grad_norm": 4.161630224139953,
"learning_rate": 3.18013848501583e-06,
"loss": 0.5866,
"step": 1635
},
{
"epoch": 0.6591639871382636,
"grad_norm": 3.557727720022592,
"learning_rate": 3.1475114741086064e-06,
"loss": 0.4835,
"step": 1640
},
{
"epoch": 0.6611736334405145,
"grad_norm": 4.852478551572312,
"learning_rate": 3.114975640204282e-06,
"loss": 0.5574,
"step": 1645
},
{
"epoch": 0.6631832797427653,
"grad_norm": 3.859784406459026,
"learning_rate": 3.0825325846730013e-06,
"loss": 0.5624,
"step": 1650
},
{
"epoch": 0.6631832797427653,
"eval_cooking_sharegpt_test_loss": 0.5586913228034973,
"eval_cooking_sharegpt_test_runtime": 29.1426,
"eval_cooking_sharegpt_test_samples_per_second": 6.863,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1650
},
{
"epoch": 0.6651929260450161,
"grad_norm": 4.482495052564249,
"learning_rate": 3.0501839043184858e-06,
"loss": 0.5688,
"step": 1655
},
{
"epoch": 0.6672025723472669,
"grad_norm": 4.042365197538812,
"learning_rate": 3.017931191299433e-06,
"loss": 0.5349,
"step": 1660
},
{
"epoch": 0.6692122186495176,
"grad_norm": 4.165393275797863,
"learning_rate": 2.985776033051161e-06,
"loss": 0.5798,
"step": 1665
},
{
"epoch": 0.6712218649517685,
"grad_norm": 4.037795568611387,
"learning_rate": 2.9537200122074684e-06,
"loss": 0.5308,
"step": 1670
},
{
"epoch": 0.6732315112540193,
"grad_norm": 3.870075592190109,
"learning_rate": 2.9217647065227474e-06,
"loss": 0.5248,
"step": 1675
},
{
"epoch": 0.6752411575562701,
"grad_norm": 2.871993790875757,
"learning_rate": 2.889911688794322e-06,
"loss": 0.5273,
"step": 1680
},
{
"epoch": 0.6772508038585209,
"grad_norm": 4.430515877745258,
"learning_rate": 2.858162526785046e-06,
"loss": 0.5656,
"step": 1685
},
{
"epoch": 0.6792604501607717,
"grad_norm": 5.1614822907000395,
"learning_rate": 2.8265187831461234e-06,
"loss": 0.5579,
"step": 1690
},
{
"epoch": 0.6812700964630225,
"grad_norm": 4.630422383447896,
"learning_rate": 2.7949820153402163e-06,
"loss": 0.6282,
"step": 1695
},
{
"epoch": 0.6832797427652733,
"grad_norm": 3.181023467670562,
"learning_rate": 2.763553775564778e-06,
"loss": 0.5093,
"step": 1700
},
{
"epoch": 0.6832797427652733,
"eval_cooking_sharegpt_test_loss": 0.5516761541366577,
"eval_cooking_sharegpt_test_runtime": 29.1563,
"eval_cooking_sharegpt_test_samples_per_second": 6.86,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1700
},
{
"epoch": 0.6852893890675241,
"grad_norm": 4.646508693714103,
"learning_rate": 2.732235610675652e-06,
"loss": 0.6124,
"step": 1705
},
{
"epoch": 0.6872990353697749,
"grad_norm": 4.162112396548259,
"learning_rate": 2.7010290621109527e-06,
"loss": 0.5413,
"step": 1710
},
{
"epoch": 0.6893086816720257,
"grad_norm": 3.6513543600819998,
"learning_rate": 2.6699356658151766e-06,
"loss": 0.535,
"step": 1715
},
{
"epoch": 0.6913183279742765,
"grad_norm": 5.33408477701188,
"learning_rate": 2.6389569521636325e-06,
"loss": 0.6191,
"step": 1720
},
{
"epoch": 0.6933279742765274,
"grad_norm": 4.0009358447843315,
"learning_rate": 2.6080944458870884e-06,
"loss": 0.5353,
"step": 1725
},
{
"epoch": 0.6953376205787781,
"grad_norm": 4.288625288455154,
"learning_rate": 2.577349665996752e-06,
"loss": 0.605,
"step": 1730
},
{
"epoch": 0.697347266881029,
"grad_norm": 3.7812731515985427,
"learning_rate": 2.5467241257094844e-06,
"loss": 0.4522,
"step": 1735
},
{
"epoch": 0.6993569131832797,
"grad_norm": 4.4091214999888635,
"learning_rate": 2.5162193323733475e-06,
"loss": 0.598,
"step": 1740
},
{
"epoch": 0.7013665594855305,
"grad_norm": 4.1788240029265,
"learning_rate": 2.4858367873933885e-06,
"loss": 0.5406,
"step": 1745
},
{
"epoch": 0.7033762057877814,
"grad_norm": 3.944073910719997,
"learning_rate": 2.455577986157762e-06,
"loss": 0.5658,
"step": 1750
},
{
"epoch": 0.7033762057877814,
"eval_cooking_sharegpt_test_loss": 0.54704350233078,
"eval_cooking_sharegpt_test_runtime": 29.1394,
"eval_cooking_sharegpt_test_samples_per_second": 6.864,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1750
},
{
"epoch": 0.7053858520900321,
"grad_norm": 4.0409641002525065,
"learning_rate": 2.425444417964112e-06,
"loss": 0.5993,
"step": 1755
},
{
"epoch": 0.707395498392283,
"grad_norm": 3.830335679595113,
"learning_rate": 2.395437565946291e-06,
"loss": 0.4863,
"step": 1760
},
{
"epoch": 0.7094051446945338,
"grad_norm": 3.9483809252674984,
"learning_rate": 2.3655589070013434e-06,
"loss": 0.538,
"step": 1765
},
{
"epoch": 0.7114147909967846,
"grad_norm": 4.721569878991402,
"learning_rate": 2.3358099117168277e-06,
"loss": 0.6086,
"step": 1770
},
{
"epoch": 0.7134244372990354,
"grad_norm": 3.817648873180748,
"learning_rate": 2.3061920442984237e-06,
"loss": 0.5537,
"step": 1775
},
{
"epoch": 0.7154340836012861,
"grad_norm": 4.952943362320339,
"learning_rate": 2.276706762497881e-06,
"loss": 0.5734,
"step": 1780
},
{
"epoch": 0.717443729903537,
"grad_norm": 4.308685442110032,
"learning_rate": 2.247355517541259e-06,
"loss": 0.5245,
"step": 1785
},
{
"epoch": 0.7194533762057878,
"grad_norm": 3.6876910060455015,
"learning_rate": 2.2181397540575012e-06,
"loss": 0.4904,
"step": 1790
},
{
"epoch": 0.7214630225080386,
"grad_norm": 4.056961760469262,
"learning_rate": 2.1890609100073406e-06,
"loss": 0.5792,
"step": 1795
},
{
"epoch": 0.7234726688102894,
"grad_norm": 5.194938789182878,
"learning_rate": 2.1601204166125097e-06,
"loss": 0.5797,
"step": 1800
},
{
"epoch": 0.7234726688102894,
"eval_cooking_sharegpt_test_loss": 0.5419730544090271,
"eval_cooking_sharegpt_test_runtime": 29.1573,
"eval_cooking_sharegpt_test_samples_per_second": 6.859,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1800
},
{
"epoch": 0.7254823151125402,
"grad_norm": 4.5439902420076,
"learning_rate": 2.131319698285321e-06,
"loss": 0.5149,
"step": 1805
},
{
"epoch": 0.727491961414791,
"grad_norm": 4.193817296752171,
"learning_rate": 2.1026601725585303e-06,
"loss": 0.5707,
"step": 1810
},
{
"epoch": 0.7295016077170418,
"grad_norm": 4.352857862781889,
"learning_rate": 2.0741432500155957e-06,
"loss": 0.5501,
"step": 1815
},
{
"epoch": 0.7315112540192926,
"grad_norm": 4.693742662043891,
"learning_rate": 2.045770334221227e-06,
"loss": 0.5476,
"step": 1820
},
{
"epoch": 0.7335209003215434,
"grad_norm": 3.811953363220478,
"learning_rate": 2.017542821652321e-06,
"loss": 0.5512,
"step": 1825
},
{
"epoch": 0.7355305466237942,
"grad_norm": 4.0209013372552285,
"learning_rate": 1.9894621016292233e-06,
"loss": 0.5004,
"step": 1830
},
{
"epoch": 0.737540192926045,
"grad_norm": 3.7470655457186286,
"learning_rate": 1.9615295562473445e-06,
"loss": 0.5138,
"step": 1835
},
{
"epoch": 0.7395498392282959,
"grad_norm": 5.0495297609749255,
"learning_rate": 1.933746560309137e-06,
"loss": 0.5589,
"step": 1840
},
{
"epoch": 0.7415594855305466,
"grad_norm": 4.4178604447303895,
"learning_rate": 1.906114481256432e-06,
"loss": 0.5416,
"step": 1845
},
{
"epoch": 0.7435691318327974,
"grad_norm": 3.8034316360410676,
"learning_rate": 1.8786346791031356e-06,
"loss": 0.5376,
"step": 1850
},
{
"epoch": 0.7435691318327974,
"eval_cooking_sharegpt_test_loss": 0.5386413335800171,
"eval_cooking_sharegpt_test_runtime": 29.1364,
"eval_cooking_sharegpt_test_samples_per_second": 6.864,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1850
},
{
"epoch": 0.7455787781350482,
"grad_norm": 3.885403344737721,
"learning_rate": 1.8513085063682828e-06,
"loss": 0.5474,
"step": 1855
},
{
"epoch": 0.747588424437299,
"grad_norm": 3.5164892334257343,
"learning_rate": 1.8241373080094822e-06,
"loss": 0.4625,
"step": 1860
},
{
"epoch": 0.7495980707395499,
"grad_norm": 3.5334086129543008,
"learning_rate": 1.7971224213567017e-06,
"loss": 0.4698,
"step": 1865
},
{
"epoch": 0.7516077170418006,
"grad_norm": 3.7281802070266736,
"learning_rate": 1.77026517604647e-06,
"loss": 0.5437,
"step": 1870
},
{
"epoch": 0.7536173633440515,
"grad_norm": 6.112903495715869,
"learning_rate": 1.7435668939564065e-06,
"loss": 0.5897,
"step": 1875
},
{
"epoch": 0.7556270096463023,
"grad_norm": 4.903593470757499,
"learning_rate": 1.7170288891401836e-06,
"loss": 0.543,
"step": 1880
},
{
"epoch": 0.757636655948553,
"grad_norm": 4.1471801370352175,
"learning_rate": 1.6906524677628345e-06,
"loss": 0.5533,
"step": 1885
},
{
"epoch": 0.7596463022508039,
"grad_norm": 4.44067669671905,
"learning_rate": 1.6644389280364748e-06,
"loss": 0.5232,
"step": 1890
},
{
"epoch": 0.7616559485530546,
"grad_norm": 4.25010398827249,
"learning_rate": 1.6383895601564047e-06,
"loss": 0.6047,
"step": 1895
},
{
"epoch": 0.7636655948553055,
"grad_norm": 4.312065419247702,
"learning_rate": 1.6125056462376065e-06,
"loss": 0.5323,
"step": 1900
},
{
"epoch": 0.7636655948553055,
"eval_cooking_sharegpt_test_loss": 0.534070611000061,
"eval_cooking_sharegpt_test_runtime": 29.1362,
"eval_cooking_sharegpt_test_samples_per_second": 6.864,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1900
},
{
"epoch": 0.7656752411575563,
"grad_norm": 4.311264813952778,
"learning_rate": 1.586788460251636e-06,
"loss": 0.4919,
"step": 1905
},
{
"epoch": 0.7676848874598071,
"grad_norm": 3.9822945127982186,
"learning_rate": 1.561239267963926e-06,
"loss": 0.4988,
"step": 1910
},
{
"epoch": 0.7696945337620579,
"grad_norm": 3.5862382118282814,
"learning_rate": 1.5358593268714866e-06,
"loss": 0.556,
"step": 1915
},
{
"epoch": 0.7717041800643086,
"grad_norm": 4.65685750488049,
"learning_rate": 1.5106498861410101e-06,
"loss": 0.5705,
"step": 1920
},
{
"epoch": 0.7737138263665595,
"grad_norm": 4.93082892843028,
"learning_rate": 1.4856121865473855e-06,
"loss": 0.5442,
"step": 1925
},
{
"epoch": 0.7757234726688103,
"grad_norm": 4.115014663671977,
"learning_rate": 1.460747460412637e-06,
"loss": 0.5497,
"step": 1930
},
{
"epoch": 0.7777331189710611,
"grad_norm": 3.5438779332216965,
"learning_rate": 1.4360569315452682e-06,
"loss": 0.4903,
"step": 1935
},
{
"epoch": 0.7797427652733119,
"grad_norm": 4.031371623093601,
"learning_rate": 1.4115418151800215e-06,
"loss": 0.5644,
"step": 1940
},
{
"epoch": 0.7817524115755627,
"grad_norm": 3.54725412927624,
"learning_rate": 1.3872033179180767e-06,
"loss": 0.5178,
"step": 1945
},
{
"epoch": 0.7837620578778135,
"grad_norm": 4.445227207707671,
"learning_rate": 1.363042637667652e-06,
"loss": 0.5802,
"step": 1950
},
{
"epoch": 0.7837620578778135,
"eval_cooking_sharegpt_test_loss": 0.5310518145561218,
"eval_cooking_sharegpt_test_runtime": 29.1634,
"eval_cooking_sharegpt_test_samples_per_second": 6.858,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 1950
},
{
"epoch": 0.7857717041800643,
"grad_norm": 5.019716908374287,
"learning_rate": 1.339060963585056e-06,
"loss": 0.5734,
"step": 1955
},
{
"epoch": 0.7877813504823151,
"grad_norm": 3.8522224862461147,
"learning_rate": 1.3152594760161513e-06,
"loss": 0.4906,
"step": 1960
},
{
"epoch": 0.7897909967845659,
"grad_norm": 3.6624257408254457,
"learning_rate": 1.2916393464382632e-06,
"loss": 0.4873,
"step": 1965
},
{
"epoch": 0.7918006430868167,
"grad_norm": 5.220019531776816,
"learning_rate": 1.2682017374025158e-06,
"loss": 0.5863,
"step": 1970
},
{
"epoch": 0.7938102893890675,
"grad_norm": 3.978993529598662,
"learning_rate": 1.2449478024766205e-06,
"loss": 0.4623,
"step": 1975
},
{
"epoch": 0.7958199356913184,
"grad_norm": 4.0726044586510675,
"learning_rate": 1.2218786861880937e-06,
"loss": 0.496,
"step": 1980
},
{
"epoch": 0.7978295819935691,
"grad_norm": 3.8206385965353222,
"learning_rate": 1.1989955239679279e-06,
"loss": 0.5187,
"step": 1985
},
{
"epoch": 0.7998392282958199,
"grad_norm": 3.9512193648944653,
"learning_rate": 1.1762994420947016e-06,
"loss": 0.4982,
"step": 1990
},
{
"epoch": 0.8018488745980707,
"grad_norm": 3.8053938839616412,
"learning_rate": 1.153791557639153e-06,
"loss": 0.5194,
"step": 1995
},
{
"epoch": 0.8038585209003215,
"grad_norm": 3.6251196777845616,
"learning_rate": 1.1314729784091937e-06,
"loss": 0.5537,
"step": 2000
},
{
"epoch": 0.8038585209003215,
"eval_cooking_sharegpt_test_loss": 0.5269535779953003,
"eval_cooking_sharegpt_test_runtime": 29.147,
"eval_cooking_sharegpt_test_samples_per_second": 6.862,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 2000
},
{
"epoch": 0.8058681672025724,
"grad_norm": 4.043004314873042,
"learning_rate": 1.1093448028953886e-06,
"loss": 0.4801,
"step": 2005
},
{
"epoch": 0.8078778135048231,
"grad_norm": 3.864239177756637,
"learning_rate": 1.0874081202168806e-06,
"loss": 0.4985,
"step": 2010
},
{
"epoch": 0.809887459807074,
"grad_norm": 3.822597851846882,
"learning_rate": 1.065664010067799e-06,
"loss": 0.4991,
"step": 2015
},
{
"epoch": 0.8118971061093248,
"grad_norm": 3.857276347852937,
"learning_rate": 1.0441135426641074e-06,
"loss": 0.4637,
"step": 2020
},
{
"epoch": 0.8139067524115756,
"grad_norm": 4.4337279604757,
"learning_rate": 1.0227577786909332e-06,
"loss": 0.5738,
"step": 2025
},
{
"epoch": 0.8159163987138264,
"grad_norm": 3.6706223677728254,
"learning_rate": 1.0015977692503632e-06,
"loss": 0.5243,
"step": 2030
},
{
"epoch": 0.8179260450160771,
"grad_norm": 4.517936325301277,
"learning_rate": 9.806345558097053e-07,
"loss": 0.5106,
"step": 2035
},
{
"epoch": 0.819935691318328,
"grad_norm": 4.230476927090911,
"learning_rate": 9.59869170150236e-07,
"loss": 0.5789,
"step": 2040
},
{
"epoch": 0.8219453376205788,
"grad_norm": 4.043867851008032,
"learning_rate": 9.393026343164114e-07,
"loss": 0.5238,
"step": 2045
},
{
"epoch": 0.8239549839228296,
"grad_norm": 3.433756831229721,
"learning_rate": 9.189359605655668e-07,
"loss": 0.4972,
"step": 2050
},
{
"epoch": 0.8239549839228296,
"eval_cooking_sharegpt_test_loss": 0.5249894261360168,
"eval_cooking_sharegpt_test_runtime": 29.1006,
"eval_cooking_sharegpt_test_samples_per_second": 6.873,
"eval_cooking_sharegpt_test_steps_per_second": 0.344,
"step": 2050
},
{
"epoch": 0.8259646302250804,
"grad_norm": 4.6112587559432,
"learning_rate": 8.987701513180907e-07,
"loss": 0.5356,
"step": 2055
},
{
"epoch": 0.8279742765273312,
"grad_norm": 4.050624512976509,
"learning_rate": 8.788061991080937e-07,
"loss": 0.519,
"step": 2060
},
{
"epoch": 0.829983922829582,
"grad_norm": 3.9107977100160123,
"learning_rate": 8.590450865345512e-07,
"loss": 0.5988,
"step": 2065
},
{
"epoch": 0.8319935691318328,
"grad_norm": 4.197001300269493,
"learning_rate": 8.394877862129446e-07,
"loss": 0.4833,
"step": 2070
},
{
"epoch": 0.8340032154340836,
"grad_norm": 4.289046142719884,
"learning_rate": 8.201352607273877e-07,
"loss": 0.5961,
"step": 2075
},
{
"epoch": 0.8360128617363344,
"grad_norm": 4.718556012278376,
"learning_rate": 8.009884625832531e-07,
"loss": 0.5824,
"step": 2080
},
{
"epoch": 0.8380225080385852,
"grad_norm": 4.096609919204836,
"learning_rate": 7.82048334160288e-07,
"loss": 0.5427,
"step": 2085
},
{
"epoch": 0.840032154340836,
"grad_norm": 4.906614424028583,
"learning_rate": 7.633158076662356e-07,
"loss": 0.6349,
"step": 2090
},
{
"epoch": 0.8420418006430869,
"grad_norm": 4.694926277231841,
"learning_rate": 7.447918050909453e-07,
"loss": 0.5806,
"step": 2095
},
{
"epoch": 0.8440514469453376,
"grad_norm": 3.9943595352709735,
"learning_rate": 7.264772381610041e-07,
"loss": 0.5315,
"step": 2100
},
{
"epoch": 0.8440514469453376,
"eval_cooking_sharegpt_test_loss": 0.5224404335021973,
"eval_cooking_sharegpt_test_runtime": 29.1427,
"eval_cooking_sharegpt_test_samples_per_second": 6.863,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 2100
},
{
"epoch": 0.8460610932475884,
"grad_norm": 3.8949136566294995,
"learning_rate": 7.083730082948526e-07,
"loss": 0.4789,
"step": 2105
},
{
"epoch": 0.8480707395498392,
"grad_norm": 4.010454819797958,
"learning_rate": 6.904800065584255e-07,
"loss": 0.4783,
"step": 2110
},
{
"epoch": 0.85008038585209,
"grad_norm": 4.746242279274611,
"learning_rate": 6.727991136212931e-07,
"loss": 0.546,
"step": 2115
},
{
"epoch": 0.8520900321543409,
"grad_norm": 4.283372719633206,
"learning_rate": 6.553311997133111e-07,
"loss": 0.5003,
"step": 2120
},
{
"epoch": 0.8540996784565916,
"grad_norm": 3.560283237601326,
"learning_rate": 6.380771245817957e-07,
"loss": 0.4842,
"step": 2125
},
{
"epoch": 0.8561093247588425,
"grad_norm": 3.79934997015618,
"learning_rate": 6.210377374492049e-07,
"loss": 0.4678,
"step": 2130
},
{
"epoch": 0.8581189710610932,
"grad_norm": 4.1885519690593185,
"learning_rate": 6.042138769713413e-07,
"loss": 0.5096,
"step": 2135
},
{
"epoch": 0.860128617363344,
"grad_norm": 3.97970854328499,
"learning_rate": 5.876063711960706e-07,
"loss": 0.4941,
"step": 2140
},
{
"epoch": 0.8621382636655949,
"grad_norm": 4.394448792908495,
"learning_rate": 5.712160375225756e-07,
"loss": 0.5573,
"step": 2145
},
{
"epoch": 0.8641479099678456,
"grad_norm": 4.127283021998723,
"learning_rate": 5.55043682661115e-07,
"loss": 0.5165,
"step": 2150
},
{
"epoch": 0.8641479099678456,
"eval_cooking_sharegpt_test_loss": 0.5204899311065674,
"eval_cooking_sharegpt_test_runtime": 29.1702,
"eval_cooking_sharegpt_test_samples_per_second": 6.856,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 2150
},
{
"epoch": 0.8661575562700965,
"grad_norm": 3.69665008473374,
"learning_rate": 5.39090102593326e-07,
"loss": 0.5317,
"step": 2155
},
{
"epoch": 0.8681672025723473,
"grad_norm": 3.5634773461727223,
"learning_rate": 5.233560825330387e-07,
"loss": 0.5341,
"step": 2160
},
{
"epoch": 0.8701768488745981,
"grad_norm": 3.450288752072914,
"learning_rate": 5.0784239688764e-07,
"loss": 0.4069,
"step": 2165
},
{
"epoch": 0.8721864951768489,
"grad_norm": 4.428896151960056,
"learning_rate": 4.925498092199449e-07,
"loss": 0.5154,
"step": 2170
},
{
"epoch": 0.8741961414790996,
"grad_norm": 3.6865145625805633,
"learning_rate": 4.774790722106309e-07,
"loss": 0.5408,
"step": 2175
},
{
"epoch": 0.8762057877813505,
"grad_norm": 3.8579452526033293,
"learning_rate": 4.6263092762117546e-07,
"loss": 0.5051,
"step": 2180
},
{
"epoch": 0.8782154340836013,
"grad_norm": 3.649942640889945,
"learning_rate": 4.480061062573604e-07,
"loss": 0.4879,
"step": 2185
},
{
"epoch": 0.8802250803858521,
"grad_norm": 4.4196873140991935,
"learning_rate": 4.336053279332941e-07,
"loss": 0.5404,
"step": 2190
},
{
"epoch": 0.8822347266881029,
"grad_norm": 3.8686685069961064,
"learning_rate": 4.1942930143599014e-07,
"loss": 0.4976,
"step": 2195
},
{
"epoch": 0.8842443729903537,
"grad_norm": 4.070283502525038,
"learning_rate": 4.0547872449047674e-07,
"loss": 0.5689,
"step": 2200
},
{
"epoch": 0.8842443729903537,
"eval_cooking_sharegpt_test_loss": 0.5188571810722351,
"eval_cooking_sharegpt_test_runtime": 29.1495,
"eval_cooking_sharegpt_test_samples_per_second": 6.861,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 2200
},
{
"epoch": 0.8862540192926045,
"grad_norm": 3.619269487786876,
"learning_rate": 3.917542837254562e-07,
"loss": 0.5528,
"step": 2205
},
{
"epoch": 0.8882636655948553,
"grad_norm": 3.6657937873825137,
"learning_rate": 3.7825665463951224e-07,
"loss": 0.541,
"step": 2210
},
{
"epoch": 0.8902733118971061,
"grad_norm": 3.4204803639440526,
"learning_rate": 3.649865015678622e-07,
"loss": 0.4743,
"step": 2215
},
{
"epoch": 0.8922829581993569,
"grad_norm": 4.380202502107755,
"learning_rate": 3.5194447764965887e-07,
"loss": 0.546,
"step": 2220
},
{
"epoch": 0.8942926045016077,
"grad_norm": 4.402021876559463,
"learning_rate": 3.391312247958417e-07,
"loss": 0.5446,
"step": 2225
},
{
"epoch": 0.8963022508038585,
"grad_norm": 4.747626450695015,
"learning_rate": 3.265473736575475e-07,
"loss": 0.5655,
"step": 2230
},
{
"epoch": 0.8983118971061094,
"grad_norm": 3.68468614908709,
"learning_rate": 3.141935435950644e-07,
"loss": 0.6147,
"step": 2235
},
{
"epoch": 0.9003215434083601,
"grad_norm": 3.535700858658026,
"learning_rate": 3.0207034264735756e-07,
"loss": 0.5006,
"step": 2240
},
{
"epoch": 0.9023311897106109,
"grad_norm": 3.848416834162935,
"learning_rate": 2.901783675021297e-07,
"loss": 0.5161,
"step": 2245
},
{
"epoch": 0.9043408360128617,
"grad_norm": 3.730188112771092,
"learning_rate": 2.785182034664641e-07,
"loss": 0.5191,
"step": 2250
},
{
"epoch": 0.9043408360128617,
"eval_cooking_sharegpt_test_loss": 0.5172947645187378,
"eval_cooking_sharegpt_test_runtime": 29.1343,
"eval_cooking_sharegpt_test_samples_per_second": 6.865,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 2250
},
{
"epoch": 0.9063504823151125,
"grad_norm": 4.611573430946674,
"learning_rate": 2.670904244380068e-07,
"loss": 0.5719,
"step": 2255
},
{
"epoch": 0.9083601286173634,
"grad_norm": 3.4190333394817625,
"learning_rate": 2.5589559287673205e-07,
"loss": 0.5415,
"step": 2260
},
{
"epoch": 0.9103697749196141,
"grad_norm": 3.7043226879021027,
"learning_rate": 2.4493425977724585e-07,
"loss": 0.5431,
"step": 2265
},
{
"epoch": 0.912379421221865,
"grad_norm": 3.5791803584446877,
"learning_rate": 2.3420696464167614e-07,
"loss": 0.5563,
"step": 2270
},
{
"epoch": 0.9143890675241158,
"grad_norm": 3.6102349926375292,
"learning_rate": 2.237142354531141e-07,
"loss": 0.5127,
"step": 2275
},
{
"epoch": 0.9163987138263665,
"grad_norm": 4.04764189714931,
"learning_rate": 2.1345658864962982e-07,
"loss": 0.5183,
"step": 2280
},
{
"epoch": 0.9184083601286174,
"grad_norm": 4.12380254309591,
"learning_rate": 2.0343452909885487e-07,
"loss": 0.5244,
"step": 2285
},
{
"epoch": 0.9204180064308681,
"grad_norm": 2.9917916489817657,
"learning_rate": 1.9364855007313e-07,
"loss": 0.6096,
"step": 2290
},
{
"epoch": 0.922427652733119,
"grad_norm": 4.666115781878385,
"learning_rate": 1.84099133225229e-07,
"loss": 0.5603,
"step": 2295
},
{
"epoch": 0.9244372990353698,
"grad_norm": 4.1461620270350545,
"learning_rate": 1.747867485646537e-07,
"loss": 0.4639,
"step": 2300
},
{
"epoch": 0.9244372990353698,
"eval_cooking_sharegpt_test_loss": 0.5164940357208252,
"eval_cooking_sharegpt_test_runtime": 29.1503,
"eval_cooking_sharegpt_test_samples_per_second": 6.861,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 2300
},
{
"epoch": 0.9264469453376206,
"grad_norm": 3.424368627441663,
"learning_rate": 1.6571185443449934e-07,
"loss": 0.4837,
"step": 2305
},
{
"epoch": 0.9284565916398714,
"grad_norm": 3.3861373326786874,
"learning_rate": 1.5687489748889228e-07,
"loss": 0.5225,
"step": 2310
},
{
"epoch": 0.9304662379421221,
"grad_norm": 4.0432557223167365,
"learning_rate": 1.482763126710135e-07,
"loss": 0.505,
"step": 2315
},
{
"epoch": 0.932475884244373,
"grad_norm": 4.149053064263076,
"learning_rate": 1.3991652319168436e-07,
"loss": 0.5735,
"step": 2320
},
{
"epoch": 0.9344855305466238,
"grad_norm": 4.436444939575187,
"learning_rate": 1.3179594050854227e-07,
"loss": 0.572,
"step": 2325
},
{
"epoch": 0.9364951768488746,
"grad_norm": 4.509769479123379,
"learning_rate": 1.239149643057841e-07,
"loss": 0.4864,
"step": 2330
},
{
"epoch": 0.9385048231511254,
"grad_norm": 4.668035022637942,
"learning_rate": 1.1627398247449906e-07,
"loss": 0.5206,
"step": 2335
},
{
"epoch": 0.9405144694533762,
"grad_norm": 3.9506527567648697,
"learning_rate": 1.08873371093573e-07,
"loss": 0.6029,
"step": 2340
},
{
"epoch": 0.942524115755627,
"grad_norm": 4.230839087401455,
"learning_rate": 1.017134944111814e-07,
"loss": 0.5838,
"step": 2345
},
{
"epoch": 0.9445337620578779,
"grad_norm": 4.151545046693275,
"learning_rate": 9.479470482686048e-08,
"loss": 0.487,
"step": 2350
},
{
"epoch": 0.9445337620578779,
"eval_cooking_sharegpt_test_loss": 0.5156686305999756,
"eval_cooking_sharegpt_test_runtime": 29.1666,
"eval_cooking_sharegpt_test_samples_per_second": 6.857,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 2350
},
{
"epoch": 0.9465434083601286,
"grad_norm": 4.142210605255353,
"learning_rate": 8.811734287416274e-08,
"loss": 0.5335,
"step": 2355
},
{
"epoch": 0.9485530546623794,
"grad_norm": 3.850811332598397,
"learning_rate": 8.168173720389472e-08,
"loss": 0.5186,
"step": 2360
},
{
"epoch": 0.9505627009646302,
"grad_norm": 3.597795214532193,
"learning_rate": 7.548820456794448e-08,
"loss": 0.4817,
"step": 2365
},
{
"epoch": 0.952572347266881,
"grad_norm": 3.8186393981188114,
"learning_rate": 6.953704980368958e-08,
"loss": 0.4579,
"step": 2370
},
{
"epoch": 0.9545819935691319,
"grad_norm": 4.174150444604705,
"learning_rate": 6.382856581899133e-08,
"loss": 0.5271,
"step": 2375
},
{
"epoch": 0.9565916398713826,
"grad_norm": 4.2074795390285225,
"learning_rate": 5.8363033577784055e-08,
"loss": 0.5917,
"step": 2380
},
{
"epoch": 0.9586012861736335,
"grad_norm": 3.9305248224525404,
"learning_rate": 5.314072208623844e-08,
"loss": 0.4585,
"step": 2385
},
{
"epoch": 0.9606109324758842,
"grad_norm": 3.9808292965527037,
"learning_rate": 4.81618883795304e-08,
"loss": 0.4707,
"step": 2390
},
{
"epoch": 0.962620578778135,
"grad_norm": 3.753591113764896,
"learning_rate": 4.342677750918178e-08,
"loss": 0.5422,
"step": 2395
},
{
"epoch": 0.9646302250803859,
"grad_norm": 3.6932078456444373,
"learning_rate": 3.8935622531006136e-08,
"loss": 0.4571,
"step": 2400
},
{
"epoch": 0.9646302250803859,
"eval_cooking_sharegpt_test_loss": 0.5150659680366516,
"eval_cooking_sharegpt_test_runtime": 29.1762,
"eval_cooking_sharegpt_test_samples_per_second": 6.855,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 2400
},
{
"epoch": 0.9666398713826366,
"grad_norm": 3.7902631860251472,
"learning_rate": 3.468864449363119e-08,
"loss": 0.472,
"step": 2405
},
{
"epoch": 0.9686495176848875,
"grad_norm": 4.873378997207432,
"learning_rate": 3.0686052427626454e-08,
"loss": 0.5863,
"step": 2410
},
{
"epoch": 0.9706591639871383,
"grad_norm": 3.41955309662197,
"learning_rate": 2.692804333520982e-08,
"loss": 0.5606,
"step": 2415
},
{
"epoch": 0.9726688102893891,
"grad_norm": 3.9261487092055645,
"learning_rate": 2.341480218055303e-08,
"loss": 0.5137,
"step": 2420
},
{
"epoch": 0.9746784565916399,
"grad_norm": 3.908466740159429,
"learning_rate": 2.014650188067735e-08,
"loss": 0.5444,
"step": 2425
},
{
"epoch": 0.9766881028938906,
"grad_norm": 4.095217894179939,
"learning_rate": 1.7123303296944226e-08,
"loss": 0.5302,
"step": 2430
},
{
"epoch": 0.9786977491961415,
"grad_norm": 3.3767195966017485,
"learning_rate": 1.4345355227137203e-08,
"loss": 0.5031,
"step": 2435
},
{
"epoch": 0.9807073954983923,
"grad_norm": 3.8928483675929337,
"learning_rate": 1.1812794398137762e-08,
"loss": 0.5233,
"step": 2440
},
{
"epoch": 0.9827170418006431,
"grad_norm": 4.410721124785433,
"learning_rate": 9.525745459195712e-09,
"loss": 0.5351,
"step": 2445
},
{
"epoch": 0.9847266881028939,
"grad_norm": 4.367071424030973,
"learning_rate": 7.484320975795766e-09,
"loss": 0.45,
"step": 2450
},
{
"epoch": 0.9847266881028939,
"eval_cooking_sharegpt_test_loss": 0.5149813294410706,
"eval_cooking_sharegpt_test_runtime": 29.1432,
"eval_cooking_sharegpt_test_samples_per_second": 6.863,
"eval_cooking_sharegpt_test_steps_per_second": 0.343,
"step": 2450
},
{
"epoch": 0.9867363344051447,
"grad_norm": 4.348603146645124,
"learning_rate": 5.688621424115304e-09,
"loss": 0.4929,
"step": 2455
},
{
"epoch": 0.9887459807073955,
"grad_norm": 3.4464338928700866,
"learning_rate": 4.1387351860799894e-09,
"loss": 0.5602,
"step": 2460
},
{
"epoch": 0.9907556270096463,
"grad_norm": 4.117129096266541,
"learning_rate": 2.8347385450133715e-09,
"loss": 0.4746,
"step": 2465
},
{
"epoch": 0.9927652733118971,
"grad_norm": 3.7130286969995274,
"learning_rate": 1.7766956818832116e-09,
"loss": 0.4795,
"step": 2470
},
{
"epoch": 0.9947749196141479,
"grad_norm": 3.8454728155367164,
"learning_rate": 9.646586721412388e-10,
"loss": 0.5535,
"step": 2475
},
{
"epoch": 0.9967845659163987,
"grad_norm": 3.694656061423026,
"learning_rate": 3.986674831607529e-10,
"loss": 0.5087,
"step": 2480
},
{
"epoch": 0.9987942122186495,
"grad_norm": 4.211180713946689,
"learning_rate": 7.87499722693097e-11,
"loss": 0.4456,
"step": 2485
},
{
"epoch": 1.0,
"step": 2488,
"total_flos": 19663349465088.0,
"train_loss": 0.6474186228019249,
"train_runtime": 24049.0144,
"train_samples_per_second": 0.827,
"train_steps_per_second": 0.103
}
],
"logging_steps": 5,
"max_steps": 2488,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 19663349465088.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}