{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 2488, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020096463022508037, "grad_norm": 25.43042279854588, "learning_rate": 1.6064257028112448e-07, "loss": 1.5516, "step": 5 }, { "epoch": 0.0040192926045016075, "grad_norm": 32.55468907279309, "learning_rate": 3.614457831325301e-07, "loss": 1.6285, "step": 10 }, { "epoch": 0.006028938906752411, "grad_norm": 22.457510712636555, "learning_rate": 5.622489959839358e-07, "loss": 1.544, "step": 15 }, { "epoch": 0.008038585209003215, "grad_norm": 12.994153155174507, "learning_rate": 7.630522088353415e-07, "loss": 1.4871, "step": 20 }, { "epoch": 0.01004823151125402, "grad_norm": 13.10275994035317, "learning_rate": 9.638554216867472e-07, "loss": 1.3155, "step": 25 }, { "epoch": 0.012057877813504822, "grad_norm": 9.886808975385357, "learning_rate": 1.1646586345381528e-06, "loss": 1.2181, "step": 30 }, { "epoch": 0.014067524115755627, "grad_norm": 9.765195243051854, "learning_rate": 1.3654618473895584e-06, "loss": 1.1915, "step": 35 }, { "epoch": 0.01607717041800643, "grad_norm": 10.076388693447448, "learning_rate": 1.566265060240964e-06, "loss": 1.1387, "step": 40 }, { "epoch": 0.018086816720257234, "grad_norm": 9.74974655232714, "learning_rate": 1.7670682730923696e-06, "loss": 1.0759, "step": 45 }, { "epoch": 0.02009646302250804, "grad_norm": 8.925268910533704, "learning_rate": 1.967871485943775e-06, "loss": 1.0175, "step": 50 }, { "epoch": 0.02009646302250804, "eval_cooking_sharegpt_test_loss": 0.9884688854217529, "eval_cooking_sharegpt_test_runtime": 29.6069, "eval_cooking_sharegpt_test_samples_per_second": 6.755, "eval_cooking_sharegpt_test_steps_per_second": 0.338, "step": 50 }, { "epoch": 0.022106109324758844, "grad_norm": 9.02573766663436, "learning_rate": 2.168674698795181e-06, "loss": 0.9542, "step": 55 }, { "epoch": 0.024115755627009645, "grad_norm": 9.819497852533047, "learning_rate": 2.3694779116465868e-06, "loss": 0.9784, "step": 60 }, { "epoch": 0.02612540192926045, "grad_norm": 10.82311981416087, "learning_rate": 2.5702811244979918e-06, "loss": 1.0231, "step": 65 }, { "epoch": 0.028135048231511254, "grad_norm": 8.964625514542233, "learning_rate": 2.771084337349398e-06, "loss": 0.8573, "step": 70 }, { "epoch": 0.03014469453376206, "grad_norm": 8.532301605077798, "learning_rate": 2.9718875502008034e-06, "loss": 0.9551, "step": 75 }, { "epoch": 0.03215434083601286, "grad_norm": 9.990828654438014, "learning_rate": 3.172690763052209e-06, "loss": 0.9182, "step": 80 }, { "epoch": 0.034163987138263664, "grad_norm": 6.995692527275145, "learning_rate": 3.3734939759036146e-06, "loss": 0.8639, "step": 85 }, { "epoch": 0.03617363344051447, "grad_norm": 7.768080152065188, "learning_rate": 3.5742971887550204e-06, "loss": 0.8521, "step": 90 }, { "epoch": 0.03818327974276527, "grad_norm": 8.012119974468852, "learning_rate": 3.7751004016064258e-06, "loss": 0.8477, "step": 95 }, { "epoch": 0.04019292604501608, "grad_norm": 8.50117811151367, "learning_rate": 3.975903614457832e-06, "loss": 0.8473, "step": 100 }, { "epoch": 0.04019292604501608, "eval_cooking_sharegpt_test_loss": 0.8732408285140991, "eval_cooking_sharegpt_test_runtime": 29.1007, "eval_cooking_sharegpt_test_samples_per_second": 6.873, "eval_cooking_sharegpt_test_steps_per_second": 0.344, "step": 100 }, { "epoch": 0.04220257234726688, "grad_norm": 8.560443518575713, "learning_rate": 4.176706827309237e-06, "loss": 0.8652, "step": 105 }, { "epoch": 0.04421221864951769, "grad_norm": 9.719112811630923, "learning_rate": 4.377510040160643e-06, "loss": 0.8941, "step": 110 }, { "epoch": 0.04622186495176849, "grad_norm": 9.985676476471362, "learning_rate": 4.578313253012049e-06, "loss": 0.8859, "step": 115 }, { "epoch": 0.04823151125401929, "grad_norm": 8.414088670486853, "learning_rate": 4.779116465863454e-06, "loss": 0.8043, "step": 120 }, { "epoch": 0.050241157556270094, "grad_norm": 8.096501738165966, "learning_rate": 4.979919678714859e-06, "loss": 0.8565, "step": 125 }, { "epoch": 0.0522508038585209, "grad_norm": 9.410994081814192, "learning_rate": 5.180722891566266e-06, "loss": 0.9358, "step": 130 }, { "epoch": 0.0542604501607717, "grad_norm": 7.421809482089455, "learning_rate": 5.381526104417672e-06, "loss": 0.8336, "step": 135 }, { "epoch": 0.05627009646302251, "grad_norm": 8.907995665308611, "learning_rate": 5.582329317269076e-06, "loss": 0.8398, "step": 140 }, { "epoch": 0.05827974276527331, "grad_norm": 7.103996712375502, "learning_rate": 5.783132530120482e-06, "loss": 0.8702, "step": 145 }, { "epoch": 0.06028938906752412, "grad_norm": 8.485504235075577, "learning_rate": 5.983935742971888e-06, "loss": 0.8542, "step": 150 }, { "epoch": 0.06028938906752412, "eval_cooking_sharegpt_test_loss": 0.8469827175140381, "eval_cooking_sharegpt_test_runtime": 29.1354, "eval_cooking_sharegpt_test_samples_per_second": 6.864, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 150 }, { "epoch": 0.06229903536977492, "grad_norm": 7.565513220130896, "learning_rate": 6.184738955823294e-06, "loss": 0.8558, "step": 155 }, { "epoch": 0.06430868167202572, "grad_norm": 9.7278611728726, "learning_rate": 6.385542168674699e-06, "loss": 0.9122, "step": 160 }, { "epoch": 0.06631832797427653, "grad_norm": 8.171433737702468, "learning_rate": 6.586345381526105e-06, "loss": 0.8225, "step": 165 }, { "epoch": 0.06832797427652733, "grad_norm": 7.424345320287168, "learning_rate": 6.78714859437751e-06, "loss": 0.838, "step": 170 }, { "epoch": 0.07033762057877814, "grad_norm": 6.778825035197842, "learning_rate": 6.987951807228917e-06, "loss": 0.7698, "step": 175 }, { "epoch": 0.07234726688102894, "grad_norm": 8.160040638569848, "learning_rate": 7.188755020080321e-06, "loss": 0.8443, "step": 180 }, { "epoch": 0.07435691318327975, "grad_norm": 7.56314606717551, "learning_rate": 7.389558232931727e-06, "loss": 0.7953, "step": 185 }, { "epoch": 0.07636655948553055, "grad_norm": 7.860496549752045, "learning_rate": 7.590361445783133e-06, "loss": 0.8839, "step": 190 }, { "epoch": 0.07837620578778134, "grad_norm": 6.887754234302554, "learning_rate": 7.79116465863454e-06, "loss": 0.796, "step": 195 }, { "epoch": 0.08038585209003216, "grad_norm": 7.785553062894794, "learning_rate": 7.991967871485944e-06, "loss": 0.8336, "step": 200 }, { "epoch": 0.08038585209003216, "eval_cooking_sharegpt_test_loss": 0.8259029984474182, "eval_cooking_sharegpt_test_runtime": 29.1235, "eval_cooking_sharegpt_test_samples_per_second": 6.867, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 200 }, { "epoch": 0.08239549839228295, "grad_norm": 6.994614576677781, "learning_rate": 8.19277108433735e-06, "loss": 0.8181, "step": 205 }, { "epoch": 0.08440514469453377, "grad_norm": 7.772788858949606, "learning_rate": 8.393574297188756e-06, "loss": 0.8408, "step": 210 }, { "epoch": 0.08641479099678456, "grad_norm": 7.254386585395993, "learning_rate": 8.594377510040161e-06, "loss": 0.9085, "step": 215 }, { "epoch": 0.08842443729903537, "grad_norm": 7.026321018704356, "learning_rate": 8.795180722891567e-06, "loss": 0.8782, "step": 220 }, { "epoch": 0.09043408360128617, "grad_norm": 7.051507761624435, "learning_rate": 8.995983935742972e-06, "loss": 0.9409, "step": 225 }, { "epoch": 0.09244372990353698, "grad_norm": 7.606219516981015, "learning_rate": 9.196787148594378e-06, "loss": 0.8555, "step": 230 }, { "epoch": 0.09445337620578778, "grad_norm": 6.28124366103456, "learning_rate": 9.397590361445785e-06, "loss": 0.7534, "step": 235 }, { "epoch": 0.09646302250803858, "grad_norm": 8.394942968125275, "learning_rate": 9.598393574297188e-06, "loss": 0.8402, "step": 240 }, { "epoch": 0.09847266881028939, "grad_norm": 7.156441873531409, "learning_rate": 9.799196787148595e-06, "loss": 0.8081, "step": 245 }, { "epoch": 0.10048231511254019, "grad_norm": 7.327698916304687, "learning_rate": 1e-05, "loss": 0.7723, "step": 250 }, { "epoch": 0.10048231511254019, "eval_cooking_sharegpt_test_loss": 0.8293350338935852, "eval_cooking_sharegpt_test_runtime": 29.0949, "eval_cooking_sharegpt_test_samples_per_second": 6.874, "eval_cooking_sharegpt_test_steps_per_second": 0.344, "step": 250 }, { "epoch": 0.102491961414791, "grad_norm": 7.4235500079351375, "learning_rate": 9.999876953350016e-06, "loss": 0.8151, "step": 255 }, { "epoch": 0.1045016077170418, "grad_norm": 7.084895816067402, "learning_rate": 9.999507819456254e-06, "loss": 0.7621, "step": 260 }, { "epoch": 0.10651125401929261, "grad_norm": 6.151331764138221, "learning_rate": 9.998892616486991e-06, "loss": 0.803, "step": 265 }, { "epoch": 0.1085209003215434, "grad_norm": 7.692005262695669, "learning_rate": 9.99803137472169e-06, "loss": 0.7968, "step": 270 }, { "epoch": 0.11053054662379422, "grad_norm": 7.296616213551366, "learning_rate": 9.996924136549519e-06, "loss": 0.8934, "step": 275 }, { "epoch": 0.11254019292604502, "grad_norm": 6.78581453909987, "learning_rate": 9.995570956467257e-06, "loss": 0.8168, "step": 280 }, { "epoch": 0.11454983922829581, "grad_norm": 6.629376557450756, "learning_rate": 9.993971901076614e-06, "loss": 0.8536, "step": 285 }, { "epoch": 0.11655948553054662, "grad_norm": 6.125893044121695, "learning_rate": 9.992127049080952e-06, "loss": 0.8304, "step": 290 }, { "epoch": 0.11856913183279742, "grad_norm": 6.3713359597576416, "learning_rate": 9.990036491281418e-06, "loss": 0.8069, "step": 295 }, { "epoch": 0.12057877813504823, "grad_norm": 6.375162182116364, "learning_rate": 9.98770033057246e-06, "loss": 0.8101, "step": 300 }, { "epoch": 0.12057877813504823, "eval_cooking_sharegpt_test_loss": 0.797538161277771, "eval_cooking_sharegpt_test_runtime": 29.1177, "eval_cooking_sharegpt_test_samples_per_second": 6.869, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 300 }, { "epoch": 0.12258842443729903, "grad_norm": 7.814959047420182, "learning_rate": 9.985118681936783e-06, "loss": 0.8315, "step": 305 }, { "epoch": 0.12459807073954984, "grad_norm": 6.8494733103452194, "learning_rate": 9.982291672439671e-06, "loss": 0.7654, "step": 310 }, { "epoch": 0.12660771704180065, "grad_norm": 5.286224086045848, "learning_rate": 9.979219441222743e-06, "loss": 0.7776, "step": 315 }, { "epoch": 0.12861736334405144, "grad_norm": 6.103426753817979, "learning_rate": 9.975902139497105e-06, "loss": 0.815, "step": 320 }, { "epoch": 0.13062700964630225, "grad_norm": 5.520317998483916, "learning_rate": 9.972339930535897e-06, "loss": 0.813, "step": 325 }, { "epoch": 0.13263665594855306, "grad_norm": 5.9863235363550045, "learning_rate": 9.968532989666277e-06, "loss": 0.7504, "step": 330 }, { "epoch": 0.13464630225080385, "grad_norm": 7.212782518237145, "learning_rate": 9.96448150426077e-06, "loss": 0.8715, "step": 335 }, { "epoch": 0.13665594855305466, "grad_norm": 5.912590773720034, "learning_rate": 9.96018567372806e-06, "loss": 0.7558, "step": 340 }, { "epoch": 0.13866559485530547, "grad_norm": 5.77435813010867, "learning_rate": 9.95564570950317e-06, "loss": 0.779, "step": 345 }, { "epoch": 0.14067524115755628, "grad_norm": 6.505458706803495, "learning_rate": 9.950861835037053e-06, "loss": 0.8514, "step": 350 }, { "epoch": 0.14067524115755628, "eval_cooking_sharegpt_test_loss": 0.7728434205055237, "eval_cooking_sharegpt_test_runtime": 29.1638, "eval_cooking_sharegpt_test_samples_per_second": 6.858, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 350 }, { "epoch": 0.14268488745980706, "grad_norm": 5.6452446665378115, "learning_rate": 9.945834285785601e-06, "loss": 0.6856, "step": 355 }, { "epoch": 0.14469453376205788, "grad_norm": 5.6265260781387365, "learning_rate": 9.94056330919805e-06, "loss": 0.8083, "step": 360 }, { "epoch": 0.1467041800643087, "grad_norm": 5.368904478416109, "learning_rate": 9.935049164704809e-06, "loss": 0.6928, "step": 365 }, { "epoch": 0.1487138263665595, "grad_norm": 7.881458451902342, "learning_rate": 9.929292123704677e-06, "loss": 0.7741, "step": 370 }, { "epoch": 0.15072347266881028, "grad_norm": 6.532061219960651, "learning_rate": 9.923292469551498e-06, "loss": 0.8097, "step": 375 }, { "epoch": 0.1527331189710611, "grad_norm": 6.450102699245797, "learning_rate": 9.91705049754021e-06, "loss": 0.8684, "step": 380 }, { "epoch": 0.1547427652733119, "grad_norm": 5.574142168212173, "learning_rate": 9.910566514892311e-06, "loss": 0.7809, "step": 385 }, { "epoch": 0.1567524115755627, "grad_norm": 6.795118484011259, "learning_rate": 9.903840840740739e-06, "loss": 0.8092, "step": 390 }, { "epoch": 0.1587620578778135, "grad_norm": 6.521136743595867, "learning_rate": 9.896873806114164e-06, "loss": 0.7888, "step": 395 }, { "epoch": 0.1607717041800643, "grad_norm": 6.632901741168955, "learning_rate": 9.889665753920693e-06, "loss": 0.7539, "step": 400 }, { "epoch": 0.1607717041800643, "eval_cooking_sharegpt_test_loss": 0.7521212697029114, "eval_cooking_sharegpt_test_runtime": 29.1522, "eval_cooking_sharegpt_test_samples_per_second": 6.861, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 400 }, { "epoch": 0.16278135048231512, "grad_norm": 5.843964500525819, "learning_rate": 9.882217038930996e-06, "loss": 0.7583, "step": 405 }, { "epoch": 0.1647909967845659, "grad_norm": 5.922756974819583, "learning_rate": 9.874528027760844e-06, "loss": 0.7904, "step": 410 }, { "epoch": 0.16680064308681672, "grad_norm": 5.184809873689736, "learning_rate": 9.866599098853065e-06, "loss": 0.6878, "step": 415 }, { "epoch": 0.16881028938906753, "grad_norm": 6.0072795392775875, "learning_rate": 9.858430642458911e-06, "loss": 0.7625, "step": 420 }, { "epoch": 0.17081993569131831, "grad_norm": 6.3898364197395505, "learning_rate": 9.850023060618865e-06, "loss": 0.8075, "step": 425 }, { "epoch": 0.17282958199356913, "grad_norm": 5.4615901982761725, "learning_rate": 9.841376767142836e-06, "loss": 0.7334, "step": 430 }, { "epoch": 0.17483922829581994, "grad_norm": 5.072280085120411, "learning_rate": 9.832492187589803e-06, "loss": 0.7006, "step": 435 }, { "epoch": 0.17684887459807075, "grad_norm": 4.902329337695089, "learning_rate": 9.823369759246866e-06, "loss": 0.7779, "step": 440 }, { "epoch": 0.17885852090032153, "grad_norm": 5.961804769639084, "learning_rate": 9.814009931107724e-06, "loss": 0.7983, "step": 445 }, { "epoch": 0.18086816720257234, "grad_norm": 4.8028511364670115, "learning_rate": 9.804413163850578e-06, "loss": 0.6964, "step": 450 }, { "epoch": 0.18086816720257234, "eval_cooking_sharegpt_test_loss": 0.7369500994682312, "eval_cooking_sharegpt_test_runtime": 29.1063, "eval_cooking_sharegpt_test_samples_per_second": 6.871, "eval_cooking_sharegpt_test_steps_per_second": 0.344, "step": 450 }, { "epoch": 0.18287781350482316, "grad_norm": 4.8914718117824405, "learning_rate": 9.79457992981545e-06, "loss": 0.7367, "step": 455 }, { "epoch": 0.18488745980707397, "grad_norm": 5.402352390784586, "learning_rate": 9.784510712980944e-06, "loss": 0.6798, "step": 460 }, { "epoch": 0.18689710610932475, "grad_norm": 5.701737276259646, "learning_rate": 9.774206008940418e-06, "loss": 0.7226, "step": 465 }, { "epoch": 0.18890675241157556, "grad_norm": 6.263433603685716, "learning_rate": 9.7636663248776e-06, "loss": 0.8274, "step": 470 }, { "epoch": 0.19091639871382637, "grad_norm": 5.312503789863195, "learning_rate": 9.75289217954161e-06, "loss": 0.7734, "step": 475 }, { "epoch": 0.19292604501607716, "grad_norm": 5.156337860674348, "learning_rate": 9.741884103221451e-06, "loss": 0.7659, "step": 480 }, { "epoch": 0.19493569131832797, "grad_norm": 6.800003297542305, "learning_rate": 9.730642637719884e-06, "loss": 0.7985, "step": 485 }, { "epoch": 0.19694533762057878, "grad_norm": 5.800498897893185, "learning_rate": 9.71916833632678e-06, "loss": 0.7221, "step": 490 }, { "epoch": 0.1989549839228296, "grad_norm": 4.793819929790139, "learning_rate": 9.707461763791879e-06, "loss": 0.715, "step": 495 }, { "epoch": 0.20096463022508038, "grad_norm": 5.1410323292838065, "learning_rate": 9.69552349629699e-06, "loss": 0.7628, "step": 500 }, { "epoch": 0.20096463022508038, "eval_cooking_sharegpt_test_loss": 0.7155391573905945, "eval_cooking_sharegpt_test_runtime": 29.1357, "eval_cooking_sharegpt_test_samples_per_second": 6.864, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 500 }, { "epoch": 0.2029742765273312, "grad_norm": 4.650244956653454, "learning_rate": 9.683354121427645e-06, "loss": 0.6865, "step": 505 }, { "epoch": 0.204983922829582, "grad_norm": 5.005060659354274, "learning_rate": 9.670954238144165e-06, "loss": 0.7376, "step": 510 }, { "epoch": 0.2069935691318328, "grad_norm": 4.9392891385103495, "learning_rate": 9.658324456752194e-06, "loss": 0.6808, "step": 515 }, { "epoch": 0.2090032154340836, "grad_norm": 5.07290134380499, "learning_rate": 9.645465398872645e-06, "loss": 0.6335, "step": 520 }, { "epoch": 0.2110128617363344, "grad_norm": 6.372463646337311, "learning_rate": 9.632377697411114e-06, "loss": 0.7125, "step": 525 }, { "epoch": 0.21302250803858522, "grad_norm": 4.8784584544005565, "learning_rate": 9.619061996526735e-06, "loss": 0.7647, "step": 530 }, { "epoch": 0.215032154340836, "grad_norm": 4.991918572196447, "learning_rate": 9.605518951600456e-06, "loss": 0.7159, "step": 535 }, { "epoch": 0.2170418006430868, "grad_norm": 4.7469831719019, "learning_rate": 9.591749229202805e-06, "loss": 0.8187, "step": 540 }, { "epoch": 0.21905144694533762, "grad_norm": 5.013038664469992, "learning_rate": 9.577753507061063e-06, "loss": 0.7215, "step": 545 }, { "epoch": 0.22106109324758844, "grad_norm": 4.814679760540896, "learning_rate": 9.563532474025922e-06, "loss": 0.6789, "step": 550 }, { "epoch": 0.22106109324758844, "eval_cooking_sharegpt_test_loss": 0.706028163433075, "eval_cooking_sharegpt_test_runtime": 29.1544, "eval_cooking_sharegpt_test_samples_per_second": 6.86, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 550 }, { "epoch": 0.22307073954983922, "grad_norm": 5.071008410167568, "learning_rate": 9.549086830037573e-06, "loss": 0.7722, "step": 555 }, { "epoch": 0.22508038585209003, "grad_norm": 4.819320792169161, "learning_rate": 9.534417286091254e-06, "loss": 0.6459, "step": 560 }, { "epoch": 0.22709003215434084, "grad_norm": 5.805791958928098, "learning_rate": 9.519524564202261e-06, "loss": 0.7018, "step": 565 }, { "epoch": 0.22909967845659163, "grad_norm": 5.261658646649239, "learning_rate": 9.50440939737041e-06, "loss": 0.7692, "step": 570 }, { "epoch": 0.23110932475884244, "grad_norm": 6.656814899492019, "learning_rate": 9.489072529543955e-06, "loss": 0.8188, "step": 575 }, { "epoch": 0.23311897106109325, "grad_norm": 5.482276123459045, "learning_rate": 9.473514715582982e-06, "loss": 0.727, "step": 580 }, { "epoch": 0.23512861736334406, "grad_norm": 5.089480873020227, "learning_rate": 9.457736721222245e-06, "loss": 0.7129, "step": 585 }, { "epoch": 0.23713826366559485, "grad_norm": 4.422268494779517, "learning_rate": 9.441739323033485e-06, "loss": 0.6732, "step": 590 }, { "epoch": 0.23914790996784566, "grad_norm": 5.561830088062397, "learning_rate": 9.425523308387203e-06, "loss": 0.625, "step": 595 }, { "epoch": 0.24115755627009647, "grad_norm": 5.241025479724879, "learning_rate": 9.409089475413912e-06, "loss": 0.743, "step": 600 }, { "epoch": 0.24115755627009647, "eval_cooking_sharegpt_test_loss": 0.7122946381568909, "eval_cooking_sharegpt_test_runtime": 29.1346, "eval_cooking_sharegpt_test_samples_per_second": 6.865, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 600 }, { "epoch": 0.24316720257234728, "grad_norm": 5.9963960390112945, "learning_rate": 9.392438632964847e-06, "loss": 0.7522, "step": 605 }, { "epoch": 0.24517684887459806, "grad_norm": 5.407942176340359, "learning_rate": 9.375571600572165e-06, "loss": 0.7116, "step": 610 }, { "epoch": 0.24718649517684887, "grad_norm": 5.318941810734261, "learning_rate": 9.358489208408594e-06, "loss": 0.7307, "step": 615 }, { "epoch": 0.2491961414790997, "grad_norm": 4.610264529077515, "learning_rate": 9.341192297246588e-06, "loss": 0.7274, "step": 620 }, { "epoch": 0.2512057877813505, "grad_norm": 4.432769811479363, "learning_rate": 9.323681718416937e-06, "loss": 0.6281, "step": 625 }, { "epoch": 0.2532154340836013, "grad_norm": 4.6735771702678965, "learning_rate": 9.305958333766867e-06, "loss": 0.6655, "step": 630 }, { "epoch": 0.25522508038585207, "grad_norm": 6.091308209571449, "learning_rate": 9.288023015617618e-06, "loss": 0.7275, "step": 635 }, { "epoch": 0.2572347266881029, "grad_norm": 4.6463773174566, "learning_rate": 9.269876646721519e-06, "loss": 0.6827, "step": 640 }, { "epoch": 0.2592443729903537, "grad_norm": 4.452810636438574, "learning_rate": 9.251520120218528e-06, "loss": 0.6883, "step": 645 }, { "epoch": 0.2612540192926045, "grad_norm": 6.398787653321348, "learning_rate": 9.232954339592285e-06, "loss": 0.7807, "step": 650 }, { "epoch": 0.2612540192926045, "eval_cooking_sharegpt_test_loss": 0.6995799541473389, "eval_cooking_sharegpt_test_runtime": 29.1239, "eval_cooking_sharegpt_test_samples_per_second": 6.867, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 650 }, { "epoch": 0.2632636655948553, "grad_norm": 5.489052927932185, "learning_rate": 9.214180218625632e-06, "loss": 0.6858, "step": 655 }, { "epoch": 0.2652733118971061, "grad_norm": 5.330659230842556, "learning_rate": 9.195198681355647e-06, "loss": 0.711, "step": 660 }, { "epoch": 0.26728295819935693, "grad_norm": 5.581942505955761, "learning_rate": 9.176010662028157e-06, "loss": 0.6628, "step": 665 }, { "epoch": 0.2692926045016077, "grad_norm": 5.131847371991602, "learning_rate": 9.156617105051763e-06, "loss": 0.679, "step": 670 }, { "epoch": 0.2713022508038585, "grad_norm": 4.7770105092526896, "learning_rate": 9.13701896495135e-06, "loss": 0.7146, "step": 675 }, { "epoch": 0.2733118971061093, "grad_norm": 4.982534700495619, "learning_rate": 9.117217206321113e-06, "loss": 0.7721, "step": 680 }, { "epoch": 0.2753215434083601, "grad_norm": 4.798649936226053, "learning_rate": 9.09721280377708e-06, "loss": 0.7748, "step": 685 }, { "epoch": 0.27733118971061094, "grad_norm": 4.601964700694306, "learning_rate": 9.077006741909133e-06, "loss": 0.7435, "step": 690 }, { "epoch": 0.27934083601286175, "grad_norm": 5.566494391677513, "learning_rate": 9.056600015232567e-06, "loss": 0.6952, "step": 695 }, { "epoch": 0.28135048231511256, "grad_norm": 5.727808059117751, "learning_rate": 9.035993628139117e-06, "loss": 0.6711, "step": 700 }, { "epoch": 0.28135048231511256, "eval_cooking_sharegpt_test_loss": 0.6903340220451355, "eval_cooking_sharegpt_test_runtime": 29.1049, "eval_cooking_sharegpt_test_samples_per_second": 6.872, "eval_cooking_sharegpt_test_steps_per_second": 0.344, "step": 700 }, { "epoch": 0.28336012861736337, "grad_norm": 3.8470896481653494, "learning_rate": 9.01518859484755e-06, "loss": 0.6729, "step": 705 }, { "epoch": 0.2853697749196141, "grad_norm": 4.87977503518934, "learning_rate": 8.99418593935372e-06, "loss": 0.6863, "step": 710 }, { "epoch": 0.28737942122186494, "grad_norm": 4.674016276471173, "learning_rate": 8.972986695380189e-06, "loss": 0.6651, "step": 715 }, { "epoch": 0.28938906752411575, "grad_norm": 5.15746212298852, "learning_rate": 8.95159190632534e-06, "loss": 0.6642, "step": 720 }, { "epoch": 0.29139871382636656, "grad_norm": 5.180219693756491, "learning_rate": 8.930002625212018e-06, "loss": 0.6115, "step": 725 }, { "epoch": 0.2934083601286174, "grad_norm": 4.6310292453141315, "learning_rate": 8.908219914635711e-06, "loss": 0.7092, "step": 730 }, { "epoch": 0.2954180064308682, "grad_norm": 5.683343490286694, "learning_rate": 8.886244846712245e-06, "loss": 0.7257, "step": 735 }, { "epoch": 0.297427652733119, "grad_norm": 4.9104420352970015, "learning_rate": 8.864078503025017e-06, "loss": 0.7523, "step": 740 }, { "epoch": 0.29943729903536975, "grad_norm": 4.821845146152399, "learning_rate": 8.841721974571758e-06, "loss": 0.6734, "step": 745 }, { "epoch": 0.30144694533762056, "grad_norm": 4.700309572076156, "learning_rate": 8.819176361710842e-06, "loss": 0.6201, "step": 750 }, { "epoch": 0.30144694533762056, "eval_cooking_sharegpt_test_loss": 0.6827989816665649, "eval_cooking_sharegpt_test_runtime": 29.1653, "eval_cooking_sharegpt_test_samples_per_second": 6.857, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 750 }, { "epoch": 0.3034565916398714, "grad_norm": 4.984810224063278, "learning_rate": 8.796442774107123e-06, "loss": 0.7233, "step": 755 }, { "epoch": 0.3054662379421222, "grad_norm": 4.572885347797854, "learning_rate": 8.77352233067732e-06, "loss": 0.7791, "step": 760 }, { "epoch": 0.307475884244373, "grad_norm": 4.723286735310018, "learning_rate": 8.750416159534944e-06, "loss": 0.692, "step": 765 }, { "epoch": 0.3094855305466238, "grad_norm": 5.060218829161682, "learning_rate": 8.727125397934777e-06, "loss": 0.6615, "step": 770 }, { "epoch": 0.3114951768488746, "grad_norm": 4.833251557506912, "learning_rate": 8.703651192216896e-06, "loss": 0.7046, "step": 775 }, { "epoch": 0.3135048231511254, "grad_norm": 5.151428527882602, "learning_rate": 8.67999469775025e-06, "loss": 0.6859, "step": 780 }, { "epoch": 0.3155144694533762, "grad_norm": 5.403069037330436, "learning_rate": 8.656157078875794e-06, "loss": 0.6585, "step": 785 }, { "epoch": 0.317524115755627, "grad_norm": 4.594702329252282, "learning_rate": 8.632139508849192e-06, "loss": 0.6662, "step": 790 }, { "epoch": 0.3195337620578778, "grad_norm": 3.82227587899388, "learning_rate": 8.60794316978305e-06, "loss": 0.6479, "step": 795 }, { "epoch": 0.3215434083601286, "grad_norm": 4.050592455983538, "learning_rate": 8.583569252588761e-06, "loss": 0.6634, "step": 800 }, { "epoch": 0.3215434083601286, "eval_cooking_sharegpt_test_loss": 0.6700084209442139, "eval_cooking_sharegpt_test_runtime": 29.1558, "eval_cooking_sharegpt_test_samples_per_second": 6.86, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 800 }, { "epoch": 0.32355305466237944, "grad_norm": 4.7546247450248895, "learning_rate": 8.559018956917864e-06, "loss": 0.6893, "step": 805 }, { "epoch": 0.32556270096463025, "grad_norm": 4.892180049499131, "learning_rate": 8.534293491103014e-06, "loss": 0.7171, "step": 810 }, { "epoch": 0.327572347266881, "grad_norm": 4.593573940130922, "learning_rate": 8.50939407209851e-06, "loss": 0.602, "step": 815 }, { "epoch": 0.3295819935691318, "grad_norm": 4.812442397504877, "learning_rate": 8.484321925420383e-06, "loss": 0.6965, "step": 820 }, { "epoch": 0.3315916398713826, "grad_norm": 4.995658301324656, "learning_rate": 8.459078285086103e-06, "loss": 0.6757, "step": 825 }, { "epoch": 0.33360128617363344, "grad_norm": 4.411896351243907, "learning_rate": 8.433664393553815e-06, "loss": 0.6125, "step": 830 }, { "epoch": 0.33561093247588425, "grad_norm": 4.037149487682339, "learning_rate": 8.40808150166121e-06, "loss": 0.5841, "step": 835 }, { "epoch": 0.33762057877813506, "grad_norm": 4.6302565097915656, "learning_rate": 8.382330868563943e-06, "loss": 0.669, "step": 840 }, { "epoch": 0.3396302250803859, "grad_norm": 5.037886894281096, "learning_rate": 8.35641376167367e-06, "loss": 0.6602, "step": 845 }, { "epoch": 0.34163987138263663, "grad_norm": 4.116477405728893, "learning_rate": 8.330331456595663e-06, "loss": 0.6318, "step": 850 }, { "epoch": 0.34163987138263663, "eval_cooking_sharegpt_test_loss": 0.6634958386421204, "eval_cooking_sharegpt_test_runtime": 29.1155, "eval_cooking_sharegpt_test_samples_per_second": 6.869, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 850 }, { "epoch": 0.34364951768488744, "grad_norm": 4.53205416614136, "learning_rate": 8.304085237066027e-06, "loss": 0.7296, "step": 855 }, { "epoch": 0.34565916398713825, "grad_norm": 4.562829710605114, "learning_rate": 8.277676394888518e-06, "loss": 0.6152, "step": 860 }, { "epoch": 0.34766881028938906, "grad_norm": 4.850911945798656, "learning_rate": 8.25110622987096e-06, "loss": 0.6514, "step": 865 }, { "epoch": 0.3496784565916399, "grad_norm": 4.655697540410078, "learning_rate": 8.22437604976127e-06, "loss": 0.7196, "step": 870 }, { "epoch": 0.3516881028938907, "grad_norm": 5.125990956860812, "learning_rate": 8.197487170183092e-06, "loss": 0.6654, "step": 875 }, { "epoch": 0.3536977491961415, "grad_norm": 4.758879553215313, "learning_rate": 8.170440914571052e-06, "loss": 0.6771, "step": 880 }, { "epoch": 0.3557073954983923, "grad_norm": 4.800616804892499, "learning_rate": 8.143238614105608e-06, "loss": 0.6825, "step": 885 }, { "epoch": 0.35771704180064307, "grad_norm": 5.081512148323733, "learning_rate": 8.115881607647538e-06, "loss": 0.6968, "step": 890 }, { "epoch": 0.3597266881028939, "grad_norm": 4.824963989656846, "learning_rate": 8.08837124167204e-06, "loss": 0.6879, "step": 895 }, { "epoch": 0.3617363344051447, "grad_norm": 5.409545786784468, "learning_rate": 8.060708870202462e-06, "loss": 0.7033, "step": 900 }, { "epoch": 0.3617363344051447, "eval_cooking_sharegpt_test_loss": 0.6626113653182983, "eval_cooking_sharegpt_test_runtime": 29.1696, "eval_cooking_sharegpt_test_samples_per_second": 6.856, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 900 }, { "epoch": 0.3637459807073955, "grad_norm": 3.7618538333099982, "learning_rate": 8.032895854743661e-06, "loss": 0.6522, "step": 905 }, { "epoch": 0.3657556270096463, "grad_norm": 4.543338033090877, "learning_rate": 8.004933564214991e-06, "loss": 0.589, "step": 910 }, { "epoch": 0.3677652733118971, "grad_norm": 4.486029590225905, "learning_rate": 7.976823374882919e-06, "loss": 0.6684, "step": 915 }, { "epoch": 0.36977491961414793, "grad_norm": 4.184644901714797, "learning_rate": 7.948566670293298e-06, "loss": 0.6203, "step": 920 }, { "epoch": 0.3717845659163987, "grad_norm": 4.232384871040644, "learning_rate": 7.920164841203262e-06, "loss": 0.6393, "step": 925 }, { "epoch": 0.3737942122186495, "grad_norm": 4.593579658625747, "learning_rate": 7.891619285512781e-06, "loss": 0.7574, "step": 930 }, { "epoch": 0.3758038585209003, "grad_norm": 4.23534793280711, "learning_rate": 7.862931408195855e-06, "loss": 0.5811, "step": 935 }, { "epoch": 0.3778135048231511, "grad_norm": 4.805442331593937, "learning_rate": 7.834102621231364e-06, "loss": 0.6265, "step": 940 }, { "epoch": 0.37982315112540194, "grad_norm": 4.667120661635427, "learning_rate": 7.805134343533572e-06, "loss": 0.6295, "step": 945 }, { "epoch": 0.38183279742765275, "grad_norm": 4.3862142407448905, "learning_rate": 7.776028000882288e-06, "loss": 0.6715, "step": 950 }, { "epoch": 0.38183279742765275, "eval_cooking_sharegpt_test_loss": 0.6476317048072815, "eval_cooking_sharegpt_test_runtime": 29.1486, "eval_cooking_sharegpt_test_samples_per_second": 6.861, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 950 }, { "epoch": 0.38384244372990356, "grad_norm": 4.516234494825295, "learning_rate": 7.746785025852695e-06, "loss": 0.6513, "step": 955 }, { "epoch": 0.3858520900321543, "grad_norm": 4.063375240398369, "learning_rate": 7.717406857744837e-06, "loss": 0.5945, "step": 960 }, { "epoch": 0.3878617363344051, "grad_norm": 4.280617916285887, "learning_rate": 7.687894942512786e-06, "loss": 0.6263, "step": 965 }, { "epoch": 0.38987138263665594, "grad_norm": 4.2306442061102585, "learning_rate": 7.65825073269346e-06, "loss": 0.6307, "step": 970 }, { "epoch": 0.39188102893890675, "grad_norm": 3.83492814563052, "learning_rate": 7.628475687335142e-06, "loss": 0.6768, "step": 975 }, { "epoch": 0.39389067524115756, "grad_norm": 3.91144567078091, "learning_rate": 7.598571271925667e-06, "loss": 0.5288, "step": 980 }, { "epoch": 0.3959003215434084, "grad_norm": 4.8926561756291145, "learning_rate": 7.568538958320291e-06, "loss": 0.5691, "step": 985 }, { "epoch": 0.3979099678456592, "grad_norm": 5.343345050879741, "learning_rate": 7.538380224669244e-06, "loss": 0.681, "step": 990 }, { "epoch": 0.39991961414790994, "grad_norm": 4.801310495851274, "learning_rate": 7.5080965553449834e-06, "loss": 0.6365, "step": 995 }, { "epoch": 0.40192926045016075, "grad_norm": 4.263810617817441, "learning_rate": 7.477689440869135e-06, "loss": 0.6511, "step": 1000 }, { "epoch": 0.40192926045016075, "eval_cooking_sharegpt_test_loss": 0.640380859375, "eval_cooking_sharegpt_test_runtime": 29.1538, "eval_cooking_sharegpt_test_samples_per_second": 6.86, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1000 }, { "epoch": 0.40393890675241156, "grad_norm": 4.608176994643662, "learning_rate": 7.447160377839125e-06, "loss": 0.6558, "step": 1005 }, { "epoch": 0.4059485530546624, "grad_norm": 4.184997495410128, "learning_rate": 7.416510868854529e-06, "loss": 0.6028, "step": 1010 }, { "epoch": 0.4079581993569132, "grad_norm": 4.14811982617309, "learning_rate": 7.385742422443108e-06, "loss": 0.6116, "step": 1015 }, { "epoch": 0.409967845659164, "grad_norm": 4.4179772033730975, "learning_rate": 7.354856552986563e-06, "loss": 0.6657, "step": 1020 }, { "epoch": 0.4119774919614148, "grad_norm": 3.872451257996019, "learning_rate": 7.323854780646002e-06, "loss": 0.616, "step": 1025 }, { "epoch": 0.4139871382636656, "grad_norm": 4.575816735109395, "learning_rate": 7.2927386312871185e-06, "loss": 0.6595, "step": 1030 }, { "epoch": 0.4159967845659164, "grad_norm": 4.636123500937769, "learning_rate": 7.261509636405087e-06, "loss": 0.537, "step": 1035 }, { "epoch": 0.4180064308681672, "grad_norm": 5.110397360196885, "learning_rate": 7.230169333049188e-06, "loss": 0.6751, "step": 1040 }, { "epoch": 0.420016077170418, "grad_norm": 4.095989085644148, "learning_rate": 7.198719263747158e-06, "loss": 0.6638, "step": 1045 }, { "epoch": 0.4220257234726688, "grad_norm": 5.40481290202407, "learning_rate": 7.167160976429264e-06, "loss": 0.6804, "step": 1050 }, { "epoch": 0.4220257234726688, "eval_cooking_sharegpt_test_loss": 0.6353456974029541, "eval_cooking_sharegpt_test_runtime": 29.1174, "eval_cooking_sharegpt_test_samples_per_second": 6.869, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1050 }, { "epoch": 0.4240353697749196, "grad_norm": 4.345481799467673, "learning_rate": 7.13549602435212e-06, "loss": 0.6407, "step": 1055 }, { "epoch": 0.42604501607717044, "grad_norm": 4.1689356990951145, "learning_rate": 7.103725966022233e-06, "loss": 0.6676, "step": 1060 }, { "epoch": 0.42805466237942125, "grad_norm": 4.124587131944435, "learning_rate": 7.071852365119306e-06, "loss": 0.5613, "step": 1065 }, { "epoch": 0.430064308681672, "grad_norm": 5.215102388713326, "learning_rate": 7.039876790419262e-06, "loss": 0.6349, "step": 1070 }, { "epoch": 0.4320739549839228, "grad_norm": 4.222772794043426, "learning_rate": 7.0078008157170415e-06, "loss": 0.5982, "step": 1075 }, { "epoch": 0.4340836012861736, "grad_norm": 4.044144044404326, "learning_rate": 6.975626019749137e-06, "loss": 0.6009, "step": 1080 }, { "epoch": 0.43609324758842444, "grad_norm": 4.504810284400397, "learning_rate": 6.943353986115893e-06, "loss": 0.6371, "step": 1085 }, { "epoch": 0.43810289389067525, "grad_norm": 4.200082136051, "learning_rate": 6.910986303203556e-06, "loss": 0.6367, "step": 1090 }, { "epoch": 0.44011254019292606, "grad_norm": 4.474891462331243, "learning_rate": 6.87852456410611e-06, "loss": 0.6916, "step": 1095 }, { "epoch": 0.44212218649517687, "grad_norm": 5.613000149936893, "learning_rate": 6.845970366546856e-06, "loss": 0.6355, "step": 1100 }, { "epoch": 0.44212218649517687, "eval_cooking_sharegpt_test_loss": 0.6258378028869629, "eval_cooking_sharegpt_test_runtime": 29.1265, "eval_cooking_sharegpt_test_samples_per_second": 6.867, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1100 }, { "epoch": 0.44413183279742763, "grad_norm": 5.123887472129782, "learning_rate": 6.813325312799769e-06, "loss": 0.6296, "step": 1105 }, { "epoch": 0.44614147909967844, "grad_norm": 4.4261628776726605, "learning_rate": 6.7805910096106555e-06, "loss": 0.5624, "step": 1110 }, { "epoch": 0.44815112540192925, "grad_norm": 4.638630416730711, "learning_rate": 6.747769068118049e-06, "loss": 0.6354, "step": 1115 }, { "epoch": 0.45016077170418006, "grad_norm": 4.461694988531885, "learning_rate": 6.714861103773934e-06, "loss": 0.5248, "step": 1120 }, { "epoch": 0.4521704180064309, "grad_norm": 5.163765933017904, "learning_rate": 6.681868736264215e-06, "loss": 0.6462, "step": 1125 }, { "epoch": 0.4541800643086817, "grad_norm": 4.6310268658457545, "learning_rate": 6.648793589429011e-06, "loss": 0.6174, "step": 1130 }, { "epoch": 0.4561897106109325, "grad_norm": 5.126634815687219, "learning_rate": 6.61563729118273e-06, "loss": 0.6466, "step": 1135 }, { "epoch": 0.45819935691318325, "grad_norm": 4.319380604514048, "learning_rate": 6.582401473433941e-06, "loss": 0.654, "step": 1140 }, { "epoch": 0.46020900321543406, "grad_norm": 4.247314315589029, "learning_rate": 6.5490877720050574e-06, "loss": 0.5634, "step": 1145 }, { "epoch": 0.4622186495176849, "grad_norm": 4.906257391887683, "learning_rate": 6.515697826551822e-06, "loss": 0.692, "step": 1150 }, { "epoch": 0.4622186495176849, "eval_cooking_sharegpt_test_loss": 0.6236215829849243, "eval_cooking_sharegpt_test_runtime": 29.1167, "eval_cooking_sharegpt_test_samples_per_second": 6.869, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1150 }, { "epoch": 0.4642282958199357, "grad_norm": 4.819507990509822, "learning_rate": 6.482233280482608e-06, "loss": 0.6147, "step": 1155 }, { "epoch": 0.4662379421221865, "grad_norm": 4.478005354409713, "learning_rate": 6.448695780877532e-06, "loss": 0.6581, "step": 1160 }, { "epoch": 0.4682475884244373, "grad_norm": 4.193130144327688, "learning_rate": 6.415086978407382e-06, "loss": 0.6124, "step": 1165 }, { "epoch": 0.4702572347266881, "grad_norm": 4.1578998395779365, "learning_rate": 6.381408527252381e-06, "loss": 0.6238, "step": 1170 }, { "epoch": 0.47226688102893893, "grad_norm": 3.879162148025109, "learning_rate": 6.347662085020764e-06, "loss": 0.5786, "step": 1175 }, { "epoch": 0.4742765273311897, "grad_norm": 4.9955060839894765, "learning_rate": 6.313849312667197e-06, "loss": 0.6763, "step": 1180 }, { "epoch": 0.4762861736334405, "grad_norm": 4.446633316148537, "learning_rate": 6.279971874411027e-06, "loss": 0.6339, "step": 1185 }, { "epoch": 0.4782958199356913, "grad_norm": 4.288694053242222, "learning_rate": 6.246031437654368e-06, "loss": 0.616, "step": 1190 }, { "epoch": 0.4803054662379421, "grad_norm": 3.8400933875164087, "learning_rate": 6.2120296729000395e-06, "loss": 0.6927, "step": 1195 }, { "epoch": 0.48231511254019294, "grad_norm": 5.8265497485402955, "learning_rate": 6.177968253669337e-06, "loss": 0.7054, "step": 1200 }, { "epoch": 0.48231511254019294, "eval_cooking_sharegpt_test_loss": 0.6158734560012817, "eval_cooking_sharegpt_test_runtime": 29.1595, "eval_cooking_sharegpt_test_samples_per_second": 6.859, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1200 }, { "epoch": 0.48432475884244375, "grad_norm": 4.586416002456061, "learning_rate": 6.143848856419675e-06, "loss": 0.6032, "step": 1205 }, { "epoch": 0.48633440514469456, "grad_norm": 4.9377135174761895, "learning_rate": 6.109673160462063e-06, "loss": 0.6026, "step": 1210 }, { "epoch": 0.4883440514469453, "grad_norm": 5.323719984452999, "learning_rate": 6.075442847878463e-06, "loss": 0.671, "step": 1215 }, { "epoch": 0.4903536977491961, "grad_norm": 4.045561951621483, "learning_rate": 6.041159603438991e-06, "loss": 0.5717, "step": 1220 }, { "epoch": 0.49236334405144694, "grad_norm": 3.7625870662514562, "learning_rate": 6.006825114518998e-06, "loss": 0.5493, "step": 1225 }, { "epoch": 0.49437299035369775, "grad_norm": 4.58456420076278, "learning_rate": 5.9724410710160184e-06, "loss": 0.5905, "step": 1230 }, { "epoch": 0.49638263665594856, "grad_norm": 4.536041072205623, "learning_rate": 5.938009165266603e-06, "loss": 0.6284, "step": 1235 }, { "epoch": 0.4983922829581994, "grad_norm": 4.028915918788256, "learning_rate": 5.903531091963011e-06, "loss": 0.5853, "step": 1240 }, { "epoch": 0.5004019292604501, "grad_norm": 4.599870175094803, "learning_rate": 5.8690085480698075e-06, "loss": 0.5881, "step": 1245 }, { "epoch": 0.502411575562701, "grad_norm": 4.439772860645252, "learning_rate": 5.834443232740346e-06, "loss": 0.6095, "step": 1250 }, { "epoch": 0.502411575562701, "eval_cooking_sharegpt_test_loss": 0.6074568033218384, "eval_cooking_sharegpt_test_runtime": 29.1196, "eval_cooking_sharegpt_test_samples_per_second": 6.868, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1250 }, { "epoch": 0.5044212218649518, "grad_norm": 4.467706371524093, "learning_rate": 5.799836847233129e-06, "loss": 0.6264, "step": 1255 }, { "epoch": 0.5064308681672026, "grad_norm": 4.101563851958729, "learning_rate": 5.765191094828078e-06, "loss": 0.555, "step": 1260 }, { "epoch": 0.5084405144694534, "grad_norm": 4.466205286827134, "learning_rate": 5.7305076807426975e-06, "loss": 0.5756, "step": 1265 }, { "epoch": 0.5104501607717041, "grad_norm": 4.701080377162357, "learning_rate": 5.695788312048159e-06, "loss": 0.6317, "step": 1270 }, { "epoch": 0.512459807073955, "grad_norm": 4.433691534657307, "learning_rate": 5.66103469758526e-06, "loss": 0.6215, "step": 1275 }, { "epoch": 0.5144694533762058, "grad_norm": 4.755580900859219, "learning_rate": 5.626248547880337e-06, "loss": 0.5824, "step": 1280 }, { "epoch": 0.5164790996784566, "grad_norm": 3.679078766710477, "learning_rate": 5.591431575061064e-06, "loss": 0.5474, "step": 1285 }, { "epoch": 0.5184887459807074, "grad_norm": 4.484327374671317, "learning_rate": 5.55658549277219e-06, "loss": 0.649, "step": 1290 }, { "epoch": 0.5204983922829582, "grad_norm": 4.505852120752897, "learning_rate": 5.5217120160911886e-06, "loss": 0.6159, "step": 1295 }, { "epoch": 0.522508038585209, "grad_norm": 5.01215860870057, "learning_rate": 5.486812861443852e-06, "loss": 0.6294, "step": 1300 }, { "epoch": 0.522508038585209, "eval_cooking_sharegpt_test_loss": 0.595450758934021, "eval_cooking_sharegpt_test_runtime": 29.1321, "eval_cooking_sharegpt_test_samples_per_second": 6.865, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1300 }, { "epoch": 0.5245176848874598, "grad_norm": 4.622631736422735, "learning_rate": 5.45188974651981e-06, "loss": 0.6114, "step": 1305 }, { "epoch": 0.5265273311897106, "grad_norm": 4.518942479671849, "learning_rate": 5.416944390187977e-06, "loss": 0.6818, "step": 1310 }, { "epoch": 0.5285369774919614, "grad_norm": 4.341948028095467, "learning_rate": 5.381978512411968e-06, "loss": 0.5809, "step": 1315 }, { "epoch": 0.5305466237942122, "grad_norm": 3.88446157180582, "learning_rate": 5.346993834165431e-06, "loss": 0.5869, "step": 1320 }, { "epoch": 0.532556270096463, "grad_norm": 4.430345179234876, "learning_rate": 5.311992077347351e-06, "loss": 0.6948, "step": 1325 }, { "epoch": 0.5345659163987139, "grad_norm": 4.164240920296163, "learning_rate": 5.2769749646972935e-06, "loss": 0.5607, "step": 1330 }, { "epoch": 0.5365755627009646, "grad_norm": 4.2836756208794515, "learning_rate": 5.241944219710624e-06, "loss": 0.6401, "step": 1335 }, { "epoch": 0.5385852090032154, "grad_norm": 4.126109249564745, "learning_rate": 5.206901566553665e-06, "loss": 0.5776, "step": 1340 }, { "epoch": 0.5405948553054662, "grad_norm": 4.661804105805685, "learning_rate": 5.171848729978851e-06, "loss": 0.6129, "step": 1345 }, { "epoch": 0.542604501607717, "grad_norm": 4.448287794131716, "learning_rate": 5.136787435239825e-06, "loss": 0.615, "step": 1350 }, { "epoch": 0.542604501607717, "eval_cooking_sharegpt_test_loss": 0.5893608331680298, "eval_cooking_sharegpt_test_runtime": 29.1232, "eval_cooking_sharegpt_test_samples_per_second": 6.867, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1350 }, { "epoch": 0.5446141479099679, "grad_norm": 3.831547012944043, "learning_rate": 5.101719408006534e-06, "loss": 0.5785, "step": 1355 }, { "epoch": 0.5466237942122186, "grad_norm": 4.528939245986926, "learning_rate": 5.0666463742802855e-06, "loss": 0.6062, "step": 1360 }, { "epoch": 0.5486334405144695, "grad_norm": 4.507770528828299, "learning_rate": 5.031570060308799e-06, "loss": 0.5992, "step": 1365 }, { "epoch": 0.5506430868167203, "grad_norm": 3.890107991234021, "learning_rate": 4.996492192501251e-06, "loss": 0.5942, "step": 1370 }, { "epoch": 0.552652733118971, "grad_norm": 4.065283624907987, "learning_rate": 4.9614144973432855e-06, "loss": 0.5971, "step": 1375 }, { "epoch": 0.5546623794212219, "grad_norm": 4.937482234167603, "learning_rate": 4.926338701312059e-06, "loss": 0.6404, "step": 1380 }, { "epoch": 0.5566720257234726, "grad_norm": 3.9548565502184325, "learning_rate": 4.8912665307912435e-06, "loss": 0.5026, "step": 1385 }, { "epoch": 0.5586816720257235, "grad_norm": 4.423584782540341, "learning_rate": 4.856199711986082e-06, "loss": 0.6386, "step": 1390 }, { "epoch": 0.5606913183279743, "grad_norm": 4.303957079156144, "learning_rate": 4.8211399708384e-06, "loss": 0.581, "step": 1395 }, { "epoch": 0.5627009646302251, "grad_norm": 4.666098899764007, "learning_rate": 4.786089032941683e-06, "loss": 0.6602, "step": 1400 }, { "epoch": 0.5627009646302251, "eval_cooking_sharegpt_test_loss": 0.5806075930595398, "eval_cooking_sharegpt_test_runtime": 29.1487, "eval_cooking_sharegpt_test_samples_per_second": 6.861, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1400 }, { "epoch": 0.5647106109324759, "grad_norm": 4.509076285193486, "learning_rate": 4.75104862345612e-06, "loss": 0.738, "step": 1405 }, { "epoch": 0.5667202572347267, "grad_norm": 4.238722358099732, "learning_rate": 4.716020467023716e-06, "loss": 0.564, "step": 1410 }, { "epoch": 0.5687299035369775, "grad_norm": 3.7723995730660347, "learning_rate": 4.68100628768339e-06, "loss": 0.584, "step": 1415 }, { "epoch": 0.5707395498392283, "grad_norm": 4.403249450502511, "learning_rate": 4.646007808786132e-06, "loss": 0.5753, "step": 1420 }, { "epoch": 0.5727491961414791, "grad_norm": 3.4896736313458927, "learning_rate": 4.611026752910172e-06, "loss": 0.4941, "step": 1425 }, { "epoch": 0.5747588424437299, "grad_norm": 4.782921151183437, "learning_rate": 4.576064841776207e-06, "loss": 0.5882, "step": 1430 }, { "epoch": 0.5767684887459807, "grad_norm": 4.523388455740361, "learning_rate": 4.541123796162656e-06, "loss": 0.6504, "step": 1435 }, { "epoch": 0.5787781350482315, "grad_norm": 4.846919506027111, "learning_rate": 4.506205335820959e-06, "loss": 0.6503, "step": 1440 }, { "epoch": 0.5807877813504824, "grad_norm": 4.353820829633878, "learning_rate": 4.471311179390946e-06, "loss": 0.5788, "step": 1445 }, { "epoch": 0.5827974276527331, "grad_norm": 4.515430059421715, "learning_rate": 4.436443044316236e-06, "loss": 0.6004, "step": 1450 }, { "epoch": 0.5827974276527331, "eval_cooking_sharegpt_test_loss": 0.5744032263755798, "eval_cooking_sharegpt_test_runtime": 29.1343, "eval_cooking_sharegpt_test_samples_per_second": 6.865, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1450 }, { "epoch": 0.5848070739549839, "grad_norm": 3.80274840853952, "learning_rate": 4.401602646759717e-06, "loss": 0.5645, "step": 1455 }, { "epoch": 0.5868167202572347, "grad_norm": 3.626670598918427, "learning_rate": 4.366791701519065e-06, "loss": 0.5602, "step": 1460 }, { "epoch": 0.5888263665594855, "grad_norm": 3.9743598170373886, "learning_rate": 4.332011921942365e-06, "loss": 0.5964, "step": 1465 }, { "epoch": 0.5908360128617364, "grad_norm": 3.83250555228501, "learning_rate": 4.297265019843755e-06, "loss": 0.5535, "step": 1470 }, { "epoch": 0.5928456591639871, "grad_norm": 3.6139874244672194, "learning_rate": 4.262552705419203e-06, "loss": 0.5168, "step": 1475 }, { "epoch": 0.594855305466238, "grad_norm": 4.740911098327277, "learning_rate": 4.227876687162303e-06, "loss": 0.525, "step": 1480 }, { "epoch": 0.5968649517684887, "grad_norm": 3.7870286568972276, "learning_rate": 4.193238671780212e-06, "loss": 0.5515, "step": 1485 }, { "epoch": 0.5988745980707395, "grad_norm": 5.1311308301838086, "learning_rate": 4.15864036410963e-06, "loss": 0.6117, "step": 1490 }, { "epoch": 0.6008842443729904, "grad_norm": 4.4089674794586236, "learning_rate": 4.124083467032902e-06, "loss": 0.5846, "step": 1495 }, { "epoch": 0.6028938906752411, "grad_norm": 4.051889135358413, "learning_rate": 4.08956968139419e-06, "loss": 0.5855, "step": 1500 }, { "epoch": 0.6028938906752411, "eval_cooking_sharegpt_test_loss": 0.5694165229797363, "eval_cooking_sharegpt_test_runtime": 29.1123, "eval_cooking_sharegpt_test_samples_per_second": 6.87, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1500 }, { "epoch": 0.604903536977492, "grad_norm": 4.1318438937071, "learning_rate": 4.05510070591578e-06, "loss": 0.6138, "step": 1505 }, { "epoch": 0.6069131832797428, "grad_norm": 3.601915970098065, "learning_rate": 4.020678237114451e-06, "loss": 0.5932, "step": 1510 }, { "epoch": 0.6089228295819936, "grad_norm": 3.9179716478128372, "learning_rate": 3.986303969217996e-06, "loss": 0.5754, "step": 1515 }, { "epoch": 0.6109324758842444, "grad_norm": 4.231240281833697, "learning_rate": 3.951979594081818e-06, "loss": 0.5833, "step": 1520 }, { "epoch": 0.6129421221864951, "grad_norm": 3.6790737814106462, "learning_rate": 3.917706801105663e-06, "loss": 0.5875, "step": 1525 }, { "epoch": 0.614951768488746, "grad_norm": 3.83516561393742, "learning_rate": 3.883487277150481e-06, "loss": 0.5629, "step": 1530 }, { "epoch": 0.6169614147909968, "grad_norm": 4.844335590656684, "learning_rate": 3.849322706455379e-06, "loss": 0.5862, "step": 1535 }, { "epoch": 0.6189710610932476, "grad_norm": 3.9186087206693996, "learning_rate": 3.815214770554755e-06, "loss": 0.5158, "step": 1540 }, { "epoch": 0.6209807073954984, "grad_norm": 3.9221369945811198, "learning_rate": 3.781165148195501e-06, "loss": 0.5216, "step": 1545 }, { "epoch": 0.6229903536977492, "grad_norm": 3.3756618558855633, "learning_rate": 3.74717551525441e-06, "loss": 0.5138, "step": 1550 }, { "epoch": 0.6229903536977492, "eval_cooking_sharegpt_test_loss": 0.5629362463951111, "eval_cooking_sharegpt_test_runtime": 29.1527, "eval_cooking_sharegpt_test_samples_per_second": 6.86, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1550 }, { "epoch": 0.625, "grad_norm": 4.312505621340993, "learning_rate": 3.713247544655663e-06, "loss": 0.5655, "step": 1555 }, { "epoch": 0.6270096463022508, "grad_norm": 4.442878870711612, "learning_rate": 3.6793829062885133e-06, "loss": 0.5324, "step": 1560 }, { "epoch": 0.6290192926045016, "grad_norm": 4.1821188904384785, "learning_rate": 3.6455832669250798e-06, "loss": 0.5367, "step": 1565 }, { "epoch": 0.6310289389067524, "grad_norm": 4.16995310964458, "learning_rate": 3.611850290138322e-06, "loss": 0.5449, "step": 1570 }, { "epoch": 0.6330385852090032, "grad_norm": 4.103557618161658, "learning_rate": 3.578185636220154e-06, "loss": 0.547, "step": 1575 }, { "epoch": 0.635048231511254, "grad_norm": 4.221137750943906, "learning_rate": 3.5445909620997317e-06, "loss": 0.6128, "step": 1580 }, { "epoch": 0.6370578778135049, "grad_norm": 4.685243877985315, "learning_rate": 3.511067921261897e-06, "loss": 0.5288, "step": 1585 }, { "epoch": 0.6390675241157556, "grad_norm": 3.7485957955570526, "learning_rate": 3.4776181636658004e-06, "loss": 0.5361, "step": 1590 }, { "epoch": 0.6410771704180064, "grad_norm": 5.13753736382163, "learning_rate": 3.444243335663685e-06, "loss": 0.6099, "step": 1595 }, { "epoch": 0.6430868167202572, "grad_norm": 4.20430212422138, "learning_rate": 3.4109450799198667e-06, "loss": 0.5544, "step": 1600 }, { "epoch": 0.6430868167202572, "eval_cooking_sharegpt_test_loss": 0.5597677826881409, "eval_cooking_sharegpt_test_runtime": 29.1383, "eval_cooking_sharegpt_test_samples_per_second": 6.864, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1600 }, { "epoch": 0.645096463022508, "grad_norm": 4.178198769845903, "learning_rate": 3.3777250353298725e-06, "loss": 0.5958, "step": 1605 }, { "epoch": 0.6471061093247589, "grad_norm": 4.042255503484116, "learning_rate": 3.344584836939777e-06, "loss": 0.596, "step": 1610 }, { "epoch": 0.6491157556270096, "grad_norm": 4.187189657432103, "learning_rate": 3.3115261158657443e-06, "loss": 0.5823, "step": 1615 }, { "epoch": 0.6511254019292605, "grad_norm": 4.893019445921785, "learning_rate": 3.2785504992137208e-06, "loss": 0.5981, "step": 1620 }, { "epoch": 0.6531350482315113, "grad_norm": 4.211816172053425, "learning_rate": 3.2456596099993744e-06, "loss": 0.6481, "step": 1625 }, { "epoch": 0.655144694533762, "grad_norm": 4.1700590638139925, "learning_rate": 3.2128550670681946e-06, "loss": 0.5761, "step": 1630 }, { "epoch": 0.6571543408360129, "grad_norm": 4.161630224139953, "learning_rate": 3.18013848501583e-06, "loss": 0.5866, "step": 1635 }, { "epoch": 0.6591639871382636, "grad_norm": 3.557727720022592, "learning_rate": 3.1475114741086064e-06, "loss": 0.4835, "step": 1640 }, { "epoch": 0.6611736334405145, "grad_norm": 4.852478551572312, "learning_rate": 3.114975640204282e-06, "loss": 0.5574, "step": 1645 }, { "epoch": 0.6631832797427653, "grad_norm": 3.859784406459026, "learning_rate": 3.0825325846730013e-06, "loss": 0.5624, "step": 1650 }, { "epoch": 0.6631832797427653, "eval_cooking_sharegpt_test_loss": 0.5586913228034973, "eval_cooking_sharegpt_test_runtime": 29.1426, "eval_cooking_sharegpt_test_samples_per_second": 6.863, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1650 }, { "epoch": 0.6651929260450161, "grad_norm": 4.482495052564249, "learning_rate": 3.0501839043184858e-06, "loss": 0.5688, "step": 1655 }, { "epoch": 0.6672025723472669, "grad_norm": 4.042365197538812, "learning_rate": 3.017931191299433e-06, "loss": 0.5349, "step": 1660 }, { "epoch": 0.6692122186495176, "grad_norm": 4.165393275797863, "learning_rate": 2.985776033051161e-06, "loss": 0.5798, "step": 1665 }, { "epoch": 0.6712218649517685, "grad_norm": 4.037795568611387, "learning_rate": 2.9537200122074684e-06, "loss": 0.5308, "step": 1670 }, { "epoch": 0.6732315112540193, "grad_norm": 3.870075592190109, "learning_rate": 2.9217647065227474e-06, "loss": 0.5248, "step": 1675 }, { "epoch": 0.6752411575562701, "grad_norm": 2.871993790875757, "learning_rate": 2.889911688794322e-06, "loss": 0.5273, "step": 1680 }, { "epoch": 0.6772508038585209, "grad_norm": 4.430515877745258, "learning_rate": 2.858162526785046e-06, "loss": 0.5656, "step": 1685 }, { "epoch": 0.6792604501607717, "grad_norm": 5.1614822907000395, "learning_rate": 2.8265187831461234e-06, "loss": 0.5579, "step": 1690 }, { "epoch": 0.6812700964630225, "grad_norm": 4.630422383447896, "learning_rate": 2.7949820153402163e-06, "loss": 0.6282, "step": 1695 }, { "epoch": 0.6832797427652733, "grad_norm": 3.181023467670562, "learning_rate": 2.763553775564778e-06, "loss": 0.5093, "step": 1700 }, { "epoch": 0.6832797427652733, "eval_cooking_sharegpt_test_loss": 0.5516761541366577, "eval_cooking_sharegpt_test_runtime": 29.1563, "eval_cooking_sharegpt_test_samples_per_second": 6.86, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1700 }, { "epoch": 0.6852893890675241, "grad_norm": 4.646508693714103, "learning_rate": 2.732235610675652e-06, "loss": 0.6124, "step": 1705 }, { "epoch": 0.6872990353697749, "grad_norm": 4.162112396548259, "learning_rate": 2.7010290621109527e-06, "loss": 0.5413, "step": 1710 }, { "epoch": 0.6893086816720257, "grad_norm": 3.6513543600819998, "learning_rate": 2.6699356658151766e-06, "loss": 0.535, "step": 1715 }, { "epoch": 0.6913183279742765, "grad_norm": 5.33408477701188, "learning_rate": 2.6389569521636325e-06, "loss": 0.6191, "step": 1720 }, { "epoch": 0.6933279742765274, "grad_norm": 4.0009358447843315, "learning_rate": 2.6080944458870884e-06, "loss": 0.5353, "step": 1725 }, { "epoch": 0.6953376205787781, "grad_norm": 4.288625288455154, "learning_rate": 2.577349665996752e-06, "loss": 0.605, "step": 1730 }, { "epoch": 0.697347266881029, "grad_norm": 3.7812731515985427, "learning_rate": 2.5467241257094844e-06, "loss": 0.4522, "step": 1735 }, { "epoch": 0.6993569131832797, "grad_norm": 4.4091214999888635, "learning_rate": 2.5162193323733475e-06, "loss": 0.598, "step": 1740 }, { "epoch": 0.7013665594855305, "grad_norm": 4.1788240029265, "learning_rate": 2.4858367873933885e-06, "loss": 0.5406, "step": 1745 }, { "epoch": 0.7033762057877814, "grad_norm": 3.944073910719997, "learning_rate": 2.455577986157762e-06, "loss": 0.5658, "step": 1750 }, { "epoch": 0.7033762057877814, "eval_cooking_sharegpt_test_loss": 0.54704350233078, "eval_cooking_sharegpt_test_runtime": 29.1394, "eval_cooking_sharegpt_test_samples_per_second": 6.864, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1750 }, { "epoch": 0.7053858520900321, "grad_norm": 4.0409641002525065, "learning_rate": 2.425444417964112e-06, "loss": 0.5993, "step": 1755 }, { "epoch": 0.707395498392283, "grad_norm": 3.830335679595113, "learning_rate": 2.395437565946291e-06, "loss": 0.4863, "step": 1760 }, { "epoch": 0.7094051446945338, "grad_norm": 3.9483809252674984, "learning_rate": 2.3655589070013434e-06, "loss": 0.538, "step": 1765 }, { "epoch": 0.7114147909967846, "grad_norm": 4.721569878991402, "learning_rate": 2.3358099117168277e-06, "loss": 0.6086, "step": 1770 }, { "epoch": 0.7134244372990354, "grad_norm": 3.817648873180748, "learning_rate": 2.3061920442984237e-06, "loss": 0.5537, "step": 1775 }, { "epoch": 0.7154340836012861, "grad_norm": 4.952943362320339, "learning_rate": 2.276706762497881e-06, "loss": 0.5734, "step": 1780 }, { "epoch": 0.717443729903537, "grad_norm": 4.308685442110032, "learning_rate": 2.247355517541259e-06, "loss": 0.5245, "step": 1785 }, { "epoch": 0.7194533762057878, "grad_norm": 3.6876910060455015, "learning_rate": 2.2181397540575012e-06, "loss": 0.4904, "step": 1790 }, { "epoch": 0.7214630225080386, "grad_norm": 4.056961760469262, "learning_rate": 2.1890609100073406e-06, "loss": 0.5792, "step": 1795 }, { "epoch": 0.7234726688102894, "grad_norm": 5.194938789182878, "learning_rate": 2.1601204166125097e-06, "loss": 0.5797, "step": 1800 }, { "epoch": 0.7234726688102894, "eval_cooking_sharegpt_test_loss": 0.5419730544090271, "eval_cooking_sharegpt_test_runtime": 29.1573, "eval_cooking_sharegpt_test_samples_per_second": 6.859, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1800 }, { "epoch": 0.7254823151125402, "grad_norm": 4.5439902420076, "learning_rate": 2.131319698285321e-06, "loss": 0.5149, "step": 1805 }, { "epoch": 0.727491961414791, "grad_norm": 4.193817296752171, "learning_rate": 2.1026601725585303e-06, "loss": 0.5707, "step": 1810 }, { "epoch": 0.7295016077170418, "grad_norm": 4.352857862781889, "learning_rate": 2.0741432500155957e-06, "loss": 0.5501, "step": 1815 }, { "epoch": 0.7315112540192926, "grad_norm": 4.693742662043891, "learning_rate": 2.045770334221227e-06, "loss": 0.5476, "step": 1820 }, { "epoch": 0.7335209003215434, "grad_norm": 3.811953363220478, "learning_rate": 2.017542821652321e-06, "loss": 0.5512, "step": 1825 }, { "epoch": 0.7355305466237942, "grad_norm": 4.0209013372552285, "learning_rate": 1.9894621016292233e-06, "loss": 0.5004, "step": 1830 }, { "epoch": 0.737540192926045, "grad_norm": 3.7470655457186286, "learning_rate": 1.9615295562473445e-06, "loss": 0.5138, "step": 1835 }, { "epoch": 0.7395498392282959, "grad_norm": 5.0495297609749255, "learning_rate": 1.933746560309137e-06, "loss": 0.5589, "step": 1840 }, { "epoch": 0.7415594855305466, "grad_norm": 4.4178604447303895, "learning_rate": 1.906114481256432e-06, "loss": 0.5416, "step": 1845 }, { "epoch": 0.7435691318327974, "grad_norm": 3.8034316360410676, "learning_rate": 1.8786346791031356e-06, "loss": 0.5376, "step": 1850 }, { "epoch": 0.7435691318327974, "eval_cooking_sharegpt_test_loss": 0.5386413335800171, "eval_cooking_sharegpt_test_runtime": 29.1364, "eval_cooking_sharegpt_test_samples_per_second": 6.864, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1850 }, { "epoch": 0.7455787781350482, "grad_norm": 3.885403344737721, "learning_rate": 1.8513085063682828e-06, "loss": 0.5474, "step": 1855 }, { "epoch": 0.747588424437299, "grad_norm": 3.5164892334257343, "learning_rate": 1.8241373080094822e-06, "loss": 0.4625, "step": 1860 }, { "epoch": 0.7495980707395499, "grad_norm": 3.5334086129543008, "learning_rate": 1.7971224213567017e-06, "loss": 0.4698, "step": 1865 }, { "epoch": 0.7516077170418006, "grad_norm": 3.7281802070266736, "learning_rate": 1.77026517604647e-06, "loss": 0.5437, "step": 1870 }, { "epoch": 0.7536173633440515, "grad_norm": 6.112903495715869, "learning_rate": 1.7435668939564065e-06, "loss": 0.5897, "step": 1875 }, { "epoch": 0.7556270096463023, "grad_norm": 4.903593470757499, "learning_rate": 1.7170288891401836e-06, "loss": 0.543, "step": 1880 }, { "epoch": 0.757636655948553, "grad_norm": 4.1471801370352175, "learning_rate": 1.6906524677628345e-06, "loss": 0.5533, "step": 1885 }, { "epoch": 0.7596463022508039, "grad_norm": 4.44067669671905, "learning_rate": 1.6644389280364748e-06, "loss": 0.5232, "step": 1890 }, { "epoch": 0.7616559485530546, "grad_norm": 4.25010398827249, "learning_rate": 1.6383895601564047e-06, "loss": 0.6047, "step": 1895 }, { "epoch": 0.7636655948553055, "grad_norm": 4.312065419247702, "learning_rate": 1.6125056462376065e-06, "loss": 0.5323, "step": 1900 }, { "epoch": 0.7636655948553055, "eval_cooking_sharegpt_test_loss": 0.534070611000061, "eval_cooking_sharegpt_test_runtime": 29.1362, "eval_cooking_sharegpt_test_samples_per_second": 6.864, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1900 }, { "epoch": 0.7656752411575563, "grad_norm": 4.311264813952778, "learning_rate": 1.586788460251636e-06, "loss": 0.4919, "step": 1905 }, { "epoch": 0.7676848874598071, "grad_norm": 3.9822945127982186, "learning_rate": 1.561239267963926e-06, "loss": 0.4988, "step": 1910 }, { "epoch": 0.7696945337620579, "grad_norm": 3.5862382118282814, "learning_rate": 1.5358593268714866e-06, "loss": 0.556, "step": 1915 }, { "epoch": 0.7717041800643086, "grad_norm": 4.65685750488049, "learning_rate": 1.5106498861410101e-06, "loss": 0.5705, "step": 1920 }, { "epoch": 0.7737138263665595, "grad_norm": 4.93082892843028, "learning_rate": 1.4856121865473855e-06, "loss": 0.5442, "step": 1925 }, { "epoch": 0.7757234726688103, "grad_norm": 4.115014663671977, "learning_rate": 1.460747460412637e-06, "loss": 0.5497, "step": 1930 }, { "epoch": 0.7777331189710611, "grad_norm": 3.5438779332216965, "learning_rate": 1.4360569315452682e-06, "loss": 0.4903, "step": 1935 }, { "epoch": 0.7797427652733119, "grad_norm": 4.031371623093601, "learning_rate": 1.4115418151800215e-06, "loss": 0.5644, "step": 1940 }, { "epoch": 0.7817524115755627, "grad_norm": 3.54725412927624, "learning_rate": 1.3872033179180767e-06, "loss": 0.5178, "step": 1945 }, { "epoch": 0.7837620578778135, "grad_norm": 4.445227207707671, "learning_rate": 1.363042637667652e-06, "loss": 0.5802, "step": 1950 }, { "epoch": 0.7837620578778135, "eval_cooking_sharegpt_test_loss": 0.5310518145561218, "eval_cooking_sharegpt_test_runtime": 29.1634, "eval_cooking_sharegpt_test_samples_per_second": 6.858, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 1950 }, { "epoch": 0.7857717041800643, "grad_norm": 5.019716908374287, "learning_rate": 1.339060963585056e-06, "loss": 0.5734, "step": 1955 }, { "epoch": 0.7877813504823151, "grad_norm": 3.8522224862461147, "learning_rate": 1.3152594760161513e-06, "loss": 0.4906, "step": 1960 }, { "epoch": 0.7897909967845659, "grad_norm": 3.6624257408254457, "learning_rate": 1.2916393464382632e-06, "loss": 0.4873, "step": 1965 }, { "epoch": 0.7918006430868167, "grad_norm": 5.220019531776816, "learning_rate": 1.2682017374025158e-06, "loss": 0.5863, "step": 1970 }, { "epoch": 0.7938102893890675, "grad_norm": 3.978993529598662, "learning_rate": 1.2449478024766205e-06, "loss": 0.4623, "step": 1975 }, { "epoch": 0.7958199356913184, "grad_norm": 4.0726044586510675, "learning_rate": 1.2218786861880937e-06, "loss": 0.496, "step": 1980 }, { "epoch": 0.7978295819935691, "grad_norm": 3.8206385965353222, "learning_rate": 1.1989955239679279e-06, "loss": 0.5187, "step": 1985 }, { "epoch": 0.7998392282958199, "grad_norm": 3.9512193648944653, "learning_rate": 1.1762994420947016e-06, "loss": 0.4982, "step": 1990 }, { "epoch": 0.8018488745980707, "grad_norm": 3.8053938839616412, "learning_rate": 1.153791557639153e-06, "loss": 0.5194, "step": 1995 }, { "epoch": 0.8038585209003215, "grad_norm": 3.6251196777845616, "learning_rate": 1.1314729784091937e-06, "loss": 0.5537, "step": 2000 }, { "epoch": 0.8038585209003215, "eval_cooking_sharegpt_test_loss": 0.5269535779953003, "eval_cooking_sharegpt_test_runtime": 29.147, "eval_cooking_sharegpt_test_samples_per_second": 6.862, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 2000 }, { "epoch": 0.8058681672025724, "grad_norm": 4.043004314873042, "learning_rate": 1.1093448028953886e-06, "loss": 0.4801, "step": 2005 }, { "epoch": 0.8078778135048231, "grad_norm": 3.864239177756637, "learning_rate": 1.0874081202168806e-06, "loss": 0.4985, "step": 2010 }, { "epoch": 0.809887459807074, "grad_norm": 3.822597851846882, "learning_rate": 1.065664010067799e-06, "loss": 0.4991, "step": 2015 }, { "epoch": 0.8118971061093248, "grad_norm": 3.857276347852937, "learning_rate": 1.0441135426641074e-06, "loss": 0.4637, "step": 2020 }, { "epoch": 0.8139067524115756, "grad_norm": 4.4337279604757, "learning_rate": 1.0227577786909332e-06, "loss": 0.5738, "step": 2025 }, { "epoch": 0.8159163987138264, "grad_norm": 3.6706223677728254, "learning_rate": 1.0015977692503632e-06, "loss": 0.5243, "step": 2030 }, { "epoch": 0.8179260450160771, "grad_norm": 4.517936325301277, "learning_rate": 9.806345558097053e-07, "loss": 0.5106, "step": 2035 }, { "epoch": 0.819935691318328, "grad_norm": 4.230476927090911, "learning_rate": 9.59869170150236e-07, "loss": 0.5789, "step": 2040 }, { "epoch": 0.8219453376205788, "grad_norm": 4.043867851008032, "learning_rate": 9.393026343164114e-07, "loss": 0.5238, "step": 2045 }, { "epoch": 0.8239549839228296, "grad_norm": 3.433756831229721, "learning_rate": 9.189359605655668e-07, "loss": 0.4972, "step": 2050 }, { "epoch": 0.8239549839228296, "eval_cooking_sharegpt_test_loss": 0.5249894261360168, "eval_cooking_sharegpt_test_runtime": 29.1006, "eval_cooking_sharegpt_test_samples_per_second": 6.873, "eval_cooking_sharegpt_test_steps_per_second": 0.344, "step": 2050 }, { "epoch": 0.8259646302250804, "grad_norm": 4.6112587559432, "learning_rate": 8.987701513180907e-07, "loss": 0.5356, "step": 2055 }, { "epoch": 0.8279742765273312, "grad_norm": 4.050624512976509, "learning_rate": 8.788061991080937e-07, "loss": 0.519, "step": 2060 }, { "epoch": 0.829983922829582, "grad_norm": 3.9107977100160123, "learning_rate": 8.590450865345512e-07, "loss": 0.5988, "step": 2065 }, { "epoch": 0.8319935691318328, "grad_norm": 4.197001300269493, "learning_rate": 8.394877862129446e-07, "loss": 0.4833, "step": 2070 }, { "epoch": 0.8340032154340836, "grad_norm": 4.289046142719884, "learning_rate": 8.201352607273877e-07, "loss": 0.5961, "step": 2075 }, { "epoch": 0.8360128617363344, "grad_norm": 4.718556012278376, "learning_rate": 8.009884625832531e-07, "loss": 0.5824, "step": 2080 }, { "epoch": 0.8380225080385852, "grad_norm": 4.096609919204836, "learning_rate": 7.82048334160288e-07, "loss": 0.5427, "step": 2085 }, { "epoch": 0.840032154340836, "grad_norm": 4.906614424028583, "learning_rate": 7.633158076662356e-07, "loss": 0.6349, "step": 2090 }, { "epoch": 0.8420418006430869, "grad_norm": 4.694926277231841, "learning_rate": 7.447918050909453e-07, "loss": 0.5806, "step": 2095 }, { "epoch": 0.8440514469453376, "grad_norm": 3.9943595352709735, "learning_rate": 7.264772381610041e-07, "loss": 0.5315, "step": 2100 }, { "epoch": 0.8440514469453376, "eval_cooking_sharegpt_test_loss": 0.5224404335021973, "eval_cooking_sharegpt_test_runtime": 29.1427, "eval_cooking_sharegpt_test_samples_per_second": 6.863, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 2100 }, { "epoch": 0.8460610932475884, "grad_norm": 3.8949136566294995, "learning_rate": 7.083730082948526e-07, "loss": 0.4789, "step": 2105 }, { "epoch": 0.8480707395498392, "grad_norm": 4.010454819797958, "learning_rate": 6.904800065584255e-07, "loss": 0.4783, "step": 2110 }, { "epoch": 0.85008038585209, "grad_norm": 4.746242279274611, "learning_rate": 6.727991136212931e-07, "loss": 0.546, "step": 2115 }, { "epoch": 0.8520900321543409, "grad_norm": 4.283372719633206, "learning_rate": 6.553311997133111e-07, "loss": 0.5003, "step": 2120 }, { "epoch": 0.8540996784565916, "grad_norm": 3.560283237601326, "learning_rate": 6.380771245817957e-07, "loss": 0.4842, "step": 2125 }, { "epoch": 0.8561093247588425, "grad_norm": 3.79934997015618, "learning_rate": 6.210377374492049e-07, "loss": 0.4678, "step": 2130 }, { "epoch": 0.8581189710610932, "grad_norm": 4.1885519690593185, "learning_rate": 6.042138769713413e-07, "loss": 0.5096, "step": 2135 }, { "epoch": 0.860128617363344, "grad_norm": 3.97970854328499, "learning_rate": 5.876063711960706e-07, "loss": 0.4941, "step": 2140 }, { "epoch": 0.8621382636655949, "grad_norm": 4.394448792908495, "learning_rate": 5.712160375225756e-07, "loss": 0.5573, "step": 2145 }, { "epoch": 0.8641479099678456, "grad_norm": 4.127283021998723, "learning_rate": 5.55043682661115e-07, "loss": 0.5165, "step": 2150 }, { "epoch": 0.8641479099678456, "eval_cooking_sharegpt_test_loss": 0.5204899311065674, "eval_cooking_sharegpt_test_runtime": 29.1702, "eval_cooking_sharegpt_test_samples_per_second": 6.856, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 2150 }, { "epoch": 0.8661575562700965, "grad_norm": 3.69665008473374, "learning_rate": 5.39090102593326e-07, "loss": 0.5317, "step": 2155 }, { "epoch": 0.8681672025723473, "grad_norm": 3.5634773461727223, "learning_rate": 5.233560825330387e-07, "loss": 0.5341, "step": 2160 }, { "epoch": 0.8701768488745981, "grad_norm": 3.450288752072914, "learning_rate": 5.0784239688764e-07, "loss": 0.4069, "step": 2165 }, { "epoch": 0.8721864951768489, "grad_norm": 4.428896151960056, "learning_rate": 4.925498092199449e-07, "loss": 0.5154, "step": 2170 }, { "epoch": 0.8741961414790996, "grad_norm": 3.6865145625805633, "learning_rate": 4.774790722106309e-07, "loss": 0.5408, "step": 2175 }, { "epoch": 0.8762057877813505, "grad_norm": 3.8579452526033293, "learning_rate": 4.6263092762117546e-07, "loss": 0.5051, "step": 2180 }, { "epoch": 0.8782154340836013, "grad_norm": 3.649942640889945, "learning_rate": 4.480061062573604e-07, "loss": 0.4879, "step": 2185 }, { "epoch": 0.8802250803858521, "grad_norm": 4.4196873140991935, "learning_rate": 4.336053279332941e-07, "loss": 0.5404, "step": 2190 }, { "epoch": 0.8822347266881029, "grad_norm": 3.8686685069961064, "learning_rate": 4.1942930143599014e-07, "loss": 0.4976, "step": 2195 }, { "epoch": 0.8842443729903537, "grad_norm": 4.070283502525038, "learning_rate": 4.0547872449047674e-07, "loss": 0.5689, "step": 2200 }, { "epoch": 0.8842443729903537, "eval_cooking_sharegpt_test_loss": 0.5188571810722351, "eval_cooking_sharegpt_test_runtime": 29.1495, "eval_cooking_sharegpt_test_samples_per_second": 6.861, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 2200 }, { "epoch": 0.8862540192926045, "grad_norm": 3.619269487786876, "learning_rate": 3.917542837254562e-07, "loss": 0.5528, "step": 2205 }, { "epoch": 0.8882636655948553, "grad_norm": 3.6657937873825137, "learning_rate": 3.7825665463951224e-07, "loss": 0.541, "step": 2210 }, { "epoch": 0.8902733118971061, "grad_norm": 3.4204803639440526, "learning_rate": 3.649865015678622e-07, "loss": 0.4743, "step": 2215 }, { "epoch": 0.8922829581993569, "grad_norm": 4.380202502107755, "learning_rate": 3.5194447764965887e-07, "loss": 0.546, "step": 2220 }, { "epoch": 0.8942926045016077, "grad_norm": 4.402021876559463, "learning_rate": 3.391312247958417e-07, "loss": 0.5446, "step": 2225 }, { "epoch": 0.8963022508038585, "grad_norm": 4.747626450695015, "learning_rate": 3.265473736575475e-07, "loss": 0.5655, "step": 2230 }, { "epoch": 0.8983118971061094, "grad_norm": 3.68468614908709, "learning_rate": 3.141935435950644e-07, "loss": 0.6147, "step": 2235 }, { "epoch": 0.9003215434083601, "grad_norm": 3.535700858658026, "learning_rate": 3.0207034264735756e-07, "loss": 0.5006, "step": 2240 }, { "epoch": 0.9023311897106109, "grad_norm": 3.848416834162935, "learning_rate": 2.901783675021297e-07, "loss": 0.5161, "step": 2245 }, { "epoch": 0.9043408360128617, "grad_norm": 3.730188112771092, "learning_rate": 2.785182034664641e-07, "loss": 0.5191, "step": 2250 }, { "epoch": 0.9043408360128617, "eval_cooking_sharegpt_test_loss": 0.5172947645187378, "eval_cooking_sharegpt_test_runtime": 29.1343, "eval_cooking_sharegpt_test_samples_per_second": 6.865, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 2250 }, { "epoch": 0.9063504823151125, "grad_norm": 4.611573430946674, "learning_rate": 2.670904244380068e-07, "loss": 0.5719, "step": 2255 }, { "epoch": 0.9083601286173634, "grad_norm": 3.4190333394817625, "learning_rate": 2.5589559287673205e-07, "loss": 0.5415, "step": 2260 }, { "epoch": 0.9103697749196141, "grad_norm": 3.7043226879021027, "learning_rate": 2.4493425977724585e-07, "loss": 0.5431, "step": 2265 }, { "epoch": 0.912379421221865, "grad_norm": 3.5791803584446877, "learning_rate": 2.3420696464167614e-07, "loss": 0.5563, "step": 2270 }, { "epoch": 0.9143890675241158, "grad_norm": 3.6102349926375292, "learning_rate": 2.237142354531141e-07, "loss": 0.5127, "step": 2275 }, { "epoch": 0.9163987138263665, "grad_norm": 4.04764189714931, "learning_rate": 2.1345658864962982e-07, "loss": 0.5183, "step": 2280 }, { "epoch": 0.9184083601286174, "grad_norm": 4.12380254309591, "learning_rate": 2.0343452909885487e-07, "loss": 0.5244, "step": 2285 }, { "epoch": 0.9204180064308681, "grad_norm": 2.9917916489817657, "learning_rate": 1.9364855007313e-07, "loss": 0.6096, "step": 2290 }, { "epoch": 0.922427652733119, "grad_norm": 4.666115781878385, "learning_rate": 1.84099133225229e-07, "loss": 0.5603, "step": 2295 }, { "epoch": 0.9244372990353698, "grad_norm": 4.1461620270350545, "learning_rate": 1.747867485646537e-07, "loss": 0.4639, "step": 2300 }, { "epoch": 0.9244372990353698, "eval_cooking_sharegpt_test_loss": 0.5164940357208252, "eval_cooking_sharegpt_test_runtime": 29.1503, "eval_cooking_sharegpt_test_samples_per_second": 6.861, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 2300 }, { "epoch": 0.9264469453376206, "grad_norm": 3.424368627441663, "learning_rate": 1.6571185443449934e-07, "loss": 0.4837, "step": 2305 }, { "epoch": 0.9284565916398714, "grad_norm": 3.3861373326786874, "learning_rate": 1.5687489748889228e-07, "loss": 0.5225, "step": 2310 }, { "epoch": 0.9304662379421221, "grad_norm": 4.0432557223167365, "learning_rate": 1.482763126710135e-07, "loss": 0.505, "step": 2315 }, { "epoch": 0.932475884244373, "grad_norm": 4.149053064263076, "learning_rate": 1.3991652319168436e-07, "loss": 0.5735, "step": 2320 }, { "epoch": 0.9344855305466238, "grad_norm": 4.436444939575187, "learning_rate": 1.3179594050854227e-07, "loss": 0.572, "step": 2325 }, { "epoch": 0.9364951768488746, "grad_norm": 4.509769479123379, "learning_rate": 1.239149643057841e-07, "loss": 0.4864, "step": 2330 }, { "epoch": 0.9385048231511254, "grad_norm": 4.668035022637942, "learning_rate": 1.1627398247449906e-07, "loss": 0.5206, "step": 2335 }, { "epoch": 0.9405144694533762, "grad_norm": 3.9506527567648697, "learning_rate": 1.08873371093573e-07, "loss": 0.6029, "step": 2340 }, { "epoch": 0.942524115755627, "grad_norm": 4.230839087401455, "learning_rate": 1.017134944111814e-07, "loss": 0.5838, "step": 2345 }, { "epoch": 0.9445337620578779, "grad_norm": 4.151545046693275, "learning_rate": 9.479470482686048e-08, "loss": 0.487, "step": 2350 }, { "epoch": 0.9445337620578779, "eval_cooking_sharegpt_test_loss": 0.5156686305999756, "eval_cooking_sharegpt_test_runtime": 29.1666, "eval_cooking_sharegpt_test_samples_per_second": 6.857, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 2350 }, { "epoch": 0.9465434083601286, "grad_norm": 4.142210605255353, "learning_rate": 8.811734287416274e-08, "loss": 0.5335, "step": 2355 }, { "epoch": 0.9485530546623794, "grad_norm": 3.850811332598397, "learning_rate": 8.168173720389472e-08, "loss": 0.5186, "step": 2360 }, { "epoch": 0.9505627009646302, "grad_norm": 3.597795214532193, "learning_rate": 7.548820456794448e-08, "loss": 0.4817, "step": 2365 }, { "epoch": 0.952572347266881, "grad_norm": 3.8186393981188114, "learning_rate": 6.953704980368958e-08, "loss": 0.4579, "step": 2370 }, { "epoch": 0.9545819935691319, "grad_norm": 4.174150444604705, "learning_rate": 6.382856581899133e-08, "loss": 0.5271, "step": 2375 }, { "epoch": 0.9565916398713826, "grad_norm": 4.2074795390285225, "learning_rate": 5.8363033577784055e-08, "loss": 0.5917, "step": 2380 }, { "epoch": 0.9586012861736335, "grad_norm": 3.9305248224525404, "learning_rate": 5.314072208623844e-08, "loss": 0.4585, "step": 2385 }, { "epoch": 0.9606109324758842, "grad_norm": 3.9808292965527037, "learning_rate": 4.81618883795304e-08, "loss": 0.4707, "step": 2390 }, { "epoch": 0.962620578778135, "grad_norm": 3.753591113764896, "learning_rate": 4.342677750918178e-08, "loss": 0.5422, "step": 2395 }, { "epoch": 0.9646302250803859, "grad_norm": 3.6932078456444373, "learning_rate": 3.8935622531006136e-08, "loss": 0.4571, "step": 2400 }, { "epoch": 0.9646302250803859, "eval_cooking_sharegpt_test_loss": 0.5150659680366516, "eval_cooking_sharegpt_test_runtime": 29.1762, "eval_cooking_sharegpt_test_samples_per_second": 6.855, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 2400 }, { "epoch": 0.9666398713826366, "grad_norm": 3.7902631860251472, "learning_rate": 3.468864449363119e-08, "loss": 0.472, "step": 2405 }, { "epoch": 0.9686495176848875, "grad_norm": 4.873378997207432, "learning_rate": 3.0686052427626454e-08, "loss": 0.5863, "step": 2410 }, { "epoch": 0.9706591639871383, "grad_norm": 3.41955309662197, "learning_rate": 2.692804333520982e-08, "loss": 0.5606, "step": 2415 }, { "epoch": 0.9726688102893891, "grad_norm": 3.9261487092055645, "learning_rate": 2.341480218055303e-08, "loss": 0.5137, "step": 2420 }, { "epoch": 0.9746784565916399, "grad_norm": 3.908466740159429, "learning_rate": 2.014650188067735e-08, "loss": 0.5444, "step": 2425 }, { "epoch": 0.9766881028938906, "grad_norm": 4.095217894179939, "learning_rate": 1.7123303296944226e-08, "loss": 0.5302, "step": 2430 }, { "epoch": 0.9786977491961415, "grad_norm": 3.3767195966017485, "learning_rate": 1.4345355227137203e-08, "loss": 0.5031, "step": 2435 }, { "epoch": 0.9807073954983923, "grad_norm": 3.8928483675929337, "learning_rate": 1.1812794398137762e-08, "loss": 0.5233, "step": 2440 }, { "epoch": 0.9827170418006431, "grad_norm": 4.410721124785433, "learning_rate": 9.525745459195712e-09, "loss": 0.5351, "step": 2445 }, { "epoch": 0.9847266881028939, "grad_norm": 4.367071424030973, "learning_rate": 7.484320975795766e-09, "loss": 0.45, "step": 2450 }, { "epoch": 0.9847266881028939, "eval_cooking_sharegpt_test_loss": 0.5149813294410706, "eval_cooking_sharegpt_test_runtime": 29.1432, "eval_cooking_sharegpt_test_samples_per_second": 6.863, "eval_cooking_sharegpt_test_steps_per_second": 0.343, "step": 2450 }, { "epoch": 0.9867363344051447, "grad_norm": 4.348603146645124, "learning_rate": 5.688621424115304e-09, "loss": 0.4929, "step": 2455 }, { "epoch": 0.9887459807073955, "grad_norm": 3.4464338928700866, "learning_rate": 4.1387351860799894e-09, "loss": 0.5602, "step": 2460 }, { "epoch": 0.9907556270096463, "grad_norm": 4.117129096266541, "learning_rate": 2.8347385450133715e-09, "loss": 0.4746, "step": 2465 }, { "epoch": 0.9927652733118971, "grad_norm": 3.7130286969995274, "learning_rate": 1.7766956818832116e-09, "loss": 0.4795, "step": 2470 }, { "epoch": 0.9947749196141479, "grad_norm": 3.8454728155367164, "learning_rate": 9.646586721412388e-10, "loss": 0.5535, "step": 2475 }, { "epoch": 0.9967845659163987, "grad_norm": 3.694656061423026, "learning_rate": 3.986674831607529e-10, "loss": 0.5087, "step": 2480 }, { "epoch": 0.9987942122186495, "grad_norm": 4.211180713946689, "learning_rate": 7.87499722693097e-11, "loss": 0.4456, "step": 2485 }, { "epoch": 1.0, "step": 2488, "total_flos": 19663349465088.0, "train_loss": 0.6474186228019249, "train_runtime": 24049.0144, "train_samples_per_second": 0.827, "train_steps_per_second": 0.103 } ], "logging_steps": 5, "max_steps": 2488, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 19663349465088.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }