{ "best_metric": null, "best_model_checkpoint": null, "epoch": 39.95179987797437, "eval_steps": 100.0, "global_step": 32760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6101281269066504, "grad_norm": 41.414833068847656, "learning_rate": 1.188e-06, "loss": 21.6118, "step": 500 }, { "epoch": 1.0, "eval_cer": 1.1283783783783783, "eval_loss": 8.950940132141113, "eval_runtime": 87.6114, "eval_samples_per_second": 78.072, "eval_steps_per_second": 9.759, "eval_wer": 1.0, "step": 820 }, { "epoch": 1.2196461256863942, "grad_norm": 35.01482009887695, "learning_rate": 2.3880000000000003e-06, "loss": 9.1302, "step": 1000 }, { "epoch": 1.8297742525930445, "grad_norm": 27.13286590576172, "learning_rate": 3.588e-06, "loss": 7.4878, "step": 1500 }, { "epoch": 2.0, "eval_cer": 1.1284376481744902, "eval_loss": 6.385559558868408, "eval_runtime": 76.6753, "eval_samples_per_second": 89.207, "eval_steps_per_second": 11.151, "eval_wer": 1.0, "step": 1640 }, { "epoch": 2.4392922513727884, "grad_norm": 13.3228120803833, "learning_rate": 4.788e-06, "loss": 5.9004, "step": 2000 }, { "epoch": 3.0, "eval_cer": 1.1284376481744902, "eval_loss": 3.928687572479248, "eval_runtime": 74.707, "eval_samples_per_second": 91.558, "eval_steps_per_second": 11.445, "eval_wer": 1.0, "step": 2460 }, { "epoch": 3.048810250152532, "grad_norm": 6.096343994140625, "learning_rate": 5.988e-06, "loss": 4.3944, "step": 2500 }, { "epoch": 3.6589383770591826, "grad_norm": 3.3120861053466797, "learning_rate": 7.1880000000000005e-06, "loss": 3.4882, "step": 3000 }, { "epoch": 4.0, "eval_cer": 1.1284376481744902, "eval_loss": 2.909001588821411, "eval_runtime": 77.6836, "eval_samples_per_second": 88.05, "eval_steps_per_second": 11.006, "eval_wer": 1.0, "step": 3280 }, { "epoch": 4.268456375838926, "grad_norm": 1.7695776224136353, "learning_rate": 8.388e-06, "loss": 2.963, "step": 3500 }, { "epoch": 4.878584502745577, "grad_norm": 1.4808818101882935, "learning_rate": 9.588e-06, "loss": 2.6365, "step": 4000 }, { "epoch": 5.0, "eval_cer": 1.1284376481744902, "eval_loss": 2.3862359523773193, "eval_runtime": 88.7308, "eval_samples_per_second": 77.087, "eval_steps_per_second": 9.636, "eval_wer": 1.0, "step": 4100 }, { "epoch": 5.48810250152532, "grad_norm": 5.100676536560059, "learning_rate": 1.0787999999999999e-05, "loss": 2.2815, "step": 4500 }, { "epoch": 6.0, "eval_cer": 1.2580903271692745, "eval_loss": 1.4563010931015015, "eval_runtime": 88.3297, "eval_samples_per_second": 77.437, "eval_steps_per_second": 9.68, "eval_wer": 1.0, "step": 4920 }, { "epoch": 6.097620500305064, "grad_norm": 2.4356448650360107, "learning_rate": 1.1988000000000001e-05, "loss": 1.7295, "step": 5000 }, { "epoch": 6.707748627211714, "grad_norm": 4.461264133453369, "learning_rate": 1.3188e-05, "loss": 1.0892, "step": 5500 }, { "epoch": 7.0, "eval_cer": 1.0564248458985301, "eval_loss": 0.42875832319259644, "eval_runtime": 102.6377, "eval_samples_per_second": 66.642, "eval_steps_per_second": 8.33, "eval_wer": 0.9998538011695907, "step": 5740 }, { "epoch": 7.317266625991458, "grad_norm": 1.982809066772461, "learning_rate": 1.4388000000000002e-05, "loss": 0.6362, "step": 6000 }, { "epoch": 7.927394752898109, "grad_norm": 2.818439483642578, "learning_rate": 1.5588e-05, "loss": 0.4741, "step": 6500 }, { "epoch": 8.0, "eval_cer": 1.0188477951635846, "eval_loss": 0.18962983787059784, "eval_runtime": 91.2741, "eval_samples_per_second": 74.939, "eval_steps_per_second": 9.367, "eval_wer": 0.9994152046783625, "step": 6560 }, { "epoch": 8.536912751677852, "grad_norm": 11.691442489624023, "learning_rate": 1.6788e-05, "loss": 0.3822, "step": 7000 }, { "epoch": 9.0, "eval_cer": 1.0211593172119489, "eval_loss": 0.17385753989219666, "eval_runtime": 89.6334, "eval_samples_per_second": 76.311, "eval_steps_per_second": 9.539, "eval_wer": 0.9989766081871345, "step": 7380 }, { "epoch": 9.146430750457595, "grad_norm": 5.473918914794922, "learning_rate": 1.7988e-05, "loss": 0.34, "step": 7500 }, { "epoch": 9.756558877364247, "grad_norm": 3.1958296298980713, "learning_rate": 1.9188e-05, "loss": 0.3101, "step": 8000 }, { "epoch": 10.0, "eval_cer": 1.0287754860123282, "eval_loss": 0.17125986516475677, "eval_runtime": 101.7075, "eval_samples_per_second": 67.252, "eval_steps_per_second": 8.406, "eval_wer": 0.9988304093567252, "step": 8200 }, { "epoch": 10.36607687614399, "grad_norm": 3.826345443725586, "learning_rate": 2.0388e-05, "loss": 0.2837, "step": 8500 }, { "epoch": 10.97620500305064, "grad_norm": 4.758739471435547, "learning_rate": 2.1588e-05, "loss": 0.2644, "step": 9000 }, { "epoch": 11.0, "eval_cer": 1.0217816500711236, "eval_loss": 0.11893380433320999, "eval_runtime": 97.2791, "eval_samples_per_second": 70.313, "eval_steps_per_second": 8.789, "eval_wer": 0.9988304093567252, "step": 9020 }, { "epoch": 11.585723001830385, "grad_norm": 5.496431350708008, "learning_rate": 2.2788000000000003e-05, "loss": 0.2476, "step": 9500 }, { "epoch": 12.0, "eval_cer": 1.0172178757705073, "eval_loss": 0.05396101996302605, "eval_runtime": 100.4809, "eval_samples_per_second": 68.073, "eval_steps_per_second": 8.509, "eval_wer": 0.9988304093567252, "step": 9840 }, { "epoch": 12.195241000610128, "grad_norm": 3.7265303134918213, "learning_rate": 2.3988e-05, "loss": 0.2479, "step": 10000 }, { "epoch": 12.805369127516778, "grad_norm": 1.958551287651062, "learning_rate": 2.5188e-05, "loss": 0.2302, "step": 10500 }, { "epoch": 13.0, "eval_cer": 1.0152027027027026, "eval_loss": 0.029684651643037796, "eval_runtime": 80.6829, "eval_samples_per_second": 84.776, "eval_steps_per_second": 10.597, "eval_wer": 0.9988304093567252, "step": 10660 }, { "epoch": 13.414887126296522, "grad_norm": 7.186318397521973, "learning_rate": 2.6388000000000002e-05, "loss": 0.2182, "step": 11000 }, { "epoch": 14.0, "eval_cer": 1.0188477951635846, "eval_loss": 0.043104566633701324, "eval_runtime": 66.2916, "eval_samples_per_second": 103.18, "eval_steps_per_second": 12.898, "eval_wer": 0.9988304093567252, "step": 11480 }, { "epoch": 14.024405125076266, "grad_norm": 5.963690757751465, "learning_rate": 2.7588e-05, "loss": 0.2228, "step": 11500 }, { "epoch": 14.634533251982916, "grad_norm": 5.593617916107178, "learning_rate": 2.8788e-05, "loss": 0.2154, "step": 12000 }, { "epoch": 15.0, "eval_cer": 1.0157657657657657, "eval_loss": 0.017413927242159843, "eval_runtime": 61.3441, "eval_samples_per_second": 111.502, "eval_steps_per_second": 13.938, "eval_wer": 0.9988304093567252, "step": 12300 }, { "epoch": 15.24405125076266, "grad_norm": 3.418288230895996, "learning_rate": 2.99856e-05, "loss": 0.22, "step": 12500 }, { "epoch": 15.854179377669311, "grad_norm": 5.651284694671631, "learning_rate": 2.9956190887883116e-05, "loss": 0.2072, "step": 13000 }, { "epoch": 16.0, "eval_cer": 1.0153805120910384, "eval_loss": 0.015717793256044388, "eval_runtime": 60.2687, "eval_samples_per_second": 113.492, "eval_steps_per_second": 14.186, "eval_wer": 0.9991228070175439, "step": 13120 }, { "epoch": 16.463697376449055, "grad_norm": 4.6377177238464355, "learning_rate": 2.982253104799521e-05, "loss": 0.1986, "step": 13500 }, { "epoch": 17.0, "eval_cer": 1.0149359886201992, "eval_loss": 0.027271753177046776, "eval_runtime": 61.2915, "eval_samples_per_second": 111.598, "eval_steps_per_second": 13.95, "eval_wer": 0.9989766081871345, "step": 13940 }, { "epoch": 17.0732153752288, "grad_norm": 0.13217875361442566, "learning_rate": 2.9599814696946643e-05, "loss": 0.2056, "step": 14000 }, { "epoch": 17.683343502135447, "grad_norm": 0.8718374371528625, "learning_rate": 2.9289379955813937e-05, "loss": 0.1919, "step": 14500 }, { "epoch": 18.0, "eval_cer": 1.014461830251304, "eval_loss": 0.01104104146361351, "eval_runtime": 61.3175, "eval_samples_per_second": 111.551, "eval_steps_per_second": 13.944, "eval_wer": 0.9988304093567252, "step": 14760 }, { "epoch": 18.29286150091519, "grad_norm": 6.6985273361206055, "learning_rate": 2.8893091974003682e-05, "loss": 0.1776, "step": 15000 }, { "epoch": 18.902989627821842, "grad_norm": 6.272310733795166, "learning_rate": 2.841333172308954e-05, "loss": 0.1763, "step": 15500 }, { "epoch": 19.0, "eval_cer": 1.014432195353248, "eval_loss": 0.014498263597488403, "eval_runtime": 61.4346, "eval_samples_per_second": 111.338, "eval_steps_per_second": 13.917, "eval_wer": 0.9988304093567252, "step": 15580 }, { "epoch": 19.512507626601586, "grad_norm": 1.4895048141479492, "learning_rate": 2.785418066112353e-05, "loss": 0.1759, "step": 16000 }, { "epoch": 20.0, "eval_cer": 1.016714082503556, "eval_loss": 0.07009146362543106, "eval_runtime": 62.172, "eval_samples_per_second": 110.017, "eval_steps_per_second": 13.752, "eval_wer": 0.9988304093567252, "step": 16400 }, { "epoch": 20.12202562538133, "grad_norm": 4.595536231994629, "learning_rate": 2.7216758309791792e-05, "loss": 0.1829, "step": 16500 }, { "epoch": 20.73215375228798, "grad_norm": 6.24274206161499, "learning_rate": 2.6505935412410244e-05, "loss": 0.1673, "step": 17000 }, { "epoch": 21.0, "eval_cer": 1.0136024182076813, "eval_loss": 0.012840056791901588, "eval_runtime": 62.0714, "eval_samples_per_second": 110.196, "eval_steps_per_second": 13.774, "eval_wer": 0.9988304093567252, "step": 17220 }, { "epoch": 21.341671751067725, "grad_norm": 4.522493362426758, "learning_rate": 2.5725982724566367e-05, "loss": 0.162, "step": 17500 }, { "epoch": 21.951799877974373, "grad_norm": 6.085182189941406, "learning_rate": 2.4881586346429215e-05, "loss": 0.157, "step": 18000 }, { "epoch": 22.0, "eval_cer": 1.014432195353248, "eval_loss": 0.013646350242197514, "eval_runtime": 61.3925, "eval_samples_per_second": 111.414, "eval_steps_per_second": 13.927, "eval_wer": 0.9988304093567252, "step": 18040 }, { "epoch": 22.561317876754117, "grad_norm": 0.7235033512115479, "learning_rate": 2.3977819567791885e-05, "loss": 0.1642, "step": 18500 }, { "epoch": 23.0, "eval_cer": 1.0074383594120435, "eval_loss": 0.1374504715204239, "eval_runtime": 59.8534, "eval_samples_per_second": 114.279, "eval_steps_per_second": 14.285, "eval_wer": 0.9988304093567252, "step": 18860 }, { "epoch": 23.17083587553386, "grad_norm": 1.7966482639312744, "learning_rate": 2.3022077859705676e-05, "loss": 0.155, "step": 19000 }, { "epoch": 23.780964002440513, "grad_norm": 4.931192874908447, "learning_rate": 2.2016274790151287e-05, "loss": 0.1529, "step": 19500 }, { "epoch": 24.0, "eval_cer": 1.0139580369843528, "eval_loss": 0.013449819758534431, "eval_runtime": 58.9726, "eval_samples_per_second": 115.986, "eval_steps_per_second": 14.498, "eval_wer": 0.9988304093567252, "step": 19680 }, { "epoch": 24.390482001220256, "grad_norm": 0.1271979659795761, "learning_rate": 2.0968316642484253e-05, "loss": 0.1501, "step": 20000 }, { "epoch": 25.0, "grad_norm": 3.955129623413086, "learning_rate": 1.9884499743301647e-05, "loss": 0.1511, "step": 20500 }, { "epoch": 25.0, "eval_cer": 1.013839497392129, "eval_loss": 0.007342994213104248, "eval_runtime": 61.5659, "eval_samples_per_second": 111.101, "eval_steps_per_second": 13.888, "eval_wer": 0.9989766081871345, "step": 20500 }, { "epoch": 25.610128126906652, "grad_norm": 4.402960300445557, "learning_rate": 1.8771335865219483e-05, "loss": 0.1415, "step": 21000 }, { "epoch": 26.0, "eval_cer": 1.0136320531057372, "eval_loss": 0.006196827162057161, "eval_runtime": 63.0154, "eval_samples_per_second": 108.545, "eval_steps_per_second": 13.568, "eval_wer": 0.9988304093567252, "step": 21320 }, { "epoch": 26.219646125686396, "grad_norm": 5.023196220397949, "learning_rate": 1.7635513102937044e-05, "loss": 0.1329, "step": 21500 }, { "epoch": 26.829774252593044, "grad_norm": 4.451430797576904, "learning_rate": 1.6483855689925534e-05, "loss": 0.1338, "step": 22000 }, { "epoch": 27.0, "eval_cer": 1.0135135135135136, "eval_loss": 0.006267285440117121, "eval_runtime": 62.0749, "eval_samples_per_second": 110.189, "eval_steps_per_second": 13.774, "eval_wer": 0.9988304093567252, "step": 22140 }, { "epoch": 27.439292251372787, "grad_norm": 0.7792350649833679, "learning_rate": 1.5325608410059234e-05, "loss": 0.1373, "step": 22500 }, { "epoch": 28.0, "eval_cer": 1.01309862494073, "eval_loss": 0.013936175964772701, "eval_runtime": 59.8236, "eval_samples_per_second": 114.336, "eval_steps_per_second": 14.292, "eval_wer": 0.9988304093567252, "step": 22960 }, { "epoch": 28.04881025015253, "grad_norm": 1.8355393409729004, "learning_rate": 1.4163090284146517e-05, "loss": 0.128, "step": 23000 }, { "epoch": 28.658938377059183, "grad_norm": 1.9466981887817383, "learning_rate": 1.3005600466773616e-05, "loss": 0.1224, "step": 23500 }, { "epoch": 29.0, "eval_cer": 1.0135135135135136, "eval_loss": 0.0075889285653829575, "eval_runtime": 61.2053, "eval_samples_per_second": 111.755, "eval_steps_per_second": 13.969, "eval_wer": 0.9988304093567252, "step": 23780 }, { "epoch": 29.268456375838927, "grad_norm": 2.3899123668670654, "learning_rate": 1.186009337109073e-05, "loss": 0.1245, "step": 24000 }, { "epoch": 29.878584502745575, "grad_norm": 0.28489622473716736, "learning_rate": 1.0733451415837331e-05, "loss": 0.1217, "step": 24500 }, { "epoch": 30.0, "eval_cer": 1.013869132290185, "eval_loss": 0.01911773718893528, "eval_runtime": 61.6884, "eval_samples_per_second": 110.88, "eval_steps_per_second": 13.86, "eval_wer": 0.9988304093567252, "step": 24600 }, { "epoch": 30.48810250152532, "grad_norm": 6.990693092346191, "learning_rate": 9.632443674496023e-06, "loss": 0.119, "step": 25000 }, { "epoch": 31.0, "eval_cer": 1.0134542437174017, "eval_loss": 0.026616927236318588, "eval_runtime": 61.1134, "eval_samples_per_second": 111.923, "eval_steps_per_second": 13.99, "eval_wer": 0.9988304093567252, "step": 25420 }, { "epoch": 31.097620500305062, "grad_norm": 1.3103210926055908, "learning_rate": 8.563685205445662e-06, "loss": 0.1102, "step": 25500 }, { "epoch": 31.707748627211714, "grad_norm": 3.154486656188965, "learning_rate": 7.533597307465705e-06, "loss": 0.1122, "step": 26000 }, { "epoch": 32.0, "eval_cer": 1.0135727833096255, "eval_loss": 0.006075156386941671, "eval_runtime": 58.6707, "eval_samples_per_second": 116.583, "eval_steps_per_second": 14.573, "eval_wer": 0.9988304093567252, "step": 26240 }, { "epoch": 32.31726662599146, "grad_norm": 0.17011937499046326, "learning_rate": 6.550290643366546e-06, "loss": 0.11, "step": 26500 }, { "epoch": 32.92739475289811, "grad_norm": 0.06195596233010292, "learning_rate": 5.615733971162722e-06, "loss": 0.1077, "step": 27000 }, { "epoch": 33.0, "eval_cer": 1.0133949739212897, "eval_loss": 0.00502835214138031, "eval_runtime": 61.3153, "eval_samples_per_second": 111.555, "eval_steps_per_second": 13.944, "eval_wer": 0.9988304093567252, "step": 27060 }, { "epoch": 33.53691275167785, "grad_norm": 5.214883804321289, "learning_rate": 4.737559706904321e-06, "loss": 0.1058, "step": 27500 }, { "epoch": 34.0, "eval_cer": 1.0134838786154576, "eval_loss": 0.0068115307949483395, "eval_runtime": 61.475, "eval_samples_per_second": 111.265, "eval_steps_per_second": 13.908, "eval_wer": 0.9988304093567252, "step": 27880 }, { "epoch": 34.1464307504576, "grad_norm": 0.43510904908180237, "learning_rate": 3.921044084178765e-06, "loss": 0.1088, "step": 28000 }, { "epoch": 34.75655887736425, "grad_norm": 1.829236388206482, "learning_rate": 3.1725232868909293e-06, "loss": 0.0992, "step": 28500 }, { "epoch": 35.0, "eval_cer": 1.0135431484115696, "eval_loss": 0.005784249398857355, "eval_runtime": 61.4985, "eval_samples_per_second": 111.222, "eval_steps_per_second": 13.903, "eval_wer": 0.9988304093567252, "step": 28700 }, { "epoch": 35.36607687614399, "grad_norm": 2.7739064693450928, "learning_rate": 2.493495989231198e-06, "loss": 0.1082, "step": 29000 }, { "epoch": 35.97620500305064, "grad_norm": 3.2704861164093018, "learning_rate": 1.8896100834437107e-06, "loss": 0.0977, "step": 29500 }, { "epoch": 36.0, "eval_cer": 1.0134542437174017, "eval_loss": 0.006506683304905891, "eval_runtime": 60.4558, "eval_samples_per_second": 113.14, "eval_steps_per_second": 14.143, "eval_wer": 0.9988304093567252, "step": 29520 }, { "epoch": 36.58572300183038, "grad_norm": 1.9117754697799683, "learning_rate": 1.3644938278693997e-06, "loss": 0.093, "step": 30000 }, { "epoch": 37.0, "eval_cer": 1.0133060692271219, "eval_loss": 0.005826306063681841, "eval_runtime": 61.6467, "eval_samples_per_second": 110.955, "eval_steps_per_second": 13.869, "eval_wer": 0.9988304093567252, "step": 30340 }, { "epoch": 37.195241000610125, "grad_norm": 0.03339027985930443, "learning_rate": 9.213022182052699e-07, "loss": 0.099, "step": 30500 }, { "epoch": 37.80536912751678, "grad_norm": 0.10268145054578781, "learning_rate": 5.626980317060648e-07, "loss": 0.0959, "step": 31000 }, { "epoch": 38.0, "eval_cer": 1.013276434329066, "eval_loss": 0.005811047740280628, "eval_runtime": 60.4371, "eval_samples_per_second": 113.175, "eval_steps_per_second": 14.147, "eval_wer": 0.9988304093567252, "step": 31160 }, { "epoch": 38.41488712629652, "grad_norm": 1.405590534210205, "learning_rate": 2.912918111057888e-07, "loss": 0.093, "step": 31500 }, { "epoch": 39.0, "eval_cer": 1.0133060692271219, "eval_loss": 0.005707182455807924, "eval_runtime": 57.274, "eval_samples_per_second": 119.426, "eval_steps_per_second": 14.928, "eval_wer": 0.9988304093567252, "step": 31980 }, { "epoch": 39.024405125076264, "grad_norm": 0.9183106422424316, "learning_rate": 1.0762696080869105e-07, "loss": 0.0983, "step": 32000 }, { "epoch": 39.634533251982916, "grad_norm": 15.433984756469727, "learning_rate": 1.3536859442666582e-08, "loss": 0.0951, "step": 32500 }, { "epoch": 39.95179987797437, "eval_cer": 1.0133357041251778, "eval_loss": 0.005610902328044176, "eval_runtime": 56.5957, "eval_samples_per_second": 120.857, "eval_steps_per_second": 15.107, "eval_wer": 0.9988304093567252, "step": 32760 }, { "epoch": 39.95179987797437, "step": 32760, "total_flos": 1.3506323652949156e+19, "train_loss": 1.1069419495788686, "train_runtime": 26304.0609, "train_samples_per_second": 79.744, "train_steps_per_second": 1.245 } ], "logging_steps": 500, "max_steps": 32760, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3506323652949156e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }