{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 7670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1303780964797914, "grad_norm": 5.5828070640563965, "learning_rate": 1.290743155149935e-05, "loss": 7.7625, "step": 100 }, { "epoch": 0.2607561929595828, "grad_norm": 2.0663061141967773, "learning_rate": 2.5945241199478487e-05, "loss": 5.9575, "step": 200 }, { "epoch": 0.39113428943937417, "grad_norm": 0.9179163575172424, "learning_rate": 3.898305084745763e-05, "loss": 5.0578, "step": 300 }, { "epoch": 0.5215123859191656, "grad_norm": 0.9352315068244934, "learning_rate": 5.202086049543677e-05, "loss": 4.3731, "step": 400 }, { "epoch": 0.651890482398957, "grad_norm": 1.5910133123397827, "learning_rate": 6.505867014341591e-05, "loss": 4.2998, "step": 500 }, { "epoch": 0.7822685788787483, "grad_norm": 1.2363195419311523, "learning_rate": 7.809647979139506e-05, "loss": 4.2803, "step": 600 }, { "epoch": 0.9126466753585397, "grad_norm": 1.3712793588638306, "learning_rate": 9.113428943937419e-05, "loss": 4.1966, "step": 700 }, { "epoch": 1.0, "eval_loss": 3.70353627204895, "eval_runtime": 0.5442, "eval_samples_per_second": 626.647, "eval_steps_per_second": 79.02, "step": 767 }, { "epoch": 1.0430247718383312, "grad_norm": 1.1920268535614014, "learning_rate": 9.953643343473853e-05, "loss": 4.0134, "step": 800 }, { "epoch": 1.1734028683181226, "grad_norm": 1.2710912227630615, "learning_rate": 9.808778791829639e-05, "loss": 3.9779, "step": 900 }, { "epoch": 1.303780964797914, "grad_norm": 1.2132948637008667, "learning_rate": 9.663914240185426e-05, "loss": 3.9291, "step": 1000 }, { "epoch": 1.4341590612777053, "grad_norm": 2.312605857849121, "learning_rate": 9.519049688541214e-05, "loss": 3.9068, "step": 1100 }, { "epoch": 1.5645371577574967, "grad_norm": 1.2230041027069092, "learning_rate": 9.374185136897002e-05, "loss": 3.7321, "step": 1200 }, { "epoch": 1.694915254237288, "grad_norm": 2.18503999710083, "learning_rate": 9.229320585252789e-05, "loss": 3.861, "step": 1300 }, { "epoch": 1.8252933507170797, "grad_norm": 1.5787285566329956, "learning_rate": 9.084456033608576e-05, "loss": 3.7423, "step": 1400 }, { "epoch": 1.9556714471968708, "grad_norm": 1.13662588596344, "learning_rate": 8.939591481964363e-05, "loss": 3.5921, "step": 1500 }, { "epoch": 2.0, "eval_loss": 3.4353556632995605, "eval_runtime": 0.5399, "eval_samples_per_second": 631.585, "eval_steps_per_second": 79.643, "step": 1534 }, { "epoch": 2.0860495436766624, "grad_norm": 1.2315106391906738, "learning_rate": 8.79472693032015e-05, "loss": 3.6158, "step": 1600 }, { "epoch": 2.2164276401564535, "grad_norm": 1.7532989978790283, "learning_rate": 8.649862378675939e-05, "loss": 3.5386, "step": 1700 }, { "epoch": 2.346805736636245, "grad_norm": 1.2041728496551514, "learning_rate": 8.504997827031726e-05, "loss": 3.6382, "step": 1800 }, { "epoch": 2.4771838331160367, "grad_norm": 1.6889125108718872, "learning_rate": 8.360133275387513e-05, "loss": 3.7501, "step": 1900 }, { "epoch": 2.607561929595828, "grad_norm": 1.140479564666748, "learning_rate": 8.2152687237433e-05, "loss": 3.5677, "step": 2000 }, { "epoch": 2.737940026075619, "grad_norm": 1.1325074434280396, "learning_rate": 8.070404172099087e-05, "loss": 3.6123, "step": 2100 }, { "epoch": 2.8683181225554106, "grad_norm": 2.513603687286377, "learning_rate": 7.925539620454874e-05, "loss": 3.4136, "step": 2200 }, { "epoch": 2.9986962190352022, "grad_norm": 1.8777894973754883, "learning_rate": 7.780675068810663e-05, "loss": 3.5583, "step": 2300 }, { "epoch": 3.0, "eval_loss": 3.3344805240631104, "eval_runtime": 0.5422, "eval_samples_per_second": 628.862, "eval_steps_per_second": 79.299, "step": 2301 }, { "epoch": 3.1290743155149934, "grad_norm": 1.243633508682251, "learning_rate": 7.63581051716645e-05, "loss": 3.3331, "step": 2400 }, { "epoch": 3.259452411994785, "grad_norm": 1.1011683940887451, "learning_rate": 7.490945965522237e-05, "loss": 3.3973, "step": 2500 }, { "epoch": 3.389830508474576, "grad_norm": 1.1808488368988037, "learning_rate": 7.346081413878024e-05, "loss": 3.5169, "step": 2600 }, { "epoch": 3.5202086049543677, "grad_norm": 1.0594677925109863, "learning_rate": 7.201216862233811e-05, "loss": 3.3933, "step": 2700 }, { "epoch": 3.6505867014341593, "grad_norm": 1.4269371032714844, "learning_rate": 7.056352310589598e-05, "loss": 3.2929, "step": 2800 }, { "epoch": 3.7809647979139505, "grad_norm": 1.1755293607711792, "learning_rate": 6.911487758945387e-05, "loss": 3.3259, "step": 2900 }, { "epoch": 3.9113428943937416, "grad_norm": 1.0655115842819214, "learning_rate": 6.766623207301174e-05, "loss": 3.4441, "step": 3000 }, { "epoch": 4.0, "eval_loss": 3.2687411308288574, "eval_runtime": 0.5471, "eval_samples_per_second": 623.262, "eval_steps_per_second": 78.593, "step": 3068 }, { "epoch": 4.041720990873533, "grad_norm": 1.196297287940979, "learning_rate": 6.621758655656961e-05, "loss": 3.2682, "step": 3100 }, { "epoch": 4.172099087353325, "grad_norm": 3.8496177196502686, "learning_rate": 6.476894104012748e-05, "loss": 3.3519, "step": 3200 }, { "epoch": 4.302477183833116, "grad_norm": 1.2503979206085205, "learning_rate": 6.332029552368535e-05, "loss": 3.2477, "step": 3300 }, { "epoch": 4.432855280312907, "grad_norm": 2.556914806365967, "learning_rate": 6.187165000724323e-05, "loss": 3.3618, "step": 3400 }, { "epoch": 4.563233376792699, "grad_norm": 1.4650604724884033, "learning_rate": 6.04230044908011e-05, "loss": 3.3299, "step": 3500 }, { "epoch": 4.69361147327249, "grad_norm": 1.1140531301498413, "learning_rate": 5.897435897435898e-05, "loss": 3.2246, "step": 3600 }, { "epoch": 4.823989569752282, "grad_norm": 2.14631724357605, "learning_rate": 5.752571345791685e-05, "loss": 3.2511, "step": 3700 }, { "epoch": 4.9543676662320735, "grad_norm": 1.8546875715255737, "learning_rate": 5.6077067941474724e-05, "loss": 3.2367, "step": 3800 }, { "epoch": 5.0, "eval_loss": 3.2330377101898193, "eval_runtime": 0.5435, "eval_samples_per_second": 627.363, "eval_steps_per_second": 79.11, "step": 3835 }, { "epoch": 5.084745762711864, "grad_norm": 1.1964752674102783, "learning_rate": 5.46284224250326e-05, "loss": 3.1982, "step": 3900 }, { "epoch": 5.215123859191656, "grad_norm": 1.1956731081008911, "learning_rate": 5.3179776908590473e-05, "loss": 3.1122, "step": 4000 }, { "epoch": 5.345501955671447, "grad_norm": 1.7757279872894287, "learning_rate": 5.1731131392148345e-05, "loss": 3.1673, "step": 4100 }, { "epoch": 5.475880052151239, "grad_norm": 1.4564849138259888, "learning_rate": 5.028248587570622e-05, "loss": 3.2143, "step": 4200 }, { "epoch": 5.60625814863103, "grad_norm": 1.9355357885360718, "learning_rate": 4.883384035926409e-05, "loss": 3.1056, "step": 4300 }, { "epoch": 5.736636245110821, "grad_norm": 1.1551567316055298, "learning_rate": 4.738519484282196e-05, "loss": 3.1004, "step": 4400 }, { "epoch": 5.867014341590613, "grad_norm": 1.4927942752838135, "learning_rate": 4.593654932637984e-05, "loss": 3.0854, "step": 4500 }, { "epoch": 5.9973924380704045, "grad_norm": 2.1001391410827637, "learning_rate": 4.448790380993771e-05, "loss": 3.1018, "step": 4600 }, { "epoch": 6.0, "eval_loss": 3.213176727294922, "eval_runtime": 0.5419, "eval_samples_per_second": 629.279, "eval_steps_per_second": 79.352, "step": 4602 }, { "epoch": 6.127770534550195, "grad_norm": 2.20295786857605, "learning_rate": 4.303925829349558e-05, "loss": 3.0204, "step": 4700 }, { "epoch": 6.258148631029987, "grad_norm": 1.4948354959487915, "learning_rate": 4.159061277705346e-05, "loss": 3.089, "step": 4800 }, { "epoch": 6.388526727509778, "grad_norm": 1.5617390871047974, "learning_rate": 4.014196726061133e-05, "loss": 3.0053, "step": 4900 }, { "epoch": 6.51890482398957, "grad_norm": 1.6474759578704834, "learning_rate": 3.86933217441692e-05, "loss": 3.1227, "step": 5000 }, { "epoch": 6.6492829204693615, "grad_norm": 1.5542720556259155, "learning_rate": 3.724467622772708e-05, "loss": 3.0354, "step": 5100 }, { "epoch": 6.779661016949152, "grad_norm": 1.8775848150253296, "learning_rate": 3.579603071128495e-05, "loss": 3.0221, "step": 5200 }, { "epoch": 6.910039113428944, "grad_norm": 1.8488330841064453, "learning_rate": 3.434738519484282e-05, "loss": 3.0161, "step": 5300 }, { "epoch": 7.0, "eval_loss": 3.2017745971679688, "eval_runtime": 0.5395, "eval_samples_per_second": 632.058, "eval_steps_per_second": 79.702, "step": 5369 }, { "epoch": 7.040417209908735, "grad_norm": 1.7003470659255981, "learning_rate": 3.28987396784007e-05, "loss": 3.0074, "step": 5400 }, { "epoch": 7.170795306388527, "grad_norm": 1.5171571969985962, "learning_rate": 3.145009416195857e-05, "loss": 2.9606, "step": 5500 }, { "epoch": 7.301173402868318, "grad_norm": 2.7157018184661865, "learning_rate": 3.0001448645516445e-05, "loss": 2.9081, "step": 5600 }, { "epoch": 7.431551499348109, "grad_norm": 1.5487765073776245, "learning_rate": 2.855280312907432e-05, "loss": 3.0004, "step": 5700 }, { "epoch": 7.561929595827901, "grad_norm": 3.577585220336914, "learning_rate": 2.710415761263219e-05, "loss": 2.9776, "step": 5800 }, { "epoch": 7.6923076923076925, "grad_norm": 1.4749557971954346, "learning_rate": 2.5655512096190066e-05, "loss": 2.8426, "step": 5900 }, { "epoch": 7.822685788787483, "grad_norm": 1.6603304147720337, "learning_rate": 2.4206866579747937e-05, "loss": 3.068, "step": 6000 }, { "epoch": 7.953063885267275, "grad_norm": 1.6099046468734741, "learning_rate": 2.275822106330581e-05, "loss": 3.0699, "step": 6100 }, { "epoch": 8.0, "eval_loss": 3.1977951526641846, "eval_runtime": 0.5473, "eval_samples_per_second": 623.018, "eval_steps_per_second": 78.562, "step": 6136 }, { "epoch": 8.083441981747066, "grad_norm": 1.4448668956756592, "learning_rate": 2.1309575546863683e-05, "loss": 3.0584, "step": 6200 }, { "epoch": 8.213820078226858, "grad_norm": 1.8011598587036133, "learning_rate": 1.9860930030421558e-05, "loss": 2.924, "step": 6300 }, { "epoch": 8.34419817470665, "grad_norm": 1.7278883457183838, "learning_rate": 1.841228451397943e-05, "loss": 2.9542, "step": 6400 }, { "epoch": 8.474576271186441, "grad_norm": 1.625506043434143, "learning_rate": 1.6963638997537304e-05, "loss": 2.8727, "step": 6500 }, { "epoch": 8.604954367666233, "grad_norm": 1.808686375617981, "learning_rate": 1.551499348109518e-05, "loss": 2.9608, "step": 6600 }, { "epoch": 8.735332464146023, "grad_norm": 2.093956708908081, "learning_rate": 1.4066347964653052e-05, "loss": 2.9101, "step": 6700 }, { "epoch": 8.865710560625814, "grad_norm": 2.1681623458862305, "learning_rate": 1.2617702448210925e-05, "loss": 3.0384, "step": 6800 }, { "epoch": 8.996088657105606, "grad_norm": 1.7489618062973022, "learning_rate": 1.1169056931768796e-05, "loss": 2.8755, "step": 6900 }, { "epoch": 9.0, "eval_loss": 3.1901769638061523, "eval_runtime": 0.554, "eval_samples_per_second": 615.522, "eval_steps_per_second": 77.617, "step": 6903 }, { "epoch": 9.126466753585397, "grad_norm": 1.2608046531677246, "learning_rate": 9.72041141532667e-06, "loss": 2.8802, "step": 7000 }, { "epoch": 9.256844850065189, "grad_norm": 2.597111940383911, "learning_rate": 8.271765898884544e-06, "loss": 2.8229, "step": 7100 }, { "epoch": 9.38722294654498, "grad_norm": 1.8257884979248047, "learning_rate": 6.823120382442416e-06, "loss": 2.958, "step": 7200 }, { "epoch": 9.517601043024772, "grad_norm": 2.310960292816162, "learning_rate": 5.37447486600029e-06, "loss": 2.807, "step": 7300 }, { "epoch": 9.647979139504564, "grad_norm": 1.5776140689849854, "learning_rate": 3.925829349558164e-06, "loss": 2.9398, "step": 7400 }, { "epoch": 9.778357235984355, "grad_norm": 1.8965390920639038, "learning_rate": 2.4771838331160366e-06, "loss": 2.842, "step": 7500 }, { "epoch": 9.908735332464147, "grad_norm": 1.6865416765213013, "learning_rate": 1.0285383166739098e-06, "loss": 2.8051, "step": 7600 }, { "epoch": 10.0, "eval_loss": 3.1988234519958496, "eval_runtime": 0.5524, "eval_samples_per_second": 617.283, "eval_steps_per_second": 77.839, "step": 7670 } ], "logging_steps": 100, "max_steps": 7670, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1023067594874880.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }