| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 7670, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.1303780964797914, |
| "grad_norm": 5.5828070640563965, |
| "learning_rate": 1.290743155149935e-05, |
| "loss": 7.7625, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2607561929595828, |
| "grad_norm": 2.0663061141967773, |
| "learning_rate": 2.5945241199478487e-05, |
| "loss": 5.9575, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.39113428943937417, |
| "grad_norm": 0.9179163575172424, |
| "learning_rate": 3.898305084745763e-05, |
| "loss": 5.0578, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5215123859191656, |
| "grad_norm": 0.9352315068244934, |
| "learning_rate": 5.202086049543677e-05, |
| "loss": 4.3731, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.651890482398957, |
| "grad_norm": 1.5910133123397827, |
| "learning_rate": 6.505867014341591e-05, |
| "loss": 4.2998, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7822685788787483, |
| "grad_norm": 1.2363195419311523, |
| "learning_rate": 7.809647979139506e-05, |
| "loss": 4.2803, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9126466753585397, |
| "grad_norm": 1.3712793588638306, |
| "learning_rate": 9.113428943937419e-05, |
| "loss": 4.1966, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 3.70353627204895, |
| "eval_runtime": 0.5442, |
| "eval_samples_per_second": 626.647, |
| "eval_steps_per_second": 79.02, |
| "step": 767 |
| }, |
| { |
| "epoch": 1.0430247718383312, |
| "grad_norm": 1.1920268535614014, |
| "learning_rate": 9.953643343473853e-05, |
| "loss": 4.0134, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.1734028683181226, |
| "grad_norm": 1.2710912227630615, |
| "learning_rate": 9.808778791829639e-05, |
| "loss": 3.9779, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.303780964797914, |
| "grad_norm": 1.2132948637008667, |
| "learning_rate": 9.663914240185426e-05, |
| "loss": 3.9291, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.4341590612777053, |
| "grad_norm": 2.312605857849121, |
| "learning_rate": 9.519049688541214e-05, |
| "loss": 3.9068, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.5645371577574967, |
| "grad_norm": 1.2230041027069092, |
| "learning_rate": 9.374185136897002e-05, |
| "loss": 3.7321, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.694915254237288, |
| "grad_norm": 2.18503999710083, |
| "learning_rate": 9.229320585252789e-05, |
| "loss": 3.861, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.8252933507170797, |
| "grad_norm": 1.5787285566329956, |
| "learning_rate": 9.084456033608576e-05, |
| "loss": 3.7423, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.9556714471968708, |
| "grad_norm": 1.13662588596344, |
| "learning_rate": 8.939591481964363e-05, |
| "loss": 3.5921, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 3.4353556632995605, |
| "eval_runtime": 0.5399, |
| "eval_samples_per_second": 631.585, |
| "eval_steps_per_second": 79.643, |
| "step": 1534 |
| }, |
| { |
| "epoch": 2.0860495436766624, |
| "grad_norm": 1.2315106391906738, |
| "learning_rate": 8.79472693032015e-05, |
| "loss": 3.6158, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.2164276401564535, |
| "grad_norm": 1.7532989978790283, |
| "learning_rate": 8.649862378675939e-05, |
| "loss": 3.5386, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.346805736636245, |
| "grad_norm": 1.2041728496551514, |
| "learning_rate": 8.504997827031726e-05, |
| "loss": 3.6382, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.4771838331160367, |
| "grad_norm": 1.6889125108718872, |
| "learning_rate": 8.360133275387513e-05, |
| "loss": 3.7501, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.607561929595828, |
| "grad_norm": 1.140479564666748, |
| "learning_rate": 8.2152687237433e-05, |
| "loss": 3.5677, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.737940026075619, |
| "grad_norm": 1.1325074434280396, |
| "learning_rate": 8.070404172099087e-05, |
| "loss": 3.6123, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.8683181225554106, |
| "grad_norm": 2.513603687286377, |
| "learning_rate": 7.925539620454874e-05, |
| "loss": 3.4136, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.9986962190352022, |
| "grad_norm": 1.8777894973754883, |
| "learning_rate": 7.780675068810663e-05, |
| "loss": 3.5583, |
| "step": 2300 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 3.3344805240631104, |
| "eval_runtime": 0.5422, |
| "eval_samples_per_second": 628.862, |
| "eval_steps_per_second": 79.299, |
| "step": 2301 |
| }, |
| { |
| "epoch": 3.1290743155149934, |
| "grad_norm": 1.243633508682251, |
| "learning_rate": 7.63581051716645e-05, |
| "loss": 3.3331, |
| "step": 2400 |
| }, |
| { |
| "epoch": 3.259452411994785, |
| "grad_norm": 1.1011683940887451, |
| "learning_rate": 7.490945965522237e-05, |
| "loss": 3.3973, |
| "step": 2500 |
| }, |
| { |
| "epoch": 3.389830508474576, |
| "grad_norm": 1.1808488368988037, |
| "learning_rate": 7.346081413878024e-05, |
| "loss": 3.5169, |
| "step": 2600 |
| }, |
| { |
| "epoch": 3.5202086049543677, |
| "grad_norm": 1.0594677925109863, |
| "learning_rate": 7.201216862233811e-05, |
| "loss": 3.3933, |
| "step": 2700 |
| }, |
| { |
| "epoch": 3.6505867014341593, |
| "grad_norm": 1.4269371032714844, |
| "learning_rate": 7.056352310589598e-05, |
| "loss": 3.2929, |
| "step": 2800 |
| }, |
| { |
| "epoch": 3.7809647979139505, |
| "grad_norm": 1.1755293607711792, |
| "learning_rate": 6.911487758945387e-05, |
| "loss": 3.3259, |
| "step": 2900 |
| }, |
| { |
| "epoch": 3.9113428943937416, |
| "grad_norm": 1.0655115842819214, |
| "learning_rate": 6.766623207301174e-05, |
| "loss": 3.4441, |
| "step": 3000 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 3.2687411308288574, |
| "eval_runtime": 0.5471, |
| "eval_samples_per_second": 623.262, |
| "eval_steps_per_second": 78.593, |
| "step": 3068 |
| }, |
| { |
| "epoch": 4.041720990873533, |
| "grad_norm": 1.196297287940979, |
| "learning_rate": 6.621758655656961e-05, |
| "loss": 3.2682, |
| "step": 3100 |
| }, |
| { |
| "epoch": 4.172099087353325, |
| "grad_norm": 3.8496177196502686, |
| "learning_rate": 6.476894104012748e-05, |
| "loss": 3.3519, |
| "step": 3200 |
| }, |
| { |
| "epoch": 4.302477183833116, |
| "grad_norm": 1.2503979206085205, |
| "learning_rate": 6.332029552368535e-05, |
| "loss": 3.2477, |
| "step": 3300 |
| }, |
| { |
| "epoch": 4.432855280312907, |
| "grad_norm": 2.556914806365967, |
| "learning_rate": 6.187165000724323e-05, |
| "loss": 3.3618, |
| "step": 3400 |
| }, |
| { |
| "epoch": 4.563233376792699, |
| "grad_norm": 1.4650604724884033, |
| "learning_rate": 6.04230044908011e-05, |
| "loss": 3.3299, |
| "step": 3500 |
| }, |
| { |
| "epoch": 4.69361147327249, |
| "grad_norm": 1.1140531301498413, |
| "learning_rate": 5.897435897435898e-05, |
| "loss": 3.2246, |
| "step": 3600 |
| }, |
| { |
| "epoch": 4.823989569752282, |
| "grad_norm": 2.14631724357605, |
| "learning_rate": 5.752571345791685e-05, |
| "loss": 3.2511, |
| "step": 3700 |
| }, |
| { |
| "epoch": 4.9543676662320735, |
| "grad_norm": 1.8546875715255737, |
| "learning_rate": 5.6077067941474724e-05, |
| "loss": 3.2367, |
| "step": 3800 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 3.2330377101898193, |
| "eval_runtime": 0.5435, |
| "eval_samples_per_second": 627.363, |
| "eval_steps_per_second": 79.11, |
| "step": 3835 |
| }, |
| { |
| "epoch": 5.084745762711864, |
| "grad_norm": 1.1964752674102783, |
| "learning_rate": 5.46284224250326e-05, |
| "loss": 3.1982, |
| "step": 3900 |
| }, |
| { |
| "epoch": 5.215123859191656, |
| "grad_norm": 1.1956731081008911, |
| "learning_rate": 5.3179776908590473e-05, |
| "loss": 3.1122, |
| "step": 4000 |
| }, |
| { |
| "epoch": 5.345501955671447, |
| "grad_norm": 1.7757279872894287, |
| "learning_rate": 5.1731131392148345e-05, |
| "loss": 3.1673, |
| "step": 4100 |
| }, |
| { |
| "epoch": 5.475880052151239, |
| "grad_norm": 1.4564849138259888, |
| "learning_rate": 5.028248587570622e-05, |
| "loss": 3.2143, |
| "step": 4200 |
| }, |
| { |
| "epoch": 5.60625814863103, |
| "grad_norm": 1.9355357885360718, |
| "learning_rate": 4.883384035926409e-05, |
| "loss": 3.1056, |
| "step": 4300 |
| }, |
| { |
| "epoch": 5.736636245110821, |
| "grad_norm": 1.1551567316055298, |
| "learning_rate": 4.738519484282196e-05, |
| "loss": 3.1004, |
| "step": 4400 |
| }, |
| { |
| "epoch": 5.867014341590613, |
| "grad_norm": 1.4927942752838135, |
| "learning_rate": 4.593654932637984e-05, |
| "loss": 3.0854, |
| "step": 4500 |
| }, |
| { |
| "epoch": 5.9973924380704045, |
| "grad_norm": 2.1001391410827637, |
| "learning_rate": 4.448790380993771e-05, |
| "loss": 3.1018, |
| "step": 4600 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 3.213176727294922, |
| "eval_runtime": 0.5419, |
| "eval_samples_per_second": 629.279, |
| "eval_steps_per_second": 79.352, |
| "step": 4602 |
| }, |
| { |
| "epoch": 6.127770534550195, |
| "grad_norm": 2.20295786857605, |
| "learning_rate": 4.303925829349558e-05, |
| "loss": 3.0204, |
| "step": 4700 |
| }, |
| { |
| "epoch": 6.258148631029987, |
| "grad_norm": 1.4948354959487915, |
| "learning_rate": 4.159061277705346e-05, |
| "loss": 3.089, |
| "step": 4800 |
| }, |
| { |
| "epoch": 6.388526727509778, |
| "grad_norm": 1.5617390871047974, |
| "learning_rate": 4.014196726061133e-05, |
| "loss": 3.0053, |
| "step": 4900 |
| }, |
| { |
| "epoch": 6.51890482398957, |
| "grad_norm": 1.6474759578704834, |
| "learning_rate": 3.86933217441692e-05, |
| "loss": 3.1227, |
| "step": 5000 |
| }, |
| { |
| "epoch": 6.6492829204693615, |
| "grad_norm": 1.5542720556259155, |
| "learning_rate": 3.724467622772708e-05, |
| "loss": 3.0354, |
| "step": 5100 |
| }, |
| { |
| "epoch": 6.779661016949152, |
| "grad_norm": 1.8775848150253296, |
| "learning_rate": 3.579603071128495e-05, |
| "loss": 3.0221, |
| "step": 5200 |
| }, |
| { |
| "epoch": 6.910039113428944, |
| "grad_norm": 1.8488330841064453, |
| "learning_rate": 3.434738519484282e-05, |
| "loss": 3.0161, |
| "step": 5300 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 3.2017745971679688, |
| "eval_runtime": 0.5395, |
| "eval_samples_per_second": 632.058, |
| "eval_steps_per_second": 79.702, |
| "step": 5369 |
| }, |
| { |
| "epoch": 7.040417209908735, |
| "grad_norm": 1.7003470659255981, |
| "learning_rate": 3.28987396784007e-05, |
| "loss": 3.0074, |
| "step": 5400 |
| }, |
| { |
| "epoch": 7.170795306388527, |
| "grad_norm": 1.5171571969985962, |
| "learning_rate": 3.145009416195857e-05, |
| "loss": 2.9606, |
| "step": 5500 |
| }, |
| { |
| "epoch": 7.301173402868318, |
| "grad_norm": 2.7157018184661865, |
| "learning_rate": 3.0001448645516445e-05, |
| "loss": 2.9081, |
| "step": 5600 |
| }, |
| { |
| "epoch": 7.431551499348109, |
| "grad_norm": 1.5487765073776245, |
| "learning_rate": 2.855280312907432e-05, |
| "loss": 3.0004, |
| "step": 5700 |
| }, |
| { |
| "epoch": 7.561929595827901, |
| "grad_norm": 3.577585220336914, |
| "learning_rate": 2.710415761263219e-05, |
| "loss": 2.9776, |
| "step": 5800 |
| }, |
| { |
| "epoch": 7.6923076923076925, |
| "grad_norm": 1.4749557971954346, |
| "learning_rate": 2.5655512096190066e-05, |
| "loss": 2.8426, |
| "step": 5900 |
| }, |
| { |
| "epoch": 7.822685788787483, |
| "grad_norm": 1.6603304147720337, |
| "learning_rate": 2.4206866579747937e-05, |
| "loss": 3.068, |
| "step": 6000 |
| }, |
| { |
| "epoch": 7.953063885267275, |
| "grad_norm": 1.6099046468734741, |
| "learning_rate": 2.275822106330581e-05, |
| "loss": 3.0699, |
| "step": 6100 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 3.1977951526641846, |
| "eval_runtime": 0.5473, |
| "eval_samples_per_second": 623.018, |
| "eval_steps_per_second": 78.562, |
| "step": 6136 |
| }, |
| { |
| "epoch": 8.083441981747066, |
| "grad_norm": 1.4448668956756592, |
| "learning_rate": 2.1309575546863683e-05, |
| "loss": 3.0584, |
| "step": 6200 |
| }, |
| { |
| "epoch": 8.213820078226858, |
| "grad_norm": 1.8011598587036133, |
| "learning_rate": 1.9860930030421558e-05, |
| "loss": 2.924, |
| "step": 6300 |
| }, |
| { |
| "epoch": 8.34419817470665, |
| "grad_norm": 1.7278883457183838, |
| "learning_rate": 1.841228451397943e-05, |
| "loss": 2.9542, |
| "step": 6400 |
| }, |
| { |
| "epoch": 8.474576271186441, |
| "grad_norm": 1.625506043434143, |
| "learning_rate": 1.6963638997537304e-05, |
| "loss": 2.8727, |
| "step": 6500 |
| }, |
| { |
| "epoch": 8.604954367666233, |
| "grad_norm": 1.808686375617981, |
| "learning_rate": 1.551499348109518e-05, |
| "loss": 2.9608, |
| "step": 6600 |
| }, |
| { |
| "epoch": 8.735332464146023, |
| "grad_norm": 2.093956708908081, |
| "learning_rate": 1.4066347964653052e-05, |
| "loss": 2.9101, |
| "step": 6700 |
| }, |
| { |
| "epoch": 8.865710560625814, |
| "grad_norm": 2.1681623458862305, |
| "learning_rate": 1.2617702448210925e-05, |
| "loss": 3.0384, |
| "step": 6800 |
| }, |
| { |
| "epoch": 8.996088657105606, |
| "grad_norm": 1.7489618062973022, |
| "learning_rate": 1.1169056931768796e-05, |
| "loss": 2.8755, |
| "step": 6900 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 3.1901769638061523, |
| "eval_runtime": 0.554, |
| "eval_samples_per_second": 615.522, |
| "eval_steps_per_second": 77.617, |
| "step": 6903 |
| }, |
| { |
| "epoch": 9.126466753585397, |
| "grad_norm": 1.2608046531677246, |
| "learning_rate": 9.72041141532667e-06, |
| "loss": 2.8802, |
| "step": 7000 |
| }, |
| { |
| "epoch": 9.256844850065189, |
| "grad_norm": 2.597111940383911, |
| "learning_rate": 8.271765898884544e-06, |
| "loss": 2.8229, |
| "step": 7100 |
| }, |
| { |
| "epoch": 9.38722294654498, |
| "grad_norm": 1.8257884979248047, |
| "learning_rate": 6.823120382442416e-06, |
| "loss": 2.958, |
| "step": 7200 |
| }, |
| { |
| "epoch": 9.517601043024772, |
| "grad_norm": 2.310960292816162, |
| "learning_rate": 5.37447486600029e-06, |
| "loss": 2.807, |
| "step": 7300 |
| }, |
| { |
| "epoch": 9.647979139504564, |
| "grad_norm": 1.5776140689849854, |
| "learning_rate": 3.925829349558164e-06, |
| "loss": 2.9398, |
| "step": 7400 |
| }, |
| { |
| "epoch": 9.778357235984355, |
| "grad_norm": 1.8965390920639038, |
| "learning_rate": 2.4771838331160366e-06, |
| "loss": 2.842, |
| "step": 7500 |
| }, |
| { |
| "epoch": 9.908735332464147, |
| "grad_norm": 1.6865416765213013, |
| "learning_rate": 1.0285383166739098e-06, |
| "loss": 2.8051, |
| "step": 7600 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 3.1988234519958496, |
| "eval_runtime": 0.5524, |
| "eval_samples_per_second": 617.283, |
| "eval_steps_per_second": 77.839, |
| "step": 7670 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 7670, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1023067594874880.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|