{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 7670,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.1303780964797914,
"grad_norm": 5.5828070640563965,
"learning_rate": 1.290743155149935e-05,
"loss": 7.7625,
"step": 100
},
{
"epoch": 0.2607561929595828,
"grad_norm": 2.0663061141967773,
"learning_rate": 2.5945241199478487e-05,
"loss": 5.9575,
"step": 200
},
{
"epoch": 0.39113428943937417,
"grad_norm": 0.9179163575172424,
"learning_rate": 3.898305084745763e-05,
"loss": 5.0578,
"step": 300
},
{
"epoch": 0.5215123859191656,
"grad_norm": 0.9352315068244934,
"learning_rate": 5.202086049543677e-05,
"loss": 4.3731,
"step": 400
},
{
"epoch": 0.651890482398957,
"grad_norm": 1.5910133123397827,
"learning_rate": 6.505867014341591e-05,
"loss": 4.2998,
"step": 500
},
{
"epoch": 0.7822685788787483,
"grad_norm": 1.2363195419311523,
"learning_rate": 7.809647979139506e-05,
"loss": 4.2803,
"step": 600
},
{
"epoch": 0.9126466753585397,
"grad_norm": 1.3712793588638306,
"learning_rate": 9.113428943937419e-05,
"loss": 4.1966,
"step": 700
},
{
"epoch": 1.0,
"eval_loss": 3.70353627204895,
"eval_runtime": 0.5442,
"eval_samples_per_second": 626.647,
"eval_steps_per_second": 79.02,
"step": 767
},
{
"epoch": 1.0430247718383312,
"grad_norm": 1.1920268535614014,
"learning_rate": 9.953643343473853e-05,
"loss": 4.0134,
"step": 800
},
{
"epoch": 1.1734028683181226,
"grad_norm": 1.2710912227630615,
"learning_rate": 9.808778791829639e-05,
"loss": 3.9779,
"step": 900
},
{
"epoch": 1.303780964797914,
"grad_norm": 1.2132948637008667,
"learning_rate": 9.663914240185426e-05,
"loss": 3.9291,
"step": 1000
},
{
"epoch": 1.4341590612777053,
"grad_norm": 2.312605857849121,
"learning_rate": 9.519049688541214e-05,
"loss": 3.9068,
"step": 1100
},
{
"epoch": 1.5645371577574967,
"grad_norm": 1.2230041027069092,
"learning_rate": 9.374185136897002e-05,
"loss": 3.7321,
"step": 1200
},
{
"epoch": 1.694915254237288,
"grad_norm": 2.18503999710083,
"learning_rate": 9.229320585252789e-05,
"loss": 3.861,
"step": 1300
},
{
"epoch": 1.8252933507170797,
"grad_norm": 1.5787285566329956,
"learning_rate": 9.084456033608576e-05,
"loss": 3.7423,
"step": 1400
},
{
"epoch": 1.9556714471968708,
"grad_norm": 1.13662588596344,
"learning_rate": 8.939591481964363e-05,
"loss": 3.5921,
"step": 1500
},
{
"epoch": 2.0,
"eval_loss": 3.4353556632995605,
"eval_runtime": 0.5399,
"eval_samples_per_second": 631.585,
"eval_steps_per_second": 79.643,
"step": 1534
},
{
"epoch": 2.0860495436766624,
"grad_norm": 1.2315106391906738,
"learning_rate": 8.79472693032015e-05,
"loss": 3.6158,
"step": 1600
},
{
"epoch": 2.2164276401564535,
"grad_norm": 1.7532989978790283,
"learning_rate": 8.649862378675939e-05,
"loss": 3.5386,
"step": 1700
},
{
"epoch": 2.346805736636245,
"grad_norm": 1.2041728496551514,
"learning_rate": 8.504997827031726e-05,
"loss": 3.6382,
"step": 1800
},
{
"epoch": 2.4771838331160367,
"grad_norm": 1.6889125108718872,
"learning_rate": 8.360133275387513e-05,
"loss": 3.7501,
"step": 1900
},
{
"epoch": 2.607561929595828,
"grad_norm": 1.140479564666748,
"learning_rate": 8.2152687237433e-05,
"loss": 3.5677,
"step": 2000
},
{
"epoch": 2.737940026075619,
"grad_norm": 1.1325074434280396,
"learning_rate": 8.070404172099087e-05,
"loss": 3.6123,
"step": 2100
},
{
"epoch": 2.8683181225554106,
"grad_norm": 2.513603687286377,
"learning_rate": 7.925539620454874e-05,
"loss": 3.4136,
"step": 2200
},
{
"epoch": 2.9986962190352022,
"grad_norm": 1.8777894973754883,
"learning_rate": 7.780675068810663e-05,
"loss": 3.5583,
"step": 2300
},
{
"epoch": 3.0,
"eval_loss": 3.3344805240631104,
"eval_runtime": 0.5422,
"eval_samples_per_second": 628.862,
"eval_steps_per_second": 79.299,
"step": 2301
},
{
"epoch": 3.1290743155149934,
"grad_norm": 1.243633508682251,
"learning_rate": 7.63581051716645e-05,
"loss": 3.3331,
"step": 2400
},
{
"epoch": 3.259452411994785,
"grad_norm": 1.1011683940887451,
"learning_rate": 7.490945965522237e-05,
"loss": 3.3973,
"step": 2500
},
{
"epoch": 3.389830508474576,
"grad_norm": 1.1808488368988037,
"learning_rate": 7.346081413878024e-05,
"loss": 3.5169,
"step": 2600
},
{
"epoch": 3.5202086049543677,
"grad_norm": 1.0594677925109863,
"learning_rate": 7.201216862233811e-05,
"loss": 3.3933,
"step": 2700
},
{
"epoch": 3.6505867014341593,
"grad_norm": 1.4269371032714844,
"learning_rate": 7.056352310589598e-05,
"loss": 3.2929,
"step": 2800
},
{
"epoch": 3.7809647979139505,
"grad_norm": 1.1755293607711792,
"learning_rate": 6.911487758945387e-05,
"loss": 3.3259,
"step": 2900
},
{
"epoch": 3.9113428943937416,
"grad_norm": 1.0655115842819214,
"learning_rate": 6.766623207301174e-05,
"loss": 3.4441,
"step": 3000
},
{
"epoch": 4.0,
"eval_loss": 3.2687411308288574,
"eval_runtime": 0.5471,
"eval_samples_per_second": 623.262,
"eval_steps_per_second": 78.593,
"step": 3068
},
{
"epoch": 4.041720990873533,
"grad_norm": 1.196297287940979,
"learning_rate": 6.621758655656961e-05,
"loss": 3.2682,
"step": 3100
},
{
"epoch": 4.172099087353325,
"grad_norm": 3.8496177196502686,
"learning_rate": 6.476894104012748e-05,
"loss": 3.3519,
"step": 3200
},
{
"epoch": 4.302477183833116,
"grad_norm": 1.2503979206085205,
"learning_rate": 6.332029552368535e-05,
"loss": 3.2477,
"step": 3300
},
{
"epoch": 4.432855280312907,
"grad_norm": 2.556914806365967,
"learning_rate": 6.187165000724323e-05,
"loss": 3.3618,
"step": 3400
},
{
"epoch": 4.563233376792699,
"grad_norm": 1.4650604724884033,
"learning_rate": 6.04230044908011e-05,
"loss": 3.3299,
"step": 3500
},
{
"epoch": 4.69361147327249,
"grad_norm": 1.1140531301498413,
"learning_rate": 5.897435897435898e-05,
"loss": 3.2246,
"step": 3600
},
{
"epoch": 4.823989569752282,
"grad_norm": 2.14631724357605,
"learning_rate": 5.752571345791685e-05,
"loss": 3.2511,
"step": 3700
},
{
"epoch": 4.9543676662320735,
"grad_norm": 1.8546875715255737,
"learning_rate": 5.6077067941474724e-05,
"loss": 3.2367,
"step": 3800
},
{
"epoch": 5.0,
"eval_loss": 3.2330377101898193,
"eval_runtime": 0.5435,
"eval_samples_per_second": 627.363,
"eval_steps_per_second": 79.11,
"step": 3835
},
{
"epoch": 5.084745762711864,
"grad_norm": 1.1964752674102783,
"learning_rate": 5.46284224250326e-05,
"loss": 3.1982,
"step": 3900
},
{
"epoch": 5.215123859191656,
"grad_norm": 1.1956731081008911,
"learning_rate": 5.3179776908590473e-05,
"loss": 3.1122,
"step": 4000
},
{
"epoch": 5.345501955671447,
"grad_norm": 1.7757279872894287,
"learning_rate": 5.1731131392148345e-05,
"loss": 3.1673,
"step": 4100
},
{
"epoch": 5.475880052151239,
"grad_norm": 1.4564849138259888,
"learning_rate": 5.028248587570622e-05,
"loss": 3.2143,
"step": 4200
},
{
"epoch": 5.60625814863103,
"grad_norm": 1.9355357885360718,
"learning_rate": 4.883384035926409e-05,
"loss": 3.1056,
"step": 4300
},
{
"epoch": 5.736636245110821,
"grad_norm": 1.1551567316055298,
"learning_rate": 4.738519484282196e-05,
"loss": 3.1004,
"step": 4400
},
{
"epoch": 5.867014341590613,
"grad_norm": 1.4927942752838135,
"learning_rate": 4.593654932637984e-05,
"loss": 3.0854,
"step": 4500
},
{
"epoch": 5.9973924380704045,
"grad_norm": 2.1001391410827637,
"learning_rate": 4.448790380993771e-05,
"loss": 3.1018,
"step": 4600
},
{
"epoch": 6.0,
"eval_loss": 3.213176727294922,
"eval_runtime": 0.5419,
"eval_samples_per_second": 629.279,
"eval_steps_per_second": 79.352,
"step": 4602
},
{
"epoch": 6.127770534550195,
"grad_norm": 2.20295786857605,
"learning_rate": 4.303925829349558e-05,
"loss": 3.0204,
"step": 4700
},
{
"epoch": 6.258148631029987,
"grad_norm": 1.4948354959487915,
"learning_rate": 4.159061277705346e-05,
"loss": 3.089,
"step": 4800
},
{
"epoch": 6.388526727509778,
"grad_norm": 1.5617390871047974,
"learning_rate": 4.014196726061133e-05,
"loss": 3.0053,
"step": 4900
},
{
"epoch": 6.51890482398957,
"grad_norm": 1.6474759578704834,
"learning_rate": 3.86933217441692e-05,
"loss": 3.1227,
"step": 5000
},
{
"epoch": 6.6492829204693615,
"grad_norm": 1.5542720556259155,
"learning_rate": 3.724467622772708e-05,
"loss": 3.0354,
"step": 5100
},
{
"epoch": 6.779661016949152,
"grad_norm": 1.8775848150253296,
"learning_rate": 3.579603071128495e-05,
"loss": 3.0221,
"step": 5200
},
{
"epoch": 6.910039113428944,
"grad_norm": 1.8488330841064453,
"learning_rate": 3.434738519484282e-05,
"loss": 3.0161,
"step": 5300
},
{
"epoch": 7.0,
"eval_loss": 3.2017745971679688,
"eval_runtime": 0.5395,
"eval_samples_per_second": 632.058,
"eval_steps_per_second": 79.702,
"step": 5369
},
{
"epoch": 7.040417209908735,
"grad_norm": 1.7003470659255981,
"learning_rate": 3.28987396784007e-05,
"loss": 3.0074,
"step": 5400
},
{
"epoch": 7.170795306388527,
"grad_norm": 1.5171571969985962,
"learning_rate": 3.145009416195857e-05,
"loss": 2.9606,
"step": 5500
},
{
"epoch": 7.301173402868318,
"grad_norm": 2.7157018184661865,
"learning_rate": 3.0001448645516445e-05,
"loss": 2.9081,
"step": 5600
},
{
"epoch": 7.431551499348109,
"grad_norm": 1.5487765073776245,
"learning_rate": 2.855280312907432e-05,
"loss": 3.0004,
"step": 5700
},
{
"epoch": 7.561929595827901,
"grad_norm": 3.577585220336914,
"learning_rate": 2.710415761263219e-05,
"loss": 2.9776,
"step": 5800
},
{
"epoch": 7.6923076923076925,
"grad_norm": 1.4749557971954346,
"learning_rate": 2.5655512096190066e-05,
"loss": 2.8426,
"step": 5900
},
{
"epoch": 7.822685788787483,
"grad_norm": 1.6603304147720337,
"learning_rate": 2.4206866579747937e-05,
"loss": 3.068,
"step": 6000
},
{
"epoch": 7.953063885267275,
"grad_norm": 1.6099046468734741,
"learning_rate": 2.275822106330581e-05,
"loss": 3.0699,
"step": 6100
},
{
"epoch": 8.0,
"eval_loss": 3.1977951526641846,
"eval_runtime": 0.5473,
"eval_samples_per_second": 623.018,
"eval_steps_per_second": 78.562,
"step": 6136
},
{
"epoch": 8.083441981747066,
"grad_norm": 1.4448668956756592,
"learning_rate": 2.1309575546863683e-05,
"loss": 3.0584,
"step": 6200
},
{
"epoch": 8.213820078226858,
"grad_norm": 1.8011598587036133,
"learning_rate": 1.9860930030421558e-05,
"loss": 2.924,
"step": 6300
},
{
"epoch": 8.34419817470665,
"grad_norm": 1.7278883457183838,
"learning_rate": 1.841228451397943e-05,
"loss": 2.9542,
"step": 6400
},
{
"epoch": 8.474576271186441,
"grad_norm": 1.625506043434143,
"learning_rate": 1.6963638997537304e-05,
"loss": 2.8727,
"step": 6500
},
{
"epoch": 8.604954367666233,
"grad_norm": 1.808686375617981,
"learning_rate": 1.551499348109518e-05,
"loss": 2.9608,
"step": 6600
},
{
"epoch": 8.735332464146023,
"grad_norm": 2.093956708908081,
"learning_rate": 1.4066347964653052e-05,
"loss": 2.9101,
"step": 6700
},
{
"epoch": 8.865710560625814,
"grad_norm": 2.1681623458862305,
"learning_rate": 1.2617702448210925e-05,
"loss": 3.0384,
"step": 6800
},
{
"epoch": 8.996088657105606,
"grad_norm": 1.7489618062973022,
"learning_rate": 1.1169056931768796e-05,
"loss": 2.8755,
"step": 6900
},
{
"epoch": 9.0,
"eval_loss": 3.1901769638061523,
"eval_runtime": 0.554,
"eval_samples_per_second": 615.522,
"eval_steps_per_second": 77.617,
"step": 6903
},
{
"epoch": 9.126466753585397,
"grad_norm": 1.2608046531677246,
"learning_rate": 9.72041141532667e-06,
"loss": 2.8802,
"step": 7000
},
{
"epoch": 9.256844850065189,
"grad_norm": 2.597111940383911,
"learning_rate": 8.271765898884544e-06,
"loss": 2.8229,
"step": 7100
},
{
"epoch": 9.38722294654498,
"grad_norm": 1.8257884979248047,
"learning_rate": 6.823120382442416e-06,
"loss": 2.958,
"step": 7200
},
{
"epoch": 9.517601043024772,
"grad_norm": 2.310960292816162,
"learning_rate": 5.37447486600029e-06,
"loss": 2.807,
"step": 7300
},
{
"epoch": 9.647979139504564,
"grad_norm": 1.5776140689849854,
"learning_rate": 3.925829349558164e-06,
"loss": 2.9398,
"step": 7400
},
{
"epoch": 9.778357235984355,
"grad_norm": 1.8965390920639038,
"learning_rate": 2.4771838331160366e-06,
"loss": 2.842,
"step": 7500
},
{
"epoch": 9.908735332464147,
"grad_norm": 1.6865416765213013,
"learning_rate": 1.0285383166739098e-06,
"loss": 2.8051,
"step": 7600
},
{
"epoch": 10.0,
"eval_loss": 3.1988234519958496,
"eval_runtime": 0.5524,
"eval_samples_per_second": 617.283,
"eval_steps_per_second": 77.839,
"step": 7670
}
],
"logging_steps": 100,
"max_steps": 7670,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1023067594874880.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}