Buckets:

Fizzarolli's picture
download
raw
30.4 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3505154639175259,
"eval_steps": 500,
"global_step": 66,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020618556701030927,
"grad_norm": 2.13191556930542,
"learning_rate": 0.0,
"loss": 0.886581540107727,
"memory/device_reserved (GiB)": 108.72,
"memory/max_active (GiB)": 107.91,
"memory/max_allocated (GiB)": 107.91,
"ppl": 2.42682,
"step": 1,
"tokens/total": 262144,
"tokens/train_per_sec_per_gpu": 106.66,
"tokens/trainable": 222902
},
{
"epoch": 0.041237113402061855,
"grad_norm": 2.421574831008911,
"learning_rate": 2.499999936844688e-06,
"loss": 0.9712697267532349,
"memory/device_reserved (GiB)": 115.82,
"memory/max_active (GiB)": 114.88,
"memory/max_allocated (GiB)": 114.88,
"ppl": 2.6413,
"step": 2,
"tokens/total": 524288,
"tokens/train_per_sec_per_gpu": 106.37,
"tokens/trainable": 439627
},
{
"epoch": 0.061855670103092786,
"grad_norm": 2.0940639972686768,
"learning_rate": 4.999999873689376e-06,
"loss": 0.9303156137466431,
"memory/device_reserved (GiB)": 115.92,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.53531,
"step": 3,
"tokens/total": 786432,
"tokens/train_per_sec_per_gpu": 115.4,
"tokens/trainable": 662779
},
{
"epoch": 0.08247422680412371,
"grad_norm": 1.3038409948349,
"learning_rate": 7.499999810534064e-06,
"loss": 0.9193090796470642,
"memory/device_reserved (GiB)": 115.92,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 2.50756,
"step": 4,
"tokens/total": 1048576,
"tokens/train_per_sec_per_gpu": 103.0,
"tokens/trainable": 884034
},
{
"epoch": 0.10309278350515463,
"grad_norm": 1.296083688735962,
"learning_rate": 9.999999747378752e-06,
"loss": 0.9182717204093933,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.50496,
"step": 5,
"tokens/total": 1310720,
"tokens/train_per_sec_per_gpu": 114.25,
"tokens/trainable": 1106329
},
{
"epoch": 0.12371134020618557,
"grad_norm": 1.2671147584915161,
"learning_rate": 9.999999747378752e-06,
"loss": 0.8568199276924133,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.35566,
"step": 6,
"tokens/total": 1572864,
"tokens/train_per_sec_per_gpu": 80.33,
"tokens/trainable": 1323028
},
{
"epoch": 0.14432989690721648,
"grad_norm": 1.111126184463501,
"learning_rate": 9.999999747378752e-06,
"loss": 0.8340408205986023,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.99,
"memory/max_allocated (GiB)": 114.99,
"ppl": 2.3026,
"step": 7,
"tokens/total": 1835008,
"tokens/train_per_sec_per_gpu": 116.58,
"tokens/trainable": 1554666
},
{
"epoch": 0.16494845360824742,
"grad_norm": 0.9103517532348633,
"learning_rate": 9.999999747378752e-06,
"loss": 0.8362194895744324,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.88,
"memory/max_allocated (GiB)": 114.88,
"ppl": 2.30763,
"step": 8,
"tokens/total": 2097152,
"tokens/train_per_sec_per_gpu": 104.84,
"tokens/trainable": 1769387
},
{
"epoch": 0.18556701030927836,
"grad_norm": 0.7094802856445312,
"learning_rate": 9.999999747378752e-06,
"loss": 0.8117857575416565,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.25193,
"step": 9,
"tokens/total": 2359296,
"tokens/train_per_sec_per_gpu": 103.85,
"tokens/trainable": 1996059
},
{
"epoch": 0.20618556701030927,
"grad_norm": 0.6205381155014038,
"learning_rate": 9.999999747378752e-06,
"loss": 0.8286277055740356,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.29017,
"step": 10,
"tokens/total": 2621440,
"tokens/train_per_sec_per_gpu": 112.29,
"tokens/trainable": 2220426
},
{
"epoch": 0.2268041237113402,
"grad_norm": 0.5327035188674927,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7526419162750244,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 2.1226,
"step": 11,
"tokens/total": 2883584,
"tokens/train_per_sec_per_gpu": 110.97,
"tokens/trainable": 2448107
},
{
"epoch": 0.24742268041237114,
"grad_norm": 0.48926931619644165,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7427734136581421,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 2.10176,
"step": 12,
"tokens/total": 3145728,
"tokens/train_per_sec_per_gpu": 113.22,
"tokens/trainable": 2673714
},
{
"epoch": 0.26804123711340205,
"grad_norm": 0.5291987657546997,
"learning_rate": 9.999999747378752e-06,
"loss": 0.8142989873886108,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.77,
"memory/max_allocated (GiB)": 114.77,
"ppl": 2.25759,
"step": 13,
"tokens/total": 3407872,
"tokens/train_per_sec_per_gpu": 97.7,
"tokens/trainable": 2897892
},
{
"epoch": 0.28865979381443296,
"grad_norm": 0.4834632873535156,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7223080396652222,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.05918,
"step": 14,
"tokens/total": 3670016,
"tokens/train_per_sec_per_gpu": 106.63,
"tokens/trainable": 3118864
},
{
"epoch": 0.30927835051546393,
"grad_norm": 0.4591718018054962,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7303102016448975,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.07572,
"step": 15,
"tokens/total": 3932160,
"tokens/train_per_sec_per_gpu": 113.92,
"tokens/trainable": 3341060
},
{
"epoch": 0.32989690721649484,
"grad_norm": 0.43315476179122925,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7113853693008423,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 115.0,
"memory/max_allocated (GiB)": 115.0,
"ppl": 2.03681,
"step": 16,
"tokens/total": 4194304,
"tokens/train_per_sec_per_gpu": 96.81,
"tokens/trainable": 3571384
},
{
"epoch": 0.35051546391752575,
"grad_norm": 0.505426824092865,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7289949059486389,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.77,
"memory/max_allocated (GiB)": 114.77,
"ppl": 2.073,
"step": 17,
"tokens/total": 4456448,
"tokens/train_per_sec_per_gpu": 114.79,
"tokens/trainable": 3787852
},
{
"epoch": 0.3711340206185567,
"grad_norm": 0.44820281863212585,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7328810095787048,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.88,
"memory/max_allocated (GiB)": 114.88,
"ppl": 2.08107,
"step": 18,
"tokens/total": 4718592,
"tokens/train_per_sec_per_gpu": 92.87,
"tokens/trainable": 4009835
},
{
"epoch": 0.3917525773195876,
"grad_norm": 0.4282693862915039,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7495803236961365,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 2.11611,
"step": 19,
"tokens/total": 4980736,
"tokens/train_per_sec_per_gpu": 114.11,
"tokens/trainable": 4228885
},
{
"epoch": 0.41237113402061853,
"grad_norm": 0.48812466859817505,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7582152485847473,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.88,
"memory/max_allocated (GiB)": 114.88,
"ppl": 2.13446,
"step": 20,
"tokens/total": 5242880,
"tokens/train_per_sec_per_gpu": 115.11,
"tokens/trainable": 4452413
},
{
"epoch": 0.4329896907216495,
"grad_norm": 0.42291077971458435,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7270039319992065,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.06887,
"step": 21,
"tokens/total": 5505024,
"tokens/train_per_sec_per_gpu": 111.47,
"tokens/trainable": 4680388
},
{
"epoch": 0.4536082474226804,
"grad_norm": 0.5080971717834473,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7049570679664612,
"memory/device_reserved (GiB)": 115.96,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.02376,
"step": 22,
"tokens/total": 5767168,
"tokens/train_per_sec_per_gpu": 107.03,
"tokens/trainable": 4908673
},
{
"epoch": 0.4742268041237113,
"grad_norm": 0.42090919613838196,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7097281217575073,
"memory/device_reserved (GiB)": 115.98,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 2.03344,
"step": 23,
"tokens/total": 6029312,
"tokens/train_per_sec_per_gpu": 98.28,
"tokens/trainable": 5132935
},
{
"epoch": 0.4948453608247423,
"grad_norm": 0.4169037938117981,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7158472537994385,
"memory/device_reserved (GiB)": 115.98,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 2.04592,
"step": 24,
"tokens/total": 6291456,
"tokens/train_per_sec_per_gpu": 109.53,
"tokens/trainable": 5349445
},
{
"epoch": 0.5154639175257731,
"grad_norm": 0.4476269781589508,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7219246625900269,
"memory/device_reserved (GiB)": 115.98,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 2.05839,
"step": 25,
"tokens/total": 6553600,
"tokens/train_per_sec_per_gpu": 109.68,
"tokens/trainable": 5574620
},
{
"epoch": 0.5360824742268041,
"grad_norm": 0.4355926513671875,
"learning_rate": 9.999999747378752e-06,
"loss": 0.736411452293396,
"memory/device_reserved (GiB)": 115.98,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.08843,
"step": 26,
"tokens/total": 6815744,
"tokens/train_per_sec_per_gpu": 94.06,
"tokens/trainable": 5776291
},
{
"epoch": 0.5567010309278351,
"grad_norm": 0.4793473184108734,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7394890785217285,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 2.09486,
"step": 27,
"tokens/total": 7077888,
"tokens/train_per_sec_per_gpu": 105.19,
"tokens/trainable": 6000941
},
{
"epoch": 0.5773195876288659,
"grad_norm": 0.4275985360145569,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6823574304580688,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.9,
"memory/max_allocated (GiB)": 114.9,
"ppl": 1.97854,
"step": 28,
"tokens/total": 7340032,
"tokens/train_per_sec_per_gpu": 100.54,
"tokens/trainable": 6218099
},
{
"epoch": 0.5979381443298969,
"grad_norm": 0.39370277523994446,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6903939247131348,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.9945,
"step": 29,
"tokens/total": 7602176,
"tokens/train_per_sec_per_gpu": 108.88,
"tokens/trainable": 6450306
},
{
"epoch": 0.6185567010309279,
"grad_norm": 0.399675577878952,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7078354358673096,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 2.02959,
"step": 30,
"tokens/total": 7864320,
"tokens/train_per_sec_per_gpu": 115.99,
"tokens/trainable": 6682454
},
{
"epoch": 0.6391752577319587,
"grad_norm": 0.4058721661567688,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7107478976249695,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 2.03551,
"step": 31,
"tokens/total": 8126464,
"tokens/train_per_sec_per_gpu": 113.24,
"tokens/trainable": 6908540
},
{
"epoch": 0.6597938144329897,
"grad_norm": 0.38097304105758667,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6587168574333191,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.93231,
"step": 32,
"tokens/total": 8388608,
"tokens/train_per_sec_per_gpu": 96.59,
"tokens/trainable": 7136710
},
{
"epoch": 0.6804123711340206,
"grad_norm": 0.4215092957019806,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7302130460739136,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.07552,
"step": 33,
"tokens/total": 8650752,
"tokens/train_per_sec_per_gpu": 94.87,
"tokens/trainable": 7357757
},
{
"epoch": 0.7010309278350515,
"grad_norm": 0.419493705034256,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6523993015289307,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.92014,
"step": 34,
"tokens/total": 8912896,
"tokens/train_per_sec_per_gpu": 94.61,
"tokens/trainable": 7585151
},
{
"epoch": 0.7216494845360825,
"grad_norm": 0.4168092608451843,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7259938716888428,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 2.06678,
"step": 35,
"tokens/total": 9175040,
"tokens/train_per_sec_per_gpu": 101.0,
"tokens/trainable": 7807084
},
{
"epoch": 0.7422680412371134,
"grad_norm": 0.4247998297214508,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7215479612350464,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.05762,
"step": 36,
"tokens/total": 9437184,
"tokens/train_per_sec_per_gpu": 99.14,
"tokens/trainable": 8020010
},
{
"epoch": 0.7628865979381443,
"grad_norm": 0.42145678400993347,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7196784615516663,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.9,
"memory/max_allocated (GiB)": 114.9,
"ppl": 2.05377,
"step": 37,
"tokens/total": 9699328,
"tokens/train_per_sec_per_gpu": 98.31,
"tokens/trainable": 8237657
},
{
"epoch": 0.7835051546391752,
"grad_norm": 0.5614020228385925,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6845019459724426,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 1.98278,
"step": 38,
"tokens/total": 9961472,
"tokens/train_per_sec_per_gpu": 109.57,
"tokens/trainable": 8441722
},
{
"epoch": 0.8041237113402062,
"grad_norm": 0.40829339623451233,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7005173563957214,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.9,
"memory/max_allocated (GiB)": 114.9,
"ppl": 2.01479,
"step": 39,
"tokens/total": 10223616,
"tokens/train_per_sec_per_gpu": 106.43,
"tokens/trainable": 8667693
},
{
"epoch": 0.8247422680412371,
"grad_norm": 0.41540759801864624,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6717456579208374,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.9,
"memory/max_allocated (GiB)": 114.9,
"ppl": 1.95765,
"step": 40,
"tokens/total": 10485760,
"tokens/train_per_sec_per_gpu": 114.21,
"tokens/trainable": 8886348
},
{
"epoch": 0.845360824742268,
"grad_norm": 0.4369148910045624,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7158852815628052,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.76,
"memory/max_allocated (GiB)": 114.76,
"ppl": 2.046,
"step": 41,
"tokens/total": 10747904,
"tokens/train_per_sec_per_gpu": 104.71,
"tokens/trainable": 9105785
},
{
"epoch": 0.865979381443299,
"grad_norm": 0.41476163268089294,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6955273747444153,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.00477,
"step": 42,
"tokens/total": 11010048,
"tokens/train_per_sec_per_gpu": 88.0,
"tokens/trainable": 9326223
},
{
"epoch": 0.8865979381443299,
"grad_norm": 0.4271896481513977,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7322195172309875,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 2.07969,
"step": 43,
"tokens/total": 11272192,
"tokens/train_per_sec_per_gpu": 113.13,
"tokens/trainable": 9548339
},
{
"epoch": 0.9072164948453608,
"grad_norm": 0.4813380539417267,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6868515014648438,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.98745,
"step": 44,
"tokens/total": 11534336,
"tokens/train_per_sec_per_gpu": 90.58,
"tokens/trainable": 9766004
},
{
"epoch": 0.9278350515463918,
"grad_norm": 0.42919376492500305,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6917351484298706,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 1.99718,
"step": 45,
"tokens/total": 11796480,
"tokens/train_per_sec_per_gpu": 115.72,
"tokens/trainable": 9990528
},
{
"epoch": 0.9484536082474226,
"grad_norm": 0.409891813993454,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6957749724388123,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.00526,
"step": 46,
"tokens/total": 12058624,
"tokens/train_per_sec_per_gpu": 102.04,
"tokens/trainable": 10215266
},
{
"epoch": 0.9690721649484536,
"grad_norm": 0.4291461110115051,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7449676990509033,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 2.10637,
"step": 47,
"tokens/total": 12320768,
"tokens/train_per_sec_per_gpu": 111.94,
"tokens/trainable": 10433241
},
{
"epoch": 0.9896907216494846,
"grad_norm": 0.5046327710151672,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7033790946006775,
"memory/device_reserved (GiB)": 115.99,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 2.02057,
"step": 48,
"tokens/total": 12582912,
"tokens/train_per_sec_per_gpu": 111.07,
"tokens/trainable": 10653553
},
{
"epoch": 1.0,
"grad_norm": 0.8141054511070251,
"learning_rate": 9.999999747378752e-06,
"loss": 0.7225552797317505,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.0,
"memory/max_allocated (GiB)": 115.0,
"ppl": 2.05969,
"step": 49,
"tokens/total": 12705792,
"tokens/train_per_sec_per_gpu": 88.6,
"tokens/trainable": 10742066
},
{
"epoch": 1.0206185567010309,
"grad_norm": 0.5047632455825806,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6459757685661316,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 1.90785,
"step": 50,
"tokens/total": 12967936,
"tokens/train_per_sec_per_gpu": 102.43,
"tokens/trainable": 10957216
},
{
"epoch": 1.041237113402062,
"grad_norm": 0.49363699555397034,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6044580340385437,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.83026,
"step": 51,
"tokens/total": 13230080,
"tokens/train_per_sec_per_gpu": 103.01,
"tokens/trainable": 11168351
},
{
"epoch": 1.0618556701030928,
"grad_norm": 0.4461610019207001,
"learning_rate": 9.999999747378752e-06,
"loss": 0.620111346244812,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.85914,
"step": 52,
"tokens/total": 13492224,
"tokens/train_per_sec_per_gpu": 115.62,
"tokens/trainable": 11391383
},
{
"epoch": 1.0824742268041236,
"grad_norm": 0.40221890807151794,
"learning_rate": 9.999999747378752e-06,
"loss": 0.5692113041877747,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 1.76687,
"step": 53,
"tokens/total": 13754368,
"tokens/train_per_sec_per_gpu": 117.49,
"tokens/trainable": 11620324
},
{
"epoch": 1.1030927835051547,
"grad_norm": 0.4584852457046509,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6176695823669434,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 1.8546,
"step": 54,
"tokens/total": 14016512,
"tokens/train_per_sec_per_gpu": 93.72,
"tokens/trainable": 11838620
},
{
"epoch": 1.1237113402061856,
"grad_norm": 0.48267844319343567,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6084916591644287,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 1.83766,
"step": 55,
"tokens/total": 14278656,
"tokens/train_per_sec_per_gpu": 81.91,
"tokens/trainable": 12051077
},
{
"epoch": 1.1443298969072164,
"grad_norm": 0.4616500437259674,
"learning_rate": 9.999999747378752e-06,
"loss": 0.5891711711883545,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.80249,
"step": 56,
"tokens/total": 14540800,
"tokens/train_per_sec_per_gpu": 117.34,
"tokens/trainable": 12280103
},
{
"epoch": 1.1649484536082475,
"grad_norm": 0.5742337107658386,
"learning_rate": 9.999999747378752e-06,
"loss": 0.601008951663971,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.82396,
"step": 57,
"tokens/total": 14802944,
"tokens/train_per_sec_per_gpu": 108.85,
"tokens/trainable": 12497669
},
{
"epoch": 1.1855670103092784,
"grad_norm": 0.5385653376579285,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6246358156204224,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 1.86757,
"step": 58,
"tokens/total": 15065088,
"tokens/train_per_sec_per_gpu": 105.9,
"tokens/trainable": 12722771
},
{
"epoch": 1.2061855670103092,
"grad_norm": 0.45170825719833374,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6120405197143555,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 1.84419,
"step": 59,
"tokens/total": 15327232,
"tokens/train_per_sec_per_gpu": 100.79,
"tokens/trainable": 12952093
},
{
"epoch": 1.2268041237113403,
"grad_norm": 0.45874664187431335,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6256142854690552,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 1.86939,
"step": 60,
"tokens/total": 15589376,
"tokens/train_per_sec_per_gpu": 114.58,
"tokens/trainable": 13177393
},
{
"epoch": 1.2474226804123711,
"grad_norm": 0.42568033933639526,
"learning_rate": 9.999999747378752e-06,
"loss": 0.546139121055603,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.72657,
"step": 61,
"tokens/total": 15851520,
"tokens/train_per_sec_per_gpu": 107.76,
"tokens/trainable": 13402107
},
{
"epoch": 1.268041237113402,
"grad_norm": 0.4116363525390625,
"learning_rate": 9.999999747378752e-06,
"loss": 0.5737490653991699,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 114.9,
"memory/max_allocated (GiB)": 114.9,
"ppl": 1.77491,
"step": 62,
"tokens/total": 16113664,
"tokens/train_per_sec_per_gpu": 85.83,
"tokens/trainable": 13620089
},
{
"epoch": 1.2886597938144329,
"grad_norm": 0.6544530987739563,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6034448146820068,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.02,
"memory/max_allocated (GiB)": 115.02,
"ppl": 1.82841,
"step": 63,
"tokens/total": 16375808,
"tokens/train_per_sec_per_gpu": 112.67,
"tokens/trainable": 13850758
},
{
"epoch": 1.309278350515464,
"grad_norm": 0.4090299904346466,
"learning_rate": 9.999999747378752e-06,
"loss": 0.5996339321136475,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.82145,
"step": 64,
"tokens/total": 16637952,
"tokens/train_per_sec_per_gpu": 117.01,
"tokens/trainable": 14081234
},
{
"epoch": 1.3298969072164948,
"grad_norm": 0.42677614092826843,
"learning_rate": 9.999999747378752e-06,
"loss": 0.6008721590042114,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 114.89,
"memory/max_allocated (GiB)": 114.89,
"ppl": 1.82371,
"step": 65,
"tokens/total": 16900096,
"tokens/train_per_sec_per_gpu": 111.99,
"tokens/trainable": 14313785
},
{
"epoch": 1.3505154639175259,
"grad_norm": 0.4415728747844696,
"learning_rate": 9.999999747378752e-06,
"loss": 0.5510027408599854,
"memory/device_reserved (GiB)": 116.01,
"memory/max_active (GiB)": 115.01,
"memory/max_allocated (GiB)": 115.01,
"ppl": 1.73499,
"step": 66,
"tokens/total": 17162240,
"tokens/train_per_sec_per_gpu": 101.11,
"tokens/trainable": 14531975
}
],
"logging_steps": 1,
"max_steps": 96,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 6,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.9484107215221555e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}

Xet Storage Details

Size:
30.4 kB
·
Xet hash:
2d21d148f5e7db16e0f1ca96c1e29466faf5299147a68ca0962139bfc31918a7

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.