Buckets:
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.3505154639175259, | |
| "eval_steps": 500, | |
| "global_step": 66, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.020618556701030927, | |
| "grad_norm": 2.13191556930542, | |
| "learning_rate": 0.0, | |
| "loss": 0.886581540107727, | |
| "memory/device_reserved (GiB)": 108.72, | |
| "memory/max_active (GiB)": 107.91, | |
| "memory/max_allocated (GiB)": 107.91, | |
| "ppl": 2.42682, | |
| "step": 1, | |
| "tokens/total": 262144, | |
| "tokens/train_per_sec_per_gpu": 106.66, | |
| "tokens/trainable": 222902 | |
| }, | |
| { | |
| "epoch": 0.041237113402061855, | |
| "grad_norm": 2.421574831008911, | |
| "learning_rate": 2.499999936844688e-06, | |
| "loss": 0.9712697267532349, | |
| "memory/device_reserved (GiB)": 115.82, | |
| "memory/max_active (GiB)": 114.88, | |
| "memory/max_allocated (GiB)": 114.88, | |
| "ppl": 2.6413, | |
| "step": 2, | |
| "tokens/total": 524288, | |
| "tokens/train_per_sec_per_gpu": 106.37, | |
| "tokens/trainable": 439627 | |
| }, | |
| { | |
| "epoch": 0.061855670103092786, | |
| "grad_norm": 2.0940639972686768, | |
| "learning_rate": 4.999999873689376e-06, | |
| "loss": 0.9303156137466431, | |
| "memory/device_reserved (GiB)": 115.92, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.53531, | |
| "step": 3, | |
| "tokens/total": 786432, | |
| "tokens/train_per_sec_per_gpu": 115.4, | |
| "tokens/trainable": 662779 | |
| }, | |
| { | |
| "epoch": 0.08247422680412371, | |
| "grad_norm": 1.3038409948349, | |
| "learning_rate": 7.499999810534064e-06, | |
| "loss": 0.9193090796470642, | |
| "memory/device_reserved (GiB)": 115.92, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 2.50756, | |
| "step": 4, | |
| "tokens/total": 1048576, | |
| "tokens/train_per_sec_per_gpu": 103.0, | |
| "tokens/trainable": 884034 | |
| }, | |
| { | |
| "epoch": 0.10309278350515463, | |
| "grad_norm": 1.296083688735962, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.9182717204093933, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.50496, | |
| "step": 5, | |
| "tokens/total": 1310720, | |
| "tokens/train_per_sec_per_gpu": 114.25, | |
| "tokens/trainable": 1106329 | |
| }, | |
| { | |
| "epoch": 0.12371134020618557, | |
| "grad_norm": 1.2671147584915161, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.8568199276924133, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.35566, | |
| "step": 6, | |
| "tokens/total": 1572864, | |
| "tokens/train_per_sec_per_gpu": 80.33, | |
| "tokens/trainable": 1323028 | |
| }, | |
| { | |
| "epoch": 0.14432989690721648, | |
| "grad_norm": 1.111126184463501, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.8340408205986023, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.99, | |
| "memory/max_allocated (GiB)": 114.99, | |
| "ppl": 2.3026, | |
| "step": 7, | |
| "tokens/total": 1835008, | |
| "tokens/train_per_sec_per_gpu": 116.58, | |
| "tokens/trainable": 1554666 | |
| }, | |
| { | |
| "epoch": 0.16494845360824742, | |
| "grad_norm": 0.9103517532348633, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.8362194895744324, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.88, | |
| "memory/max_allocated (GiB)": 114.88, | |
| "ppl": 2.30763, | |
| "step": 8, | |
| "tokens/total": 2097152, | |
| "tokens/train_per_sec_per_gpu": 104.84, | |
| "tokens/trainable": 1769387 | |
| }, | |
| { | |
| "epoch": 0.18556701030927836, | |
| "grad_norm": 0.7094802856445312, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.8117857575416565, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.25193, | |
| "step": 9, | |
| "tokens/total": 2359296, | |
| "tokens/train_per_sec_per_gpu": 103.85, | |
| "tokens/trainable": 1996059 | |
| }, | |
| { | |
| "epoch": 0.20618556701030927, | |
| "grad_norm": 0.6205381155014038, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.8286277055740356, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.29017, | |
| "step": 10, | |
| "tokens/total": 2621440, | |
| "tokens/train_per_sec_per_gpu": 112.29, | |
| "tokens/trainable": 2220426 | |
| }, | |
| { | |
| "epoch": 0.2268041237113402, | |
| "grad_norm": 0.5327035188674927, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7526419162750244, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 2.1226, | |
| "step": 11, | |
| "tokens/total": 2883584, | |
| "tokens/train_per_sec_per_gpu": 110.97, | |
| "tokens/trainable": 2448107 | |
| }, | |
| { | |
| "epoch": 0.24742268041237114, | |
| "grad_norm": 0.48926931619644165, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7427734136581421, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 2.10176, | |
| "step": 12, | |
| "tokens/total": 3145728, | |
| "tokens/train_per_sec_per_gpu": 113.22, | |
| "tokens/trainable": 2673714 | |
| }, | |
| { | |
| "epoch": 0.26804123711340205, | |
| "grad_norm": 0.5291987657546997, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.8142989873886108, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.77, | |
| "memory/max_allocated (GiB)": 114.77, | |
| "ppl": 2.25759, | |
| "step": 13, | |
| "tokens/total": 3407872, | |
| "tokens/train_per_sec_per_gpu": 97.7, | |
| "tokens/trainable": 2897892 | |
| }, | |
| { | |
| "epoch": 0.28865979381443296, | |
| "grad_norm": 0.4834632873535156, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7223080396652222, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.05918, | |
| "step": 14, | |
| "tokens/total": 3670016, | |
| "tokens/train_per_sec_per_gpu": 106.63, | |
| "tokens/trainable": 3118864 | |
| }, | |
| { | |
| "epoch": 0.30927835051546393, | |
| "grad_norm": 0.4591718018054962, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7303102016448975, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.07572, | |
| "step": 15, | |
| "tokens/total": 3932160, | |
| "tokens/train_per_sec_per_gpu": 113.92, | |
| "tokens/trainable": 3341060 | |
| }, | |
| { | |
| "epoch": 0.32989690721649484, | |
| "grad_norm": 0.43315476179122925, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7113853693008423, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 115.0, | |
| "memory/max_allocated (GiB)": 115.0, | |
| "ppl": 2.03681, | |
| "step": 16, | |
| "tokens/total": 4194304, | |
| "tokens/train_per_sec_per_gpu": 96.81, | |
| "tokens/trainable": 3571384 | |
| }, | |
| { | |
| "epoch": 0.35051546391752575, | |
| "grad_norm": 0.505426824092865, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7289949059486389, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.77, | |
| "memory/max_allocated (GiB)": 114.77, | |
| "ppl": 2.073, | |
| "step": 17, | |
| "tokens/total": 4456448, | |
| "tokens/train_per_sec_per_gpu": 114.79, | |
| "tokens/trainable": 3787852 | |
| }, | |
| { | |
| "epoch": 0.3711340206185567, | |
| "grad_norm": 0.44820281863212585, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7328810095787048, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.88, | |
| "memory/max_allocated (GiB)": 114.88, | |
| "ppl": 2.08107, | |
| "step": 18, | |
| "tokens/total": 4718592, | |
| "tokens/train_per_sec_per_gpu": 92.87, | |
| "tokens/trainable": 4009835 | |
| }, | |
| { | |
| "epoch": 0.3917525773195876, | |
| "grad_norm": 0.4282693862915039, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7495803236961365, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 2.11611, | |
| "step": 19, | |
| "tokens/total": 4980736, | |
| "tokens/train_per_sec_per_gpu": 114.11, | |
| "tokens/trainable": 4228885 | |
| }, | |
| { | |
| "epoch": 0.41237113402061853, | |
| "grad_norm": 0.48812466859817505, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7582152485847473, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.88, | |
| "memory/max_allocated (GiB)": 114.88, | |
| "ppl": 2.13446, | |
| "step": 20, | |
| "tokens/total": 5242880, | |
| "tokens/train_per_sec_per_gpu": 115.11, | |
| "tokens/trainable": 4452413 | |
| }, | |
| { | |
| "epoch": 0.4329896907216495, | |
| "grad_norm": 0.42291077971458435, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7270039319992065, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.06887, | |
| "step": 21, | |
| "tokens/total": 5505024, | |
| "tokens/train_per_sec_per_gpu": 111.47, | |
| "tokens/trainable": 4680388 | |
| }, | |
| { | |
| "epoch": 0.4536082474226804, | |
| "grad_norm": 0.5080971717834473, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7049570679664612, | |
| "memory/device_reserved (GiB)": 115.96, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.02376, | |
| "step": 22, | |
| "tokens/total": 5767168, | |
| "tokens/train_per_sec_per_gpu": 107.03, | |
| "tokens/trainable": 4908673 | |
| }, | |
| { | |
| "epoch": 0.4742268041237113, | |
| "grad_norm": 0.42090919613838196, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7097281217575073, | |
| "memory/device_reserved (GiB)": 115.98, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 2.03344, | |
| "step": 23, | |
| "tokens/total": 6029312, | |
| "tokens/train_per_sec_per_gpu": 98.28, | |
| "tokens/trainable": 5132935 | |
| }, | |
| { | |
| "epoch": 0.4948453608247423, | |
| "grad_norm": 0.4169037938117981, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7158472537994385, | |
| "memory/device_reserved (GiB)": 115.98, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 2.04592, | |
| "step": 24, | |
| "tokens/total": 6291456, | |
| "tokens/train_per_sec_per_gpu": 109.53, | |
| "tokens/trainable": 5349445 | |
| }, | |
| { | |
| "epoch": 0.5154639175257731, | |
| "grad_norm": 0.4476269781589508, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7219246625900269, | |
| "memory/device_reserved (GiB)": 115.98, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 2.05839, | |
| "step": 25, | |
| "tokens/total": 6553600, | |
| "tokens/train_per_sec_per_gpu": 109.68, | |
| "tokens/trainable": 5574620 | |
| }, | |
| { | |
| "epoch": 0.5360824742268041, | |
| "grad_norm": 0.4355926513671875, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.736411452293396, | |
| "memory/device_reserved (GiB)": 115.98, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.08843, | |
| "step": 26, | |
| "tokens/total": 6815744, | |
| "tokens/train_per_sec_per_gpu": 94.06, | |
| "tokens/trainable": 5776291 | |
| }, | |
| { | |
| "epoch": 0.5567010309278351, | |
| "grad_norm": 0.4793473184108734, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7394890785217285, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 2.09486, | |
| "step": 27, | |
| "tokens/total": 7077888, | |
| "tokens/train_per_sec_per_gpu": 105.19, | |
| "tokens/trainable": 6000941 | |
| }, | |
| { | |
| "epoch": 0.5773195876288659, | |
| "grad_norm": 0.4275985360145569, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6823574304580688, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.9, | |
| "memory/max_allocated (GiB)": 114.9, | |
| "ppl": 1.97854, | |
| "step": 28, | |
| "tokens/total": 7340032, | |
| "tokens/train_per_sec_per_gpu": 100.54, | |
| "tokens/trainable": 6218099 | |
| }, | |
| { | |
| "epoch": 0.5979381443298969, | |
| "grad_norm": 0.39370277523994446, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6903939247131348, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.9945, | |
| "step": 29, | |
| "tokens/total": 7602176, | |
| "tokens/train_per_sec_per_gpu": 108.88, | |
| "tokens/trainable": 6450306 | |
| }, | |
| { | |
| "epoch": 0.6185567010309279, | |
| "grad_norm": 0.399675577878952, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7078354358673096, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 2.02959, | |
| "step": 30, | |
| "tokens/total": 7864320, | |
| "tokens/train_per_sec_per_gpu": 115.99, | |
| "tokens/trainable": 6682454 | |
| }, | |
| { | |
| "epoch": 0.6391752577319587, | |
| "grad_norm": 0.4058721661567688, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7107478976249695, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 2.03551, | |
| "step": 31, | |
| "tokens/total": 8126464, | |
| "tokens/train_per_sec_per_gpu": 113.24, | |
| "tokens/trainable": 6908540 | |
| }, | |
| { | |
| "epoch": 0.6597938144329897, | |
| "grad_norm": 0.38097304105758667, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6587168574333191, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.93231, | |
| "step": 32, | |
| "tokens/total": 8388608, | |
| "tokens/train_per_sec_per_gpu": 96.59, | |
| "tokens/trainable": 7136710 | |
| }, | |
| { | |
| "epoch": 0.6804123711340206, | |
| "grad_norm": 0.4215092957019806, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7302130460739136, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.07552, | |
| "step": 33, | |
| "tokens/total": 8650752, | |
| "tokens/train_per_sec_per_gpu": 94.87, | |
| "tokens/trainable": 7357757 | |
| }, | |
| { | |
| "epoch": 0.7010309278350515, | |
| "grad_norm": 0.419493705034256, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6523993015289307, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.92014, | |
| "step": 34, | |
| "tokens/total": 8912896, | |
| "tokens/train_per_sec_per_gpu": 94.61, | |
| "tokens/trainable": 7585151 | |
| }, | |
| { | |
| "epoch": 0.7216494845360825, | |
| "grad_norm": 0.4168092608451843, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7259938716888428, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 2.06678, | |
| "step": 35, | |
| "tokens/total": 9175040, | |
| "tokens/train_per_sec_per_gpu": 101.0, | |
| "tokens/trainable": 7807084 | |
| }, | |
| { | |
| "epoch": 0.7422680412371134, | |
| "grad_norm": 0.4247998297214508, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7215479612350464, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.05762, | |
| "step": 36, | |
| "tokens/total": 9437184, | |
| "tokens/train_per_sec_per_gpu": 99.14, | |
| "tokens/trainable": 8020010 | |
| }, | |
| { | |
| "epoch": 0.7628865979381443, | |
| "grad_norm": 0.42145678400993347, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7196784615516663, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.9, | |
| "memory/max_allocated (GiB)": 114.9, | |
| "ppl": 2.05377, | |
| "step": 37, | |
| "tokens/total": 9699328, | |
| "tokens/train_per_sec_per_gpu": 98.31, | |
| "tokens/trainable": 8237657 | |
| }, | |
| { | |
| "epoch": 0.7835051546391752, | |
| "grad_norm": 0.5614020228385925, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6845019459724426, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 1.98278, | |
| "step": 38, | |
| "tokens/total": 9961472, | |
| "tokens/train_per_sec_per_gpu": 109.57, | |
| "tokens/trainable": 8441722 | |
| }, | |
| { | |
| "epoch": 0.8041237113402062, | |
| "grad_norm": 0.40829339623451233, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7005173563957214, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.9, | |
| "memory/max_allocated (GiB)": 114.9, | |
| "ppl": 2.01479, | |
| "step": 39, | |
| "tokens/total": 10223616, | |
| "tokens/train_per_sec_per_gpu": 106.43, | |
| "tokens/trainable": 8667693 | |
| }, | |
| { | |
| "epoch": 0.8247422680412371, | |
| "grad_norm": 0.41540759801864624, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6717456579208374, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.9, | |
| "memory/max_allocated (GiB)": 114.9, | |
| "ppl": 1.95765, | |
| "step": 40, | |
| "tokens/total": 10485760, | |
| "tokens/train_per_sec_per_gpu": 114.21, | |
| "tokens/trainable": 8886348 | |
| }, | |
| { | |
| "epoch": 0.845360824742268, | |
| "grad_norm": 0.4369148910045624, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7158852815628052, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.76, | |
| "memory/max_allocated (GiB)": 114.76, | |
| "ppl": 2.046, | |
| "step": 41, | |
| "tokens/total": 10747904, | |
| "tokens/train_per_sec_per_gpu": 104.71, | |
| "tokens/trainable": 9105785 | |
| }, | |
| { | |
| "epoch": 0.865979381443299, | |
| "grad_norm": 0.41476163268089294, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6955273747444153, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.00477, | |
| "step": 42, | |
| "tokens/total": 11010048, | |
| "tokens/train_per_sec_per_gpu": 88.0, | |
| "tokens/trainable": 9326223 | |
| }, | |
| { | |
| "epoch": 0.8865979381443299, | |
| "grad_norm": 0.4271896481513977, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7322195172309875, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 2.07969, | |
| "step": 43, | |
| "tokens/total": 11272192, | |
| "tokens/train_per_sec_per_gpu": 113.13, | |
| "tokens/trainable": 9548339 | |
| }, | |
| { | |
| "epoch": 0.9072164948453608, | |
| "grad_norm": 0.4813380539417267, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6868515014648438, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.98745, | |
| "step": 44, | |
| "tokens/total": 11534336, | |
| "tokens/train_per_sec_per_gpu": 90.58, | |
| "tokens/trainable": 9766004 | |
| }, | |
| { | |
| "epoch": 0.9278350515463918, | |
| "grad_norm": 0.42919376492500305, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6917351484298706, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 1.99718, | |
| "step": 45, | |
| "tokens/total": 11796480, | |
| "tokens/train_per_sec_per_gpu": 115.72, | |
| "tokens/trainable": 9990528 | |
| }, | |
| { | |
| "epoch": 0.9484536082474226, | |
| "grad_norm": 0.409891813993454, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6957749724388123, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.00526, | |
| "step": 46, | |
| "tokens/total": 12058624, | |
| "tokens/train_per_sec_per_gpu": 102.04, | |
| "tokens/trainable": 10215266 | |
| }, | |
| { | |
| "epoch": 0.9690721649484536, | |
| "grad_norm": 0.4291461110115051, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7449676990509033, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 2.10637, | |
| "step": 47, | |
| "tokens/total": 12320768, | |
| "tokens/train_per_sec_per_gpu": 111.94, | |
| "tokens/trainable": 10433241 | |
| }, | |
| { | |
| "epoch": 0.9896907216494846, | |
| "grad_norm": 0.5046327710151672, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7033790946006775, | |
| "memory/device_reserved (GiB)": 115.99, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 2.02057, | |
| "step": 48, | |
| "tokens/total": 12582912, | |
| "tokens/train_per_sec_per_gpu": 111.07, | |
| "tokens/trainable": 10653553 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.8141054511070251, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.7225552797317505, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.0, | |
| "memory/max_allocated (GiB)": 115.0, | |
| "ppl": 2.05969, | |
| "step": 49, | |
| "tokens/total": 12705792, | |
| "tokens/train_per_sec_per_gpu": 88.6, | |
| "tokens/trainable": 10742066 | |
| }, | |
| { | |
| "epoch": 1.0206185567010309, | |
| "grad_norm": 0.5047632455825806, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6459757685661316, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 1.90785, | |
| "step": 50, | |
| "tokens/total": 12967936, | |
| "tokens/train_per_sec_per_gpu": 102.43, | |
| "tokens/trainable": 10957216 | |
| }, | |
| { | |
| "epoch": 1.041237113402062, | |
| "grad_norm": 0.49363699555397034, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6044580340385437, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.83026, | |
| "step": 51, | |
| "tokens/total": 13230080, | |
| "tokens/train_per_sec_per_gpu": 103.01, | |
| "tokens/trainable": 11168351 | |
| }, | |
| { | |
| "epoch": 1.0618556701030928, | |
| "grad_norm": 0.4461610019207001, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.620111346244812, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.85914, | |
| "step": 52, | |
| "tokens/total": 13492224, | |
| "tokens/train_per_sec_per_gpu": 115.62, | |
| "tokens/trainable": 11391383 | |
| }, | |
| { | |
| "epoch": 1.0824742268041236, | |
| "grad_norm": 0.40221890807151794, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.5692113041877747, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 1.76687, | |
| "step": 53, | |
| "tokens/total": 13754368, | |
| "tokens/train_per_sec_per_gpu": 117.49, | |
| "tokens/trainable": 11620324 | |
| }, | |
| { | |
| "epoch": 1.1030927835051547, | |
| "grad_norm": 0.4584852457046509, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6176695823669434, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 1.8546, | |
| "step": 54, | |
| "tokens/total": 14016512, | |
| "tokens/train_per_sec_per_gpu": 93.72, | |
| "tokens/trainable": 11838620 | |
| }, | |
| { | |
| "epoch": 1.1237113402061856, | |
| "grad_norm": 0.48267844319343567, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6084916591644287, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 1.83766, | |
| "step": 55, | |
| "tokens/total": 14278656, | |
| "tokens/train_per_sec_per_gpu": 81.91, | |
| "tokens/trainable": 12051077 | |
| }, | |
| { | |
| "epoch": 1.1443298969072164, | |
| "grad_norm": 0.4616500437259674, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.5891711711883545, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.80249, | |
| "step": 56, | |
| "tokens/total": 14540800, | |
| "tokens/train_per_sec_per_gpu": 117.34, | |
| "tokens/trainable": 12280103 | |
| }, | |
| { | |
| "epoch": 1.1649484536082475, | |
| "grad_norm": 0.5742337107658386, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.601008951663971, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.82396, | |
| "step": 57, | |
| "tokens/total": 14802944, | |
| "tokens/train_per_sec_per_gpu": 108.85, | |
| "tokens/trainable": 12497669 | |
| }, | |
| { | |
| "epoch": 1.1855670103092784, | |
| "grad_norm": 0.5385653376579285, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6246358156204224, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 1.86757, | |
| "step": 58, | |
| "tokens/total": 15065088, | |
| "tokens/train_per_sec_per_gpu": 105.9, | |
| "tokens/trainable": 12722771 | |
| }, | |
| { | |
| "epoch": 1.2061855670103092, | |
| "grad_norm": 0.45170825719833374, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6120405197143555, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 1.84419, | |
| "step": 59, | |
| "tokens/total": 15327232, | |
| "tokens/train_per_sec_per_gpu": 100.79, | |
| "tokens/trainable": 12952093 | |
| }, | |
| { | |
| "epoch": 1.2268041237113403, | |
| "grad_norm": 0.45874664187431335, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6256142854690552, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 1.86939, | |
| "step": 60, | |
| "tokens/total": 15589376, | |
| "tokens/train_per_sec_per_gpu": 114.58, | |
| "tokens/trainable": 13177393 | |
| }, | |
| { | |
| "epoch": 1.2474226804123711, | |
| "grad_norm": 0.42568033933639526, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.546139121055603, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.72657, | |
| "step": 61, | |
| "tokens/total": 15851520, | |
| "tokens/train_per_sec_per_gpu": 107.76, | |
| "tokens/trainable": 13402107 | |
| }, | |
| { | |
| "epoch": 1.268041237113402, | |
| "grad_norm": 0.4116363525390625, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.5737490653991699, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 114.9, | |
| "memory/max_allocated (GiB)": 114.9, | |
| "ppl": 1.77491, | |
| "step": 62, | |
| "tokens/total": 16113664, | |
| "tokens/train_per_sec_per_gpu": 85.83, | |
| "tokens/trainable": 13620089 | |
| }, | |
| { | |
| "epoch": 1.2886597938144329, | |
| "grad_norm": 0.6544530987739563, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6034448146820068, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.02, | |
| "memory/max_allocated (GiB)": 115.02, | |
| "ppl": 1.82841, | |
| "step": 63, | |
| "tokens/total": 16375808, | |
| "tokens/train_per_sec_per_gpu": 112.67, | |
| "tokens/trainable": 13850758 | |
| }, | |
| { | |
| "epoch": 1.309278350515464, | |
| "grad_norm": 0.4090299904346466, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.5996339321136475, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.82145, | |
| "step": 64, | |
| "tokens/total": 16637952, | |
| "tokens/train_per_sec_per_gpu": 117.01, | |
| "tokens/trainable": 14081234 | |
| }, | |
| { | |
| "epoch": 1.3298969072164948, | |
| "grad_norm": 0.42677614092826843, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.6008721590042114, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 114.89, | |
| "memory/max_allocated (GiB)": 114.89, | |
| "ppl": 1.82371, | |
| "step": 65, | |
| "tokens/total": 16900096, | |
| "tokens/train_per_sec_per_gpu": 111.99, | |
| "tokens/trainable": 14313785 | |
| }, | |
| { | |
| "epoch": 1.3505154639175259, | |
| "grad_norm": 0.4415728747844696, | |
| "learning_rate": 9.999999747378752e-06, | |
| "loss": 0.5510027408599854, | |
| "memory/device_reserved (GiB)": 116.01, | |
| "memory/max_active (GiB)": 115.01, | |
| "memory/max_allocated (GiB)": 115.01, | |
| "ppl": 1.73499, | |
| "step": 66, | |
| "tokens/total": 17162240, | |
| "tokens/train_per_sec_per_gpu": 101.11, | |
| "tokens/trainable": 14531975 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 96, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 6, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.9484107215221555e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
Xet Storage Details
- Size:
- 30.4 kB
- Xet hash:
- 2d21d148f5e7db16e0f1ca96c1e29466faf5299147a68ca0962139bfc31918a7
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.