{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 15.5,
      "learning_rate": 1.9000000000000002e-06,
      "loss": 1.6489,
      "mean_token_accuracy": 0.6407988503575325,
      "num_tokens": 9165.0,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 13.25,
      "learning_rate": 3.900000000000001e-06,
      "loss": 1.5796,
      "mean_token_accuracy": 0.6642300620675087,
      "num_tokens": 19130.0,
      "step": 40
    },
    {
      "epoch": 0.06,
      "grad_norm": 8.5625,
      "learning_rate": 5.9e-06,
      "loss": 1.5051,
      "mean_token_accuracy": 0.6587439611554146,
      "num_tokens": 30279.0,
      "step": 60
    },
    {
      "epoch": 0.08,
      "grad_norm": 47.0,
      "learning_rate": 7.9e-06,
      "loss": 1.0906,
      "mean_token_accuracy": 0.7278790444135665,
      "num_tokens": 42533.0,
      "step": 80
    },
    {
      "epoch": 0.1,
      "grad_norm": 18.0,
      "learning_rate": 9.9e-06,
      "loss": 1.6005,
      "mean_token_accuracy": 0.6399814695119858,
      "num_tokens": 48242.0,
      "step": 100
    },
    {
      "epoch": 0.12,
      "grad_norm": 23.875,
      "learning_rate": 1.1900000000000001e-05,
      "loss": 1.1348,
      "mean_token_accuracy": 0.7180342584848404,
      "num_tokens": 59331.0,
      "step": 120
    },
    {
      "epoch": 0.14,
      "grad_norm": 5.53125,
      "learning_rate": 1.39e-05,
      "loss": 1.1225,
      "mean_token_accuracy": 0.7080226972699165,
      "num_tokens": 68979.0,
      "step": 140
    },
    {
      "epoch": 0.16,
      "grad_norm": 7.53125,
      "learning_rate": 1.5900000000000004e-05,
      "loss": 1.2722,
      "mean_token_accuracy": 0.6867348909378052,
      "num_tokens": 76070.0,
      "step": 160
    },
    {
      "epoch": 0.18,
      "grad_norm": 10.0,
      "learning_rate": 1.79e-05,
      "loss": 1.0984,
      "mean_token_accuracy": 0.7116657719016075,
      "num_tokens": 85738.0,
      "step": 180
    },
    {
      "epoch": 0.2,
      "grad_norm": 29.75,
      "learning_rate": 1.9900000000000003e-05,
      "loss": 1.2382,
      "mean_token_accuracy": 0.6763741672039032,
      "num_tokens": 94928.0,
      "step": 200
    },
    {
      "epoch": 0.22,
      "grad_norm": 10.0,
      "learning_rate": 1.997217754240393e-05,
      "loss": 1.1036,
      "mean_token_accuracy": 0.7029919415712357,
      "num_tokens": 104773.0,
      "step": 220
    },
    {
      "epoch": 0.24,
      "grad_norm": 8.875,
      "learning_rate": 1.9882950400358694e-05,
      "loss": 1.086,
      "mean_token_accuracy": 0.7115180641412735,
      "num_tokens": 114064.0,
      "step": 240
    },
    {
      "epoch": 0.26,
      "grad_norm": 21.625,
      "learning_rate": 1.973279158268075e-05,
      "loss": 1.0707,
      "mean_token_accuracy": 0.7133749336004257,
      "num_tokens": 123805.0,
      "step": 260
    },
    {
      "epoch": 0.28,
      "grad_norm": 6.875,
      "learning_rate": 1.9522626868413956e-05,
      "loss": 1.0925,
      "mean_token_accuracy": 0.6988732308149338,
      "num_tokens": 134324.0,
      "step": 280
    },
    {
      "epoch": 0.3,
      "grad_norm": 15.75,
      "learning_rate": 1.9253751992908624e-05,
      "loss": 1.0249,
      "mean_token_accuracy": 0.7223192781209946,
      "num_tokens": 143153.0,
      "step": 300
    },
    {
      "epoch": 0.32,
      "grad_norm": 15.25,
      "learning_rate": 1.892782465918221e-05,
      "loss": 0.9397,
      "mean_token_accuracy": 0.7436516433954239,
      "num_tokens": 154603.0,
      "step": 320
    },
    {
      "epoch": 0.34,
      "grad_norm": 9.1875,
      "learning_rate": 1.854685431762898e-05,
      "loss": 0.9858,
      "mean_token_accuracy": 0.7232172518968583,
      "num_tokens": 167175.0,
      "step": 340
    },
    {
      "epoch": 0.36,
      "grad_norm": 20.5,
      "learning_rate": 1.81131897770901e-05,
      "loss": 1.1003,
      "mean_token_accuracy": 0.710582047700882,
      "num_tokens": 176751.0,
      "step": 360
    },
    {
      "epoch": 0.38,
      "grad_norm": 7.6875,
      "learning_rate": 1.762950472366609e-05,
      "loss": 1.1021,
      "mean_token_accuracy": 0.7087165921926498,
      "num_tokens": 185233.0,
      "step": 380
    },
    {
      "epoch": 0.4,
      "grad_norm": 9.1875,
      "learning_rate": 1.7098781236552905e-05,
      "loss": 1.0907,
      "mean_token_accuracy": 0.7013399839401245,
      "num_tokens": 193072.0,
      "step": 400
    },
    {
      "epoch": 0.42,
      "grad_norm": 8.125,
      "learning_rate": 1.6524291402532068e-05,
      "loss": 1.0278,
      "mean_token_accuracy": 0.7198010861873627,
      "num_tokens": 204372.0,
      "step": 420
    },
    {
      "epoch": 0.44,
      "grad_norm": 15.6875,
      "learning_rate": 1.5909577142467575e-05,
      "loss": 0.9938,
      "mean_token_accuracy": 0.7272346079349518,
      "num_tokens": 216681.0,
      "step": 440
    },
    {
      "epoch": 0.46,
      "grad_norm": 6.90625,
      "learning_rate": 1.5258428374185957e-05,
      "loss": 1.0869,
      "mean_token_accuracy": 0.7040712654590606,
      "num_tokens": 226708.0,
      "step": 460
    },
    {
      "epoch": 0.48,
      "grad_norm": 16.5,
      "learning_rate": 1.4574859646372605e-05,
      "loss": 0.9903,
      "mean_token_accuracy": 0.718842813372612,
      "num_tokens": 237866.0,
      "step": 480
    },
    {
      "epoch": 0.5,
      "grad_norm": 8.25,
      "learning_rate": 1.3863085387544162e-05,
      "loss": 1.1209,
      "mean_token_accuracy": 0.6857956349849701,
      "num_tokens": 245487.0,
      "step": 500
    },
    {
      "epoch": 0.52,
      "grad_norm": 6.21875,
      "learning_rate": 1.312749392269526e-05,
      "loss": 0.9098,
      "mean_token_accuracy": 0.7424555242061615,
      "num_tokens": 257916.0,
      "step": 520
    },
    {
      "epoch": 0.54,
      "grad_norm": 9.125,
      "learning_rate": 1.237262041781568e-05,
      "loss": 0.9883,
      "mean_token_accuracy": 0.7290615320205689,
      "num_tokens": 268576.0,
      "step": 540
    },
    {
      "epoch": 0.56,
      "grad_norm": 29.5,
      "learning_rate": 1.1603118919083913e-05,
      "loss": 0.9659,
      "mean_token_accuracy": 0.7375754147768021,
      "num_tokens": 279922.0,
      "step": 560
    },
    {
      "epoch": 0.58,
      "grad_norm": 9.0,
      "learning_rate": 1.0823733659124857e-05,
      "loss": 0.9751,
      "mean_token_accuracy": 0.7351507008075714,
      "num_tokens": 291510.0,
      "step": 580
    },
    {
      "epoch": 0.6,
      "grad_norm": 11.125,
      "learning_rate": 1.0039269807238061e-05,
      "loss": 0.9839,
      "mean_token_accuracy": 0.7332630962133407,
      "num_tokens": 299117.0,
      "step": 600
    },
    {
      "epoch": 0.62,
      "grad_norm": 4.59375,
      "learning_rate": 9.254563843931104e-06,
      "loss": 0.874,
      "mean_token_accuracy": 0.759978985786438,
      "num_tokens": 308284.0,
      "step": 620
    },
    {
      "epoch": 0.64,
      "grad_norm": 8.1875,
      "learning_rate": 8.474453742408992e-06,
      "loss": 1.117,
      "mean_token_accuracy": 0.700233319401741,
      "num_tokens": 317108.0,
      "step": 640
    },
    {
      "epoch": 0.66,
      "grad_norm": 7.125,
      "learning_rate": 7.703749140860532e-06,
      "loss": 0.8497,
      "mean_token_accuracy": 0.7608069866895676,
      "num_tokens": 329158.0,
      "step": 660
    },
    {
      "epoch": 0.68,
      "grad_norm": 5.03125,
      "learning_rate": 6.947201689439566e-06,
      "loss": 0.9534,
      "mean_token_accuracy": 0.738220265507698,
      "num_tokens": 341065.0,
      "step": 680
    },
    {
      "epoch": 0.7,
      "grad_norm": 29.125,
      "learning_rate": 6.209475754761656e-06,
      "loss": 1.0181,
      "mean_token_accuracy": 0.7240578979253769,
      "num_tokens": 350407.0,
      "step": 700
    },
    {
      "epoch": 0.72,
      "grad_norm": 7.625,
      "learning_rate": 5.495119662532857e-06,
      "loss": 0.9699,
      "mean_token_accuracy": 0.7413066476583481,
      "num_tokens": 359556.0,
      "step": 720
    },
    {
      "epoch": 0.74,
      "grad_norm": 10.8125,
      "learning_rate": 4.808537655609275e-06,
      "loss": 0.918,
      "mean_token_accuracy": 0.733815786242485,
      "num_tokens": 369946.0,
      "step": 740
    },
    {
      "epoch": 0.76,
      "grad_norm": 14.75,
      "learning_rate": 4.153962740375267e-06,
      "loss": 1.0485,
      "mean_token_accuracy": 0.7134604543447495,
      "num_tokens": 378913.0,
      "step": 760
    },
    {
      "epoch": 0.78,
      "grad_norm": 32.0,
      "learning_rate": 3.535430588851503e-06,
      "loss": 0.9323,
      "mean_token_accuracy": 0.7460434168577195,
      "num_tokens": 388498.0,
      "step": 780
    },
    {
      "epoch": 0.8,
      "grad_norm": 22.125,
      "learning_rate": 2.956754657434937e-06,
      "loss": 0.9263,
      "mean_token_accuracy": 0.7375149309635163,
      "num_tokens": 399344.0,
      "step": 800
    },
    {
      "epoch": 0.82,
      "grad_norm": 9.625,
      "learning_rate": 2.4215026756720326e-06,
      "loss": 0.9806,
      "mean_token_accuracy": 0.7308182954788208,
      "num_tokens": 408984.0,
      "step": 820
    },
    {
      "epoch": 0.84,
      "grad_norm": 9.5,
      "learning_rate": 1.932974650019702e-06,
      "loss": 0.8883,
      "mean_token_accuracy": 0.7622001320123672,
      "num_tokens": 420774.0,
      "step": 840
    },
    {
      "epoch": 0.86,
      "grad_norm": 10.3125,
      "learning_rate": 1.4941825182081392e-06,
      "loss": 0.9587,
      "mean_token_accuracy": 0.7398986458778382,
      "num_tokens": 431337.0,
      "step": 860
    },
    {
      "epoch": 0.88,
      "grad_norm": 5.59375,
      "learning_rate": 1.107831579643157e-06,
      "loss": 0.968,
      "mean_token_accuracy": 0.7305586785078049,
      "num_tokens": 441642.0,
      "step": 880
    },
    {
      "epoch": 0.9,
      "grad_norm": 5.25,
      "learning_rate": 7.763038163357317e-07,
      "loss": 1.0572,
      "mean_token_accuracy": 0.7166826635599136,
      "num_tokens": 451392.0,
      "step": 900
    },
    {
      "epoch": 0.92,
      "grad_norm": 6.46875,
      "learning_rate": 5.016432071908228e-07,
      "loss": 0.9599,
      "mean_token_accuracy": 0.7386292368173599,
      "num_tokens": 462976.0,
      "step": 920
    },
    {
      "epoch": 0.94,
      "grad_norm": 7.28125,
      "learning_rate": 2.855431261977315e-07,
      "loss": 1.1379,
      "mean_token_accuracy": 0.7005489379167557,
      "num_tokens": 470774.0,
      "step": 940
    },
    {
      "epoch": 0.96,
      "grad_norm": 34.5,
      "learning_rate": 1.293359022163443e-07,
      "loss": 1.0575,
      "mean_token_accuracy": 0.716192701458931,
      "num_tokens": 482102.0,
      "step": 960
    },
    {
      "epoch": 0.98,
      "grad_norm": 5.65625,
      "learning_rate": 3.398460472668341e-08,
      "loss": 0.8541,
      "mean_token_accuracy": 0.7635544419288636,
      "num_tokens": 494552.0,
      "step": 980
    },
    {
      "epoch": 1.0,
      "grad_norm": 7.21875,
      "learning_rate": 7.710618529443992e-11,
      "loss": 0.9549,
      "mean_token_accuracy": 0.7463011115789413,
      "num_tokens": 504368.0,
      "step": 1000
    },
    {
      "epoch": 1.0,
      "step": 1000,
      "total_flos": 1.0995762888818688e+16,
      "train_loss": 1.0670766773223876,
      "train_runtime": 40993.3469,
      "train_samples_per_second": 0.024,
      "train_steps_per_second": 0.024
    }
  ],
  "logging_steps": 20,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0995762888818688e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}