{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 15.5, "learning_rate": 1.9000000000000002e-06, "loss": 1.6489, "mean_token_accuracy": 0.6407988503575325, "num_tokens": 9165.0, "step": 20 }, { "epoch": 0.04, "grad_norm": 13.25, "learning_rate": 3.900000000000001e-06, "loss": 1.5796, "mean_token_accuracy": 0.6642300620675087, "num_tokens": 19130.0, "step": 40 }, { "epoch": 0.06, "grad_norm": 8.5625, "learning_rate": 5.9e-06, "loss": 1.5051, "mean_token_accuracy": 0.6587439611554146, "num_tokens": 30279.0, "step": 60 }, { "epoch": 0.08, "grad_norm": 47.0, "learning_rate": 7.9e-06, "loss": 1.0906, "mean_token_accuracy": 0.7278790444135665, "num_tokens": 42533.0, "step": 80 }, { "epoch": 0.1, "grad_norm": 18.0, "learning_rate": 9.9e-06, "loss": 1.6005, "mean_token_accuracy": 0.6399814695119858, "num_tokens": 48242.0, "step": 100 }, { "epoch": 0.12, "grad_norm": 23.875, "learning_rate": 1.1900000000000001e-05, "loss": 1.1348, "mean_token_accuracy": 0.7180342584848404, "num_tokens": 59331.0, "step": 120 }, { "epoch": 0.14, "grad_norm": 5.53125, "learning_rate": 1.39e-05, "loss": 1.1225, "mean_token_accuracy": 0.7080226972699165, "num_tokens": 68979.0, "step": 140 }, { "epoch": 0.16, "grad_norm": 7.53125, "learning_rate": 1.5900000000000004e-05, "loss": 1.2722, "mean_token_accuracy": 0.6867348909378052, "num_tokens": 76070.0, "step": 160 }, { "epoch": 0.18, "grad_norm": 10.0, "learning_rate": 1.79e-05, "loss": 1.0984, "mean_token_accuracy": 0.7116657719016075, "num_tokens": 85738.0, "step": 180 }, { "epoch": 0.2, "grad_norm": 29.75, "learning_rate": 1.9900000000000003e-05, "loss": 1.2382, "mean_token_accuracy": 0.6763741672039032, "num_tokens": 94928.0, "step": 200 }, { "epoch": 0.22, "grad_norm": 10.0, "learning_rate": 1.997217754240393e-05, "loss": 1.1036, "mean_token_accuracy": 0.7029919415712357, "num_tokens": 104773.0, "step": 220 }, { "epoch": 0.24, "grad_norm": 8.875, "learning_rate": 1.9882950400358694e-05, "loss": 1.086, "mean_token_accuracy": 0.7115180641412735, "num_tokens": 114064.0, "step": 240 }, { "epoch": 0.26, "grad_norm": 21.625, "learning_rate": 1.973279158268075e-05, "loss": 1.0707, "mean_token_accuracy": 0.7133749336004257, "num_tokens": 123805.0, "step": 260 }, { "epoch": 0.28, "grad_norm": 6.875, "learning_rate": 1.9522626868413956e-05, "loss": 1.0925, "mean_token_accuracy": 0.6988732308149338, "num_tokens": 134324.0, "step": 280 }, { "epoch": 0.3, "grad_norm": 15.75, "learning_rate": 1.9253751992908624e-05, "loss": 1.0249, "mean_token_accuracy": 0.7223192781209946, "num_tokens": 143153.0, "step": 300 }, { "epoch": 0.32, "grad_norm": 15.25, "learning_rate": 1.892782465918221e-05, "loss": 0.9397, "mean_token_accuracy": 0.7436516433954239, "num_tokens": 154603.0, "step": 320 }, { "epoch": 0.34, "grad_norm": 9.1875, "learning_rate": 1.854685431762898e-05, "loss": 0.9858, "mean_token_accuracy": 0.7232172518968583, "num_tokens": 167175.0, "step": 340 }, { "epoch": 0.36, "grad_norm": 20.5, "learning_rate": 1.81131897770901e-05, "loss": 1.1003, "mean_token_accuracy": 0.710582047700882, "num_tokens": 176751.0, "step": 360 }, { "epoch": 0.38, "grad_norm": 7.6875, "learning_rate": 1.762950472366609e-05, "loss": 1.1021, "mean_token_accuracy": 0.7087165921926498, "num_tokens": 185233.0, "step": 380 }, { "epoch": 0.4, "grad_norm": 9.1875, "learning_rate": 1.7098781236552905e-05, "loss": 1.0907, "mean_token_accuracy": 0.7013399839401245, "num_tokens": 193072.0, "step": 400 }, { "epoch": 0.42, "grad_norm": 8.125, "learning_rate": 1.6524291402532068e-05, "loss": 1.0278, "mean_token_accuracy": 0.7198010861873627, "num_tokens": 204372.0, "step": 420 }, { "epoch": 0.44, "grad_norm": 15.6875, "learning_rate": 1.5909577142467575e-05, "loss": 0.9938, "mean_token_accuracy": 0.7272346079349518, "num_tokens": 216681.0, "step": 440 }, { "epoch": 0.46, "grad_norm": 6.90625, "learning_rate": 1.5258428374185957e-05, "loss": 1.0869, "mean_token_accuracy": 0.7040712654590606, "num_tokens": 226708.0, "step": 460 }, { "epoch": 0.48, "grad_norm": 16.5, "learning_rate": 1.4574859646372605e-05, "loss": 0.9903, "mean_token_accuracy": 0.718842813372612, "num_tokens": 237866.0, "step": 480 }, { "epoch": 0.5, "grad_norm": 8.25, "learning_rate": 1.3863085387544162e-05, "loss": 1.1209, "mean_token_accuracy": 0.6857956349849701, "num_tokens": 245487.0, "step": 500 }, { "epoch": 0.52, "grad_norm": 6.21875, "learning_rate": 1.312749392269526e-05, "loss": 0.9098, "mean_token_accuracy": 0.7424555242061615, "num_tokens": 257916.0, "step": 520 }, { "epoch": 0.54, "grad_norm": 9.125, "learning_rate": 1.237262041781568e-05, "loss": 0.9883, "mean_token_accuracy": 0.7290615320205689, "num_tokens": 268576.0, "step": 540 }, { "epoch": 0.56, "grad_norm": 29.5, "learning_rate": 1.1603118919083913e-05, "loss": 0.9659, "mean_token_accuracy": 0.7375754147768021, "num_tokens": 279922.0, "step": 560 }, { "epoch": 0.58, "grad_norm": 9.0, "learning_rate": 1.0823733659124857e-05, "loss": 0.9751, "mean_token_accuracy": 0.7351507008075714, "num_tokens": 291510.0, "step": 580 }, { "epoch": 0.6, "grad_norm": 11.125, "learning_rate": 1.0039269807238061e-05, "loss": 0.9839, "mean_token_accuracy": 0.7332630962133407, "num_tokens": 299117.0, "step": 600 }, { "epoch": 0.62, "grad_norm": 4.59375, "learning_rate": 9.254563843931104e-06, "loss": 0.874, "mean_token_accuracy": 0.759978985786438, "num_tokens": 308284.0, "step": 620 }, { "epoch": 0.64, "grad_norm": 8.1875, "learning_rate": 8.474453742408992e-06, "loss": 1.117, "mean_token_accuracy": 0.700233319401741, "num_tokens": 317108.0, "step": 640 }, { "epoch": 0.66, "grad_norm": 7.125, "learning_rate": 7.703749140860532e-06, "loss": 0.8497, "mean_token_accuracy": 0.7608069866895676, "num_tokens": 329158.0, "step": 660 }, { "epoch": 0.68, "grad_norm": 5.03125, "learning_rate": 6.947201689439566e-06, "loss": 0.9534, "mean_token_accuracy": 0.738220265507698, "num_tokens": 341065.0, "step": 680 }, { "epoch": 0.7, "grad_norm": 29.125, "learning_rate": 6.209475754761656e-06, "loss": 1.0181, "mean_token_accuracy": 0.7240578979253769, "num_tokens": 350407.0, "step": 700 }, { "epoch": 0.72, "grad_norm": 7.625, "learning_rate": 5.495119662532857e-06, "loss": 0.9699, "mean_token_accuracy": 0.7413066476583481, "num_tokens": 359556.0, "step": 720 }, { "epoch": 0.74, "grad_norm": 10.8125, "learning_rate": 4.808537655609275e-06, "loss": 0.918, "mean_token_accuracy": 0.733815786242485, "num_tokens": 369946.0, "step": 740 }, { "epoch": 0.76, "grad_norm": 14.75, "learning_rate": 4.153962740375267e-06, "loss": 1.0485, "mean_token_accuracy": 0.7134604543447495, "num_tokens": 378913.0, "step": 760 }, { "epoch": 0.78, "grad_norm": 32.0, "learning_rate": 3.535430588851503e-06, "loss": 0.9323, "mean_token_accuracy": 0.7460434168577195, "num_tokens": 388498.0, "step": 780 }, { "epoch": 0.8, "grad_norm": 22.125, "learning_rate": 2.956754657434937e-06, "loss": 0.9263, "mean_token_accuracy": 0.7375149309635163, "num_tokens": 399344.0, "step": 800 }, { "epoch": 0.82, "grad_norm": 9.625, "learning_rate": 2.4215026756720326e-06, "loss": 0.9806, "mean_token_accuracy": 0.7308182954788208, "num_tokens": 408984.0, "step": 820 }, { "epoch": 0.84, "grad_norm": 9.5, "learning_rate": 1.932974650019702e-06, "loss": 0.8883, "mean_token_accuracy": 0.7622001320123672, "num_tokens": 420774.0, "step": 840 }, { "epoch": 0.86, "grad_norm": 10.3125, "learning_rate": 1.4941825182081392e-06, "loss": 0.9587, "mean_token_accuracy": 0.7398986458778382, "num_tokens": 431337.0, "step": 860 }, { "epoch": 0.88, "grad_norm": 5.59375, "learning_rate": 1.107831579643157e-06, "loss": 0.968, "mean_token_accuracy": 0.7305586785078049, "num_tokens": 441642.0, "step": 880 }, { "epoch": 0.9, "grad_norm": 5.25, "learning_rate": 7.763038163357317e-07, "loss": 1.0572, "mean_token_accuracy": 0.7166826635599136, "num_tokens": 451392.0, "step": 900 }, { "epoch": 0.92, "grad_norm": 6.46875, "learning_rate": 5.016432071908228e-07, "loss": 0.9599, "mean_token_accuracy": 0.7386292368173599, "num_tokens": 462976.0, "step": 920 }, { "epoch": 0.94, "grad_norm": 7.28125, "learning_rate": 2.855431261977315e-07, "loss": 1.1379, "mean_token_accuracy": 0.7005489379167557, "num_tokens": 470774.0, "step": 940 }, { "epoch": 0.96, "grad_norm": 34.5, "learning_rate": 1.293359022163443e-07, "loss": 1.0575, "mean_token_accuracy": 0.716192701458931, "num_tokens": 482102.0, "step": 960 }, { "epoch": 0.98, "grad_norm": 5.65625, "learning_rate": 3.398460472668341e-08, "loss": 0.8541, "mean_token_accuracy": 0.7635544419288636, "num_tokens": 494552.0, "step": 980 }, { "epoch": 1.0, "grad_norm": 7.21875, "learning_rate": 7.710618529443992e-11, "loss": 0.9549, "mean_token_accuracy": 0.7463011115789413, "num_tokens": 504368.0, "step": 1000 }, { "epoch": 1.0, "step": 1000, "total_flos": 1.0995762888818688e+16, "train_loss": 1.0670766773223876, "train_runtime": 40993.3469, "train_samples_per_second": 0.024, "train_steps_per_second": 0.024 } ], "logging_steps": 20, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0995762888818688e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }