{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 15.5,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.6489,
"mean_token_accuracy": 0.6407988503575325,
"num_tokens": 9165.0,
"step": 20
},
{
"epoch": 0.04,
"grad_norm": 13.25,
"learning_rate": 3.900000000000001e-06,
"loss": 1.5796,
"mean_token_accuracy": 0.6642300620675087,
"num_tokens": 19130.0,
"step": 40
},
{
"epoch": 0.06,
"grad_norm": 8.5625,
"learning_rate": 5.9e-06,
"loss": 1.5051,
"mean_token_accuracy": 0.6587439611554146,
"num_tokens": 30279.0,
"step": 60
},
{
"epoch": 0.08,
"grad_norm": 47.0,
"learning_rate": 7.9e-06,
"loss": 1.0906,
"mean_token_accuracy": 0.7278790444135665,
"num_tokens": 42533.0,
"step": 80
},
{
"epoch": 0.1,
"grad_norm": 18.0,
"learning_rate": 9.9e-06,
"loss": 1.6005,
"mean_token_accuracy": 0.6399814695119858,
"num_tokens": 48242.0,
"step": 100
},
{
"epoch": 0.12,
"grad_norm": 23.875,
"learning_rate": 1.1900000000000001e-05,
"loss": 1.1348,
"mean_token_accuracy": 0.7180342584848404,
"num_tokens": 59331.0,
"step": 120
},
{
"epoch": 0.14,
"grad_norm": 5.53125,
"learning_rate": 1.39e-05,
"loss": 1.1225,
"mean_token_accuracy": 0.7080226972699165,
"num_tokens": 68979.0,
"step": 140
},
{
"epoch": 0.16,
"grad_norm": 7.53125,
"learning_rate": 1.5900000000000004e-05,
"loss": 1.2722,
"mean_token_accuracy": 0.6867348909378052,
"num_tokens": 76070.0,
"step": 160
},
{
"epoch": 0.18,
"grad_norm": 10.0,
"learning_rate": 1.79e-05,
"loss": 1.0984,
"mean_token_accuracy": 0.7116657719016075,
"num_tokens": 85738.0,
"step": 180
},
{
"epoch": 0.2,
"grad_norm": 29.75,
"learning_rate": 1.9900000000000003e-05,
"loss": 1.2382,
"mean_token_accuracy": 0.6763741672039032,
"num_tokens": 94928.0,
"step": 200
},
{
"epoch": 0.22,
"grad_norm": 10.0,
"learning_rate": 1.997217754240393e-05,
"loss": 1.1036,
"mean_token_accuracy": 0.7029919415712357,
"num_tokens": 104773.0,
"step": 220
},
{
"epoch": 0.24,
"grad_norm": 8.875,
"learning_rate": 1.9882950400358694e-05,
"loss": 1.086,
"mean_token_accuracy": 0.7115180641412735,
"num_tokens": 114064.0,
"step": 240
},
{
"epoch": 0.26,
"grad_norm": 21.625,
"learning_rate": 1.973279158268075e-05,
"loss": 1.0707,
"mean_token_accuracy": 0.7133749336004257,
"num_tokens": 123805.0,
"step": 260
},
{
"epoch": 0.28,
"grad_norm": 6.875,
"learning_rate": 1.9522626868413956e-05,
"loss": 1.0925,
"mean_token_accuracy": 0.6988732308149338,
"num_tokens": 134324.0,
"step": 280
},
{
"epoch": 0.3,
"grad_norm": 15.75,
"learning_rate": 1.9253751992908624e-05,
"loss": 1.0249,
"mean_token_accuracy": 0.7223192781209946,
"num_tokens": 143153.0,
"step": 300
},
{
"epoch": 0.32,
"grad_norm": 15.25,
"learning_rate": 1.892782465918221e-05,
"loss": 0.9397,
"mean_token_accuracy": 0.7436516433954239,
"num_tokens": 154603.0,
"step": 320
},
{
"epoch": 0.34,
"grad_norm": 9.1875,
"learning_rate": 1.854685431762898e-05,
"loss": 0.9858,
"mean_token_accuracy": 0.7232172518968583,
"num_tokens": 167175.0,
"step": 340
},
{
"epoch": 0.36,
"grad_norm": 20.5,
"learning_rate": 1.81131897770901e-05,
"loss": 1.1003,
"mean_token_accuracy": 0.710582047700882,
"num_tokens": 176751.0,
"step": 360
},
{
"epoch": 0.38,
"grad_norm": 7.6875,
"learning_rate": 1.762950472366609e-05,
"loss": 1.1021,
"mean_token_accuracy": 0.7087165921926498,
"num_tokens": 185233.0,
"step": 380
},
{
"epoch": 0.4,
"grad_norm": 9.1875,
"learning_rate": 1.7098781236552905e-05,
"loss": 1.0907,
"mean_token_accuracy": 0.7013399839401245,
"num_tokens": 193072.0,
"step": 400
},
{
"epoch": 0.42,
"grad_norm": 8.125,
"learning_rate": 1.6524291402532068e-05,
"loss": 1.0278,
"mean_token_accuracy": 0.7198010861873627,
"num_tokens": 204372.0,
"step": 420
},
{
"epoch": 0.44,
"grad_norm": 15.6875,
"learning_rate": 1.5909577142467575e-05,
"loss": 0.9938,
"mean_token_accuracy": 0.7272346079349518,
"num_tokens": 216681.0,
"step": 440
},
{
"epoch": 0.46,
"grad_norm": 6.90625,
"learning_rate": 1.5258428374185957e-05,
"loss": 1.0869,
"mean_token_accuracy": 0.7040712654590606,
"num_tokens": 226708.0,
"step": 460
},
{
"epoch": 0.48,
"grad_norm": 16.5,
"learning_rate": 1.4574859646372605e-05,
"loss": 0.9903,
"mean_token_accuracy": 0.718842813372612,
"num_tokens": 237866.0,
"step": 480
},
{
"epoch": 0.5,
"grad_norm": 8.25,
"learning_rate": 1.3863085387544162e-05,
"loss": 1.1209,
"mean_token_accuracy": 0.6857956349849701,
"num_tokens": 245487.0,
"step": 500
},
{
"epoch": 0.52,
"grad_norm": 6.21875,
"learning_rate": 1.312749392269526e-05,
"loss": 0.9098,
"mean_token_accuracy": 0.7424555242061615,
"num_tokens": 257916.0,
"step": 520
},
{
"epoch": 0.54,
"grad_norm": 9.125,
"learning_rate": 1.237262041781568e-05,
"loss": 0.9883,
"mean_token_accuracy": 0.7290615320205689,
"num_tokens": 268576.0,
"step": 540
},
{
"epoch": 0.56,
"grad_norm": 29.5,
"learning_rate": 1.1603118919083913e-05,
"loss": 0.9659,
"mean_token_accuracy": 0.7375754147768021,
"num_tokens": 279922.0,
"step": 560
},
{
"epoch": 0.58,
"grad_norm": 9.0,
"learning_rate": 1.0823733659124857e-05,
"loss": 0.9751,
"mean_token_accuracy": 0.7351507008075714,
"num_tokens": 291510.0,
"step": 580
},
{
"epoch": 0.6,
"grad_norm": 11.125,
"learning_rate": 1.0039269807238061e-05,
"loss": 0.9839,
"mean_token_accuracy": 0.7332630962133407,
"num_tokens": 299117.0,
"step": 600
},
{
"epoch": 0.62,
"grad_norm": 4.59375,
"learning_rate": 9.254563843931104e-06,
"loss": 0.874,
"mean_token_accuracy": 0.759978985786438,
"num_tokens": 308284.0,
"step": 620
},
{
"epoch": 0.64,
"grad_norm": 8.1875,
"learning_rate": 8.474453742408992e-06,
"loss": 1.117,
"mean_token_accuracy": 0.700233319401741,
"num_tokens": 317108.0,
"step": 640
},
{
"epoch": 0.66,
"grad_norm": 7.125,
"learning_rate": 7.703749140860532e-06,
"loss": 0.8497,
"mean_token_accuracy": 0.7608069866895676,
"num_tokens": 329158.0,
"step": 660
},
{
"epoch": 0.68,
"grad_norm": 5.03125,
"learning_rate": 6.947201689439566e-06,
"loss": 0.9534,
"mean_token_accuracy": 0.738220265507698,
"num_tokens": 341065.0,
"step": 680
},
{
"epoch": 0.7,
"grad_norm": 29.125,
"learning_rate": 6.209475754761656e-06,
"loss": 1.0181,
"mean_token_accuracy": 0.7240578979253769,
"num_tokens": 350407.0,
"step": 700
},
{
"epoch": 0.72,
"grad_norm": 7.625,
"learning_rate": 5.495119662532857e-06,
"loss": 0.9699,
"mean_token_accuracy": 0.7413066476583481,
"num_tokens": 359556.0,
"step": 720
},
{
"epoch": 0.74,
"grad_norm": 10.8125,
"learning_rate": 4.808537655609275e-06,
"loss": 0.918,
"mean_token_accuracy": 0.733815786242485,
"num_tokens": 369946.0,
"step": 740
},
{
"epoch": 0.76,
"grad_norm": 14.75,
"learning_rate": 4.153962740375267e-06,
"loss": 1.0485,
"mean_token_accuracy": 0.7134604543447495,
"num_tokens": 378913.0,
"step": 760
},
{
"epoch": 0.78,
"grad_norm": 32.0,
"learning_rate": 3.535430588851503e-06,
"loss": 0.9323,
"mean_token_accuracy": 0.7460434168577195,
"num_tokens": 388498.0,
"step": 780
},
{
"epoch": 0.8,
"grad_norm": 22.125,
"learning_rate": 2.956754657434937e-06,
"loss": 0.9263,
"mean_token_accuracy": 0.7375149309635163,
"num_tokens": 399344.0,
"step": 800
},
{
"epoch": 0.82,
"grad_norm": 9.625,
"learning_rate": 2.4215026756720326e-06,
"loss": 0.9806,
"mean_token_accuracy": 0.7308182954788208,
"num_tokens": 408984.0,
"step": 820
},
{
"epoch": 0.84,
"grad_norm": 9.5,
"learning_rate": 1.932974650019702e-06,
"loss": 0.8883,
"mean_token_accuracy": 0.7622001320123672,
"num_tokens": 420774.0,
"step": 840
},
{
"epoch": 0.86,
"grad_norm": 10.3125,
"learning_rate": 1.4941825182081392e-06,
"loss": 0.9587,
"mean_token_accuracy": 0.7398986458778382,
"num_tokens": 431337.0,
"step": 860
},
{
"epoch": 0.88,
"grad_norm": 5.59375,
"learning_rate": 1.107831579643157e-06,
"loss": 0.968,
"mean_token_accuracy": 0.7305586785078049,
"num_tokens": 441642.0,
"step": 880
},
{
"epoch": 0.9,
"grad_norm": 5.25,
"learning_rate": 7.763038163357317e-07,
"loss": 1.0572,
"mean_token_accuracy": 0.7166826635599136,
"num_tokens": 451392.0,
"step": 900
},
{
"epoch": 0.92,
"grad_norm": 6.46875,
"learning_rate": 5.016432071908228e-07,
"loss": 0.9599,
"mean_token_accuracy": 0.7386292368173599,
"num_tokens": 462976.0,
"step": 920
},
{
"epoch": 0.94,
"grad_norm": 7.28125,
"learning_rate": 2.855431261977315e-07,
"loss": 1.1379,
"mean_token_accuracy": 0.7005489379167557,
"num_tokens": 470774.0,
"step": 940
},
{
"epoch": 0.96,
"grad_norm": 34.5,
"learning_rate": 1.293359022163443e-07,
"loss": 1.0575,
"mean_token_accuracy": 0.716192701458931,
"num_tokens": 482102.0,
"step": 960
},
{
"epoch": 0.98,
"grad_norm": 5.65625,
"learning_rate": 3.398460472668341e-08,
"loss": 0.8541,
"mean_token_accuracy": 0.7635544419288636,
"num_tokens": 494552.0,
"step": 980
},
{
"epoch": 1.0,
"grad_norm": 7.21875,
"learning_rate": 7.710618529443992e-11,
"loss": 0.9549,
"mean_token_accuracy": 0.7463011115789413,
"num_tokens": 504368.0,
"step": 1000
},
{
"epoch": 1.0,
"step": 1000,
"total_flos": 1.0995762888818688e+16,
"train_loss": 1.0670766773223876,
"train_runtime": 40993.3469,
"train_samples_per_second": 0.024,
"train_steps_per_second": 0.024
}
],
"logging_steps": 20,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0995762888818688e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}