jekunz commited on
Commit
3fe07f0
·
verified ·
1 Parent(s): 86f972a

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoint-1102/config.json +32 -0
  2. checkpoint-1102/generation_config.json +6 -0
  3. checkpoint-1102/model.safetensors +3 -0
  4. checkpoint-1102/optimizer.pt +3 -0
  5. checkpoint-1102/rng_state.pth +3 -0
  6. checkpoint-1102/scaler.pt +3 -0
  7. checkpoint-1102/scheduler.pt +3 -0
  8. checkpoint-1102/trainer_state.json +119 -0
  9. checkpoint-1102/training_args.bin +3 -0
  10. checkpoint-11020/config.json +32 -0
  11. checkpoint-11020/generation_config.json +6 -0
  12. checkpoint-11020/model.safetensors +3 -0
  13. checkpoint-11020/optimizer.pt +3 -0
  14. checkpoint-11020/rng_state.pth +3 -0
  15. checkpoint-11020/scaler.pt +3 -0
  16. checkpoint-11020/scheduler.pt +3 -0
  17. checkpoint-11020/trainer_state.json +884 -0
  18. checkpoint-11020/training_args.bin +3 -0
  19. checkpoint-2204/config.json +32 -0
  20. checkpoint-2204/generation_config.json +6 -0
  21. checkpoint-2204/model.safetensors +3 -0
  22. checkpoint-2204/optimizer.pt +3 -0
  23. checkpoint-2204/rng_state.pth +3 -0
  24. checkpoint-2204/scaler.pt +3 -0
  25. checkpoint-2204/scheduler.pt +3 -0
  26. checkpoint-2204/trainer_state.json +204 -0
  27. checkpoint-2204/training_args.bin +3 -0
  28. checkpoint-3306/config.json +32 -0
  29. checkpoint-3306/generation_config.json +6 -0
  30. checkpoint-3306/model.safetensors +3 -0
  31. checkpoint-3306/optimizer.pt +3 -0
  32. checkpoint-3306/rng_state.pth +3 -0
  33. checkpoint-3306/scaler.pt +3 -0
  34. checkpoint-3306/scheduler.pt +3 -0
  35. checkpoint-3306/trainer_state.json +289 -0
  36. checkpoint-3306/training_args.bin +3 -0
  37. checkpoint-4408/config.json +32 -0
  38. checkpoint-4408/generation_config.json +6 -0
  39. checkpoint-4408/model.safetensors +3 -0
  40. checkpoint-4408/optimizer.pt +3 -0
  41. checkpoint-4408/rng_state.pth +3 -0
  42. checkpoint-4408/scaler.pt +3 -0
  43. checkpoint-4408/scheduler.pt +3 -0
  44. checkpoint-4408/trainer_state.json +374 -0
  45. checkpoint-4408/training_args.bin +3 -0
  46. checkpoint-5510/config.json +32 -0
  47. checkpoint-5510/generation_config.json +6 -0
  48. checkpoint-5510/model.safetensors +3 -0
  49. checkpoint-5510/optimizer.pt +3 -0
  50. checkpoint-5510/rng_state.pth +3 -0
checkpoint-1102/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 512,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 512,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.1,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.55.1",
30
+ "use_cache": true,
31
+ "vocab_size": 50000
32
+ }
checkpoint-1102/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.55.1"
6
+ }
checkpoint-1102/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4664e940e6525e7580eed0b411b0219793a2d7f1414b87ab4faf26ac07ccdb76
3
+ size 495411840
checkpoint-1102/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fa467dc354c344596db4b11c98be257e84aca9d180b09a5a17ec127d0d59d60
3
+ size 990920075
checkpoint-1102/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:980a1b68c7dbf538e6ffcfed871f2ea00501e6597777d59ebeacf240c8e404df
3
+ size 14645
checkpoint-1102/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d890ab2507c2df10293362b044d4f23e8ea33efa0473aacef3746451fff4942a
3
+ size 1383
checkpoint-1102/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cde6d27b68e1b876a90c26a3caabdaeffd87c0b46c4f3d8df07cd4891b14659
3
+ size 1465
checkpoint-1102/trainer_state.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1102,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09074410163339383,
14
+ "grad_norm": 1.4687509536743164,
15
+ "learning_rate": 8.98366606170599e-06,
16
+ "loss": 6.9172,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.18148820326678766,
21
+ "grad_norm": 1.268220067024231,
22
+ "learning_rate": 1.8058076225045372e-05,
23
+ "loss": 5.2198,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.27223230490018147,
28
+ "grad_norm": 0.9486992955207825,
29
+ "learning_rate": 2.7132486388384752e-05,
30
+ "loss": 4.6092,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.3629764065335753,
35
+ "grad_norm": 0.8284947276115417,
36
+ "learning_rate": 3.620689655172414e-05,
37
+ "loss": 4.2682,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.4537205081669691,
42
+ "grad_norm": 0.6980849504470825,
43
+ "learning_rate": 4.528130671506352e-05,
44
+ "loss": 4.2027,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5444646098003629,
49
+ "grad_norm": 0.9210988879203796,
50
+ "learning_rate": 5.435571687840291e-05,
51
+ "loss": 3.9613,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.6352087114337568,
56
+ "grad_norm": 0.8076866269111633,
57
+ "learning_rate": 6.343012704174229e-05,
58
+ "loss": 3.9595,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.7259528130671506,
63
+ "grad_norm": 0.7056506276130676,
64
+ "learning_rate": 7.250453720508167e-05,
65
+ "loss": 3.9345,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.8166969147005445,
70
+ "grad_norm": 0.7730728387832642,
71
+ "learning_rate": 8.157894736842105e-05,
72
+ "loss": 3.8263,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.9074410163339383,
77
+ "grad_norm": 0.6958891749382019,
78
+ "learning_rate": 9.065335753176044e-05,
79
+ "loss": 3.7584,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.9981851179673321,
84
+ "grad_norm": 0.7870356440544128,
85
+ "learning_rate": 9.972776769509982e-05,
86
+ "loss": 3.6662,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 1.0,
91
+ "eval_loss": 3.6291747093200684,
92
+ "eval_runtime": 9.9369,
93
+ "eval_samples_per_second": 394.189,
94
+ "eval_steps_per_second": 49.311,
95
+ "step": 1102
96
+ }
97
+ ],
98
+ "logging_steps": 100,
99
+ "max_steps": 11020,
100
+ "num_input_tokens_seen": 0,
101
+ "num_train_epochs": 10,
102
+ "save_steps": 500,
103
+ "stateful_callbacks": {
104
+ "TrainerControl": {
105
+ "args": {
106
+ "should_epoch_stop": false,
107
+ "should_evaluate": false,
108
+ "should_log": false,
109
+ "should_save": true,
110
+ "should_training_stop": false
111
+ },
112
+ "attributes": {}
113
+ }
114
+ },
115
+ "total_flos": 9210544128000000.0,
116
+ "train_batch_size": 32,
117
+ "trial_name": null,
118
+ "trial_params": null
119
+ }
checkpoint-1102/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
3
+ size 5777
checkpoint-11020/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 512,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 512,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.1,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.55.1",
30
+ "use_cache": true,
31
+ "vocab_size": 50000
32
+ }
checkpoint-11020/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.55.1"
6
+ }
checkpoint-11020/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8aa07a97c8b8148a133997c04520900b431feb8b1dbbd0c20c276ba90f3677b
3
+ size 495411840
checkpoint-11020/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9303d39d90ef877f0b01fefc47f5a1755dd7d8463b6d5549f7ec0f98c4ecd8f3
3
+ size 990920075
checkpoint-11020/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:029bc7ecc190ebd08eb7d9876b7268f48999ec5e606161c21b89560f5722aeab
3
+ size 14645
checkpoint-11020/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2915d6e197efe6d8d0c73cfb2af961ab3f3d7743c08ba257267996ceef72893b
3
+ size 1383
checkpoint-11020/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb235176fde760bf3096416d5242276faf48d1d454591d8c36bd44dc6e96ae8
3
+ size 1465
checkpoint-11020/trainer_state.json ADDED
@@ -0,0 +1,884 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 11020,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09074410163339383,
14
+ "grad_norm": 1.4687509536743164,
15
+ "learning_rate": 8.98366606170599e-06,
16
+ "loss": 6.9172,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.18148820326678766,
21
+ "grad_norm": 1.268220067024231,
22
+ "learning_rate": 1.8058076225045372e-05,
23
+ "loss": 5.2198,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.27223230490018147,
28
+ "grad_norm": 0.9486992955207825,
29
+ "learning_rate": 2.7132486388384752e-05,
30
+ "loss": 4.6092,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.3629764065335753,
35
+ "grad_norm": 0.8284947276115417,
36
+ "learning_rate": 3.620689655172414e-05,
37
+ "loss": 4.2682,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.4537205081669691,
42
+ "grad_norm": 0.6980849504470825,
43
+ "learning_rate": 4.528130671506352e-05,
44
+ "loss": 4.2027,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5444646098003629,
49
+ "grad_norm": 0.9210988879203796,
50
+ "learning_rate": 5.435571687840291e-05,
51
+ "loss": 3.9613,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.6352087114337568,
56
+ "grad_norm": 0.8076866269111633,
57
+ "learning_rate": 6.343012704174229e-05,
58
+ "loss": 3.9595,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.7259528130671506,
63
+ "grad_norm": 0.7056506276130676,
64
+ "learning_rate": 7.250453720508167e-05,
65
+ "loss": 3.9345,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.8166969147005445,
70
+ "grad_norm": 0.7730728387832642,
71
+ "learning_rate": 8.157894736842105e-05,
72
+ "loss": 3.8263,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.9074410163339383,
77
+ "grad_norm": 0.6958891749382019,
78
+ "learning_rate": 9.065335753176044e-05,
79
+ "loss": 3.7584,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.9981851179673321,
84
+ "grad_norm": 0.7870356440544128,
85
+ "learning_rate": 9.972776769509982e-05,
86
+ "loss": 3.6662,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 1.0,
91
+ "eval_loss": 3.6291747093200684,
92
+ "eval_runtime": 9.9369,
93
+ "eval_samples_per_second": 394.189,
94
+ "eval_steps_per_second": 49.311,
95
+ "step": 1102
96
+ },
97
+ {
98
+ "epoch": 1.0889292196007259,
99
+ "grad_norm": 0.6388369798660278,
100
+ "learning_rate": 9.90219802379512e-05,
101
+ "loss": 3.5837,
102
+ "step": 1200
103
+ },
104
+ {
105
+ "epoch": 1.1796733212341197,
106
+ "grad_norm": 0.771931529045105,
107
+ "learning_rate": 9.80137124420246e-05,
108
+ "loss": 3.5696,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 1.2704174228675136,
113
+ "grad_norm": 0.8356881141662598,
114
+ "learning_rate": 9.700544464609801e-05,
115
+ "loss": 3.4928,
116
+ "step": 1400
117
+ },
118
+ {
119
+ "epoch": 1.3611615245009074,
120
+ "grad_norm": 0.7187588214874268,
121
+ "learning_rate": 9.599717685017141e-05,
122
+ "loss": 3.5173,
123
+ "step": 1500
124
+ },
125
+ {
126
+ "epoch": 1.4519056261343013,
127
+ "grad_norm": 0.76460862159729,
128
+ "learning_rate": 9.498890905424481e-05,
129
+ "loss": 3.4,
130
+ "step": 1600
131
+ },
132
+ {
133
+ "epoch": 1.542649727767695,
134
+ "grad_norm": 0.7365128993988037,
135
+ "learning_rate": 9.398064125831822e-05,
136
+ "loss": 3.3335,
137
+ "step": 1700
138
+ },
139
+ {
140
+ "epoch": 1.633393829401089,
141
+ "grad_norm": 0.6376582980155945,
142
+ "learning_rate": 9.297237346239162e-05,
143
+ "loss": 3.2894,
144
+ "step": 1800
145
+ },
146
+ {
147
+ "epoch": 1.7241379310344827,
148
+ "grad_norm": 0.7320665121078491,
149
+ "learning_rate": 9.196410566646501e-05,
150
+ "loss": 3.2772,
151
+ "step": 1900
152
+ },
153
+ {
154
+ "epoch": 1.8148820326678767,
155
+ "grad_norm": 1.4237512350082397,
156
+ "learning_rate": 9.095583787053841e-05,
157
+ "loss": 3.2934,
158
+ "step": 2000
159
+ },
160
+ {
161
+ "epoch": 1.9056261343012704,
162
+ "grad_norm": 0.7177993655204773,
163
+ "learning_rate": 8.994757007461182e-05,
164
+ "loss": 3.2374,
165
+ "step": 2100
166
+ },
167
+ {
168
+ "epoch": 1.9963702359346642,
169
+ "grad_norm": 0.7358414530754089,
170
+ "learning_rate": 8.893930227868522e-05,
171
+ "loss": 3.2032,
172
+ "step": 2200
173
+ },
174
+ {
175
+ "epoch": 2.0,
176
+ "eval_loss": 3.1682567596435547,
177
+ "eval_runtime": 9.9556,
178
+ "eval_samples_per_second": 393.448,
179
+ "eval_steps_per_second": 49.219,
180
+ "step": 2204
181
+ },
182
+ {
183
+ "epoch": 2.087114337568058,
184
+ "grad_norm": 0.6569345593452454,
185
+ "learning_rate": 8.793103448275862e-05,
186
+ "loss": 3.1474,
187
+ "step": 2300
188
+ },
189
+ {
190
+ "epoch": 2.1778584392014517,
191
+ "grad_norm": 0.719578206539154,
192
+ "learning_rate": 8.692276668683203e-05,
193
+ "loss": 3.0573,
194
+ "step": 2400
195
+ },
196
+ {
197
+ "epoch": 2.268602540834846,
198
+ "grad_norm": 0.7317540645599365,
199
+ "learning_rate": 8.591449889090543e-05,
200
+ "loss": 3.1363,
201
+ "step": 2500
202
+ },
203
+ {
204
+ "epoch": 2.3593466424682394,
205
+ "grad_norm": 0.7985743880271912,
206
+ "learning_rate": 8.490623109497882e-05,
207
+ "loss": 3.0611,
208
+ "step": 2600
209
+ },
210
+ {
211
+ "epoch": 2.4500907441016335,
212
+ "grad_norm": 0.8037993907928467,
213
+ "learning_rate": 8.389796329905223e-05,
214
+ "loss": 3.0735,
215
+ "step": 2700
216
+ },
217
+ {
218
+ "epoch": 2.540834845735027,
219
+ "grad_norm": 0.7689797878265381,
220
+ "learning_rate": 8.288969550312563e-05,
221
+ "loss": 3.0639,
222
+ "step": 2800
223
+ },
224
+ {
225
+ "epoch": 2.6315789473684212,
226
+ "grad_norm": 0.7713281512260437,
227
+ "learning_rate": 8.188142770719903e-05,
228
+ "loss": 3.0165,
229
+ "step": 2900
230
+ },
231
+ {
232
+ "epoch": 2.722323049001815,
233
+ "grad_norm": 0.7961378693580627,
234
+ "learning_rate": 8.087315991127244e-05,
235
+ "loss": 2.9901,
236
+ "step": 3000
237
+ },
238
+ {
239
+ "epoch": 2.8130671506352085,
240
+ "grad_norm": 0.7088135480880737,
241
+ "learning_rate": 7.986489211534584e-05,
242
+ "loss": 3.0241,
243
+ "step": 3100
244
+ },
245
+ {
246
+ "epoch": 2.9038112522686026,
247
+ "grad_norm": 0.7496780157089233,
248
+ "learning_rate": 7.885662431941923e-05,
249
+ "loss": 2.9939,
250
+ "step": 3200
251
+ },
252
+ {
253
+ "epoch": 2.9945553539019962,
254
+ "grad_norm": 0.7221343517303467,
255
+ "learning_rate": 7.784835652349264e-05,
256
+ "loss": 2.9926,
257
+ "step": 3300
258
+ },
259
+ {
260
+ "epoch": 3.0,
261
+ "eval_loss": 2.9603192806243896,
262
+ "eval_runtime": 10.1067,
263
+ "eval_samples_per_second": 387.563,
264
+ "eval_steps_per_second": 48.483,
265
+ "step": 3306
266
+ },
267
+ {
268
+ "epoch": 3.0852994555353903,
269
+ "grad_norm": 0.8570982217788696,
270
+ "learning_rate": 7.684008872756604e-05,
271
+ "loss": 2.9251,
272
+ "step": 3400
273
+ },
274
+ {
275
+ "epoch": 3.176043557168784,
276
+ "grad_norm": 0.7704641222953796,
277
+ "learning_rate": 7.583182093163945e-05,
278
+ "loss": 2.8877,
279
+ "step": 3500
280
+ },
281
+ {
282
+ "epoch": 3.266787658802178,
283
+ "grad_norm": 0.6863057017326355,
284
+ "learning_rate": 7.482355313571285e-05,
285
+ "loss": 2.8743,
286
+ "step": 3600
287
+ },
288
+ {
289
+ "epoch": 3.3575317604355717,
290
+ "grad_norm": 0.7227942943572998,
291
+ "learning_rate": 7.381528533978626e-05,
292
+ "loss": 2.823,
293
+ "step": 3700
294
+ },
295
+ {
296
+ "epoch": 3.4482758620689653,
297
+ "grad_norm": 0.7746195197105408,
298
+ "learning_rate": 7.280701754385966e-05,
299
+ "loss": 2.8962,
300
+ "step": 3800
301
+ },
302
+ {
303
+ "epoch": 3.5390199637023594,
304
+ "grad_norm": 0.7885217070579529,
305
+ "learning_rate": 7.179874974793306e-05,
306
+ "loss": 2.8419,
307
+ "step": 3900
308
+ },
309
+ {
310
+ "epoch": 3.629764065335753,
311
+ "grad_norm": 0.7623139023780823,
312
+ "learning_rate": 7.079048195200647e-05,
313
+ "loss": 2.8333,
314
+ "step": 4000
315
+ },
316
+ {
317
+ "epoch": 3.720508166969147,
318
+ "grad_norm": 0.8208196759223938,
319
+ "learning_rate": 6.978221415607986e-05,
320
+ "loss": 2.8302,
321
+ "step": 4100
322
+ },
323
+ {
324
+ "epoch": 3.8112522686025407,
325
+ "grad_norm": 0.7712786197662354,
326
+ "learning_rate": 6.877394636015326e-05,
327
+ "loss": 2.8674,
328
+ "step": 4200
329
+ },
330
+ {
331
+ "epoch": 3.901996370235935,
332
+ "grad_norm": 0.8100000023841858,
333
+ "learning_rate": 6.776567856422666e-05,
334
+ "loss": 2.8097,
335
+ "step": 4300
336
+ },
337
+ {
338
+ "epoch": 3.9927404718693285,
339
+ "grad_norm": 0.8472097516059875,
340
+ "learning_rate": 6.675741076830007e-05,
341
+ "loss": 2.8418,
342
+ "step": 4400
343
+ },
344
+ {
345
+ "epoch": 4.0,
346
+ "eval_loss": 2.8345706462860107,
347
+ "eval_runtime": 9.9462,
348
+ "eval_samples_per_second": 393.819,
349
+ "eval_steps_per_second": 49.265,
350
+ "step": 4408
351
+ },
352
+ {
353
+ "epoch": 4.083484573502722,
354
+ "grad_norm": 0.8192525506019592,
355
+ "learning_rate": 6.574914297237347e-05,
356
+ "loss": 2.7376,
357
+ "step": 4500
358
+ },
359
+ {
360
+ "epoch": 4.174228675136116,
361
+ "grad_norm": 0.7768793106079102,
362
+ "learning_rate": 6.474087517644686e-05,
363
+ "loss": 2.7351,
364
+ "step": 4600
365
+ },
366
+ {
367
+ "epoch": 4.26497277676951,
368
+ "grad_norm": 0.8244798183441162,
369
+ "learning_rate": 6.373260738052027e-05,
370
+ "loss": 2.6991,
371
+ "step": 4700
372
+ },
373
+ {
374
+ "epoch": 4.3557168784029034,
375
+ "grad_norm": 0.8790801167488098,
376
+ "learning_rate": 6.272433958459367e-05,
377
+ "loss": 2.7034,
378
+ "step": 4800
379
+ },
380
+ {
381
+ "epoch": 4.4464609800362975,
382
+ "grad_norm": 0.7422960996627808,
383
+ "learning_rate": 6.171607178866707e-05,
384
+ "loss": 2.7208,
385
+ "step": 4900
386
+ },
387
+ {
388
+ "epoch": 4.537205081669692,
389
+ "grad_norm": 0.8407799601554871,
390
+ "learning_rate": 6.070780399274047e-05,
391
+ "loss": 2.7036,
392
+ "step": 5000
393
+ },
394
+ {
395
+ "epoch": 4.627949183303086,
396
+ "grad_norm": 0.8926665186882019,
397
+ "learning_rate": 5.969953619681388e-05,
398
+ "loss": 2.7198,
399
+ "step": 5100
400
+ },
401
+ {
402
+ "epoch": 4.718693284936479,
403
+ "grad_norm": 0.7808049917221069,
404
+ "learning_rate": 5.869126840088728e-05,
405
+ "loss": 2.706,
406
+ "step": 5200
407
+ },
408
+ {
409
+ "epoch": 4.809437386569873,
410
+ "grad_norm": 0.8481338024139404,
411
+ "learning_rate": 5.768300060496068e-05,
412
+ "loss": 2.6995,
413
+ "step": 5300
414
+ },
415
+ {
416
+ "epoch": 4.900181488203267,
417
+ "grad_norm": 0.8285768628120422,
418
+ "learning_rate": 5.667473280903408e-05,
419
+ "loss": 2.698,
420
+ "step": 5400
421
+ },
422
+ {
423
+ "epoch": 4.99092558983666,
424
+ "grad_norm": 0.8166932463645935,
425
+ "learning_rate": 5.566646501310748e-05,
426
+ "loss": 2.6797,
427
+ "step": 5500
428
+ },
429
+ {
430
+ "epoch": 5.0,
431
+ "eval_loss": 2.74469256401062,
432
+ "eval_runtime": 9.9766,
433
+ "eval_samples_per_second": 392.619,
434
+ "eval_steps_per_second": 49.115,
435
+ "step": 5510
436
+ },
437
+ {
438
+ "epoch": 5.081669691470054,
439
+ "grad_norm": 0.8111730217933655,
440
+ "learning_rate": 5.4658197217180884e-05,
441
+ "loss": 2.5822,
442
+ "step": 5600
443
+ },
444
+ {
445
+ "epoch": 5.172413793103448,
446
+ "grad_norm": 0.7661839723587036,
447
+ "learning_rate": 5.364992942125429e-05,
448
+ "loss": 2.6167,
449
+ "step": 5700
450
+ },
451
+ {
452
+ "epoch": 5.2631578947368425,
453
+ "grad_norm": 0.8927045464515686,
454
+ "learning_rate": 5.2641661625327685e-05,
455
+ "loss": 2.6113,
456
+ "step": 5800
457
+ },
458
+ {
459
+ "epoch": 5.353901996370236,
460
+ "grad_norm": 0.9407956600189209,
461
+ "learning_rate": 5.163339382940109e-05,
462
+ "loss": 2.5686,
463
+ "step": 5900
464
+ },
465
+ {
466
+ "epoch": 5.44464609800363,
467
+ "grad_norm": 0.8638333082199097,
468
+ "learning_rate": 5.062512603347449e-05,
469
+ "loss": 2.6222,
470
+ "step": 6000
471
+ },
472
+ {
473
+ "epoch": 5.535390199637024,
474
+ "grad_norm": 0.8607461452484131,
475
+ "learning_rate": 4.96168582375479e-05,
476
+ "loss": 2.5987,
477
+ "step": 6100
478
+ },
479
+ {
480
+ "epoch": 5.626134301270417,
481
+ "grad_norm": 0.7911032438278198,
482
+ "learning_rate": 4.86085904416213e-05,
483
+ "loss": 2.5868,
484
+ "step": 6200
485
+ },
486
+ {
487
+ "epoch": 5.716878402903811,
488
+ "grad_norm": 0.8415167927742004,
489
+ "learning_rate": 4.76003226456947e-05,
490
+ "loss": 2.5615,
491
+ "step": 6300
492
+ },
493
+ {
494
+ "epoch": 5.807622504537205,
495
+ "grad_norm": 0.8212953805923462,
496
+ "learning_rate": 4.65920548497681e-05,
497
+ "loss": 2.5962,
498
+ "step": 6400
499
+ },
500
+ {
501
+ "epoch": 5.898366606170599,
502
+ "grad_norm": 0.8392799496650696,
503
+ "learning_rate": 4.55837870538415e-05,
504
+ "loss": 2.6055,
505
+ "step": 6500
506
+ },
507
+ {
508
+ "epoch": 5.9891107078039925,
509
+ "grad_norm": 0.8681470155715942,
510
+ "learning_rate": 4.4575519257914904e-05,
511
+ "loss": 2.5534,
512
+ "step": 6600
513
+ },
514
+ {
515
+ "epoch": 6.0,
516
+ "eval_loss": 2.6848697662353516,
517
+ "eval_runtime": 9.9629,
518
+ "eval_samples_per_second": 393.16,
519
+ "eval_steps_per_second": 49.183,
520
+ "step": 6612
521
+ },
522
+ {
523
+ "epoch": 6.0798548094373865,
524
+ "grad_norm": 0.8970000147819519,
525
+ "learning_rate": 4.356725146198831e-05,
526
+ "loss": 2.5023,
527
+ "step": 6700
528
+ },
529
+ {
530
+ "epoch": 6.170598911070781,
531
+ "grad_norm": 0.9249663352966309,
532
+ "learning_rate": 4.2558983666061706e-05,
533
+ "loss": 2.514,
534
+ "step": 6800
535
+ },
536
+ {
537
+ "epoch": 6.261343012704174,
538
+ "grad_norm": 0.812436044216156,
539
+ "learning_rate": 4.155071587013511e-05,
540
+ "loss": 2.482,
541
+ "step": 6900
542
+ },
543
+ {
544
+ "epoch": 6.352087114337568,
545
+ "grad_norm": 0.9799634218215942,
546
+ "learning_rate": 4.0542448074208514e-05,
547
+ "loss": 2.4941,
548
+ "step": 7000
549
+ },
550
+ {
551
+ "epoch": 6.442831215970962,
552
+ "grad_norm": 0.9098866581916809,
553
+ "learning_rate": 3.953418027828191e-05,
554
+ "loss": 2.4911,
555
+ "step": 7100
556
+ },
557
+ {
558
+ "epoch": 6.533575317604356,
559
+ "grad_norm": 0.9876662492752075,
560
+ "learning_rate": 3.8525912482355315e-05,
561
+ "loss": 2.4846,
562
+ "step": 7200
563
+ },
564
+ {
565
+ "epoch": 6.624319419237749,
566
+ "grad_norm": 0.8980563282966614,
567
+ "learning_rate": 3.751764468642871e-05,
568
+ "loss": 2.5171,
569
+ "step": 7300
570
+ },
571
+ {
572
+ "epoch": 6.715063520871143,
573
+ "grad_norm": 0.893933117389679,
574
+ "learning_rate": 3.6509376890502117e-05,
575
+ "loss": 2.5136,
576
+ "step": 7400
577
+ },
578
+ {
579
+ "epoch": 6.805807622504537,
580
+ "grad_norm": 0.9221234321594238,
581
+ "learning_rate": 3.550110909457552e-05,
582
+ "loss": 2.4839,
583
+ "step": 7500
584
+ },
585
+ {
586
+ "epoch": 6.896551724137931,
587
+ "grad_norm": 1.0190584659576416,
588
+ "learning_rate": 3.449284129864892e-05,
589
+ "loss": 2.4976,
590
+ "step": 7600
591
+ },
592
+ {
593
+ "epoch": 6.987295825771325,
594
+ "grad_norm": 0.8790056109428406,
595
+ "learning_rate": 3.348457350272233e-05,
596
+ "loss": 2.4994,
597
+ "step": 7700
598
+ },
599
+ {
600
+ "epoch": 7.0,
601
+ "eval_loss": 2.64707612991333,
602
+ "eval_runtime": 9.9673,
603
+ "eval_samples_per_second": 392.987,
604
+ "eval_steps_per_second": 49.161,
605
+ "step": 7714
606
+ },
607
+ {
608
+ "epoch": 7.078039927404719,
609
+ "grad_norm": 1.0930935144424438,
610
+ "learning_rate": 3.2476305706795726e-05,
611
+ "loss": 2.4629,
612
+ "step": 7800
613
+ },
614
+ {
615
+ "epoch": 7.168784029038113,
616
+ "grad_norm": 1.0013751983642578,
617
+ "learning_rate": 3.146803791086913e-05,
618
+ "loss": 2.4113,
619
+ "step": 7900
620
+ },
621
+ {
622
+ "epoch": 7.259528130671506,
623
+ "grad_norm": 0.9267390370368958,
624
+ "learning_rate": 3.045977011494253e-05,
625
+ "loss": 2.4304,
626
+ "step": 8000
627
+ },
628
+ {
629
+ "epoch": 7.3502722323049,
630
+ "grad_norm": 1.0082099437713623,
631
+ "learning_rate": 2.9451502319015935e-05,
632
+ "loss": 2.3784,
633
+ "step": 8100
634
+ },
635
+ {
636
+ "epoch": 7.441016333938294,
637
+ "grad_norm": 1.0260313749313354,
638
+ "learning_rate": 2.8443234523089336e-05,
639
+ "loss": 2.3913,
640
+ "step": 8200
641
+ },
642
+ {
643
+ "epoch": 7.531760435571687,
644
+ "grad_norm": 0.900663435459137,
645
+ "learning_rate": 2.7434966727162736e-05,
646
+ "loss": 2.4293,
647
+ "step": 8300
648
+ },
649
+ {
650
+ "epoch": 7.6225045372050815,
651
+ "grad_norm": 0.9622400999069214,
652
+ "learning_rate": 2.6426698931236137e-05,
653
+ "loss": 2.4511,
654
+ "step": 8400
655
+ },
656
+ {
657
+ "epoch": 7.713248638838476,
658
+ "grad_norm": 0.9455797076225281,
659
+ "learning_rate": 2.541843113530954e-05,
660
+ "loss": 2.4199,
661
+ "step": 8500
662
+ },
663
+ {
664
+ "epoch": 7.80399274047187,
665
+ "grad_norm": 1.0435410737991333,
666
+ "learning_rate": 2.441016333938294e-05,
667
+ "loss": 2.4473,
668
+ "step": 8600
669
+ },
670
+ {
671
+ "epoch": 7.894736842105263,
672
+ "grad_norm": 0.9524980187416077,
673
+ "learning_rate": 2.3401895543456342e-05,
674
+ "loss": 2.4318,
675
+ "step": 8700
676
+ },
677
+ {
678
+ "epoch": 7.985480943738657,
679
+ "grad_norm": 1.0212959051132202,
680
+ "learning_rate": 2.2393627747529743e-05,
681
+ "loss": 2.4208,
682
+ "step": 8800
683
+ },
684
+ {
685
+ "epoch": 8.0,
686
+ "eval_loss": 2.624673843383789,
687
+ "eval_runtime": 9.9655,
688
+ "eval_samples_per_second": 393.055,
689
+ "eval_steps_per_second": 49.169,
690
+ "step": 8816
691
+ },
692
+ {
693
+ "epoch": 8.07622504537205,
694
+ "grad_norm": 1.0319713354110718,
695
+ "learning_rate": 2.1385359951603147e-05,
696
+ "loss": 2.4007,
697
+ "step": 8900
698
+ },
699
+ {
700
+ "epoch": 8.166969147005444,
701
+ "grad_norm": 0.9200676679611206,
702
+ "learning_rate": 2.037709215567655e-05,
703
+ "loss": 2.3846,
704
+ "step": 9000
705
+ },
706
+ {
707
+ "epoch": 8.257713248638838,
708
+ "grad_norm": 0.9439195394515991,
709
+ "learning_rate": 1.9368824359749952e-05,
710
+ "loss": 2.3628,
711
+ "step": 9100
712
+ },
713
+ {
714
+ "epoch": 8.348457350272232,
715
+ "grad_norm": 0.9906073808670044,
716
+ "learning_rate": 1.8360556563823353e-05,
717
+ "loss": 2.3659,
718
+ "step": 9200
719
+ },
720
+ {
721
+ "epoch": 8.439201451905626,
722
+ "grad_norm": 0.9741255640983582,
723
+ "learning_rate": 1.7352288767896753e-05,
724
+ "loss": 2.3134,
725
+ "step": 9300
726
+ },
727
+ {
728
+ "epoch": 8.52994555353902,
729
+ "grad_norm": 0.9208199977874756,
730
+ "learning_rate": 1.6344020971970157e-05,
731
+ "loss": 2.3716,
732
+ "step": 9400
733
+ },
734
+ {
735
+ "epoch": 8.620689655172415,
736
+ "grad_norm": 1.0154922008514404,
737
+ "learning_rate": 1.5335753176043558e-05,
738
+ "loss": 2.36,
739
+ "step": 9500
740
+ },
741
+ {
742
+ "epoch": 8.711433756805807,
743
+ "grad_norm": 1.1290541887283325,
744
+ "learning_rate": 1.4327485380116959e-05,
745
+ "loss": 2.384,
746
+ "step": 9600
747
+ },
748
+ {
749
+ "epoch": 8.802177858439201,
750
+ "grad_norm": 0.9401739835739136,
751
+ "learning_rate": 1.3319217584190361e-05,
752
+ "loss": 2.3736,
753
+ "step": 9700
754
+ },
755
+ {
756
+ "epoch": 8.892921960072595,
757
+ "grad_norm": 1.0175809860229492,
758
+ "learning_rate": 1.2310949788263763e-05,
759
+ "loss": 2.3405,
760
+ "step": 9800
761
+ },
762
+ {
763
+ "epoch": 8.98366606170599,
764
+ "grad_norm": 1.0658190250396729,
765
+ "learning_rate": 1.1302681992337164e-05,
766
+ "loss": 2.3499,
767
+ "step": 9900
768
+ },
769
+ {
770
+ "epoch": 9.0,
771
+ "eval_loss": 2.6118369102478027,
772
+ "eval_runtime": 9.9649,
773
+ "eval_samples_per_second": 393.08,
774
+ "eval_steps_per_second": 49.173,
775
+ "step": 9918
776
+ },
777
+ {
778
+ "epoch": 9.074410163339383,
779
+ "grad_norm": 1.1189725399017334,
780
+ "learning_rate": 1.0294414196410568e-05,
781
+ "loss": 2.3341,
782
+ "step": 10000
783
+ },
784
+ {
785
+ "epoch": 9.165154264972777,
786
+ "grad_norm": 0.9556492567062378,
787
+ "learning_rate": 9.286146400483969e-06,
788
+ "loss": 2.3361,
789
+ "step": 10100
790
+ },
791
+ {
792
+ "epoch": 9.255898366606171,
793
+ "grad_norm": 1.1089420318603516,
794
+ "learning_rate": 8.277878604557371e-06,
795
+ "loss": 2.3237,
796
+ "step": 10200
797
+ },
798
+ {
799
+ "epoch": 9.346642468239564,
800
+ "grad_norm": 1.0827833414077759,
801
+ "learning_rate": 7.269610808630773e-06,
802
+ "loss": 2.3379,
803
+ "step": 10300
804
+ },
805
+ {
806
+ "epoch": 9.437386569872958,
807
+ "grad_norm": 1.1406363248825073,
808
+ "learning_rate": 6.2613430127041735e-06,
809
+ "loss": 2.3169,
810
+ "step": 10400
811
+ },
812
+ {
813
+ "epoch": 9.528130671506352,
814
+ "grad_norm": 1.1382629871368408,
815
+ "learning_rate": 5.253075216777576e-06,
816
+ "loss": 2.2953,
817
+ "step": 10500
818
+ },
819
+ {
820
+ "epoch": 9.618874773139746,
821
+ "grad_norm": 1.0905636548995972,
822
+ "learning_rate": 4.244807420850978e-06,
823
+ "loss": 2.3415,
824
+ "step": 10600
825
+ },
826
+ {
827
+ "epoch": 9.70961887477314,
828
+ "grad_norm": 1.0931600332260132,
829
+ "learning_rate": 3.2365396249243798e-06,
830
+ "loss": 2.3542,
831
+ "step": 10700
832
+ },
833
+ {
834
+ "epoch": 9.800362976406534,
835
+ "grad_norm": 1.0854520797729492,
836
+ "learning_rate": 2.228271828997782e-06,
837
+ "loss": 2.3083,
838
+ "step": 10800
839
+ },
840
+ {
841
+ "epoch": 9.891107078039928,
842
+ "grad_norm": 1.082558035850525,
843
+ "learning_rate": 1.2200040330711837e-06,
844
+ "loss": 2.2774,
845
+ "step": 10900
846
+ },
847
+ {
848
+ "epoch": 9.98185117967332,
849
+ "grad_norm": 0.9970278739929199,
850
+ "learning_rate": 2.117362371445856e-07,
851
+ "loss": 2.3158,
852
+ "step": 11000
853
+ },
854
+ {
855
+ "epoch": 10.0,
856
+ "eval_loss": 2.6086275577545166,
857
+ "eval_runtime": 9.9766,
858
+ "eval_samples_per_second": 392.617,
859
+ "eval_steps_per_second": 49.115,
860
+ "step": 11020
861
+ }
862
+ ],
863
+ "logging_steps": 100,
864
+ "max_steps": 11020,
865
+ "num_input_tokens_seen": 0,
866
+ "num_train_epochs": 10,
867
+ "save_steps": 500,
868
+ "stateful_callbacks": {
869
+ "TrainerControl": {
870
+ "args": {
871
+ "should_epoch_stop": false,
872
+ "should_evaluate": false,
873
+ "should_log": false,
874
+ "should_save": true,
875
+ "should_training_stop": true
876
+ },
877
+ "attributes": {}
878
+ }
879
+ },
880
+ "total_flos": 9.210544128e+16,
881
+ "train_batch_size": 32,
882
+ "trial_name": null,
883
+ "trial_params": null
884
+ }
checkpoint-11020/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
3
+ size 5777
checkpoint-2204/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 512,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 512,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.1,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.55.1",
30
+ "use_cache": true,
31
+ "vocab_size": 50000
32
+ }
checkpoint-2204/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.55.1"
6
+ }
checkpoint-2204/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:371cee5c4fda9aabfc97636c74270305a0270f106c1413026093e11a09467bd6
3
+ size 495411840
checkpoint-2204/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af6573b11ed70c3954ca16c768614c905907654a161de392a33f3ce861d5e561
3
+ size 990920075
checkpoint-2204/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e974088c31a741bdc9ef02bc6a3b26e16da31ea187c7c63f0f2dac77054eb596
3
+ size 14645
checkpoint-2204/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efa92bbd79cb151c29c6f4763d58957bd29b39038ad1cafda589b6487915d47e
3
+ size 1383
checkpoint-2204/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fad23755ed5a6a8765f66010c5b5d6c60e734321ac41a7e825717fb75460d3d8
3
+ size 1465
checkpoint-2204/trainer_state.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2204,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09074410163339383,
14
+ "grad_norm": 1.4687509536743164,
15
+ "learning_rate": 8.98366606170599e-06,
16
+ "loss": 6.9172,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.18148820326678766,
21
+ "grad_norm": 1.268220067024231,
22
+ "learning_rate": 1.8058076225045372e-05,
23
+ "loss": 5.2198,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.27223230490018147,
28
+ "grad_norm": 0.9486992955207825,
29
+ "learning_rate": 2.7132486388384752e-05,
30
+ "loss": 4.6092,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.3629764065335753,
35
+ "grad_norm": 0.8284947276115417,
36
+ "learning_rate": 3.620689655172414e-05,
37
+ "loss": 4.2682,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.4537205081669691,
42
+ "grad_norm": 0.6980849504470825,
43
+ "learning_rate": 4.528130671506352e-05,
44
+ "loss": 4.2027,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5444646098003629,
49
+ "grad_norm": 0.9210988879203796,
50
+ "learning_rate": 5.435571687840291e-05,
51
+ "loss": 3.9613,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.6352087114337568,
56
+ "grad_norm": 0.8076866269111633,
57
+ "learning_rate": 6.343012704174229e-05,
58
+ "loss": 3.9595,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.7259528130671506,
63
+ "grad_norm": 0.7056506276130676,
64
+ "learning_rate": 7.250453720508167e-05,
65
+ "loss": 3.9345,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.8166969147005445,
70
+ "grad_norm": 0.7730728387832642,
71
+ "learning_rate": 8.157894736842105e-05,
72
+ "loss": 3.8263,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.9074410163339383,
77
+ "grad_norm": 0.6958891749382019,
78
+ "learning_rate": 9.065335753176044e-05,
79
+ "loss": 3.7584,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.9981851179673321,
84
+ "grad_norm": 0.7870356440544128,
85
+ "learning_rate": 9.972776769509982e-05,
86
+ "loss": 3.6662,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 1.0,
91
+ "eval_loss": 3.6291747093200684,
92
+ "eval_runtime": 9.9369,
93
+ "eval_samples_per_second": 394.189,
94
+ "eval_steps_per_second": 49.311,
95
+ "step": 1102
96
+ },
97
+ {
98
+ "epoch": 1.0889292196007259,
99
+ "grad_norm": 0.6388369798660278,
100
+ "learning_rate": 9.90219802379512e-05,
101
+ "loss": 3.5837,
102
+ "step": 1200
103
+ },
104
+ {
105
+ "epoch": 1.1796733212341197,
106
+ "grad_norm": 0.771931529045105,
107
+ "learning_rate": 9.80137124420246e-05,
108
+ "loss": 3.5696,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 1.2704174228675136,
113
+ "grad_norm": 0.8356881141662598,
114
+ "learning_rate": 9.700544464609801e-05,
115
+ "loss": 3.4928,
116
+ "step": 1400
117
+ },
118
+ {
119
+ "epoch": 1.3611615245009074,
120
+ "grad_norm": 0.7187588214874268,
121
+ "learning_rate": 9.599717685017141e-05,
122
+ "loss": 3.5173,
123
+ "step": 1500
124
+ },
125
+ {
126
+ "epoch": 1.4519056261343013,
127
+ "grad_norm": 0.76460862159729,
128
+ "learning_rate": 9.498890905424481e-05,
129
+ "loss": 3.4,
130
+ "step": 1600
131
+ },
132
+ {
133
+ "epoch": 1.542649727767695,
134
+ "grad_norm": 0.7365128993988037,
135
+ "learning_rate": 9.398064125831822e-05,
136
+ "loss": 3.3335,
137
+ "step": 1700
138
+ },
139
+ {
140
+ "epoch": 1.633393829401089,
141
+ "grad_norm": 0.6376582980155945,
142
+ "learning_rate": 9.297237346239162e-05,
143
+ "loss": 3.2894,
144
+ "step": 1800
145
+ },
146
+ {
147
+ "epoch": 1.7241379310344827,
148
+ "grad_norm": 0.7320665121078491,
149
+ "learning_rate": 9.196410566646501e-05,
150
+ "loss": 3.2772,
151
+ "step": 1900
152
+ },
153
+ {
154
+ "epoch": 1.8148820326678767,
155
+ "grad_norm": 1.4237512350082397,
156
+ "learning_rate": 9.095583787053841e-05,
157
+ "loss": 3.2934,
158
+ "step": 2000
159
+ },
160
+ {
161
+ "epoch": 1.9056261343012704,
162
+ "grad_norm": 0.7177993655204773,
163
+ "learning_rate": 8.994757007461182e-05,
164
+ "loss": 3.2374,
165
+ "step": 2100
166
+ },
167
+ {
168
+ "epoch": 1.9963702359346642,
169
+ "grad_norm": 0.7358414530754089,
170
+ "learning_rate": 8.893930227868522e-05,
171
+ "loss": 3.2032,
172
+ "step": 2200
173
+ },
174
+ {
175
+ "epoch": 2.0,
176
+ "eval_loss": 3.1682567596435547,
177
+ "eval_runtime": 9.9556,
178
+ "eval_samples_per_second": 393.448,
179
+ "eval_steps_per_second": 49.219,
180
+ "step": 2204
181
+ }
182
+ ],
183
+ "logging_steps": 100,
184
+ "max_steps": 11020,
185
+ "num_input_tokens_seen": 0,
186
+ "num_train_epochs": 10,
187
+ "save_steps": 500,
188
+ "stateful_callbacks": {
189
+ "TrainerControl": {
190
+ "args": {
191
+ "should_epoch_stop": false,
192
+ "should_evaluate": false,
193
+ "should_log": false,
194
+ "should_save": true,
195
+ "should_training_stop": false
196
+ },
197
+ "attributes": {}
198
+ }
199
+ },
200
+ "total_flos": 1.8421088256e+16,
201
+ "train_batch_size": 32,
202
+ "trial_name": null,
203
+ "trial_params": null
204
+ }
checkpoint-2204/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
3
+ size 5777
checkpoint-3306/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 512,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 512,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.1,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.55.1",
30
+ "use_cache": true,
31
+ "vocab_size": 50000
32
+ }
checkpoint-3306/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.55.1"
6
+ }
checkpoint-3306/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a08a2090b45354e5af5eb1cebf64dfe28315682bb8063642e0f2d8714b9061b
3
+ size 495411840
checkpoint-3306/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe483895c2f126675278e9e99504ce61a369a987775c0b58bd97a2c62b86a509
3
+ size 990920075
checkpoint-3306/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa7b51df299b32b1d15e80da6dccb211575cefa4973745a4a0aadfa9393436ab
3
+ size 14645
checkpoint-3306/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb3f993f19242cc21b249926529d9621fcd4005c95c36819f3b93098771d8c48
3
+ size 1383
checkpoint-3306/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8098d0a50b0a08da1532b22288d46dbd7b7ecd90ddf43ed734ed15ffaafe599
3
+ size 1465
checkpoint-3306/trainer_state.json ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3306,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09074410163339383,
14
+ "grad_norm": 1.4687509536743164,
15
+ "learning_rate": 8.98366606170599e-06,
16
+ "loss": 6.9172,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.18148820326678766,
21
+ "grad_norm": 1.268220067024231,
22
+ "learning_rate": 1.8058076225045372e-05,
23
+ "loss": 5.2198,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.27223230490018147,
28
+ "grad_norm": 0.9486992955207825,
29
+ "learning_rate": 2.7132486388384752e-05,
30
+ "loss": 4.6092,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.3629764065335753,
35
+ "grad_norm": 0.8284947276115417,
36
+ "learning_rate": 3.620689655172414e-05,
37
+ "loss": 4.2682,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.4537205081669691,
42
+ "grad_norm": 0.6980849504470825,
43
+ "learning_rate": 4.528130671506352e-05,
44
+ "loss": 4.2027,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5444646098003629,
49
+ "grad_norm": 0.9210988879203796,
50
+ "learning_rate": 5.435571687840291e-05,
51
+ "loss": 3.9613,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.6352087114337568,
56
+ "grad_norm": 0.8076866269111633,
57
+ "learning_rate": 6.343012704174229e-05,
58
+ "loss": 3.9595,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.7259528130671506,
63
+ "grad_norm": 0.7056506276130676,
64
+ "learning_rate": 7.250453720508167e-05,
65
+ "loss": 3.9345,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.8166969147005445,
70
+ "grad_norm": 0.7730728387832642,
71
+ "learning_rate": 8.157894736842105e-05,
72
+ "loss": 3.8263,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.9074410163339383,
77
+ "grad_norm": 0.6958891749382019,
78
+ "learning_rate": 9.065335753176044e-05,
79
+ "loss": 3.7584,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.9981851179673321,
84
+ "grad_norm": 0.7870356440544128,
85
+ "learning_rate": 9.972776769509982e-05,
86
+ "loss": 3.6662,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 1.0,
91
+ "eval_loss": 3.6291747093200684,
92
+ "eval_runtime": 9.9369,
93
+ "eval_samples_per_second": 394.189,
94
+ "eval_steps_per_second": 49.311,
95
+ "step": 1102
96
+ },
97
+ {
98
+ "epoch": 1.0889292196007259,
99
+ "grad_norm": 0.6388369798660278,
100
+ "learning_rate": 9.90219802379512e-05,
101
+ "loss": 3.5837,
102
+ "step": 1200
103
+ },
104
+ {
105
+ "epoch": 1.1796733212341197,
106
+ "grad_norm": 0.771931529045105,
107
+ "learning_rate": 9.80137124420246e-05,
108
+ "loss": 3.5696,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 1.2704174228675136,
113
+ "grad_norm": 0.8356881141662598,
114
+ "learning_rate": 9.700544464609801e-05,
115
+ "loss": 3.4928,
116
+ "step": 1400
117
+ },
118
+ {
119
+ "epoch": 1.3611615245009074,
120
+ "grad_norm": 0.7187588214874268,
121
+ "learning_rate": 9.599717685017141e-05,
122
+ "loss": 3.5173,
123
+ "step": 1500
124
+ },
125
+ {
126
+ "epoch": 1.4519056261343013,
127
+ "grad_norm": 0.76460862159729,
128
+ "learning_rate": 9.498890905424481e-05,
129
+ "loss": 3.4,
130
+ "step": 1600
131
+ },
132
+ {
133
+ "epoch": 1.542649727767695,
134
+ "grad_norm": 0.7365128993988037,
135
+ "learning_rate": 9.398064125831822e-05,
136
+ "loss": 3.3335,
137
+ "step": 1700
138
+ },
139
+ {
140
+ "epoch": 1.633393829401089,
141
+ "grad_norm": 0.6376582980155945,
142
+ "learning_rate": 9.297237346239162e-05,
143
+ "loss": 3.2894,
144
+ "step": 1800
145
+ },
146
+ {
147
+ "epoch": 1.7241379310344827,
148
+ "grad_norm": 0.7320665121078491,
149
+ "learning_rate": 9.196410566646501e-05,
150
+ "loss": 3.2772,
151
+ "step": 1900
152
+ },
153
+ {
154
+ "epoch": 1.8148820326678767,
155
+ "grad_norm": 1.4237512350082397,
156
+ "learning_rate": 9.095583787053841e-05,
157
+ "loss": 3.2934,
158
+ "step": 2000
159
+ },
160
+ {
161
+ "epoch": 1.9056261343012704,
162
+ "grad_norm": 0.7177993655204773,
163
+ "learning_rate": 8.994757007461182e-05,
164
+ "loss": 3.2374,
165
+ "step": 2100
166
+ },
167
+ {
168
+ "epoch": 1.9963702359346642,
169
+ "grad_norm": 0.7358414530754089,
170
+ "learning_rate": 8.893930227868522e-05,
171
+ "loss": 3.2032,
172
+ "step": 2200
173
+ },
174
+ {
175
+ "epoch": 2.0,
176
+ "eval_loss": 3.1682567596435547,
177
+ "eval_runtime": 9.9556,
178
+ "eval_samples_per_second": 393.448,
179
+ "eval_steps_per_second": 49.219,
180
+ "step": 2204
181
+ },
182
+ {
183
+ "epoch": 2.087114337568058,
184
+ "grad_norm": 0.6569345593452454,
185
+ "learning_rate": 8.793103448275862e-05,
186
+ "loss": 3.1474,
187
+ "step": 2300
188
+ },
189
+ {
190
+ "epoch": 2.1778584392014517,
191
+ "grad_norm": 0.719578206539154,
192
+ "learning_rate": 8.692276668683203e-05,
193
+ "loss": 3.0573,
194
+ "step": 2400
195
+ },
196
+ {
197
+ "epoch": 2.268602540834846,
198
+ "grad_norm": 0.7317540645599365,
199
+ "learning_rate": 8.591449889090543e-05,
200
+ "loss": 3.1363,
201
+ "step": 2500
202
+ },
203
+ {
204
+ "epoch": 2.3593466424682394,
205
+ "grad_norm": 0.7985743880271912,
206
+ "learning_rate": 8.490623109497882e-05,
207
+ "loss": 3.0611,
208
+ "step": 2600
209
+ },
210
+ {
211
+ "epoch": 2.4500907441016335,
212
+ "grad_norm": 0.8037993907928467,
213
+ "learning_rate": 8.389796329905223e-05,
214
+ "loss": 3.0735,
215
+ "step": 2700
216
+ },
217
+ {
218
+ "epoch": 2.540834845735027,
219
+ "grad_norm": 0.7689797878265381,
220
+ "learning_rate": 8.288969550312563e-05,
221
+ "loss": 3.0639,
222
+ "step": 2800
223
+ },
224
+ {
225
+ "epoch": 2.6315789473684212,
226
+ "grad_norm": 0.7713281512260437,
227
+ "learning_rate": 8.188142770719903e-05,
228
+ "loss": 3.0165,
229
+ "step": 2900
230
+ },
231
+ {
232
+ "epoch": 2.722323049001815,
233
+ "grad_norm": 0.7961378693580627,
234
+ "learning_rate": 8.087315991127244e-05,
235
+ "loss": 2.9901,
236
+ "step": 3000
237
+ },
238
+ {
239
+ "epoch": 2.8130671506352085,
240
+ "grad_norm": 0.7088135480880737,
241
+ "learning_rate": 7.986489211534584e-05,
242
+ "loss": 3.0241,
243
+ "step": 3100
244
+ },
245
+ {
246
+ "epoch": 2.9038112522686026,
247
+ "grad_norm": 0.7496780157089233,
248
+ "learning_rate": 7.885662431941923e-05,
249
+ "loss": 2.9939,
250
+ "step": 3200
251
+ },
252
+ {
253
+ "epoch": 2.9945553539019962,
254
+ "grad_norm": 0.7221343517303467,
255
+ "learning_rate": 7.784835652349264e-05,
256
+ "loss": 2.9926,
257
+ "step": 3300
258
+ },
259
+ {
260
+ "epoch": 3.0,
261
+ "eval_loss": 2.9603192806243896,
262
+ "eval_runtime": 10.1067,
263
+ "eval_samples_per_second": 387.563,
264
+ "eval_steps_per_second": 48.483,
265
+ "step": 3306
266
+ }
267
+ ],
268
+ "logging_steps": 100,
269
+ "max_steps": 11020,
270
+ "num_input_tokens_seen": 0,
271
+ "num_train_epochs": 10,
272
+ "save_steps": 500,
273
+ "stateful_callbacks": {
274
+ "TrainerControl": {
275
+ "args": {
276
+ "should_epoch_stop": false,
277
+ "should_evaluate": false,
278
+ "should_log": false,
279
+ "should_save": true,
280
+ "should_training_stop": false
281
+ },
282
+ "attributes": {}
283
+ }
284
+ },
285
+ "total_flos": 2.7631632384e+16,
286
+ "train_batch_size": 32,
287
+ "trial_name": null,
288
+ "trial_params": null
289
+ }
checkpoint-3306/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
3
+ size 5777
checkpoint-4408/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 512,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 512,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.1,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.55.1",
30
+ "use_cache": true,
31
+ "vocab_size": 50000
32
+ }
checkpoint-4408/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.55.1"
6
+ }
checkpoint-4408/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a233baa8d1a111b7be64b7d49d53720fc77a86f35a7a5e2b4b4b50aadcb00832
3
+ size 495411840
checkpoint-4408/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82bfe263f2408c02d905a1ae42fc1ace5850810e781a2068eb873b4f88cdeba6
3
+ size 990920075
checkpoint-4408/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffb15f127c28eb75998b71c1ec183731c8b207892736e6e627ee2c73156db517
3
+ size 14645
checkpoint-4408/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a1d6b1539486d741d691f9bae9a67afd921fe54bf7cd8cabd2c75c1229a85bb
3
+ size 1383
checkpoint-4408/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31d76e4d6318626af04f0f85be2e6a54f3a0008533d32ff287c639030e907fb1
3
+ size 1465
checkpoint-4408/trainer_state.json ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 4408,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09074410163339383,
14
+ "grad_norm": 1.4687509536743164,
15
+ "learning_rate": 8.98366606170599e-06,
16
+ "loss": 6.9172,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.18148820326678766,
21
+ "grad_norm": 1.268220067024231,
22
+ "learning_rate": 1.8058076225045372e-05,
23
+ "loss": 5.2198,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.27223230490018147,
28
+ "grad_norm": 0.9486992955207825,
29
+ "learning_rate": 2.7132486388384752e-05,
30
+ "loss": 4.6092,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.3629764065335753,
35
+ "grad_norm": 0.8284947276115417,
36
+ "learning_rate": 3.620689655172414e-05,
37
+ "loss": 4.2682,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.4537205081669691,
42
+ "grad_norm": 0.6980849504470825,
43
+ "learning_rate": 4.528130671506352e-05,
44
+ "loss": 4.2027,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5444646098003629,
49
+ "grad_norm": 0.9210988879203796,
50
+ "learning_rate": 5.435571687840291e-05,
51
+ "loss": 3.9613,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.6352087114337568,
56
+ "grad_norm": 0.8076866269111633,
57
+ "learning_rate": 6.343012704174229e-05,
58
+ "loss": 3.9595,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.7259528130671506,
63
+ "grad_norm": 0.7056506276130676,
64
+ "learning_rate": 7.250453720508167e-05,
65
+ "loss": 3.9345,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.8166969147005445,
70
+ "grad_norm": 0.7730728387832642,
71
+ "learning_rate": 8.157894736842105e-05,
72
+ "loss": 3.8263,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.9074410163339383,
77
+ "grad_norm": 0.6958891749382019,
78
+ "learning_rate": 9.065335753176044e-05,
79
+ "loss": 3.7584,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.9981851179673321,
84
+ "grad_norm": 0.7870356440544128,
85
+ "learning_rate": 9.972776769509982e-05,
86
+ "loss": 3.6662,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 1.0,
91
+ "eval_loss": 3.6291747093200684,
92
+ "eval_runtime": 9.9369,
93
+ "eval_samples_per_second": 394.189,
94
+ "eval_steps_per_second": 49.311,
95
+ "step": 1102
96
+ },
97
+ {
98
+ "epoch": 1.0889292196007259,
99
+ "grad_norm": 0.6388369798660278,
100
+ "learning_rate": 9.90219802379512e-05,
101
+ "loss": 3.5837,
102
+ "step": 1200
103
+ },
104
+ {
105
+ "epoch": 1.1796733212341197,
106
+ "grad_norm": 0.771931529045105,
107
+ "learning_rate": 9.80137124420246e-05,
108
+ "loss": 3.5696,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 1.2704174228675136,
113
+ "grad_norm": 0.8356881141662598,
114
+ "learning_rate": 9.700544464609801e-05,
115
+ "loss": 3.4928,
116
+ "step": 1400
117
+ },
118
+ {
119
+ "epoch": 1.3611615245009074,
120
+ "grad_norm": 0.7187588214874268,
121
+ "learning_rate": 9.599717685017141e-05,
122
+ "loss": 3.5173,
123
+ "step": 1500
124
+ },
125
+ {
126
+ "epoch": 1.4519056261343013,
127
+ "grad_norm": 0.76460862159729,
128
+ "learning_rate": 9.498890905424481e-05,
129
+ "loss": 3.4,
130
+ "step": 1600
131
+ },
132
+ {
133
+ "epoch": 1.542649727767695,
134
+ "grad_norm": 0.7365128993988037,
135
+ "learning_rate": 9.398064125831822e-05,
136
+ "loss": 3.3335,
137
+ "step": 1700
138
+ },
139
+ {
140
+ "epoch": 1.633393829401089,
141
+ "grad_norm": 0.6376582980155945,
142
+ "learning_rate": 9.297237346239162e-05,
143
+ "loss": 3.2894,
144
+ "step": 1800
145
+ },
146
+ {
147
+ "epoch": 1.7241379310344827,
148
+ "grad_norm": 0.7320665121078491,
149
+ "learning_rate": 9.196410566646501e-05,
150
+ "loss": 3.2772,
151
+ "step": 1900
152
+ },
153
+ {
154
+ "epoch": 1.8148820326678767,
155
+ "grad_norm": 1.4237512350082397,
156
+ "learning_rate": 9.095583787053841e-05,
157
+ "loss": 3.2934,
158
+ "step": 2000
159
+ },
160
+ {
161
+ "epoch": 1.9056261343012704,
162
+ "grad_norm": 0.7177993655204773,
163
+ "learning_rate": 8.994757007461182e-05,
164
+ "loss": 3.2374,
165
+ "step": 2100
166
+ },
167
+ {
168
+ "epoch": 1.9963702359346642,
169
+ "grad_norm": 0.7358414530754089,
170
+ "learning_rate": 8.893930227868522e-05,
171
+ "loss": 3.2032,
172
+ "step": 2200
173
+ },
174
+ {
175
+ "epoch": 2.0,
176
+ "eval_loss": 3.1682567596435547,
177
+ "eval_runtime": 9.9556,
178
+ "eval_samples_per_second": 393.448,
179
+ "eval_steps_per_second": 49.219,
180
+ "step": 2204
181
+ },
182
+ {
183
+ "epoch": 2.087114337568058,
184
+ "grad_norm": 0.6569345593452454,
185
+ "learning_rate": 8.793103448275862e-05,
186
+ "loss": 3.1474,
187
+ "step": 2300
188
+ },
189
+ {
190
+ "epoch": 2.1778584392014517,
191
+ "grad_norm": 0.719578206539154,
192
+ "learning_rate": 8.692276668683203e-05,
193
+ "loss": 3.0573,
194
+ "step": 2400
195
+ },
196
+ {
197
+ "epoch": 2.268602540834846,
198
+ "grad_norm": 0.7317540645599365,
199
+ "learning_rate": 8.591449889090543e-05,
200
+ "loss": 3.1363,
201
+ "step": 2500
202
+ },
203
+ {
204
+ "epoch": 2.3593466424682394,
205
+ "grad_norm": 0.7985743880271912,
206
+ "learning_rate": 8.490623109497882e-05,
207
+ "loss": 3.0611,
208
+ "step": 2600
209
+ },
210
+ {
211
+ "epoch": 2.4500907441016335,
212
+ "grad_norm": 0.8037993907928467,
213
+ "learning_rate": 8.389796329905223e-05,
214
+ "loss": 3.0735,
215
+ "step": 2700
216
+ },
217
+ {
218
+ "epoch": 2.540834845735027,
219
+ "grad_norm": 0.7689797878265381,
220
+ "learning_rate": 8.288969550312563e-05,
221
+ "loss": 3.0639,
222
+ "step": 2800
223
+ },
224
+ {
225
+ "epoch": 2.6315789473684212,
226
+ "grad_norm": 0.7713281512260437,
227
+ "learning_rate": 8.188142770719903e-05,
228
+ "loss": 3.0165,
229
+ "step": 2900
230
+ },
231
+ {
232
+ "epoch": 2.722323049001815,
233
+ "grad_norm": 0.7961378693580627,
234
+ "learning_rate": 8.087315991127244e-05,
235
+ "loss": 2.9901,
236
+ "step": 3000
237
+ },
238
+ {
239
+ "epoch": 2.8130671506352085,
240
+ "grad_norm": 0.7088135480880737,
241
+ "learning_rate": 7.986489211534584e-05,
242
+ "loss": 3.0241,
243
+ "step": 3100
244
+ },
245
+ {
246
+ "epoch": 2.9038112522686026,
247
+ "grad_norm": 0.7496780157089233,
248
+ "learning_rate": 7.885662431941923e-05,
249
+ "loss": 2.9939,
250
+ "step": 3200
251
+ },
252
+ {
253
+ "epoch": 2.9945553539019962,
254
+ "grad_norm": 0.7221343517303467,
255
+ "learning_rate": 7.784835652349264e-05,
256
+ "loss": 2.9926,
257
+ "step": 3300
258
+ },
259
+ {
260
+ "epoch": 3.0,
261
+ "eval_loss": 2.9603192806243896,
262
+ "eval_runtime": 10.1067,
263
+ "eval_samples_per_second": 387.563,
264
+ "eval_steps_per_second": 48.483,
265
+ "step": 3306
266
+ },
267
+ {
268
+ "epoch": 3.0852994555353903,
269
+ "grad_norm": 0.8570982217788696,
270
+ "learning_rate": 7.684008872756604e-05,
271
+ "loss": 2.9251,
272
+ "step": 3400
273
+ },
274
+ {
275
+ "epoch": 3.176043557168784,
276
+ "grad_norm": 0.7704641222953796,
277
+ "learning_rate": 7.583182093163945e-05,
278
+ "loss": 2.8877,
279
+ "step": 3500
280
+ },
281
+ {
282
+ "epoch": 3.266787658802178,
283
+ "grad_norm": 0.6863057017326355,
284
+ "learning_rate": 7.482355313571285e-05,
285
+ "loss": 2.8743,
286
+ "step": 3600
287
+ },
288
+ {
289
+ "epoch": 3.3575317604355717,
290
+ "grad_norm": 0.7227942943572998,
291
+ "learning_rate": 7.381528533978626e-05,
292
+ "loss": 2.823,
293
+ "step": 3700
294
+ },
295
+ {
296
+ "epoch": 3.4482758620689653,
297
+ "grad_norm": 0.7746195197105408,
298
+ "learning_rate": 7.280701754385966e-05,
299
+ "loss": 2.8962,
300
+ "step": 3800
301
+ },
302
+ {
303
+ "epoch": 3.5390199637023594,
304
+ "grad_norm": 0.7885217070579529,
305
+ "learning_rate": 7.179874974793306e-05,
306
+ "loss": 2.8419,
307
+ "step": 3900
308
+ },
309
+ {
310
+ "epoch": 3.629764065335753,
311
+ "grad_norm": 0.7623139023780823,
312
+ "learning_rate": 7.079048195200647e-05,
313
+ "loss": 2.8333,
314
+ "step": 4000
315
+ },
316
+ {
317
+ "epoch": 3.720508166969147,
318
+ "grad_norm": 0.8208196759223938,
319
+ "learning_rate": 6.978221415607986e-05,
320
+ "loss": 2.8302,
321
+ "step": 4100
322
+ },
323
+ {
324
+ "epoch": 3.8112522686025407,
325
+ "grad_norm": 0.7712786197662354,
326
+ "learning_rate": 6.877394636015326e-05,
327
+ "loss": 2.8674,
328
+ "step": 4200
329
+ },
330
+ {
331
+ "epoch": 3.901996370235935,
332
+ "grad_norm": 0.8100000023841858,
333
+ "learning_rate": 6.776567856422666e-05,
334
+ "loss": 2.8097,
335
+ "step": 4300
336
+ },
337
+ {
338
+ "epoch": 3.9927404718693285,
339
+ "grad_norm": 0.8472097516059875,
340
+ "learning_rate": 6.675741076830007e-05,
341
+ "loss": 2.8418,
342
+ "step": 4400
343
+ },
344
+ {
345
+ "epoch": 4.0,
346
+ "eval_loss": 2.8345706462860107,
347
+ "eval_runtime": 9.9462,
348
+ "eval_samples_per_second": 393.819,
349
+ "eval_steps_per_second": 49.265,
350
+ "step": 4408
351
+ }
352
+ ],
353
+ "logging_steps": 100,
354
+ "max_steps": 11020,
355
+ "num_input_tokens_seen": 0,
356
+ "num_train_epochs": 10,
357
+ "save_steps": 500,
358
+ "stateful_callbacks": {
359
+ "TrainerControl": {
360
+ "args": {
361
+ "should_epoch_stop": false,
362
+ "should_evaluate": false,
363
+ "should_log": false,
364
+ "should_save": true,
365
+ "should_training_stop": false
366
+ },
367
+ "attributes": {}
368
+ }
369
+ },
370
+ "total_flos": 3.6842176512e+16,
371
+ "train_batch_size": 32,
372
+ "trial_name": null,
373
+ "trial_params": null
374
+ }
checkpoint-4408/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
3
+ size 5777
checkpoint-5510/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 512,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 512,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.1,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.55.1",
30
+ "use_cache": true,
31
+ "vocab_size": 50000
32
+ }
checkpoint-5510/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.55.1"
6
+ }
checkpoint-5510/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e1913fca505749d24fa57f254a8ca8c8eb4c1a1960bc8aed3638d48d0de2f3f
3
+ size 495411840
checkpoint-5510/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7791d9266744c6b12ac9a1ee4226f85b1431337a7798ba3752d28b413c11aadd
3
+ size 990920075
checkpoint-5510/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e04e2fc3463de6b54ec5a12876a98d410df180d34e08b28d2dcf75fe273573f
3
+ size 14645