nschenk16 committed on
Commit
098b226
·
verified ·
1 Parent(s): ead7426

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -25,12 +25,12 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
 
28
  "v_proj",
29
- "k_proj",
30
  "gate_proj",
31
- "q_proj",
32
  "o_proj",
33
- "up_proj",
34
  "down_proj"
35
  ],
36
  "target_parameters": null,
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "up_proj",
29
  "v_proj",
 
30
  "gate_proj",
31
+ "k_proj",
32
  "o_proj",
33
+ "q_proj",
34
  "down_proj"
35
  ],
36
  "target_parameters": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd56087c89c66211706ec9604f874f7f37183adc4a87bde3296c02d95ad546cf
3
  size 70430032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bad6519f4222c69631b39ae11e4e641593524e17b56b28f272222f981b37964
3
  size 70430032
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94d540d4d5ec8bd8a9745d5a6982fe34bd12584b54f1a20f08af3b91e1c1042e
3
  size 36140325
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10f0af0adccf49d58dc86ff958136fdbc89448a8666cadb0cced82e40b534cd1
3
  size 36140325
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2b06adf858fd431254cfea4a735b5073ec95a938351985d2f58fd75afb88f51
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cfde38fbb26ecac9770a5a0fdd06ad8687211a169ed61ebc2799cda44f36fb9
3
  size 14645
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_global_step": 100,
3
- "best_metric": 0.3803686201572418,
4
- "best_model_checkpoint": "./lora_out/1ca02918/checkpoint-100",
5
  "epoch": 1.0,
6
  "eval_steps": 10,
7
  "global_step": 107,
@@ -10,193 +10,213 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
 
13
  "epoch": 0.09411764705882353,
14
- "grad_norm": 1.270073413848877,
15
  "learning_rate": 0.00016003636363636365,
16
- "loss": 1.2398,
17
- "mean_token_accuracy": 0.7614437624812126,
18
- "num_tokens": 115124.0,
19
  "step": 10
20
  },
21
  {
22
  "epoch": 0.09411764705882353,
23
- "eval_loss": 0.896133542060852,
24
- "eval_mean_token_accuracy": 0.8207377485434214,
25
- "eval_num_tokens": 115124.0,
26
- "eval_runtime": 23.5565,
27
- "eval_samples_per_second": 6.368,
28
- "eval_steps_per_second": 6.368,
 
29
  "step": 10
30
  },
31
  {
 
32
  "epoch": 0.18823529411764706,
33
- "grad_norm": 0.6748596429824829,
34
  "learning_rate": 0.0001793,
35
- "loss": 0.555,
36
- "mean_token_accuracy": 0.8851501628756523,
37
- "num_tokens": 183590.0,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 0.18823529411764706,
42
- "eval_loss": 0.4925537109375,
43
- "eval_mean_token_accuracy": 0.9008145872751872,
44
- "eval_num_tokens": 183590.0,
45
- "eval_runtime": 23.5261,
46
- "eval_samples_per_second": 6.376,
47
- "eval_steps_per_second": 6.376,
 
48
  "step": 20
49
  },
50
  {
 
51
  "epoch": 0.2823529411764706,
52
- "grad_norm": 0.5109054446220398,
53
  "learning_rate": 0.000158925,
54
- "loss": 0.5305,
55
- "mean_token_accuracy": 0.8950182288885117,
56
- "num_tokens": 274669.0,
57
  "step": 30
58
  },
59
  {
60
  "epoch": 0.2823529411764706,
61
- "eval_loss": 0.43484488129615784,
62
- "eval_mean_token_accuracy": 0.9108006227016449,
63
- "eval_num_tokens": 274669.0,
64
- "eval_runtime": 23.5726,
65
- "eval_samples_per_second": 6.363,
66
- "eval_steps_per_second": 6.363,
 
67
  "step": 30
68
  },
69
  {
 
70
  "epoch": 0.3764705882352941,
71
- "grad_norm": 0.2931932806968689,
72
  "learning_rate": 0.00013855,
73
- "loss": 0.4322,
74
- "mean_token_accuracy": 0.9151976436376572,
75
- "num_tokens": 366523.0,
76
  "step": 40
77
  },
78
  {
79
  "epoch": 0.3764705882352941,
80
- "eval_loss": 0.419387549161911,
81
- "eval_mean_token_accuracy": 0.9135305122534434,
82
- "eval_num_tokens": 366523.0,
83
- "eval_runtime": 23.518,
84
- "eval_samples_per_second": 6.378,
85
- "eval_steps_per_second": 6.378,
 
86
  "step": 40
87
  },
88
  {
 
89
  "epoch": 0.47058823529411764,
90
- "grad_norm": 0.2669583559036255,
91
  "learning_rate": 0.000118175,
92
- "loss": 0.3397,
93
- "mean_token_accuracy": 0.9227273896336555,
94
- "num_tokens": 427601.0,
95
  "step": 50
96
  },
97
  {
98
  "epoch": 0.47058823529411764,
99
- "eval_loss": 0.4084239900112152,
100
- "eval_mean_token_accuracy": 0.9155814254283905,
101
- "eval_num_tokens": 427601.0,
102
- "eval_runtime": 23.5117,
103
- "eval_samples_per_second": 6.38,
104
- "eval_steps_per_second": 6.38,
 
105
  "step": 50
106
  },
107
  {
 
108
  "epoch": 0.5647058823529412,
109
- "grad_norm": 0.24561642110347748,
110
  "learning_rate": 9.78e-05,
111
- "loss": 0.5525,
112
- "mean_token_accuracy": 0.8890453979372979,
113
- "num_tokens": 541710.0,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.5647058823529412,
118
- "eval_loss": 0.4013407528400421,
119
- "eval_mean_token_accuracy": 0.9171908668677012,
120
- "eval_num_tokens": 541710.0,
121
- "eval_runtime": 23.5555,
122
- "eval_samples_per_second": 6.368,
123
- "eval_steps_per_second": 6.368,
 
124
  "step": 60
125
  },
126
  {
 
127
  "epoch": 0.6588235294117647,
128
- "grad_norm": 0.3256433606147766,
129
  "learning_rate": 7.7425e-05,
130
- "loss": 0.3582,
131
- "mean_token_accuracy": 0.9248881861567497,
132
- "num_tokens": 620339.0,
133
  "step": 70
134
  },
135
  {
136
  "epoch": 0.6588235294117647,
137
- "eval_loss": 0.3908792734146118,
138
- "eval_mean_token_accuracy": 0.9177969670295716,
139
- "eval_num_tokens": 620339.0,
140
- "eval_runtime": 23.6388,
141
- "eval_samples_per_second": 6.345,
142
- "eval_steps_per_second": 6.345,
 
143
  "step": 70
144
  },
145
  {
 
146
  "epoch": 0.7529411764705882,
147
- "grad_norm": 0.35445523262023926,
148
  "learning_rate": 5.7050000000000004e-05,
149
- "loss": 0.3729,
150
- "mean_token_accuracy": 0.9206046536564827,
151
- "num_tokens": 695812.0,
152
  "step": 80
153
  },
154
  {
155
  "epoch": 0.7529411764705882,
156
- "eval_loss": 0.3841544985771179,
157
- "eval_mean_token_accuracy": 0.9190148003896077,
158
- "eval_num_tokens": 695812.0,
159
- "eval_runtime": 23.5427,
160
- "eval_samples_per_second": 6.371,
161
- "eval_steps_per_second": 6.371,
 
162
  "step": 80
163
  },
164
  {
 
165
  "epoch": 0.8470588235294118,
166
- "grad_norm": 0.2725517153739929,
167
  "learning_rate": 3.6675000000000004e-05,
168
- "loss": 0.4558,
169
- "mean_token_accuracy": 0.9098866626620292,
170
- "num_tokens": 796104.0,
171
  "step": 90
172
  },
173
  {
174
  "epoch": 0.8470588235294118,
175
- "eval_loss": 0.3851577639579773,
176
- "eval_mean_token_accuracy": 0.9192596852779389,
177
- "eval_num_tokens": 796104.0,
178
- "eval_runtime": 23.599,
179
- "eval_samples_per_second": 6.356,
180
- "eval_steps_per_second": 6.356,
 
181
  "step": 90
182
  },
183
  {
 
184
  "epoch": 0.9411764705882353,
185
- "grad_norm": 0.28482893109321594,
186
  "learning_rate": 1.63e-05,
187
- "loss": 0.2851,
188
- "mean_token_accuracy": 0.9351509854197502,
189
- "num_tokens": 861517.0,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.9411764705882353,
194
- "eval_loss": 0.3803686201572418,
195
- "eval_mean_token_accuracy": 0.920326874256134,
196
- "eval_num_tokens": 861517.0,
197
- "eval_runtime": 23.6491,
198
- "eval_samples_per_second": 6.343,
199
- "eval_steps_per_second": 6.343,
 
200
  "step": 100
201
  }
202
  ],
@@ -217,7 +237,7 @@
217
  "attributes": {}
218
  }
219
  },
220
- "total_flos": 2032207927076352.0,
221
  "train_batch_size": 2,
222
  "trial_name": null,
223
  "trial_params": null
 
1
  {
2
  "best_global_step": 100,
3
+ "best_metric": 0.3854297995567322,
4
+ "best_model_checkpoint": "./lora_out/efu0wyi4/checkpoint-100",
5
  "epoch": 1.0,
6
  "eval_steps": 10,
7
  "global_step": 107,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.1153396010398864,
14
  "epoch": 0.09411764705882353,
15
+ "grad_norm": 1.2214514017105103,
16
  "learning_rate": 0.00016003636363636365,
17
+ "loss": 1.2524,
18
+ "mean_token_accuracy": 0.7594971776008606,
19
+ "num_tokens": 115799.0,
20
  "step": 10
21
  },
22
  {
23
  "epoch": 0.09411764705882353,
24
+ "eval_entropy": 0.9907166957855225,
25
+ "eval_loss": 0.9035767912864685,
26
+ "eval_mean_token_accuracy": 0.8175230169296265,
27
+ "eval_num_tokens": 115799.0,
28
+ "eval_runtime": 26.0099,
29
+ "eval_samples_per_second": 5.767,
30
+ "eval_steps_per_second": 5.767,
31
  "step": 10
32
  },
33
  {
34
+ "entropy": 0.6309416361153126,
35
  "epoch": 0.18823529411764706,
36
+ "grad_norm": 0.6099486351013184,
37
  "learning_rate": 0.0001793,
38
+ "loss": 0.5626,
39
+ "mean_token_accuracy": 0.8832408726215363,
40
+ "num_tokens": 184486.0,
41
  "step": 20
42
  },
43
  {
44
  "epoch": 0.18823529411764706,
45
+ "eval_entropy": 0.4709713250398636,
46
+ "eval_loss": 0.5018168687820435,
47
+ "eval_mean_token_accuracy": 0.8997142084439596,
48
+ "eval_num_tokens": 184486.0,
49
+ "eval_runtime": 26.0226,
50
+ "eval_samples_per_second": 5.764,
51
+ "eval_steps_per_second": 5.764,
52
  "step": 20
53
  },
54
  {
55
+ "entropy": 0.5151705276221037,
56
  "epoch": 0.2823529411764706,
57
+ "grad_norm": 0.5180553793907166,
58
  "learning_rate": 0.000158925,
59
+ "loss": 0.5382,
60
+ "mean_token_accuracy": 0.8940704673528671,
61
+ "num_tokens": 275845.0,
62
  "step": 30
63
  },
64
  {
65
  "epoch": 0.2823529411764706,
66
+ "eval_entropy": 0.4590779893596967,
67
+ "eval_loss": 0.4454800486564636,
68
+ "eval_mean_token_accuracy": 0.9063887639840444,
69
+ "eval_num_tokens": 275845.0,
70
+ "eval_runtime": 26.0809,
71
+ "eval_samples_per_second": 5.751,
72
+ "eval_steps_per_second": 5.751,
73
  "step": 30
74
  },
75
  {
76
+ "entropy": 0.4594229131937027,
77
  "epoch": 0.3764705882352941,
78
+ "grad_norm": 0.2898012399673462,
79
  "learning_rate": 0.00013855,
80
+ "loss": 0.45,
81
+ "mean_token_accuracy": 0.91114012748003,
82
+ "num_tokens": 368424.0,
83
  "step": 40
84
  },
85
  {
86
  "epoch": 0.3764705882352941,
87
+ "eval_entropy": 0.4673765400548776,
88
+ "eval_loss": 0.43020305037498474,
89
+ "eval_mean_token_accuracy": 0.9115767550468444,
90
+ "eval_num_tokens": 368424.0,
91
+ "eval_runtime": 26.0231,
92
+ "eval_samples_per_second": 5.764,
93
+ "eval_steps_per_second": 5.764,
94
  "step": 40
95
  },
96
  {
97
+ "entropy": 0.3577877376228571,
98
  "epoch": 0.47058823529411764,
99
+ "grad_norm": 0.25978532433509827,
100
  "learning_rate": 0.000118175,
101
+ "loss": 0.343,
102
+ "mean_token_accuracy": 0.9222506016492844,
103
+ "num_tokens": 429589.0,
104
  "step": 50
105
  },
106
  {
107
  "epoch": 0.47058823529411764,
108
+ "eval_entropy": 0.39317929953336717,
109
+ "eval_loss": 0.42108139395713806,
110
+ "eval_mean_token_accuracy": 0.9134278730551402,
111
+ "eval_num_tokens": 429589.0,
112
+ "eval_runtime": 26.002,
113
+ "eval_samples_per_second": 5.769,
114
+ "eval_steps_per_second": 5.769,
115
  "step": 50
116
  },
117
  {
118
+ "entropy": 0.54765380397439,
119
  "epoch": 0.5647058823529412,
120
+ "grad_norm": 0.2597196698188782,
121
  "learning_rate": 9.78e-05,
122
+ "loss": 0.5644,
123
+ "mean_token_accuracy": 0.8865802466869355,
124
+ "num_tokens": 544259.0,
125
  "step": 60
126
  },
127
  {
128
  "epoch": 0.5647058823529412,
129
+ "eval_entropy": 0.41075391257802646,
130
+ "eval_loss": 0.4073421359062195,
131
+ "eval_mean_token_accuracy": 0.9149504574139913,
132
+ "eval_num_tokens": 544259.0,
133
+ "eval_runtime": 26.2517,
134
+ "eval_samples_per_second": 5.714,
135
+ "eval_steps_per_second": 5.714,
136
  "step": 60
137
  },
138
  {
139
+ "entropy": 0.3822147287428379,
140
  "epoch": 0.6588235294117647,
141
+ "grad_norm": 0.3055135905742645,
142
  "learning_rate": 7.7425e-05,
143
+ "loss": 0.3693,
144
+ "mean_token_accuracy": 0.922209607064724,
145
+ "num_tokens": 623365.0,
146
  "step": 70
147
  },
148
  {
149
  "epoch": 0.6588235294117647,
150
+ "eval_entropy": 0.4195191798110803,
151
+ "eval_loss": 0.3989087641239166,
152
+ "eval_mean_token_accuracy": 0.915905403693517,
153
+ "eval_num_tokens": 623365.0,
154
+ "eval_runtime": 26.2348,
155
+ "eval_samples_per_second": 5.718,
156
+ "eval_steps_per_second": 5.718,
157
  "step": 70
158
  },
159
  {
160
+ "entropy": 0.38976135551929475,
161
  "epoch": 0.7529411764705882,
162
+ "grad_norm": 0.34105750918388367,
163
  "learning_rate": 5.7050000000000004e-05,
164
+ "loss": 0.3761,
165
+ "mean_token_accuracy": 0.9197401210665703,
166
+ "num_tokens": 699013.0,
167
  "step": 80
168
  },
169
  {
170
  "epoch": 0.7529411764705882,
171
+ "eval_entropy": 0.39603232031067215,
172
+ "eval_loss": 0.3924122452735901,
173
+ "eval_mean_token_accuracy": 0.917570983171463,
174
+ "eval_num_tokens": 699013.0,
175
+ "eval_runtime": 26.1772,
176
+ "eval_samples_per_second": 5.73,
177
+ "eval_steps_per_second": 5.73,
178
  "step": 80
179
  },
180
  {
181
+ "entropy": 0.47345383167266847,
182
  "epoch": 0.8470588235294118,
183
+ "grad_norm": 0.3139539957046509,
184
  "learning_rate": 3.6675000000000004e-05,
185
+ "loss": 0.4697,
186
+ "mean_token_accuracy": 0.9068517610430717,
187
+ "num_tokens": 800061.0,
188
  "step": 90
189
  },
190
  {
191
  "epoch": 0.8470588235294118,
192
+ "eval_entropy": 0.38997318550944327,
193
+ "eval_loss": 0.39237019419670105,
194
+ "eval_mean_token_accuracy": 0.917557996114095,
195
+ "eval_num_tokens": 800061.0,
196
+ "eval_runtime": 26.0084,
197
+ "eval_samples_per_second": 5.767,
198
+ "eval_steps_per_second": 5.767,
199
  "step": 90
200
  },
201
  {
202
+ "entropy": 0.2872362457215786,
203
  "epoch": 0.9411764705882353,
204
+ "grad_norm": 0.29094481468200684,
205
  "learning_rate": 1.63e-05,
206
+ "loss": 0.2916,
207
+ "mean_token_accuracy": 0.9335257709026337,
208
+ "num_tokens": 865712.0,
209
  "step": 100
210
  },
211
  {
212
  "epoch": 0.9411764705882353,
213
+ "eval_entropy": 0.3926775233944257,
214
+ "eval_loss": 0.3854297995567322,
215
+ "eval_mean_token_accuracy": 0.9184372560183207,
216
+ "eval_num_tokens": 865712.0,
217
+ "eval_runtime": 26.027,
218
+ "eval_samples_per_second": 5.763,
219
+ "eval_steps_per_second": 5.763,
220
  "step": 100
221
  }
222
  ],
 
237
  "attributes": {}
238
  }
239
  },
240
+ "total_flos": 2041922721504768.0,
241
  "train_batch_size": 2,
242
  "trial_name": null,
243
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c99c3d3d0bfedc2d4a25d224f204b007b771f3ffcf74f9edf57603f164cfe386
3
- size 6097
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f775e201d96104912459fe7cf55fed26f17379938713adb7401917362d5db8d
3
+ size 6161