stefanocarrera commited on
Commit
8cdf54d
·
verified ·
1 Parent(s): fbe1e4d

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
33
- "up_proj",
34
- "k_proj",
35
- "v_proj",
36
  "o_proj",
 
 
 
37
  "down_proj",
38
- "gate_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
 
 
32
  "o_proj",
33
+ "gate_proj",
34
+ "k_proj",
35
+ "q_proj",
36
  "down_proj",
37
+ "v_proj",
38
+ "up_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99d569edfe594f0c62c62a99823ca2772d86d0bf64c287ec820ae443b07beaa1
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a30df352e3e5bf7a3be3ccca5e0bf0b3a9b19ac4eb509a3b6c3fbbccdd879fb
3
  size 83946192
checkpoint-150/adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "gate_proj",
33
- "up_proj",
34
- "q_proj",
35
- "v_proj",
36
  "k_proj",
 
37
  "down_proj",
38
- "o_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "o_proj",
33
  "gate_proj",
 
 
 
34
  "k_proj",
35
+ "q_proj",
36
  "down_proj",
37
+ "v_proj",
38
+ "up_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
checkpoint-150/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a44a4988e01e3c390b69e98347d898a87d3cf126ec0cb79df08350fad8ca7faa
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa9896d8e319bff7d9d35da65aba6a17142562f9de4a13413e0140189cb8db35
3
  size 83946192
checkpoint-150/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a42bc05db82290a4b40ebe09854f20ca5f421a7db097ca0451c5c9ec80006e2e
3
  size 85728997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e55538cf9956f93c36ccaecb1516773c281d3971699a863b515afecea63db183
3
  size 85728997
checkpoint-150/trainer_state.json CHANGED
@@ -10,1536 +10,1536 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 0.3660925142467022,
14
  "epoch": 0.01225114854517611,
15
- "grad_norm": 0.0166015625,
16
  "learning_rate": 0.0002,
17
- "loss": 0.0020782470237463713,
18
- "mean_token_accuracy": 0.9997171945869923,
19
- "num_tokens": 6092.0,
20
  "step": 1
21
  },
22
  {
23
- "entropy": 0.34051003493368626,
24
  "epoch": 0.02450229709035222,
25
- "grad_norm": 0.000823974609375,
26
  "learning_rate": 0.00019878048780487805,
27
- "loss": 9.216360922437161e-05,
28
- "mean_token_accuracy": 1.0,
29
- "num_tokens": 11535.0,
30
  "step": 2
31
  },
32
  {
33
- "entropy": 0.32960800640285015,
34
  "epoch": 0.036753445635528334,
35
- "grad_norm": 0.0098876953125,
36
  "learning_rate": 0.0001975609756097561,
37
- "loss": 0.0001977928914129734,
38
- "mean_token_accuracy": 1.0,
39
- "num_tokens": 16432.0,
40
  "step": 3
41
  },
42
  {
43
- "entropy": 0.33627333864569664,
44
  "epoch": 0.04900459418070444,
45
- "grad_norm": 0.06640625,
46
  "learning_rate": 0.00019634146341463416,
47
- "loss": 0.00977393426001072,
48
- "mean_token_accuracy": 0.9985632188618183,
49
- "num_tokens": 20507.0,
50
  "step": 4
51
  },
52
  {
53
- "entropy": 0.31916058249771595,
54
  "epoch": 0.06125574272588055,
55
- "grad_norm": 0.0003108978271484375,
56
  "learning_rate": 0.0001951219512195122,
57
- "loss": 5.0926646508742124e-05,
58
- "mean_token_accuracy": 1.0,
59
- "num_tokens": 26122.0,
60
  "step": 5
61
  },
62
  {
63
- "entropy": 0.3524587769061327,
64
  "epoch": 0.07350689127105667,
65
- "grad_norm": 0.000186920166015625,
66
  "learning_rate": 0.00019390243902439025,
67
- "loss": 4.6155335439834744e-05,
68
- "mean_token_accuracy": 1.0,
69
- "num_tokens": 30847.0,
70
  "step": 6
71
  },
72
  {
73
- "entropy": 0.3272323925048113,
74
  "epoch": 0.08575803981623277,
75
- "grad_norm": 0.005859375,
76
  "learning_rate": 0.0001926829268292683,
77
- "loss": 0.000202978597371839,
78
- "mean_token_accuracy": 1.0,
79
- "num_tokens": 36541.0,
80
  "step": 7
81
  },
82
  {
83
- "entropy": 0.347023731097579,
84
  "epoch": 0.09800918836140889,
85
- "grad_norm": 0.00072479248046875,
86
  "learning_rate": 0.00019146341463414633,
87
- "loss": 0.00011593783710850403,
88
- "mean_token_accuracy": 1.0,
89
- "num_tokens": 41001.0,
90
  "step": 8
91
  },
92
  {
93
- "entropy": 0.376500410027802,
94
  "epoch": 0.11026033690658499,
95
- "grad_norm": 0.09033203125,
96
  "learning_rate": 0.0001902439024390244,
97
- "loss": 0.008863622322678566,
98
- "mean_token_accuracy": 0.9979648105800152,
99
- "num_tokens": 45467.0,
100
  "step": 9
101
  },
102
  {
103
- "entropy": 0.3560014171525836,
104
  "epoch": 0.1225114854517611,
105
- "grad_norm": 0.055419921875,
106
  "learning_rate": 0.00018902439024390244,
107
- "loss": 0.004083322826772928,
108
- "mean_token_accuracy": 0.9990039840340614,
109
- "num_tokens": 50478.0,
110
  "step": 10
111
  },
112
  {
113
- "entropy": 0.3533000349998474,
114
  "epoch": 0.13476263399693722,
115
- "grad_norm": 0.0033721923828125,
116
  "learning_rate": 0.0001878048780487805,
117
- "loss": 0.000252897065365687,
118
- "mean_token_accuracy": 1.0,
119
- "num_tokens": 56181.0,
120
  "step": 11
121
  },
122
  {
123
- "entropy": 0.4079158063977957,
124
  "epoch": 0.14701378254211334,
125
- "grad_norm": 0.00110626220703125,
126
  "learning_rate": 0.00018658536585365856,
127
- "loss": 0.00019193078333046287,
128
- "mean_token_accuracy": 1.0,
129
- "num_tokens": 62946.0,
130
  "step": 12
131
  },
132
  {
133
- "entropy": 0.4043316235765815,
134
  "epoch": 0.15926493108728942,
135
- "grad_norm": 0.0021209716796875,
136
  "learning_rate": 0.0001853658536585366,
137
- "loss": 0.00025091503630392253,
138
- "mean_token_accuracy": 1.0,
139
- "num_tokens": 68436.0,
140
  "step": 13
141
  },
142
  {
143
- "entropy": 0.41207500360906124,
144
  "epoch": 0.17151607963246554,
145
- "grad_norm": 0.00139617919921875,
146
  "learning_rate": 0.00018414634146341464,
147
- "loss": 0.0002536335668992251,
148
- "mean_token_accuracy": 1.0,
149
- "num_tokens": 73603.0,
150
  "step": 14
151
  },
152
  {
153
- "entropy": 0.43669185042381287,
154
  "epoch": 0.18376722817764166,
155
- "grad_norm": 0.020751953125,
156
  "learning_rate": 0.0001829268292682927,
157
- "loss": 0.0008837866480462253,
158
- "mean_token_accuracy": 0.9994877055287361,
159
- "num_tokens": 77845.0,
160
  "step": 15
161
  },
162
  {
163
- "entropy": 0.41382858343422413,
164
  "epoch": 0.19601837672281777,
165
- "grad_norm": 0.0145263671875,
166
  "learning_rate": 0.00018170731707317075,
167
- "loss": 0.0006772386841475964,
168
- "mean_token_accuracy": 1.0,
169
- "num_tokens": 82744.0,
170
  "step": 16
171
  },
172
  {
173
- "entropy": 0.4243332091718912,
174
  "epoch": 0.2082695252679939,
175
- "grad_norm": 0.001922607421875,
176
  "learning_rate": 0.0001804878048780488,
177
- "loss": 0.00027059210697188973,
178
- "mean_token_accuracy": 1.0,
179
- "num_tokens": 87453.0,
180
  "step": 17
181
  },
182
  {
183
- "entropy": 0.4329488482326269,
184
  "epoch": 0.22052067381316998,
185
- "grad_norm": 0.004852294921875,
186
  "learning_rate": 0.00017926829268292684,
187
- "loss": 0.00031758740078657866,
188
- "mean_token_accuracy": 1.0,
189
- "num_tokens": 92321.0,
190
  "step": 18
191
  },
192
  {
193
- "entropy": 0.440301101654768,
194
  "epoch": 0.2327718223583461,
195
- "grad_norm": 0.005767822265625,
196
  "learning_rate": 0.00017804878048780488,
197
- "loss": 0.0004065934626851231,
198
- "mean_token_accuracy": 1.0,
199
- "num_tokens": 97146.0,
200
  "step": 19
201
  },
202
  {
203
- "entropy": 0.4400939680635929,
204
  "epoch": 0.2450229709035222,
205
- "grad_norm": 0.0023040771484375,
206
  "learning_rate": 0.00017682926829268295,
207
- "loss": 0.00020425915136002004,
208
- "mean_token_accuracy": 1.0,
209
- "num_tokens": 101943.0,
210
  "step": 20
211
  },
212
  {
213
- "entropy": 0.4579729177057743,
214
  "epoch": 0.2572741194486983,
215
- "grad_norm": 0.0286865234375,
216
  "learning_rate": 0.000175609756097561,
217
- "loss": 0.0015601275954395533,
218
- "mean_token_accuracy": 0.9996448867022991,
219
- "num_tokens": 106772.0,
220
  "step": 21
221
  },
222
  {
223
- "entropy": 0.40288309939205647,
224
  "epoch": 0.26952526799387444,
225
- "grad_norm": 0.00072479248046875,
226
  "learning_rate": 0.00017439024390243903,
227
- "loss": 9.121054608840495e-05,
228
- "mean_token_accuracy": 1.0,
229
- "num_tokens": 112558.0,
230
  "step": 22
231
  },
232
  {
233
- "entropy": 0.4252484003081918,
234
  "epoch": 0.28177641653905056,
235
- "grad_norm": 0.000457763671875,
236
  "learning_rate": 0.00017317073170731708,
237
- "loss": 8.147547487169504e-05,
238
- "mean_token_accuracy": 1.0,
239
- "num_tokens": 117489.0,
240
  "step": 23
241
  },
242
  {
243
- "entropy": 0.44810181483626366,
244
  "epoch": 0.29402756508422667,
245
- "grad_norm": 0.007720947265625,
246
  "learning_rate": 0.00017195121951219512,
247
- "loss": 0.0003956289147026837,
248
- "mean_token_accuracy": 1.0,
249
- "num_tokens": 123010.0,
250
  "step": 24
251
  },
252
  {
253
- "entropy": 0.4023376125842333,
254
  "epoch": 0.30627871362940273,
255
- "grad_norm": 0.00103759765625,
256
  "learning_rate": 0.0001707317073170732,
257
- "loss": 8.693434210726991e-05,
258
- "mean_token_accuracy": 1.0,
259
- "num_tokens": 127716.0,
260
  "step": 25
261
  },
262
  {
263
- "entropy": 0.4007954867556691,
264
  "epoch": 0.31852986217457885,
265
- "grad_norm": 0.00194549560546875,
266
  "learning_rate": 0.00016951219512195123,
267
- "loss": 8.696074655745178e-05,
268
- "mean_token_accuracy": 1.0,
269
- "num_tokens": 132372.0,
270
  "step": 26
271
  },
272
  {
273
- "entropy": 0.3759774696081877,
274
  "epoch": 0.33078101071975496,
275
- "grad_norm": 0.003387451171875,
276
  "learning_rate": 0.00016829268292682927,
277
- "loss": 0.00013623938139062375,
278
- "mean_token_accuracy": 1.0,
279
- "num_tokens": 137028.0,
280
  "step": 27
281
  },
282
  {
283
- "entropy": 0.40147540159523487,
284
  "epoch": 0.3430321592649311,
285
- "grad_norm": 0.0380859375,
286
  "learning_rate": 0.00016707317073170731,
287
- "loss": 0.005999124608933926,
288
- "mean_token_accuracy": 0.9987113401293755,
289
- "num_tokens": 142088.0,
290
  "step": 28
291
  },
292
  {
293
- "entropy": 0.38656803220510483,
294
  "epoch": 0.3552833078101072,
295
- "grad_norm": 0.0322265625,
296
  "learning_rate": 0.00016585365853658536,
297
- "loss": 0.00021061318693682551,
298
- "mean_token_accuracy": 1.0,
299
- "num_tokens": 147481.0,
300
  "step": 29
301
  },
302
  {
303
- "entropy": 0.4059827271848917,
304
  "epoch": 0.3675344563552833,
305
- "grad_norm": 0.00015163421630859375,
306
  "learning_rate": 0.00016463414634146343,
307
- "loss": 3.9411937905242667e-05,
308
- "mean_token_accuracy": 1.0,
309
- "num_tokens": 152973.0,
310
  "step": 30
311
  },
312
  {
313
- "entropy": 0.40111804008483887,
314
  "epoch": 0.37978560490045943,
315
- "grad_norm": 0.0003681182861328125,
316
  "learning_rate": 0.00016341463414634147,
317
- "loss": 5.111394784762524e-05,
318
- "mean_token_accuracy": 1.0,
319
- "num_tokens": 156786.0,
320
  "step": 31
321
  },
322
  {
323
- "entropy": 0.41568026319146156,
324
  "epoch": 0.39203675344563554,
325
- "grad_norm": 0.00162506103515625,
326
  "learning_rate": 0.00016219512195121954,
327
- "loss": 0.0001103500762837939,
328
- "mean_token_accuracy": 1.0,
329
- "num_tokens": 162859.0,
330
  "step": 32
331
  },
332
  {
333
- "entropy": 0.39988269470632076,
334
  "epoch": 0.40428790199081166,
335
- "grad_norm": 0.000518798828125,
336
  "learning_rate": 0.00016097560975609758,
337
- "loss": 6.166221282910556e-05,
338
- "mean_token_accuracy": 1.0,
339
- "num_tokens": 167969.0,
340
  "step": 33
341
  },
342
  {
343
- "entropy": 0.3738459562882781,
344
  "epoch": 0.4165390505359878,
345
- "grad_norm": 0.00537109375,
346
  "learning_rate": 0.00015975609756097562,
347
- "loss": 0.00012469613284338266,
348
- "mean_token_accuracy": 1.0,
349
- "num_tokens": 172518.0,
350
  "step": 34
351
  },
352
  {
353
- "entropy": 0.40653541777282953,
354
  "epoch": 0.42879019908116384,
355
- "grad_norm": 0.0031280517578125,
356
  "learning_rate": 0.00015853658536585366,
357
- "loss": 0.00010661048872862011,
358
- "mean_token_accuracy": 1.0,
359
- "num_tokens": 177085.0,
360
  "step": 35
361
  },
362
  {
363
- "entropy": 0.39361329190433025,
364
  "epoch": 0.44104134762633995,
365
- "grad_norm": 0.08154296875,
366
  "learning_rate": 0.00015731707317073173,
367
- "loss": 0.0010916765313595533,
368
- "mean_token_accuracy": 0.9990942031145096,
369
- "num_tokens": 181617.0,
370
  "step": 36
371
  },
372
  {
373
- "entropy": 0.358949625864625,
374
  "epoch": 0.45329249617151607,
375
- "grad_norm": 0.01080322265625,
376
  "learning_rate": 0.00015609756097560978,
377
- "loss": 0.0010772041277959943,
378
- "mean_token_accuracy": 0.9995535723865032,
379
- "num_tokens": 186836.0,
380
  "step": 37
381
  },
382
  {
383
- "entropy": 0.3930373042821884,
384
  "epoch": 0.4655436447166922,
385
- "grad_norm": 0.000461578369140625,
386
  "learning_rate": 0.00015487804878048782,
387
- "loss": 5.279047036310658e-05,
388
- "mean_token_accuracy": 1.0,
389
- "num_tokens": 191224.0,
390
  "step": 38
391
  },
392
  {
393
- "entropy": 0.35740520991384983,
394
  "epoch": 0.4777947932618683,
395
- "grad_norm": 0.000873565673828125,
396
  "learning_rate": 0.00015365853658536586,
397
- "loss": 5.439379674498923e-05,
398
- "mean_token_accuracy": 1.0,
399
- "num_tokens": 195926.0,
400
  "step": 39
401
  },
402
  {
403
- "entropy": 0.38909873832017183,
404
  "epoch": 0.4900459418070444,
405
- "grad_norm": 0.0257568359375,
406
  "learning_rate": 0.0001524390243902439,
407
- "loss": 0.0015194097068160772,
408
- "mean_token_accuracy": 0.999550361186266,
409
- "num_tokens": 200772.0,
410
  "step": 40
411
  },
412
  {
413
- "entropy": 0.36850977689027786,
414
  "epoch": 0.5022970903522205,
415
- "grad_norm": 0.1064453125,
416
  "learning_rate": 0.00015121951219512197,
417
- "loss": 0.002955856267362833,
418
- "mean_token_accuracy": 0.9993872530758381,
419
- "num_tokens": 204499.0,
420
  "step": 41
421
  },
422
  {
423
- "entropy": 0.3940112106502056,
424
  "epoch": 0.5145482388973966,
425
- "grad_norm": 0.00885009765625,
426
  "learning_rate": 0.00015000000000000001,
427
- "loss": 0.000253106962190941,
428
- "mean_token_accuracy": 1.0,
429
- "num_tokens": 208814.0,
430
  "step": 42
431
  },
432
  {
433
- "entropy": 0.39878340624272823,
434
  "epoch": 0.5267993874425727,
435
- "grad_norm": 0.037841796875,
436
  "learning_rate": 0.00014878048780487806,
437
- "loss": 0.0007202713750302792,
438
- "mean_token_accuracy": 0.9995833337306976,
439
- "num_tokens": 213907.0,
440
  "step": 43
441
  },
442
  {
443
- "entropy": 0.41587444953620434,
444
  "epoch": 0.5390505359877489,
445
- "grad_norm": 0.0004177093505859375,
446
  "learning_rate": 0.0001475609756097561,
447
- "loss": 6.820505223004147e-05,
448
- "mean_token_accuracy": 1.0,
449
- "num_tokens": 218988.0,
450
  "step": 44
451
  },
452
  {
453
- "entropy": 0.3888211837038398,
454
  "epoch": 0.5513016845329249,
455
- "grad_norm": 0.007568359375,
456
  "learning_rate": 0.00014634146341463414,
457
- "loss": 0.000737900089006871,
458
- "mean_token_accuracy": 0.9995967745780945,
459
- "num_tokens": 223595.0,
460
  "step": 45
461
  },
462
  {
463
- "entropy": 0.4139576517045498,
464
  "epoch": 0.5635528330781011,
465
- "grad_norm": 0.014892578125,
466
  "learning_rate": 0.0001451219512195122,
467
- "loss": 0.0006043408066034317,
468
- "mean_token_accuracy": 0.9995192289352417,
469
- "num_tokens": 228244.0,
470
  "step": 46
471
  },
472
  {
473
- "entropy": 0.39713083021342754,
474
  "epoch": 0.5758039816232772,
475
- "grad_norm": 0.00046539306640625,
476
  "learning_rate": 0.00014390243902439025,
477
- "loss": 8.217584399972111e-05,
478
- "mean_token_accuracy": 1.0,
479
- "num_tokens": 232606.0,
480
  "step": 47
481
  },
482
  {
483
- "entropy": 0.40557617880403996,
484
  "epoch": 0.5880551301684533,
485
- "grad_norm": 0.0009918212890625,
486
  "learning_rate": 0.0001426829268292683,
487
- "loss": 0.00012616875756066293,
488
- "mean_token_accuracy": 1.0,
489
- "num_tokens": 236563.0,
490
  "step": 48
491
  },
492
  {
493
- "entropy": 0.43470797687768936,
494
  "epoch": 0.6003062787136294,
495
- "grad_norm": 0.0238037109375,
496
  "learning_rate": 0.00014146341463414634,
497
- "loss": 0.0010796654969453812,
498
- "mean_token_accuracy": 0.999465811997652,
499
- "num_tokens": 241214.0,
500
  "step": 49
501
  },
502
  {
503
- "entropy": 0.4234541580080986,
504
  "epoch": 0.6125574272588055,
505
- "grad_norm": 0.02783203125,
506
  "learning_rate": 0.00014024390243902438,
507
- "loss": 0.0009178520413115621,
508
- "mean_token_accuracy": 0.9996565915644169,
509
- "num_tokens": 245200.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
- "eval_entropy": 0.4022736955380094,
515
- "eval_loss": 0.0006544959614984691,
516
- "eval_mean_token_accuracy": 0.9998166846192401,
517
- "eval_num_tokens": 245200.0,
518
- "eval_runtime": 51.0138,
519
- "eval_samples_per_second": 1.353,
520
- "eval_steps_per_second": 1.353,
521
  "step": 50
522
  },
523
  {
524
- "entropy": 0.41674751229584217,
525
  "epoch": 0.6248085758039816,
526
- "grad_norm": 0.00131988525390625,
527
  "learning_rate": 0.00013902439024390245,
528
- "loss": 0.0001285702601308003,
529
- "mean_token_accuracy": 1.0,
530
- "num_tokens": 249761.0,
531
  "step": 51
532
  },
533
  {
534
- "entropy": 0.42886597104370594,
535
  "epoch": 0.6370597243491577,
536
- "grad_norm": 0.00171661376953125,
537
  "learning_rate": 0.0001378048780487805,
538
- "loss": 0.00014620381989516318,
539
- "mean_token_accuracy": 1.0,
540
- "num_tokens": 254787.0,
541
  "step": 52
542
  },
543
  {
544
- "entropy": 0.4423276912420988,
545
  "epoch": 0.6493108728943339,
546
- "grad_norm": 0.038818359375,
547
  "learning_rate": 0.00013658536585365856,
548
- "loss": 0.003947169054299593,
549
- "mean_token_accuracy": 0.9983357414603233,
550
- "num_tokens": 260287.0,
551
  "step": 53
552
  },
553
  {
554
- "entropy": 0.3989156847819686,
555
  "epoch": 0.6615620214395099,
556
- "grad_norm": 0.0211181640625,
557
  "learning_rate": 0.0001353658536585366,
558
- "loss": 0.00047477131010964513,
559
- "mean_token_accuracy": 0.9998249299824238,
560
- "num_tokens": 264810.0,
561
  "step": 54
562
  },
563
  {
564
- "entropy": 0.4272368475794792,
565
  "epoch": 0.6738131699846861,
566
- "grad_norm": 0.029052734375,
567
  "learning_rate": 0.00013414634146341464,
568
- "loss": 0.00408769678324461,
569
- "mean_token_accuracy": 0.9993622452020645,
570
- "num_tokens": 270386.0,
571
  "step": 55
572
  },
573
  {
574
- "entropy": 0.44703495875000954,
575
  "epoch": 0.6860643185298622,
576
- "grad_norm": 0.01202392578125,
577
  "learning_rate": 0.0001329268292682927,
578
- "loss": 0.00038261126610450447,
579
- "mean_token_accuracy": 1.0,
580
- "num_tokens": 274391.0,
581
  "step": 56
582
  },
583
  {
584
- "entropy": 0.4288428146392107,
585
  "epoch": 0.6983154670750383,
586
- "grad_norm": 0.01019287109375,
587
  "learning_rate": 0.00013170731707317076,
588
- "loss": 0.0003242077073082328,
589
- "mean_token_accuracy": 1.0,
590
- "num_tokens": 279716.0,
591
  "step": 57
592
  },
593
  {
594
- "entropy": 0.37452960200607777,
595
  "epoch": 0.7105666156202144,
596
- "grad_norm": 0.021728515625,
597
  "learning_rate": 0.0001304878048780488,
598
- "loss": 0.0027725810650736094,
599
- "mean_token_accuracy": 0.9994703382253647,
600
- "num_tokens": 285404.0,
601
  "step": 58
602
  },
603
  {
604
- "entropy": 0.4130611680448055,
605
  "epoch": 0.7228177641653905,
606
- "grad_norm": 0.04541015625,
607
  "learning_rate": 0.00012926829268292684,
608
- "loss": 0.0017543239519000053,
609
- "mean_token_accuracy": 0.9995689652860165,
610
- "num_tokens": 289992.0,
611
  "step": 59
612
  },
613
  {
614
- "entropy": 0.41101630590856075,
615
  "epoch": 0.7350689127105666,
616
- "grad_norm": 0.00078582763671875,
617
  "learning_rate": 0.00012804878048780488,
618
- "loss": 9.316274372395128e-05,
619
- "mean_token_accuracy": 1.0,
620
- "num_tokens": 294861.0,
621
  "step": 60
622
  },
623
  {
624
- "entropy": 0.3678157525137067,
625
  "epoch": 0.7473200612557427,
626
- "grad_norm": 0.00058746337890625,
627
  "learning_rate": 0.00012682926829268293,
628
- "loss": 8.83688626345247e-05,
629
- "mean_token_accuracy": 1.0,
630
- "num_tokens": 300355.0,
631
  "step": 61
632
  },
633
  {
634
- "entropy": 0.40994635969400406,
635
  "epoch": 0.7595712098009189,
636
- "grad_norm": 0.0015869140625,
637
  "learning_rate": 0.000125609756097561,
638
- "loss": 8.545083983335644e-05,
639
- "mean_token_accuracy": 1.0,
640
- "num_tokens": 305776.0,
641
  "step": 62
642
  },
643
  {
644
- "entropy": 0.37295936793088913,
645
  "epoch": 0.7718223583460949,
646
- "grad_norm": 0.000827789306640625,
647
  "learning_rate": 0.00012439024390243904,
648
- "loss": 7.97374959802255e-05,
649
- "mean_token_accuracy": 1.0,
650
- "num_tokens": 310204.0,
651
  "step": 63
652
  },
653
  {
654
- "entropy": 0.36804571095854044,
655
  "epoch": 0.7840735068912711,
656
- "grad_norm": 0.0002880096435546875,
657
  "learning_rate": 0.00012317073170731708,
658
- "loss": 6.0703161580022424e-05,
659
- "mean_token_accuracy": 1.0,
660
- "num_tokens": 314205.0,
661
  "step": 64
662
  },
663
  {
664
- "entropy": 0.3904844745993614,
665
  "epoch": 0.7963246554364471,
666
- "grad_norm": 0.0019989013671875,
667
  "learning_rate": 0.00012195121951219512,
668
- "loss": 7.91027705417946e-05,
669
- "mean_token_accuracy": 1.0,
670
- "num_tokens": 319157.0,
671
  "step": 65
672
  },
673
  {
674
- "entropy": 0.3921838700771332,
675
  "epoch": 0.8085758039816233,
676
- "grad_norm": 0.00177764892578125,
677
  "learning_rate": 0.00012073170731707318,
678
- "loss": 8.364896348211914e-05,
679
- "mean_token_accuracy": 1.0,
680
- "num_tokens": 324681.0,
681
  "step": 66
682
  },
683
  {
684
- "entropy": 0.34572961553931236,
685
  "epoch": 0.8208269525267994,
686
- "grad_norm": 0.061767578125,
687
  "learning_rate": 0.00011951219512195122,
688
- "loss": 0.008409281261265278,
689
- "mean_token_accuracy": 0.9963545724749565,
690
- "num_tokens": 329941.0,
691
  "step": 67
692
  },
693
  {
694
- "entropy": 0.3841299172490835,
695
  "epoch": 0.8330781010719756,
696
- "grad_norm": 0.01123046875,
697
  "learning_rate": 0.00011829268292682926,
698
- "loss": 0.00017956709780264646,
699
- "mean_token_accuracy": 1.0,
700
- "num_tokens": 334486.0,
701
  "step": 68
702
  },
703
  {
704
- "entropy": 0.39541577361524105,
705
  "epoch": 0.8453292496171516,
706
- "grad_norm": 0.00274658203125,
707
  "learning_rate": 0.00011707317073170732,
708
- "loss": 0.00012585960212163627,
709
- "mean_token_accuracy": 1.0,
710
- "num_tokens": 338183.0,
711
  "step": 69
712
  },
713
  {
714
- "entropy": 0.4046988161280751,
715
  "epoch": 0.8575803981623277,
716
- "grad_norm": 0.126953125,
717
  "learning_rate": 0.00011585365853658536,
718
- "loss": 0.007125813513994217,
719
- "mean_token_accuracy": 0.9981492757797241,
720
- "num_tokens": 342593.0,
721
  "step": 70
722
  },
723
  {
724
- "entropy": 0.40994592756032944,
725
  "epoch": 0.8698315467075038,
726
- "grad_norm": 0.0517578125,
727
  "learning_rate": 0.00011463414634146342,
728
- "loss": 0.0006066925125196576,
729
- "mean_token_accuracy": 0.9997807033360004,
730
- "num_tokens": 347797.0,
731
  "step": 71
732
  },
733
  {
734
- "entropy": 0.3796220198273659,
735
  "epoch": 0.8820826952526799,
736
- "grad_norm": 0.006103515625,
737
  "learning_rate": 0.00011341463414634146,
738
- "loss": 0.00017896694771479815,
739
- "mean_token_accuracy": 1.0,
740
- "num_tokens": 352121.0,
741
  "step": 72
742
  },
743
  {
744
- "entropy": 0.3931356444954872,
745
  "epoch": 0.8943338437978561,
746
- "grad_norm": 0.0181884765625,
747
  "learning_rate": 0.00011219512195121953,
748
- "loss": 0.0010632644407451153,
749
- "mean_token_accuracy": 0.9997568093240261,
750
- "num_tokens": 357943.0,
751
  "step": 73
752
  },
753
  {
754
- "entropy": 0.36392936669290066,
755
  "epoch": 0.9065849923430321,
756
- "grad_norm": 0.024658203125,
757
  "learning_rate": 0.00011097560975609757,
758
- "loss": 0.0006849091150797904,
759
- "mean_token_accuracy": 0.9996345043182373,
760
- "num_tokens": 363814.0,
761
  "step": 74
762
  },
763
  {
764
- "entropy": 0.3864069525152445,
765
  "epoch": 0.9188361408882083,
766
- "grad_norm": 0.000270843505859375,
767
  "learning_rate": 0.00010975609756097563,
768
- "loss": 5.0294114771531895e-05,
769
- "mean_token_accuracy": 1.0,
770
- "num_tokens": 368870.0,
771
  "step": 75
772
  },
773
  {
774
- "entropy": 0.39719677343964577,
775
  "epoch": 0.9310872894333844,
776
- "grad_norm": 0.01519775390625,
777
  "learning_rate": 0.00010853658536585367,
778
- "loss": 0.00048823675024323165,
779
- "mean_token_accuracy": 0.999143835157156,
780
- "num_tokens": 373670.0,
781
  "step": 76
782
  },
783
  {
784
- "entropy": 0.35627279058098793,
785
  "epoch": 0.9433384379785605,
786
- "grad_norm": 0.0074462890625,
787
  "learning_rate": 0.00010731707317073172,
788
- "loss": 0.000174719825736247,
789
- "mean_token_accuracy": 1.0,
790
- "num_tokens": 379037.0,
791
  "step": 77
792
  },
793
  {
794
- "entropy": 0.38681978918612003,
795
  "epoch": 0.9555895865237366,
796
- "grad_norm": 0.0181884765625,
797
  "learning_rate": 0.00010609756097560977,
798
- "loss": 0.000976942596025765,
799
- "mean_token_accuracy": 0.9992977529764175,
800
- "num_tokens": 384252.0,
801
  "step": 78
802
  },
803
  {
804
- "entropy": 0.3772548586130142,
805
  "epoch": 0.9678407350689127,
806
- "grad_norm": 0.000904083251953125,
807
  "learning_rate": 0.00010487804878048781,
808
- "loss": 6.608536932617426e-05,
809
- "mean_token_accuracy": 1.0,
810
- "num_tokens": 388347.0,
811
  "step": 79
812
  },
813
  {
814
- "entropy": 0.3597776433452964,
815
  "epoch": 0.9800918836140888,
816
- "grad_norm": 0.010986328125,
817
  "learning_rate": 0.00010365853658536586,
818
- "loss": 0.0007963755051605403,
819
- "mean_token_accuracy": 0.999015748500824,
820
- "num_tokens": 394213.0,
821
  "step": 80
822
  },
823
  {
824
- "entropy": 0.3731031287461519,
825
  "epoch": 0.9923430321592649,
826
- "grad_norm": 0.00115966796875,
827
  "learning_rate": 0.0001024390243902439,
828
- "loss": 8.310518751386553e-05,
829
- "mean_token_accuracy": 1.0,
830
- "num_tokens": 399113.0,
831
  "step": 81
832
  },
833
  {
834
- "entropy": 0.37349462509155273,
835
  "epoch": 1.0,
836
- "grad_norm": 0.00022125244140625,
837
  "learning_rate": 0.00010121951219512196,
838
- "loss": 4.093759343959391e-05,
839
- "mean_token_accuracy": 1.0,
840
- "num_tokens": 402129.0,
841
  "step": 82
842
  },
843
  {
844
- "entropy": 0.38408348336815834,
845
  "epoch": 1.0122511485451762,
846
- "grad_norm": 0.027099609375,
847
  "learning_rate": 0.0001,
848
- "loss": 0.0015746817225590348,
849
- "mean_token_accuracy": 0.9996279776096344,
850
- "num_tokens": 406760.0,
851
  "step": 83
852
  },
853
  {
854
- "entropy": 0.36415083333849907,
855
  "epoch": 1.0245022970903521,
856
- "grad_norm": 0.0032501220703125,
857
  "learning_rate": 9.878048780487805e-05,
858
- "loss": 0.00011362869554432109,
859
- "mean_token_accuracy": 1.0,
860
- "num_tokens": 411366.0,
861
  "step": 84
862
  },
863
  {
864
- "entropy": 0.3951573334634304,
865
  "epoch": 1.0367534456355283,
866
- "grad_norm": 0.0018768310546875,
867
  "learning_rate": 9.75609756097561e-05,
868
- "loss": 8.601781155448407e-05,
869
- "mean_token_accuracy": 1.0,
870
- "num_tokens": 417767.0,
871
  "step": 85
872
  },
873
  {
874
- "entropy": 0.3533172570168972,
875
  "epoch": 1.0490045941807045,
876
- "grad_norm": 0.00146484375,
877
  "learning_rate": 9.634146341463415e-05,
878
- "loss": 5.874271664652042e-05,
879
- "mean_token_accuracy": 1.0,
880
- "num_tokens": 421737.0,
881
  "step": 86
882
  },
883
  {
884
- "entropy": 0.35251205042004585,
885
  "epoch": 1.0612557427258806,
886
- "grad_norm": 6.008148193359375e-05,
887
  "learning_rate": 9.51219512195122e-05,
888
- "loss": 2.1197016394580714e-05,
889
- "mean_token_accuracy": 1.0,
890
- "num_tokens": 426853.0,
891
  "step": 87
892
  },
893
  {
894
- "entropy": 0.42304582707583904,
895
  "epoch": 1.0735068912710566,
896
- "grad_norm": 0.000797271728515625,
897
  "learning_rate": 9.390243902439024e-05,
898
- "loss": 6.177897739689797e-05,
899
- "mean_token_accuracy": 1.0,
900
- "num_tokens": 431082.0,
901
  "step": 88
902
  },
903
  {
904
- "entropy": 0.39542090706527233,
905
  "epoch": 1.0857580398162328,
906
- "grad_norm": 0.041015625,
907
  "learning_rate": 9.26829268292683e-05,
908
- "loss": 0.0009606232051737607,
909
- "mean_token_accuracy": 1.0,
910
- "num_tokens": 435693.0,
911
  "step": 89
912
  },
913
  {
914
- "entropy": 0.37046173214912415,
915
  "epoch": 1.098009188361409,
916
- "grad_norm": 0.000278472900390625,
917
  "learning_rate": 9.146341463414635e-05,
918
- "loss": 4.265129246050492e-05,
919
- "mean_token_accuracy": 1.0,
920
- "num_tokens": 440577.0,
921
  "step": 90
922
  },
923
  {
924
- "entropy": 0.3931607408449054,
925
  "epoch": 1.110260336906585,
926
- "grad_norm": 0.035400390625,
927
  "learning_rate": 9.02439024390244e-05,
928
- "loss": 0.004250116180628538,
929
- "mean_token_accuracy": 0.9994369372725487,
930
- "num_tokens": 445265.0,
931
  "step": 91
932
  },
933
  {
934
- "entropy": 0.3917137086391449,
935
  "epoch": 1.122511485451761,
936
- "grad_norm": 0.0419921875,
937
  "learning_rate": 8.902439024390244e-05,
938
- "loss": 0.002317648846656084,
939
- "mean_token_accuracy": 0.9992785975337029,
940
- "num_tokens": 450020.0,
941
  "step": 92
942
  },
943
  {
944
- "entropy": 0.3758338335901499,
945
  "epoch": 1.1347626339969372,
946
- "grad_norm": 0.0196533203125,
947
  "learning_rate": 8.78048780487805e-05,
948
- "loss": 0.0006808089674450457,
949
- "mean_token_accuracy": 0.999522902071476,
950
- "num_tokens": 455003.0,
951
  "step": 93
952
  },
953
  {
954
- "entropy": 0.383782709017396,
955
  "epoch": 1.1470137825421134,
956
- "grad_norm": 0.0034027099609375,
957
  "learning_rate": 8.658536585365854e-05,
958
- "loss": 7.263245788635686e-05,
959
- "mean_token_accuracy": 1.0,
960
- "num_tokens": 459698.0,
961
  "step": 94
962
  },
963
  {
964
- "entropy": 0.3821055982261896,
965
  "epoch": 1.1592649310872893,
966
- "grad_norm": 0.0004138946533203125,
967
  "learning_rate": 8.53658536585366e-05,
968
- "loss": 3.771902629523538e-05,
969
- "mean_token_accuracy": 1.0,
970
- "num_tokens": 464337.0,
971
  "step": 95
972
  },
973
  {
974
- "entropy": 0.3649219311773777,
975
  "epoch": 1.1715160796324655,
976
- "grad_norm": 0.00872802734375,
977
  "learning_rate": 8.414634146341464e-05,
978
- "loss": 0.0004717935808002949,
979
- "mean_token_accuracy": 1.0,
980
- "num_tokens": 468882.0,
981
  "step": 96
982
  },
983
  {
984
- "entropy": 0.3700664434581995,
985
  "epoch": 1.1837672281776417,
986
- "grad_norm": 0.00015544891357421875,
987
  "learning_rate": 8.292682926829268e-05,
988
- "loss": 3.247045970056206e-05,
989
- "mean_token_accuracy": 1.0,
990
- "num_tokens": 473756.0,
991
  "step": 97
992
  },
993
  {
994
- "entropy": 0.3915936965495348,
995
  "epoch": 1.1960183767228179,
996
- "grad_norm": 0.05078125,
997
  "learning_rate": 8.170731707317073e-05,
998
- "loss": 0.005024694371968508,
999
- "mean_token_accuracy": 0.9996565915644169,
1000
- "num_tokens": 479061.0,
1001
  "step": 98
1002
  },
1003
  {
1004
- "entropy": 0.4096358586102724,
1005
  "epoch": 1.2082695252679938,
1006
- "grad_norm": 0.00144195556640625,
1007
  "learning_rate": 8.048780487804879e-05,
1008
- "loss": 4.485135286813602e-05,
1009
- "mean_token_accuracy": 1.0,
1010
- "num_tokens": 484835.0,
1011
  "step": 99
1012
  },
1013
  {
1014
- "entropy": 0.35138822346925735,
1015
  "epoch": 1.22052067381317,
1016
- "grad_norm": 0.0038299560546875,
1017
  "learning_rate": 7.926829268292683e-05,
1018
- "loss": 0.00019770213111769408,
1019
- "mean_token_accuracy": 1.0,
1020
- "num_tokens": 489546.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
- "eval_entropy": 0.3780687239722929,
1026
- "eval_loss": 0.00034746917663142085,
1027
- "eval_mean_token_accuracy": 0.9999171840971794,
1028
- "eval_num_tokens": 489546.0,
1029
- "eval_runtime": 50.9982,
1030
- "eval_samples_per_second": 1.353,
1031
- "eval_steps_per_second": 1.353,
1032
  "step": 100
1033
  },
1034
  {
1035
- "entropy": 0.4012060575187206,
1036
  "epoch": 1.2327718223583461,
1037
- "grad_norm": 0.000217437744140625,
1038
  "learning_rate": 7.804878048780489e-05,
1039
- "loss": 3.667730197776109e-05,
1040
- "mean_token_accuracy": 1.0,
1041
- "num_tokens": 494781.0,
1042
  "step": 101
1043
  },
1044
  {
1045
- "entropy": 0.37181732058525085,
1046
  "epoch": 1.245022970903522,
1047
- "grad_norm": 0.0002155303955078125,
1048
  "learning_rate": 7.682926829268293e-05,
1049
- "loss": 2.923922693298664e-05,
1050
- "mean_token_accuracy": 1.0,
1051
- "num_tokens": 499861.0,
1052
  "step": 102
1053
  },
1054
  {
1055
- "entropy": 0.38948795571923256,
1056
  "epoch": 1.2572741194486983,
1057
- "grad_norm": 6.866455078125e-05,
1058
  "learning_rate": 7.560975609756099e-05,
1059
- "loss": 3.10177420033142e-05,
1060
- "mean_token_accuracy": 1.0,
1061
- "num_tokens": 505291.0,
1062
  "step": 103
1063
  },
1064
  {
1065
- "entropy": 0.3776157572865486,
1066
  "epoch": 1.2695252679938744,
1067
- "grad_norm": 0.00012874603271484375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
- "loss": 2.8559963539009914e-05,
1070
- "mean_token_accuracy": 1.0,
1071
- "num_tokens": 510284.0,
1072
  "step": 104
1073
  },
1074
  {
1075
- "entropy": 0.3941178657114506,
1076
  "epoch": 1.2817764165390506,
1077
- "grad_norm": 0.007232666015625,
1078
  "learning_rate": 7.317073170731707e-05,
1079
- "loss": 0.0008174990070983768,
1080
- "mean_token_accuracy": 1.0,
1081
- "num_tokens": 514517.0,
1082
  "step": 105
1083
  },
1084
  {
1085
- "entropy": 0.3697250857949257,
1086
  "epoch": 1.2940275650842268,
1087
- "grad_norm": 0.003143310546875,
1088
  "learning_rate": 7.195121951219513e-05,
1089
- "loss": 0.00010880863555939868,
1090
- "mean_token_accuracy": 1.0,
1091
- "num_tokens": 519535.0,
1092
  "step": 106
1093
  },
1094
  {
1095
- "entropy": 0.3888526763767004,
1096
  "epoch": 1.3062787136294027,
1097
- "grad_norm": 0.00054931640625,
1098
  "learning_rate": 7.073170731707317e-05,
1099
- "loss": 5.111205973662436e-05,
1100
- "mean_token_accuracy": 1.0,
1101
- "num_tokens": 524397.0,
1102
  "step": 107
1103
  },
1104
  {
1105
- "entropy": 0.3866258058696985,
1106
  "epoch": 1.318529862174579,
1107
- "grad_norm": 0.0004100799560546875,
1108
  "learning_rate": 6.951219512195122e-05,
1109
- "loss": 3.999587715952657e-05,
1110
- "mean_token_accuracy": 1.0,
1111
- "num_tokens": 528997.0,
1112
  "step": 108
1113
  },
1114
  {
1115
- "entropy": 0.3921303730458021,
1116
  "epoch": 1.3307810107197549,
1117
- "grad_norm": 0.000885009765625,
1118
  "learning_rate": 6.829268292682928e-05,
1119
- "loss": 6.128583481768146e-05,
1120
- "mean_token_accuracy": 1.0,
1121
- "num_tokens": 533965.0,
1122
  "step": 109
1123
  },
1124
  {
1125
- "entropy": 0.3705854155123234,
1126
  "epoch": 1.343032159264931,
1127
- "grad_norm": 0.002960205078125,
1128
  "learning_rate": 6.707317073170732e-05,
1129
- "loss": 7.792656106175855e-05,
1130
- "mean_token_accuracy": 1.0,
1131
- "num_tokens": 539387.0,
1132
  "step": 110
1133
  },
1134
  {
1135
- "entropy": 0.3712622048333287,
1136
  "epoch": 1.3552833078101072,
1137
- "grad_norm": 0.00089263916015625,
1138
  "learning_rate": 6.585365853658538e-05,
1139
- "loss": 4.521696246229112e-05,
1140
- "mean_token_accuracy": 1.0,
1141
- "num_tokens": 543755.0,
1142
  "step": 111
1143
  },
1144
  {
1145
- "entropy": 0.40867704525589943,
1146
  "epoch": 1.3675344563552834,
1147
- "grad_norm": 0.023193359375,
1148
  "learning_rate": 6.463414634146342e-05,
1149
- "loss": 0.003280676668509841,
1150
- "mean_token_accuracy": 0.9978448264300823,
1151
- "num_tokens": 548188.0,
1152
  "step": 112
1153
  },
1154
  {
1155
- "entropy": 0.3910982459783554,
1156
  "epoch": 1.3797856049004595,
1157
- "grad_norm": 0.0028533935546875,
1158
  "learning_rate": 6.341463414634146e-05,
1159
- "loss": 0.00015341158723458648,
1160
- "mean_token_accuracy": 1.0,
1161
- "num_tokens": 553717.0,
1162
  "step": 113
1163
  },
1164
  {
1165
- "entropy": 0.3753495467826724,
1166
  "epoch": 1.3920367534456355,
1167
- "grad_norm": 6.866455078125e-05,
1168
  "learning_rate": 6.219512195121952e-05,
1169
- "loss": 2.554376442276407e-05,
1170
- "mean_token_accuracy": 1.0,
1171
- "num_tokens": 558501.0,
1172
  "step": 114
1173
  },
1174
  {
1175
- "entropy": 0.3936616498976946,
1176
  "epoch": 1.4042879019908117,
1177
- "grad_norm": 0.000774383544921875,
1178
  "learning_rate": 6.097560975609756e-05,
1179
- "loss": 4.565157360048033e-05,
1180
- "mean_token_accuracy": 1.0,
1181
- "num_tokens": 563989.0,
1182
  "step": 115
1183
  },
1184
  {
1185
- "entropy": 0.4080927763134241,
1186
  "epoch": 1.4165390505359878,
1187
- "grad_norm": 0.000728607177734375,
1188
  "learning_rate": 5.975609756097561e-05,
1189
- "loss": 5.44461581739597e-05,
1190
- "mean_token_accuracy": 1.0,
1191
- "num_tokens": 568327.0,
1192
  "step": 116
1193
  },
1194
  {
1195
- "entropy": 0.36639871448278427,
1196
  "epoch": 1.4287901990811638,
1197
- "grad_norm": 0.000457763671875,
1198
  "learning_rate": 5.853658536585366e-05,
1199
- "loss": 3.381741407793015e-05,
1200
- "mean_token_accuracy": 1.0,
1201
- "num_tokens": 572919.0,
1202
  "step": 117
1203
  },
1204
  {
1205
- "entropy": 0.4015892669558525,
1206
  "epoch": 1.44104134762634,
1207
- "grad_norm": 0.00017833709716796875,
1208
  "learning_rate": 5.731707317073171e-05,
1209
- "loss": 4.158892625127919e-05,
1210
- "mean_token_accuracy": 1.0,
1211
- "num_tokens": 577916.0,
1212
  "step": 118
1213
  },
1214
  {
1215
- "entropy": 0.40410150960087776,
1216
  "epoch": 1.4532924961715161,
1217
- "grad_norm": 0.000621795654296875,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
- "loss": 2.5736055249581113e-05,
1220
- "mean_token_accuracy": 1.0,
1221
- "num_tokens": 583152.0,
1222
  "step": 119
1223
  },
1224
  {
1225
- "entropy": 0.40528898034244776,
1226
  "epoch": 1.4655436447166923,
1227
- "grad_norm": 0.01953125,
1228
  "learning_rate": 5.487804878048781e-05,
1229
- "loss": 0.00020874114125035703,
1230
- "mean_token_accuracy": 1.0,
1231
- "num_tokens": 587880.0,
1232
  "step": 120
1233
  },
1234
  {
1235
- "entropy": 0.35937592945992947,
1236
  "epoch": 1.4777947932618682,
1237
- "grad_norm": 0.083984375,
1238
  "learning_rate": 5.365853658536586e-05,
1239
- "loss": 0.007331337314099073,
1240
- "mean_token_accuracy": 0.9991379305720329,
1241
- "num_tokens": 592284.0,
1242
  "step": 121
1243
  },
1244
  {
1245
- "entropy": 0.3928218297660351,
1246
  "epoch": 1.4900459418070444,
1247
- "grad_norm": 0.00013446807861328125,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
- "loss": 2.927147943410091e-05,
1250
- "mean_token_accuracy": 1.0,
1251
- "num_tokens": 597046.0,
1252
  "step": 122
1253
  },
1254
  {
1255
- "entropy": 0.3777940608561039,
1256
  "epoch": 1.5022970903522204,
1257
- "grad_norm": 0.000579833984375,
1258
  "learning_rate": 5.121951219512195e-05,
1259
- "loss": 6.0145219322294e-05,
1260
- "mean_token_accuracy": 1.0,
1261
- "num_tokens": 601350.0,
1262
  "step": 123
1263
  },
1264
  {
1265
- "entropy": 0.39830240048468113,
1266
  "epoch": 1.5145482388973965,
1267
- "grad_norm": 0.0245361328125,
1268
  "learning_rate": 5e-05,
1269
- "loss": 0.00029612769139930606,
1270
- "mean_token_accuracy": 1.0,
1271
- "num_tokens": 606643.0,
1272
  "step": 124
1273
  },
1274
  {
1275
- "entropy": 0.3925098739564419,
1276
  "epoch": 1.5267993874425727,
1277
- "grad_norm": 0.0004749298095703125,
1278
  "learning_rate": 4.878048780487805e-05,
1279
- "loss": 4.631431511370465e-05,
1280
- "mean_token_accuracy": 1.0,
1281
- "num_tokens": 612405.0,
1282
  "step": 125
1283
  },
1284
  {
1285
- "entropy": 0.3956710360944271,
1286
  "epoch": 1.5390505359877489,
1287
- "grad_norm": 0.00634765625,
1288
  "learning_rate": 4.75609756097561e-05,
1289
- "loss": 8.446360880043358e-05,
1290
- "mean_token_accuracy": 1.0,
1291
- "num_tokens": 617227.0,
1292
  "step": 126
1293
  },
1294
  {
1295
- "entropy": 0.430975291877985,
1296
  "epoch": 1.551301684532925,
1297
- "grad_norm": 0.000518798828125,
1298
  "learning_rate": 4.634146341463415e-05,
1299
- "loss": 6.132836278993636e-05,
1300
- "mean_token_accuracy": 1.0,
1301
- "num_tokens": 622353.0,
1302
  "step": 127
1303
  },
1304
  {
1305
- "entropy": 0.4242272228002548,
1306
  "epoch": 1.5635528330781012,
1307
- "grad_norm": 0.0025177001953125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
- "loss": 0.00011561957217054442,
1310
- "mean_token_accuracy": 1.0,
1311
- "num_tokens": 627267.0,
1312
  "step": 128
1313
  },
1314
  {
1315
- "entropy": 0.3710012398660183,
1316
  "epoch": 1.5758039816232772,
1317
- "grad_norm": 0.002777099609375,
1318
  "learning_rate": 4.390243902439025e-05,
1319
- "loss": 0.00010202911653323099,
1320
- "mean_token_accuracy": 1.0,
1321
- "num_tokens": 631452.0,
1322
  "step": 129
1323
  },
1324
  {
1325
- "entropy": 0.35699679516255856,
1326
  "epoch": 1.5880551301684533,
1327
- "grad_norm": 0.00023651123046875,
1328
  "learning_rate": 4.26829268292683e-05,
1329
- "loss": 5.903129203943536e-05,
1330
- "mean_token_accuracy": 1.0,
1331
- "num_tokens": 636500.0,
1332
  "step": 130
1333
  },
1334
  {
1335
- "entropy": 0.39619251526892185,
1336
  "epoch": 1.6003062787136293,
1337
- "grad_norm": 0.0230712890625,
1338
  "learning_rate": 4.146341463414634e-05,
1339
- "loss": 0.0031676713842898607,
1340
- "mean_token_accuracy": 0.9987796545028687,
1341
- "num_tokens": 641262.0,
1342
  "step": 131
1343
  },
1344
  {
1345
- "entropy": 0.40411114878952503,
1346
  "epoch": 1.6125574272588055,
1347
- "grad_norm": 0.0361328125,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
- "loss": 0.0015652105212211609,
1350
- "mean_token_accuracy": 0.999205507338047,
1351
- "num_tokens": 646375.0,
1352
  "step": 132
1353
  },
1354
  {
1355
- "entropy": 0.3453770913183689,
1356
  "epoch": 1.6248085758039816,
1357
- "grad_norm": 8.440017700195312e-05,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
- "loss": 3.279931843280792e-05,
1360
- "mean_token_accuracy": 1.0,
1361
- "num_tokens": 650765.0,
1362
  "step": 133
1363
  },
1364
  {
1365
- "entropy": 0.37724466249346733,
1366
  "epoch": 1.6370597243491578,
1367
- "grad_norm": 0.00142669677734375,
1368
  "learning_rate": 3.780487804878049e-05,
1369
- "loss": 5.4958236432867125e-05,
1370
- "mean_token_accuracy": 1.0,
1371
- "num_tokens": 655167.0,
1372
  "step": 134
1373
  },
1374
  {
1375
- "entropy": 0.39796170592308044,
1376
  "epoch": 1.649310872894334,
1377
- "grad_norm": 0.0003986358642578125,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
- "loss": 3.9815466152504086e-05,
1380
- "mean_token_accuracy": 1.0,
1381
- "num_tokens": 660288.0,
1382
  "step": 135
1383
  },
1384
  {
1385
- "entropy": 0.4333613757044077,
1386
  "epoch": 1.66156202143951,
1387
- "grad_norm": 0.0001544952392578125,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
- "loss": 4.787950456375256e-05,
1390
- "mean_token_accuracy": 1.0,
1391
- "num_tokens": 664471.0,
1392
  "step": 136
1393
  },
1394
  {
1395
- "entropy": 0.41916552372276783,
1396
  "epoch": 1.673813169984686,
1397
- "grad_norm": 0.0002899169921875,
1398
  "learning_rate": 3.414634146341464e-05,
1399
- "loss": 4.767990321852267e-05,
1400
- "mean_token_accuracy": 1.0,
1401
- "num_tokens": 669354.0,
1402
  "step": 137
1403
  },
1404
  {
1405
- "entropy": 0.3999825790524483,
1406
  "epoch": 1.686064318529862,
1407
- "grad_norm": 0.0026397705078125,
1408
  "learning_rate": 3.292682926829269e-05,
1409
- "loss": 0.0001605500146979466,
1410
- "mean_token_accuracy": 1.0,
1411
- "num_tokens": 674909.0,
1412
  "step": 138
1413
  },
1414
  {
1415
- "entropy": 0.39421058259904385,
1416
  "epoch": 1.6983154670750382,
1417
- "grad_norm": 0.005767822265625,
1418
  "learning_rate": 3.170731707317073e-05,
1419
- "loss": 0.00022102531511336565,
1420
- "mean_token_accuracy": 1.0,
1421
- "num_tokens": 679690.0,
1422
  "step": 139
1423
  },
1424
  {
1425
- "entropy": 0.4142182134091854,
1426
  "epoch": 1.7105666156202144,
1427
- "grad_norm": 0.003631591796875,
1428
  "learning_rate": 3.048780487804878e-05,
1429
- "loss": 0.00014472004841081798,
1430
- "mean_token_accuracy": 1.0,
1431
- "num_tokens": 685046.0,
1432
  "step": 140
1433
  },
1434
  {
1435
- "entropy": 0.3982192352414131,
1436
  "epoch": 1.7228177641653906,
1437
- "grad_norm": 0.00019168853759765625,
1438
  "learning_rate": 2.926829268292683e-05,
1439
- "loss": 4.7273264499381185e-05,
1440
- "mean_token_accuracy": 1.0,
1441
- "num_tokens": 689394.0,
1442
  "step": 141
1443
  },
1444
  {
1445
- "entropy": 0.4133493732661009,
1446
  "epoch": 1.7350689127105667,
1447
- "grad_norm": 0.00701904296875,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
- "loss": 9.296434291172773e-05,
1450
- "mean_token_accuracy": 1.0,
1451
- "num_tokens": 693187.0,
1452
  "step": 142
1453
  },
1454
  {
1455
- "entropy": 0.40933855436742306,
1456
  "epoch": 1.7473200612557427,
1457
- "grad_norm": 0.0019683837890625,
1458
  "learning_rate": 2.682926829268293e-05,
1459
- "loss": 9.476351988269016e-05,
1460
- "mean_token_accuracy": 1.0,
1461
- "num_tokens": 697601.0,
1462
  "step": 143
1463
  },
1464
  {
1465
- "entropy": 0.41714910976588726,
1466
  "epoch": 1.7595712098009189,
1467
- "grad_norm": 0.045166015625,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
- "loss": 0.0034146099351346493,
1470
- "mean_token_accuracy": 0.9998650103807449,
1471
- "num_tokens": 703048.0,
1472
  "step": 144
1473
  },
1474
  {
1475
- "entropy": 0.40594901144504547,
1476
  "epoch": 1.7718223583460948,
1477
- "grad_norm": 0.02587890625,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
- "loss": 0.001274456619285047,
1480
- "mean_token_accuracy": 0.999015748500824,
1481
- "num_tokens": 707860.0,
1482
  "step": 145
1483
  },
1484
  {
1485
- "entropy": 0.41635255329310894,
1486
  "epoch": 1.784073506891271,
1487
- "grad_norm": 0.000156402587890625,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
- "loss": 5.037836672272533e-05,
1490
- "mean_token_accuracy": 1.0,
1491
- "num_tokens": 712819.0,
1492
  "step": 146
1493
  },
1494
  {
1495
- "entropy": 0.4038653904572129,
1496
  "epoch": 1.7963246554364471,
1497
- "grad_norm": 0.0004100799560546875,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
- "loss": 4.163683479418978e-05,
1500
- "mean_token_accuracy": 1.0,
1501
- "num_tokens": 718451.0,
1502
  "step": 147
1503
  },
1504
  {
1505
- "entropy": 0.4069879539310932,
1506
  "epoch": 1.8085758039816233,
1507
- "grad_norm": 7.104873657226562e-05,
1508
  "learning_rate": 2.073170731707317e-05,
1509
- "loss": 3.6120818549534306e-05,
1510
- "mean_token_accuracy": 1.0,
1511
- "num_tokens": 723808.0,
1512
  "step": 148
1513
  },
1514
  {
1515
- "entropy": 0.4381860624998808,
1516
  "epoch": 1.8208269525267995,
1517
- "grad_norm": 9.870529174804688e-05,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
- "loss": 4.433648064150475e-05,
1520
- "mean_token_accuracy": 1.0,
1521
- "num_tokens": 728124.0,
1522
  "step": 149
1523
  },
1524
  {
1525
- "entropy": 0.42220813781023026,
1526
  "epoch": 1.8330781010719757,
1527
- "grad_norm": 0.005401611328125,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
- "loss": 0.00017107791791204363,
1530
- "mean_token_accuracy": 1.0,
1531
- "num_tokens": 733915.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
- "eval_entropy": 0.4000617520532746,
1537
- "eval_loss": 0.00023719228920526803,
1538
- "eval_mean_token_accuracy": 0.9998813841653906,
1539
- "eval_num_tokens": 733915.0,
1540
- "eval_runtime": 50.9031,
1541
- "eval_samples_per_second": 1.356,
1542
- "eval_steps_per_second": 1.356,
1543
  "step": 150
1544
  }
1545
  ],
@@ -1560,7 +1560,7 @@
1560
  "attributes": {}
1561
  }
1562
  },
1563
- "total_flos": 3.323255650111488e+16,
1564
  "train_batch_size": 1,
1565
  "trial_name": null,
1566
  "trial_params": null
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 0.2316489452496171,
14
  "epoch": 0.01225114854517611,
15
+ "grad_norm": 1.21875,
16
  "learning_rate": 0.0002,
17
+ "loss": 0.1141367182135582,
18
+ "mean_token_accuracy": 0.962372187525034,
19
+ "num_tokens": 6133.0,
20
  "step": 1
21
  },
22
  {
23
+ "entropy": 0.2494401354342699,
24
  "epoch": 0.02450229709035222,
25
+ "grad_norm": 0.59765625,
26
  "learning_rate": 0.00019878048780487805,
27
+ "loss": 0.07354862987995148,
28
+ "mean_token_accuracy": 0.9755491837859154,
29
+ "num_tokens": 12088.0,
30
  "step": 2
31
  },
32
  {
33
+ "entropy": 0.31152926199138165,
34
  "epoch": 0.036753445635528334,
35
+ "grad_norm": 0.306640625,
36
  "learning_rate": 0.0001975609756097561,
37
+ "loss": 0.06412772834300995,
38
+ "mean_token_accuracy": 0.978853102773428,
39
+ "num_tokens": 17331.0,
40
  "step": 3
41
  },
42
  {
43
+ "entropy": 0.30638211220502853,
44
  "epoch": 0.04900459418070444,
45
+ "grad_norm": 0.8984375,
46
  "learning_rate": 0.00019634146341463416,
47
+ "loss": 0.08034519106149673,
48
+ "mean_token_accuracy": 0.9723691493272781,
49
+ "num_tokens": 22383.0,
50
  "step": 4
51
  },
52
  {
53
+ "entropy": 0.3171741934493184,
54
  "epoch": 0.06125574272588055,
55
+ "grad_norm": 0.60546875,
56
  "learning_rate": 0.0001951219512195122,
57
+ "loss": 0.07083277404308319,
58
+ "mean_token_accuracy": 0.9742059484124184,
59
+ "num_tokens": 27930.0,
60
  "step": 5
61
  },
62
  {
63
+ "entropy": 0.3094687405973673,
64
  "epoch": 0.07350689127105667,
65
+ "grad_norm": 0.6796875,
66
  "learning_rate": 0.00019390243902439025,
67
+ "loss": 0.08443780243396759,
68
+ "mean_token_accuracy": 0.9732540361583233,
69
+ "num_tokens": 33286.0,
70
  "step": 6
71
  },
72
  {
73
+ "entropy": 0.2914603017270565,
74
  "epoch": 0.08575803981623277,
75
+ "grad_norm": 0.265625,
76
  "learning_rate": 0.0001926829268292683,
77
+ "loss": 0.06558080017566681,
78
+ "mean_token_accuracy": 0.9725310951471329,
79
+ "num_tokens": 39568.0,
80
  "step": 7
81
  },
82
  {
83
+ "entropy": 0.279434559866786,
84
  "epoch": 0.09800918836140889,
85
+ "grad_norm": 0.58984375,
86
  "learning_rate": 0.00019146341463414633,
87
+ "loss": 0.07338608056306839,
88
+ "mean_token_accuracy": 0.9793376848101616,
89
+ "num_tokens": 44597.0,
90
  "step": 8
91
  },
92
  {
93
+ "entropy": 0.27481516171246767,
94
  "epoch": 0.11026033690658499,
95
+ "grad_norm": 0.3125,
96
  "learning_rate": 0.0001902439024390244,
97
+ "loss": 0.06733334064483643,
98
+ "mean_token_accuracy": 0.9732998013496399,
99
+ "num_tokens": 49848.0,
100
  "step": 9
101
  },
102
  {
103
+ "entropy": 0.2752347318455577,
104
  "epoch": 0.1225114854517611,
105
+ "grad_norm": 0.4296875,
106
  "learning_rate": 0.00018902439024390244,
107
+ "loss": 0.08688339591026306,
108
+ "mean_token_accuracy": 0.9711812101304531,
109
+ "num_tokens": 55087.0,
110
  "step": 10
111
  },
112
  {
113
+ "entropy": 0.23697010707110167,
114
  "epoch": 0.13476263399693722,
115
+ "grad_norm": 0.35546875,
116
  "learning_rate": 0.0001878048780487805,
117
+ "loss": 0.09419302642345428,
118
+ "mean_token_accuracy": 0.9671205654740334,
119
+ "num_tokens": 61901.0,
120
  "step": 11
121
  },
122
  {
123
+ "entropy": 0.2767820842564106,
124
  "epoch": 0.14701378254211334,
125
+ "grad_norm": 0.5078125,
126
  "learning_rate": 0.00018658536585365856,
127
+ "loss": 0.09175145626068115,
128
+ "mean_token_accuracy": 0.9672112688422203,
129
+ "num_tokens": 68472.0,
130
  "step": 12
131
  },
132
  {
133
+ "entropy": 0.2712240917608142,
134
  "epoch": 0.15926493108728942,
135
+ "grad_norm": 0.43359375,
136
  "learning_rate": 0.0001853658536585366,
137
+ "loss": 0.1060388907790184,
138
+ "mean_token_accuracy": 0.9682641178369522,
139
+ "num_tokens": 74380.0,
140
  "step": 13
141
  },
142
  {
143
+ "entropy": 0.2655314621515572,
144
  "epoch": 0.17151607963246554,
145
+ "grad_norm": 0.5234375,
146
  "learning_rate": 0.00018414634146341464,
147
+ "loss": 0.09543660283088684,
148
+ "mean_token_accuracy": 0.9580898210406303,
149
+ "num_tokens": 80297.0,
150
  "step": 14
151
  },
152
  {
153
+ "entropy": 0.2568928087130189,
154
  "epoch": 0.18376722817764166,
155
+ "grad_norm": 0.306640625,
156
  "learning_rate": 0.0001829268292682927,
157
+ "loss": 0.05766459181904793,
158
+ "mean_token_accuracy": 0.9795842878520489,
159
+ "num_tokens": 85162.0,
160
  "step": 15
161
  },
162
  {
163
+ "entropy": 0.27691631484776735,
164
  "epoch": 0.19601837672281777,
165
+ "grad_norm": 0.359375,
166
  "learning_rate": 0.00018170731707317075,
167
+ "loss": 0.0939052402973175,
168
+ "mean_token_accuracy": 0.9671713933348656,
169
+ "num_tokens": 90393.0,
170
  "step": 16
171
  },
172
  {
173
+ "entropy": 0.2810298567637801,
174
  "epoch": 0.2082695252679939,
175
+ "grad_norm": 0.26953125,
176
  "learning_rate": 0.0001804878048780488,
177
+ "loss": 0.058892831206321716,
178
+ "mean_token_accuracy": 0.9773643910884857,
179
+ "num_tokens": 95530.0,
180
  "step": 17
181
  },
182
  {
183
+ "entropy": 0.2796283131465316,
184
  "epoch": 0.22052067381316998,
185
+ "grad_norm": 0.345703125,
186
  "learning_rate": 0.00017926829268292684,
187
+ "loss": 0.07744893431663513,
188
+ "mean_token_accuracy": 0.9721782878041267,
189
+ "num_tokens": 101234.0,
190
  "step": 18
191
  },
192
  {
193
+ "entropy": 0.2912421654909849,
194
  "epoch": 0.2327718223583461,
195
+ "grad_norm": 0.48828125,
196
  "learning_rate": 0.00017804878048780488,
197
+ "loss": 0.07593704760074615,
198
+ "mean_token_accuracy": 0.9668422974646091,
199
+ "num_tokens": 107018.0,
200
  "step": 19
201
  },
202
  {
203
+ "entropy": 0.28678335808217525,
204
  "epoch": 0.2450229709035222,
205
+ "grad_norm": 0.337890625,
206
  "learning_rate": 0.00017682926829268295,
207
+ "loss": 0.07227691262960434,
208
+ "mean_token_accuracy": 0.9736582525074482,
209
+ "num_tokens": 112299.0,
210
  "step": 20
211
  },
212
  {
213
+ "entropy": 0.296040833927691,
214
  "epoch": 0.2572741194486983,
215
+ "grad_norm": 0.33203125,
216
  "learning_rate": 0.000175609756097561,
217
+ "loss": 0.07230418920516968,
218
+ "mean_token_accuracy": 0.9750959761440754,
219
+ "num_tokens": 117872.0,
220
  "step": 21
221
  },
222
  {
223
+ "entropy": 0.27195548359304667,
224
  "epoch": 0.26952526799387444,
225
+ "grad_norm": 0.3671875,
226
  "learning_rate": 0.00017439024390243903,
227
+ "loss": 0.08706101030111313,
228
+ "mean_token_accuracy": 0.9771376326680183,
229
+ "num_tokens": 124580.0,
230
  "step": 22
231
  },
232
  {
233
+ "entropy": 0.29904199205338955,
234
  "epoch": 0.28177641653905056,
235
+ "grad_norm": 0.408203125,
236
  "learning_rate": 0.00017317073170731708,
237
+ "loss": 0.0653143897652626,
238
+ "mean_token_accuracy": 0.9760479032993317,
239
+ "num_tokens": 129745.0,
240
  "step": 23
241
  },
242
  {
243
+ "entropy": 0.2986137717962265,
244
  "epoch": 0.29402756508422667,
245
+ "grad_norm": 0.421875,
246
  "learning_rate": 0.00017195121951219512,
247
+ "loss": 0.07193314284086227,
248
+ "mean_token_accuracy": 0.9698839113116264,
249
+ "num_tokens": 135543.0,
250
  "step": 24
251
  },
252
  {
253
+ "entropy": 0.24683671910315752,
254
  "epoch": 0.30627871362940273,
255
+ "grad_norm": 0.37890625,
256
  "learning_rate": 0.0001707317073170732,
257
+ "loss": 0.07017349451780319,
258
+ "mean_token_accuracy": 0.9763788469135761,
259
+ "num_tokens": 141145.0,
260
  "step": 25
261
  },
262
  {
263
+ "entropy": 0.23581106960773468,
264
  "epoch": 0.31852986217457885,
265
+ "grad_norm": 0.349609375,
266
  "learning_rate": 0.00016951219512195123,
267
+ "loss": 0.07848861813545227,
268
+ "mean_token_accuracy": 0.9711455926299095,
269
+ "num_tokens": 146832.0,
270
  "step": 26
271
  },
272
  {
273
+ "entropy": 0.19877766259014606,
274
  "epoch": 0.33078101071975496,
275
+ "grad_norm": 0.32421875,
276
  "learning_rate": 0.00016829268292682927,
277
+ "loss": 0.05964134261012077,
278
+ "mean_token_accuracy": 0.9766620621085167,
279
+ "num_tokens": 153062.0,
280
  "step": 27
281
  },
282
  {
283
+ "entropy": 0.24412551056593657,
284
  "epoch": 0.3430321592649311,
285
+ "grad_norm": 0.466796875,
286
  "learning_rate": 0.00016707317073170731,
287
+ "loss": 0.10119230300188065,
288
+ "mean_token_accuracy": 0.9631960429251194,
289
+ "num_tokens": 159097.0,
290
  "step": 28
291
  },
292
  {
293
+ "entropy": 0.2634996743872762,
294
  "epoch": 0.3552833078101072,
295
+ "grad_norm": 0.376953125,
296
  "learning_rate": 0.00016585365853658536,
297
+ "loss": 0.07137235254049301,
298
+ "mean_token_accuracy": 0.9721279740333557,
299
+ "num_tokens": 164465.0,
300
  "step": 29
301
  },
302
  {
303
+ "entropy": 0.2398172626271844,
304
  "epoch": 0.3675344563552833,
305
+ "grad_norm": 0.380859375,
306
  "learning_rate": 0.00016463414634146343,
307
+ "loss": 0.08367905020713806,
308
+ "mean_token_accuracy": 0.9688702113926411,
309
+ "num_tokens": 171131.0,
310
  "step": 30
311
  },
312
  {
313
+ "entropy": 0.2387447776272893,
314
  "epoch": 0.37978560490045943,
315
+ "grad_norm": 0.39453125,
316
  "learning_rate": 0.00016341463414634147,
317
+ "loss": 0.07410822808742523,
318
+ "mean_token_accuracy": 0.9765294268727303,
319
+ "num_tokens": 175655.0,
320
  "step": 31
321
  },
322
  {
323
+ "entropy": 0.24556818418204784,
324
  "epoch": 0.39203675344563554,
325
+ "grad_norm": 0.361328125,
326
  "learning_rate": 0.00016219512195121954,
327
+ "loss": 0.07339000701904297,
328
+ "mean_token_accuracy": 0.9750400222837925,
329
+ "num_tokens": 182309.0,
330
  "step": 32
331
  },
332
  {
333
+ "entropy": 0.23958251252770424,
334
  "epoch": 0.40428790199081166,
335
+ "grad_norm": 0.376953125,
336
  "learning_rate": 0.00016097560975609758,
337
+ "loss": 0.0825161263346672,
338
+ "mean_token_accuracy": 0.9695910774171352,
339
+ "num_tokens": 188122.0,
340
  "step": 33
341
  },
342
  {
343
+ "entropy": 0.25066179782152176,
344
  "epoch": 0.4165390505359878,
345
+ "grad_norm": 0.34765625,
346
  "learning_rate": 0.00015975609756097562,
347
+ "loss": 0.0681036114692688,
348
+ "mean_token_accuracy": 0.9773549512028694,
349
+ "num_tokens": 193308.0,
350
  "step": 34
351
  },
352
  {
353
+ "entropy": 0.2489402163773775,
354
  "epoch": 0.42879019908116384,
355
+ "grad_norm": 0.33984375,
356
  "learning_rate": 0.00015853658536585366,
357
+ "loss": 0.07768924534320831,
358
+ "mean_token_accuracy": 0.9787707962095737,
359
+ "num_tokens": 198904.0,
360
  "step": 35
361
  },
362
  {
363
+ "entropy": 0.25176819786429405,
364
  "epoch": 0.44104134762633995,
365
+ "grad_norm": 0.353515625,
366
  "learning_rate": 0.00015731707317073173,
367
+ "loss": 0.07323021441698074,
368
+ "mean_token_accuracy": 0.9740425609052181,
369
+ "num_tokens": 204184.0,
370
  "step": 36
371
  },
372
  {
373
+ "entropy": 0.23491865396499634,
374
  "epoch": 0.45329249617151607,
375
+ "grad_norm": 0.345703125,
376
  "learning_rate": 0.00015609756097560978,
377
+ "loss": 0.06643179059028625,
378
+ "mean_token_accuracy": 0.9767155349254608,
379
+ "num_tokens": 210362.0,
380
  "step": 37
381
  },
382
  {
383
+ "entropy": 0.25266142282634974,
384
  "epoch": 0.4655436447166922,
385
+ "grad_norm": 0.50390625,
386
  "learning_rate": 0.00015487804878048782,
387
+ "loss": 0.08636192977428436,
388
+ "mean_token_accuracy": 0.9685244522988796,
389
+ "num_tokens": 215483.0,
390
  "step": 38
391
  },
392
  {
393
+ "entropy": 0.24919006042182446,
394
  "epoch": 0.4777947932618683,
395
+ "grad_norm": 0.357421875,
396
  "learning_rate": 0.00015365853658536586,
397
+ "loss": 0.06912290304899216,
398
+ "mean_token_accuracy": 0.9728152006864548,
399
+ "num_tokens": 220437.0,
400
  "step": 39
401
  },
402
  {
403
+ "entropy": 0.2789237005636096,
404
  "epoch": 0.4900459418070444,
405
+ "grad_norm": 0.3671875,
406
  "learning_rate": 0.0001524390243902439,
407
+ "loss": 0.07096827030181885,
408
+ "mean_token_accuracy": 0.9718564338982105,
409
+ "num_tokens": 225444.0,
410
  "step": 40
411
  },
412
  {
413
+ "entropy": 0.23915204405784607,
414
  "epoch": 0.5022970903522205,
415
+ "grad_norm": 0.35546875,
416
  "learning_rate": 0.00015121951219512197,
417
+ "loss": 0.06407603621482849,
418
+ "mean_token_accuracy": 0.975932989269495,
419
+ "num_tokens": 230088.0,
420
  "step": 41
421
  },
422
  {
423
+ "entropy": 0.25953691080212593,
424
  "epoch": 0.5145482388973966,
425
+ "grad_norm": 0.365234375,
426
  "learning_rate": 0.00015000000000000001,
427
+ "loss": 0.07893452048301697,
428
+ "mean_token_accuracy": 0.9717175625264645,
429
+ "num_tokens": 234974.0,
430
  "step": 42
431
  },
432
  {
433
+ "entropy": 0.25131134409457445,
434
  "epoch": 0.5267993874425727,
435
+ "grad_norm": 0.33984375,
436
  "learning_rate": 0.00014878048780487806,
437
+ "loss": 0.0724797397851944,
438
+ "mean_token_accuracy": 0.9746548496186733,
439
+ "num_tokens": 240695.0,
440
  "step": 43
441
  },
442
  {
443
+ "entropy": 0.25067666731774807,
444
  "epoch": 0.5390505359877489,
445
+ "grad_norm": 0.44921875,
446
  "learning_rate": 0.0001475609756097561,
447
+ "loss": 0.06145863234996796,
448
+ "mean_token_accuracy": 0.9786989763379097,
449
+ "num_tokens": 246515.0,
450
  "step": 44
451
  },
452
  {
453
+ "entropy": 0.22192941885441542,
454
  "epoch": 0.5513016845329249,
455
+ "grad_norm": 0.4375,
456
  "learning_rate": 0.00014634146341463414,
457
+ "loss": 0.06996186822652817,
458
+ "mean_token_accuracy": 0.9778482280671597,
459
+ "num_tokens": 252150.0,
460
  "step": 45
461
  },
462
  {
463
+ "entropy": 0.24868111684918404,
464
  "epoch": 0.5635528330781011,
465
+ "grad_norm": 0.392578125,
466
  "learning_rate": 0.0001451219512195122,
467
+ "loss": 0.07759839296340942,
468
+ "mean_token_accuracy": 0.9743853285908699,
469
+ "num_tokens": 257699.0,
470
  "step": 46
471
  },
472
  {
473
+ "entropy": 0.2405283828265965,
474
  "epoch": 0.5758039816232772,
475
+ "grad_norm": 0.400390625,
476
  "learning_rate": 0.00014390243902439025,
477
+ "loss": 0.06918229907751083,
478
+ "mean_token_accuracy": 0.9726257510483265,
479
+ "num_tokens": 262974.0,
480
  "step": 47
481
  },
482
  {
483
+ "entropy": 0.2463641557842493,
484
  "epoch": 0.5880551301684533,
485
+ "grad_norm": 0.5078125,
486
  "learning_rate": 0.0001426829268292683,
487
+ "loss": 0.08698121458292007,
488
+ "mean_token_accuracy": 0.9751730673015118,
489
+ "num_tokens": 267714.0,
490
  "step": 48
491
  },
492
  {
493
+ "entropy": 0.2611560570076108,
494
  "epoch": 0.6003062787136294,
495
+ "grad_norm": 0.3203125,
496
  "learning_rate": 0.00014146341463414634,
497
+ "loss": 0.0795765370130539,
498
+ "mean_token_accuracy": 0.9706047028303146,
499
+ "num_tokens": 273102.0,
500
  "step": 49
501
  },
502
  {
503
+ "entropy": 0.24631980434060097,
504
  "epoch": 0.6125574272588055,
505
+ "grad_norm": 0.365234375,
506
  "learning_rate": 0.00014024390243902438,
507
+ "loss": 0.06434721499681473,
508
+ "mean_token_accuracy": 0.9787219613790512,
509
+ "num_tokens": 278414.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
+ "eval_entropy": 0.25439983627934387,
515
+ "eval_loss": 0.07568059861660004,
516
+ "eval_mean_token_accuracy": 0.9709554686062578,
517
+ "eval_num_tokens": 278414.0,
518
+ "eval_runtime": 56.679,
519
+ "eval_samples_per_second": 1.217,
520
+ "eval_steps_per_second": 1.217,
521
  "step": 50
522
  },
523
  {
524
+ "entropy": 0.22273720148950815,
525
  "epoch": 0.6248085758039816,
526
+ "grad_norm": 0.330078125,
527
  "learning_rate": 0.00013902439024390245,
528
+ "loss": 0.06272563338279724,
529
+ "mean_token_accuracy": 0.9790237173438072,
530
+ "num_tokens": 284001.0,
531
  "step": 51
532
  },
533
  {
534
+ "entropy": 0.25650967564433813,
535
  "epoch": 0.6370597243491577,
536
+ "grad_norm": 0.3515625,
537
  "learning_rate": 0.0001378048780487805,
538
+ "loss": 0.0695340633392334,
539
+ "mean_token_accuracy": 0.9723741784691811,
540
+ "num_tokens": 289900.0,
541
  "step": 52
542
  },
543
  {
544
+ "entropy": 0.27689922973513603,
545
  "epoch": 0.6493108728943339,
546
+ "grad_norm": 0.443359375,
547
  "learning_rate": 0.00013658536585365856,
548
+ "loss": 0.08247513324022293,
549
+ "mean_token_accuracy": 0.9751085750758648,
550
+ "num_tokens": 295774.0,
551
  "step": 53
552
  },
553
  {
554
+ "entropy": 0.24619914591312408,
555
  "epoch": 0.6615620214395099,
556
+ "grad_norm": 0.349609375,
557
  "learning_rate": 0.0001353658536585366,
558
+ "loss": 0.06673211604356766,
559
+ "mean_token_accuracy": 0.9788386225700378,
560
+ "num_tokens": 300970.0,
561
  "step": 54
562
  },
563
  {
564
+ "entropy": 0.27198443934321404,
565
  "epoch": 0.6738131699846861,
566
+ "grad_norm": 0.4921875,
567
  "learning_rate": 0.00013414634146341464,
568
+ "loss": 0.07676997035741806,
569
+ "mean_token_accuracy": 0.9696366749703884,
570
+ "num_tokens": 306709.0,
571
  "step": 55
572
  },
573
  {
574
+ "entropy": 0.2689105300232768,
575
  "epoch": 0.6860643185298622,
576
+ "grad_norm": 0.47265625,
577
  "learning_rate": 0.0001329268292682927,
578
+ "loss": 0.06719915568828583,
579
+ "mean_token_accuracy": 0.9702229462563992,
580
+ "num_tokens": 311650.0,
581
  "step": 56
582
  },
583
  {
584
+ "entropy": 0.2787257097661495,
585
  "epoch": 0.6983154670750383,
586
+ "grad_norm": 0.369140625,
587
  "learning_rate": 0.00013170731707317076,
588
+ "loss": 0.07159961760044098,
589
+ "mean_token_accuracy": 0.9748533591628075,
590
+ "num_tokens": 317257.0,
591
  "step": 57
592
  },
593
  {
594
+ "entropy": 0.2522663725540042,
595
  "epoch": 0.7105666156202144,
596
+ "grad_norm": 0.31640625,
597
  "learning_rate": 0.0001304878048780488,
598
+ "loss": 0.08856096863746643,
599
+ "mean_token_accuracy": 0.9697616137564182,
600
+ "num_tokens": 323281.0,
601
  "step": 58
602
  },
603
  {
604
+ "entropy": 0.24693416617810726,
605
  "epoch": 0.7228177641653905,
606
+ "grad_norm": 0.37109375,
607
  "learning_rate": 0.00012926829268292684,
608
+ "loss": 0.07423190027475357,
609
+ "mean_token_accuracy": 0.9705353751778603,
610
+ "num_tokens": 328551.0,
611
  "step": 59
612
  },
613
  {
614
+ "entropy": 0.2651137877255678,
615
  "epoch": 0.7350689127105666,
616
+ "grad_norm": 0.43359375,
617
  "learning_rate": 0.00012804878048780488,
618
+ "loss": 0.0738719031214714,
619
+ "mean_token_accuracy": 0.9752235859632492,
620
+ "num_tokens": 334143.0,
621
  "step": 60
622
  },
623
  {
624
+ "entropy": 0.2281778110191226,
625
  "epoch": 0.7473200612557427,
626
+ "grad_norm": 0.2490234375,
627
  "learning_rate": 0.00012682926829268293,
628
+ "loss": 0.0633026584982872,
629
+ "mean_token_accuracy": 0.9741999059915543,
630
+ "num_tokens": 341092.0,
631
  "step": 61
632
  },
633
  {
634
+ "entropy": 0.2535929596051574,
635
  "epoch": 0.7595712098009189,
636
+ "grad_norm": 0.390625,
637
  "learning_rate": 0.000125609756097561,
638
+ "loss": 0.0719546377658844,
639
+ "mean_token_accuracy": 0.9765410870313644,
640
+ "num_tokens": 347467.0,
641
  "step": 62
642
  },
643
  {
644
+ "entropy": 0.25424638390541077,
645
  "epoch": 0.7718223583460949,
646
+ "grad_norm": 0.431640625,
647
  "learning_rate": 0.00012439024390243904,
648
+ "loss": 0.05535401031374931,
649
+ "mean_token_accuracy": 0.9780425503849983,
650
+ "num_tokens": 352164.0,
651
  "step": 63
652
  },
653
  {
654
+ "entropy": 0.23888325225561857,
655
  "epoch": 0.7840735068912711,
656
+ "grad_norm": 0.435546875,
657
  "learning_rate": 0.00012317073170731708,
658
+ "loss": 0.07177040725946426,
659
+ "mean_token_accuracy": 0.9734687805175781,
660
+ "num_tokens": 357308.0,
661
  "step": 64
662
  },
663
  {
664
+ "entropy": 0.27028472628444433,
665
  "epoch": 0.7963246554364471,
666
+ "grad_norm": 0.30859375,
667
  "learning_rate": 0.00012195121951219512,
668
+ "loss": 0.06257087737321854,
669
+ "mean_token_accuracy": 0.9774579927325249,
670
+ "num_tokens": 362666.0,
671
  "step": 65
672
  },
673
  {
674
+ "entropy": 0.2821849435567856,
675
  "epoch": 0.8085758039816233,
676
+ "grad_norm": 0.373046875,
677
  "learning_rate": 0.00012073170731707318,
678
+ "loss": 0.06471723318099976,
679
+ "mean_token_accuracy": 0.976191334426403,
680
+ "num_tokens": 368427.0,
681
  "step": 66
682
  },
683
  {
684
+ "entropy": 0.22186184907332063,
685
  "epoch": 0.8208269525267994,
686
+ "grad_norm": 0.263671875,
687
  "learning_rate": 0.00011951219512195122,
688
+ "loss": 0.06329935044050217,
689
+ "mean_token_accuracy": 0.978707954287529,
690
+ "num_tokens": 374540.0,
691
  "step": 67
692
  },
693
  {
694
+ "entropy": 0.23882555402815342,
695
  "epoch": 0.8330781010719756,
696
+ "grad_norm": 0.3046875,
697
  "learning_rate": 0.00011829268292682926,
698
+ "loss": 0.07082124054431915,
699
+ "mean_token_accuracy": 0.979393869638443,
700
+ "num_tokens": 379925.0,
701
  "step": 68
702
  },
703
  {
704
+ "entropy": 0.2527451729401946,
705
  "epoch": 0.8453292496171516,
706
+ "grad_norm": 0.37109375,
707
  "learning_rate": 0.00011707317073170732,
708
+ "loss": 0.0804731696844101,
709
+ "mean_token_accuracy": 0.9763551540672779,
710
+ "num_tokens": 384279.0,
711
  "step": 69
712
  },
713
  {
714
+ "entropy": 0.26056139171123505,
715
  "epoch": 0.8575803981623277,
716
+ "grad_norm": 0.40234375,
717
  "learning_rate": 0.00011585365853658536,
718
+ "loss": 0.09266315400600433,
719
+ "mean_token_accuracy": 0.9709281474351883,
720
+ "num_tokens": 389563.0,
721
  "step": 70
722
  },
723
  {
724
+ "entropy": 0.2919591320678592,
725
  "epoch": 0.8698315467075038,
726
+ "grad_norm": 0.423828125,
727
  "learning_rate": 0.00011463414634146342,
728
+ "loss": 0.07172521948814392,
729
+ "mean_token_accuracy": 0.9725044220685959,
730
+ "num_tokens": 394650.0,
731
  "step": 71
732
  },
733
  {
734
+ "entropy": 0.2520558973774314,
735
  "epoch": 0.8820826952526799,
736
+ "grad_norm": 0.47265625,
737
  "learning_rate": 0.00011341463414634146,
738
+ "loss": 0.07857581228017807,
739
+ "mean_token_accuracy": 0.967189610004425,
740
+ "num_tokens": 399583.0,
741
  "step": 72
742
  },
743
  {
744
+ "entropy": 0.2681189738214016,
745
  "epoch": 0.8943338437978561,
746
+ "grad_norm": 0.470703125,
747
  "learning_rate": 0.00011219512195121953,
748
+ "loss": 0.0883592814207077,
749
+ "mean_token_accuracy": 0.9760300181806087,
750
+ "num_tokens": 406224.0,
751
  "step": 73
752
  },
753
  {
754
+ "entropy": 0.25226688850671053,
755
  "epoch": 0.9065849923430321,
756
+ "grad_norm": 0.349609375,
757
  "learning_rate": 0.00011097560975609757,
758
+ "loss": 0.06107043847441673,
759
+ "mean_token_accuracy": 0.9742026180028915,
760
+ "num_tokens": 412481.0,
761
  "step": 74
762
  },
763
  {
764
+ "entropy": 0.25610699970275164,
765
  "epoch": 0.9188361408882083,
766
+ "grad_norm": 0.4140625,
767
  "learning_rate": 0.00010975609756097563,
768
+ "loss": 0.06678957492113113,
769
+ "mean_token_accuracy": 0.9725399203598499,
770
+ "num_tokens": 417862.0,
771
  "step": 75
772
  },
773
  {
774
+ "entropy": 0.2826196616515517,
775
  "epoch": 0.9310872894333844,
776
+ "grad_norm": 0.859375,
777
  "learning_rate": 0.00010853658536585367,
778
+ "loss": 0.048859648406505585,
779
+ "mean_token_accuracy": 0.9790267050266266,
780
+ "num_tokens": 422878.0,
781
  "step": 76
782
  },
783
  {
784
+ "entropy": 0.23871563002467155,
785
  "epoch": 0.9433384379785605,
786
+ "grad_norm": 0.466796875,
787
  "learning_rate": 0.00010731707317073172,
788
+ "loss": 0.07596343755722046,
789
+ "mean_token_accuracy": 0.971769668161869,
790
+ "num_tokens": 429170.0,
791
  "step": 77
792
  },
793
  {
794
+ "entropy": 0.2777755409479141,
795
  "epoch": 0.9555895865237366,
796
+ "grad_norm": 0.443359375,
797
  "learning_rate": 0.00010609756097560977,
798
+ "loss": 0.06630191206932068,
799
+ "mean_token_accuracy": 0.9747902825474739,
800
+ "num_tokens": 434323.0,
801
  "step": 78
802
  },
803
  {
804
+ "entropy": 0.23950364720076323,
805
  "epoch": 0.9678407350689127,
806
+ "grad_norm": 0.349609375,
807
  "learning_rate": 0.00010487804878048781,
808
+ "loss": 0.057458702474832535,
809
+ "mean_token_accuracy": 0.980991818010807,
810
+ "num_tokens": 439539.0,
811
  "step": 79
812
  },
813
  {
814
+ "entropy": 0.245719694532454,
815
  "epoch": 0.9800918836140888,
816
+ "grad_norm": 0.3046875,
817
  "learning_rate": 0.00010365853658536586,
818
+ "loss": 0.06474918127059937,
819
+ "mean_token_accuracy": 0.9749566093087196,
820
+ "num_tokens": 445548.0,
821
  "step": 80
822
  },
823
  {
824
+ "entropy": 0.2553516002371907,
825
  "epoch": 0.9923430321592649,
826
+ "grad_norm": 0.59375,
827
  "learning_rate": 0.0001024390243902439,
828
+ "loss": 0.07626976072788239,
829
+ "mean_token_accuracy": 0.9740116000175476,
830
+ "num_tokens": 451007.0,
831
  "step": 81
832
  },
833
  {
834
+ "entropy": 0.24858922958374025,
835
  "epoch": 1.0,
836
+ "grad_norm": 0.4140625,
837
  "learning_rate": 0.00010121951219512196,
838
+ "loss": 0.05956536903977394,
839
+ "mean_token_accuracy": 0.9751910209655762,
840
+ "num_tokens": 454678.0,
841
  "step": 82
842
  },
843
  {
844
+ "entropy": 0.22480082791298628,
845
  "epoch": 1.0122511485451762,
846
+ "grad_norm": 0.302734375,
847
  "learning_rate": 0.0001,
848
+ "loss": 0.03318095952272415,
849
+ "mean_token_accuracy": 0.9908282831311226,
850
+ "num_tokens": 460195.0,
851
  "step": 83
852
  },
853
  {
854
+ "entropy": 0.21941375825554132,
855
  "epoch": 1.0245022970903521,
856
+ "grad_norm": 0.322265625,
857
  "learning_rate": 9.878048780487805e-05,
858
+ "loss": 0.037562280893325806,
859
+ "mean_token_accuracy": 0.9899826981127262,
860
+ "num_tokens": 465814.0,
861
  "step": 84
862
  },
863
  {
864
+ "entropy": 0.2297668270766735,
865
  "epoch": 1.0367534456355283,
866
+ "grad_norm": 0.259765625,
867
  "learning_rate": 9.75609756097561e-05,
868
+ "loss": 0.03667337819933891,
869
+ "mean_token_accuracy": 0.9867987670004368,
870
+ "num_tokens": 472919.0,
871
  "step": 85
872
  },
873
  {
874
+ "entropy": 0.1959990761242807,
875
  "epoch": 1.0490045941807045,
876
+ "grad_norm": 0.171875,
877
  "learning_rate": 9.634146341463415e-05,
878
+ "loss": 0.02224677987396717,
879
+ "mean_token_accuracy": 0.9947787970304489,
880
+ "num_tokens": 477926.0,
881
  "step": 86
882
  },
883
  {
884
+ "entropy": 0.22538460325449705,
885
  "epoch": 1.0612557427258806,
886
+ "grad_norm": 0.294921875,
887
  "learning_rate": 9.51219512195122e-05,
888
+ "loss": 0.05467130243778229,
889
+ "mean_token_accuracy": 0.9857094436883926,
890
+ "num_tokens": 483369.0,
891
  "step": 87
892
  },
893
  {
894
+ "entropy": 0.2385974396020174,
895
  "epoch": 1.0735068912710566,
896
+ "grad_norm": 0.2392578125,
897
  "learning_rate": 9.390243902439024e-05,
898
+ "loss": 0.02876465395092964,
899
+ "mean_token_accuracy": 0.9933567047119141,
900
+ "num_tokens": 488048.0,
901
  "step": 88
902
  },
903
  {
904
+ "entropy": 0.2244573337957263,
905
  "epoch": 1.0857580398162328,
906
+ "grad_norm": 0.17578125,
907
  "learning_rate": 9.26829268292683e-05,
908
+ "loss": 0.022544220089912415,
909
+ "mean_token_accuracy": 0.9952267222106457,
910
+ "num_tokens": 492951.0,
911
  "step": 89
912
  },
913
  {
914
+ "entropy": 0.21164159616455436,
915
  "epoch": 1.098009188361409,
916
+ "grad_norm": 0.3671875,
917
  "learning_rate": 9.146341463414635e-05,
918
+ "loss": 0.0307400431483984,
919
+ "mean_token_accuracy": 0.9898485280573368,
920
+ "num_tokens": 498298.0,
921
  "step": 90
922
  },
923
  {
924
+ "entropy": 0.22300960402935743,
925
  "epoch": 1.110260336906585,
926
+ "grad_norm": 0.25390625,
927
  "learning_rate": 9.02439024390244e-05,
928
+ "loss": 0.02349678799510002,
929
+ "mean_token_accuracy": 0.9937595501542091,
930
+ "num_tokens": 503013.0,
931
  "step": 91
932
  },
933
  {
934
+ "entropy": 0.2144601820036769,
935
  "epoch": 1.122511485451761,
936
+ "grad_norm": 0.466796875,
937
  "learning_rate": 8.902439024390244e-05,
938
+ "loss": 0.025124385952949524,
939
+ "mean_token_accuracy": 0.9929902292788029,
940
+ "num_tokens": 507687.0,
941
  "step": 92
942
  },
943
  {
944
+ "entropy": 0.18067707447335124,
945
  "epoch": 1.1347626339969372,
946
+ "grad_norm": 0.462890625,
947
  "learning_rate": 8.78048780487805e-05,
948
+ "loss": 0.04210633784532547,
949
+ "mean_token_accuracy": 0.9874051883816719,
950
+ "num_tokens": 513217.0,
951
  "step": 93
952
  },
953
  {
954
+ "entropy": 0.18840790819376707,
955
  "epoch": 1.1470137825421134,
956
+ "grad_norm": 0.2578125,
957
  "learning_rate": 8.658536585365854e-05,
958
+ "loss": 0.023590605705976486,
959
+ "mean_token_accuracy": 0.9930241219699383,
960
+ "num_tokens": 518384.0,
961
  "step": 94
962
  },
963
  {
964
+ "entropy": 0.16844777530059218,
965
  "epoch": 1.1592649310872893,
966
+ "grad_norm": 0.3046875,
967
  "learning_rate": 8.53658536585366e-05,
968
+ "loss": 0.02408467046916485,
969
+ "mean_token_accuracy": 0.9940578565001488,
970
+ "num_tokens": 523975.0,
971
  "step": 95
972
  },
973
  {
974
+ "entropy": 0.1988551402464509,
975
  "epoch": 1.1715160796324655,
976
+ "grad_norm": 0.25390625,
977
  "learning_rate": 8.414634146341464e-05,
978
+ "loss": 0.01896364614367485,
979
+ "mean_token_accuracy": 0.9935651384294033,
980
+ "num_tokens": 528838.0,
981
  "step": 96
982
  },
983
  {
984
+ "entropy": 0.19662938080728054,
985
  "epoch": 1.1837672281776417,
986
+ "grad_norm": 0.271484375,
987
  "learning_rate": 8.292682926829268e-05,
988
+ "loss": 0.023568641394376755,
989
+ "mean_token_accuracy": 0.9942812882363796,
990
+ "num_tokens": 533723.0,
991
  "step": 97
992
  },
993
  {
994
+ "entropy": 0.18521032202988863,
995
  "epoch": 1.1960183767228179,
996
+ "grad_norm": 0.2158203125,
997
  "learning_rate": 8.170731707317073e-05,
998
+ "loss": 0.03203809633851051,
999
+ "mean_token_accuracy": 0.9899982325732708,
1000
+ "num_tokens": 540180.0,
1001
  "step": 98
1002
  },
1003
  {
1004
+ "entropy": 0.18826917372643948,
1005
  "epoch": 1.2082695252679938,
1006
+ "grad_norm": 0.4765625,
1007
  "learning_rate": 8.048780487804879e-05,
1008
+ "loss": 0.03463224321603775,
1009
+ "mean_token_accuracy": 0.9889252111315727,
1010
+ "num_tokens": 546618.0,
1011
  "step": 99
1012
  },
1013
  {
1014
+ "entropy": 0.1889605624601245,
1015
  "epoch": 1.22052067381317,
1016
+ "grad_norm": 0.337890625,
1017
  "learning_rate": 7.926829268292683e-05,
1018
+ "loss": 0.038746241480112076,
1019
+ "mean_token_accuracy": 0.9897148124873638,
1020
+ "num_tokens": 552084.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
+ "eval_entropy": 0.19684839270253113,
1026
+ "eval_loss": 0.08200085908174515,
1027
+ "eval_mean_token_accuracy": 0.9706140955289205,
1028
+ "eval_num_tokens": 552084.0,
1029
+ "eval_runtime": 56.6368,
1030
+ "eval_samples_per_second": 1.218,
1031
+ "eval_steps_per_second": 1.218,
1032
  "step": 100
1033
  },
1034
  {
1035
+ "entropy": 0.18781481962651014,
1036
  "epoch": 1.2327718223583461,
1037
+ "grad_norm": 0.2490234375,
1038
  "learning_rate": 7.804878048780489e-05,
1039
+ "loss": 0.03647669032216072,
1040
+ "mean_token_accuracy": 0.9900195822119713,
1041
+ "num_tokens": 558384.0,
1042
  "step": 101
1043
  },
1044
  {
1045
+ "entropy": 0.182833943516016,
1046
  "epoch": 1.245022970903522,
1047
+ "grad_norm": 0.1708984375,
1048
  "learning_rate": 7.682926829268293e-05,
1049
+ "loss": 0.01754325069487095,
1050
+ "mean_token_accuracy": 0.9952104948461056,
1051
+ "num_tokens": 564025.0,
1052
  "step": 102
1053
  },
1054
  {
1055
+ "entropy": 0.19512099027633667,
1056
  "epoch": 1.2572741194486983,
1057
+ "grad_norm": 0.32421875,
1058
  "learning_rate": 7.560975609756099e-05,
1059
+ "loss": 0.045042332261800766,
1060
+ "mean_token_accuracy": 0.987647294998169,
1061
+ "num_tokens": 569791.0,
1062
  "step": 103
1063
  },
1064
  {
1065
+ "entropy": 0.19775146059691906,
1066
  "epoch": 1.2695252679938744,
1067
+ "grad_norm": 0.287109375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
+ "loss": 0.03481469675898552,
1070
+ "mean_token_accuracy": 0.9876400642096996,
1071
+ "num_tokens": 575432.0,
1072
  "step": 104
1073
  },
1074
  {
1075
+ "entropy": 0.19757689163088799,
1076
  "epoch": 1.2817764165390506,
1077
+ "grad_norm": 0.392578125,
1078
  "learning_rate": 7.317073170731707e-05,
1079
+ "loss": 0.045782968401908875,
1080
+ "mean_token_accuracy": 0.987156193703413,
1081
+ "num_tokens": 580586.0,
1082
  "step": 105
1083
  },
1084
  {
1085
+ "entropy": 0.19568088464438915,
1086
  "epoch": 1.2940275650842268,
1087
+ "grad_norm": 0.271484375,
1088
  "learning_rate": 7.195121951219513e-05,
1089
+ "loss": 0.03614577651023865,
1090
+ "mean_token_accuracy": 0.989520326256752,
1091
+ "num_tokens": 586255.0,
1092
  "step": 106
1093
  },
1094
  {
1095
+ "entropy": 0.18891402333974838,
1096
  "epoch": 1.3062787136294027,
1097
+ "grad_norm": 0.169921875,
1098
  "learning_rate": 7.073170731707317e-05,
1099
+ "loss": 0.018318383023142815,
1100
+ "mean_token_accuracy": 0.9943608231842518,
1101
+ "num_tokens": 591734.0,
1102
  "step": 107
1103
  },
1104
  {
1105
+ "entropy": 0.2118115657940507,
1106
  "epoch": 1.318529862174579,
1107
+ "grad_norm": 0.34375,
1108
  "learning_rate": 6.951219512195122e-05,
1109
+ "loss": 0.02556736022233963,
1110
+ "mean_token_accuracy": 0.9910119064152241,
1111
+ "num_tokens": 596805.0,
1112
  "step": 108
1113
  },
1114
  {
1115
+ "entropy": 0.20146753964945674,
1116
  "epoch": 1.3307810107197549,
1117
+ "grad_norm": 0.251953125,
1118
  "learning_rate": 6.829268292682928e-05,
1119
+ "loss": 0.026423780247569084,
1120
+ "mean_token_accuracy": 0.9911187067627907,
1121
+ "num_tokens": 602469.0,
1122
  "step": 109
1123
  },
1124
  {
1125
+ "entropy": 0.19927682168781757,
1126
  "epoch": 1.343032159264931,
1127
+ "grad_norm": 0.2314453125,
1128
  "learning_rate": 6.707317073170732e-05,
1129
+ "loss": 0.038182880729436874,
1130
+ "mean_token_accuracy": 0.9882474392652512,
1131
+ "num_tokens": 608854.0,
1132
  "step": 110
1133
  },
1134
  {
1135
+ "entropy": 0.18457680894061923,
1136
  "epoch": 1.3552833078101072,
1137
+ "grad_norm": 0.24609375,
1138
  "learning_rate": 6.585365853658538e-05,
1139
+ "loss": 0.025912806391716003,
1140
+ "mean_token_accuracy": 0.9923904649913311,
1141
+ "num_tokens": 614272.0,
1142
  "step": 111
1143
  },
1144
  {
1145
+ "entropy": 0.1993693085387349,
1146
  "epoch": 1.3675344563552834,
1147
+ "grad_norm": 0.291015625,
1148
  "learning_rate": 6.463414634146342e-05,
1149
+ "loss": 0.021378764882683754,
1150
+ "mean_token_accuracy": 0.9953300580382347,
1151
+ "num_tokens": 619446.0,
1152
  "step": 112
1153
  },
1154
  {
1155
+ "entropy": 0.19518085662275553,
1156
  "epoch": 1.3797856049004595,
1157
+ "grad_norm": 0.30078125,
1158
  "learning_rate": 6.341463414634146e-05,
1159
+ "loss": 0.03335938975214958,
1160
+ "mean_token_accuracy": 0.9875492453575134,
1161
+ "num_tokens": 625774.0,
1162
  "step": 113
1163
  },
1164
  {
1165
+ "entropy": 0.20890573505312204,
1166
  "epoch": 1.3920367534456355,
1167
+ "grad_norm": 0.373046875,
1168
  "learning_rate": 6.219512195121952e-05,
1169
+ "loss": 0.036217525601387024,
1170
+ "mean_token_accuracy": 0.9891358688473701,
1171
+ "num_tokens": 630747.0,
1172
  "step": 114
1173
  },
1174
  {
1175
+ "entropy": 0.19118426740169525,
1176
  "epoch": 1.4042879019908117,
1177
+ "grad_norm": 0.251953125,
1178
  "learning_rate": 6.097560975609756e-05,
1179
+ "loss": 0.030090918764472008,
1180
+ "mean_token_accuracy": 0.9934539385139942,
1181
+ "num_tokens": 637405.0,
1182
  "step": 115
1183
  },
1184
  {
1185
+ "entropy": 0.2176859974861145,
1186
  "epoch": 1.4165390505359878,
1187
+ "grad_norm": 0.2373046875,
1188
  "learning_rate": 5.975609756097561e-05,
1189
+ "loss": 0.024563392624258995,
1190
+ "mean_token_accuracy": 0.9921185150742531,
1191
+ "num_tokens": 642328.0,
1192
  "step": 116
1193
  },
1194
  {
1195
+ "entropy": 0.1849509342573583,
1196
  "epoch": 1.4287901990811638,
1197
+ "grad_norm": 0.35546875,
1198
  "learning_rate": 5.853658536585366e-05,
1199
+ "loss": 0.042349379509687424,
1200
+ "mean_token_accuracy": 0.9899747557938099,
1201
+ "num_tokens": 647857.0,
1202
  "step": 117
1203
  },
1204
  {
1205
+ "entropy": 0.19377889391034842,
1206
  "epoch": 1.44104134762634,
1207
+ "grad_norm": 0.279296875,
1208
  "learning_rate": 5.731707317073171e-05,
1209
+ "loss": 0.02413174696266651,
1210
+ "mean_token_accuracy": 0.9931157529354095,
1211
+ "num_tokens": 653805.0,
1212
  "step": 118
1213
  },
1214
  {
1215
+ "entropy": 0.20709845190867782,
1216
  "epoch": 1.4532924961715161,
1217
+ "grad_norm": 0.28125,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
+ "loss": 0.03505600988864899,
1220
+ "mean_token_accuracy": 0.9896740056574345,
1221
+ "num_tokens": 659708.0,
1222
  "step": 119
1223
  },
1224
  {
1225
+ "entropy": 0.20671271299943328,
1226
  "epoch": 1.4655436447166923,
1227
+ "grad_norm": 0.2734375,
1228
  "learning_rate": 5.487804878048781e-05,
1229
+ "loss": 0.02634236589074135,
1230
+ "mean_token_accuracy": 0.9935285076498985,
1231
+ "num_tokens": 665292.0,
1232
  "step": 120
1233
  },
1234
  {
1235
+ "entropy": 0.18826642259955406,
1236
  "epoch": 1.4777947932618682,
1237
+ "grad_norm": 0.2177734375,
1238
  "learning_rate": 5.365853658536586e-05,
1239
+ "loss": 0.022179996594786644,
1240
+ "mean_token_accuracy": 0.9928314089775085,
1241
+ "num_tokens": 670669.0,
1242
  "step": 121
1243
  },
1244
  {
1245
+ "entropy": 0.2311026845127344,
1246
  "epoch": 1.4900459418070444,
1247
+ "grad_norm": 0.267578125,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
+ "loss": 0.025521911680698395,
1250
+ "mean_token_accuracy": 0.9930035471916199,
1251
+ "num_tokens": 675524.0,
1252
  "step": 122
1253
  },
1254
  {
1255
+ "entropy": 0.1890636207535863,
1256
  "epoch": 1.5022970903522204,
1257
+ "grad_norm": 0.22265625,
1258
  "learning_rate": 5.121951219512195e-05,
1259
+ "loss": 0.02293182723224163,
1260
+ "mean_token_accuracy": 0.9917827062308788,
1261
+ "num_tokens": 681083.0,
1262
  "step": 123
1263
  },
1264
  {
1265
+ "entropy": 0.20301904529333115,
1266
  "epoch": 1.5145482388973965,
1267
+ "grad_norm": 0.251953125,
1268
  "learning_rate": 5e-05,
1269
+ "loss": 0.026392869651317596,
1270
+ "mean_token_accuracy": 0.9935696609318256,
1271
+ "num_tokens": 686909.0,
1272
  "step": 124
1273
  },
1274
  {
1275
+ "entropy": 0.18326633982360363,
1276
  "epoch": 1.5267993874425727,
1277
+ "grad_norm": 0.189453125,
1278
  "learning_rate": 4.878048780487805e-05,
1279
+ "loss": 0.03385050222277641,
1280
+ "mean_token_accuracy": 0.9923080727458,
1281
+ "num_tokens": 693716.0,
1282
  "step": 125
1283
  },
1284
  {
1285
+ "entropy": 0.1940352749079466,
1286
  "epoch": 1.5390505359877489,
1287
+ "grad_norm": 0.25,
1288
  "learning_rate": 4.75609756097561e-05,
1289
+ "loss": 0.03128973767161369,
1290
+ "mean_token_accuracy": 0.9904795847833157,
1291
+ "num_tokens": 699231.0,
1292
  "step": 126
1293
  },
1294
  {
1295
+ "entropy": 0.2052145255729556,
1296
  "epoch": 1.551301684532925,
1297
+ "grad_norm": 0.1962890625,
1298
  "learning_rate": 4.634146341463415e-05,
1299
+ "loss": 0.01906367763876915,
1300
+ "mean_token_accuracy": 0.9935221113264561,
1301
+ "num_tokens": 705026.0,
1302
  "step": 127
1303
  },
1304
  {
1305
+ "entropy": 0.22084870096296072,
1306
  "epoch": 1.5635528330781012,
1307
+ "grad_norm": 0.28125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
+ "loss": 0.026771627366542816,
1310
+ "mean_token_accuracy": 0.9931596331298351,
1311
+ "num_tokens": 710155.0,
1312
  "step": 128
1313
  },
1314
  {
1315
+ "entropy": 0.18041892955079675,
1316
  "epoch": 1.5758039816232772,
1317
+ "grad_norm": 0.369140625,
1318
  "learning_rate": 4.390243902439025e-05,
1319
+ "loss": 0.024752795696258545,
1320
+ "mean_token_accuracy": 0.9915198720991611,
1321
+ "num_tokens": 715496.0,
1322
  "step": 129
1323
  },
1324
  {
1325
+ "entropy": 0.1869538608007133,
1326
  "epoch": 1.5880551301684533,
1327
+ "grad_norm": 0.3046875,
1328
  "learning_rate": 4.26829268292683e-05,
1329
+ "loss": 0.03293408453464508,
1330
+ "mean_token_accuracy": 0.990137055516243,
1331
+ "num_tokens": 721491.0,
1332
  "step": 130
1333
  },
1334
  {
1335
+ "entropy": 0.20515098702162504,
1336
  "epoch": 1.6003062787136293,
1337
+ "grad_norm": 0.349609375,
1338
  "learning_rate": 4.146341463414634e-05,
1339
+ "loss": 0.023330464959144592,
1340
+ "mean_token_accuracy": 0.9892629720270634,
1341
+ "num_tokens": 726673.0,
1342
  "step": 131
1343
  },
1344
  {
1345
+ "entropy": 0.18135815067216754,
1346
  "epoch": 1.6125574272588055,
1347
+ "grad_norm": 0.357421875,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
+ "loss": 0.03119005262851715,
1350
+ "mean_token_accuracy": 0.9911304786801338,
1351
+ "num_tokens": 733054.0,
1352
  "step": 132
1353
  },
1354
  {
1355
+ "entropy": 0.20070009911432862,
1356
  "epoch": 1.6248085758039816,
1357
+ "grad_norm": 0.21484375,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
+ "loss": 0.030009731650352478,
1360
+ "mean_token_accuracy": 0.9932212419807911,
1361
+ "num_tokens": 737990.0,
1362
  "step": 133
1363
  },
1364
  {
1365
+ "entropy": 0.18819584511220455,
1366
  "epoch": 1.6370597243491578,
1367
+ "grad_norm": 0.2451171875,
1368
  "learning_rate": 3.780487804878049e-05,
1369
+ "loss": 0.02752860262989998,
1370
+ "mean_token_accuracy": 0.9897669702768326,
1371
+ "num_tokens": 743394.0,
1372
  "step": 134
1373
  },
1374
  {
1375
+ "entropy": 0.18869836069643497,
1376
  "epoch": 1.649310872894334,
1377
+ "grad_norm": 0.240234375,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
+ "loss": 0.03194504603743553,
1380
+ "mean_token_accuracy": 0.9914098270237446,
1381
+ "num_tokens": 749356.0,
1382
  "step": 135
1383
  },
1384
  {
1385
+ "entropy": 0.2093992899172008,
1386
  "epoch": 1.66156202143951,
1387
+ "grad_norm": 0.291015625,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
+ "loss": 0.02633955329656601,
1390
+ "mean_token_accuracy": 0.992473166435957,
1391
+ "num_tokens": 754312.0,
1392
  "step": 136
1393
  },
1394
  {
1395
+ "entropy": 0.1928223273716867,
1396
  "epoch": 1.673813169984686,
1397
+ "grad_norm": 0.2470703125,
1398
  "learning_rate": 3.414634146341464e-05,
1399
+ "loss": 0.035037778317928314,
1400
+ "mean_token_accuracy": 0.9916842468082905,
1401
+ "num_tokens": 760182.0,
1402
  "step": 137
1403
  },
1404
  {
1405
+ "entropy": 0.19663999788463116,
1406
  "epoch": 1.686064318529862,
1407
+ "grad_norm": 0.265625,
1408
  "learning_rate": 3.292682926829269e-05,
1409
+ "loss": 0.03151565045118332,
1410
+ "mean_token_accuracy": 0.9930234625935555,
1411
+ "num_tokens": 766267.0,
1412
  "step": 138
1413
  },
1414
  {
1415
+ "entropy": 0.2058473015204072,
1416
  "epoch": 1.6983154670750382,
1417
+ "grad_norm": 0.2578125,
1418
  "learning_rate": 3.170731707317073e-05,
1419
+ "loss": 0.02509160526096821,
1420
+ "mean_token_accuracy": 0.9920520819723606,
1421
+ "num_tokens": 771135.0,
1422
  "step": 139
1423
  },
1424
  {
1425
+ "entropy": 0.20955495908856392,
1426
  "epoch": 1.7105666156202144,
1427
+ "grad_norm": 0.36328125,
1428
  "learning_rate": 3.048780487804878e-05,
1429
+ "loss": 0.03856905177235603,
1430
+ "mean_token_accuracy": 0.9877506978809834,
1431
+ "num_tokens": 776727.0,
1432
  "step": 140
1433
  },
1434
  {
1435
+ "entropy": 0.17796193715184927,
1436
  "epoch": 1.7228177641653906,
1437
+ "grad_norm": 0.271484375,
1438
  "learning_rate": 2.926829268292683e-05,
1439
+ "loss": 0.03061492368578911,
1440
+ "mean_token_accuracy": 0.9933489374816418,
1441
+ "num_tokens": 782352.0,
1442
  "step": 141
1443
  },
1444
  {
1445
+ "entropy": 0.19299636129289865,
1446
  "epoch": 1.7350689127105667,
1447
+ "grad_norm": 0.2392578125,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
+ "loss": 0.03383423760533333,
1450
+ "mean_token_accuracy": 0.9913677796721458,
1451
+ "num_tokens": 787139.0,
1452
  "step": 142
1453
  },
1454
  {
1455
+ "entropy": 0.2032350143417716,
1456
  "epoch": 1.7473200612557427,
1457
+ "grad_norm": 0.314453125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
+ "loss": 0.03458622097969055,
1460
+ "mean_token_accuracy": 0.9920257851481438,
1461
+ "num_tokens": 792244.0,
1462
  "step": 143
1463
  },
1464
  {
1465
+ "entropy": 0.21589675825089216,
1466
  "epoch": 1.7595712098009189,
1467
+ "grad_norm": 0.27734375,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
+ "loss": 0.029654916375875473,
1470
+ "mean_token_accuracy": 0.9936717823147774,
1471
+ "num_tokens": 797998.0,
1472
  "step": 144
1473
  },
1474
  {
1475
+ "entropy": 0.19791326764971018,
1476
  "epoch": 1.7718223583460948,
1477
+ "grad_norm": 0.1748046875,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
+ "loss": 0.019491517916321754,
1480
+ "mean_token_accuracy": 0.9953687153756618,
1481
+ "num_tokens": 803118.0,
1482
  "step": 145
1483
  },
1484
  {
1485
+ "entropy": 0.19606765313073993,
1486
  "epoch": 1.784073506891271,
1487
+ "grad_norm": 0.2236328125,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
+ "loss": 0.017046257853507996,
1490
+ "mean_token_accuracy": 0.9934666827321053,
1491
+ "num_tokens": 808709.0,
1492
  "step": 146
1493
  },
1494
  {
1495
+ "entropy": 0.17984948493540287,
1496
  "epoch": 1.7963246554364471,
1497
+ "grad_norm": 0.2119140625,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
+ "loss": 0.028008146211504936,
1500
+ "mean_token_accuracy": 0.9918750263750553,
1501
+ "num_tokens": 815053.0,
1502
  "step": 147
1503
  },
1504
  {
1505
+ "entropy": 0.19215012807399035,
1506
  "epoch": 1.8085758039816233,
1507
+ "grad_norm": 0.212890625,
1508
  "learning_rate": 2.073170731707317e-05,
1509
+ "loss": 0.02620745822787285,
1510
+ "mean_token_accuracy": 0.9895812347531319,
1511
+ "num_tokens": 821046.0,
1512
  "step": 148
1513
  },
1514
  {
1515
+ "entropy": 0.1954274857416749,
1516
  "epoch": 1.8208269525267995,
1517
+ "grad_norm": 0.1630859375,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
+ "loss": 0.012469938956201077,
1520
+ "mean_token_accuracy": 0.9970379211008549,
1521
+ "num_tokens": 825773.0,
1522
  "step": 149
1523
  },
1524
  {
1525
+ "entropy": 0.20444792695343494,
1526
  "epoch": 1.8330781010719757,
1527
+ "grad_norm": 0.3671875,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
+ "loss": 0.029102876782417297,
1530
+ "mean_token_accuracy": 0.9916210547089577,
1531
+ "num_tokens": 831944.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
+ "eval_entropy": 0.20245846825233405,
1537
+ "eval_loss": 0.07568201422691345,
1538
+ "eval_mean_token_accuracy": 0.973983341369076,
1539
+ "eval_num_tokens": 831944.0,
1540
+ "eval_runtime": 56.7259,
1541
+ "eval_samples_per_second": 1.216,
1542
+ "eval_steps_per_second": 1.216,
1543
  "step": 150
1544
  }
1545
  ],
 
1560
  "attributes": {}
1561
  }
1562
  },
1563
+ "total_flos": 3.767142787075277e+16,
1564
  "train_batch_size": 1,
1565
  "trial_name": null,
1566
  "trial_params": null
checkpoint-164/adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "gate_proj",
33
- "up_proj",
34
- "q_proj",
35
- "v_proj",
36
  "k_proj",
 
37
  "down_proj",
38
- "o_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "o_proj",
33
  "gate_proj",
 
 
 
34
  "k_proj",
35
+ "q_proj",
36
  "down_proj",
37
+ "v_proj",
38
+ "up_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
checkpoint-164/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:992684d6ec4153831df4e539107495d771d85ab5ab3a998ef80302393087065c
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a30df352e3e5bf7a3be3ccca5e0bf0b3a9b19ac4eb509a3b6c3fbbccdd879fb
3
  size 83946192
checkpoint-164/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e449096d3a07f08f22b0b85be61c0b047450894cb70e29f590dfe9fce82f726
3
  size 85728997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b366dd84c17ff0c0f93bee9c1e5c08d40747cae3b15322a88899d9af7f34b76
3
  size 85728997
checkpoint-164/trainer_state.json CHANGED
@@ -10,1676 +10,1676 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 0.3660925142467022,
14
  "epoch": 0.01225114854517611,
15
- "grad_norm": 0.0166015625,
16
  "learning_rate": 0.0002,
17
- "loss": 0.0020782470237463713,
18
- "mean_token_accuracy": 0.9997171945869923,
19
- "num_tokens": 6092.0,
20
  "step": 1
21
  },
22
  {
23
- "entropy": 0.34051003493368626,
24
  "epoch": 0.02450229709035222,
25
- "grad_norm": 0.000823974609375,
26
  "learning_rate": 0.00019878048780487805,
27
- "loss": 9.216360922437161e-05,
28
- "mean_token_accuracy": 1.0,
29
- "num_tokens": 11535.0,
30
  "step": 2
31
  },
32
  {
33
- "entropy": 0.32960800640285015,
34
  "epoch": 0.036753445635528334,
35
- "grad_norm": 0.0098876953125,
36
  "learning_rate": 0.0001975609756097561,
37
- "loss": 0.0001977928914129734,
38
- "mean_token_accuracy": 1.0,
39
- "num_tokens": 16432.0,
40
  "step": 3
41
  },
42
  {
43
- "entropy": 0.33627333864569664,
44
  "epoch": 0.04900459418070444,
45
- "grad_norm": 0.06640625,
46
  "learning_rate": 0.00019634146341463416,
47
- "loss": 0.00977393426001072,
48
- "mean_token_accuracy": 0.9985632188618183,
49
- "num_tokens": 20507.0,
50
  "step": 4
51
  },
52
  {
53
- "entropy": 0.31916058249771595,
54
  "epoch": 0.06125574272588055,
55
- "grad_norm": 0.0003108978271484375,
56
  "learning_rate": 0.0001951219512195122,
57
- "loss": 5.0926646508742124e-05,
58
- "mean_token_accuracy": 1.0,
59
- "num_tokens": 26122.0,
60
  "step": 5
61
  },
62
  {
63
- "entropy": 0.3524587769061327,
64
  "epoch": 0.07350689127105667,
65
- "grad_norm": 0.000186920166015625,
66
  "learning_rate": 0.00019390243902439025,
67
- "loss": 4.6155335439834744e-05,
68
- "mean_token_accuracy": 1.0,
69
- "num_tokens": 30847.0,
70
  "step": 6
71
  },
72
  {
73
- "entropy": 0.3272323925048113,
74
  "epoch": 0.08575803981623277,
75
- "grad_norm": 0.005859375,
76
  "learning_rate": 0.0001926829268292683,
77
- "loss": 0.000202978597371839,
78
- "mean_token_accuracy": 1.0,
79
- "num_tokens": 36541.0,
80
  "step": 7
81
  },
82
  {
83
- "entropy": 0.347023731097579,
84
  "epoch": 0.09800918836140889,
85
- "grad_norm": 0.00072479248046875,
86
  "learning_rate": 0.00019146341463414633,
87
- "loss": 0.00011593783710850403,
88
- "mean_token_accuracy": 1.0,
89
- "num_tokens": 41001.0,
90
  "step": 8
91
  },
92
  {
93
- "entropy": 0.376500410027802,
94
  "epoch": 0.11026033690658499,
95
- "grad_norm": 0.09033203125,
96
  "learning_rate": 0.0001902439024390244,
97
- "loss": 0.008863622322678566,
98
- "mean_token_accuracy": 0.9979648105800152,
99
- "num_tokens": 45467.0,
100
  "step": 9
101
  },
102
  {
103
- "entropy": 0.3560014171525836,
104
  "epoch": 0.1225114854517611,
105
- "grad_norm": 0.055419921875,
106
  "learning_rate": 0.00018902439024390244,
107
- "loss": 0.004083322826772928,
108
- "mean_token_accuracy": 0.9990039840340614,
109
- "num_tokens": 50478.0,
110
  "step": 10
111
  },
112
  {
113
- "entropy": 0.3533000349998474,
114
  "epoch": 0.13476263399693722,
115
- "grad_norm": 0.0033721923828125,
116
  "learning_rate": 0.0001878048780487805,
117
- "loss": 0.000252897065365687,
118
- "mean_token_accuracy": 1.0,
119
- "num_tokens": 56181.0,
120
  "step": 11
121
  },
122
  {
123
- "entropy": 0.4079158063977957,
124
  "epoch": 0.14701378254211334,
125
- "grad_norm": 0.00110626220703125,
126
  "learning_rate": 0.00018658536585365856,
127
- "loss": 0.00019193078333046287,
128
- "mean_token_accuracy": 1.0,
129
- "num_tokens": 62946.0,
130
  "step": 12
131
  },
132
  {
133
- "entropy": 0.4043316235765815,
134
  "epoch": 0.15926493108728942,
135
- "grad_norm": 0.0021209716796875,
136
  "learning_rate": 0.0001853658536585366,
137
- "loss": 0.00025091503630392253,
138
- "mean_token_accuracy": 1.0,
139
- "num_tokens": 68436.0,
140
  "step": 13
141
  },
142
  {
143
- "entropy": 0.41207500360906124,
144
  "epoch": 0.17151607963246554,
145
- "grad_norm": 0.00139617919921875,
146
  "learning_rate": 0.00018414634146341464,
147
- "loss": 0.0002536335668992251,
148
- "mean_token_accuracy": 1.0,
149
- "num_tokens": 73603.0,
150
  "step": 14
151
  },
152
  {
153
- "entropy": 0.43669185042381287,
154
  "epoch": 0.18376722817764166,
155
- "grad_norm": 0.020751953125,
156
  "learning_rate": 0.0001829268292682927,
157
- "loss": 0.0008837866480462253,
158
- "mean_token_accuracy": 0.9994877055287361,
159
- "num_tokens": 77845.0,
160
  "step": 15
161
  },
162
  {
163
- "entropy": 0.41382858343422413,
164
  "epoch": 0.19601837672281777,
165
- "grad_norm": 0.0145263671875,
166
  "learning_rate": 0.00018170731707317075,
167
- "loss": 0.0006772386841475964,
168
- "mean_token_accuracy": 1.0,
169
- "num_tokens": 82744.0,
170
  "step": 16
171
  },
172
  {
173
- "entropy": 0.4243332091718912,
174
  "epoch": 0.2082695252679939,
175
- "grad_norm": 0.001922607421875,
176
  "learning_rate": 0.0001804878048780488,
177
- "loss": 0.00027059210697188973,
178
- "mean_token_accuracy": 1.0,
179
- "num_tokens": 87453.0,
180
  "step": 17
181
  },
182
  {
183
- "entropy": 0.4329488482326269,
184
  "epoch": 0.22052067381316998,
185
- "grad_norm": 0.004852294921875,
186
  "learning_rate": 0.00017926829268292684,
187
- "loss": 0.00031758740078657866,
188
- "mean_token_accuracy": 1.0,
189
- "num_tokens": 92321.0,
190
  "step": 18
191
  },
192
  {
193
- "entropy": 0.440301101654768,
194
  "epoch": 0.2327718223583461,
195
- "grad_norm": 0.005767822265625,
196
  "learning_rate": 0.00017804878048780488,
197
- "loss": 0.0004065934626851231,
198
- "mean_token_accuracy": 1.0,
199
- "num_tokens": 97146.0,
200
  "step": 19
201
  },
202
  {
203
- "entropy": 0.4400939680635929,
204
  "epoch": 0.2450229709035222,
205
- "grad_norm": 0.0023040771484375,
206
  "learning_rate": 0.00017682926829268295,
207
- "loss": 0.00020425915136002004,
208
- "mean_token_accuracy": 1.0,
209
- "num_tokens": 101943.0,
210
  "step": 20
211
  },
212
  {
213
- "entropy": 0.4579729177057743,
214
  "epoch": 0.2572741194486983,
215
- "grad_norm": 0.0286865234375,
216
  "learning_rate": 0.000175609756097561,
217
- "loss": 0.0015601275954395533,
218
- "mean_token_accuracy": 0.9996448867022991,
219
- "num_tokens": 106772.0,
220
  "step": 21
221
  },
222
  {
223
- "entropy": 0.40288309939205647,
224
  "epoch": 0.26952526799387444,
225
- "grad_norm": 0.00072479248046875,
226
  "learning_rate": 0.00017439024390243903,
227
- "loss": 9.121054608840495e-05,
228
- "mean_token_accuracy": 1.0,
229
- "num_tokens": 112558.0,
230
  "step": 22
231
  },
232
  {
233
- "entropy": 0.4252484003081918,
234
  "epoch": 0.28177641653905056,
235
- "grad_norm": 0.000457763671875,
236
  "learning_rate": 0.00017317073170731708,
237
- "loss": 8.147547487169504e-05,
238
- "mean_token_accuracy": 1.0,
239
- "num_tokens": 117489.0,
240
  "step": 23
241
  },
242
  {
243
- "entropy": 0.44810181483626366,
244
  "epoch": 0.29402756508422667,
245
- "grad_norm": 0.007720947265625,
246
  "learning_rate": 0.00017195121951219512,
247
- "loss": 0.0003956289147026837,
248
- "mean_token_accuracy": 1.0,
249
- "num_tokens": 123010.0,
250
  "step": 24
251
  },
252
  {
253
- "entropy": 0.4023376125842333,
254
  "epoch": 0.30627871362940273,
255
- "grad_norm": 0.00103759765625,
256
  "learning_rate": 0.0001707317073170732,
257
- "loss": 8.693434210726991e-05,
258
- "mean_token_accuracy": 1.0,
259
- "num_tokens": 127716.0,
260
  "step": 25
261
  },
262
  {
263
- "entropy": 0.4007954867556691,
264
  "epoch": 0.31852986217457885,
265
- "grad_norm": 0.00194549560546875,
266
  "learning_rate": 0.00016951219512195123,
267
- "loss": 8.696074655745178e-05,
268
- "mean_token_accuracy": 1.0,
269
- "num_tokens": 132372.0,
270
  "step": 26
271
  },
272
  {
273
- "entropy": 0.3759774696081877,
274
  "epoch": 0.33078101071975496,
275
- "grad_norm": 0.003387451171875,
276
  "learning_rate": 0.00016829268292682927,
277
- "loss": 0.00013623938139062375,
278
- "mean_token_accuracy": 1.0,
279
- "num_tokens": 137028.0,
280
  "step": 27
281
  },
282
  {
283
- "entropy": 0.40147540159523487,
284
  "epoch": 0.3430321592649311,
285
- "grad_norm": 0.0380859375,
286
  "learning_rate": 0.00016707317073170731,
287
- "loss": 0.005999124608933926,
288
- "mean_token_accuracy": 0.9987113401293755,
289
- "num_tokens": 142088.0,
290
  "step": 28
291
  },
292
  {
293
- "entropy": 0.38656803220510483,
294
  "epoch": 0.3552833078101072,
295
- "grad_norm": 0.0322265625,
296
  "learning_rate": 0.00016585365853658536,
297
- "loss": 0.00021061318693682551,
298
- "mean_token_accuracy": 1.0,
299
- "num_tokens": 147481.0,
300
  "step": 29
301
  },
302
  {
303
- "entropy": 0.4059827271848917,
304
  "epoch": 0.3675344563552833,
305
- "grad_norm": 0.00015163421630859375,
306
  "learning_rate": 0.00016463414634146343,
307
- "loss": 3.9411937905242667e-05,
308
- "mean_token_accuracy": 1.0,
309
- "num_tokens": 152973.0,
310
  "step": 30
311
  },
312
  {
313
- "entropy": 0.40111804008483887,
314
  "epoch": 0.37978560490045943,
315
- "grad_norm": 0.0003681182861328125,
316
  "learning_rate": 0.00016341463414634147,
317
- "loss": 5.111394784762524e-05,
318
- "mean_token_accuracy": 1.0,
319
- "num_tokens": 156786.0,
320
  "step": 31
321
  },
322
  {
323
- "entropy": 0.41568026319146156,
324
  "epoch": 0.39203675344563554,
325
- "grad_norm": 0.00162506103515625,
326
  "learning_rate": 0.00016219512195121954,
327
- "loss": 0.0001103500762837939,
328
- "mean_token_accuracy": 1.0,
329
- "num_tokens": 162859.0,
330
  "step": 32
331
  },
332
  {
333
- "entropy": 0.39988269470632076,
334
  "epoch": 0.40428790199081166,
335
- "grad_norm": 0.000518798828125,
336
  "learning_rate": 0.00016097560975609758,
337
- "loss": 6.166221282910556e-05,
338
- "mean_token_accuracy": 1.0,
339
- "num_tokens": 167969.0,
340
  "step": 33
341
  },
342
  {
343
- "entropy": 0.3738459562882781,
344
  "epoch": 0.4165390505359878,
345
- "grad_norm": 0.00537109375,
346
  "learning_rate": 0.00015975609756097562,
347
- "loss": 0.00012469613284338266,
348
- "mean_token_accuracy": 1.0,
349
- "num_tokens": 172518.0,
350
  "step": 34
351
  },
352
  {
353
- "entropy": 0.40653541777282953,
354
  "epoch": 0.42879019908116384,
355
- "grad_norm": 0.0031280517578125,
356
  "learning_rate": 0.00015853658536585366,
357
- "loss": 0.00010661048872862011,
358
- "mean_token_accuracy": 1.0,
359
- "num_tokens": 177085.0,
360
  "step": 35
361
  },
362
  {
363
- "entropy": 0.39361329190433025,
364
  "epoch": 0.44104134762633995,
365
- "grad_norm": 0.08154296875,
366
  "learning_rate": 0.00015731707317073173,
367
- "loss": 0.0010916765313595533,
368
- "mean_token_accuracy": 0.9990942031145096,
369
- "num_tokens": 181617.0,
370
  "step": 36
371
  },
372
  {
373
- "entropy": 0.358949625864625,
374
  "epoch": 0.45329249617151607,
375
- "grad_norm": 0.01080322265625,
376
  "learning_rate": 0.00015609756097560978,
377
- "loss": 0.0010772041277959943,
378
- "mean_token_accuracy": 0.9995535723865032,
379
- "num_tokens": 186836.0,
380
  "step": 37
381
  },
382
  {
383
- "entropy": 0.3930373042821884,
384
  "epoch": 0.4655436447166922,
385
- "grad_norm": 0.000461578369140625,
386
  "learning_rate": 0.00015487804878048782,
387
- "loss": 5.279047036310658e-05,
388
- "mean_token_accuracy": 1.0,
389
- "num_tokens": 191224.0,
390
  "step": 38
391
  },
392
  {
393
- "entropy": 0.35740520991384983,
394
  "epoch": 0.4777947932618683,
395
- "grad_norm": 0.000873565673828125,
396
  "learning_rate": 0.00015365853658536586,
397
- "loss": 5.439379674498923e-05,
398
- "mean_token_accuracy": 1.0,
399
- "num_tokens": 195926.0,
400
  "step": 39
401
  },
402
  {
403
- "entropy": 0.38909873832017183,
404
  "epoch": 0.4900459418070444,
405
- "grad_norm": 0.0257568359375,
406
  "learning_rate": 0.0001524390243902439,
407
- "loss": 0.0015194097068160772,
408
- "mean_token_accuracy": 0.999550361186266,
409
- "num_tokens": 200772.0,
410
  "step": 40
411
  },
412
  {
413
- "entropy": 0.36850977689027786,
414
  "epoch": 0.5022970903522205,
415
- "grad_norm": 0.1064453125,
416
  "learning_rate": 0.00015121951219512197,
417
- "loss": 0.002955856267362833,
418
- "mean_token_accuracy": 0.9993872530758381,
419
- "num_tokens": 204499.0,
420
  "step": 41
421
  },
422
  {
423
- "entropy": 0.3940112106502056,
424
  "epoch": 0.5145482388973966,
425
- "grad_norm": 0.00885009765625,
426
  "learning_rate": 0.00015000000000000001,
427
- "loss": 0.000253106962190941,
428
- "mean_token_accuracy": 1.0,
429
- "num_tokens": 208814.0,
430
  "step": 42
431
  },
432
  {
433
- "entropy": 0.39878340624272823,
434
  "epoch": 0.5267993874425727,
435
- "grad_norm": 0.037841796875,
436
  "learning_rate": 0.00014878048780487806,
437
- "loss": 0.0007202713750302792,
438
- "mean_token_accuracy": 0.9995833337306976,
439
- "num_tokens": 213907.0,
440
  "step": 43
441
  },
442
  {
443
- "entropy": 0.41587444953620434,
444
  "epoch": 0.5390505359877489,
445
- "grad_norm": 0.0004177093505859375,
446
  "learning_rate": 0.0001475609756097561,
447
- "loss": 6.820505223004147e-05,
448
- "mean_token_accuracy": 1.0,
449
- "num_tokens": 218988.0,
450
  "step": 44
451
  },
452
  {
453
- "entropy": 0.3888211837038398,
454
  "epoch": 0.5513016845329249,
455
- "grad_norm": 0.007568359375,
456
  "learning_rate": 0.00014634146341463414,
457
- "loss": 0.000737900089006871,
458
- "mean_token_accuracy": 0.9995967745780945,
459
- "num_tokens": 223595.0,
460
  "step": 45
461
  },
462
  {
463
- "entropy": 0.4139576517045498,
464
  "epoch": 0.5635528330781011,
465
- "grad_norm": 0.014892578125,
466
  "learning_rate": 0.0001451219512195122,
467
- "loss": 0.0006043408066034317,
468
- "mean_token_accuracy": 0.9995192289352417,
469
- "num_tokens": 228244.0,
470
  "step": 46
471
  },
472
  {
473
- "entropy": 0.39713083021342754,
474
  "epoch": 0.5758039816232772,
475
- "grad_norm": 0.00046539306640625,
476
  "learning_rate": 0.00014390243902439025,
477
- "loss": 8.217584399972111e-05,
478
- "mean_token_accuracy": 1.0,
479
- "num_tokens": 232606.0,
480
  "step": 47
481
  },
482
  {
483
- "entropy": 0.40557617880403996,
484
  "epoch": 0.5880551301684533,
485
- "grad_norm": 0.0009918212890625,
486
  "learning_rate": 0.0001426829268292683,
487
- "loss": 0.00012616875756066293,
488
- "mean_token_accuracy": 1.0,
489
- "num_tokens": 236563.0,
490
  "step": 48
491
  },
492
  {
493
- "entropy": 0.43470797687768936,
494
  "epoch": 0.6003062787136294,
495
- "grad_norm": 0.0238037109375,
496
  "learning_rate": 0.00014146341463414634,
497
- "loss": 0.0010796654969453812,
498
- "mean_token_accuracy": 0.999465811997652,
499
- "num_tokens": 241214.0,
500
  "step": 49
501
  },
502
  {
503
- "entropy": 0.4234541580080986,
504
  "epoch": 0.6125574272588055,
505
- "grad_norm": 0.02783203125,
506
  "learning_rate": 0.00014024390243902438,
507
- "loss": 0.0009178520413115621,
508
- "mean_token_accuracy": 0.9996565915644169,
509
- "num_tokens": 245200.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
- "eval_entropy": 0.4022736955380094,
515
- "eval_loss": 0.0006544959614984691,
516
- "eval_mean_token_accuracy": 0.9998166846192401,
517
- "eval_num_tokens": 245200.0,
518
- "eval_runtime": 51.0138,
519
- "eval_samples_per_second": 1.353,
520
- "eval_steps_per_second": 1.353,
521
  "step": 50
522
  },
523
  {
524
- "entropy": 0.41674751229584217,
525
  "epoch": 0.6248085758039816,
526
- "grad_norm": 0.00131988525390625,
527
  "learning_rate": 0.00013902439024390245,
528
- "loss": 0.0001285702601308003,
529
- "mean_token_accuracy": 1.0,
530
- "num_tokens": 249761.0,
531
  "step": 51
532
  },
533
  {
534
- "entropy": 0.42886597104370594,
535
  "epoch": 0.6370597243491577,
536
- "grad_norm": 0.00171661376953125,
537
  "learning_rate": 0.0001378048780487805,
538
- "loss": 0.00014620381989516318,
539
- "mean_token_accuracy": 1.0,
540
- "num_tokens": 254787.0,
541
  "step": 52
542
  },
543
  {
544
- "entropy": 0.4423276912420988,
545
  "epoch": 0.6493108728943339,
546
- "grad_norm": 0.038818359375,
547
  "learning_rate": 0.00013658536585365856,
548
- "loss": 0.003947169054299593,
549
- "mean_token_accuracy": 0.9983357414603233,
550
- "num_tokens": 260287.0,
551
  "step": 53
552
  },
553
  {
554
- "entropy": 0.3989156847819686,
555
  "epoch": 0.6615620214395099,
556
- "grad_norm": 0.0211181640625,
557
  "learning_rate": 0.0001353658536585366,
558
- "loss": 0.00047477131010964513,
559
- "mean_token_accuracy": 0.9998249299824238,
560
- "num_tokens": 264810.0,
561
  "step": 54
562
  },
563
  {
564
- "entropy": 0.4272368475794792,
565
  "epoch": 0.6738131699846861,
566
- "grad_norm": 0.029052734375,
567
  "learning_rate": 0.00013414634146341464,
568
- "loss": 0.00408769678324461,
569
- "mean_token_accuracy": 0.9993622452020645,
570
- "num_tokens": 270386.0,
571
  "step": 55
572
  },
573
  {
574
- "entropy": 0.44703495875000954,
575
  "epoch": 0.6860643185298622,
576
- "grad_norm": 0.01202392578125,
577
  "learning_rate": 0.0001329268292682927,
578
- "loss": 0.00038261126610450447,
579
- "mean_token_accuracy": 1.0,
580
- "num_tokens": 274391.0,
581
  "step": 56
582
  },
583
  {
584
- "entropy": 0.4288428146392107,
585
  "epoch": 0.6983154670750383,
586
- "grad_norm": 0.01019287109375,
587
  "learning_rate": 0.00013170731707317076,
588
- "loss": 0.0003242077073082328,
589
- "mean_token_accuracy": 1.0,
590
- "num_tokens": 279716.0,
591
  "step": 57
592
  },
593
  {
594
- "entropy": 0.37452960200607777,
595
  "epoch": 0.7105666156202144,
596
- "grad_norm": 0.021728515625,
597
  "learning_rate": 0.0001304878048780488,
598
- "loss": 0.0027725810650736094,
599
- "mean_token_accuracy": 0.9994703382253647,
600
- "num_tokens": 285404.0,
601
  "step": 58
602
  },
603
  {
604
- "entropy": 0.4130611680448055,
605
  "epoch": 0.7228177641653905,
606
- "grad_norm": 0.04541015625,
607
  "learning_rate": 0.00012926829268292684,
608
- "loss": 0.0017543239519000053,
609
- "mean_token_accuracy": 0.9995689652860165,
610
- "num_tokens": 289992.0,
611
  "step": 59
612
  },
613
  {
614
- "entropy": 0.41101630590856075,
615
  "epoch": 0.7350689127105666,
616
- "grad_norm": 0.00078582763671875,
617
  "learning_rate": 0.00012804878048780488,
618
- "loss": 9.316274372395128e-05,
619
- "mean_token_accuracy": 1.0,
620
- "num_tokens": 294861.0,
621
  "step": 60
622
  },
623
  {
624
- "entropy": 0.3678157525137067,
625
  "epoch": 0.7473200612557427,
626
- "grad_norm": 0.00058746337890625,
627
  "learning_rate": 0.00012682926829268293,
628
- "loss": 8.83688626345247e-05,
629
- "mean_token_accuracy": 1.0,
630
- "num_tokens": 300355.0,
631
  "step": 61
632
  },
633
  {
634
- "entropy": 0.40994635969400406,
635
  "epoch": 0.7595712098009189,
636
- "grad_norm": 0.0015869140625,
637
  "learning_rate": 0.000125609756097561,
638
- "loss": 8.545083983335644e-05,
639
- "mean_token_accuracy": 1.0,
640
- "num_tokens": 305776.0,
641
  "step": 62
642
  },
643
  {
644
- "entropy": 0.37295936793088913,
645
  "epoch": 0.7718223583460949,
646
- "grad_norm": 0.000827789306640625,
647
  "learning_rate": 0.00012439024390243904,
648
- "loss": 7.97374959802255e-05,
649
- "mean_token_accuracy": 1.0,
650
- "num_tokens": 310204.0,
651
  "step": 63
652
  },
653
  {
654
- "entropy": 0.36804571095854044,
655
  "epoch": 0.7840735068912711,
656
- "grad_norm": 0.0002880096435546875,
657
  "learning_rate": 0.00012317073170731708,
658
- "loss": 6.0703161580022424e-05,
659
- "mean_token_accuracy": 1.0,
660
- "num_tokens": 314205.0,
661
  "step": 64
662
  },
663
  {
664
- "entropy": 0.3904844745993614,
665
  "epoch": 0.7963246554364471,
666
- "grad_norm": 0.0019989013671875,
667
  "learning_rate": 0.00012195121951219512,
668
- "loss": 7.91027705417946e-05,
669
- "mean_token_accuracy": 1.0,
670
- "num_tokens": 319157.0,
671
  "step": 65
672
  },
673
  {
674
- "entropy": 0.3921838700771332,
675
  "epoch": 0.8085758039816233,
676
- "grad_norm": 0.00177764892578125,
677
  "learning_rate": 0.00012073170731707318,
678
- "loss": 8.364896348211914e-05,
679
- "mean_token_accuracy": 1.0,
680
- "num_tokens": 324681.0,
681
  "step": 66
682
  },
683
  {
684
- "entropy": 0.34572961553931236,
685
  "epoch": 0.8208269525267994,
686
- "grad_norm": 0.061767578125,
687
  "learning_rate": 0.00011951219512195122,
688
- "loss": 0.008409281261265278,
689
- "mean_token_accuracy": 0.9963545724749565,
690
- "num_tokens": 329941.0,
691
  "step": 67
692
  },
693
  {
694
- "entropy": 0.3841299172490835,
695
  "epoch": 0.8330781010719756,
696
- "grad_norm": 0.01123046875,
697
  "learning_rate": 0.00011829268292682926,
698
- "loss": 0.00017956709780264646,
699
- "mean_token_accuracy": 1.0,
700
- "num_tokens": 334486.0,
701
  "step": 68
702
  },
703
  {
704
- "entropy": 0.39541577361524105,
705
  "epoch": 0.8453292496171516,
706
- "grad_norm": 0.00274658203125,
707
  "learning_rate": 0.00011707317073170732,
708
- "loss": 0.00012585960212163627,
709
- "mean_token_accuracy": 1.0,
710
- "num_tokens": 338183.0,
711
  "step": 69
712
  },
713
  {
714
- "entropy": 0.4046988161280751,
715
  "epoch": 0.8575803981623277,
716
- "grad_norm": 0.126953125,
717
  "learning_rate": 0.00011585365853658536,
718
- "loss": 0.007125813513994217,
719
- "mean_token_accuracy": 0.9981492757797241,
720
- "num_tokens": 342593.0,
721
  "step": 70
722
  },
723
  {
724
- "entropy": 0.40994592756032944,
725
  "epoch": 0.8698315467075038,
726
- "grad_norm": 0.0517578125,
727
  "learning_rate": 0.00011463414634146342,
728
- "loss": 0.0006066925125196576,
729
- "mean_token_accuracy": 0.9997807033360004,
730
- "num_tokens": 347797.0,
731
  "step": 71
732
  },
733
  {
734
- "entropy": 0.3796220198273659,
735
  "epoch": 0.8820826952526799,
736
- "grad_norm": 0.006103515625,
737
  "learning_rate": 0.00011341463414634146,
738
- "loss": 0.00017896694771479815,
739
- "mean_token_accuracy": 1.0,
740
- "num_tokens": 352121.0,
741
  "step": 72
742
  },
743
  {
744
- "entropy": 0.3931356444954872,
745
  "epoch": 0.8943338437978561,
746
- "grad_norm": 0.0181884765625,
747
  "learning_rate": 0.00011219512195121953,
748
- "loss": 0.0010632644407451153,
749
- "mean_token_accuracy": 0.9997568093240261,
750
- "num_tokens": 357943.0,
751
  "step": 73
752
  },
753
  {
754
- "entropy": 0.36392936669290066,
755
  "epoch": 0.9065849923430321,
756
- "grad_norm": 0.024658203125,
757
  "learning_rate": 0.00011097560975609757,
758
- "loss": 0.0006849091150797904,
759
- "mean_token_accuracy": 0.9996345043182373,
760
- "num_tokens": 363814.0,
761
  "step": 74
762
  },
763
  {
764
- "entropy": 0.3864069525152445,
765
  "epoch": 0.9188361408882083,
766
- "grad_norm": 0.000270843505859375,
767
  "learning_rate": 0.00010975609756097563,
768
- "loss": 5.0294114771531895e-05,
769
- "mean_token_accuracy": 1.0,
770
- "num_tokens": 368870.0,
771
  "step": 75
772
  },
773
  {
774
- "entropy": 0.39719677343964577,
775
  "epoch": 0.9310872894333844,
776
- "grad_norm": 0.01519775390625,
777
  "learning_rate": 0.00010853658536585367,
778
- "loss": 0.00048823675024323165,
779
- "mean_token_accuracy": 0.999143835157156,
780
- "num_tokens": 373670.0,
781
  "step": 76
782
  },
783
  {
784
- "entropy": 0.35627279058098793,
785
  "epoch": 0.9433384379785605,
786
- "grad_norm": 0.0074462890625,
787
  "learning_rate": 0.00010731707317073172,
788
- "loss": 0.000174719825736247,
789
- "mean_token_accuracy": 1.0,
790
- "num_tokens": 379037.0,
791
  "step": 77
792
  },
793
  {
794
- "entropy": 0.38681978918612003,
795
  "epoch": 0.9555895865237366,
796
- "grad_norm": 0.0181884765625,
797
  "learning_rate": 0.00010609756097560977,
798
- "loss": 0.000976942596025765,
799
- "mean_token_accuracy": 0.9992977529764175,
800
- "num_tokens": 384252.0,
801
  "step": 78
802
  },
803
  {
804
- "entropy": 0.3772548586130142,
805
  "epoch": 0.9678407350689127,
806
- "grad_norm": 0.000904083251953125,
807
  "learning_rate": 0.00010487804878048781,
808
- "loss": 6.608536932617426e-05,
809
- "mean_token_accuracy": 1.0,
810
- "num_tokens": 388347.0,
811
  "step": 79
812
  },
813
  {
814
- "entropy": 0.3597776433452964,
815
  "epoch": 0.9800918836140888,
816
- "grad_norm": 0.010986328125,
817
  "learning_rate": 0.00010365853658536586,
818
- "loss": 0.0007963755051605403,
819
- "mean_token_accuracy": 0.999015748500824,
820
- "num_tokens": 394213.0,
821
  "step": 80
822
  },
823
  {
824
- "entropy": 0.3731031287461519,
825
  "epoch": 0.9923430321592649,
826
- "grad_norm": 0.00115966796875,
827
  "learning_rate": 0.0001024390243902439,
828
- "loss": 8.310518751386553e-05,
829
- "mean_token_accuracy": 1.0,
830
- "num_tokens": 399113.0,
831
  "step": 81
832
  },
833
  {
834
- "entropy": 0.37349462509155273,
835
  "epoch": 1.0,
836
- "grad_norm": 0.00022125244140625,
837
  "learning_rate": 0.00010121951219512196,
838
- "loss": 4.093759343959391e-05,
839
- "mean_token_accuracy": 1.0,
840
- "num_tokens": 402129.0,
841
  "step": 82
842
  },
843
  {
844
- "entropy": 0.38408348336815834,
845
  "epoch": 1.0122511485451762,
846
- "grad_norm": 0.027099609375,
847
  "learning_rate": 0.0001,
848
- "loss": 0.0015746817225590348,
849
- "mean_token_accuracy": 0.9996279776096344,
850
- "num_tokens": 406760.0,
851
  "step": 83
852
  },
853
  {
854
- "entropy": 0.36415083333849907,
855
  "epoch": 1.0245022970903521,
856
- "grad_norm": 0.0032501220703125,
857
  "learning_rate": 9.878048780487805e-05,
858
- "loss": 0.00011362869554432109,
859
- "mean_token_accuracy": 1.0,
860
- "num_tokens": 411366.0,
861
  "step": 84
862
  },
863
  {
864
- "entropy": 0.3951573334634304,
865
  "epoch": 1.0367534456355283,
866
- "grad_norm": 0.0018768310546875,
867
  "learning_rate": 9.75609756097561e-05,
868
- "loss": 8.601781155448407e-05,
869
- "mean_token_accuracy": 1.0,
870
- "num_tokens": 417767.0,
871
  "step": 85
872
  },
873
  {
874
- "entropy": 0.3533172570168972,
875
  "epoch": 1.0490045941807045,
876
- "grad_norm": 0.00146484375,
877
  "learning_rate": 9.634146341463415e-05,
878
- "loss": 5.874271664652042e-05,
879
- "mean_token_accuracy": 1.0,
880
- "num_tokens": 421737.0,
881
  "step": 86
882
  },
883
  {
884
- "entropy": 0.35251205042004585,
885
  "epoch": 1.0612557427258806,
886
- "grad_norm": 6.008148193359375e-05,
887
  "learning_rate": 9.51219512195122e-05,
888
- "loss": 2.1197016394580714e-05,
889
- "mean_token_accuracy": 1.0,
890
- "num_tokens": 426853.0,
891
  "step": 87
892
  },
893
  {
894
- "entropy": 0.42304582707583904,
895
  "epoch": 1.0735068912710566,
896
- "grad_norm": 0.000797271728515625,
897
  "learning_rate": 9.390243902439024e-05,
898
- "loss": 6.177897739689797e-05,
899
- "mean_token_accuracy": 1.0,
900
- "num_tokens": 431082.0,
901
  "step": 88
902
  },
903
  {
904
- "entropy": 0.39542090706527233,
905
  "epoch": 1.0857580398162328,
906
- "grad_norm": 0.041015625,
907
  "learning_rate": 9.26829268292683e-05,
908
- "loss": 0.0009606232051737607,
909
- "mean_token_accuracy": 1.0,
910
- "num_tokens": 435693.0,
911
  "step": 89
912
  },
913
  {
914
- "entropy": 0.37046173214912415,
915
  "epoch": 1.098009188361409,
916
- "grad_norm": 0.000278472900390625,
917
  "learning_rate": 9.146341463414635e-05,
918
- "loss": 4.265129246050492e-05,
919
- "mean_token_accuracy": 1.0,
920
- "num_tokens": 440577.0,
921
  "step": 90
922
  },
923
  {
924
- "entropy": 0.3931607408449054,
925
  "epoch": 1.110260336906585,
926
- "grad_norm": 0.035400390625,
927
  "learning_rate": 9.02439024390244e-05,
928
- "loss": 0.004250116180628538,
929
- "mean_token_accuracy": 0.9994369372725487,
930
- "num_tokens": 445265.0,
931
  "step": 91
932
  },
933
  {
934
- "entropy": 0.3917137086391449,
935
  "epoch": 1.122511485451761,
936
- "grad_norm": 0.0419921875,
937
  "learning_rate": 8.902439024390244e-05,
938
- "loss": 0.002317648846656084,
939
- "mean_token_accuracy": 0.9992785975337029,
940
- "num_tokens": 450020.0,
941
  "step": 92
942
  },
943
  {
944
- "entropy": 0.3758338335901499,
945
  "epoch": 1.1347626339969372,
946
- "grad_norm": 0.0196533203125,
947
  "learning_rate": 8.78048780487805e-05,
948
- "loss": 0.0006808089674450457,
949
- "mean_token_accuracy": 0.999522902071476,
950
- "num_tokens": 455003.0,
951
  "step": 93
952
  },
953
  {
954
- "entropy": 0.383782709017396,
955
  "epoch": 1.1470137825421134,
956
- "grad_norm": 0.0034027099609375,
957
  "learning_rate": 8.658536585365854e-05,
958
- "loss": 7.263245788635686e-05,
959
- "mean_token_accuracy": 1.0,
960
- "num_tokens": 459698.0,
961
  "step": 94
962
  },
963
  {
964
- "entropy": 0.3821055982261896,
965
  "epoch": 1.1592649310872893,
966
- "grad_norm": 0.0004138946533203125,
967
  "learning_rate": 8.53658536585366e-05,
968
- "loss": 3.771902629523538e-05,
969
- "mean_token_accuracy": 1.0,
970
- "num_tokens": 464337.0,
971
  "step": 95
972
  },
973
  {
974
- "entropy": 0.3649219311773777,
975
  "epoch": 1.1715160796324655,
976
- "grad_norm": 0.00872802734375,
977
  "learning_rate": 8.414634146341464e-05,
978
- "loss": 0.0004717935808002949,
979
- "mean_token_accuracy": 1.0,
980
- "num_tokens": 468882.0,
981
  "step": 96
982
  },
983
  {
984
- "entropy": 0.3700664434581995,
985
  "epoch": 1.1837672281776417,
986
- "grad_norm": 0.00015544891357421875,
987
  "learning_rate": 8.292682926829268e-05,
988
- "loss": 3.247045970056206e-05,
989
- "mean_token_accuracy": 1.0,
990
- "num_tokens": 473756.0,
991
  "step": 97
992
  },
993
  {
994
- "entropy": 0.3915936965495348,
995
  "epoch": 1.1960183767228179,
996
- "grad_norm": 0.05078125,
997
  "learning_rate": 8.170731707317073e-05,
998
- "loss": 0.005024694371968508,
999
- "mean_token_accuracy": 0.9996565915644169,
1000
- "num_tokens": 479061.0,
1001
  "step": 98
1002
  },
1003
  {
1004
- "entropy": 0.4096358586102724,
1005
  "epoch": 1.2082695252679938,
1006
- "grad_norm": 0.00144195556640625,
1007
  "learning_rate": 8.048780487804879e-05,
1008
- "loss": 4.485135286813602e-05,
1009
- "mean_token_accuracy": 1.0,
1010
- "num_tokens": 484835.0,
1011
  "step": 99
1012
  },
1013
  {
1014
- "entropy": 0.35138822346925735,
1015
  "epoch": 1.22052067381317,
1016
- "grad_norm": 0.0038299560546875,
1017
  "learning_rate": 7.926829268292683e-05,
1018
- "loss": 0.00019770213111769408,
1019
- "mean_token_accuracy": 1.0,
1020
- "num_tokens": 489546.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
- "eval_entropy": 0.3780687239722929,
1026
- "eval_loss": 0.00034746917663142085,
1027
- "eval_mean_token_accuracy": 0.9999171840971794,
1028
- "eval_num_tokens": 489546.0,
1029
- "eval_runtime": 50.9982,
1030
- "eval_samples_per_second": 1.353,
1031
- "eval_steps_per_second": 1.353,
1032
  "step": 100
1033
  },
1034
  {
1035
- "entropy": 0.4012060575187206,
1036
  "epoch": 1.2327718223583461,
1037
- "grad_norm": 0.000217437744140625,
1038
  "learning_rate": 7.804878048780489e-05,
1039
- "loss": 3.667730197776109e-05,
1040
- "mean_token_accuracy": 1.0,
1041
- "num_tokens": 494781.0,
1042
  "step": 101
1043
  },
1044
  {
1045
- "entropy": 0.37181732058525085,
1046
  "epoch": 1.245022970903522,
1047
- "grad_norm": 0.0002155303955078125,
1048
  "learning_rate": 7.682926829268293e-05,
1049
- "loss": 2.923922693298664e-05,
1050
- "mean_token_accuracy": 1.0,
1051
- "num_tokens": 499861.0,
1052
  "step": 102
1053
  },
1054
  {
1055
- "entropy": 0.38948795571923256,
1056
  "epoch": 1.2572741194486983,
1057
- "grad_norm": 6.866455078125e-05,
1058
  "learning_rate": 7.560975609756099e-05,
1059
- "loss": 3.10177420033142e-05,
1060
- "mean_token_accuracy": 1.0,
1061
- "num_tokens": 505291.0,
1062
  "step": 103
1063
  },
1064
  {
1065
- "entropy": 0.3776157572865486,
1066
  "epoch": 1.2695252679938744,
1067
- "grad_norm": 0.00012874603271484375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
- "loss": 2.8559963539009914e-05,
1070
- "mean_token_accuracy": 1.0,
1071
- "num_tokens": 510284.0,
1072
  "step": 104
1073
  },
1074
  {
1075
- "entropy": 0.3941178657114506,
1076
  "epoch": 1.2817764165390506,
1077
- "grad_norm": 0.007232666015625,
1078
  "learning_rate": 7.317073170731707e-05,
1079
- "loss": 0.0008174990070983768,
1080
- "mean_token_accuracy": 1.0,
1081
- "num_tokens": 514517.0,
1082
  "step": 105
1083
  },
1084
  {
1085
- "entropy": 0.3697250857949257,
1086
  "epoch": 1.2940275650842268,
1087
- "grad_norm": 0.003143310546875,
1088
  "learning_rate": 7.195121951219513e-05,
1089
- "loss": 0.00010880863555939868,
1090
- "mean_token_accuracy": 1.0,
1091
- "num_tokens": 519535.0,
1092
  "step": 106
1093
  },
1094
  {
1095
- "entropy": 0.3888526763767004,
1096
  "epoch": 1.3062787136294027,
1097
- "grad_norm": 0.00054931640625,
1098
  "learning_rate": 7.073170731707317e-05,
1099
- "loss": 5.111205973662436e-05,
1100
- "mean_token_accuracy": 1.0,
1101
- "num_tokens": 524397.0,
1102
  "step": 107
1103
  },
1104
  {
1105
- "entropy": 0.3866258058696985,
1106
  "epoch": 1.318529862174579,
1107
- "grad_norm": 0.0004100799560546875,
1108
  "learning_rate": 6.951219512195122e-05,
1109
- "loss": 3.999587715952657e-05,
1110
- "mean_token_accuracy": 1.0,
1111
- "num_tokens": 528997.0,
1112
  "step": 108
1113
  },
1114
  {
1115
- "entropy": 0.3921303730458021,
1116
  "epoch": 1.3307810107197549,
1117
- "grad_norm": 0.000885009765625,
1118
  "learning_rate": 6.829268292682928e-05,
1119
- "loss": 6.128583481768146e-05,
1120
- "mean_token_accuracy": 1.0,
1121
- "num_tokens": 533965.0,
1122
  "step": 109
1123
  },
1124
  {
1125
- "entropy": 0.3705854155123234,
1126
  "epoch": 1.343032159264931,
1127
- "grad_norm": 0.002960205078125,
1128
  "learning_rate": 6.707317073170732e-05,
1129
- "loss": 7.792656106175855e-05,
1130
- "mean_token_accuracy": 1.0,
1131
- "num_tokens": 539387.0,
1132
  "step": 110
1133
  },
1134
  {
1135
- "entropy": 0.3712622048333287,
1136
  "epoch": 1.3552833078101072,
1137
- "grad_norm": 0.00089263916015625,
1138
  "learning_rate": 6.585365853658538e-05,
1139
- "loss": 4.521696246229112e-05,
1140
- "mean_token_accuracy": 1.0,
1141
- "num_tokens": 543755.0,
1142
  "step": 111
1143
  },
1144
  {
1145
- "entropy": 0.40867704525589943,
1146
  "epoch": 1.3675344563552834,
1147
- "grad_norm": 0.023193359375,
1148
  "learning_rate": 6.463414634146342e-05,
1149
- "loss": 0.003280676668509841,
1150
- "mean_token_accuracy": 0.9978448264300823,
1151
- "num_tokens": 548188.0,
1152
  "step": 112
1153
  },
1154
  {
1155
- "entropy": 0.3910982459783554,
1156
  "epoch": 1.3797856049004595,
1157
- "grad_norm": 0.0028533935546875,
1158
  "learning_rate": 6.341463414634146e-05,
1159
- "loss": 0.00015341158723458648,
1160
- "mean_token_accuracy": 1.0,
1161
- "num_tokens": 553717.0,
1162
  "step": 113
1163
  },
1164
  {
1165
- "entropy": 0.3753495467826724,
1166
  "epoch": 1.3920367534456355,
1167
- "grad_norm": 6.866455078125e-05,
1168
  "learning_rate": 6.219512195121952e-05,
1169
- "loss": 2.554376442276407e-05,
1170
- "mean_token_accuracy": 1.0,
1171
- "num_tokens": 558501.0,
1172
  "step": 114
1173
  },
1174
  {
1175
- "entropy": 0.3936616498976946,
1176
  "epoch": 1.4042879019908117,
1177
- "grad_norm": 0.000774383544921875,
1178
  "learning_rate": 6.097560975609756e-05,
1179
- "loss": 4.565157360048033e-05,
1180
- "mean_token_accuracy": 1.0,
1181
- "num_tokens": 563989.0,
1182
  "step": 115
1183
  },
1184
  {
1185
- "entropy": 0.4080927763134241,
1186
  "epoch": 1.4165390505359878,
1187
- "grad_norm": 0.000728607177734375,
1188
  "learning_rate": 5.975609756097561e-05,
1189
- "loss": 5.44461581739597e-05,
1190
- "mean_token_accuracy": 1.0,
1191
- "num_tokens": 568327.0,
1192
  "step": 116
1193
  },
1194
  {
1195
- "entropy": 0.36639871448278427,
1196
  "epoch": 1.4287901990811638,
1197
- "grad_norm": 0.000457763671875,
1198
  "learning_rate": 5.853658536585366e-05,
1199
- "loss": 3.381741407793015e-05,
1200
- "mean_token_accuracy": 1.0,
1201
- "num_tokens": 572919.0,
1202
  "step": 117
1203
  },
1204
  {
1205
- "entropy": 0.4015892669558525,
1206
  "epoch": 1.44104134762634,
1207
- "grad_norm": 0.00017833709716796875,
1208
  "learning_rate": 5.731707317073171e-05,
1209
- "loss": 4.158892625127919e-05,
1210
- "mean_token_accuracy": 1.0,
1211
- "num_tokens": 577916.0,
1212
  "step": 118
1213
  },
1214
  {
1215
- "entropy": 0.40410150960087776,
1216
  "epoch": 1.4532924961715161,
1217
- "grad_norm": 0.000621795654296875,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
- "loss": 2.5736055249581113e-05,
1220
- "mean_token_accuracy": 1.0,
1221
- "num_tokens": 583152.0,
1222
  "step": 119
1223
  },
1224
  {
1225
- "entropy": 0.40528898034244776,
1226
  "epoch": 1.4655436447166923,
1227
- "grad_norm": 0.01953125,
1228
  "learning_rate": 5.487804878048781e-05,
1229
- "loss": 0.00020874114125035703,
1230
- "mean_token_accuracy": 1.0,
1231
- "num_tokens": 587880.0,
1232
  "step": 120
1233
  },
1234
  {
1235
- "entropy": 0.35937592945992947,
1236
  "epoch": 1.4777947932618682,
1237
- "grad_norm": 0.083984375,
1238
  "learning_rate": 5.365853658536586e-05,
1239
- "loss": 0.007331337314099073,
1240
- "mean_token_accuracy": 0.9991379305720329,
1241
- "num_tokens": 592284.0,
1242
  "step": 121
1243
  },
1244
  {
1245
- "entropy": 0.3928218297660351,
1246
  "epoch": 1.4900459418070444,
1247
- "grad_norm": 0.00013446807861328125,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
- "loss": 2.927147943410091e-05,
1250
- "mean_token_accuracy": 1.0,
1251
- "num_tokens": 597046.0,
1252
  "step": 122
1253
  },
1254
  {
1255
- "entropy": 0.3777940608561039,
1256
  "epoch": 1.5022970903522204,
1257
- "grad_norm": 0.000579833984375,
1258
  "learning_rate": 5.121951219512195e-05,
1259
- "loss": 6.0145219322294e-05,
1260
- "mean_token_accuracy": 1.0,
1261
- "num_tokens": 601350.0,
1262
  "step": 123
1263
  },
1264
  {
1265
- "entropy": 0.39830240048468113,
1266
  "epoch": 1.5145482388973965,
1267
- "grad_norm": 0.0245361328125,
1268
  "learning_rate": 5e-05,
1269
- "loss": 0.00029612769139930606,
1270
- "mean_token_accuracy": 1.0,
1271
- "num_tokens": 606643.0,
1272
  "step": 124
1273
  },
1274
  {
1275
- "entropy": 0.3925098739564419,
1276
  "epoch": 1.5267993874425727,
1277
- "grad_norm": 0.0004749298095703125,
1278
  "learning_rate": 4.878048780487805e-05,
1279
- "loss": 4.631431511370465e-05,
1280
- "mean_token_accuracy": 1.0,
1281
- "num_tokens": 612405.0,
1282
  "step": 125
1283
  },
1284
  {
1285
- "entropy": 0.3956710360944271,
1286
  "epoch": 1.5390505359877489,
1287
- "grad_norm": 0.00634765625,
1288
  "learning_rate": 4.75609756097561e-05,
1289
- "loss": 8.446360880043358e-05,
1290
- "mean_token_accuracy": 1.0,
1291
- "num_tokens": 617227.0,
1292
  "step": 126
1293
  },
1294
  {
1295
- "entropy": 0.430975291877985,
1296
  "epoch": 1.551301684532925,
1297
- "grad_norm": 0.000518798828125,
1298
  "learning_rate": 4.634146341463415e-05,
1299
- "loss": 6.132836278993636e-05,
1300
- "mean_token_accuracy": 1.0,
1301
- "num_tokens": 622353.0,
1302
  "step": 127
1303
  },
1304
  {
1305
- "entropy": 0.4242272228002548,
1306
  "epoch": 1.5635528330781012,
1307
- "grad_norm": 0.0025177001953125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
- "loss": 0.00011561957217054442,
1310
- "mean_token_accuracy": 1.0,
1311
- "num_tokens": 627267.0,
1312
  "step": 128
1313
  },
1314
  {
1315
- "entropy": 0.3710012398660183,
1316
  "epoch": 1.5758039816232772,
1317
- "grad_norm": 0.002777099609375,
1318
  "learning_rate": 4.390243902439025e-05,
1319
- "loss": 0.00010202911653323099,
1320
- "mean_token_accuracy": 1.0,
1321
- "num_tokens": 631452.0,
1322
  "step": 129
1323
  },
1324
  {
1325
- "entropy": 0.35699679516255856,
1326
  "epoch": 1.5880551301684533,
1327
- "grad_norm": 0.00023651123046875,
1328
  "learning_rate": 4.26829268292683e-05,
1329
- "loss": 5.903129203943536e-05,
1330
- "mean_token_accuracy": 1.0,
1331
- "num_tokens": 636500.0,
1332
  "step": 130
1333
  },
1334
  {
1335
- "entropy": 0.39619251526892185,
1336
  "epoch": 1.6003062787136293,
1337
- "grad_norm": 0.0230712890625,
1338
  "learning_rate": 4.146341463414634e-05,
1339
- "loss": 0.0031676713842898607,
1340
- "mean_token_accuracy": 0.9987796545028687,
1341
- "num_tokens": 641262.0,
1342
  "step": 131
1343
  },
1344
  {
1345
- "entropy": 0.40411114878952503,
1346
  "epoch": 1.6125574272588055,
1347
- "grad_norm": 0.0361328125,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
- "loss": 0.0015652105212211609,
1350
- "mean_token_accuracy": 0.999205507338047,
1351
- "num_tokens": 646375.0,
1352
  "step": 132
1353
  },
1354
  {
1355
- "entropy": 0.3453770913183689,
1356
  "epoch": 1.6248085758039816,
1357
- "grad_norm": 8.440017700195312e-05,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
- "loss": 3.279931843280792e-05,
1360
- "mean_token_accuracy": 1.0,
1361
- "num_tokens": 650765.0,
1362
  "step": 133
1363
  },
1364
  {
1365
- "entropy": 0.37724466249346733,
1366
  "epoch": 1.6370597243491578,
1367
- "grad_norm": 0.00142669677734375,
1368
  "learning_rate": 3.780487804878049e-05,
1369
- "loss": 5.4958236432867125e-05,
1370
- "mean_token_accuracy": 1.0,
1371
- "num_tokens": 655167.0,
1372
  "step": 134
1373
  },
1374
  {
1375
- "entropy": 0.39796170592308044,
1376
  "epoch": 1.649310872894334,
1377
- "grad_norm": 0.0003986358642578125,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
- "loss": 3.9815466152504086e-05,
1380
- "mean_token_accuracy": 1.0,
1381
- "num_tokens": 660288.0,
1382
  "step": 135
1383
  },
1384
  {
1385
- "entropy": 0.4333613757044077,
1386
  "epoch": 1.66156202143951,
1387
- "grad_norm": 0.0001544952392578125,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
- "loss": 4.787950456375256e-05,
1390
- "mean_token_accuracy": 1.0,
1391
- "num_tokens": 664471.0,
1392
  "step": 136
1393
  },
1394
  {
1395
- "entropy": 0.41916552372276783,
1396
  "epoch": 1.673813169984686,
1397
- "grad_norm": 0.0002899169921875,
1398
  "learning_rate": 3.414634146341464e-05,
1399
- "loss": 4.767990321852267e-05,
1400
- "mean_token_accuracy": 1.0,
1401
- "num_tokens": 669354.0,
1402
  "step": 137
1403
  },
1404
  {
1405
- "entropy": 0.3999825790524483,
1406
  "epoch": 1.686064318529862,
1407
- "grad_norm": 0.0026397705078125,
1408
  "learning_rate": 3.292682926829269e-05,
1409
- "loss": 0.0001605500146979466,
1410
- "mean_token_accuracy": 1.0,
1411
- "num_tokens": 674909.0,
1412
  "step": 138
1413
  },
1414
  {
1415
- "entropy": 0.39421058259904385,
1416
  "epoch": 1.6983154670750382,
1417
- "grad_norm": 0.005767822265625,
1418
  "learning_rate": 3.170731707317073e-05,
1419
- "loss": 0.00022102531511336565,
1420
- "mean_token_accuracy": 1.0,
1421
- "num_tokens": 679690.0,
1422
  "step": 139
1423
  },
1424
  {
1425
- "entropy": 0.4142182134091854,
1426
  "epoch": 1.7105666156202144,
1427
- "grad_norm": 0.003631591796875,
1428
  "learning_rate": 3.048780487804878e-05,
1429
- "loss": 0.00014472004841081798,
1430
- "mean_token_accuracy": 1.0,
1431
- "num_tokens": 685046.0,
1432
  "step": 140
1433
  },
1434
  {
1435
- "entropy": 0.3982192352414131,
1436
  "epoch": 1.7228177641653906,
1437
- "grad_norm": 0.00019168853759765625,
1438
  "learning_rate": 2.926829268292683e-05,
1439
- "loss": 4.7273264499381185e-05,
1440
- "mean_token_accuracy": 1.0,
1441
- "num_tokens": 689394.0,
1442
  "step": 141
1443
  },
1444
  {
1445
- "entropy": 0.4133493732661009,
1446
  "epoch": 1.7350689127105667,
1447
- "grad_norm": 0.00701904296875,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
- "loss": 9.296434291172773e-05,
1450
- "mean_token_accuracy": 1.0,
1451
- "num_tokens": 693187.0,
1452
  "step": 142
1453
  },
1454
  {
1455
- "entropy": 0.40933855436742306,
1456
  "epoch": 1.7473200612557427,
1457
- "grad_norm": 0.0019683837890625,
1458
  "learning_rate": 2.682926829268293e-05,
1459
- "loss": 9.476351988269016e-05,
1460
- "mean_token_accuracy": 1.0,
1461
- "num_tokens": 697601.0,
1462
  "step": 143
1463
  },
1464
  {
1465
- "entropy": 0.41714910976588726,
1466
  "epoch": 1.7595712098009189,
1467
- "grad_norm": 0.045166015625,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
- "loss": 0.0034146099351346493,
1470
- "mean_token_accuracy": 0.9998650103807449,
1471
- "num_tokens": 703048.0,
1472
  "step": 144
1473
  },
1474
  {
1475
- "entropy": 0.40594901144504547,
1476
  "epoch": 1.7718223583460948,
1477
- "grad_norm": 0.02587890625,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
- "loss": 0.001274456619285047,
1480
- "mean_token_accuracy": 0.999015748500824,
1481
- "num_tokens": 707860.0,
1482
  "step": 145
1483
  },
1484
  {
1485
- "entropy": 0.41635255329310894,
1486
  "epoch": 1.784073506891271,
1487
- "grad_norm": 0.000156402587890625,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
- "loss": 5.037836672272533e-05,
1490
- "mean_token_accuracy": 1.0,
1491
- "num_tokens": 712819.0,
1492
  "step": 146
1493
  },
1494
  {
1495
- "entropy": 0.4038653904572129,
1496
  "epoch": 1.7963246554364471,
1497
- "grad_norm": 0.0004100799560546875,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
- "loss": 4.163683479418978e-05,
1500
- "mean_token_accuracy": 1.0,
1501
- "num_tokens": 718451.0,
1502
  "step": 147
1503
  },
1504
  {
1505
- "entropy": 0.4069879539310932,
1506
  "epoch": 1.8085758039816233,
1507
- "grad_norm": 7.104873657226562e-05,
1508
  "learning_rate": 2.073170731707317e-05,
1509
- "loss": 3.6120818549534306e-05,
1510
- "mean_token_accuracy": 1.0,
1511
- "num_tokens": 723808.0,
1512
  "step": 148
1513
  },
1514
  {
1515
- "entropy": 0.4381860624998808,
1516
  "epoch": 1.8208269525267995,
1517
- "grad_norm": 9.870529174804688e-05,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
- "loss": 4.433648064150475e-05,
1520
- "mean_token_accuracy": 1.0,
1521
- "num_tokens": 728124.0,
1522
  "step": 149
1523
  },
1524
  {
1525
- "entropy": 0.42220813781023026,
1526
  "epoch": 1.8330781010719757,
1527
- "grad_norm": 0.005401611328125,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
- "loss": 0.00017107791791204363,
1530
- "mean_token_accuracy": 1.0,
1531
- "num_tokens": 733915.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
- "eval_entropy": 0.4000617520532746,
1537
- "eval_loss": 0.00023719228920526803,
1538
- "eval_mean_token_accuracy": 0.9998813841653906,
1539
- "eval_num_tokens": 733915.0,
1540
- "eval_runtime": 50.9031,
1541
- "eval_samples_per_second": 1.356,
1542
- "eval_steps_per_second": 1.356,
1543
  "step": 150
1544
  },
1545
  {
1546
- "entropy": 0.3884127251803875,
1547
  "epoch": 1.8453292496171516,
1548
- "grad_norm": 0.0010986328125,
1549
  "learning_rate": 1.707317073170732e-05,
1550
- "loss": 6.602725625270978e-05,
1551
- "mean_token_accuracy": 1.0,
1552
- "num_tokens": 738158.0,
1553
  "step": 151
1554
  },
1555
  {
1556
- "entropy": 0.43312329426407814,
1557
  "epoch": 1.8575803981623276,
1558
- "grad_norm": 0.00109100341796875,
1559
  "learning_rate": 1.5853658536585366e-05,
1560
- "loss": 6.0493410273920745e-05,
1561
- "mean_token_accuracy": 1.0,
1562
- "num_tokens": 743914.0,
1563
  "step": 152
1564
  },
1565
  {
1566
- "entropy": 0.3950846865773201,
1567
  "epoch": 1.8698315467075037,
1568
- "grad_norm": 9.393692016601562e-05,
1569
  "learning_rate": 1.4634146341463415e-05,
1570
- "loss": 3.80194433091674e-05,
1571
- "mean_token_accuracy": 1.0,
1572
- "num_tokens": 749555.0,
1573
  "step": 153
1574
  },
1575
  {
1576
- "entropy": 0.41627938114106655,
1577
  "epoch": 1.88208269525268,
1578
- "grad_norm": 0.00018405914306640625,
1579
  "learning_rate": 1.3414634146341466e-05,
1580
- "loss": 3.8107638829387724e-05,
1581
- "mean_token_accuracy": 1.0,
1582
- "num_tokens": 755347.0,
1583
  "step": 154
1584
  },
1585
  {
1586
- "entropy": 0.44089478626847267,
1587
  "epoch": 1.894333843797856,
1588
- "grad_norm": 0.00029754638671875,
1589
  "learning_rate": 1.2195121951219513e-05,
1590
- "loss": 4.713048838311806e-05,
1591
- "mean_token_accuracy": 1.0,
1592
- "num_tokens": 760505.0,
1593
  "step": 155
1594
  },
1595
  {
1596
- "entropy": 0.4184252228587866,
1597
  "epoch": 1.9065849923430322,
1598
- "grad_norm": 0.000301361083984375,
1599
  "learning_rate": 1.0975609756097562e-05,
1600
- "loss": 5.365146716940217e-05,
1601
- "mean_token_accuracy": 1.0,
1602
- "num_tokens": 765413.0,
1603
  "step": 156
1604
  },
1605
  {
1606
- "entropy": 0.41183059848845005,
1607
  "epoch": 1.9188361408882084,
1608
- "grad_norm": 0.0257568359375,
1609
  "learning_rate": 9.756097560975611e-06,
1610
- "loss": 0.0007220981642603874,
1611
- "mean_token_accuracy": 0.9997568093240261,
1612
- "num_tokens": 770650.0,
1613
  "step": 157
1614
  },
1615
  {
1616
- "entropy": 0.4214022643864155,
1617
  "epoch": 1.9310872894333844,
1618
- "grad_norm": 0.000244140625,
1619
  "learning_rate": 8.53658536585366e-06,
1620
- "loss": 4.793080370291136e-05,
1621
- "mean_token_accuracy": 1.0,
1622
- "num_tokens": 775650.0,
1623
  "step": 158
1624
  },
1625
  {
1626
- "entropy": 0.4058344177901745,
1627
  "epoch": 1.9433384379785605,
1628
- "grad_norm": 0.000362396240234375,
1629
  "learning_rate": 7.317073170731707e-06,
1630
- "loss": 6.000606663292274e-05,
1631
- "mean_token_accuracy": 1.0,
1632
- "num_tokens": 779848.0,
1633
  "step": 159
1634
  },
1635
  {
1636
- "entropy": 0.41280501522123814,
1637
  "epoch": 1.9555895865237365,
1638
- "grad_norm": 0.00061798095703125,
1639
  "learning_rate": 6.0975609756097564e-06,
1640
- "loss": 8.753919246373698e-05,
1641
- "mean_token_accuracy": 1.0,
1642
- "num_tokens": 784522.0,
1643
  "step": 160
1644
  },
1645
  {
1646
- "entropy": 0.4334367923438549,
1647
  "epoch": 1.9678407350689127,
1648
- "grad_norm": 0.0003643035888671875,
1649
  "learning_rate": 4.8780487804878055e-06,
1650
- "loss": 6.043446410330944e-05,
1651
- "mean_token_accuracy": 1.0,
1652
- "num_tokens": 790217.0,
1653
  "step": 161
1654
  },
1655
  {
1656
- "entropy": 0.41199295595288277,
1657
  "epoch": 1.9800918836140888,
1658
- "grad_norm": 0.00101470947265625,
1659
  "learning_rate": 3.6585365853658537e-06,
1660
- "loss": 9.327918814960867e-05,
1661
- "mean_token_accuracy": 1.0,
1662
- "num_tokens": 794982.0,
1663
  "step": 162
1664
  },
1665
  {
1666
- "entropy": 0.43511078506708145,
1667
  "epoch": 1.992343032159265,
1668
- "grad_norm": 0.000171661376953125,
1669
  "learning_rate": 2.4390243902439027e-06,
1670
- "loss": 4.208434984320775e-05,
1671
- "mean_token_accuracy": 1.0,
1672
- "num_tokens": 800602.0,
1673
  "step": 163
1674
  },
1675
  {
1676
- "entropy": 0.42249701023101804,
1677
  "epoch": 2.0,
1678
- "grad_norm": 0.037353515625,
1679
  "learning_rate": 1.2195121951219514e-06,
1680
- "loss": 0.0015817588428035378,
1681
- "mean_token_accuracy": 0.9995260655879974,
1682
- "num_tokens": 804258.0,
1683
  "step": 164
1684
  }
1685
  ],
@@ -1700,7 +1700,7 @@
1700
  "attributes": {}
1701
  }
1702
  },
1703
- "total_flos": 3.641777239390618e+16,
1704
  "train_batch_size": 1,
1705
  "trial_name": null,
1706
  "trial_params": null
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 0.2316489452496171,
14
  "epoch": 0.01225114854517611,
15
+ "grad_norm": 1.21875,
16
  "learning_rate": 0.0002,
17
+ "loss": 0.1141367182135582,
18
+ "mean_token_accuracy": 0.962372187525034,
19
+ "num_tokens": 6133.0,
20
  "step": 1
21
  },
22
  {
23
+ "entropy": 0.2494401354342699,
24
  "epoch": 0.02450229709035222,
25
+ "grad_norm": 0.59765625,
26
  "learning_rate": 0.00019878048780487805,
27
+ "loss": 0.07354862987995148,
28
+ "mean_token_accuracy": 0.9755491837859154,
29
+ "num_tokens": 12088.0,
30
  "step": 2
31
  },
32
  {
33
+ "entropy": 0.31152926199138165,
34
  "epoch": 0.036753445635528334,
35
+ "grad_norm": 0.306640625,
36
  "learning_rate": 0.0001975609756097561,
37
+ "loss": 0.06412772834300995,
38
+ "mean_token_accuracy": 0.978853102773428,
39
+ "num_tokens": 17331.0,
40
  "step": 3
41
  },
42
  {
43
+ "entropy": 0.30638211220502853,
44
  "epoch": 0.04900459418070444,
45
+ "grad_norm": 0.8984375,
46
  "learning_rate": 0.00019634146341463416,
47
+ "loss": 0.08034519106149673,
48
+ "mean_token_accuracy": 0.9723691493272781,
49
+ "num_tokens": 22383.0,
50
  "step": 4
51
  },
52
  {
53
+ "entropy": 0.3171741934493184,
54
  "epoch": 0.06125574272588055,
55
+ "grad_norm": 0.60546875,
56
  "learning_rate": 0.0001951219512195122,
57
+ "loss": 0.07083277404308319,
58
+ "mean_token_accuracy": 0.9742059484124184,
59
+ "num_tokens": 27930.0,
60
  "step": 5
61
  },
62
  {
63
+ "entropy": 0.3094687405973673,
64
  "epoch": 0.07350689127105667,
65
+ "grad_norm": 0.6796875,
66
  "learning_rate": 0.00019390243902439025,
67
+ "loss": 0.08443780243396759,
68
+ "mean_token_accuracy": 0.9732540361583233,
69
+ "num_tokens": 33286.0,
70
  "step": 6
71
  },
72
  {
73
+ "entropy": 0.2914603017270565,
74
  "epoch": 0.08575803981623277,
75
+ "grad_norm": 0.265625,
76
  "learning_rate": 0.0001926829268292683,
77
+ "loss": 0.06558080017566681,
78
+ "mean_token_accuracy": 0.9725310951471329,
79
+ "num_tokens": 39568.0,
80
  "step": 7
81
  },
82
  {
83
+ "entropy": 0.279434559866786,
84
  "epoch": 0.09800918836140889,
85
+ "grad_norm": 0.58984375,
86
  "learning_rate": 0.00019146341463414633,
87
+ "loss": 0.07338608056306839,
88
+ "mean_token_accuracy": 0.9793376848101616,
89
+ "num_tokens": 44597.0,
90
  "step": 8
91
  },
92
  {
93
+ "entropy": 0.27481516171246767,
94
  "epoch": 0.11026033690658499,
95
+ "grad_norm": 0.3125,
96
  "learning_rate": 0.0001902439024390244,
97
+ "loss": 0.06733334064483643,
98
+ "mean_token_accuracy": 0.9732998013496399,
99
+ "num_tokens": 49848.0,
100
  "step": 9
101
  },
102
  {
103
+ "entropy": 0.2752347318455577,
104
  "epoch": 0.1225114854517611,
105
+ "grad_norm": 0.4296875,
106
  "learning_rate": 0.00018902439024390244,
107
+ "loss": 0.08688339591026306,
108
+ "mean_token_accuracy": 0.9711812101304531,
109
+ "num_tokens": 55087.0,
110
  "step": 10
111
  },
112
  {
113
+ "entropy": 0.23697010707110167,
114
  "epoch": 0.13476263399693722,
115
+ "grad_norm": 0.35546875,
116
  "learning_rate": 0.0001878048780487805,
117
+ "loss": 0.09419302642345428,
118
+ "mean_token_accuracy": 0.9671205654740334,
119
+ "num_tokens": 61901.0,
120
  "step": 11
121
  },
122
  {
123
+ "entropy": 0.2767820842564106,
124
  "epoch": 0.14701378254211334,
125
+ "grad_norm": 0.5078125,
126
  "learning_rate": 0.00018658536585365856,
127
+ "loss": 0.09175145626068115,
128
+ "mean_token_accuracy": 0.9672112688422203,
129
+ "num_tokens": 68472.0,
130
  "step": 12
131
  },
132
  {
133
+ "entropy": 0.2712240917608142,
134
  "epoch": 0.15926493108728942,
135
+ "grad_norm": 0.43359375,
136
  "learning_rate": 0.0001853658536585366,
137
+ "loss": 0.1060388907790184,
138
+ "mean_token_accuracy": 0.9682641178369522,
139
+ "num_tokens": 74380.0,
140
  "step": 13
141
  },
142
  {
143
+ "entropy": 0.2655314621515572,
144
  "epoch": 0.17151607963246554,
145
+ "grad_norm": 0.5234375,
146
  "learning_rate": 0.00018414634146341464,
147
+ "loss": 0.09543660283088684,
148
+ "mean_token_accuracy": 0.9580898210406303,
149
+ "num_tokens": 80297.0,
150
  "step": 14
151
  },
152
  {
153
+ "entropy": 0.2568928087130189,
154
  "epoch": 0.18376722817764166,
155
+ "grad_norm": 0.306640625,
156
  "learning_rate": 0.0001829268292682927,
157
+ "loss": 0.05766459181904793,
158
+ "mean_token_accuracy": 0.9795842878520489,
159
+ "num_tokens": 85162.0,
160
  "step": 15
161
  },
162
  {
163
+ "entropy": 0.27691631484776735,
164
  "epoch": 0.19601837672281777,
165
+ "grad_norm": 0.359375,
166
  "learning_rate": 0.00018170731707317075,
167
+ "loss": 0.0939052402973175,
168
+ "mean_token_accuracy": 0.9671713933348656,
169
+ "num_tokens": 90393.0,
170
  "step": 16
171
  },
172
  {
173
+ "entropy": 0.2810298567637801,
174
  "epoch": 0.2082695252679939,
175
+ "grad_norm": 0.26953125,
176
  "learning_rate": 0.0001804878048780488,
177
+ "loss": 0.058892831206321716,
178
+ "mean_token_accuracy": 0.9773643910884857,
179
+ "num_tokens": 95530.0,
180
  "step": 17
181
  },
182
  {
183
+ "entropy": 0.2796283131465316,
184
  "epoch": 0.22052067381316998,
185
+ "grad_norm": 0.345703125,
186
  "learning_rate": 0.00017926829268292684,
187
+ "loss": 0.07744893431663513,
188
+ "mean_token_accuracy": 0.9721782878041267,
189
+ "num_tokens": 101234.0,
190
  "step": 18
191
  },
192
  {
193
+ "entropy": 0.2912421654909849,
194
  "epoch": 0.2327718223583461,
195
+ "grad_norm": 0.48828125,
196
  "learning_rate": 0.00017804878048780488,
197
+ "loss": 0.07593704760074615,
198
+ "mean_token_accuracy": 0.9668422974646091,
199
+ "num_tokens": 107018.0,
200
  "step": 19
201
  },
202
  {
203
+ "entropy": 0.28678335808217525,
204
  "epoch": 0.2450229709035222,
205
+ "grad_norm": 0.337890625,
206
  "learning_rate": 0.00017682926829268295,
207
+ "loss": 0.07227691262960434,
208
+ "mean_token_accuracy": 0.9736582525074482,
209
+ "num_tokens": 112299.0,
210
  "step": 20
211
  },
212
  {
213
+ "entropy": 0.296040833927691,
214
  "epoch": 0.2572741194486983,
215
+ "grad_norm": 0.33203125,
216
  "learning_rate": 0.000175609756097561,
217
+ "loss": 0.07230418920516968,
218
+ "mean_token_accuracy": 0.9750959761440754,
219
+ "num_tokens": 117872.0,
220
  "step": 21
221
  },
222
  {
223
+ "entropy": 0.27195548359304667,
224
  "epoch": 0.26952526799387444,
225
+ "grad_norm": 0.3671875,
226
  "learning_rate": 0.00017439024390243903,
227
+ "loss": 0.08706101030111313,
228
+ "mean_token_accuracy": 0.9771376326680183,
229
+ "num_tokens": 124580.0,
230
  "step": 22
231
  },
232
  {
233
+ "entropy": 0.29904199205338955,
234
  "epoch": 0.28177641653905056,
235
+ "grad_norm": 0.408203125,
236
  "learning_rate": 0.00017317073170731708,
237
+ "loss": 0.0653143897652626,
238
+ "mean_token_accuracy": 0.9760479032993317,
239
+ "num_tokens": 129745.0,
240
  "step": 23
241
  },
242
  {
243
+ "entropy": 0.2986137717962265,
244
  "epoch": 0.29402756508422667,
245
+ "grad_norm": 0.421875,
246
  "learning_rate": 0.00017195121951219512,
247
+ "loss": 0.07193314284086227,
248
+ "mean_token_accuracy": 0.9698839113116264,
249
+ "num_tokens": 135543.0,
250
  "step": 24
251
  },
252
  {
253
+ "entropy": 0.24683671910315752,
254
  "epoch": 0.30627871362940273,
255
+ "grad_norm": 0.37890625,
256
  "learning_rate": 0.0001707317073170732,
257
+ "loss": 0.07017349451780319,
258
+ "mean_token_accuracy": 0.9763788469135761,
259
+ "num_tokens": 141145.0,
260
  "step": 25
261
  },
262
  {
263
+ "entropy": 0.23581106960773468,
264
  "epoch": 0.31852986217457885,
265
+ "grad_norm": 0.349609375,
266
  "learning_rate": 0.00016951219512195123,
267
+ "loss": 0.07848861813545227,
268
+ "mean_token_accuracy": 0.9711455926299095,
269
+ "num_tokens": 146832.0,
270
  "step": 26
271
  },
272
  {
273
+ "entropy": 0.19877766259014606,
274
  "epoch": 0.33078101071975496,
275
+ "grad_norm": 0.32421875,
276
  "learning_rate": 0.00016829268292682927,
277
+ "loss": 0.05964134261012077,
278
+ "mean_token_accuracy": 0.9766620621085167,
279
+ "num_tokens": 153062.0,
280
  "step": 27
281
  },
282
  {
283
+ "entropy": 0.24412551056593657,
284
  "epoch": 0.3430321592649311,
285
+ "grad_norm": 0.466796875,
286
  "learning_rate": 0.00016707317073170731,
287
+ "loss": 0.10119230300188065,
288
+ "mean_token_accuracy": 0.9631960429251194,
289
+ "num_tokens": 159097.0,
290
  "step": 28
291
  },
292
  {
293
+ "entropy": 0.2634996743872762,
294
  "epoch": 0.3552833078101072,
295
+ "grad_norm": 0.376953125,
296
  "learning_rate": 0.00016585365853658536,
297
+ "loss": 0.07137235254049301,
298
+ "mean_token_accuracy": 0.9721279740333557,
299
+ "num_tokens": 164465.0,
300
  "step": 29
301
  },
302
  {
303
+ "entropy": 0.2398172626271844,
304
  "epoch": 0.3675344563552833,
305
+ "grad_norm": 0.380859375,
306
  "learning_rate": 0.00016463414634146343,
307
+ "loss": 0.08367905020713806,
308
+ "mean_token_accuracy": 0.9688702113926411,
309
+ "num_tokens": 171131.0,
310
  "step": 30
311
  },
312
  {
313
+ "entropy": 0.2387447776272893,
314
  "epoch": 0.37978560490045943,
315
+ "grad_norm": 0.39453125,
316
  "learning_rate": 0.00016341463414634147,
317
+ "loss": 0.07410822808742523,
318
+ "mean_token_accuracy": 0.9765294268727303,
319
+ "num_tokens": 175655.0,
320
  "step": 31
321
  },
322
  {
323
+ "entropy": 0.24556818418204784,
324
  "epoch": 0.39203675344563554,
325
+ "grad_norm": 0.361328125,
326
  "learning_rate": 0.00016219512195121954,
327
+ "loss": 0.07339000701904297,
328
+ "mean_token_accuracy": 0.9750400222837925,
329
+ "num_tokens": 182309.0,
330
  "step": 32
331
  },
332
  {
333
+ "entropy": 0.23958251252770424,
334
  "epoch": 0.40428790199081166,
335
+ "grad_norm": 0.376953125,
336
  "learning_rate": 0.00016097560975609758,
337
+ "loss": 0.0825161263346672,
338
+ "mean_token_accuracy": 0.9695910774171352,
339
+ "num_tokens": 188122.0,
340
  "step": 33
341
  },
342
  {
343
+ "entropy": 0.25066179782152176,
344
  "epoch": 0.4165390505359878,
345
+ "grad_norm": 0.34765625,
346
  "learning_rate": 0.00015975609756097562,
347
+ "loss": 0.0681036114692688,
348
+ "mean_token_accuracy": 0.9773549512028694,
349
+ "num_tokens": 193308.0,
350
  "step": 34
351
  },
352
  {
353
+ "entropy": 0.2489402163773775,
354
  "epoch": 0.42879019908116384,
355
+ "grad_norm": 0.33984375,
356
  "learning_rate": 0.00015853658536585366,
357
+ "loss": 0.07768924534320831,
358
+ "mean_token_accuracy": 0.9787707962095737,
359
+ "num_tokens": 198904.0,
360
  "step": 35
361
  },
362
  {
363
+ "entropy": 0.25176819786429405,
364
  "epoch": 0.44104134762633995,
365
+ "grad_norm": 0.353515625,
366
  "learning_rate": 0.00015731707317073173,
367
+ "loss": 0.07323021441698074,
368
+ "mean_token_accuracy": 0.9740425609052181,
369
+ "num_tokens": 204184.0,
370
  "step": 36
371
  },
372
  {
373
+ "entropy": 0.23491865396499634,
374
  "epoch": 0.45329249617151607,
375
+ "grad_norm": 0.345703125,
376
  "learning_rate": 0.00015609756097560978,
377
+ "loss": 0.06643179059028625,
378
+ "mean_token_accuracy": 0.9767155349254608,
379
+ "num_tokens": 210362.0,
380
  "step": 37
381
  },
382
  {
383
+ "entropy": 0.25266142282634974,
384
  "epoch": 0.4655436447166922,
385
+ "grad_norm": 0.50390625,
386
  "learning_rate": 0.00015487804878048782,
387
+ "loss": 0.08636192977428436,
388
+ "mean_token_accuracy": 0.9685244522988796,
389
+ "num_tokens": 215483.0,
390
  "step": 38
391
  },
392
  {
393
+ "entropy": 0.24919006042182446,
394
  "epoch": 0.4777947932618683,
395
+ "grad_norm": 0.357421875,
396
  "learning_rate": 0.00015365853658536586,
397
+ "loss": 0.06912290304899216,
398
+ "mean_token_accuracy": 0.9728152006864548,
399
+ "num_tokens": 220437.0,
400
  "step": 39
401
  },
402
  {
403
+ "entropy": 0.2789237005636096,
404
  "epoch": 0.4900459418070444,
405
+ "grad_norm": 0.3671875,
406
  "learning_rate": 0.0001524390243902439,
407
+ "loss": 0.07096827030181885,
408
+ "mean_token_accuracy": 0.9718564338982105,
409
+ "num_tokens": 225444.0,
410
  "step": 40
411
  },
412
  {
413
+ "entropy": 0.23915204405784607,
414
  "epoch": 0.5022970903522205,
415
+ "grad_norm": 0.35546875,
416
  "learning_rate": 0.00015121951219512197,
417
+ "loss": 0.06407603621482849,
418
+ "mean_token_accuracy": 0.975932989269495,
419
+ "num_tokens": 230088.0,
420
  "step": 41
421
  },
422
  {
423
+ "entropy": 0.25953691080212593,
424
  "epoch": 0.5145482388973966,
425
+ "grad_norm": 0.365234375,
426
  "learning_rate": 0.00015000000000000001,
427
+ "loss": 0.07893452048301697,
428
+ "mean_token_accuracy": 0.9717175625264645,
429
+ "num_tokens": 234974.0,
430
  "step": 42
431
  },
432
  {
433
+ "entropy": 0.25131134409457445,
434
  "epoch": 0.5267993874425727,
435
+ "grad_norm": 0.33984375,
436
  "learning_rate": 0.00014878048780487806,
437
+ "loss": 0.0724797397851944,
438
+ "mean_token_accuracy": 0.9746548496186733,
439
+ "num_tokens": 240695.0,
440
  "step": 43
441
  },
442
  {
443
+ "entropy": 0.25067666731774807,
444
  "epoch": 0.5390505359877489,
445
+ "grad_norm": 0.44921875,
446
  "learning_rate": 0.0001475609756097561,
447
+ "loss": 0.06145863234996796,
448
+ "mean_token_accuracy": 0.9786989763379097,
449
+ "num_tokens": 246515.0,
450
  "step": 44
451
  },
452
  {
453
+ "entropy": 0.22192941885441542,
454
  "epoch": 0.5513016845329249,
455
+ "grad_norm": 0.4375,
456
  "learning_rate": 0.00014634146341463414,
457
+ "loss": 0.06996186822652817,
458
+ "mean_token_accuracy": 0.9778482280671597,
459
+ "num_tokens": 252150.0,
460
  "step": 45
461
  },
462
  {
463
+ "entropy": 0.24868111684918404,
464
  "epoch": 0.5635528330781011,
465
+ "grad_norm": 0.392578125,
466
  "learning_rate": 0.0001451219512195122,
467
+ "loss": 0.07759839296340942,
468
+ "mean_token_accuracy": 0.9743853285908699,
469
+ "num_tokens": 257699.0,
470
  "step": 46
471
  },
472
  {
473
+ "entropy": 0.2405283828265965,
474
  "epoch": 0.5758039816232772,
475
+ "grad_norm": 0.400390625,
476
  "learning_rate": 0.00014390243902439025,
477
+ "loss": 0.06918229907751083,
478
+ "mean_token_accuracy": 0.9726257510483265,
479
+ "num_tokens": 262974.0,
480
  "step": 47
481
  },
482
  {
483
+ "entropy": 0.2463641557842493,
484
  "epoch": 0.5880551301684533,
485
+ "grad_norm": 0.5078125,
486
  "learning_rate": 0.0001426829268292683,
487
+ "loss": 0.08698121458292007,
488
+ "mean_token_accuracy": 0.9751730673015118,
489
+ "num_tokens": 267714.0,
490
  "step": 48
491
  },
492
  {
493
+ "entropy": 0.2611560570076108,
494
  "epoch": 0.6003062787136294,
495
+ "grad_norm": 0.3203125,
496
  "learning_rate": 0.00014146341463414634,
497
+ "loss": 0.0795765370130539,
498
+ "mean_token_accuracy": 0.9706047028303146,
499
+ "num_tokens": 273102.0,
500
  "step": 49
501
  },
502
  {
503
+ "entropy": 0.24631980434060097,
504
  "epoch": 0.6125574272588055,
505
+ "grad_norm": 0.365234375,
506
  "learning_rate": 0.00014024390243902438,
507
+ "loss": 0.06434721499681473,
508
+ "mean_token_accuracy": 0.9787219613790512,
509
+ "num_tokens": 278414.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
+ "eval_entropy": 0.25439983627934387,
515
+ "eval_loss": 0.07568059861660004,
516
+ "eval_mean_token_accuracy": 0.9709554686062578,
517
+ "eval_num_tokens": 278414.0,
518
+ "eval_runtime": 56.679,
519
+ "eval_samples_per_second": 1.217,
520
+ "eval_steps_per_second": 1.217,
521
  "step": 50
522
  },
523
  {
524
+ "entropy": 0.22273720148950815,
525
  "epoch": 0.6248085758039816,
526
+ "grad_norm": 0.330078125,
527
  "learning_rate": 0.00013902439024390245,
528
+ "loss": 0.06272563338279724,
529
+ "mean_token_accuracy": 0.9790237173438072,
530
+ "num_tokens": 284001.0,
531
  "step": 51
532
  },
533
  {
534
+ "entropy": 0.25650967564433813,
535
  "epoch": 0.6370597243491577,
536
+ "grad_norm": 0.3515625,
537
  "learning_rate": 0.0001378048780487805,
538
+ "loss": 0.0695340633392334,
539
+ "mean_token_accuracy": 0.9723741784691811,
540
+ "num_tokens": 289900.0,
541
  "step": 52
542
  },
543
  {
544
+ "entropy": 0.27689922973513603,
545
  "epoch": 0.6493108728943339,
546
+ "grad_norm": 0.443359375,
547
  "learning_rate": 0.00013658536585365856,
548
+ "loss": 0.08247513324022293,
549
+ "mean_token_accuracy": 0.9751085750758648,
550
+ "num_tokens": 295774.0,
551
  "step": 53
552
  },
553
  {
554
+ "entropy": 0.24619914591312408,
555
  "epoch": 0.6615620214395099,
556
+ "grad_norm": 0.349609375,
557
  "learning_rate": 0.0001353658536585366,
558
+ "loss": 0.06673211604356766,
559
+ "mean_token_accuracy": 0.9788386225700378,
560
+ "num_tokens": 300970.0,
561
  "step": 54
562
  },
563
  {
564
+ "entropy": 0.27198443934321404,
565
  "epoch": 0.6738131699846861,
566
+ "grad_norm": 0.4921875,
567
  "learning_rate": 0.00013414634146341464,
568
+ "loss": 0.07676997035741806,
569
+ "mean_token_accuracy": 0.9696366749703884,
570
+ "num_tokens": 306709.0,
571
  "step": 55
572
  },
573
  {
574
+ "entropy": 0.2689105300232768,
575
  "epoch": 0.6860643185298622,
576
+ "grad_norm": 0.47265625,
577
  "learning_rate": 0.0001329268292682927,
578
+ "loss": 0.06719915568828583,
579
+ "mean_token_accuracy": 0.9702229462563992,
580
+ "num_tokens": 311650.0,
581
  "step": 56
582
  },
583
  {
584
+ "entropy": 0.2787257097661495,
585
  "epoch": 0.6983154670750383,
586
+ "grad_norm": 0.369140625,
587
  "learning_rate": 0.00013170731707317076,
588
+ "loss": 0.07159961760044098,
589
+ "mean_token_accuracy": 0.9748533591628075,
590
+ "num_tokens": 317257.0,
591
  "step": 57
592
  },
593
  {
594
+ "entropy": 0.2522663725540042,
595
  "epoch": 0.7105666156202144,
596
+ "grad_norm": 0.31640625,
597
  "learning_rate": 0.0001304878048780488,
598
+ "loss": 0.08856096863746643,
599
+ "mean_token_accuracy": 0.9697616137564182,
600
+ "num_tokens": 323281.0,
601
  "step": 58
602
  },
603
  {
604
+ "entropy": 0.24693416617810726,
605
  "epoch": 0.7228177641653905,
606
+ "grad_norm": 0.37109375,
607
  "learning_rate": 0.00012926829268292684,
608
+ "loss": 0.07423190027475357,
609
+ "mean_token_accuracy": 0.9705353751778603,
610
+ "num_tokens": 328551.0,
611
  "step": 59
612
  },
613
  {
614
+ "entropy": 0.2651137877255678,
615
  "epoch": 0.7350689127105666,
616
+ "grad_norm": 0.43359375,
617
  "learning_rate": 0.00012804878048780488,
618
+ "loss": 0.0738719031214714,
619
+ "mean_token_accuracy": 0.9752235859632492,
620
+ "num_tokens": 334143.0,
621
  "step": 60
622
  },
623
  {
624
+ "entropy": 0.2281778110191226,
625
  "epoch": 0.7473200612557427,
626
+ "grad_norm": 0.2490234375,
627
  "learning_rate": 0.00012682926829268293,
628
+ "loss": 0.0633026584982872,
629
+ "mean_token_accuracy": 0.9741999059915543,
630
+ "num_tokens": 341092.0,
631
  "step": 61
632
  },
633
  {
634
+ "entropy": 0.2535929596051574,
635
  "epoch": 0.7595712098009189,
636
+ "grad_norm": 0.390625,
637
  "learning_rate": 0.000125609756097561,
638
+ "loss": 0.0719546377658844,
639
+ "mean_token_accuracy": 0.9765410870313644,
640
+ "num_tokens": 347467.0,
641
  "step": 62
642
  },
643
  {
644
+ "entropy": 0.25424638390541077,
645
  "epoch": 0.7718223583460949,
646
+ "grad_norm": 0.431640625,
647
  "learning_rate": 0.00012439024390243904,
648
+ "loss": 0.05535401031374931,
649
+ "mean_token_accuracy": 0.9780425503849983,
650
+ "num_tokens": 352164.0,
651
  "step": 63
652
  },
653
  {
654
+ "entropy": 0.23888325225561857,
655
  "epoch": 0.7840735068912711,
656
+ "grad_norm": 0.435546875,
657
  "learning_rate": 0.00012317073170731708,
658
+ "loss": 0.07177040725946426,
659
+ "mean_token_accuracy": 0.9734687805175781,
660
+ "num_tokens": 357308.0,
661
  "step": 64
662
  },
663
  {
664
+ "entropy": 0.27028472628444433,
665
  "epoch": 0.7963246554364471,
666
+ "grad_norm": 0.30859375,
667
  "learning_rate": 0.00012195121951219512,
668
+ "loss": 0.06257087737321854,
669
+ "mean_token_accuracy": 0.9774579927325249,
670
+ "num_tokens": 362666.0,
671
  "step": 65
672
  },
673
  {
674
+ "entropy": 0.2821849435567856,
675
  "epoch": 0.8085758039816233,
676
+ "grad_norm": 0.373046875,
677
  "learning_rate": 0.00012073170731707318,
678
+ "loss": 0.06471723318099976,
679
+ "mean_token_accuracy": 0.976191334426403,
680
+ "num_tokens": 368427.0,
681
  "step": 66
682
  },
683
  {
684
+ "entropy": 0.22186184907332063,
685
  "epoch": 0.8208269525267994,
686
+ "grad_norm": 0.263671875,
687
  "learning_rate": 0.00011951219512195122,
688
+ "loss": 0.06329935044050217,
689
+ "mean_token_accuracy": 0.978707954287529,
690
+ "num_tokens": 374540.0,
691
  "step": 67
692
  },
693
  {
694
+ "entropy": 0.23882555402815342,
695
  "epoch": 0.8330781010719756,
696
+ "grad_norm": 0.3046875,
697
  "learning_rate": 0.00011829268292682926,
698
+ "loss": 0.07082124054431915,
699
+ "mean_token_accuracy": 0.979393869638443,
700
+ "num_tokens": 379925.0,
701
  "step": 68
702
  },
703
  {
704
+ "entropy": 0.2527451729401946,
705
  "epoch": 0.8453292496171516,
706
+ "grad_norm": 0.37109375,
707
  "learning_rate": 0.00011707317073170732,
708
+ "loss": 0.0804731696844101,
709
+ "mean_token_accuracy": 0.9763551540672779,
710
+ "num_tokens": 384279.0,
711
  "step": 69
712
  },
713
  {
714
+ "entropy": 0.26056139171123505,
715
  "epoch": 0.8575803981623277,
716
+ "grad_norm": 0.40234375,
717
  "learning_rate": 0.00011585365853658536,
718
+ "loss": 0.09266315400600433,
719
+ "mean_token_accuracy": 0.9709281474351883,
720
+ "num_tokens": 389563.0,
721
  "step": 70
722
  },
723
  {
724
+ "entropy": 0.2919591320678592,
725
  "epoch": 0.8698315467075038,
726
+ "grad_norm": 0.423828125,
727
  "learning_rate": 0.00011463414634146342,
728
+ "loss": 0.07172521948814392,
729
+ "mean_token_accuracy": 0.9725044220685959,
730
+ "num_tokens": 394650.0,
731
  "step": 71
732
  },
733
  {
734
+ "entropy": 0.2520558973774314,
735
  "epoch": 0.8820826952526799,
736
+ "grad_norm": 0.47265625,
737
  "learning_rate": 0.00011341463414634146,
738
+ "loss": 0.07857581228017807,
739
+ "mean_token_accuracy": 0.967189610004425,
740
+ "num_tokens": 399583.0,
741
  "step": 72
742
  },
743
  {
744
+ "entropy": 0.2681189738214016,
745
  "epoch": 0.8943338437978561,
746
+ "grad_norm": 0.470703125,
747
  "learning_rate": 0.00011219512195121953,
748
+ "loss": 0.0883592814207077,
749
+ "mean_token_accuracy": 0.9760300181806087,
750
+ "num_tokens": 406224.0,
751
  "step": 73
752
  },
753
  {
754
+ "entropy": 0.25226688850671053,
755
  "epoch": 0.9065849923430321,
756
+ "grad_norm": 0.349609375,
757
  "learning_rate": 0.00011097560975609757,
758
+ "loss": 0.06107043847441673,
759
+ "mean_token_accuracy": 0.9742026180028915,
760
+ "num_tokens": 412481.0,
761
  "step": 74
762
  },
763
  {
764
+ "entropy": 0.25610699970275164,
765
  "epoch": 0.9188361408882083,
766
+ "grad_norm": 0.4140625,
767
  "learning_rate": 0.00010975609756097563,
768
+ "loss": 0.06678957492113113,
769
+ "mean_token_accuracy": 0.9725399203598499,
770
+ "num_tokens": 417862.0,
771
  "step": 75
772
  },
773
  {
774
+ "entropy": 0.2826196616515517,
775
  "epoch": 0.9310872894333844,
776
+ "grad_norm": 0.859375,
777
  "learning_rate": 0.00010853658536585367,
778
+ "loss": 0.048859648406505585,
779
+ "mean_token_accuracy": 0.9790267050266266,
780
+ "num_tokens": 422878.0,
781
  "step": 76
782
  },
783
  {
784
+ "entropy": 0.23871563002467155,
785
  "epoch": 0.9433384379785605,
786
+ "grad_norm": 0.466796875,
787
  "learning_rate": 0.00010731707317073172,
788
+ "loss": 0.07596343755722046,
789
+ "mean_token_accuracy": 0.971769668161869,
790
+ "num_tokens": 429170.0,
791
  "step": 77
792
  },
793
  {
794
+ "entropy": 0.2777755409479141,
795
  "epoch": 0.9555895865237366,
796
+ "grad_norm": 0.443359375,
797
  "learning_rate": 0.00010609756097560977,
798
+ "loss": 0.06630191206932068,
799
+ "mean_token_accuracy": 0.9747902825474739,
800
+ "num_tokens": 434323.0,
801
  "step": 78
802
  },
803
  {
804
+ "entropy": 0.23950364720076323,
805
  "epoch": 0.9678407350689127,
806
+ "grad_norm": 0.349609375,
807
  "learning_rate": 0.00010487804878048781,
808
+ "loss": 0.057458702474832535,
809
+ "mean_token_accuracy": 0.980991818010807,
810
+ "num_tokens": 439539.0,
811
  "step": 79
812
  },
813
  {
814
+ "entropy": 0.245719694532454,
815
  "epoch": 0.9800918836140888,
816
+ "grad_norm": 0.3046875,
817
  "learning_rate": 0.00010365853658536586,
818
+ "loss": 0.06474918127059937,
819
+ "mean_token_accuracy": 0.9749566093087196,
820
+ "num_tokens": 445548.0,
821
  "step": 80
822
  },
823
  {
824
+ "entropy": 0.2553516002371907,
825
  "epoch": 0.9923430321592649,
826
+ "grad_norm": 0.59375,
827
  "learning_rate": 0.0001024390243902439,
828
+ "loss": 0.07626976072788239,
829
+ "mean_token_accuracy": 0.9740116000175476,
830
+ "num_tokens": 451007.0,
831
  "step": 81
832
  },
833
  {
834
+ "entropy": 0.24858922958374025,
835
  "epoch": 1.0,
836
+ "grad_norm": 0.4140625,
837
  "learning_rate": 0.00010121951219512196,
838
+ "loss": 0.05956536903977394,
839
+ "mean_token_accuracy": 0.9751910209655762,
840
+ "num_tokens": 454678.0,
841
  "step": 82
842
  },
843
  {
844
+ "entropy": 0.22480082791298628,
845
  "epoch": 1.0122511485451762,
846
+ "grad_norm": 0.302734375,
847
  "learning_rate": 0.0001,
848
+ "loss": 0.03318095952272415,
849
+ "mean_token_accuracy": 0.9908282831311226,
850
+ "num_tokens": 460195.0,
851
  "step": 83
852
  },
853
  {
854
+ "entropy": 0.21941375825554132,
855
  "epoch": 1.0245022970903521,
856
+ "grad_norm": 0.322265625,
857
  "learning_rate": 9.878048780487805e-05,
858
+ "loss": 0.037562280893325806,
859
+ "mean_token_accuracy": 0.9899826981127262,
860
+ "num_tokens": 465814.0,
861
  "step": 84
862
  },
863
  {
864
+ "entropy": 0.2297668270766735,
865
  "epoch": 1.0367534456355283,
866
+ "grad_norm": 0.259765625,
867
  "learning_rate": 9.75609756097561e-05,
868
+ "loss": 0.03667337819933891,
869
+ "mean_token_accuracy": 0.9867987670004368,
870
+ "num_tokens": 472919.0,
871
  "step": 85
872
  },
873
  {
874
+ "entropy": 0.1959990761242807,
875
  "epoch": 1.0490045941807045,
876
+ "grad_norm": 0.171875,
877
  "learning_rate": 9.634146341463415e-05,
878
+ "loss": 0.02224677987396717,
879
+ "mean_token_accuracy": 0.9947787970304489,
880
+ "num_tokens": 477926.0,
881
  "step": 86
882
  },
883
  {
884
+ "entropy": 0.22538460325449705,
885
  "epoch": 1.0612557427258806,
886
+ "grad_norm": 0.294921875,
887
  "learning_rate": 9.51219512195122e-05,
888
+ "loss": 0.05467130243778229,
889
+ "mean_token_accuracy": 0.9857094436883926,
890
+ "num_tokens": 483369.0,
891
  "step": 87
892
  },
893
  {
894
+ "entropy": 0.2385974396020174,
895
  "epoch": 1.0735068912710566,
896
+ "grad_norm": 0.2392578125,
897
  "learning_rate": 9.390243902439024e-05,
898
+ "loss": 0.02876465395092964,
899
+ "mean_token_accuracy": 0.9933567047119141,
900
+ "num_tokens": 488048.0,
901
  "step": 88
902
  },
903
  {
904
+ "entropy": 0.2244573337957263,
905
  "epoch": 1.0857580398162328,
906
+ "grad_norm": 0.17578125,
907
  "learning_rate": 9.26829268292683e-05,
908
+ "loss": 0.022544220089912415,
909
+ "mean_token_accuracy": 0.9952267222106457,
910
+ "num_tokens": 492951.0,
911
  "step": 89
912
  },
913
  {
914
+ "entropy": 0.21164159616455436,
915
  "epoch": 1.098009188361409,
916
+ "grad_norm": 0.3671875,
917
  "learning_rate": 9.146341463414635e-05,
918
+ "loss": 0.0307400431483984,
919
+ "mean_token_accuracy": 0.9898485280573368,
920
+ "num_tokens": 498298.0,
921
  "step": 90
922
  },
923
  {
924
+ "entropy": 0.22300960402935743,
925
  "epoch": 1.110260336906585,
926
+ "grad_norm": 0.25390625,
927
  "learning_rate": 9.02439024390244e-05,
928
+ "loss": 0.02349678799510002,
929
+ "mean_token_accuracy": 0.9937595501542091,
930
+ "num_tokens": 503013.0,
931
  "step": 91
932
  },
933
  {
934
+ "entropy": 0.2144601820036769,
935
  "epoch": 1.122511485451761,
936
+ "grad_norm": 0.466796875,
937
  "learning_rate": 8.902439024390244e-05,
938
+ "loss": 0.025124385952949524,
939
+ "mean_token_accuracy": 0.9929902292788029,
940
+ "num_tokens": 507687.0,
941
  "step": 92
942
  },
943
  {
944
+ "entropy": 0.18067707447335124,
945
  "epoch": 1.1347626339969372,
946
+ "grad_norm": 0.462890625,
947
  "learning_rate": 8.78048780487805e-05,
948
+ "loss": 0.04210633784532547,
949
+ "mean_token_accuracy": 0.9874051883816719,
950
+ "num_tokens": 513217.0,
951
  "step": 93
952
  },
953
  {
954
+ "entropy": 0.18840790819376707,
955
  "epoch": 1.1470137825421134,
956
+ "grad_norm": 0.2578125,
957
  "learning_rate": 8.658536585365854e-05,
958
+ "loss": 0.023590605705976486,
959
+ "mean_token_accuracy": 0.9930241219699383,
960
+ "num_tokens": 518384.0,
961
  "step": 94
962
  },
963
  {
964
+ "entropy": 0.16844777530059218,
965
  "epoch": 1.1592649310872893,
966
+ "grad_norm": 0.3046875,
967
  "learning_rate": 8.53658536585366e-05,
968
+ "loss": 0.02408467046916485,
969
+ "mean_token_accuracy": 0.9940578565001488,
970
+ "num_tokens": 523975.0,
971
  "step": 95
972
  },
973
  {
974
+ "entropy": 0.1988551402464509,
975
  "epoch": 1.1715160796324655,
976
+ "grad_norm": 0.25390625,
977
  "learning_rate": 8.414634146341464e-05,
978
+ "loss": 0.01896364614367485,
979
+ "mean_token_accuracy": 0.9935651384294033,
980
+ "num_tokens": 528838.0,
981
  "step": 96
982
  },
983
  {
984
+ "entropy": 0.19662938080728054,
985
  "epoch": 1.1837672281776417,
986
+ "grad_norm": 0.271484375,
987
  "learning_rate": 8.292682926829268e-05,
988
+ "loss": 0.023568641394376755,
989
+ "mean_token_accuracy": 0.9942812882363796,
990
+ "num_tokens": 533723.0,
991
  "step": 97
992
  },
993
  {
994
+ "entropy": 0.18521032202988863,
995
  "epoch": 1.1960183767228179,
996
+ "grad_norm": 0.2158203125,
997
  "learning_rate": 8.170731707317073e-05,
998
+ "loss": 0.03203809633851051,
999
+ "mean_token_accuracy": 0.9899982325732708,
1000
+ "num_tokens": 540180.0,
1001
  "step": 98
1002
  },
1003
  {
1004
+ "entropy": 0.18826917372643948,
1005
  "epoch": 1.2082695252679938,
1006
+ "grad_norm": 0.4765625,
1007
  "learning_rate": 8.048780487804879e-05,
1008
+ "loss": 0.03463224321603775,
1009
+ "mean_token_accuracy": 0.9889252111315727,
1010
+ "num_tokens": 546618.0,
1011
  "step": 99
1012
  },
1013
  {
1014
+ "entropy": 0.1889605624601245,
1015
  "epoch": 1.22052067381317,
1016
+ "grad_norm": 0.337890625,
1017
  "learning_rate": 7.926829268292683e-05,
1018
+ "loss": 0.038746241480112076,
1019
+ "mean_token_accuracy": 0.9897148124873638,
1020
+ "num_tokens": 552084.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
+ "eval_entropy": 0.19684839270253113,
1026
+ "eval_loss": 0.08200085908174515,
1027
+ "eval_mean_token_accuracy": 0.9706140955289205,
1028
+ "eval_num_tokens": 552084.0,
1029
+ "eval_runtime": 56.6368,
1030
+ "eval_samples_per_second": 1.218,
1031
+ "eval_steps_per_second": 1.218,
1032
  "step": 100
1033
  },
1034
  {
1035
+ "entropy": 0.18781481962651014,
1036
  "epoch": 1.2327718223583461,
1037
+ "grad_norm": 0.2490234375,
1038
  "learning_rate": 7.804878048780489e-05,
1039
+ "loss": 0.03647669032216072,
1040
+ "mean_token_accuracy": 0.9900195822119713,
1041
+ "num_tokens": 558384.0,
1042
  "step": 101
1043
  },
1044
  {
1045
+ "entropy": 0.182833943516016,
1046
  "epoch": 1.245022970903522,
1047
+ "grad_norm": 0.1708984375,
1048
  "learning_rate": 7.682926829268293e-05,
1049
+ "loss": 0.01754325069487095,
1050
+ "mean_token_accuracy": 0.9952104948461056,
1051
+ "num_tokens": 564025.0,
1052
  "step": 102
1053
  },
1054
  {
1055
+ "entropy": 0.19512099027633667,
1056
  "epoch": 1.2572741194486983,
1057
+ "grad_norm": 0.32421875,
1058
  "learning_rate": 7.560975609756099e-05,
1059
+ "loss": 0.045042332261800766,
1060
+ "mean_token_accuracy": 0.987647294998169,
1061
+ "num_tokens": 569791.0,
1062
  "step": 103
1063
  },
1064
  {
1065
+ "entropy": 0.19775146059691906,
1066
  "epoch": 1.2695252679938744,
1067
+ "grad_norm": 0.287109375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
+ "loss": 0.03481469675898552,
1070
+ "mean_token_accuracy": 0.9876400642096996,
1071
+ "num_tokens": 575432.0,
1072
  "step": 104
1073
  },
1074
  {
1075
+ "entropy": 0.19757689163088799,
1076
  "epoch": 1.2817764165390506,
1077
+ "grad_norm": 0.392578125,
1078
  "learning_rate": 7.317073170731707e-05,
1079
+ "loss": 0.045782968401908875,
1080
+ "mean_token_accuracy": 0.987156193703413,
1081
+ "num_tokens": 580586.0,
1082
  "step": 105
1083
  },
1084
  {
1085
+ "entropy": 0.19568088464438915,
1086
  "epoch": 1.2940275650842268,
1087
+ "grad_norm": 0.271484375,
1088
  "learning_rate": 7.195121951219513e-05,
1089
+ "loss": 0.03614577651023865,
1090
+ "mean_token_accuracy": 0.989520326256752,
1091
+ "num_tokens": 586255.0,
1092
  "step": 106
1093
  },
1094
  {
1095
+ "entropy": 0.18891402333974838,
1096
  "epoch": 1.3062787136294027,
1097
+ "grad_norm": 0.169921875,
1098
  "learning_rate": 7.073170731707317e-05,
1099
+ "loss": 0.018318383023142815,
1100
+ "mean_token_accuracy": 0.9943608231842518,
1101
+ "num_tokens": 591734.0,
1102
  "step": 107
1103
  },
1104
  {
1105
+ "entropy": 0.2118115657940507,
1106
  "epoch": 1.318529862174579,
1107
+ "grad_norm": 0.34375,
1108
  "learning_rate": 6.951219512195122e-05,
1109
+ "loss": 0.02556736022233963,
1110
+ "mean_token_accuracy": 0.9910119064152241,
1111
+ "num_tokens": 596805.0,
1112
  "step": 108
1113
  },
1114
  {
1115
+ "entropy": 0.20146753964945674,
1116
  "epoch": 1.3307810107197549,
1117
+ "grad_norm": 0.251953125,
1118
  "learning_rate": 6.829268292682928e-05,
1119
+ "loss": 0.026423780247569084,
1120
+ "mean_token_accuracy": 0.9911187067627907,
1121
+ "num_tokens": 602469.0,
1122
  "step": 109
1123
  },
1124
  {
1125
+ "entropy": 0.19927682168781757,
1126
  "epoch": 1.343032159264931,
1127
+ "grad_norm": 0.2314453125,
1128
  "learning_rate": 6.707317073170732e-05,
1129
+ "loss": 0.038182880729436874,
1130
+ "mean_token_accuracy": 0.9882474392652512,
1131
+ "num_tokens": 608854.0,
1132
  "step": 110
1133
  },
1134
  {
1135
+ "entropy": 0.18457680894061923,
1136
  "epoch": 1.3552833078101072,
1137
+ "grad_norm": 0.24609375,
1138
  "learning_rate": 6.585365853658538e-05,
1139
+ "loss": 0.025912806391716003,
1140
+ "mean_token_accuracy": 0.9923904649913311,
1141
+ "num_tokens": 614272.0,
1142
  "step": 111
1143
  },
1144
  {
1145
+ "entropy": 0.1993693085387349,
1146
  "epoch": 1.3675344563552834,
1147
+ "grad_norm": 0.291015625,
1148
  "learning_rate": 6.463414634146342e-05,
1149
+ "loss": 0.021378764882683754,
1150
+ "mean_token_accuracy": 0.9953300580382347,
1151
+ "num_tokens": 619446.0,
1152
  "step": 112
1153
  },
1154
  {
1155
+ "entropy": 0.19518085662275553,
1156
  "epoch": 1.3797856049004595,
1157
+ "grad_norm": 0.30078125,
1158
  "learning_rate": 6.341463414634146e-05,
1159
+ "loss": 0.03335938975214958,
1160
+ "mean_token_accuracy": 0.9875492453575134,
1161
+ "num_tokens": 625774.0,
1162
  "step": 113
1163
  },
1164
  {
1165
+ "entropy": 0.20890573505312204,
1166
  "epoch": 1.3920367534456355,
1167
+ "grad_norm": 0.373046875,
1168
  "learning_rate": 6.219512195121952e-05,
1169
+ "loss": 0.036217525601387024,
1170
+ "mean_token_accuracy": 0.9891358688473701,
1171
+ "num_tokens": 630747.0,
1172
  "step": 114
1173
  },
1174
  {
1175
+ "entropy": 0.19118426740169525,
1176
  "epoch": 1.4042879019908117,
1177
+ "grad_norm": 0.251953125,
1178
  "learning_rate": 6.097560975609756e-05,
1179
+ "loss": 0.030090918764472008,
1180
+ "mean_token_accuracy": 0.9934539385139942,
1181
+ "num_tokens": 637405.0,
1182
  "step": 115
1183
  },
1184
  {
1185
+ "entropy": 0.2176859974861145,
1186
  "epoch": 1.4165390505359878,
1187
+ "grad_norm": 0.2373046875,
1188
  "learning_rate": 5.975609756097561e-05,
1189
+ "loss": 0.024563392624258995,
1190
+ "mean_token_accuracy": 0.9921185150742531,
1191
+ "num_tokens": 642328.0,
1192
  "step": 116
1193
  },
1194
  {
1195
+ "entropy": 0.1849509342573583,
1196
  "epoch": 1.4287901990811638,
1197
+ "grad_norm": 0.35546875,
1198
  "learning_rate": 5.853658536585366e-05,
1199
+ "loss": 0.042349379509687424,
1200
+ "mean_token_accuracy": 0.9899747557938099,
1201
+ "num_tokens": 647857.0,
1202
  "step": 117
1203
  },
1204
  {
1205
+ "entropy": 0.19377889391034842,
1206
  "epoch": 1.44104134762634,
1207
+ "grad_norm": 0.279296875,
1208
  "learning_rate": 5.731707317073171e-05,
1209
+ "loss": 0.02413174696266651,
1210
+ "mean_token_accuracy": 0.9931157529354095,
1211
+ "num_tokens": 653805.0,
1212
  "step": 118
1213
  },
1214
  {
1215
+ "entropy": 0.20709845190867782,
1216
  "epoch": 1.4532924961715161,
1217
+ "grad_norm": 0.28125,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
+ "loss": 0.03505600988864899,
1220
+ "mean_token_accuracy": 0.9896740056574345,
1221
+ "num_tokens": 659708.0,
1222
  "step": 119
1223
  },
1224
  {
1225
+ "entropy": 0.20671271299943328,
1226
  "epoch": 1.4655436447166923,
1227
+ "grad_norm": 0.2734375,
1228
  "learning_rate": 5.487804878048781e-05,
1229
+ "loss": 0.02634236589074135,
1230
+ "mean_token_accuracy": 0.9935285076498985,
1231
+ "num_tokens": 665292.0,
1232
  "step": 120
1233
  },
1234
  {
1235
+ "entropy": 0.18826642259955406,
1236
  "epoch": 1.4777947932618682,
1237
+ "grad_norm": 0.2177734375,
1238
  "learning_rate": 5.365853658536586e-05,
1239
+ "loss": 0.022179996594786644,
1240
+ "mean_token_accuracy": 0.9928314089775085,
1241
+ "num_tokens": 670669.0,
1242
  "step": 121
1243
  },
1244
  {
1245
+ "entropy": 0.2311026845127344,
1246
  "epoch": 1.4900459418070444,
1247
+ "grad_norm": 0.267578125,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
+ "loss": 0.025521911680698395,
1250
+ "mean_token_accuracy": 0.9930035471916199,
1251
+ "num_tokens": 675524.0,
1252
  "step": 122
1253
  },
1254
  {
1255
+ "entropy": 0.1890636207535863,
1256
  "epoch": 1.5022970903522204,
1257
+ "grad_norm": 0.22265625,
1258
  "learning_rate": 5.121951219512195e-05,
1259
+ "loss": 0.02293182723224163,
1260
+ "mean_token_accuracy": 0.9917827062308788,
1261
+ "num_tokens": 681083.0,
1262
  "step": 123
1263
  },
1264
  {
1265
+ "entropy": 0.20301904529333115,
1266
  "epoch": 1.5145482388973965,
1267
+ "grad_norm": 0.251953125,
1268
  "learning_rate": 5e-05,
1269
+ "loss": 0.026392869651317596,
1270
+ "mean_token_accuracy": 0.9935696609318256,
1271
+ "num_tokens": 686909.0,
1272
  "step": 124
1273
  },
1274
  {
1275
+ "entropy": 0.18326633982360363,
1276
  "epoch": 1.5267993874425727,
1277
+ "grad_norm": 0.189453125,
1278
  "learning_rate": 4.878048780487805e-05,
1279
+ "loss": 0.03385050222277641,
1280
+ "mean_token_accuracy": 0.9923080727458,
1281
+ "num_tokens": 693716.0,
1282
  "step": 125
1283
  },
1284
  {
1285
+ "entropy": 0.1940352749079466,
1286
  "epoch": 1.5390505359877489,
1287
+ "grad_norm": 0.25,
1288
  "learning_rate": 4.75609756097561e-05,
1289
+ "loss": 0.03128973767161369,
1290
+ "mean_token_accuracy": 0.9904795847833157,
1291
+ "num_tokens": 699231.0,
1292
  "step": 126
1293
  },
1294
  {
1295
+ "entropy": 0.2052145255729556,
1296
  "epoch": 1.551301684532925,
1297
+ "grad_norm": 0.1962890625,
1298
  "learning_rate": 4.634146341463415e-05,
1299
+ "loss": 0.01906367763876915,
1300
+ "mean_token_accuracy": 0.9935221113264561,
1301
+ "num_tokens": 705026.0,
1302
  "step": 127
1303
  },
1304
  {
1305
+ "entropy": 0.22084870096296072,
1306
  "epoch": 1.5635528330781012,
1307
+ "grad_norm": 0.28125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
+ "loss": 0.026771627366542816,
1310
+ "mean_token_accuracy": 0.9931596331298351,
1311
+ "num_tokens": 710155.0,
1312
  "step": 128
1313
  },
1314
  {
1315
+ "entropy": 0.18041892955079675,
1316
  "epoch": 1.5758039816232772,
1317
+ "grad_norm": 0.369140625,
1318
  "learning_rate": 4.390243902439025e-05,
1319
+ "loss": 0.024752795696258545,
1320
+ "mean_token_accuracy": 0.9915198720991611,
1321
+ "num_tokens": 715496.0,
1322
  "step": 129
1323
  },
1324
  {
1325
+ "entropy": 0.1869538608007133,
1326
  "epoch": 1.5880551301684533,
1327
+ "grad_norm": 0.3046875,
1328
  "learning_rate": 4.26829268292683e-05,
1329
+ "loss": 0.03293408453464508,
1330
+ "mean_token_accuracy": 0.990137055516243,
1331
+ "num_tokens": 721491.0,
1332
  "step": 130
1333
  },
1334
  {
1335
+ "entropy": 0.20515098702162504,
1336
  "epoch": 1.6003062787136293,
1337
+ "grad_norm": 0.349609375,
1338
  "learning_rate": 4.146341463414634e-05,
1339
+ "loss": 0.023330464959144592,
1340
+ "mean_token_accuracy": 0.9892629720270634,
1341
+ "num_tokens": 726673.0,
1342
  "step": 131
1343
  },
1344
  {
1345
+ "entropy": 0.18135815067216754,
1346
  "epoch": 1.6125574272588055,
1347
+ "grad_norm": 0.357421875,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
+ "loss": 0.03119005262851715,
1350
+ "mean_token_accuracy": 0.9911304786801338,
1351
+ "num_tokens": 733054.0,
1352
  "step": 132
1353
  },
1354
  {
1355
+ "entropy": 0.20070009911432862,
1356
  "epoch": 1.6248085758039816,
1357
+ "grad_norm": 0.21484375,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
+ "loss": 0.030009731650352478,
1360
+ "mean_token_accuracy": 0.9932212419807911,
1361
+ "num_tokens": 737990.0,
1362
  "step": 133
1363
  },
1364
  {
1365
+ "entropy": 0.18819584511220455,
1366
  "epoch": 1.6370597243491578,
1367
+ "grad_norm": 0.2451171875,
1368
  "learning_rate": 3.780487804878049e-05,
1369
+ "loss": 0.02752860262989998,
1370
+ "mean_token_accuracy": 0.9897669702768326,
1371
+ "num_tokens": 743394.0,
1372
  "step": 134
1373
  },
1374
  {
1375
+ "entropy": 0.18869836069643497,
1376
  "epoch": 1.649310872894334,
1377
+ "grad_norm": 0.240234375,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
+ "loss": 0.03194504603743553,
1380
+ "mean_token_accuracy": 0.9914098270237446,
1381
+ "num_tokens": 749356.0,
1382
  "step": 135
1383
  },
1384
  {
1385
+ "entropy": 0.2093992899172008,
1386
  "epoch": 1.66156202143951,
1387
+ "grad_norm": 0.291015625,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
+ "loss": 0.02633955329656601,
1390
+ "mean_token_accuracy": 0.992473166435957,
1391
+ "num_tokens": 754312.0,
1392
  "step": 136
1393
  },
1394
  {
1395
+ "entropy": 0.1928223273716867,
1396
  "epoch": 1.673813169984686,
1397
+ "grad_norm": 0.2470703125,
1398
  "learning_rate": 3.414634146341464e-05,
1399
+ "loss": 0.035037778317928314,
1400
+ "mean_token_accuracy": 0.9916842468082905,
1401
+ "num_tokens": 760182.0,
1402
  "step": 137
1403
  },
1404
  {
1405
+ "entropy": 0.19663999788463116,
1406
  "epoch": 1.686064318529862,
1407
+ "grad_norm": 0.265625,
1408
  "learning_rate": 3.292682926829269e-05,
1409
+ "loss": 0.03151565045118332,
1410
+ "mean_token_accuracy": 0.9930234625935555,
1411
+ "num_tokens": 766267.0,
1412
  "step": 138
1413
  },
1414
  {
1415
+ "entropy": 0.2058473015204072,
1416
  "epoch": 1.6983154670750382,
1417
+ "grad_norm": 0.2578125,
1418
  "learning_rate": 3.170731707317073e-05,
1419
+ "loss": 0.02509160526096821,
1420
+ "mean_token_accuracy": 0.9920520819723606,
1421
+ "num_tokens": 771135.0,
1422
  "step": 139
1423
  },
1424
  {
1425
+ "entropy": 0.20955495908856392,
1426
  "epoch": 1.7105666156202144,
1427
+ "grad_norm": 0.36328125,
1428
  "learning_rate": 3.048780487804878e-05,
1429
+ "loss": 0.03856905177235603,
1430
+ "mean_token_accuracy": 0.9877506978809834,
1431
+ "num_tokens": 776727.0,
1432
  "step": 140
1433
  },
1434
  {
1435
+ "entropy": 0.17796193715184927,
1436
  "epoch": 1.7228177641653906,
1437
+ "grad_norm": 0.271484375,
1438
  "learning_rate": 2.926829268292683e-05,
1439
+ "loss": 0.03061492368578911,
1440
+ "mean_token_accuracy": 0.9933489374816418,
1441
+ "num_tokens": 782352.0,
1442
  "step": 141
1443
  },
1444
  {
1445
+ "entropy": 0.19299636129289865,
1446
  "epoch": 1.7350689127105667,
1447
+ "grad_norm": 0.2392578125,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
+ "loss": 0.03383423760533333,
1450
+ "mean_token_accuracy": 0.9913677796721458,
1451
+ "num_tokens": 787139.0,
1452
  "step": 142
1453
  },
1454
  {
1455
+ "entropy": 0.2032350143417716,
1456
  "epoch": 1.7473200612557427,
1457
+ "grad_norm": 0.314453125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
+ "loss": 0.03458622097969055,
1460
+ "mean_token_accuracy": 0.9920257851481438,
1461
+ "num_tokens": 792244.0,
1462
  "step": 143
1463
  },
1464
  {
1465
+ "entropy": 0.21589675825089216,
1466
  "epoch": 1.7595712098009189,
1467
+ "grad_norm": 0.27734375,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
+ "loss": 0.029654916375875473,
1470
+ "mean_token_accuracy": 0.9936717823147774,
1471
+ "num_tokens": 797998.0,
1472
  "step": 144
1473
  },
1474
  {
1475
+ "entropy": 0.19791326764971018,
1476
  "epoch": 1.7718223583460948,
1477
+ "grad_norm": 0.1748046875,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
+ "loss": 0.019491517916321754,
1480
+ "mean_token_accuracy": 0.9953687153756618,
1481
+ "num_tokens": 803118.0,
1482
  "step": 145
1483
  },
1484
  {
1485
+ "entropy": 0.19606765313073993,
1486
  "epoch": 1.784073506891271,
1487
+ "grad_norm": 0.2236328125,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
+ "loss": 0.017046257853507996,
1490
+ "mean_token_accuracy": 0.9934666827321053,
1491
+ "num_tokens": 808709.0,
1492
  "step": 146
1493
  },
1494
  {
1495
+ "entropy": 0.17984948493540287,
1496
  "epoch": 1.7963246554364471,
1497
+ "grad_norm": 0.2119140625,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
+ "loss": 0.028008146211504936,
1500
+ "mean_token_accuracy": 0.9918750263750553,
1501
+ "num_tokens": 815053.0,
1502
  "step": 147
1503
  },
1504
  {
1505
+ "entropy": 0.19215012807399035,
1506
  "epoch": 1.8085758039816233,
1507
+ "grad_norm": 0.212890625,
1508
  "learning_rate": 2.073170731707317e-05,
1509
+ "loss": 0.02620745822787285,
1510
+ "mean_token_accuracy": 0.9895812347531319,
1511
+ "num_tokens": 821046.0,
1512
  "step": 148
1513
  },
1514
  {
1515
+ "entropy": 0.1954274857416749,
1516
  "epoch": 1.8208269525267995,
1517
+ "grad_norm": 0.1630859375,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
+ "loss": 0.012469938956201077,
1520
+ "mean_token_accuracy": 0.9970379211008549,
1521
+ "num_tokens": 825773.0,
1522
  "step": 149
1523
  },
1524
  {
1525
+ "entropy": 0.20444792695343494,
1526
  "epoch": 1.8330781010719757,
1527
+ "grad_norm": 0.3671875,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
+ "loss": 0.029102876782417297,
1530
+ "mean_token_accuracy": 0.9916210547089577,
1531
+ "num_tokens": 831944.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
+ "eval_entropy": 0.20245846825233405,
1537
+ "eval_loss": 0.07568201422691345,
1538
+ "eval_mean_token_accuracy": 0.973983341369076,
1539
+ "eval_num_tokens": 831944.0,
1540
+ "eval_runtime": 56.7259,
1541
+ "eval_samples_per_second": 1.216,
1542
+ "eval_steps_per_second": 1.216,
1543
  "step": 150
1544
  },
1545
  {
1546
+ "entropy": 0.18990392005071044,
1547
  "epoch": 1.8453292496171516,
1548
+ "grad_norm": 0.2734375,
1549
  "learning_rate": 1.707317073170732e-05,
1550
+ "loss": 0.019072150811553,
1551
+ "mean_token_accuracy": 0.9943390414118767,
1552
+ "num_tokens": 836732.0,
1553
  "step": 151
1554
  },
1555
  {
1556
+ "entropy": 0.2014783564954996,
1557
  "epoch": 1.8575803981623276,
1558
+ "grad_norm": 0.232421875,
1559
  "learning_rate": 1.5853658536585366e-05,
1560
+ "loss": 0.02772960253059864,
1561
+ "mean_token_accuracy": 0.9943372644484043,
1562
+ "num_tokens": 843548.0,
1563
  "step": 152
1564
  },
1565
  {
1566
+ "entropy": 0.20029952516779304,
1567
  "epoch": 1.8698315467075037,
1568
+ "grad_norm": 0.291015625,
1569
  "learning_rate": 1.4634146341463415e-05,
1570
+ "loss": 0.029696376994252205,
1571
+ "mean_token_accuracy": 0.9905722960829735,
1572
+ "num_tokens": 849264.0,
1573
  "step": 153
1574
  },
1575
  {
1576
+ "entropy": 0.1881282702088356,
1577
  "epoch": 1.88208269525268,
1578
+ "grad_norm": 0.1953125,
1579
  "learning_rate": 1.3414634146341466e-05,
1580
+ "loss": 0.01899532601237297,
1581
+ "mean_token_accuracy": 0.9949756152927876,
1582
+ "num_tokens": 855941.0,
1583
  "step": 154
1584
  },
1585
  {
1586
+ "entropy": 0.20087886042892933,
1587
  "epoch": 1.894333843797856,
1588
+ "grad_norm": 0.28125,
1589
  "learning_rate": 1.2195121951219513e-05,
1590
+ "loss": 0.027130059897899628,
1591
+ "mean_token_accuracy": 0.990227460861206,
1592
+ "num_tokens": 861676.0,
1593
  "step": 155
1594
  },
1595
  {
1596
+ "entropy": 0.18239097949117422,
1597
  "epoch": 1.9065849923430322,
1598
+ "grad_norm": 0.2255859375,
1599
  "learning_rate": 1.0975609756097562e-05,
1600
+ "loss": 0.021481823176145554,
1601
+ "mean_token_accuracy": 0.994240652769804,
1602
+ "num_tokens": 867376.0,
1603
  "step": 156
1604
  },
1605
  {
1606
+ "entropy": 0.19513252703472972,
1607
  "epoch": 1.9188361408882084,
1608
+ "grad_norm": 0.390625,
1609
  "learning_rate": 9.756097560975611e-06,
1610
+ "loss": 0.03814350813627243,
1611
+ "mean_token_accuracy": 0.9888629019260406,
1612
+ "num_tokens": 873357.0,
1613
  "step": 157
1614
  },
1615
  {
1616
+ "entropy": 0.19895873684436083,
1617
  "epoch": 1.9310872894333844,
1618
+ "grad_norm": 0.28515625,
1619
  "learning_rate": 8.53658536585366e-06,
1620
+ "loss": 0.030593648552894592,
1621
+ "mean_token_accuracy": 0.9909784123301506,
1622
+ "num_tokens": 879168.0,
1623
  "step": 158
1624
  },
1625
  {
1626
+ "entropy": 0.19939070381224155,
1627
  "epoch": 1.9433384379785605,
1628
+ "grad_norm": 0.2392578125,
1629
  "learning_rate": 7.317073170731707e-06,
1630
+ "loss": 0.03540084883570671,
1631
+ "mean_token_accuracy": 0.9887162260711193,
1632
+ "num_tokens": 884239.0,
1633
  "step": 159
1634
  },
1635
  {
1636
+ "entropy": 0.19125983119010925,
1637
  "epoch": 1.9555895865237365,
1638
+ "grad_norm": 0.2890625,
1639
  "learning_rate": 6.0975609756097564e-06,
1640
+ "loss": 0.03378206118941307,
1641
+ "mean_token_accuracy": 0.9916012957692146,
1642
+ "num_tokens": 889409.0,
1643
  "step": 160
1644
  },
1645
  {
1646
+ "entropy": 0.21400849102064967,
1647
  "epoch": 1.9678407350689127,
1648
+ "grad_norm": 0.208984375,
1649
  "learning_rate": 4.8780487804878055e-06,
1650
+ "loss": 0.033363211899995804,
1651
+ "mean_token_accuracy": 0.9923242144286633,
1652
+ "num_tokens": 895282.0,
1653
  "step": 161
1654
  },
1655
  {
1656
+ "entropy": 0.22280079126358032,
1657
  "epoch": 1.9800918836140888,
1658
+ "grad_norm": 0.181640625,
1659
  "learning_rate": 3.6585365853658537e-06,
1660
+ "loss": 0.016371803358197212,
1661
+ "mean_token_accuracy": 0.9944233559072018,
1662
+ "num_tokens": 899869.0,
1663
  "step": 162
1664
  },
1665
  {
1666
+ "entropy": 0.21370396204292774,
1667
  "epoch": 1.992343032159265,
1668
+ "grad_norm": 0.279296875,
1669
  "learning_rate": 2.4390243902439027e-06,
1670
+ "loss": 0.024234982207417488,
1671
+ "mean_token_accuracy": 0.9939975440502167,
1672
+ "num_tokens": 905419.0,
1673
  "step": 163
1674
  },
1675
  {
1676
+ "entropy": 0.19501846730709077,
1677
  "epoch": 2.0,
1678
+ "grad_norm": 0.244140625,
1679
  "learning_rate": 1.2195121951219514e-06,
1680
+ "loss": 0.021054470911622047,
1681
+ "mean_token_accuracy": 0.9963582038879395,
1682
+ "num_tokens": 909356.0,
1683
  "step": 164
1684
  }
1685
  ],
 
1700
  "attributes": {}
1701
  }
1702
  },
1703
+ "total_flos": 4.117673661068083e+16,
1704
  "train_batch_size": 1,
1705
  "trial_name": null,
1706
  "trial_params": null