stefanocarrera commited on
Commit
0689088
·
verified ·
1 Parent(s): 9288eec

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "gate_proj",
33
- "o_proj",
34
- "k_proj",
35
  "up_proj",
36
- "down_proj",
 
37
  "q_proj",
38
- "v_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
 
32
  "up_proj",
33
+ "k_proj",
34
+ "gate_proj",
35
  "q_proj",
36
+ "down_proj",
37
+ "v_proj",
38
+ "o_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4d0e360ca66849bb30c4a864d9c2d2c5f15c97cf908091ea556172e8b63d3c2
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c50446119658169e4b938126c27c77bafe70a24c07da2bdb29a80ed9e37b2df
3
  size 83946192
checkpoint-150/adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "gate_proj",
33
- "k_proj",
34
  "up_proj",
35
- "o_proj",
 
 
36
  "down_proj",
37
  "v_proj",
38
- "q_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "up_proj",
33
+ "k_proj",
34
+ "gate_proj",
35
+ "q_proj",
36
  "down_proj",
37
  "v_proj",
38
+ "o_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
checkpoint-150/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e14cd85efc6181de7bf67dfba627ecc3a8e85902faa37a8e88f8f7ad6ddaf4c4
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7692fca28df068d449d21db768ffa34ee2c882830fc15488e1768973612cf902
3
  size 83946192
checkpoint-150/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2971f43ea71fcc46b772be99c7c59c284402d1ee484c1b80d229b38c50998252
3
  size 85728997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e82c72c5dbb9a0cc2b44eadadae5fc00bdfc4e80027c573c0cdaa6f5541c61b2
3
  size 85728997
checkpoint-150/trainer_state.json CHANGED
@@ -10,1536 +10,1536 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 0.32083135563880205,
14
  "epoch": 0.01225114854517611,
15
- "grad_norm": 0.134765625,
16
  "learning_rate": 0.0002,
17
- "loss": 0.019214527681469917,
18
- "mean_token_accuracy": 0.9918519593775272,
19
- "num_tokens": 6092.0,
20
  "step": 1
21
  },
22
  {
23
- "entropy": 0.3576695416122675,
24
  "epoch": 0.02450229709035222,
25
- "grad_norm": 0.50390625,
26
  "learning_rate": 0.00019878048780487805,
27
- "loss": 0.03324645012617111,
28
- "mean_token_accuracy": 0.988272774964571,
29
- "num_tokens": 11535.0,
30
  "step": 2
31
  },
32
  {
33
- "entropy": 0.33352363388985395,
34
  "epoch": 0.036753445635528334,
35
- "grad_norm": 0.0272216796875,
36
  "learning_rate": 0.0001975609756097561,
37
- "loss": 0.0017091021873056889,
38
- "mean_token_accuracy": 1.0,
39
- "num_tokens": 16432.0,
40
  "step": 3
41
  },
42
  {
43
- "entropy": 0.35098350048065186,
44
  "epoch": 0.04900459418070444,
45
- "grad_norm": 0.06640625,
46
  "learning_rate": 0.00019634146341463416,
47
- "loss": 0.00414489908143878,
48
- "mean_token_accuracy": 0.9985632188618183,
49
- "num_tokens": 20507.0,
50
  "step": 4
51
  },
52
  {
53
- "entropy": 0.3005372080951929,
54
  "epoch": 0.06125574272588055,
55
- "grad_norm": 0.01416015625,
56
  "learning_rate": 0.0001951219512195122,
57
- "loss": 0.0008560216519981623,
58
- "mean_token_accuracy": 1.0,
59
- "num_tokens": 26122.0,
60
  "step": 5
61
  },
62
  {
63
- "entropy": 0.3177621979266405,
64
  "epoch": 0.07350689127105667,
65
- "grad_norm": 0.008544921875,
66
  "learning_rate": 0.00019390243902439025,
67
- "loss": 0.0005585744511336088,
68
- "mean_token_accuracy": 1.0,
69
- "num_tokens": 30847.0,
70
  "step": 6
71
  },
72
  {
73
- "entropy": 0.27754624653607607,
74
  "epoch": 0.08575803981623277,
75
- "grad_norm": 0.019775390625,
76
  "learning_rate": 0.0001926829268292683,
77
- "loss": 0.0012820134870707989,
78
- "mean_token_accuracy": 0.9998413696885109,
79
- "num_tokens": 36541.0,
80
  "step": 7
81
  },
82
  {
83
- "entropy": 0.30307829193770885,
84
  "epoch": 0.09800918836140889,
85
- "grad_norm": 0.004364013671875,
86
  "learning_rate": 0.00019146341463414633,
87
- "loss": 0.0003136860905215144,
88
- "mean_token_accuracy": 1.0,
89
- "num_tokens": 41001.0,
90
  "step": 8
91
  },
92
  {
93
- "entropy": 0.31226138956844807,
94
  "epoch": 0.11026033690658499,
95
- "grad_norm": 0.11767578125,
96
  "learning_rate": 0.0001902439024390244,
97
- "loss": 0.006275261752307415,
98
- "mean_token_accuracy": 0.9993216060101986,
99
- "num_tokens": 45467.0,
100
  "step": 9
101
  },
102
  {
103
- "entropy": 0.2779384208843112,
104
  "epoch": 0.1225114854517611,
105
- "grad_norm": 0.011474609375,
106
  "learning_rate": 0.00018902439024390244,
107
- "loss": 0.0006869531353004277,
108
- "mean_token_accuracy": 1.0,
109
- "num_tokens": 50478.0,
110
  "step": 10
111
  },
112
  {
113
- "entropy": 0.27587867714464664,
114
  "epoch": 0.13476263399693722,
115
- "grad_norm": 0.00188446044921875,
116
  "learning_rate": 0.0001878048780487805,
117
- "loss": 0.0001916390028782189,
118
- "mean_token_accuracy": 1.0,
119
- "num_tokens": 56181.0,
120
  "step": 11
121
  },
122
  {
123
- "entropy": 0.2948900917544961,
124
  "epoch": 0.14701378254211334,
125
- "grad_norm": 0.07177734375,
126
  "learning_rate": 0.00018658536585365856,
127
- "loss": 0.001886777114123106,
128
- "mean_token_accuracy": 0.9998650103807449,
129
- "num_tokens": 62946.0,
130
  "step": 12
131
  },
132
  {
133
- "entropy": 0.29555963445454836,
134
  "epoch": 0.15926493108728942,
135
- "grad_norm": 0.005523681640625,
136
  "learning_rate": 0.0001853658536585366,
137
- "loss": 0.00017441912495996803,
138
- "mean_token_accuracy": 1.0,
139
- "num_tokens": 68436.0,
140
  "step": 13
141
  },
142
  {
143
- "entropy": 0.287986209616065,
144
  "epoch": 0.17151607963246554,
145
- "grad_norm": 0.02001953125,
146
  "learning_rate": 0.00018414634146341464,
147
- "loss": 0.00017802949878387153,
148
- "mean_token_accuracy": 1.0,
149
- "num_tokens": 73603.0,
150
  "step": 14
151
  },
152
  {
153
- "entropy": 0.3127295421436429,
154
  "epoch": 0.18376722817764166,
155
- "grad_norm": 0.06787109375,
156
  "learning_rate": 0.0001829268292682927,
157
- "loss": 0.0010371531825512648,
158
- "mean_token_accuracy": 0.9995941556990147,
159
- "num_tokens": 77845.0,
160
  "step": 15
161
  },
162
  {
163
- "entropy": 0.2922206539660692,
164
  "epoch": 0.19601837672281777,
165
- "grad_norm": 0.00118255615234375,
166
  "learning_rate": 0.00018170731707317075,
167
- "loss": 0.00011905122664757073,
168
- "mean_token_accuracy": 1.0,
169
- "num_tokens": 82744.0,
170
  "step": 16
171
  },
172
  {
173
- "entropy": 0.2928574001416564,
174
  "epoch": 0.2082695252679939,
175
- "grad_norm": 0.0003719329833984375,
176
  "learning_rate": 0.0001804878048780488,
177
- "loss": 7.616190850967541e-05,
178
- "mean_token_accuracy": 1.0,
179
- "num_tokens": 87453.0,
180
  "step": 17
181
  },
182
  {
183
- "entropy": 0.2979039028286934,
184
  "epoch": 0.22052067381316998,
185
- "grad_norm": 0.0026702880859375,
186
  "learning_rate": 0.00017926829268292684,
187
- "loss": 0.00012367898307275027,
188
- "mean_token_accuracy": 1.0,
189
- "num_tokens": 92321.0,
190
  "step": 18
191
  },
192
  {
193
- "entropy": 0.31858293898403645,
194
  "epoch": 0.2327718223583461,
195
- "grad_norm": 0.10498046875,
196
  "learning_rate": 0.00017804878048780488,
197
- "loss": 0.0006579139153473079,
198
- "mean_token_accuracy": 0.9997499994933605,
199
- "num_tokens": 97146.0,
200
  "step": 19
201
  },
202
  {
203
- "entropy": 0.30853591673076153,
204
  "epoch": 0.2450229709035222,
205
- "grad_norm": 0.004364013671875,
206
  "learning_rate": 0.00017682926829268295,
207
- "loss": 0.00014281428593676537,
208
- "mean_token_accuracy": 1.0,
209
- "num_tokens": 101943.0,
210
  "step": 20
211
  },
212
  {
213
- "entropy": 0.34037051256746054,
214
  "epoch": 0.2572741194486983,
215
- "grad_norm": 0.056884765625,
216
  "learning_rate": 0.000175609756097561,
217
- "loss": 0.011726096272468567,
218
- "mean_token_accuracy": 0.9993422217667103,
219
- "num_tokens": 106772.0,
220
  "step": 21
221
  },
222
  {
223
- "entropy": 0.29644382931292057,
224
  "epoch": 0.26952526799387444,
225
- "grad_norm": 0.0023193359375,
226
  "learning_rate": 0.00017439024390243903,
227
- "loss": 0.00010672100324882194,
228
- "mean_token_accuracy": 1.0,
229
- "num_tokens": 112558.0,
230
  "step": 22
231
  },
232
  {
233
- "entropy": 0.3180191367864609,
234
  "epoch": 0.28177641653905056,
235
- "grad_norm": 0.000675201416015625,
236
  "learning_rate": 0.00017317073170731708,
237
- "loss": 9.894849790725857e-05,
238
- "mean_token_accuracy": 1.0,
239
- "num_tokens": 117489.0,
240
  "step": 23
241
  },
242
  {
243
- "entropy": 0.32946281880140305,
244
  "epoch": 0.29402756508422667,
245
- "grad_norm": 0.0242919921875,
246
  "learning_rate": 0.00017195121951219512,
247
- "loss": 0.0029232932720333338,
248
- "mean_token_accuracy": 0.9996279776096344,
249
- "num_tokens": 123010.0,
250
  "step": 24
251
  },
252
  {
253
- "entropy": 0.3180750487372279,
254
  "epoch": 0.30627871362940273,
255
- "grad_norm": 0.038330078125,
256
  "learning_rate": 0.0001707317073170732,
257
- "loss": 0.0015810562763363123,
258
- "mean_token_accuracy": 0.9990344606339931,
259
- "num_tokens": 127716.0,
260
  "step": 25
261
  },
262
  {
263
- "entropy": 0.31262985058128834,
264
  "epoch": 0.31852986217457885,
265
- "grad_norm": 0.0027313232421875,
266
  "learning_rate": 0.00016951219512195123,
267
- "loss": 0.00019670175970532,
268
- "mean_token_accuracy": 1.0,
269
- "num_tokens": 132372.0,
270
  "step": 26
271
  },
272
  {
273
- "entropy": 0.2831157138571143,
274
  "epoch": 0.33078101071975496,
275
- "grad_norm": 0.1484375,
276
  "learning_rate": 0.00016829268292682927,
277
- "loss": 0.003187144873663783,
278
- "mean_token_accuracy": 0.9994877055287361,
279
- "num_tokens": 137028.0,
280
  "step": 27
281
  },
282
  {
283
- "entropy": 0.3106652954593301,
284
  "epoch": 0.3430321592649311,
285
- "grad_norm": 0.05810546875,
286
  "learning_rate": 0.00016707317073170731,
287
- "loss": 0.004998125601559877,
288
- "mean_token_accuracy": 0.9980670101940632,
289
- "num_tokens": 142088.0,
290
  "step": 28
291
  },
292
  {
293
- "entropy": 0.31454288959503174,
294
  "epoch": 0.3552833078101072,
295
- "grad_norm": 0.0306396484375,
296
  "learning_rate": 0.00016585365853658536,
297
- "loss": 0.000461318384623155,
298
- "mean_token_accuracy": 1.0,
299
- "num_tokens": 147481.0,
300
  "step": 29
301
  },
302
  {
303
- "entropy": 0.33650430012494326,
304
  "epoch": 0.3675344563552833,
305
- "grad_norm": 0.0238037109375,
306
  "learning_rate": 0.00016463414634146343,
307
- "loss": 0.0005614800029434264,
308
- "mean_token_accuracy": 1.0,
309
- "num_tokens": 152973.0,
310
  "step": 30
311
  },
312
  {
313
- "entropy": 0.33513325452804565,
314
  "epoch": 0.37978560490045943,
315
- "grad_norm": 0.00604248046875,
316
  "learning_rate": 0.00016341463414634147,
317
- "loss": 0.00020872258755844086,
318
- "mean_token_accuracy": 1.0,
319
- "num_tokens": 156786.0,
320
  "step": 31
321
  },
322
  {
323
- "entropy": 0.34442581795156,
324
  "epoch": 0.39203675344563554,
325
- "grad_norm": 0.0159912109375,
326
  "learning_rate": 0.00016219512195121954,
327
- "loss": 0.00043797443504445255,
328
- "mean_token_accuracy": 1.0,
329
- "num_tokens": 162859.0,
330
  "step": 32
331
  },
332
  {
333
- "entropy": 0.34709672816097736,
334
  "epoch": 0.40428790199081166,
335
- "grad_norm": 0.04052734375,
336
  "learning_rate": 0.00016097560975609758,
337
- "loss": 0.0008612321689724922,
338
- "mean_token_accuracy": 1.0,
339
- "num_tokens": 167969.0,
340
  "step": 33
341
  },
342
  {
343
- "entropy": 0.31636961828917265,
344
  "epoch": 0.4165390505359878,
345
- "grad_norm": 0.048583984375,
346
  "learning_rate": 0.00015975609756097562,
347
- "loss": 0.001623529358766973,
348
- "mean_token_accuracy": 1.0,
349
- "num_tokens": 172518.0,
350
  "step": 34
351
  },
352
  {
353
- "entropy": 0.341240718960762,
354
  "epoch": 0.42879019908116384,
355
- "grad_norm": 0.0089111328125,
356
  "learning_rate": 0.00015853658536585366,
357
- "loss": 0.0004598334198817611,
358
- "mean_token_accuracy": 1.0,
359
- "num_tokens": 177085.0,
360
  "step": 35
361
  },
362
  {
363
- "entropy": 0.3331515807658434,
364
  "epoch": 0.44104134762633995,
365
- "grad_norm": 0.0137939453125,
366
  "learning_rate": 0.00015731707317073173,
367
- "loss": 0.00047711117076687515,
368
- "mean_token_accuracy": 1.0,
369
- "num_tokens": 181617.0,
370
  "step": 36
371
  },
372
  {
373
- "entropy": 0.2969168536365032,
374
  "epoch": 0.45329249617151607,
375
- "grad_norm": 0.0296630859375,
376
  "learning_rate": 0.00015609756097560978,
377
- "loss": 0.0018673602025955915,
378
- "mean_token_accuracy": 0.9982142858207226,
379
- "num_tokens": 186836.0,
380
  "step": 37
381
  },
382
  {
383
- "entropy": 0.3208611598238349,
384
  "epoch": 0.4655436447166922,
385
- "grad_norm": 0.0034027099609375,
386
  "learning_rate": 0.00015487804878048782,
387
- "loss": 0.00018661899957805872,
388
- "mean_token_accuracy": 1.0,
389
- "num_tokens": 191224.0,
390
  "step": 38
391
  },
392
  {
393
- "entropy": 0.296407300978899,
394
  "epoch": 0.4777947932618683,
395
- "grad_norm": 0.003570556640625,
396
  "learning_rate": 0.00015365853658536586,
397
- "loss": 0.0001632017083466053,
398
- "mean_token_accuracy": 1.0,
399
- "num_tokens": 195926.0,
400
  "step": 39
401
  },
402
  {
403
- "entropy": 0.32142599392682314,
404
  "epoch": 0.4900459418070444,
405
- "grad_norm": 0.0277099609375,
406
  "learning_rate": 0.0001524390243902439,
407
- "loss": 0.0039696223102509975,
408
- "mean_token_accuracy": 0.9992866478860378,
409
- "num_tokens": 200772.0,
410
  "step": 40
411
  },
412
  {
413
- "entropy": 0.3037592498585582,
414
  "epoch": 0.5022970903522205,
415
- "grad_norm": 0.0026092529296875,
416
  "learning_rate": 0.00015121951219512197,
417
- "loss": 0.00013867147208657116,
418
- "mean_token_accuracy": 1.0,
419
- "num_tokens": 204499.0,
420
  "step": 41
421
  },
422
  {
423
- "entropy": 0.31665132474154234,
424
  "epoch": 0.5145482388973966,
425
- "grad_norm": 0.004730224609375,
426
  "learning_rate": 0.00015000000000000001,
427
- "loss": 0.00025882094632834196,
428
- "mean_token_accuracy": 1.0,
429
- "num_tokens": 208814.0,
430
  "step": 42
431
  },
432
  {
433
- "entropy": 0.33023010194301605,
434
  "epoch": 0.5267993874425727,
435
- "grad_norm": 0.001922607421875,
436
  "learning_rate": 0.00014878048780487806,
437
- "loss": 0.00019074659212492406,
438
- "mean_token_accuracy": 1.0,
439
- "num_tokens": 213907.0,
440
  "step": 43
441
  },
442
  {
443
- "entropy": 0.334543508477509,
444
  "epoch": 0.5390505359877489,
445
- "grad_norm": 0.0018157958984375,
446
  "learning_rate": 0.0001475609756097561,
447
- "loss": 0.00011566472676349804,
448
- "mean_token_accuracy": 1.0,
449
- "num_tokens": 218988.0,
450
  "step": 44
451
  },
452
  {
453
- "entropy": 0.3078083451837301,
454
  "epoch": 0.5513016845329249,
455
- "grad_norm": 0.03515625,
456
  "learning_rate": 0.00014634146341463414,
457
- "loss": 0.0022110757417976856,
458
- "mean_token_accuracy": 0.9987903237342834,
459
- "num_tokens": 223595.0,
460
  "step": 45
461
  },
462
  {
463
- "entropy": 0.32667472772300243,
464
  "epoch": 0.5635528330781011,
465
- "grad_norm": 0.034423828125,
466
  "learning_rate": 0.0001451219512195122,
467
- "loss": 0.0010719874408096075,
468
- "mean_token_accuracy": 0.9991953931748867,
469
- "num_tokens": 228244.0,
470
  "step": 46
471
  },
472
  {
473
- "entropy": 0.3273861287161708,
474
  "epoch": 0.5758039816232772,
475
- "grad_norm": 0.00057220458984375,
476
  "learning_rate": 0.00014390243902439025,
477
- "loss": 6.594268779736012e-05,
478
- "mean_token_accuracy": 1.0,
479
- "num_tokens": 232606.0,
480
  "step": 47
481
  },
482
  {
483
- "entropy": 0.31728990003466606,
484
  "epoch": 0.5880551301684533,
485
- "grad_norm": 0.0003185272216796875,
486
  "learning_rate": 0.0001426829268292683,
487
- "loss": 8.574798266636208e-05,
488
- "mean_token_accuracy": 1.0,
489
- "num_tokens": 236563.0,
490
  "step": 48
491
  },
492
  {
493
- "entropy": 0.34826087579131126,
494
  "epoch": 0.6003062787136294,
495
- "grad_norm": 0.00390625,
496
  "learning_rate": 0.00014146341463414634,
497
- "loss": 0.00015243196685332805,
498
- "mean_token_accuracy": 1.0,
499
- "num_tokens": 241214.0,
500
  "step": 49
501
  },
502
  {
503
- "entropy": 0.3367287954315543,
504
  "epoch": 0.6125574272588055,
505
- "grad_norm": 0.003265380859375,
506
  "learning_rate": 0.00014024390243902438,
507
- "loss": 0.0001341242023045197,
508
- "mean_token_accuracy": 1.0,
509
- "num_tokens": 245200.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
- "eval_entropy": 0.3212364659361217,
515
- "eval_loss": 0.0014040147652849555,
516
- "eval_mean_token_accuracy": 0.9998166846192401,
517
- "eval_num_tokens": 245200.0,
518
- "eval_runtime": 51.1353,
519
- "eval_samples_per_second": 1.349,
520
- "eval_steps_per_second": 1.349,
521
  "step": 50
522
  },
523
  {
524
- "entropy": 0.3274610061198473,
525
  "epoch": 0.6248085758039816,
526
- "grad_norm": 0.000518798828125,
527
  "learning_rate": 0.00013902439024390245,
528
- "loss": 6.213193410076201e-05,
529
- "mean_token_accuracy": 1.0,
530
- "num_tokens": 249761.0,
531
  "step": 51
532
  },
533
  {
534
- "entropy": 0.3302043145522475,
535
  "epoch": 0.6370597243491577,
536
- "grad_norm": 0.00067901611328125,
537
  "learning_rate": 0.0001378048780487805,
538
- "loss": 7.391967665171251e-05,
539
- "mean_token_accuracy": 1.0,
540
- "num_tokens": 254787.0,
541
  "step": 52
542
  },
543
  {
544
- "entropy": 0.3345805983990431,
545
  "epoch": 0.6493108728943339,
546
- "grad_norm": 0.064453125,
547
  "learning_rate": 0.00013658536585365856,
548
- "loss": 0.008045142516493797,
549
- "mean_token_accuracy": 0.9975476562976837,
550
- "num_tokens": 260287.0,
551
  "step": 53
552
  },
553
  {
554
- "entropy": 0.3093695640563965,
555
  "epoch": 0.6615620214395099,
556
- "grad_norm": 0.036865234375,
557
  "learning_rate": 0.0001353658536585366,
558
- "loss": 0.0016300748102366924,
559
- "mean_token_accuracy": 0.9998249299824238,
560
- "num_tokens": 264810.0,
561
  "step": 54
562
  },
563
  {
564
- "entropy": 0.33090174850076437,
565
  "epoch": 0.6738131699846861,
566
- "grad_norm": 0.04052734375,
567
  "learning_rate": 0.00013414634146341464,
568
- "loss": 0.0037348291371017694,
569
- "mean_token_accuracy": 0.9990433678030968,
570
- "num_tokens": 270386.0,
571
  "step": 55
572
  },
573
  {
574
- "entropy": 0.3455248447135091,
575
  "epoch": 0.6860643185298622,
576
- "grad_norm": 0.0301513671875,
577
  "learning_rate": 0.0001329268292682927,
578
- "loss": 0.0006253286846913397,
579
- "mean_token_accuracy": 1.0,
580
- "num_tokens": 274391.0,
581
  "step": 56
582
  },
583
  {
584
- "entropy": 0.3408086858689785,
585
  "epoch": 0.6983154670750383,
586
- "grad_norm": 0.0033111572265625,
587
  "learning_rate": 0.00013170731707317076,
588
- "loss": 0.00020847572886850685,
589
- "mean_token_accuracy": 1.0,
590
- "num_tokens": 279716.0,
591
  "step": 57
592
  },
593
  {
594
- "entropy": 0.29423840064555407,
595
  "epoch": 0.7105666156202144,
596
- "grad_norm": 0.125,
597
  "learning_rate": 0.0001304878048780488,
598
- "loss": 0.005600863602012396,
599
- "mean_token_accuracy": 0.998680267482996,
600
- "num_tokens": 285404.0,
601
  "step": 58
602
  },
603
  {
604
- "entropy": 0.33689095824956894,
605
  "epoch": 0.7228177641653905,
606
- "grad_norm": 0.057861328125,
607
  "learning_rate": 0.00012926829268292684,
608
- "loss": 0.009100214578211308,
609
- "mean_token_accuracy": 0.9967310577630997,
610
- "num_tokens": 290021.0,
611
  "step": 59
612
  },
613
  {
614
- "entropy": 0.3336018780246377,
615
  "epoch": 0.7350689127105666,
616
- "grad_norm": 0.005889892578125,
617
  "learning_rate": 0.00012804878048780488,
618
- "loss": 0.00015729578444734216,
619
- "mean_token_accuracy": 1.0,
620
- "num_tokens": 294890.0,
621
  "step": 60
622
  },
623
  {
624
- "entropy": 0.30060291569679976,
625
  "epoch": 0.7473200612557427,
626
- "grad_norm": 0.0172119140625,
627
  "learning_rate": 0.00012682926829268293,
628
- "loss": 0.00039864826248958707,
629
- "mean_token_accuracy": 0.9993686862289906,
630
- "num_tokens": 300384.0,
631
  "step": 61
632
  },
633
  {
634
- "entropy": 0.36021818965673447,
635
  "epoch": 0.7595712098009189,
636
- "grad_norm": 0.0025634765625,
637
  "learning_rate": 0.000125609756097561,
638
- "loss": 0.00016568033606745303,
639
- "mean_token_accuracy": 1.0,
640
- "num_tokens": 305805.0,
641
  "step": 62
642
  },
643
  {
644
- "entropy": 0.32536453381180763,
645
  "epoch": 0.7718223583460949,
646
- "grad_norm": 0.001800537109375,
647
  "learning_rate": 0.00012439024390243904,
648
- "loss": 0.00014585268218070269,
649
- "mean_token_accuracy": 1.0,
650
- "num_tokens": 310233.0,
651
  "step": 63
652
  },
653
  {
654
- "entropy": 0.31967335008084774,
655
  "epoch": 0.7840735068912711,
656
- "grad_norm": 0.0010223388671875,
657
  "learning_rate": 0.00012317073170731708,
658
- "loss": 0.00010060967179015279,
659
- "mean_token_accuracy": 1.0,
660
- "num_tokens": 314234.0,
661
  "step": 64
662
  },
663
  {
664
- "entropy": 0.34358128905296326,
665
  "epoch": 0.7963246554364471,
666
- "grad_norm": 0.000743865966796875,
667
  "learning_rate": 0.00012195121951219512,
668
- "loss": 9.478208812652156e-05,
669
- "mean_token_accuracy": 1.0,
670
- "num_tokens": 319186.0,
671
  "step": 65
672
  },
673
  {
674
- "entropy": 0.33988895174115896,
675
  "epoch": 0.8085758039816233,
676
- "grad_norm": 0.0419921875,
677
  "learning_rate": 0.00012073170731707318,
678
- "loss": 0.0011607923079282045,
679
- "mean_token_accuracy": 0.9995629377663136,
680
- "num_tokens": 324710.0,
681
  "step": 66
682
  },
683
  {
684
- "entropy": 0.3078791871666908,
685
  "epoch": 0.8208269525267994,
686
- "grad_norm": 0.05859375,
687
  "learning_rate": 0.00011951219512195122,
688
- "loss": 0.016102174296975136,
689
- "mean_token_accuracy": 0.9935315921902657,
690
- "num_tokens": 329942.0,
691
  "step": 67
692
  },
693
  {
694
- "entropy": 0.3587793167680502,
695
  "epoch": 0.8330781010719756,
696
- "grad_norm": 0.002716064453125,
697
  "learning_rate": 0.00011829268292682926,
698
- "loss": 0.0001911829021992162,
699
- "mean_token_accuracy": 1.0,
700
- "num_tokens": 334487.0,
701
  "step": 68
702
  },
703
  {
704
- "entropy": 0.360817888751626,
705
  "epoch": 0.8453292496171516,
706
- "grad_norm": 0.003753662109375,
707
  "learning_rate": 0.00011707317073170732,
708
- "loss": 0.00026575953233987093,
709
- "mean_token_accuracy": 1.0,
710
- "num_tokens": 338184.0,
711
  "step": 69
712
  },
713
  {
714
- "entropy": 0.3788213599473238,
715
  "epoch": 0.8575803981623277,
716
- "grad_norm": 0.07421875,
717
  "learning_rate": 0.00011585365853658536,
718
- "loss": 0.007251895032823086,
719
- "mean_token_accuracy": 0.997805867344141,
720
- "num_tokens": 342594.0,
721
  "step": 70
722
  },
723
  {
724
- "entropy": 0.37989665009081364,
725
  "epoch": 0.8698315467075038,
726
- "grad_norm": 0.0361328125,
727
  "learning_rate": 0.00011463414634146342,
728
- "loss": 0.001519644632935524,
729
- "mean_token_accuracy": 0.9997807033360004,
730
- "num_tokens": 347798.0,
731
  "step": 71
732
  },
733
  {
734
- "entropy": 0.35538383200764656,
735
  "epoch": 0.8820826952526799,
736
- "grad_norm": 0.0038604736328125,
737
  "learning_rate": 0.00011341463414634146,
738
- "loss": 0.00030194621649570763,
739
- "mean_token_accuracy": 1.0,
740
- "num_tokens": 352122.0,
741
  "step": 72
742
  },
743
  {
744
- "entropy": 0.36578258499503136,
745
  "epoch": 0.8943338437978561,
746
- "grad_norm": 0.02001953125,
747
  "learning_rate": 0.00011219512195121953,
748
- "loss": 0.0018432819051668048,
749
- "mean_token_accuracy": 0.9997568093240261,
750
- "num_tokens": 357944.0,
751
  "step": 73
752
  },
753
  {
754
- "entropy": 0.3363148244097829,
755
  "epoch": 0.9065849923430321,
756
- "grad_norm": 0.01214599609375,
757
  "learning_rate": 0.00011097560975609757,
758
- "loss": 0.0004945008549839258,
759
- "mean_token_accuracy": 1.0,
760
- "num_tokens": 363815.0,
761
  "step": 74
762
  },
763
  {
764
- "entropy": 0.3567014401778579,
765
  "epoch": 0.9188361408882083,
766
- "grad_norm": 0.00160980224609375,
767
  "learning_rate": 0.00010975609756097563,
768
- "loss": 0.0002087215252686292,
769
- "mean_token_accuracy": 1.0,
770
- "num_tokens": 368871.0,
771
  "step": 75
772
  },
773
  {
774
- "entropy": 0.3798025632277131,
775
  "epoch": 0.9310872894333844,
776
- "grad_norm": 0.0242919921875,
777
  "learning_rate": 0.00010853658536585367,
778
- "loss": 0.0011810146970674396,
779
- "mean_token_accuracy": 0.999143835157156,
780
- "num_tokens": 373671.0,
781
  "step": 76
782
  },
783
  {
784
- "entropy": 0.3385667558759451,
785
  "epoch": 0.9433384379785605,
786
- "grad_norm": 0.00164031982421875,
787
  "learning_rate": 0.00010731707317073172,
788
- "loss": 0.00021391667542047799,
789
- "mean_token_accuracy": 1.0,
790
- "num_tokens": 379038.0,
791
  "step": 77
792
  },
793
  {
794
- "entropy": 0.37137152813374996,
795
  "epoch": 0.9555895865237366,
796
- "grad_norm": 0.0194091796875,
797
  "learning_rate": 0.00010609756097560977,
798
- "loss": 0.0009015509858727455,
799
- "mean_token_accuracy": 0.9992977529764175,
800
- "num_tokens": 384253.0,
801
  "step": 78
802
  },
803
  {
804
- "entropy": 0.35634181648492813,
805
  "epoch": 0.9678407350689127,
806
- "grad_norm": 0.002349853515625,
807
  "learning_rate": 0.00010487804878048781,
808
- "loss": 0.0003007323248311877,
809
- "mean_token_accuracy": 1.0,
810
- "num_tokens": 388348.0,
811
  "step": 79
812
  },
813
  {
814
- "entropy": 0.3363165808841586,
815
  "epoch": 0.9800918836140888,
816
- "grad_norm": 0.013916015625,
817
  "learning_rate": 0.00010365853658536586,
818
- "loss": 0.0015124119818210602,
819
- "mean_token_accuracy": 0.999507874250412,
820
- "num_tokens": 394214.0,
821
  "step": 80
822
  },
823
  {
824
- "entropy": 0.34769035689532757,
825
  "epoch": 0.9923430321592649,
826
- "grad_norm": 0.0264892578125,
827
  "learning_rate": 0.0001024390243902439,
828
- "loss": 0.0008837911300361156,
829
- "mean_token_accuracy": 0.9992187507450581,
830
- "num_tokens": 399114.0,
831
  "step": 81
832
  },
833
  {
834
- "entropy": 0.34723484665155413,
835
  "epoch": 1.0,
836
- "grad_norm": 0.002288818359375,
837
  "learning_rate": 0.00010121951219512196,
838
- "loss": 0.0002318796032341197,
839
- "mean_token_accuracy": 1.0,
840
- "num_tokens": 402130.0,
841
  "step": 82
842
  },
843
  {
844
- "entropy": 0.3677198924124241,
845
  "epoch": 1.0122511485451762,
846
- "grad_norm": 0.03173828125,
847
  "learning_rate": 0.0001,
848
- "loss": 0.0028767124749720097,
849
- "mean_token_accuracy": 0.9997509978711605,
850
- "num_tokens": 406761.0,
851
  "step": 83
852
  },
853
  {
854
- "entropy": 0.3296260507777333,
855
  "epoch": 1.0245022970903521,
856
- "grad_norm": 0.0016326904296875,
857
  "learning_rate": 9.878048780487805e-05,
858
- "loss": 0.00020801745995413512,
859
- "mean_token_accuracy": 1.0,
860
- "num_tokens": 411367.0,
861
  "step": 84
862
  },
863
  {
864
- "entropy": 0.36815651040524244,
865
  "epoch": 1.0367534456355283,
866
- "grad_norm": 0.00299072265625,
867
  "learning_rate": 9.75609756097561e-05,
868
- "loss": 0.00034169916762039065,
869
- "mean_token_accuracy": 1.0,
870
- "num_tokens": 417768.0,
871
  "step": 85
872
  },
873
  {
874
- "entropy": 0.33015719801187515,
875
  "epoch": 1.0490045941807045,
876
- "grad_norm": 0.0019683837890625,
877
  "learning_rate": 9.634146341463415e-05,
878
- "loss": 0.0002285851223859936,
879
- "mean_token_accuracy": 1.0,
880
- "num_tokens": 421738.0,
881
  "step": 86
882
  },
883
  {
884
- "entropy": 0.33297139778733253,
885
  "epoch": 1.0612557427258806,
886
- "grad_norm": 0.0003604888916015625,
887
  "learning_rate": 9.51219512195122e-05,
888
- "loss": 0.00012145948858233169,
889
- "mean_token_accuracy": 1.0,
890
- "num_tokens": 426854.0,
891
  "step": 87
892
  },
893
  {
894
- "entropy": 0.4070947393774986,
895
  "epoch": 1.0735068912710566,
896
- "grad_norm": 0.017333984375,
897
  "learning_rate": 9.390243902439024e-05,
898
- "loss": 0.0016109611606225371,
899
- "mean_token_accuracy": 0.9998486675322056,
900
- "num_tokens": 431083.0,
901
  "step": 88
902
  },
903
  {
904
- "entropy": 0.3781026881188154,
905
  "epoch": 1.0857580398162328,
906
- "grad_norm": 0.038818359375,
907
  "learning_rate": 9.26829268292683e-05,
908
- "loss": 0.003159651067107916,
909
- "mean_token_accuracy": 0.9989801794290543,
910
- "num_tokens": 435694.0,
911
  "step": 89
912
  },
913
  {
914
- "entropy": 0.3439221568405628,
915
  "epoch": 1.098009188361409,
916
- "grad_norm": 0.000949859619140625,
917
  "learning_rate": 9.146341463414635e-05,
918
- "loss": 0.00018103225738741457,
919
- "mean_token_accuracy": 1.0,
920
- "num_tokens": 440578.0,
921
  "step": 90
922
  },
923
  {
924
- "entropy": 0.38779534585773945,
925
  "epoch": 1.110260336906585,
926
- "grad_norm": 0.0142822265625,
927
  "learning_rate": 9.02439024390244e-05,
928
- "loss": 0.002015941310673952,
929
- "mean_token_accuracy": 0.9984939768910408,
930
- "num_tokens": 445238.0,
931
  "step": 91
932
  },
933
  {
934
- "entropy": 0.3697750475257635,
935
  "epoch": 1.122511485451761,
936
- "grad_norm": 0.08642578125,
937
  "learning_rate": 8.902439024390244e-05,
938
- "loss": 0.006127167027443647,
939
- "mean_token_accuracy": 0.9989957921206951,
940
- "num_tokens": 449993.0,
941
  "step": 92
942
  },
943
  {
944
- "entropy": 0.34917816519737244,
945
  "epoch": 1.1347626339969372,
946
- "grad_norm": 0.0037384033203125,
947
  "learning_rate": 8.78048780487805e-05,
948
- "loss": 0.00024314325128216296,
949
- "mean_token_accuracy": 1.0,
950
- "num_tokens": 454976.0,
951
  "step": 93
952
  },
953
  {
954
- "entropy": 0.3524725306779146,
955
  "epoch": 1.1470137825421134,
956
- "grad_norm": 0.00104522705078125,
957
  "learning_rate": 8.658536585365854e-05,
958
- "loss": 0.00014462518447544426,
959
- "mean_token_accuracy": 1.0,
960
- "num_tokens": 459671.0,
961
  "step": 94
962
  },
963
  {
964
- "entropy": 0.3524913527071476,
965
  "epoch": 1.1592649310872893,
966
- "grad_norm": 0.000782012939453125,
967
  "learning_rate": 8.53658536585366e-05,
968
- "loss": 0.0001363266637781635,
969
- "mean_token_accuracy": 1.0,
970
- "num_tokens": 464310.0,
971
  "step": 95
972
  },
973
  {
974
- "entropy": 0.33474782202392817,
975
  "epoch": 1.1715160796324655,
976
- "grad_norm": 0.05615234375,
977
  "learning_rate": 8.414634146341464e-05,
978
- "loss": 0.006995758973062038,
979
- "mean_token_accuracy": 0.997385773807764,
980
- "num_tokens": 468855.0,
981
  "step": 96
982
  },
983
  {
984
- "entropy": 0.34024662896990776,
985
  "epoch": 1.1837672281776417,
986
- "grad_norm": 0.000762939453125,
987
  "learning_rate": 8.292682926829268e-05,
988
- "loss": 0.00012206919927848503,
989
- "mean_token_accuracy": 1.0,
990
- "num_tokens": 473729.0,
991
  "step": 97
992
  },
993
  {
994
- "entropy": 0.35474758967757225,
995
  "epoch": 1.1960183767228179,
996
- "grad_norm": 0.032958984375,
997
  "learning_rate": 8.170731707317073e-05,
998
- "loss": 0.0028819667641073465,
999
- "mean_token_accuracy": 0.9993131868541241,
1000
- "num_tokens": 479034.0,
1001
  "step": 98
1002
  },
1003
  {
1004
- "entropy": 0.3854726795107126,
1005
  "epoch": 1.2082695252679938,
1006
- "grad_norm": 0.00046539306640625,
1007
  "learning_rate": 8.048780487804879e-05,
1008
- "loss": 9.724850679049268e-05,
1009
- "mean_token_accuracy": 1.0,
1010
- "num_tokens": 484808.0,
1011
  "step": 99
1012
  },
1013
  {
1014
- "entropy": 0.31455889251083136,
1015
  "epoch": 1.22052067381317,
1016
- "grad_norm": 0.00958251953125,
1017
  "learning_rate": 7.926829268292683e-05,
1018
- "loss": 0.0009833230869844556,
1019
- "mean_token_accuracy": 1.0,
1020
- "num_tokens": 489519.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
- "eval_entropy": 0.3496412036643512,
1026
- "eval_loss": 0.0005010219174437225,
1027
- "eval_mean_token_accuracy": 0.9998490343923154,
1028
- "eval_num_tokens": 489519.0,
1029
- "eval_runtime": 51.1698,
1030
- "eval_samples_per_second": 1.348,
1031
- "eval_steps_per_second": 1.348,
1032
  "step": 100
1033
  },
1034
  {
1035
- "entropy": 0.36140021588653326,
1036
  "epoch": 1.2327718223583461,
1037
- "grad_norm": 0.0006256103515625,
1038
  "learning_rate": 7.804878048780489e-05,
1039
- "loss": 0.00011641360470093787,
1040
- "mean_token_accuracy": 1.0,
1041
- "num_tokens": 494754.0,
1042
  "step": 101
1043
  },
1044
  {
1045
- "entropy": 0.33879768289625645,
1046
  "epoch": 1.245022970903522,
1047
- "grad_norm": 0.00037384033203125,
1048
  "learning_rate": 7.682926829268293e-05,
1049
- "loss": 0.00010185636347159743,
1050
- "mean_token_accuracy": 1.0,
1051
- "num_tokens": 499834.0,
1052
  "step": 102
1053
  },
1054
  {
1055
- "entropy": 0.36160764284431934,
1056
  "epoch": 1.2572741194486983,
1057
- "grad_norm": 0.00103759765625,
1058
  "learning_rate": 7.560975609756099e-05,
1059
- "loss": 0.00012021363363601267,
1060
- "mean_token_accuracy": 1.0,
1061
- "num_tokens": 505264.0,
1062
  "step": 103
1063
  },
1064
  {
1065
- "entropy": 0.3344170628115535,
1066
  "epoch": 1.2695252679938744,
1067
- "grad_norm": 0.06787109375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
- "loss": 0.00044063289533369243,
1070
- "mean_token_accuracy": 0.9995915032923222,
1071
- "num_tokens": 510257.0,
1072
  "step": 104
1073
  },
1074
  {
1075
- "entropy": 0.36058457661420107,
1076
  "epoch": 1.2817764165390506,
1077
- "grad_norm": 0.01336669921875,
1078
  "learning_rate": 7.317073170731707e-05,
1079
- "loss": 0.0015127016231417656,
1080
- "mean_token_accuracy": 0.9993556700646877,
1081
- "num_tokens": 514490.0,
1082
  "step": 105
1083
  },
1084
  {
1085
- "entropy": 0.33314079977571964,
1086
  "epoch": 1.2940275650842268,
1087
- "grad_norm": 0.0011749267578125,
1088
  "learning_rate": 7.195121951219513e-05,
1089
- "loss": 0.00011071039625676349,
1090
- "mean_token_accuracy": 1.0,
1091
- "num_tokens": 519508.0,
1092
  "step": 106
1093
  },
1094
  {
1095
- "entropy": 0.3573821699246764,
1096
  "epoch": 1.3062787136294027,
1097
- "grad_norm": 0.0003986358642578125,
1098
  "learning_rate": 7.073170731707317e-05,
1099
- "loss": 0.00011713722778949887,
1100
- "mean_token_accuracy": 1.0,
1101
- "num_tokens": 524370.0,
1102
  "step": 107
1103
  },
1104
  {
1105
- "entropy": 0.3524222169071436,
1106
  "epoch": 1.318529862174579,
1107
- "grad_norm": 0.0003108978271484375,
1108
  "learning_rate": 6.951219512195122e-05,
1109
- "loss": 9.721294190967456e-05,
1110
- "mean_token_accuracy": 1.0,
1111
- "num_tokens": 528970.0,
1112
  "step": 108
1113
  },
1114
  {
1115
- "entropy": 0.3544369339942932,
1116
  "epoch": 1.3307810107197549,
1117
- "grad_norm": 0.005950927734375,
1118
  "learning_rate": 6.829268292682928e-05,
1119
- "loss": 0.0003032644744962454,
1120
- "mean_token_accuracy": 1.0,
1121
- "num_tokens": 533938.0,
1122
  "step": 109
1123
  },
1124
  {
1125
- "entropy": 0.3304135613143444,
1126
  "epoch": 1.343032159264931,
1127
- "grad_norm": 0.000965118408203125,
1128
  "learning_rate": 6.707317073170732e-05,
1129
- "loss": 0.00012454115494620055,
1130
- "mean_token_accuracy": 1.0,
1131
- "num_tokens": 539360.0,
1132
  "step": 110
1133
  },
1134
  {
1135
- "entropy": 0.3306180518120527,
1136
  "epoch": 1.3552833078101072,
1137
- "grad_norm": 0.0011444091796875,
1138
  "learning_rate": 6.585365853658538e-05,
1139
- "loss": 0.00013282139843795449,
1140
- "mean_token_accuracy": 1.0,
1141
- "num_tokens": 543728.0,
1142
  "step": 111
1143
  },
1144
  {
1145
- "entropy": 0.3708817586302757,
1146
  "epoch": 1.3675344563552834,
1147
- "grad_norm": 0.0218505859375,
1148
  "learning_rate": 6.463414634146342e-05,
1149
- "loss": 0.004361060913652182,
1150
- "mean_token_accuracy": 0.9983282573521137,
1151
- "num_tokens": 548161.0,
1152
  "step": 112
1153
  },
1154
  {
1155
- "entropy": 0.35475645773112774,
1156
  "epoch": 1.3797856049004595,
1157
- "grad_norm": 0.01361083984375,
1158
  "learning_rate": 6.341463414634146e-05,
1159
- "loss": 0.0014049941673874855,
1160
- "mean_token_accuracy": 0.998511902987957,
1161
- "num_tokens": 553690.0,
1162
  "step": 113
1163
  },
1164
  {
1165
- "entropy": 0.3360502114519477,
1166
  "epoch": 1.3920367534456355,
1167
- "grad_norm": 0.00023746490478515625,
1168
  "learning_rate": 6.219512195121952e-05,
1169
- "loss": 8.739449549466372e-05,
1170
- "mean_token_accuracy": 1.0,
1171
- "num_tokens": 558474.0,
1172
  "step": 114
1173
  },
1174
  {
1175
- "entropy": 0.35608484130352736,
1176
  "epoch": 1.4042879019908117,
1177
- "grad_norm": 0.0009765625,
1178
  "learning_rate": 6.097560975609756e-05,
1179
- "loss": 0.00013572419993579388,
1180
- "mean_token_accuracy": 1.0,
1181
- "num_tokens": 563962.0,
1182
  "step": 115
1183
  },
1184
  {
1185
- "entropy": 0.3591584851965308,
1186
  "epoch": 1.4165390505359878,
1187
- "grad_norm": 0.00103759765625,
1188
  "learning_rate": 5.975609756097561e-05,
1189
- "loss": 0.0001251319336006418,
1190
- "mean_token_accuracy": 1.0,
1191
- "num_tokens": 568300.0,
1192
  "step": 116
1193
  },
1194
  {
1195
- "entropy": 0.32333058025687933,
1196
  "epoch": 1.4287901990811638,
1197
- "grad_norm": 0.0002803802490234375,
1198
  "learning_rate": 5.853658536585366e-05,
1199
- "loss": 8.771298598730937e-05,
1200
- "mean_token_accuracy": 1.0,
1201
- "num_tokens": 572892.0,
1202
  "step": 117
1203
  },
1204
  {
1205
- "entropy": 0.3675775118172169,
1206
  "epoch": 1.44104134762634,
1207
- "grad_norm": 0.0014495849609375,
1208
  "learning_rate": 5.731707317073171e-05,
1209
- "loss": 0.00014175268006511033,
1210
- "mean_token_accuracy": 1.0,
1211
- "num_tokens": 577889.0,
1212
  "step": 118
1213
  },
1214
  {
1215
- "entropy": 0.37294205371290445,
1216
  "epoch": 1.4532924961715161,
1217
- "grad_norm": 0.00099945068359375,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
- "loss": 8.949499897425994e-05,
1220
- "mean_token_accuracy": 1.0,
1221
- "num_tokens": 583125.0,
1222
  "step": 119
1223
  },
1224
  {
1225
- "entropy": 0.3598701385781169,
1226
  "epoch": 1.4655436447166923,
1227
- "grad_norm": 0.006011962890625,
1228
  "learning_rate": 5.487804878048781e-05,
1229
- "loss": 0.00018555490532889962,
1230
- "mean_token_accuracy": 1.0,
1231
- "num_tokens": 587853.0,
1232
  "step": 120
1233
  },
1234
  {
1235
- "entropy": 0.3222861588001251,
1236
  "epoch": 1.4777947932618682,
1237
- "grad_norm": 0.0174560546875,
1238
  "learning_rate": 5.365853658536586e-05,
1239
- "loss": 0.0032859183847904205,
1240
- "mean_token_accuracy": 0.9993932023644447,
1241
- "num_tokens": 592286.0,
1242
  "step": 121
1243
  },
1244
  {
1245
- "entropy": 0.3423085901886225,
1246
  "epoch": 1.4900459418070444,
1247
- "grad_norm": 0.000354766845703125,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
- "loss": 9.40198588068597e-05,
1250
- "mean_token_accuracy": 1.0,
1251
- "num_tokens": 597048.0,
1252
  "step": 122
1253
  },
1254
  {
1255
- "entropy": 0.3356657065451145,
1256
  "epoch": 1.5022970903522204,
1257
- "grad_norm": 0.00177764892578125,
1258
  "learning_rate": 5.121951219512195e-05,
1259
- "loss": 0.00018200451449956745,
1260
- "mean_token_accuracy": 1.0,
1261
- "num_tokens": 601352.0,
1262
  "step": 123
1263
  },
1264
  {
1265
- "entropy": 0.34760472923517227,
1266
  "epoch": 1.5145482388973965,
1267
- "grad_norm": 0.11181640625,
1268
  "learning_rate": 5e-05,
1269
- "loss": 0.0016977301565930247,
1270
- "mean_token_accuracy": 0.9993686862289906,
1271
- "num_tokens": 606645.0,
1272
  "step": 124
1273
  },
1274
  {
1275
- "entropy": 0.34292006585747004,
1276
  "epoch": 1.5267993874425727,
1277
- "grad_norm": 0.00048828125,
1278
  "learning_rate": 4.878048780487805e-05,
1279
- "loss": 0.00011081612319685519,
1280
- "mean_token_accuracy": 1.0,
1281
- "num_tokens": 612407.0,
1282
  "step": 125
1283
  },
1284
  {
1285
- "entropy": 0.3405891256406903,
1286
  "epoch": 1.5390505359877489,
1287
- "grad_norm": 0.0098876953125,
1288
  "learning_rate": 4.75609756097561e-05,
1289
- "loss": 0.0002546444011386484,
1290
- "mean_token_accuracy": 1.0,
1291
- "num_tokens": 617229.0,
1292
  "step": 126
1293
  },
1294
  {
1295
- "entropy": 0.39804220758378506,
1296
  "epoch": 1.551301684532925,
1297
- "grad_norm": 0.00177001953125,
1298
  "learning_rate": 4.634146341463415e-05,
1299
- "loss": 0.00020191296061966568,
1300
- "mean_token_accuracy": 1.0,
1301
- "num_tokens": 622355.0,
1302
  "step": 127
1303
  },
1304
  {
1305
- "entropy": 0.38183566741645336,
1306
  "epoch": 1.5635528330781012,
1307
- "grad_norm": 0.0020294189453125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
- "loss": 0.0002027210284722969,
1310
- "mean_token_accuracy": 1.0,
1311
- "num_tokens": 627269.0,
1312
  "step": 128
1313
  },
1314
  {
1315
- "entropy": 0.32283751480281353,
1316
  "epoch": 1.5758039816232772,
1317
- "grad_norm": 0.054443359375,
1318
  "learning_rate": 4.390243902439025e-05,
1319
- "loss": 0.0007472627912648022,
1320
- "mean_token_accuracy": 0.9991987161338329,
1321
- "num_tokens": 631454.0,
1322
  "step": 129
1323
  },
1324
  {
1325
- "entropy": 0.31161691434681416,
1326
  "epoch": 1.5880551301684533,
1327
- "grad_norm": 0.00174713134765625,
1328
  "learning_rate": 4.26829268292683e-05,
1329
- "loss": 0.0001439937186660245,
1330
- "mean_token_accuracy": 1.0,
1331
- "num_tokens": 636502.0,
1332
  "step": 130
1333
  },
1334
  {
1335
- "entropy": 0.3435830660164356,
1336
  "epoch": 1.6003062787136293,
1337
- "grad_norm": 0.0308837890625,
1338
  "learning_rate": 4.146341463414634e-05,
1339
- "loss": 0.004759644623845816,
1340
- "mean_token_accuracy": 0.9986401423811913,
1341
- "num_tokens": 641264.0,
1342
  "step": 131
1343
  },
1344
  {
1345
- "entropy": 0.35103026777505875,
1346
  "epoch": 1.6125574272588055,
1347
- "grad_norm": 0.014404296875,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
- "loss": 0.002162080956622958,
1350
- "mean_token_accuracy": 0.9997351691126823,
1351
- "num_tokens": 646377.0,
1352
  "step": 132
1353
  },
1354
  {
1355
- "entropy": 0.2977801924571395,
1356
  "epoch": 1.6248085758039816,
1357
- "grad_norm": 0.0003948211669921875,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
- "loss": 0.00010242296411888674,
1360
- "mean_token_accuracy": 1.0,
1361
- "num_tokens": 650767.0,
1362
  "step": 133
1363
  },
1364
  {
1365
- "entropy": 0.3230333384126425,
1366
  "epoch": 1.6370597243491578,
1367
- "grad_norm": 0.00138092041015625,
1368
  "learning_rate": 3.780487804878049e-05,
1369
- "loss": 0.00015076796989887953,
1370
- "mean_token_accuracy": 1.0,
1371
- "num_tokens": 655169.0,
1372
  "step": 134
1373
  },
1374
  {
1375
- "entropy": 0.341650640591979,
1376
  "epoch": 1.649310872894334,
1377
- "grad_norm": 0.000942230224609375,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
- "loss": 0.00014208458014763892,
1380
- "mean_token_accuracy": 1.0,
1381
- "num_tokens": 660290.0,
1382
  "step": 135
1383
  },
1384
  {
1385
- "entropy": 0.3829786740243435,
1386
  "epoch": 1.66156202143951,
1387
- "grad_norm": 0.00069427490234375,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
- "loss": 0.00014442864630836993,
1390
- "mean_token_accuracy": 1.0,
1391
- "num_tokens": 664473.0,
1392
  "step": 136
1393
  },
1394
  {
1395
- "entropy": 0.36254822462797165,
1396
  "epoch": 1.673813169984686,
1397
- "grad_norm": 0.000873565673828125,
1398
  "learning_rate": 3.414634146341464e-05,
1399
- "loss": 0.00012407865142449737,
1400
- "mean_token_accuracy": 1.0,
1401
- "num_tokens": 669356.0,
1402
  "step": 137
1403
  },
1404
  {
1405
- "entropy": 0.3526885788887739,
1406
  "epoch": 1.686064318529862,
1407
- "grad_norm": 0.01544189453125,
1408
  "learning_rate": 3.292682926829269e-05,
1409
- "loss": 0.0013645780272781849,
1410
- "mean_token_accuracy": 1.0,
1411
- "num_tokens": 674911.0,
1412
  "step": 138
1413
  },
1414
  {
1415
- "entropy": 0.3426882065832615,
1416
  "epoch": 1.6983154670750382,
1417
- "grad_norm": 0.00136566162109375,
1418
  "learning_rate": 3.170731707317073e-05,
1419
- "loss": 0.00017942595877684653,
1420
- "mean_token_accuracy": 1.0,
1421
- "num_tokens": 679692.0,
1422
  "step": 139
1423
  },
1424
  {
1425
- "entropy": 0.36831479519605637,
1426
  "epoch": 1.7105666156202144,
1427
- "grad_norm": 0.01104736328125,
1428
  "learning_rate": 3.048780487804878e-05,
1429
- "loss": 0.00024098601716104895,
1430
- "mean_token_accuracy": 1.0,
1431
- "num_tokens": 685048.0,
1432
  "step": 140
1433
  },
1434
  {
1435
- "entropy": 0.3340944442898035,
1436
  "epoch": 1.7228177641653906,
1437
- "grad_norm": 0.000865936279296875,
1438
  "learning_rate": 2.926829268292683e-05,
1439
- "loss": 0.00013921498612035066,
1440
- "mean_token_accuracy": 1.0,
1441
- "num_tokens": 689396.0,
1442
  "step": 141
1443
  },
1444
  {
1445
- "entropy": 0.34801830537617207,
1446
  "epoch": 1.7350689127105667,
1447
- "grad_norm": 0.000965118408203125,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
- "loss": 0.0001655905944062397,
1450
- "mean_token_accuracy": 1.0,
1451
- "num_tokens": 693189.0,
1452
  "step": 142
1453
  },
1454
  {
1455
- "entropy": 0.35556044429540634,
1456
  "epoch": 1.7473200612557427,
1457
- "grad_norm": 0.0019073486328125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
- "loss": 0.00019044376676902175,
1460
- "mean_token_accuracy": 1.0,
1461
- "num_tokens": 697603.0,
1462
  "step": 143
1463
  },
1464
  {
1465
- "entropy": 0.3632572125643492,
1466
  "epoch": 1.7595712098009189,
1467
- "grad_norm": 0.00106048583984375,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
- "loss": 0.00017029076116159558,
1470
- "mean_token_accuracy": 1.0,
1471
- "num_tokens": 703050.0,
1472
  "step": 144
1473
  },
1474
  {
1475
- "entropy": 0.35750158317387104,
1476
  "epoch": 1.7718223583460948,
1477
- "grad_norm": 0.0130615234375,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
- "loss": 0.0015582278138026595,
1480
- "mean_token_accuracy": 0.999015748500824,
1481
- "num_tokens": 707862.0,
1482
  "step": 145
1483
  },
1484
  {
1485
- "entropy": 0.36597106605768204,
1486
  "epoch": 1.784073506891271,
1487
- "grad_norm": 0.000583648681640625,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
- "loss": 0.00013483221118804067,
1490
- "mean_token_accuracy": 1.0,
1491
- "num_tokens": 712821.0,
1492
  "step": 146
1493
  },
1494
  {
1495
- "entropy": 0.35171396005898714,
1496
  "epoch": 1.7963246554364471,
1497
- "grad_norm": 0.00157928466796875,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
- "loss": 0.00012708510621450841,
1500
- "mean_token_accuracy": 1.0,
1501
- "num_tokens": 718453.0,
1502
  "step": 147
1503
  },
1504
  {
1505
- "entropy": 0.3596025314182043,
1506
  "epoch": 1.8085758039816233,
1507
- "grad_norm": 0.000762939453125,
1508
  "learning_rate": 2.073170731707317e-05,
1509
- "loss": 0.00011665250349324197,
1510
- "mean_token_accuracy": 1.0,
1511
- "num_tokens": 723810.0,
1512
  "step": 148
1513
  },
1514
  {
1515
- "entropy": 0.3876404408365488,
1516
  "epoch": 1.8208269525267995,
1517
- "grad_norm": 0.00170135498046875,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
- "loss": 0.00014468679728452116,
1520
- "mean_token_accuracy": 1.0,
1521
- "num_tokens": 728126.0,
1522
  "step": 149
1523
  },
1524
  {
1525
- "entropy": 0.3599753547459841,
1526
  "epoch": 1.8330781010719757,
1527
- "grad_norm": 0.00982666015625,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
- "loss": 0.0008729367982596159,
1530
- "mean_token_accuracy": 0.9996936284005642,
1531
- "num_tokens": 733917.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
- "eval_entropy": 0.3504998228256253,
1537
- "eval_loss": 0.0005272864946164191,
1538
- "eval_mean_token_accuracy": 0.9998166846192401,
1539
- "eval_num_tokens": 733917.0,
1540
- "eval_runtime": 51.0847,
1541
- "eval_samples_per_second": 1.351,
1542
- "eval_steps_per_second": 1.351,
1543
  "step": 150
1544
  }
1545
  ],
@@ -1560,7 +1560,7 @@
1560
  "attributes": {}
1561
  }
1562
  },
1563
- "total_flos": 3.3232647063527424e+16,
1564
  "train_batch_size": 1,
1565
  "trial_name": null,
1566
  "trial_params": null
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 0.3041980676352978,
14
  "epoch": 0.01225114854517611,
15
+ "grad_norm": 0.65234375,
16
  "learning_rate": 0.0002,
17
+ "loss": 0.12987954914569855,
18
+ "mean_token_accuracy": 0.9616314880549908,
19
+ "num_tokens": 6158.0,
20
  "step": 1
21
  },
22
  {
23
+ "entropy": 0.3288959041237831,
24
  "epoch": 0.02450229709035222,
25
+ "grad_norm": 0.35546875,
26
  "learning_rate": 0.00019878048780487805,
27
+ "loss": 0.07780474424362183,
28
+ "mean_token_accuracy": 0.9729398302733898,
29
+ "num_tokens": 11874.0,
30
  "step": 2
31
  },
32
  {
33
+ "entropy": 0.36142856534570456,
34
  "epoch": 0.036753445635528334,
35
+ "grad_norm": 0.359375,
36
  "learning_rate": 0.0001975609756097561,
37
+ "loss": 0.08096732199192047,
38
+ "mean_token_accuracy": 0.9701218046247959,
39
+ "num_tokens": 17155.0,
40
  "step": 3
41
  },
42
  {
43
+ "entropy": 0.3489781664684415,
44
  "epoch": 0.04900459418070444,
45
+ "grad_norm": 0.2890625,
46
  "learning_rate": 0.00019634146341463416,
47
+ "loss": 0.07328949123620987,
48
+ "mean_token_accuracy": 0.9726539552211761,
49
+ "num_tokens": 22369.0,
50
  "step": 4
51
  },
52
  {
53
+ "entropy": 0.3293187078088522,
54
  "epoch": 0.06125574272588055,
55
+ "grad_norm": 0.248046875,
56
  "learning_rate": 0.0001951219512195122,
57
+ "loss": 0.0884413868188858,
58
+ "mean_token_accuracy": 0.9707589820027351,
59
+ "num_tokens": 28747.0,
60
  "step": 5
61
  },
62
  {
63
+ "entropy": 0.339785429649055,
64
  "epoch": 0.07350689127105667,
65
+ "grad_norm": 0.5390625,
66
  "learning_rate": 0.00019390243902439025,
67
+ "loss": 0.09005022048950195,
68
+ "mean_token_accuracy": 0.9711229205131531,
69
+ "num_tokens": 34001.0,
70
  "step": 6
71
  },
72
  {
73
+ "entropy": 0.3187539605423808,
74
  "epoch": 0.08575803981623277,
75
+ "grad_norm": 0.357421875,
76
  "learning_rate": 0.0001926829268292683,
77
+ "loss": 0.08771149814128876,
78
+ "mean_token_accuracy": 0.9713485687971115,
79
+ "num_tokens": 40149.0,
80
  "step": 7
81
  },
82
  {
83
+ "entropy": 0.30580494925379753,
84
  "epoch": 0.09800918836140889,
85
+ "grad_norm": 0.294921875,
86
  "learning_rate": 0.00019146341463414633,
87
+ "loss": 0.07060129195451736,
88
+ "mean_token_accuracy": 0.9733475148677826,
89
+ "num_tokens": 45006.0,
90
  "step": 8
91
  },
92
  {
93
+ "entropy": 0.2799514262005687,
94
  "epoch": 0.11026033690658499,
95
+ "grad_norm": 0.318359375,
96
  "learning_rate": 0.0001902439024390244,
97
+ "loss": 0.08461187779903412,
98
+ "mean_token_accuracy": 0.9710647016763687,
99
+ "num_tokens": 51060.0,
100
  "step": 9
101
  },
102
  {
103
+ "entropy": 0.28654457721859217,
104
  "epoch": 0.1225114854517611,
105
+ "grad_norm": 0.341796875,
106
  "learning_rate": 0.00018902439024390244,
107
+ "loss": 0.08227542042732239,
108
+ "mean_token_accuracy": 0.9692031815648079,
109
+ "num_tokens": 56583.0,
110
  "step": 10
111
  },
112
  {
113
+ "entropy": 0.26793921180069447,
114
  "epoch": 0.13476263399693722,
115
+ "grad_norm": 0.259765625,
116
  "learning_rate": 0.0001878048780487805,
117
+ "loss": 0.07852551341056824,
118
+ "mean_token_accuracy": 0.9747273214161396,
119
+ "num_tokens": 63136.0,
120
  "step": 11
121
  },
122
  {
123
+ "entropy": 0.30183705035597086,
124
  "epoch": 0.14701378254211334,
125
+ "grad_norm": 0.279296875,
126
  "learning_rate": 0.00018658536585365856,
127
+ "loss": 0.07575420290231705,
128
+ "mean_token_accuracy": 0.9726979620754719,
129
+ "num_tokens": 69604.0,
130
  "step": 12
131
  },
132
  {
133
+ "entropy": 0.2680633468553424,
134
  "epoch": 0.15926493108728942,
135
+ "grad_norm": 0.30078125,
136
  "learning_rate": 0.0001853658536585366,
137
+ "loss": 0.07610919326543808,
138
+ "mean_token_accuracy": 0.9748654179275036,
139
+ "num_tokens": 75871.0,
140
  "step": 13
141
  },
142
  {
143
+ "entropy": 0.27854922134429216,
144
  "epoch": 0.17151607963246554,
145
+ "grad_norm": 0.41015625,
146
  "learning_rate": 0.00018414634146341464,
147
+ "loss": 0.09525731950998306,
148
+ "mean_token_accuracy": 0.9607353135943413,
149
+ "num_tokens": 81257.0,
150
  "step": 14
151
  },
152
  {
153
+ "entropy": 0.27795467525720596,
154
  "epoch": 0.18376722817764166,
155
+ "grad_norm": 0.318359375,
156
  "learning_rate": 0.0001829268292682927,
157
+ "loss": 0.06710757315158844,
158
+ "mean_token_accuracy": 0.9751962497830391,
159
+ "num_tokens": 85769.0,
160
  "step": 15
161
  },
162
  {
163
+ "entropy": 0.2635908415541053,
164
  "epoch": 0.19601837672281777,
165
+ "grad_norm": 0.439453125,
166
  "learning_rate": 0.00018170731707317075,
167
+ "loss": 0.08850108832120895,
168
+ "mean_token_accuracy": 0.9715681448578835,
169
+ "num_tokens": 91023.0,
170
  "step": 16
171
  },
172
  {
173
+ "entropy": 0.26509028300642967,
174
  "epoch": 0.2082695252679939,
175
+ "grad_norm": 0.271484375,
176
  "learning_rate": 0.0001804878048780488,
177
+ "loss": 0.07479405403137207,
178
+ "mean_token_accuracy": 0.9715253114700317,
179
+ "num_tokens": 96339.0,
180
  "step": 17
181
  },
182
  {
183
+ "entropy": 0.2699039001017809,
184
  "epoch": 0.22052067381316998,
185
+ "grad_norm": 0.353515625,
186
  "learning_rate": 0.00017926829268292684,
187
+ "loss": 0.07519614696502686,
188
+ "mean_token_accuracy": 0.9718605615198612,
189
+ "num_tokens": 101938.0,
190
  "step": 18
191
  },
192
  {
193
+ "entropy": 0.2809357335790992,
194
  "epoch": 0.2327718223583461,
195
+ "grad_norm": 0.3046875,
196
  "learning_rate": 0.00017804878048780488,
197
+ "loss": 0.08889807015657425,
198
+ "mean_token_accuracy": 0.9662206135690212,
199
+ "num_tokens": 107971.0,
200
  "step": 19
201
  },
202
  {
203
+ "entropy": 0.29283443558961153,
204
  "epoch": 0.2450229709035222,
205
+ "grad_norm": 0.3359375,
206
  "learning_rate": 0.00017682926829268295,
207
+ "loss": 0.07666820287704468,
208
+ "mean_token_accuracy": 0.9718802459537983,
209
+ "num_tokens": 113005.0,
210
  "step": 20
211
  },
212
  {
213
+ "entropy": 0.29593323450535536,
214
  "epoch": 0.2572741194486983,
215
+ "grad_norm": 0.48046875,
216
  "learning_rate": 0.000175609756097561,
217
+ "loss": 0.0903002992272377,
218
+ "mean_token_accuracy": 0.9660120271146297,
219
+ "num_tokens": 118993.0,
220
  "step": 21
221
  },
222
  {
223
+ "entropy": 0.2751638563349843,
224
  "epoch": 0.26952526799387444,
225
+ "grad_norm": 0.27734375,
226
  "learning_rate": 0.00017439024390243903,
227
+ "loss": 0.09172362089157104,
228
+ "mean_token_accuracy": 0.9718118757009506,
229
+ "num_tokens": 125931.0,
230
  "step": 22
231
  },
232
  {
233
+ "entropy": 0.32398509234189987,
234
  "epoch": 0.28177641653905056,
235
+ "grad_norm": 0.41796875,
236
  "learning_rate": 0.00017317073170731708,
237
+ "loss": 0.08742143213748932,
238
+ "mean_token_accuracy": 0.9695158265531063,
239
+ "num_tokens": 130991.0,
240
  "step": 23
241
  },
242
  {
243
+ "entropy": 0.33191137574613094,
244
  "epoch": 0.29402756508422667,
245
+ "grad_norm": 0.322265625,
246
  "learning_rate": 0.00017195121951219512,
247
+ "loss": 0.0861031711101532,
248
+ "mean_token_accuracy": 0.9639297090470791,
249
+ "num_tokens": 136879.0,
250
  "step": 24
251
  },
252
  {
253
+ "entropy": 0.3045969307422638,
254
  "epoch": 0.30627871362940273,
255
+ "grad_norm": 0.263671875,
256
  "learning_rate": 0.0001707317073170732,
257
+ "loss": 0.07694194465875626,
258
+ "mean_token_accuracy": 0.9716509021818638,
259
+ "num_tokens": 142190.0,
260
  "step": 25
261
  },
262
  {
263
+ "entropy": 0.3009780840948224,
264
  "epoch": 0.31852986217457885,
265
+ "grad_norm": 0.33203125,
266
  "learning_rate": 0.00016951219512195123,
267
+ "loss": 0.0928640067577362,
268
+ "mean_token_accuracy": 0.9631010964512825,
269
+ "num_tokens": 147659.0,
270
  "step": 26
271
  },
272
  {
273
+ "entropy": 0.26603397261351347,
274
  "epoch": 0.33078101071975496,
275
+ "grad_norm": 0.255859375,
276
  "learning_rate": 0.00016829268292682927,
277
+ "loss": 0.07426867634057999,
278
+ "mean_token_accuracy": 0.9714149422943592,
279
+ "num_tokens": 153293.0,
280
  "step": 27
281
  },
282
  {
283
+ "entropy": 0.3046340309083462,
284
  "epoch": 0.3430321592649311,
285
+ "grad_norm": 0.275390625,
286
  "learning_rate": 0.00016707317073170731,
287
+ "loss": 0.0794563814997673,
288
+ "mean_token_accuracy": 0.9729922078549862,
289
+ "num_tokens": 159233.0,
290
  "step": 28
291
  },
292
  {
293
+ "entropy": 0.3388302018865943,
294
  "epoch": 0.3552833078101072,
295
+ "grad_norm": 0.330078125,
296
  "learning_rate": 0.00016585365853658536,
297
+ "loss": 0.09019184857606888,
298
+ "mean_token_accuracy": 0.9697935245931149,
299
+ "num_tokens": 164503.0,
300
  "step": 29
301
  },
302
  {
303
+ "entropy": 0.3342750547453761,
304
  "epoch": 0.3675344563552833,
305
+ "grad_norm": 0.294921875,
306
  "learning_rate": 0.00016463414634146343,
307
+ "loss": 0.09340573102235794,
308
+ "mean_token_accuracy": 0.9605537690222263,
309
+ "num_tokens": 170523.0,
310
  "step": 30
311
  },
312
  {
313
+ "entropy": 0.30620657559484243,
314
  "epoch": 0.37978560490045943,
315
+ "grad_norm": 0.345703125,
316
  "learning_rate": 0.00016341463414634147,
317
+ "loss": 0.07664323598146439,
318
+ "mean_token_accuracy": 0.9681061618030071,
319
+ "num_tokens": 174928.0,
320
  "step": 31
321
  },
322
  {
323
+ "entropy": 0.30496103409677744,
324
  "epoch": 0.39203675344563554,
325
+ "grad_norm": 0.2490234375,
326
  "learning_rate": 0.00016219512195121954,
327
+ "loss": 0.07825497537851334,
328
+ "mean_token_accuracy": 0.9774314574897289,
329
+ "num_tokens": 181701.0,
330
  "step": 32
331
  },
332
  {
333
+ "entropy": 0.2976566730067134,
334
  "epoch": 0.40428790199081166,
335
+ "grad_norm": 0.27734375,
336
  "learning_rate": 0.00016097560975609758,
337
+ "loss": 0.09027402102947235,
338
+ "mean_token_accuracy": 0.9658169783651829,
339
+ "num_tokens": 187412.0,
340
  "step": 33
341
  },
342
  {
343
+ "entropy": 0.27866631746292114,
344
  "epoch": 0.4165390505359878,
345
+ "grad_norm": 0.251953125,
346
  "learning_rate": 0.00015975609756097562,
347
+ "loss": 0.08274199813604355,
348
+ "mean_token_accuracy": 0.9746941477060318,
349
+ "num_tokens": 192875.0,
350
  "step": 34
351
  },
352
  {
353
+ "entropy": 0.2830528961494565,
354
  "epoch": 0.42879019908116384,
355
+ "grad_norm": 0.30859375,
356
  "learning_rate": 0.00015853658536585366,
357
+ "loss": 0.09378398954868317,
358
+ "mean_token_accuracy": 0.9666311629116535,
359
+ "num_tokens": 198242.0,
360
  "step": 35
361
  },
362
  {
363
+ "entropy": 0.27817794494330883,
364
  "epoch": 0.44104134762633995,
365
+ "grad_norm": 0.37890625,
366
  "learning_rate": 0.00015731707317073173,
367
+ "loss": 0.0867948904633522,
368
+ "mean_token_accuracy": 0.9669017046689987,
369
+ "num_tokens": 203345.0,
370
  "step": 36
371
  },
372
  {
373
+ "entropy": 0.24728088174015284,
374
  "epoch": 0.45329249617151607,
375
+ "grad_norm": 0.2734375,
376
  "learning_rate": 0.00015609756097560978,
377
+ "loss": 0.0845772847533226,
378
+ "mean_token_accuracy": 0.9704948216676712,
379
+ "num_tokens": 209548.0,
380
  "step": 37
381
  },
382
  {
383
+ "entropy": 0.2514335783198476,
384
  "epoch": 0.4655436447166922,
385
+ "grad_norm": 0.296875,
386
  "learning_rate": 0.00015487804878048782,
387
+ "loss": 0.08028042316436768,
388
+ "mean_token_accuracy": 0.9699894711375237,
389
+ "num_tokens": 215104.0,
390
  "step": 38
391
  },
392
  {
393
+ "entropy": 0.2479592664167285,
394
  "epoch": 0.4777947932618683,
395
+ "grad_norm": 0.34375,
396
  "learning_rate": 0.00015365853658536586,
397
+ "loss": 0.0773642361164093,
398
+ "mean_token_accuracy": 0.9734528213739395,
399
+ "num_tokens": 220401.0,
400
  "step": 39
401
  },
402
  {
403
+ "entropy": 0.28870310448110104,
404
  "epoch": 0.4900459418070444,
405
+ "grad_norm": 0.404296875,
406
  "learning_rate": 0.0001524390243902439,
407
+ "loss": 0.086701899766922,
408
+ "mean_token_accuracy": 0.9637217558920383,
409
+ "num_tokens": 225652.0,
410
  "step": 40
411
  },
412
  {
413
+ "entropy": 0.24501706194132566,
414
  "epoch": 0.5022970903522205,
415
+ "grad_norm": 0.25390625,
416
  "learning_rate": 0.00015121951219512197,
417
+ "loss": 0.07521235942840576,
418
+ "mean_token_accuracy": 0.9753581583499908,
419
+ "num_tokens": 230583.0,
420
  "step": 41
421
  },
422
  {
423
+ "entropy": 0.28654969297349453,
424
  "epoch": 0.5145482388973966,
425
+ "grad_norm": 0.39453125,
426
  "learning_rate": 0.00015000000000000001,
427
+ "loss": 0.09706442058086395,
428
+ "mean_token_accuracy": 0.9664160050451756,
429
+ "num_tokens": 235457.0,
430
  "step": 42
431
  },
432
  {
433
+ "entropy": 0.28464407846331596,
434
  "epoch": 0.5267993874425727,
435
+ "grad_norm": 0.306640625,
436
  "learning_rate": 0.00014878048780487806,
437
+ "loss": 0.07722343504428864,
438
+ "mean_token_accuracy": 0.9730902686715126,
439
+ "num_tokens": 241513.0,
440
  "step": 43
441
  },
442
  {
443
+ "entropy": 0.30445601511746645,
444
  "epoch": 0.5390505359877489,
445
+ "grad_norm": 0.310546875,
446
  "learning_rate": 0.0001475609756097561,
447
+ "loss": 0.0771762803196907,
448
+ "mean_token_accuracy": 0.975468497723341,
449
+ "num_tokens": 246914.0,
450
  "step": 44
451
  },
452
  {
453
+ "entropy": 0.26495161652565,
454
  "epoch": 0.5513016845329249,
455
+ "grad_norm": 0.25390625,
456
  "learning_rate": 0.00014634146341463414,
457
+ "loss": 0.0772022157907486,
458
+ "mean_token_accuracy": 0.9669279642403126,
459
+ "num_tokens": 252479.0,
460
  "step": 45
461
  },
462
  {
463
+ "entropy": 0.28918597288429737,
464
  "epoch": 0.5635528330781011,
465
+ "grad_norm": 0.294921875,
466
  "learning_rate": 0.0001451219512195122,
467
+ "loss": 0.08781749755144119,
468
+ "mean_token_accuracy": 0.9674229696393013,
469
+ "num_tokens": 257947.0,
470
  "step": 46
471
  },
472
  {
473
+ "entropy": 0.2856784025207162,
474
  "epoch": 0.5758039816232772,
475
+ "grad_norm": 0.267578125,
476
  "learning_rate": 0.00014390243902439025,
477
+ "loss": 0.0642290860414505,
478
+ "mean_token_accuracy": 0.9745214283466339,
479
+ "num_tokens": 263220.0,
480
  "step": 47
481
  },
482
  {
483
+ "entropy": 0.2811201810836792,
484
  "epoch": 0.5880551301684533,
485
+ "grad_norm": 0.34765625,
486
  "learning_rate": 0.0001426829268292683,
487
+ "loss": 0.08826867491006851,
488
+ "mean_token_accuracy": 0.9674578756093979,
489
+ "num_tokens": 268072.0,
490
  "step": 48
491
  },
492
  {
493
+ "entropy": 0.3114980049431324,
494
  "epoch": 0.6003062787136294,
495
+ "grad_norm": 0.380859375,
496
  "learning_rate": 0.00014146341463414634,
497
+ "loss": 0.08005333691835403,
498
+ "mean_token_accuracy": 0.9698234163224697,
499
+ "num_tokens": 273245.0,
500
  "step": 49
501
  },
502
  {
503
+ "entropy": 0.2891535870730877,
504
  "epoch": 0.6125574272588055,
505
+ "grad_norm": 0.2734375,
506
  "learning_rate": 0.00014024390243902438,
507
+ "loss": 0.06931450217962265,
508
+ "mean_token_accuracy": 0.9755596853792667,
509
+ "num_tokens": 278455.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
+ "eval_entropy": 0.2854771510414455,
515
+ "eval_loss": 0.07233226299285889,
516
+ "eval_mean_token_accuracy": 0.9710306654805723,
517
+ "eval_num_tokens": 278455.0,
518
+ "eval_runtime": 57.5345,
519
+ "eval_samples_per_second": 1.199,
520
+ "eval_steps_per_second": 1.199,
521
  "step": 50
522
  },
523
  {
524
+ "entropy": 0.2805565893650055,
525
  "epoch": 0.6248085758039816,
526
+ "grad_norm": 0.283203125,
527
  "learning_rate": 0.00013902439024390245,
528
+ "loss": 0.07015535980463028,
529
+ "mean_token_accuracy": 0.9736857265233994,
530
+ "num_tokens": 283905.0,
531
  "step": 51
532
  },
533
  {
534
+ "entropy": 0.2953841704875231,
535
  "epoch": 0.6370597243491577,
536
+ "grad_norm": 0.326171875,
537
  "learning_rate": 0.0001378048780487805,
538
+ "loss": 0.08345313370227814,
539
+ "mean_token_accuracy": 0.9668012037873268,
540
+ "num_tokens": 289621.0,
541
  "step": 52
542
  },
543
  {
544
+ "entropy": 0.2993172137066722,
545
  "epoch": 0.6493108728943339,
546
+ "grad_norm": 0.330078125,
547
  "learning_rate": 0.00013658536585365856,
548
+ "loss": 0.06988305598497391,
549
+ "mean_token_accuracy": 0.9744048714637756,
550
+ "num_tokens": 295589.0,
551
  "step": 53
552
  },
553
  {
554
+ "entropy": 0.24585585854947567,
555
  "epoch": 0.6615620214395099,
556
+ "grad_norm": 0.326171875,
557
  "learning_rate": 0.0001353658536585366,
558
+ "loss": 0.08018705993890762,
559
+ "mean_token_accuracy": 0.97354631498456,
560
+ "num_tokens": 301123.0,
561
  "step": 54
562
  },
563
  {
564
+ "entropy": 0.2925192918628454,
565
  "epoch": 0.6738131699846861,
566
+ "grad_norm": 0.310546875,
567
  "learning_rate": 0.00013414634146341464,
568
+ "loss": 0.08674345165491104,
569
+ "mean_token_accuracy": 0.9711452201008797,
570
+ "num_tokens": 307068.0,
571
  "step": 55
572
  },
573
  {
574
+ "entropy": 0.25901074800640345,
575
  "epoch": 0.6860643185298622,
576
+ "grad_norm": 0.28515625,
577
  "learning_rate": 0.0001329268292682927,
578
+ "loss": 0.08070839941501617,
579
+ "mean_token_accuracy": 0.9729307927191257,
580
+ "num_tokens": 312510.0,
581
  "step": 56
582
  },
583
  {
584
+ "entropy": 0.29548987187445164,
585
  "epoch": 0.6983154670750383,
586
+ "grad_norm": 0.34375,
587
  "learning_rate": 0.00013170731707317076,
588
+ "loss": 0.08920362591743469,
589
+ "mean_token_accuracy": 0.9669180549681187,
590
+ "num_tokens": 318484.0,
591
  "step": 57
592
  },
593
  {
594
+ "entropy": 0.2568454071879387,
595
  "epoch": 0.7105666156202144,
596
+ "grad_norm": 0.2734375,
597
  "learning_rate": 0.0001304878048780488,
598
+ "loss": 0.07155663520097733,
599
+ "mean_token_accuracy": 0.9701582230627537,
600
+ "num_tokens": 324813.0,
601
  "step": 58
602
  },
603
  {
604
+ "entropy": 0.2808335982263088,
605
  "epoch": 0.7228177641653905,
606
+ "grad_norm": 0.283203125,
607
  "learning_rate": 0.00012926829268292684,
608
+ "loss": 0.07705684751272202,
609
+ "mean_token_accuracy": 0.966577123850584,
610
+ "num_tokens": 330011.0,
611
  "step": 59
612
  },
613
  {
614
+ "entropy": 0.2699971180409193,
615
  "epoch": 0.7350689127105666,
616
+ "grad_norm": 0.39453125,
617
  "learning_rate": 0.00012804878048780488,
618
+ "loss": 0.10086975991725922,
619
+ "mean_token_accuracy": 0.9665980078279972,
620
+ "num_tokens": 335902.0,
621
  "step": 60
622
  },
623
  {
624
+ "entropy": 0.2485162764787674,
625
  "epoch": 0.7473200612557427,
626
+ "grad_norm": 0.310546875,
627
  "learning_rate": 0.00012682926829268293,
628
+ "loss": 0.08342916518449783,
629
+ "mean_token_accuracy": 0.9685300663113594,
630
+ "num_tokens": 342624.0,
631
  "step": 61
632
  },
633
  {
634
+ "entropy": 0.3012225143611431,
635
  "epoch": 0.7595712098009189,
636
+ "grad_norm": 0.2490234375,
637
  "learning_rate": 0.000125609756097561,
638
+ "loss": 0.06536269932985306,
639
+ "mean_token_accuracy": 0.9756054095923901,
640
+ "num_tokens": 348764.0,
641
  "step": 62
642
  },
643
  {
644
+ "entropy": 0.26388413086533546,
645
  "epoch": 0.7718223583460949,
646
+ "grad_norm": 0.384765625,
647
  "learning_rate": 0.00012439024390243904,
648
+ "loss": 0.06917200982570648,
649
+ "mean_token_accuracy": 0.9760353714227676,
650
+ "num_tokens": 353717.0,
651
  "step": 63
652
  },
653
  {
654
+ "entropy": 0.26560324616730213,
655
  "epoch": 0.7840735068912711,
656
+ "grad_norm": 0.341796875,
657
  "learning_rate": 0.00012317073170731708,
658
+ "loss": 0.07885865867137909,
659
+ "mean_token_accuracy": 0.9693707525730133,
660
+ "num_tokens": 358700.0,
661
  "step": 64
662
  },
663
  {
664
+ "entropy": 0.2870019916445017,
665
  "epoch": 0.7963246554364471,
666
+ "grad_norm": 0.296875,
667
  "learning_rate": 0.00012195121951219512,
668
+ "loss": 0.07569920271635056,
669
+ "mean_token_accuracy": 0.9747120216488838,
670
+ "num_tokens": 364367.0,
671
  "step": 65
672
  },
673
  {
674
+ "entropy": 0.30871331319212914,
675
  "epoch": 0.8085758039816233,
676
+ "grad_norm": 0.39453125,
677
  "learning_rate": 0.00012073170731707318,
678
+ "loss": 0.07961063086986542,
679
+ "mean_token_accuracy": 0.9741152077913284,
680
+ "num_tokens": 370191.0,
681
  "step": 66
682
  },
683
  {
684
+ "entropy": 0.25089073460549116,
685
  "epoch": 0.8208269525267994,
686
+ "grad_norm": 0.275390625,
687
  "learning_rate": 0.00011951219512195122,
688
+ "loss": 0.06939976662397385,
689
+ "mean_token_accuracy": 0.9737464673817158,
690
+ "num_tokens": 376432.0,
691
  "step": 67
692
  },
693
  {
694
+ "entropy": 0.2964933030307293,
695
  "epoch": 0.8330781010719756,
696
+ "grad_norm": 0.30859375,
697
  "learning_rate": 0.00011829268292682926,
698
+ "loss": 0.06059417501091957,
699
+ "mean_token_accuracy": 0.9753321446478367,
700
+ "num_tokens": 381184.0,
701
  "step": 68
702
  },
703
  {
704
+ "entropy": 0.29217866342514753,
705
  "epoch": 0.8453292496171516,
706
+ "grad_norm": 0.3046875,
707
  "learning_rate": 0.00011707317073170732,
708
+ "loss": 0.0808170959353447,
709
+ "mean_token_accuracy": 0.974962618201971,
710
+ "num_tokens": 385476.0,
711
  "step": 69
712
  },
713
  {
714
+ "entropy": 0.32675024215132,
715
  "epoch": 0.8575803981623277,
716
+ "grad_norm": 0.5078125,
717
  "learning_rate": 0.00011585365853658536,
718
+ "loss": 0.10380380600690842,
719
+ "mean_token_accuracy": 0.9621776640415192,
720
+ "num_tokens": 390246.0,
721
  "step": 70
722
  },
723
  {
724
+ "entropy": 0.3259228030219674,
725
  "epoch": 0.8698315467075038,
726
+ "grad_norm": 0.45703125,
727
  "learning_rate": 0.00011463414634146342,
728
+ "loss": 0.07719732075929642,
729
+ "mean_token_accuracy": 0.9656922854483128,
730
+ "num_tokens": 395726.0,
731
  "step": 71
732
  },
733
  {
734
+ "entropy": 0.2817502664402127,
735
  "epoch": 0.8820826952526799,
736
+ "grad_norm": 0.400390625,
737
  "learning_rate": 0.00011341463414634146,
738
+ "loss": 0.10946179181337357,
739
+ "mean_token_accuracy": 0.9588761143386364,
740
+ "num_tokens": 400824.0,
741
  "step": 72
742
  },
743
  {
744
+ "entropy": 0.28915605414658785,
745
  "epoch": 0.8943338437978561,
746
+ "grad_norm": 0.341796875,
747
  "learning_rate": 0.00011219512195121953,
748
+ "loss": 0.07160484790802002,
749
+ "mean_token_accuracy": 0.9732935056090355,
750
+ "num_tokens": 407602.0,
751
  "step": 73
752
  },
753
  {
754
+ "entropy": 0.285653960891068,
755
  "epoch": 0.9065849923430321,
756
+ "grad_norm": 0.283203125,
757
  "learning_rate": 0.00011097560975609757,
758
+ "loss": 0.08304032683372498,
759
+ "mean_token_accuracy": 0.9711127728223801,
760
+ "num_tokens": 413815.0,
761
  "step": 74
762
  },
763
  {
764
+ "entropy": 0.30409657675772905,
765
  "epoch": 0.9188361408882083,
766
+ "grad_norm": 0.322265625,
767
  "learning_rate": 0.00010975609756097563,
768
+ "loss": 0.06820105761289597,
769
+ "mean_token_accuracy": 0.9733226448297501,
770
+ "num_tokens": 418971.0,
771
  "step": 75
772
  },
773
  {
774
+ "entropy": 0.3267542561516166,
775
  "epoch": 0.9310872894333844,
776
+ "grad_norm": 0.291015625,
777
  "learning_rate": 0.00010853658536585367,
778
+ "loss": 0.07155608385801315,
779
+ "mean_token_accuracy": 0.9717761054635048,
780
+ "num_tokens": 423777.0,
781
  "step": 76
782
  },
783
  {
784
+ "entropy": 0.26458421628922224,
785
  "epoch": 0.9433384379785605,
786
+ "grad_norm": 0.271484375,
787
  "learning_rate": 0.00010731707317073172,
788
+ "loss": 0.07516152411699295,
789
+ "mean_token_accuracy": 0.9726166129112244,
790
+ "num_tokens": 430300.0,
791
  "step": 77
792
  },
793
  {
794
+ "entropy": 0.313277630135417,
795
  "epoch": 0.9555895865237366,
796
+ "grad_norm": 0.33984375,
797
  "learning_rate": 0.00010609756097560977,
798
+ "loss": 0.07090278714895248,
799
+ "mean_token_accuracy": 0.9733094871044159,
800
+ "num_tokens": 435531.0,
801
  "step": 78
802
  },
803
  {
804
+ "entropy": 0.29259978514164686,
805
  "epoch": 0.9678407350689127,
806
+ "grad_norm": 0.369140625,
807
  "learning_rate": 0.00010487804878048781,
808
+ "loss": 0.07661356031894684,
809
+ "mean_token_accuracy": 0.9706463180482388,
810
+ "num_tokens": 440264.0,
811
  "step": 79
812
  },
813
  {
814
+ "entropy": 0.2779441485181451,
815
  "epoch": 0.9800918836140888,
816
+ "grad_norm": 0.3203125,
817
  "learning_rate": 0.00010365853658536586,
818
+ "loss": 0.07106667011976242,
819
+ "mean_token_accuracy": 0.9709027595818043,
820
+ "num_tokens": 446616.0,
821
  "step": 80
822
  },
823
  {
824
+ "entropy": 0.2993398727849126,
825
  "epoch": 0.9923430321592649,
826
+ "grad_norm": 0.27734375,
827
  "learning_rate": 0.0001024390243902439,
828
+ "loss": 0.08056843280792236,
829
+ "mean_token_accuracy": 0.97335534542799,
830
+ "num_tokens": 451901.0,
831
  "step": 81
832
  },
833
  {
834
+ "entropy": 0.2985739395022392,
835
  "epoch": 1.0,
836
+ "grad_norm": 0.3515625,
837
  "learning_rate": 0.00010121951219512196,
838
+ "loss": 0.05783425644040108,
839
+ "mean_token_accuracy": 0.9816944003105164,
840
+ "num_tokens": 455183.0,
841
  "step": 82
842
  },
843
  {
844
+ "entropy": 0.25938804540783167,
845
  "epoch": 1.0122511485451762,
846
+ "grad_norm": 0.2099609375,
847
  "learning_rate": 0.0001,
848
+ "loss": 0.04025664180517197,
849
+ "mean_token_accuracy": 0.9888772070407867,
850
+ "num_tokens": 460758.0,
851
  "step": 83
852
  },
853
  {
854
+ "entropy": 0.2424000184983015,
855
  "epoch": 1.0245022970903521,
856
+ "grad_norm": 0.31640625,
857
  "learning_rate": 9.878048780487805e-05,
858
+ "loss": 0.041272781789302826,
859
+ "mean_token_accuracy": 0.9850874915719032,
860
+ "num_tokens": 466506.0,
861
  "step": 84
862
  },
863
  {
864
+ "entropy": 0.28685680869966745,
865
  "epoch": 1.0367534456355283,
866
+ "grad_norm": 0.220703125,
867
  "learning_rate": 9.75609756097561e-05,
868
+ "loss": 0.04828771948814392,
869
+ "mean_token_accuracy": 0.987192340195179,
870
+ "num_tokens": 473114.0,
871
  "step": 85
872
  },
873
  {
874
+ "entropy": 0.2173819374293089,
875
  "epoch": 1.0490045941807045,
876
+ "grad_norm": 0.212890625,
877
  "learning_rate": 9.634146341463415e-05,
878
+ "loss": 0.02984496019780636,
879
+ "mean_token_accuracy": 0.9893322959542274,
880
+ "num_tokens": 478272.0,
881
  "step": 86
882
  },
883
  {
884
+ "entropy": 0.24273153394460678,
885
  "epoch": 1.0612557427258806,
886
+ "grad_norm": 0.193359375,
887
  "learning_rate": 9.51219512195122e-05,
888
+ "loss": 0.05374791473150253,
889
+ "mean_token_accuracy": 0.9889552295207977,
890
+ "num_tokens": 484056.0,
891
  "step": 87
892
  },
893
  {
894
+ "entropy": 0.26517119724303484,
895
  "epoch": 1.0735068912710566,
896
+ "grad_norm": 0.2265625,
897
  "learning_rate": 9.390243902439024e-05,
898
+ "loss": 0.03632190451025963,
899
+ "mean_token_accuracy": 0.9914858341217041,
900
+ "num_tokens": 489018.0,
901
  "step": 88
902
  },
903
  {
904
+ "entropy": 0.24422209709882736,
905
  "epoch": 1.0857580398162328,
906
+ "grad_norm": 0.2041015625,
907
  "learning_rate": 9.26829268292683e-05,
908
+ "loss": 0.03214319050312042,
909
+ "mean_token_accuracy": 0.9906447269022465,
910
+ "num_tokens": 493923.0,
911
  "step": 89
912
  },
913
  {
914
+ "entropy": 0.221808229573071,
915
  "epoch": 1.098009188361409,
916
+ "grad_norm": 0.234375,
917
  "learning_rate": 9.146341463414635e-05,
918
+ "loss": 0.038689155131578445,
919
+ "mean_token_accuracy": 0.9870963655412197,
920
+ "num_tokens": 499718.0,
921
  "step": 90
922
  },
923
  {
924
+ "entropy": 0.25665554590523243,
925
  "epoch": 1.110260336906585,
926
+ "grad_norm": 0.26171875,
927
  "learning_rate": 9.02439024390244e-05,
928
+ "loss": 0.03570985421538353,
929
+ "mean_token_accuracy": 0.9886105321347713,
930
+ "num_tokens": 504426.0,
931
  "step": 91
932
  },
933
  {
934
+ "entropy": 0.23205960728228092,
935
  "epoch": 1.122511485451761,
936
+ "grad_norm": 0.1826171875,
937
  "learning_rate": 8.902439024390244e-05,
938
+ "loss": 0.024796659126877785,
939
+ "mean_token_accuracy": 0.9915902987122536,
940
+ "num_tokens": 509188.0,
941
  "step": 92
942
  },
943
  {
944
+ "entropy": 0.2075599799863994,
945
  "epoch": 1.1347626339969372,
946
+ "grad_norm": 0.2392578125,
947
  "learning_rate": 8.78048780487805e-05,
948
+ "loss": 0.027952998876571655,
949
+ "mean_token_accuracy": 0.9888908788561821,
950
+ "num_tokens": 514373.0,
951
  "step": 93
952
  },
953
  {
954
+ "entropy": 0.20593736693263054,
955
  "epoch": 1.1470137825421134,
956
+ "grad_norm": 0.294921875,
957
  "learning_rate": 8.658536585365854e-05,
958
+ "loss": 0.03931131958961487,
959
+ "mean_token_accuracy": 0.9884286597371101,
960
+ "num_tokens": 519285.0,
961
  "step": 94
962
  },
963
  {
964
+ "entropy": 0.18408051086589694,
965
  "epoch": 1.1592649310872893,
966
+ "grad_norm": 0.37890625,
967
  "learning_rate": 8.53658536585366e-05,
968
+ "loss": 0.029383456334471703,
969
+ "mean_token_accuracy": 0.9906960390508175,
970
+ "num_tokens": 524819.0,
971
  "step": 95
972
  },
973
  {
974
+ "entropy": 0.21047947462648153,
975
  "epoch": 1.1715160796324655,
976
+ "grad_norm": 0.328125,
977
  "learning_rate": 8.414634146341464e-05,
978
+ "loss": 0.03170277550816536,
979
+ "mean_token_accuracy": 0.9901499785482883,
980
+ "num_tokens": 529540.0,
981
  "step": 96
982
  },
983
  {
984
+ "entropy": 0.20975746307522058,
985
  "epoch": 1.1837672281776417,
986
+ "grad_norm": 0.365234375,
987
  "learning_rate": 8.292682926829268e-05,
988
+ "loss": 0.030736297369003296,
989
+ "mean_token_accuracy": 0.988069623708725,
990
+ "num_tokens": 534290.0,
991
  "step": 97
992
  },
993
  {
994
+ "entropy": 0.2099036993458867,
995
  "epoch": 1.1960183767228179,
996
+ "grad_norm": 0.298828125,
997
  "learning_rate": 8.170731707317073e-05,
998
+ "loss": 0.04264690354466438,
999
+ "mean_token_accuracy": 0.9882702529430389,
1000
+ "num_tokens": 540093.0,
1001
  "step": 98
1002
  },
1003
  {
1004
+ "entropy": 0.20470174960792065,
1005
  "epoch": 1.2082695252679938,
1006
+ "grad_norm": 0.283203125,
1007
  "learning_rate": 8.048780487804879e-05,
1008
+ "loss": 0.04001612961292267,
1009
+ "mean_token_accuracy": 0.9892623983323574,
1010
+ "num_tokens": 546621.0,
1011
  "step": 99
1012
  },
1013
  {
1014
+ "entropy": 0.19278133288025856,
1015
  "epoch": 1.22052067381317,
1016
+ "grad_norm": 0.6171875,
1017
  "learning_rate": 7.926829268292683e-05,
1018
+ "loss": 0.04199153557419777,
1019
+ "mean_token_accuracy": 0.987313587218523,
1020
+ "num_tokens": 552086.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
+ "eval_entropy": 0.20443568367888962,
1026
+ "eval_loss": 0.07565851509571075,
1027
+ "eval_mean_token_accuracy": 0.9715519590654235,
1028
+ "eval_num_tokens": 552086.0,
1029
+ "eval_runtime": 57.4717,
1030
+ "eval_samples_per_second": 1.201,
1031
+ "eval_steps_per_second": 1.201,
1032
  "step": 100
1033
  },
1034
  {
1035
+ "entropy": 0.20453590713441372,
1036
  "epoch": 1.2327718223583461,
1037
+ "grad_norm": 0.478515625,
1038
  "learning_rate": 7.804878048780489e-05,
1039
+ "loss": 0.05255145579576492,
1040
+ "mean_token_accuracy": 0.9822860956192017,
1041
+ "num_tokens": 558025.0,
1042
  "step": 101
1043
  },
1044
  {
1045
+ "entropy": 0.18952476000413299,
1046
  "epoch": 1.245022970903522,
1047
+ "grad_norm": 0.271484375,
1048
  "learning_rate": 7.682926829268293e-05,
1049
+ "loss": 0.027246126905083656,
1050
+ "mean_token_accuracy": 0.9902120418846607,
1051
+ "num_tokens": 564016.0,
1052
  "step": 102
1053
  },
1054
  {
1055
+ "entropy": 0.2112936107441783,
1056
  "epoch": 1.2572741194486983,
1057
+ "grad_norm": 0.279296875,
1058
  "learning_rate": 7.560975609756099e-05,
1059
+ "loss": 0.03221844881772995,
1060
+ "mean_token_accuracy": 0.9891848936676979,
1061
+ "num_tokens": 569749.0,
1062
  "step": 103
1063
  },
1064
  {
1065
+ "entropy": 0.21675583999603987,
1066
  "epoch": 1.2695252679938744,
1067
+ "grad_norm": 0.322265625,
1068
  "learning_rate": 7.439024390243903e-05,
1069
+ "loss": 0.0432349294424057,
1070
+ "mean_token_accuracy": 0.9877067022025585,
1071
+ "num_tokens": 575108.0,
1072
  "step": 104
1073
  },
1074
  {
1075
+ "entropy": 0.20377221517264843,
1076
  "epoch": 1.2817764165390506,
1077
+ "grad_norm": 0.298828125,
1078
  "learning_rate": 7.317073170731707e-05,
1079
+ "loss": 0.029307818040251732,
1080
+ "mean_token_accuracy": 0.9907472543418407,
1081
+ "num_tokens": 580685.0,
1082
  "step": 105
1083
  },
1084
  {
1085
+ "entropy": 0.2235541269183159,
1086
  "epoch": 1.2940275650842268,
1087
+ "grad_norm": 0.29296875,
1088
  "learning_rate": 7.195121951219513e-05,
1089
+ "loss": 0.032167330384254456,
1090
+ "mean_token_accuracy": 0.9906046241521835,
1091
+ "num_tokens": 586319.0,
1092
  "step": 106
1093
  },
1094
  {
1095
+ "entropy": 0.21635576337575912,
1096
  "epoch": 1.3062787136294027,
1097
+ "grad_norm": 0.15234375,
1098
  "learning_rate": 7.073170731707317e-05,
1099
+ "loss": 0.024916525930166245,
1100
+ "mean_token_accuracy": 0.9925986491143703,
1101
+ "num_tokens": 591791.0,
1102
  "step": 107
1103
  },
1104
  {
1105
+ "entropy": 0.23305469285696745,
1106
  "epoch": 1.318529862174579,
1107
+ "grad_norm": 0.384765625,
1108
  "learning_rate": 6.951219512195122e-05,
1109
+ "loss": 0.043021317571401596,
1110
+ "mean_token_accuracy": 0.9842446520924568,
1111
+ "num_tokens": 597135.0,
1112
  "step": 108
1113
  },
1114
  {
1115
+ "entropy": 0.2322743725962937,
1116
  "epoch": 1.3307810107197549,
1117
+ "grad_norm": 0.2392578125,
1118
  "learning_rate": 6.829268292682928e-05,
1119
+ "loss": 0.03324022889137268,
1120
+ "mean_token_accuracy": 0.9893980734050274,
1121
+ "num_tokens": 602707.0,
1122
  "step": 109
1123
  },
1124
  {
1125
+ "entropy": 0.21593647170811892,
1126
  "epoch": 1.343032159264931,
1127
+ "grad_norm": 0.244140625,
1128
  "learning_rate": 6.707317073170732e-05,
1129
+ "loss": 0.051236435770988464,
1130
+ "mean_token_accuracy": 0.9845945909619331,
1131
+ "num_tokens": 609274.0,
1132
  "step": 110
1133
  },
1134
  {
1135
+ "entropy": 0.21711204759776592,
1136
  "epoch": 1.3552833078101072,
1137
+ "grad_norm": 0.298828125,
1138
  "learning_rate": 6.585365853658538e-05,
1139
+ "loss": 0.03724904730916023,
1140
+ "mean_token_accuracy": 0.9879111871123314,
1141
+ "num_tokens": 614639.0,
1142
  "step": 111
1143
  },
1144
  {
1145
+ "entropy": 0.23972039762884378,
1146
  "epoch": 1.3675344563552834,
1147
+ "grad_norm": 0.1953125,
1148
  "learning_rate": 6.463414634146342e-05,
1149
+ "loss": 0.041368596255779266,
1150
+ "mean_token_accuracy": 0.9879214912652969,
1151
+ "num_tokens": 619839.0,
1152
  "step": 112
1153
  },
1154
  {
1155
+ "entropy": 0.2266655545681715,
1156
  "epoch": 1.3797856049004595,
1157
+ "grad_norm": 0.2734375,
1158
  "learning_rate": 6.341463414634146e-05,
1159
+ "loss": 0.04395541176199913,
1160
+ "mean_token_accuracy": 0.9881241992115974,
1161
+ "num_tokens": 626252.0,
1162
  "step": 113
1163
  },
1164
  {
1165
+ "entropy": 0.23816584516316652,
1166
  "epoch": 1.3920367534456355,
1167
+ "grad_norm": 0.205078125,
1168
  "learning_rate": 6.219512195121952e-05,
1169
+ "loss": 0.03704490512609482,
1170
+ "mean_token_accuracy": 0.9873962365090847,
1171
+ "num_tokens": 631466.0,
1172
  "step": 114
1173
  },
1174
  {
1175
+ "entropy": 0.23965218709781766,
1176
  "epoch": 1.4042879019908117,
1177
+ "grad_norm": 0.1982421875,
1178
  "learning_rate": 6.097560975609756e-05,
1179
+ "loss": 0.029025251045823097,
1180
+ "mean_token_accuracy": 0.9914826788008213,
1181
+ "num_tokens": 637746.0,
1182
  "step": 115
1183
  },
1184
  {
1185
+ "entropy": 0.25465985108166933,
1186
  "epoch": 1.4165390505359878,
1187
+ "grad_norm": 0.2490234375,
1188
  "learning_rate": 5.975609756097561e-05,
1189
+ "loss": 0.040289562195539474,
1190
+ "mean_token_accuracy": 0.9853745028376579,
1191
+ "num_tokens": 642638.0,
1192
  "step": 116
1193
  },
1194
  {
1195
+ "entropy": 0.21541998535394669,
1196
  "epoch": 1.4287901990811638,
1197
+ "grad_norm": 0.263671875,
1198
  "learning_rate": 5.853658536585366e-05,
1199
+ "loss": 0.036677829921245575,
1200
+ "mean_token_accuracy": 0.9888547360897064,
1201
+ "num_tokens": 648219.0,
1202
  "step": 117
1203
  },
1204
  {
1205
+ "entropy": 0.23220631666481495,
1206
  "epoch": 1.44104134762634,
1207
+ "grad_norm": 0.197265625,
1208
  "learning_rate": 5.731707317073171e-05,
1209
+ "loss": 0.041446540504693985,
1210
+ "mean_token_accuracy": 0.9872667863965034,
1211
+ "num_tokens": 654329.0,
1212
  "step": 118
1213
  },
1214
  {
1215
+ "entropy": 0.23642429150640965,
1216
  "epoch": 1.4532924961715161,
1217
+ "grad_norm": 0.1904296875,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
+ "loss": 0.034801121801137924,
1220
+ "mean_token_accuracy": 0.9886356219649315,
1221
+ "num_tokens": 660173.0,
1222
  "step": 119
1223
  },
1224
  {
1225
+ "entropy": 0.22930414089933038,
1226
  "epoch": 1.4655436447166923,
1227
+ "grad_norm": 0.18359375,
1228
  "learning_rate": 5.487804878048781e-05,
1229
+ "loss": 0.03842389956116676,
1230
+ "mean_token_accuracy": 0.9868512041866779,
1231
+ "num_tokens": 665808.0,
1232
  "step": 120
1233
  },
1234
  {
1235
+ "entropy": 0.20680655166506767,
1236
  "epoch": 1.4777947932618682,
1237
+ "grad_norm": 0.25,
1238
  "learning_rate": 5.365853658536586e-05,
1239
+ "loss": 0.03596107289195061,
1240
+ "mean_token_accuracy": 0.98941445723176,
1241
+ "num_tokens": 671196.0,
1242
  "step": 121
1243
  },
1244
  {
1245
+ "entropy": 0.24193469621241093,
1246
  "epoch": 1.4900459418070444,
1247
+ "grad_norm": 0.298828125,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
+ "loss": 0.039926398545503616,
1250
+ "mean_token_accuracy": 0.9893642216920853,
1251
+ "num_tokens": 676532.0,
1252
  "step": 122
1253
  },
1254
  {
1255
+ "entropy": 0.22021344117820263,
1256
  "epoch": 1.5022970903522204,
1257
+ "grad_norm": 0.16796875,
1258
  "learning_rate": 5.121951219512195e-05,
1259
+ "loss": 0.028786854818463326,
1260
+ "mean_token_accuracy": 0.9905072785913944,
1261
+ "num_tokens": 681580.0,
1262
  "step": 123
1263
  },
1264
  {
1265
+ "entropy": 0.2207544706761837,
1266
  "epoch": 1.5145482388973965,
1267
+ "grad_norm": 0.1748046875,
1268
  "learning_rate": 5e-05,
1269
+ "loss": 0.02596566267311573,
1270
+ "mean_token_accuracy": 0.9926727451384068,
1271
+ "num_tokens": 687310.0,
1272
  "step": 124
1273
  },
1274
  {
1275
+ "entropy": 0.21766360383480787,
1276
  "epoch": 1.5267993874425727,
1277
+ "grad_norm": 0.2177734375,
1278
  "learning_rate": 4.878048780487805e-05,
1279
+ "loss": 0.0336245559155941,
1280
+ "mean_token_accuracy": 0.9895498640835285,
1281
+ "num_tokens": 693584.0,
1282
  "step": 125
1283
  },
1284
  {
1285
+ "entropy": 0.20466620940715075,
1286
  "epoch": 1.5390505359877489,
1287
+ "grad_norm": 0.1669921875,
1288
  "learning_rate": 4.75609756097561e-05,
1289
+ "loss": 0.025749148800969124,
1290
+ "mean_token_accuracy": 0.9912248440086842,
1291
+ "num_tokens": 699395.0,
1292
  "step": 126
1293
  },
1294
  {
1295
+ "entropy": 0.2437387891113758,
1296
  "epoch": 1.551301684532925,
1297
+ "grad_norm": 0.193359375,
1298
  "learning_rate": 4.634146341463415e-05,
1299
+ "loss": 0.03270214796066284,
1300
+ "mean_token_accuracy": 0.9933484047651291,
1301
+ "num_tokens": 705055.0,
1302
  "step": 127
1303
  },
1304
  {
1305
+ "entropy": 0.2368676969781518,
1306
  "epoch": 1.5635528330781012,
1307
+ "grad_norm": 0.2236328125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
+ "loss": 0.03528536483645439,
1310
+ "mean_token_accuracy": 0.9893530681729317,
1311
+ "num_tokens": 710632.0,
1312
  "step": 128
1313
  },
1314
  {
1315
+ "entropy": 0.19431999698281288,
1316
  "epoch": 1.5758039816232772,
1317
+ "grad_norm": 0.43359375,
1318
  "learning_rate": 4.390243902439025e-05,
1319
+ "loss": 0.047928016632795334,
1320
+ "mean_token_accuracy": 0.9849436171352863,
1321
+ "num_tokens": 715837.0,
1322
  "step": 129
1323
  },
1324
  {
1325
+ "entropy": 0.18662631046026945,
1326
  "epoch": 1.5880551301684533,
1327
+ "grad_norm": 0.240234375,
1328
  "learning_rate": 4.26829268292683e-05,
1329
+ "loss": 0.04247990995645523,
1330
+ "mean_token_accuracy": 0.9910683631896973,
1331
+ "num_tokens": 721933.0,
1332
  "step": 130
1333
  },
1334
  {
1335
+ "entropy": 0.2159680761396885,
1336
  "epoch": 1.6003062787136293,
1337
+ "grad_norm": 0.283203125,
1338
  "learning_rate": 4.146341463414634e-05,
1339
+ "loss": 0.0323370024561882,
1340
+ "mean_token_accuracy": 0.9914376214146614,
1341
+ "num_tokens": 727148.0,
1342
  "step": 131
1343
  },
1344
  {
1345
+ "entropy": 0.19935058476403356,
1346
  "epoch": 1.6125574272588055,
1347
+ "grad_norm": 0.298828125,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
+ "loss": 0.04690408334136009,
1350
+ "mean_token_accuracy": 0.9893516302108765,
1351
+ "num_tokens": 733471.0,
1352
  "step": 132
1353
  },
1354
  {
1355
+ "entropy": 0.20206499379128218,
1356
  "epoch": 1.6248085758039816,
1357
+ "grad_norm": 0.271484375,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
+ "loss": 0.03374583646655083,
1360
+ "mean_token_accuracy": 0.9919851459562778,
1361
+ "num_tokens": 738692.0,
1362
  "step": 133
1363
  },
1364
  {
1365
+ "entropy": 0.2015545079484582,
1366
  "epoch": 1.6370597243491578,
1367
+ "grad_norm": 0.208984375,
1368
  "learning_rate": 3.780487804878049e-05,
1369
+ "loss": 0.030488884076476097,
1370
+ "mean_token_accuracy": 0.9903693087399006,
1371
+ "num_tokens": 743858.0,
1372
  "step": 134
1373
  },
1374
  {
1375
+ "entropy": 0.19723766017705202,
1376
  "epoch": 1.649310872894334,
1377
+ "grad_norm": 0.2158203125,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
+ "loss": 0.025454077869653702,
1380
+ "mean_token_accuracy": 0.9917643442749977,
1381
+ "num_tokens": 749750.0,
1382
  "step": 135
1383
  },
1384
  {
1385
+ "entropy": 0.24156489223241806,
1386
  "epoch": 1.66156202143951,
1387
+ "grad_norm": 0.248046875,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
+ "loss": 0.03519139811396599,
1390
+ "mean_token_accuracy": 0.9872167631983757,
1391
+ "num_tokens": 754490.0,
1392
  "step": 136
1393
  },
1394
  {
1395
+ "entropy": 0.21465991251170635,
1396
  "epoch": 1.673813169984686,
1397
+ "grad_norm": 0.2177734375,
1398
  "learning_rate": 3.414634146341464e-05,
1399
+ "loss": 0.035167597234249115,
1400
+ "mean_token_accuracy": 0.9907721877098083,
1401
+ "num_tokens": 760201.0,
1402
  "step": 137
1403
  },
1404
  {
1405
+ "entropy": 0.2013603514060378,
1406
  "epoch": 1.686064318529862,
1407
+ "grad_norm": 0.2275390625,
1408
  "learning_rate": 3.292682926829269e-05,
1409
+ "loss": 0.038648657500743866,
1410
+ "mean_token_accuracy": 0.9927868358790874,
1411
+ "num_tokens": 767298.0,
1412
  "step": 138
1413
  },
1414
  {
1415
+ "entropy": 0.21827432699501514,
1416
  "epoch": 1.6983154670750382,
1417
+ "grad_norm": 0.3984375,
1418
  "learning_rate": 3.170731707317073e-05,
1419
+ "loss": 0.03258686885237694,
1420
+ "mean_token_accuracy": 0.9875011034309864,
1421
+ "num_tokens": 772255.0,
1422
  "step": 139
1423
  },
1424
  {
1425
+ "entropy": 0.2216914091259241,
1426
  "epoch": 1.7105666156202144,
1427
+ "grad_norm": 0.2431640625,
1428
  "learning_rate": 3.048780487804878e-05,
1429
+ "loss": 0.02989918179810047,
1430
+ "mean_token_accuracy": 0.9894328564405441,
1431
+ "num_tokens": 777953.0,
1432
  "step": 140
1433
  },
1434
  {
1435
+ "entropy": 0.1955341473221779,
1436
  "epoch": 1.7228177641653906,
1437
+ "grad_norm": 0.19140625,
1438
  "learning_rate": 2.926829268292683e-05,
1439
+ "loss": 0.03125489503145218,
1440
+ "mean_token_accuracy": 0.990275178104639,
1441
+ "num_tokens": 783196.0,
1442
  "step": 141
1443
  },
1444
  {
1445
+ "entropy": 0.20080329850316048,
1446
  "epoch": 1.7350689127105667,
1447
+ "grad_norm": 0.205078125,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
+ "loss": 0.02998793125152588,
1450
+ "mean_token_accuracy": 0.990708488970995,
1451
+ "num_tokens": 787962.0,
1452
  "step": 142
1453
  },
1454
  {
1455
+ "entropy": 0.23265999322757125,
1456
  "epoch": 1.7473200612557427,
1457
+ "grad_norm": 0.486328125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
+ "loss": 0.05563311651349068,
1460
+ "mean_token_accuracy": 0.9868011735379696,
1461
+ "num_tokens": 792930.0,
1462
  "step": 143
1463
  },
1464
  {
1465
+ "entropy": 0.2262994982302189,
1466
  "epoch": 1.7595712098009189,
1467
+ "grad_norm": 0.3125,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
+ "loss": 0.03620155528187752,
1470
+ "mean_token_accuracy": 0.9913722947239876,
1471
+ "num_tokens": 798789.0,
1472
  "step": 144
1473
  },
1474
  {
1475
+ "entropy": 0.216988081112504,
1476
  "epoch": 1.7718223583460948,
1477
+ "grad_norm": 0.4921875,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
+ "loss": 0.03459199145436287,
1480
+ "mean_token_accuracy": 0.985675573348999,
1481
+ "num_tokens": 803970.0,
1482
  "step": 145
1483
  },
1484
  {
1485
+ "entropy": 0.2017216570675373,
1486
  "epoch": 1.784073506891271,
1487
+ "grad_norm": 0.197265625,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
+ "loss": 0.03301437944173813,
1490
+ "mean_token_accuracy": 0.9892940744757652,
1491
+ "num_tokens": 810201.0,
1492
  "step": 146
1493
  },
1494
  {
1495
+ "entropy": 0.19614959321916103,
1496
  "epoch": 1.7963246554364471,
1497
+ "grad_norm": 0.251953125,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
+ "loss": 0.03109458088874817,
1500
+ "mean_token_accuracy": 0.9879713654518127,
1501
+ "num_tokens": 816419.0,
1502
  "step": 147
1503
  },
1504
  {
1505
+ "entropy": 0.22246063826605678,
1506
  "epoch": 1.8085758039816233,
1507
+ "grad_norm": 0.28125,
1508
  "learning_rate": 2.073170731707317e-05,
1509
+ "loss": 0.03694477677345276,
1510
+ "mean_token_accuracy": 0.9848503768444061,
1511
+ "num_tokens": 822186.0,
1512
  "step": 148
1513
  },
1514
  {
1515
+ "entropy": 0.22035463713109493,
1516
  "epoch": 1.8208269525267995,
1517
+ "grad_norm": 0.171875,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
+ "loss": 0.023739267140626907,
1520
+ "mean_token_accuracy": 0.9927939847111702,
1521
+ "num_tokens": 826733.0,
1522
  "step": 149
1523
  },
1524
  {
1525
+ "entropy": 0.21801204327493906,
1526
  "epoch": 1.8330781010719757,
1527
+ "grad_norm": 0.1884765625,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
+ "loss": 0.03263175114989281,
1530
+ "mean_token_accuracy": 0.9923874475061893,
1531
+ "num_tokens": 833042.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
+ "eval_entropy": 0.21679833023876383,
1537
+ "eval_loss": 0.0704953595995903,
1538
+ "eval_mean_token_accuracy": 0.9732788464297419,
1539
+ "eval_num_tokens": 833042.0,
1540
+ "eval_runtime": 57.5678,
1541
+ "eval_samples_per_second": 1.199,
1542
+ "eval_steps_per_second": 1.199,
1543
  "step": 150
1544
  }
1545
  ],
 
1560
  "attributes": {}
1561
  }
1562
  },
1563
+ "total_flos": 3.772114663523942e+16,
1564
  "train_batch_size": 1,
1565
  "trial_name": null,
1566
  "trial_params": null
checkpoint-164/adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "gate_proj",
33
- "k_proj",
34
  "up_proj",
35
- "o_proj",
 
 
36
  "down_proj",
37
  "v_proj",
38
- "q_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "up_proj",
33
+ "k_proj",
34
+ "gate_proj",
35
+ "q_proj",
36
  "down_proj",
37
  "v_proj",
38
+ "o_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
checkpoint-164/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1cfd52fdf227b7e674c9a251aff7c304abf9a7a8919fce7857b076b93ca8e43
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c50446119658169e4b938126c27c77bafe70a24c07da2bdb29a80ed9e37b2df
3
  size 83946192
checkpoint-164/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:790ca2f2587106fb193d92d7f10c85139892f97a9ddd96f10bc3dd3b1f58f028
3
  size 85728997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bea6f61b24c4a9a6bbb5e0f6a512e06d1c420f539f6a58e01695989bdde29a5
3
  size 85728997
checkpoint-164/trainer_state.json CHANGED
@@ -10,1676 +10,1676 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 0.32083135563880205,
14
  "epoch": 0.01225114854517611,
15
- "grad_norm": 0.134765625,
16
  "learning_rate": 0.0002,
17
- "loss": 0.019214527681469917,
18
- "mean_token_accuracy": 0.9918519593775272,
19
- "num_tokens": 6092.0,
20
  "step": 1
21
  },
22
  {
23
- "entropy": 0.3576695416122675,
24
  "epoch": 0.02450229709035222,
25
- "grad_norm": 0.50390625,
26
  "learning_rate": 0.00019878048780487805,
27
- "loss": 0.03324645012617111,
28
- "mean_token_accuracy": 0.988272774964571,
29
- "num_tokens": 11535.0,
30
  "step": 2
31
  },
32
  {
33
- "entropy": 0.33352363388985395,
34
  "epoch": 0.036753445635528334,
35
- "grad_norm": 0.0272216796875,
36
  "learning_rate": 0.0001975609756097561,
37
- "loss": 0.0017091021873056889,
38
- "mean_token_accuracy": 1.0,
39
- "num_tokens": 16432.0,
40
  "step": 3
41
  },
42
  {
43
- "entropy": 0.35098350048065186,
44
  "epoch": 0.04900459418070444,
45
- "grad_norm": 0.06640625,
46
  "learning_rate": 0.00019634146341463416,
47
- "loss": 0.00414489908143878,
48
- "mean_token_accuracy": 0.9985632188618183,
49
- "num_tokens": 20507.0,
50
  "step": 4
51
  },
52
  {
53
- "entropy": 0.3005372080951929,
54
  "epoch": 0.06125574272588055,
55
- "grad_norm": 0.01416015625,
56
  "learning_rate": 0.0001951219512195122,
57
- "loss": 0.0008560216519981623,
58
- "mean_token_accuracy": 1.0,
59
- "num_tokens": 26122.0,
60
  "step": 5
61
  },
62
  {
63
- "entropy": 0.3177621979266405,
64
  "epoch": 0.07350689127105667,
65
- "grad_norm": 0.008544921875,
66
  "learning_rate": 0.00019390243902439025,
67
- "loss": 0.0005585744511336088,
68
- "mean_token_accuracy": 1.0,
69
- "num_tokens": 30847.0,
70
  "step": 6
71
  },
72
  {
73
- "entropy": 0.27754624653607607,
74
  "epoch": 0.08575803981623277,
75
- "grad_norm": 0.019775390625,
76
  "learning_rate": 0.0001926829268292683,
77
- "loss": 0.0012820134870707989,
78
- "mean_token_accuracy": 0.9998413696885109,
79
- "num_tokens": 36541.0,
80
  "step": 7
81
  },
82
  {
83
- "entropy": 0.30307829193770885,
84
  "epoch": 0.09800918836140889,
85
- "grad_norm": 0.004364013671875,
86
  "learning_rate": 0.00019146341463414633,
87
- "loss": 0.0003136860905215144,
88
- "mean_token_accuracy": 1.0,
89
- "num_tokens": 41001.0,
90
  "step": 8
91
  },
92
  {
93
- "entropy": 0.31226138956844807,
94
  "epoch": 0.11026033690658499,
95
- "grad_norm": 0.11767578125,
96
  "learning_rate": 0.0001902439024390244,
97
- "loss": 0.006275261752307415,
98
- "mean_token_accuracy": 0.9993216060101986,
99
- "num_tokens": 45467.0,
100
  "step": 9
101
  },
102
  {
103
- "entropy": 0.2779384208843112,
104
  "epoch": 0.1225114854517611,
105
- "grad_norm": 0.011474609375,
106
  "learning_rate": 0.00018902439024390244,
107
- "loss": 0.0006869531353004277,
108
- "mean_token_accuracy": 1.0,
109
- "num_tokens": 50478.0,
110
  "step": 10
111
  },
112
  {
113
- "entropy": 0.27587867714464664,
114
  "epoch": 0.13476263399693722,
115
- "grad_norm": 0.00188446044921875,
116
  "learning_rate": 0.0001878048780487805,
117
- "loss": 0.0001916390028782189,
118
- "mean_token_accuracy": 1.0,
119
- "num_tokens": 56181.0,
120
  "step": 11
121
  },
122
  {
123
- "entropy": 0.2948900917544961,
124
  "epoch": 0.14701378254211334,
125
- "grad_norm": 0.07177734375,
126
  "learning_rate": 0.00018658536585365856,
127
- "loss": 0.001886777114123106,
128
- "mean_token_accuracy": 0.9998650103807449,
129
- "num_tokens": 62946.0,
130
  "step": 12
131
  },
132
  {
133
- "entropy": 0.29555963445454836,
134
  "epoch": 0.15926493108728942,
135
- "grad_norm": 0.005523681640625,
136
  "learning_rate": 0.0001853658536585366,
137
- "loss": 0.00017441912495996803,
138
- "mean_token_accuracy": 1.0,
139
- "num_tokens": 68436.0,
140
  "step": 13
141
  },
142
  {
143
- "entropy": 0.287986209616065,
144
  "epoch": 0.17151607963246554,
145
- "grad_norm": 0.02001953125,
146
  "learning_rate": 0.00018414634146341464,
147
- "loss": 0.00017802949878387153,
148
- "mean_token_accuracy": 1.0,
149
- "num_tokens": 73603.0,
150
  "step": 14
151
  },
152
  {
153
- "entropy": 0.3127295421436429,
154
  "epoch": 0.18376722817764166,
155
- "grad_norm": 0.06787109375,
156
  "learning_rate": 0.0001829268292682927,
157
- "loss": 0.0010371531825512648,
158
- "mean_token_accuracy": 0.9995941556990147,
159
- "num_tokens": 77845.0,
160
  "step": 15
161
  },
162
  {
163
- "entropy": 0.2922206539660692,
164
  "epoch": 0.19601837672281777,
165
- "grad_norm": 0.00118255615234375,
166
  "learning_rate": 0.00018170731707317075,
167
- "loss": 0.00011905122664757073,
168
- "mean_token_accuracy": 1.0,
169
- "num_tokens": 82744.0,
170
  "step": 16
171
  },
172
  {
173
- "entropy": 0.2928574001416564,
174
  "epoch": 0.2082695252679939,
175
- "grad_norm": 0.0003719329833984375,
176
  "learning_rate": 0.0001804878048780488,
177
- "loss": 7.616190850967541e-05,
178
- "mean_token_accuracy": 1.0,
179
- "num_tokens": 87453.0,
180
  "step": 17
181
  },
182
  {
183
- "entropy": 0.2979039028286934,
184
  "epoch": 0.22052067381316998,
185
- "grad_norm": 0.0026702880859375,
186
  "learning_rate": 0.00017926829268292684,
187
- "loss": 0.00012367898307275027,
188
- "mean_token_accuracy": 1.0,
189
- "num_tokens": 92321.0,
190
  "step": 18
191
  },
192
  {
193
- "entropy": 0.31858293898403645,
194
  "epoch": 0.2327718223583461,
195
- "grad_norm": 0.10498046875,
196
  "learning_rate": 0.00017804878048780488,
197
- "loss": 0.0006579139153473079,
198
- "mean_token_accuracy": 0.9997499994933605,
199
- "num_tokens": 97146.0,
200
  "step": 19
201
  },
202
  {
203
- "entropy": 0.30853591673076153,
204
  "epoch": 0.2450229709035222,
205
- "grad_norm": 0.004364013671875,
206
  "learning_rate": 0.00017682926829268295,
207
- "loss": 0.00014281428593676537,
208
- "mean_token_accuracy": 1.0,
209
- "num_tokens": 101943.0,
210
  "step": 20
211
  },
212
  {
213
- "entropy": 0.34037051256746054,
214
  "epoch": 0.2572741194486983,
215
- "grad_norm": 0.056884765625,
216
  "learning_rate": 0.000175609756097561,
217
- "loss": 0.011726096272468567,
218
- "mean_token_accuracy": 0.9993422217667103,
219
- "num_tokens": 106772.0,
220
  "step": 21
221
  },
222
  {
223
- "entropy": 0.29644382931292057,
224
  "epoch": 0.26952526799387444,
225
- "grad_norm": 0.0023193359375,
226
  "learning_rate": 0.00017439024390243903,
227
- "loss": 0.00010672100324882194,
228
- "mean_token_accuracy": 1.0,
229
- "num_tokens": 112558.0,
230
  "step": 22
231
  },
232
  {
233
- "entropy": 0.3180191367864609,
234
  "epoch": 0.28177641653905056,
235
- "grad_norm": 0.000675201416015625,
236
  "learning_rate": 0.00017317073170731708,
237
- "loss": 9.894849790725857e-05,
238
- "mean_token_accuracy": 1.0,
239
- "num_tokens": 117489.0,
240
  "step": 23
241
  },
242
  {
243
- "entropy": 0.32946281880140305,
244
  "epoch": 0.29402756508422667,
245
- "grad_norm": 0.0242919921875,
246
  "learning_rate": 0.00017195121951219512,
247
- "loss": 0.0029232932720333338,
248
- "mean_token_accuracy": 0.9996279776096344,
249
- "num_tokens": 123010.0,
250
  "step": 24
251
  },
252
  {
253
- "entropy": 0.3180750487372279,
254
  "epoch": 0.30627871362940273,
255
- "grad_norm": 0.038330078125,
256
  "learning_rate": 0.0001707317073170732,
257
- "loss": 0.0015810562763363123,
258
- "mean_token_accuracy": 0.9990344606339931,
259
- "num_tokens": 127716.0,
260
  "step": 25
261
  },
262
  {
263
- "entropy": 0.31262985058128834,
264
  "epoch": 0.31852986217457885,
265
- "grad_norm": 0.0027313232421875,
266
  "learning_rate": 0.00016951219512195123,
267
- "loss": 0.00019670175970532,
268
- "mean_token_accuracy": 1.0,
269
- "num_tokens": 132372.0,
270
  "step": 26
271
  },
272
  {
273
- "entropy": 0.2831157138571143,
274
  "epoch": 0.33078101071975496,
275
- "grad_norm": 0.1484375,
276
  "learning_rate": 0.00016829268292682927,
277
- "loss": 0.003187144873663783,
278
- "mean_token_accuracy": 0.9994877055287361,
279
- "num_tokens": 137028.0,
280
  "step": 27
281
  },
282
  {
283
- "entropy": 0.3106652954593301,
284
  "epoch": 0.3430321592649311,
285
- "grad_norm": 0.05810546875,
286
  "learning_rate": 0.00016707317073170731,
287
- "loss": 0.004998125601559877,
288
- "mean_token_accuracy": 0.9980670101940632,
289
- "num_tokens": 142088.0,
290
  "step": 28
291
  },
292
  {
293
- "entropy": 0.31454288959503174,
294
  "epoch": 0.3552833078101072,
295
- "grad_norm": 0.0306396484375,
296
  "learning_rate": 0.00016585365853658536,
297
- "loss": 0.000461318384623155,
298
- "mean_token_accuracy": 1.0,
299
- "num_tokens": 147481.0,
300
  "step": 29
301
  },
302
  {
303
- "entropy": 0.33650430012494326,
304
  "epoch": 0.3675344563552833,
305
- "grad_norm": 0.0238037109375,
306
  "learning_rate": 0.00016463414634146343,
307
- "loss": 0.0005614800029434264,
308
- "mean_token_accuracy": 1.0,
309
- "num_tokens": 152973.0,
310
  "step": 30
311
  },
312
  {
313
- "entropy": 0.33513325452804565,
314
  "epoch": 0.37978560490045943,
315
- "grad_norm": 0.00604248046875,
316
  "learning_rate": 0.00016341463414634147,
317
- "loss": 0.00020872258755844086,
318
- "mean_token_accuracy": 1.0,
319
- "num_tokens": 156786.0,
320
  "step": 31
321
  },
322
  {
323
- "entropy": 0.34442581795156,
324
  "epoch": 0.39203675344563554,
325
- "grad_norm": 0.0159912109375,
326
  "learning_rate": 0.00016219512195121954,
327
- "loss": 0.00043797443504445255,
328
- "mean_token_accuracy": 1.0,
329
- "num_tokens": 162859.0,
330
  "step": 32
331
  },
332
  {
333
- "entropy": 0.34709672816097736,
334
  "epoch": 0.40428790199081166,
335
- "grad_norm": 0.04052734375,
336
  "learning_rate": 0.00016097560975609758,
337
- "loss": 0.0008612321689724922,
338
- "mean_token_accuracy": 1.0,
339
- "num_tokens": 167969.0,
340
  "step": 33
341
  },
342
  {
343
- "entropy": 0.31636961828917265,
344
  "epoch": 0.4165390505359878,
345
- "grad_norm": 0.048583984375,
346
  "learning_rate": 0.00015975609756097562,
347
- "loss": 0.001623529358766973,
348
- "mean_token_accuracy": 1.0,
349
- "num_tokens": 172518.0,
350
  "step": 34
351
  },
352
  {
353
- "entropy": 0.341240718960762,
354
  "epoch": 0.42879019908116384,
355
- "grad_norm": 0.0089111328125,
356
  "learning_rate": 0.00015853658536585366,
357
- "loss": 0.0004598334198817611,
358
- "mean_token_accuracy": 1.0,
359
- "num_tokens": 177085.0,
360
  "step": 35
361
  },
362
  {
363
- "entropy": 0.3331515807658434,
364
  "epoch": 0.44104134762633995,
365
- "grad_norm": 0.0137939453125,
366
  "learning_rate": 0.00015731707317073173,
367
- "loss": 0.00047711117076687515,
368
- "mean_token_accuracy": 1.0,
369
- "num_tokens": 181617.0,
370
  "step": 36
371
  },
372
  {
373
- "entropy": 0.2969168536365032,
374
  "epoch": 0.45329249617151607,
375
- "grad_norm": 0.0296630859375,
376
  "learning_rate": 0.00015609756097560978,
377
- "loss": 0.0018673602025955915,
378
- "mean_token_accuracy": 0.9982142858207226,
379
- "num_tokens": 186836.0,
380
  "step": 37
381
  },
382
  {
383
- "entropy": 0.3208611598238349,
384
  "epoch": 0.4655436447166922,
385
- "grad_norm": 0.0034027099609375,
386
  "learning_rate": 0.00015487804878048782,
387
- "loss": 0.00018661899957805872,
388
- "mean_token_accuracy": 1.0,
389
- "num_tokens": 191224.0,
390
  "step": 38
391
  },
392
  {
393
- "entropy": 0.296407300978899,
394
  "epoch": 0.4777947932618683,
395
- "grad_norm": 0.003570556640625,
396
  "learning_rate": 0.00015365853658536586,
397
- "loss": 0.0001632017083466053,
398
- "mean_token_accuracy": 1.0,
399
- "num_tokens": 195926.0,
400
  "step": 39
401
  },
402
  {
403
- "entropy": 0.32142599392682314,
404
  "epoch": 0.4900459418070444,
405
- "grad_norm": 0.0277099609375,
406
  "learning_rate": 0.0001524390243902439,
407
- "loss": 0.0039696223102509975,
408
- "mean_token_accuracy": 0.9992866478860378,
409
- "num_tokens": 200772.0,
410
  "step": 40
411
  },
412
  {
413
- "entropy": 0.3037592498585582,
414
  "epoch": 0.5022970903522205,
415
- "grad_norm": 0.0026092529296875,
416
  "learning_rate": 0.00015121951219512197,
417
- "loss": 0.00013867147208657116,
418
- "mean_token_accuracy": 1.0,
419
- "num_tokens": 204499.0,
420
  "step": 41
421
  },
422
  {
423
- "entropy": 0.31665132474154234,
424
  "epoch": 0.5145482388973966,
425
- "grad_norm": 0.004730224609375,
426
  "learning_rate": 0.00015000000000000001,
427
- "loss": 0.00025882094632834196,
428
- "mean_token_accuracy": 1.0,
429
- "num_tokens": 208814.0,
430
  "step": 42
431
  },
432
  {
433
- "entropy": 0.33023010194301605,
434
  "epoch": 0.5267993874425727,
435
- "grad_norm": 0.001922607421875,
436
  "learning_rate": 0.00014878048780487806,
437
- "loss": 0.00019074659212492406,
438
- "mean_token_accuracy": 1.0,
439
- "num_tokens": 213907.0,
440
  "step": 43
441
  },
442
  {
443
- "entropy": 0.334543508477509,
444
  "epoch": 0.5390505359877489,
445
- "grad_norm": 0.0018157958984375,
446
  "learning_rate": 0.0001475609756097561,
447
- "loss": 0.00011566472676349804,
448
- "mean_token_accuracy": 1.0,
449
- "num_tokens": 218988.0,
450
  "step": 44
451
  },
452
  {
453
- "entropy": 0.3078083451837301,
454
  "epoch": 0.5513016845329249,
455
- "grad_norm": 0.03515625,
456
  "learning_rate": 0.00014634146341463414,
457
- "loss": 0.0022110757417976856,
458
- "mean_token_accuracy": 0.9987903237342834,
459
- "num_tokens": 223595.0,
460
  "step": 45
461
  },
462
  {
463
- "entropy": 0.32667472772300243,
464
  "epoch": 0.5635528330781011,
465
- "grad_norm": 0.034423828125,
466
  "learning_rate": 0.0001451219512195122,
467
- "loss": 0.0010719874408096075,
468
- "mean_token_accuracy": 0.9991953931748867,
469
- "num_tokens": 228244.0,
470
  "step": 46
471
  },
472
  {
473
- "entropy": 0.3273861287161708,
474
  "epoch": 0.5758039816232772,
475
- "grad_norm": 0.00057220458984375,
476
  "learning_rate": 0.00014390243902439025,
477
- "loss": 6.594268779736012e-05,
478
- "mean_token_accuracy": 1.0,
479
- "num_tokens": 232606.0,
480
  "step": 47
481
  },
482
  {
483
- "entropy": 0.31728990003466606,
484
  "epoch": 0.5880551301684533,
485
- "grad_norm": 0.0003185272216796875,
486
  "learning_rate": 0.0001426829268292683,
487
- "loss": 8.574798266636208e-05,
488
- "mean_token_accuracy": 1.0,
489
- "num_tokens": 236563.0,
490
  "step": 48
491
  },
492
  {
493
- "entropy": 0.34826087579131126,
494
  "epoch": 0.6003062787136294,
495
- "grad_norm": 0.00390625,
496
  "learning_rate": 0.00014146341463414634,
497
- "loss": 0.00015243196685332805,
498
- "mean_token_accuracy": 1.0,
499
- "num_tokens": 241214.0,
500
  "step": 49
501
  },
502
  {
503
- "entropy": 0.3367287954315543,
504
  "epoch": 0.6125574272588055,
505
- "grad_norm": 0.003265380859375,
506
  "learning_rate": 0.00014024390243902438,
507
- "loss": 0.0001341242023045197,
508
- "mean_token_accuracy": 1.0,
509
- "num_tokens": 245200.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
- "eval_entropy": 0.3212364659361217,
515
- "eval_loss": 0.0014040147652849555,
516
- "eval_mean_token_accuracy": 0.9998166846192401,
517
- "eval_num_tokens": 245200.0,
518
- "eval_runtime": 51.1353,
519
- "eval_samples_per_second": 1.349,
520
- "eval_steps_per_second": 1.349,
521
  "step": 50
522
  },
523
  {
524
- "entropy": 0.3274610061198473,
525
  "epoch": 0.6248085758039816,
526
- "grad_norm": 0.000518798828125,
527
  "learning_rate": 0.00013902439024390245,
528
- "loss": 6.213193410076201e-05,
529
- "mean_token_accuracy": 1.0,
530
- "num_tokens": 249761.0,
531
  "step": 51
532
  },
533
  {
534
- "entropy": 0.3302043145522475,
535
  "epoch": 0.6370597243491577,
536
- "grad_norm": 0.00067901611328125,
537
  "learning_rate": 0.0001378048780487805,
538
- "loss": 7.391967665171251e-05,
539
- "mean_token_accuracy": 1.0,
540
- "num_tokens": 254787.0,
541
  "step": 52
542
  },
543
  {
544
- "entropy": 0.3345805983990431,
545
  "epoch": 0.6493108728943339,
546
- "grad_norm": 0.064453125,
547
  "learning_rate": 0.00013658536585365856,
548
- "loss": 0.008045142516493797,
549
- "mean_token_accuracy": 0.9975476562976837,
550
- "num_tokens": 260287.0,
551
  "step": 53
552
  },
553
  {
554
- "entropy": 0.3093695640563965,
555
  "epoch": 0.6615620214395099,
556
- "grad_norm": 0.036865234375,
557
  "learning_rate": 0.0001353658536585366,
558
- "loss": 0.0016300748102366924,
559
- "mean_token_accuracy": 0.9998249299824238,
560
- "num_tokens": 264810.0,
561
  "step": 54
562
  },
563
  {
564
- "entropy": 0.33090174850076437,
565
  "epoch": 0.6738131699846861,
566
- "grad_norm": 0.04052734375,
567
  "learning_rate": 0.00013414634146341464,
568
- "loss": 0.0037348291371017694,
569
- "mean_token_accuracy": 0.9990433678030968,
570
- "num_tokens": 270386.0,
571
  "step": 55
572
  },
573
  {
574
- "entropy": 0.3455248447135091,
575
  "epoch": 0.6860643185298622,
576
- "grad_norm": 0.0301513671875,
577
  "learning_rate": 0.0001329268292682927,
578
- "loss": 0.0006253286846913397,
579
- "mean_token_accuracy": 1.0,
580
- "num_tokens": 274391.0,
581
  "step": 56
582
  },
583
  {
584
- "entropy": 0.3408086858689785,
585
  "epoch": 0.6983154670750383,
586
- "grad_norm": 0.0033111572265625,
587
  "learning_rate": 0.00013170731707317076,
588
- "loss": 0.00020847572886850685,
589
- "mean_token_accuracy": 1.0,
590
- "num_tokens": 279716.0,
591
  "step": 57
592
  },
593
  {
594
- "entropy": 0.29423840064555407,
595
  "epoch": 0.7105666156202144,
596
- "grad_norm": 0.125,
597
  "learning_rate": 0.0001304878048780488,
598
- "loss": 0.005600863602012396,
599
- "mean_token_accuracy": 0.998680267482996,
600
- "num_tokens": 285404.0,
601
  "step": 58
602
  },
603
  {
604
- "entropy": 0.33689095824956894,
605
  "epoch": 0.7228177641653905,
606
- "grad_norm": 0.057861328125,
607
  "learning_rate": 0.00012926829268292684,
608
- "loss": 0.009100214578211308,
609
- "mean_token_accuracy": 0.9967310577630997,
610
- "num_tokens": 290021.0,
611
  "step": 59
612
  },
613
  {
614
- "entropy": 0.3336018780246377,
615
  "epoch": 0.7350689127105666,
616
- "grad_norm": 0.005889892578125,
617
  "learning_rate": 0.00012804878048780488,
618
- "loss": 0.00015729578444734216,
619
- "mean_token_accuracy": 1.0,
620
- "num_tokens": 294890.0,
621
  "step": 60
622
  },
623
  {
624
- "entropy": 0.30060291569679976,
625
  "epoch": 0.7473200612557427,
626
- "grad_norm": 0.0172119140625,
627
  "learning_rate": 0.00012682926829268293,
628
- "loss": 0.00039864826248958707,
629
- "mean_token_accuracy": 0.9993686862289906,
630
- "num_tokens": 300384.0,
631
  "step": 61
632
  },
633
  {
634
- "entropy": 0.36021818965673447,
635
  "epoch": 0.7595712098009189,
636
- "grad_norm": 0.0025634765625,
637
  "learning_rate": 0.000125609756097561,
638
- "loss": 0.00016568033606745303,
639
- "mean_token_accuracy": 1.0,
640
- "num_tokens": 305805.0,
641
  "step": 62
642
  },
643
  {
644
- "entropy": 0.32536453381180763,
645
  "epoch": 0.7718223583460949,
646
- "grad_norm": 0.001800537109375,
647
  "learning_rate": 0.00012439024390243904,
648
- "loss": 0.00014585268218070269,
649
- "mean_token_accuracy": 1.0,
650
- "num_tokens": 310233.0,
651
  "step": 63
652
  },
653
  {
654
- "entropy": 0.31967335008084774,
655
  "epoch": 0.7840735068912711,
656
- "grad_norm": 0.0010223388671875,
657
  "learning_rate": 0.00012317073170731708,
658
- "loss": 0.00010060967179015279,
659
- "mean_token_accuracy": 1.0,
660
- "num_tokens": 314234.0,
661
  "step": 64
662
  },
663
  {
664
- "entropy": 0.34358128905296326,
665
  "epoch": 0.7963246554364471,
666
- "grad_norm": 0.000743865966796875,
667
  "learning_rate": 0.00012195121951219512,
668
- "loss": 9.478208812652156e-05,
669
- "mean_token_accuracy": 1.0,
670
- "num_tokens": 319186.0,
671
  "step": 65
672
  },
673
  {
674
- "entropy": 0.33988895174115896,
675
  "epoch": 0.8085758039816233,
676
- "grad_norm": 0.0419921875,
677
  "learning_rate": 0.00012073170731707318,
678
- "loss": 0.0011607923079282045,
679
- "mean_token_accuracy": 0.9995629377663136,
680
- "num_tokens": 324710.0,
681
  "step": 66
682
  },
683
  {
684
- "entropy": 0.3078791871666908,
685
  "epoch": 0.8208269525267994,
686
- "grad_norm": 0.05859375,
687
  "learning_rate": 0.00011951219512195122,
688
- "loss": 0.016102174296975136,
689
- "mean_token_accuracy": 0.9935315921902657,
690
- "num_tokens": 329942.0,
691
  "step": 67
692
  },
693
  {
694
- "entropy": 0.3587793167680502,
695
  "epoch": 0.8330781010719756,
696
- "grad_norm": 0.002716064453125,
697
  "learning_rate": 0.00011829268292682926,
698
- "loss": 0.0001911829021992162,
699
- "mean_token_accuracy": 1.0,
700
- "num_tokens": 334487.0,
701
  "step": 68
702
  },
703
  {
704
- "entropy": 0.360817888751626,
705
  "epoch": 0.8453292496171516,
706
- "grad_norm": 0.003753662109375,
707
  "learning_rate": 0.00011707317073170732,
708
- "loss": 0.00026575953233987093,
709
- "mean_token_accuracy": 1.0,
710
- "num_tokens": 338184.0,
711
  "step": 69
712
  },
713
  {
714
- "entropy": 0.3788213599473238,
715
  "epoch": 0.8575803981623277,
716
- "grad_norm": 0.07421875,
717
  "learning_rate": 0.00011585365853658536,
718
- "loss": 0.007251895032823086,
719
- "mean_token_accuracy": 0.997805867344141,
720
- "num_tokens": 342594.0,
721
  "step": 70
722
  },
723
  {
724
- "entropy": 0.37989665009081364,
725
  "epoch": 0.8698315467075038,
726
- "grad_norm": 0.0361328125,
727
  "learning_rate": 0.00011463414634146342,
728
- "loss": 0.001519644632935524,
729
- "mean_token_accuracy": 0.9997807033360004,
730
- "num_tokens": 347798.0,
731
  "step": 71
732
  },
733
  {
734
- "entropy": 0.35538383200764656,
735
  "epoch": 0.8820826952526799,
736
- "grad_norm": 0.0038604736328125,
737
  "learning_rate": 0.00011341463414634146,
738
- "loss": 0.00030194621649570763,
739
- "mean_token_accuracy": 1.0,
740
- "num_tokens": 352122.0,
741
  "step": 72
742
  },
743
  {
744
- "entropy": 0.36578258499503136,
745
  "epoch": 0.8943338437978561,
746
- "grad_norm": 0.02001953125,
747
  "learning_rate": 0.00011219512195121953,
748
- "loss": 0.0018432819051668048,
749
- "mean_token_accuracy": 0.9997568093240261,
750
- "num_tokens": 357944.0,
751
  "step": 73
752
  },
753
  {
754
- "entropy": 0.3363148244097829,
755
  "epoch": 0.9065849923430321,
756
- "grad_norm": 0.01214599609375,
757
  "learning_rate": 0.00011097560975609757,
758
- "loss": 0.0004945008549839258,
759
- "mean_token_accuracy": 1.0,
760
- "num_tokens": 363815.0,
761
  "step": 74
762
  },
763
  {
764
- "entropy": 0.3567014401778579,
765
  "epoch": 0.9188361408882083,
766
- "grad_norm": 0.00160980224609375,
767
  "learning_rate": 0.00010975609756097563,
768
- "loss": 0.0002087215252686292,
769
- "mean_token_accuracy": 1.0,
770
- "num_tokens": 368871.0,
771
  "step": 75
772
  },
773
  {
774
- "entropy": 0.3798025632277131,
775
  "epoch": 0.9310872894333844,
776
- "grad_norm": 0.0242919921875,
777
  "learning_rate": 0.00010853658536585367,
778
- "loss": 0.0011810146970674396,
779
- "mean_token_accuracy": 0.999143835157156,
780
- "num_tokens": 373671.0,
781
  "step": 76
782
  },
783
  {
784
- "entropy": 0.3385667558759451,
785
  "epoch": 0.9433384379785605,
786
- "grad_norm": 0.00164031982421875,
787
  "learning_rate": 0.00010731707317073172,
788
- "loss": 0.00021391667542047799,
789
- "mean_token_accuracy": 1.0,
790
- "num_tokens": 379038.0,
791
  "step": 77
792
  },
793
  {
794
- "entropy": 0.37137152813374996,
795
  "epoch": 0.9555895865237366,
796
- "grad_norm": 0.0194091796875,
797
  "learning_rate": 0.00010609756097560977,
798
- "loss": 0.0009015509858727455,
799
- "mean_token_accuracy": 0.9992977529764175,
800
- "num_tokens": 384253.0,
801
  "step": 78
802
  },
803
  {
804
- "entropy": 0.35634181648492813,
805
  "epoch": 0.9678407350689127,
806
- "grad_norm": 0.002349853515625,
807
  "learning_rate": 0.00010487804878048781,
808
- "loss": 0.0003007323248311877,
809
- "mean_token_accuracy": 1.0,
810
- "num_tokens": 388348.0,
811
  "step": 79
812
  },
813
  {
814
- "entropy": 0.3363165808841586,
815
  "epoch": 0.9800918836140888,
816
- "grad_norm": 0.013916015625,
817
  "learning_rate": 0.00010365853658536586,
818
- "loss": 0.0015124119818210602,
819
- "mean_token_accuracy": 0.999507874250412,
820
- "num_tokens": 394214.0,
821
  "step": 80
822
  },
823
  {
824
- "entropy": 0.34769035689532757,
825
  "epoch": 0.9923430321592649,
826
- "grad_norm": 0.0264892578125,
827
  "learning_rate": 0.0001024390243902439,
828
- "loss": 0.0008837911300361156,
829
- "mean_token_accuracy": 0.9992187507450581,
830
- "num_tokens": 399114.0,
831
  "step": 81
832
  },
833
  {
834
- "entropy": 0.34723484665155413,
835
  "epoch": 1.0,
836
- "grad_norm": 0.002288818359375,
837
  "learning_rate": 0.00010121951219512196,
838
- "loss": 0.0002318796032341197,
839
- "mean_token_accuracy": 1.0,
840
- "num_tokens": 402130.0,
841
  "step": 82
842
  },
843
  {
844
- "entropy": 0.3677198924124241,
845
  "epoch": 1.0122511485451762,
846
- "grad_norm": 0.03173828125,
847
  "learning_rate": 0.0001,
848
- "loss": 0.0028767124749720097,
849
- "mean_token_accuracy": 0.9997509978711605,
850
- "num_tokens": 406761.0,
851
  "step": 83
852
  },
853
  {
854
- "entropy": 0.3296260507777333,
855
  "epoch": 1.0245022970903521,
856
- "grad_norm": 0.0016326904296875,
857
  "learning_rate": 9.878048780487805e-05,
858
- "loss": 0.00020801745995413512,
859
- "mean_token_accuracy": 1.0,
860
- "num_tokens": 411367.0,
861
  "step": 84
862
  },
863
  {
864
- "entropy": 0.36815651040524244,
865
  "epoch": 1.0367534456355283,
866
- "grad_norm": 0.00299072265625,
867
  "learning_rate": 9.75609756097561e-05,
868
- "loss": 0.00034169916762039065,
869
- "mean_token_accuracy": 1.0,
870
- "num_tokens": 417768.0,
871
  "step": 85
872
  },
873
  {
874
- "entropy": 0.33015719801187515,
875
  "epoch": 1.0490045941807045,
876
- "grad_norm": 0.0019683837890625,
877
  "learning_rate": 9.634146341463415e-05,
878
- "loss": 0.0002285851223859936,
879
- "mean_token_accuracy": 1.0,
880
- "num_tokens": 421738.0,
881
  "step": 86
882
  },
883
  {
884
- "entropy": 0.33297139778733253,
885
  "epoch": 1.0612557427258806,
886
- "grad_norm": 0.0003604888916015625,
887
  "learning_rate": 9.51219512195122e-05,
888
- "loss": 0.00012145948858233169,
889
- "mean_token_accuracy": 1.0,
890
- "num_tokens": 426854.0,
891
  "step": 87
892
  },
893
  {
894
- "entropy": 0.4070947393774986,
895
  "epoch": 1.0735068912710566,
896
- "grad_norm": 0.017333984375,
897
  "learning_rate": 9.390243902439024e-05,
898
- "loss": 0.0016109611606225371,
899
- "mean_token_accuracy": 0.9998486675322056,
900
- "num_tokens": 431083.0,
901
  "step": 88
902
  },
903
  {
904
- "entropy": 0.3781026881188154,
905
  "epoch": 1.0857580398162328,
906
- "grad_norm": 0.038818359375,
907
  "learning_rate": 9.26829268292683e-05,
908
- "loss": 0.003159651067107916,
909
- "mean_token_accuracy": 0.9989801794290543,
910
- "num_tokens": 435694.0,
911
  "step": 89
912
  },
913
  {
914
- "entropy": 0.3439221568405628,
915
  "epoch": 1.098009188361409,
916
- "grad_norm": 0.000949859619140625,
917
  "learning_rate": 9.146341463414635e-05,
918
- "loss": 0.00018103225738741457,
919
- "mean_token_accuracy": 1.0,
920
- "num_tokens": 440578.0,
921
  "step": 90
922
  },
923
  {
924
- "entropy": 0.38779534585773945,
925
  "epoch": 1.110260336906585,
926
- "grad_norm": 0.0142822265625,
927
  "learning_rate": 9.02439024390244e-05,
928
- "loss": 0.002015941310673952,
929
- "mean_token_accuracy": 0.9984939768910408,
930
- "num_tokens": 445238.0,
931
  "step": 91
932
  },
933
  {
934
- "entropy": 0.3697750475257635,
935
  "epoch": 1.122511485451761,
936
- "grad_norm": 0.08642578125,
937
  "learning_rate": 8.902439024390244e-05,
938
- "loss": 0.006127167027443647,
939
- "mean_token_accuracy": 0.9989957921206951,
940
- "num_tokens": 449993.0,
941
  "step": 92
942
  },
943
  {
944
- "entropy": 0.34917816519737244,
945
  "epoch": 1.1347626339969372,
946
- "grad_norm": 0.0037384033203125,
947
  "learning_rate": 8.78048780487805e-05,
948
- "loss": 0.00024314325128216296,
949
- "mean_token_accuracy": 1.0,
950
- "num_tokens": 454976.0,
951
  "step": 93
952
  },
953
  {
954
- "entropy": 0.3524725306779146,
955
  "epoch": 1.1470137825421134,
956
- "grad_norm": 0.00104522705078125,
957
  "learning_rate": 8.658536585365854e-05,
958
- "loss": 0.00014462518447544426,
959
- "mean_token_accuracy": 1.0,
960
- "num_tokens": 459671.0,
961
  "step": 94
962
  },
963
  {
964
- "entropy": 0.3524913527071476,
965
  "epoch": 1.1592649310872893,
966
- "grad_norm": 0.000782012939453125,
967
  "learning_rate": 8.53658536585366e-05,
968
- "loss": 0.0001363266637781635,
969
- "mean_token_accuracy": 1.0,
970
- "num_tokens": 464310.0,
971
  "step": 95
972
  },
973
  {
974
- "entropy": 0.33474782202392817,
975
  "epoch": 1.1715160796324655,
976
- "grad_norm": 0.05615234375,
977
  "learning_rate": 8.414634146341464e-05,
978
- "loss": 0.006995758973062038,
979
- "mean_token_accuracy": 0.997385773807764,
980
- "num_tokens": 468855.0,
981
  "step": 96
982
  },
983
  {
984
- "entropy": 0.34024662896990776,
985
  "epoch": 1.1837672281776417,
986
- "grad_norm": 0.000762939453125,
987
  "learning_rate": 8.292682926829268e-05,
988
- "loss": 0.00012206919927848503,
989
- "mean_token_accuracy": 1.0,
990
- "num_tokens": 473729.0,
991
  "step": 97
992
  },
993
  {
994
- "entropy": 0.35474758967757225,
995
  "epoch": 1.1960183767228179,
996
- "grad_norm": 0.032958984375,
997
  "learning_rate": 8.170731707317073e-05,
998
- "loss": 0.0028819667641073465,
999
- "mean_token_accuracy": 0.9993131868541241,
1000
- "num_tokens": 479034.0,
1001
  "step": 98
1002
  },
1003
  {
1004
- "entropy": 0.3854726795107126,
1005
  "epoch": 1.2082695252679938,
1006
- "grad_norm": 0.00046539306640625,
1007
  "learning_rate": 8.048780487804879e-05,
1008
- "loss": 9.724850679049268e-05,
1009
- "mean_token_accuracy": 1.0,
1010
- "num_tokens": 484808.0,
1011
  "step": 99
1012
  },
1013
  {
1014
- "entropy": 0.31455889251083136,
1015
  "epoch": 1.22052067381317,
1016
- "grad_norm": 0.00958251953125,
1017
  "learning_rate": 7.926829268292683e-05,
1018
- "loss": 0.0009833230869844556,
1019
- "mean_token_accuracy": 1.0,
1020
- "num_tokens": 489519.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
- "eval_entropy": 0.3496412036643512,
1026
- "eval_loss": 0.0005010219174437225,
1027
- "eval_mean_token_accuracy": 0.9998490343923154,
1028
- "eval_num_tokens": 489519.0,
1029
- "eval_runtime": 51.1698,
1030
- "eval_samples_per_second": 1.348,
1031
- "eval_steps_per_second": 1.348,
1032
  "step": 100
1033
  },
1034
  {
1035
- "entropy": 0.36140021588653326,
1036
  "epoch": 1.2327718223583461,
1037
- "grad_norm": 0.0006256103515625,
1038
  "learning_rate": 7.804878048780489e-05,
1039
- "loss": 0.00011641360470093787,
1040
- "mean_token_accuracy": 1.0,
1041
- "num_tokens": 494754.0,
1042
  "step": 101
1043
  },
1044
  {
1045
- "entropy": 0.33879768289625645,
1046
  "epoch": 1.245022970903522,
1047
- "grad_norm": 0.00037384033203125,
1048
  "learning_rate": 7.682926829268293e-05,
1049
- "loss": 0.00010185636347159743,
1050
- "mean_token_accuracy": 1.0,
1051
- "num_tokens": 499834.0,
1052
  "step": 102
1053
  },
1054
  {
1055
- "entropy": 0.36160764284431934,
1056
  "epoch": 1.2572741194486983,
1057
- "grad_norm": 0.00103759765625,
1058
  "learning_rate": 7.560975609756099e-05,
1059
- "loss": 0.00012021363363601267,
1060
- "mean_token_accuracy": 1.0,
1061
- "num_tokens": 505264.0,
1062
  "step": 103
1063
  },
1064
  {
1065
- "entropy": 0.3344170628115535,
1066
  "epoch": 1.2695252679938744,
1067
- "grad_norm": 0.06787109375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
- "loss": 0.00044063289533369243,
1070
- "mean_token_accuracy": 0.9995915032923222,
1071
- "num_tokens": 510257.0,
1072
  "step": 104
1073
  },
1074
  {
1075
- "entropy": 0.36058457661420107,
1076
  "epoch": 1.2817764165390506,
1077
- "grad_norm": 0.01336669921875,
1078
  "learning_rate": 7.317073170731707e-05,
1079
- "loss": 0.0015127016231417656,
1080
- "mean_token_accuracy": 0.9993556700646877,
1081
- "num_tokens": 514490.0,
1082
  "step": 105
1083
  },
1084
  {
1085
- "entropy": 0.33314079977571964,
1086
  "epoch": 1.2940275650842268,
1087
- "grad_norm": 0.0011749267578125,
1088
  "learning_rate": 7.195121951219513e-05,
1089
- "loss": 0.00011071039625676349,
1090
- "mean_token_accuracy": 1.0,
1091
- "num_tokens": 519508.0,
1092
  "step": 106
1093
  },
1094
  {
1095
- "entropy": 0.3573821699246764,
1096
  "epoch": 1.3062787136294027,
1097
- "grad_norm": 0.0003986358642578125,
1098
  "learning_rate": 7.073170731707317e-05,
1099
- "loss": 0.00011713722778949887,
1100
- "mean_token_accuracy": 1.0,
1101
- "num_tokens": 524370.0,
1102
  "step": 107
1103
  },
1104
  {
1105
- "entropy": 0.3524222169071436,
1106
  "epoch": 1.318529862174579,
1107
- "grad_norm": 0.0003108978271484375,
1108
  "learning_rate": 6.951219512195122e-05,
1109
- "loss": 9.721294190967456e-05,
1110
- "mean_token_accuracy": 1.0,
1111
- "num_tokens": 528970.0,
1112
  "step": 108
1113
  },
1114
  {
1115
- "entropy": 0.3544369339942932,
1116
  "epoch": 1.3307810107197549,
1117
- "grad_norm": 0.005950927734375,
1118
  "learning_rate": 6.829268292682928e-05,
1119
- "loss": 0.0003032644744962454,
1120
- "mean_token_accuracy": 1.0,
1121
- "num_tokens": 533938.0,
1122
  "step": 109
1123
  },
1124
  {
1125
- "entropy": 0.3304135613143444,
1126
  "epoch": 1.343032159264931,
1127
- "grad_norm": 0.000965118408203125,
1128
  "learning_rate": 6.707317073170732e-05,
1129
- "loss": 0.00012454115494620055,
1130
- "mean_token_accuracy": 1.0,
1131
- "num_tokens": 539360.0,
1132
  "step": 110
1133
  },
1134
  {
1135
- "entropy": 0.3306180518120527,
1136
  "epoch": 1.3552833078101072,
1137
- "grad_norm": 0.0011444091796875,
1138
  "learning_rate": 6.585365853658538e-05,
1139
- "loss": 0.00013282139843795449,
1140
- "mean_token_accuracy": 1.0,
1141
- "num_tokens": 543728.0,
1142
  "step": 111
1143
  },
1144
  {
1145
- "entropy": 0.3708817586302757,
1146
  "epoch": 1.3675344563552834,
1147
- "grad_norm": 0.0218505859375,
1148
  "learning_rate": 6.463414634146342e-05,
1149
- "loss": 0.004361060913652182,
1150
- "mean_token_accuracy": 0.9983282573521137,
1151
- "num_tokens": 548161.0,
1152
  "step": 112
1153
  },
1154
  {
1155
- "entropy": 0.35475645773112774,
1156
  "epoch": 1.3797856049004595,
1157
- "grad_norm": 0.01361083984375,
1158
  "learning_rate": 6.341463414634146e-05,
1159
- "loss": 0.0014049941673874855,
1160
- "mean_token_accuracy": 0.998511902987957,
1161
- "num_tokens": 553690.0,
1162
  "step": 113
1163
  },
1164
  {
1165
- "entropy": 0.3360502114519477,
1166
  "epoch": 1.3920367534456355,
1167
- "grad_norm": 0.00023746490478515625,
1168
  "learning_rate": 6.219512195121952e-05,
1169
- "loss": 8.739449549466372e-05,
1170
- "mean_token_accuracy": 1.0,
1171
- "num_tokens": 558474.0,
1172
  "step": 114
1173
  },
1174
  {
1175
- "entropy": 0.35608484130352736,
1176
  "epoch": 1.4042879019908117,
1177
- "grad_norm": 0.0009765625,
1178
  "learning_rate": 6.097560975609756e-05,
1179
- "loss": 0.00013572419993579388,
1180
- "mean_token_accuracy": 1.0,
1181
- "num_tokens": 563962.0,
1182
  "step": 115
1183
  },
1184
  {
1185
- "entropy": 0.3591584851965308,
1186
  "epoch": 1.4165390505359878,
1187
- "grad_norm": 0.00103759765625,
1188
  "learning_rate": 5.975609756097561e-05,
1189
- "loss": 0.0001251319336006418,
1190
- "mean_token_accuracy": 1.0,
1191
- "num_tokens": 568300.0,
1192
  "step": 116
1193
  },
1194
  {
1195
- "entropy": 0.32333058025687933,
1196
  "epoch": 1.4287901990811638,
1197
- "grad_norm": 0.0002803802490234375,
1198
  "learning_rate": 5.853658536585366e-05,
1199
- "loss": 8.771298598730937e-05,
1200
- "mean_token_accuracy": 1.0,
1201
- "num_tokens": 572892.0,
1202
  "step": 117
1203
  },
1204
  {
1205
- "entropy": 0.3675775118172169,
1206
  "epoch": 1.44104134762634,
1207
- "grad_norm": 0.0014495849609375,
1208
  "learning_rate": 5.731707317073171e-05,
1209
- "loss": 0.00014175268006511033,
1210
- "mean_token_accuracy": 1.0,
1211
- "num_tokens": 577889.0,
1212
  "step": 118
1213
  },
1214
  {
1215
- "entropy": 0.37294205371290445,
1216
  "epoch": 1.4532924961715161,
1217
- "grad_norm": 0.00099945068359375,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
- "loss": 8.949499897425994e-05,
1220
- "mean_token_accuracy": 1.0,
1221
- "num_tokens": 583125.0,
1222
  "step": 119
1223
  },
1224
  {
1225
- "entropy": 0.3598701385781169,
1226
  "epoch": 1.4655436447166923,
1227
- "grad_norm": 0.006011962890625,
1228
  "learning_rate": 5.487804878048781e-05,
1229
- "loss": 0.00018555490532889962,
1230
- "mean_token_accuracy": 1.0,
1231
- "num_tokens": 587853.0,
1232
  "step": 120
1233
  },
1234
  {
1235
- "entropy": 0.3222861588001251,
1236
  "epoch": 1.4777947932618682,
1237
- "grad_norm": 0.0174560546875,
1238
  "learning_rate": 5.365853658536586e-05,
1239
- "loss": 0.0032859183847904205,
1240
- "mean_token_accuracy": 0.9993932023644447,
1241
- "num_tokens": 592286.0,
1242
  "step": 121
1243
  },
1244
  {
1245
- "entropy": 0.3423085901886225,
1246
  "epoch": 1.4900459418070444,
1247
- "grad_norm": 0.000354766845703125,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
- "loss": 9.40198588068597e-05,
1250
- "mean_token_accuracy": 1.0,
1251
- "num_tokens": 597048.0,
1252
  "step": 122
1253
  },
1254
  {
1255
- "entropy": 0.3356657065451145,
1256
  "epoch": 1.5022970903522204,
1257
- "grad_norm": 0.00177764892578125,
1258
  "learning_rate": 5.121951219512195e-05,
1259
- "loss": 0.00018200451449956745,
1260
- "mean_token_accuracy": 1.0,
1261
- "num_tokens": 601352.0,
1262
  "step": 123
1263
  },
1264
  {
1265
- "entropy": 0.34760472923517227,
1266
  "epoch": 1.5145482388973965,
1267
- "grad_norm": 0.11181640625,
1268
  "learning_rate": 5e-05,
1269
- "loss": 0.0016977301565930247,
1270
- "mean_token_accuracy": 0.9993686862289906,
1271
- "num_tokens": 606645.0,
1272
  "step": 124
1273
  },
1274
  {
1275
- "entropy": 0.34292006585747004,
1276
  "epoch": 1.5267993874425727,
1277
- "grad_norm": 0.00048828125,
1278
  "learning_rate": 4.878048780487805e-05,
1279
- "loss": 0.00011081612319685519,
1280
- "mean_token_accuracy": 1.0,
1281
- "num_tokens": 612407.0,
1282
  "step": 125
1283
  },
1284
  {
1285
- "entropy": 0.3405891256406903,
1286
  "epoch": 1.5390505359877489,
1287
- "grad_norm": 0.0098876953125,
1288
  "learning_rate": 4.75609756097561e-05,
1289
- "loss": 0.0002546444011386484,
1290
- "mean_token_accuracy": 1.0,
1291
- "num_tokens": 617229.0,
1292
  "step": 126
1293
  },
1294
  {
1295
- "entropy": 0.39804220758378506,
1296
  "epoch": 1.551301684532925,
1297
- "grad_norm": 0.00177001953125,
1298
  "learning_rate": 4.634146341463415e-05,
1299
- "loss": 0.00020191296061966568,
1300
- "mean_token_accuracy": 1.0,
1301
- "num_tokens": 622355.0,
1302
  "step": 127
1303
  },
1304
  {
1305
- "entropy": 0.38183566741645336,
1306
  "epoch": 1.5635528330781012,
1307
- "grad_norm": 0.0020294189453125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
- "loss": 0.0002027210284722969,
1310
- "mean_token_accuracy": 1.0,
1311
- "num_tokens": 627269.0,
1312
  "step": 128
1313
  },
1314
  {
1315
- "entropy": 0.32283751480281353,
1316
  "epoch": 1.5758039816232772,
1317
- "grad_norm": 0.054443359375,
1318
  "learning_rate": 4.390243902439025e-05,
1319
- "loss": 0.0007472627912648022,
1320
- "mean_token_accuracy": 0.9991987161338329,
1321
- "num_tokens": 631454.0,
1322
  "step": 129
1323
  },
1324
  {
1325
- "entropy": 0.31161691434681416,
1326
  "epoch": 1.5880551301684533,
1327
- "grad_norm": 0.00174713134765625,
1328
  "learning_rate": 4.26829268292683e-05,
1329
- "loss": 0.0001439937186660245,
1330
- "mean_token_accuracy": 1.0,
1331
- "num_tokens": 636502.0,
1332
  "step": 130
1333
  },
1334
  {
1335
- "entropy": 0.3435830660164356,
1336
  "epoch": 1.6003062787136293,
1337
- "grad_norm": 0.0308837890625,
1338
  "learning_rate": 4.146341463414634e-05,
1339
- "loss": 0.004759644623845816,
1340
- "mean_token_accuracy": 0.9986401423811913,
1341
- "num_tokens": 641264.0,
1342
  "step": 131
1343
  },
1344
  {
1345
- "entropy": 0.35103026777505875,
1346
  "epoch": 1.6125574272588055,
1347
- "grad_norm": 0.014404296875,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
- "loss": 0.002162080956622958,
1350
- "mean_token_accuracy": 0.9997351691126823,
1351
- "num_tokens": 646377.0,
1352
  "step": 132
1353
  },
1354
  {
1355
- "entropy": 0.2977801924571395,
1356
  "epoch": 1.6248085758039816,
1357
- "grad_norm": 0.0003948211669921875,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
- "loss": 0.00010242296411888674,
1360
- "mean_token_accuracy": 1.0,
1361
- "num_tokens": 650767.0,
1362
  "step": 133
1363
  },
1364
  {
1365
- "entropy": 0.3230333384126425,
1366
  "epoch": 1.6370597243491578,
1367
- "grad_norm": 0.00138092041015625,
1368
  "learning_rate": 3.780487804878049e-05,
1369
- "loss": 0.00015076796989887953,
1370
- "mean_token_accuracy": 1.0,
1371
- "num_tokens": 655169.0,
1372
  "step": 134
1373
  },
1374
  {
1375
- "entropy": 0.341650640591979,
1376
  "epoch": 1.649310872894334,
1377
- "grad_norm": 0.000942230224609375,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
- "loss": 0.00014208458014763892,
1380
- "mean_token_accuracy": 1.0,
1381
- "num_tokens": 660290.0,
1382
  "step": 135
1383
  },
1384
  {
1385
- "entropy": 0.3829786740243435,
1386
  "epoch": 1.66156202143951,
1387
- "grad_norm": 0.00069427490234375,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
- "loss": 0.00014442864630836993,
1390
- "mean_token_accuracy": 1.0,
1391
- "num_tokens": 664473.0,
1392
  "step": 136
1393
  },
1394
  {
1395
- "entropy": 0.36254822462797165,
1396
  "epoch": 1.673813169984686,
1397
- "grad_norm": 0.000873565673828125,
1398
  "learning_rate": 3.414634146341464e-05,
1399
- "loss": 0.00012407865142449737,
1400
- "mean_token_accuracy": 1.0,
1401
- "num_tokens": 669356.0,
1402
  "step": 137
1403
  },
1404
  {
1405
- "entropy": 0.3526885788887739,
1406
  "epoch": 1.686064318529862,
1407
- "grad_norm": 0.01544189453125,
1408
  "learning_rate": 3.292682926829269e-05,
1409
- "loss": 0.0013645780272781849,
1410
- "mean_token_accuracy": 1.0,
1411
- "num_tokens": 674911.0,
1412
  "step": 138
1413
  },
1414
  {
1415
- "entropy": 0.3426882065832615,
1416
  "epoch": 1.6983154670750382,
1417
- "grad_norm": 0.00136566162109375,
1418
  "learning_rate": 3.170731707317073e-05,
1419
- "loss": 0.00017942595877684653,
1420
- "mean_token_accuracy": 1.0,
1421
- "num_tokens": 679692.0,
1422
  "step": 139
1423
  },
1424
  {
1425
- "entropy": 0.36831479519605637,
1426
  "epoch": 1.7105666156202144,
1427
- "grad_norm": 0.01104736328125,
1428
  "learning_rate": 3.048780487804878e-05,
1429
- "loss": 0.00024098601716104895,
1430
- "mean_token_accuracy": 1.0,
1431
- "num_tokens": 685048.0,
1432
  "step": 140
1433
  },
1434
  {
1435
- "entropy": 0.3340944442898035,
1436
  "epoch": 1.7228177641653906,
1437
- "grad_norm": 0.000865936279296875,
1438
  "learning_rate": 2.926829268292683e-05,
1439
- "loss": 0.00013921498612035066,
1440
- "mean_token_accuracy": 1.0,
1441
- "num_tokens": 689396.0,
1442
  "step": 141
1443
  },
1444
  {
1445
- "entropy": 0.34801830537617207,
1446
  "epoch": 1.7350689127105667,
1447
- "grad_norm": 0.000965118408203125,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
- "loss": 0.0001655905944062397,
1450
- "mean_token_accuracy": 1.0,
1451
- "num_tokens": 693189.0,
1452
  "step": 142
1453
  },
1454
  {
1455
- "entropy": 0.35556044429540634,
1456
  "epoch": 1.7473200612557427,
1457
- "grad_norm": 0.0019073486328125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
- "loss": 0.00019044376676902175,
1460
- "mean_token_accuracy": 1.0,
1461
- "num_tokens": 697603.0,
1462
  "step": 143
1463
  },
1464
  {
1465
- "entropy": 0.3632572125643492,
1466
  "epoch": 1.7595712098009189,
1467
- "grad_norm": 0.00106048583984375,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
- "loss": 0.00017029076116159558,
1470
- "mean_token_accuracy": 1.0,
1471
- "num_tokens": 703050.0,
1472
  "step": 144
1473
  },
1474
  {
1475
- "entropy": 0.35750158317387104,
1476
  "epoch": 1.7718223583460948,
1477
- "grad_norm": 0.0130615234375,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
- "loss": 0.0015582278138026595,
1480
- "mean_token_accuracy": 0.999015748500824,
1481
- "num_tokens": 707862.0,
1482
  "step": 145
1483
  },
1484
  {
1485
- "entropy": 0.36597106605768204,
1486
  "epoch": 1.784073506891271,
1487
- "grad_norm": 0.000583648681640625,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
- "loss": 0.00013483221118804067,
1490
- "mean_token_accuracy": 1.0,
1491
- "num_tokens": 712821.0,
1492
  "step": 146
1493
  },
1494
  {
1495
- "entropy": 0.35171396005898714,
1496
  "epoch": 1.7963246554364471,
1497
- "grad_norm": 0.00157928466796875,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
- "loss": 0.00012708510621450841,
1500
- "mean_token_accuracy": 1.0,
1501
- "num_tokens": 718453.0,
1502
  "step": 147
1503
  },
1504
  {
1505
- "entropy": 0.3596025314182043,
1506
  "epoch": 1.8085758039816233,
1507
- "grad_norm": 0.000762939453125,
1508
  "learning_rate": 2.073170731707317e-05,
1509
- "loss": 0.00011665250349324197,
1510
- "mean_token_accuracy": 1.0,
1511
- "num_tokens": 723810.0,
1512
  "step": 148
1513
  },
1514
  {
1515
- "entropy": 0.3876404408365488,
1516
  "epoch": 1.8208269525267995,
1517
- "grad_norm": 0.00170135498046875,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
- "loss": 0.00014468679728452116,
1520
- "mean_token_accuracy": 1.0,
1521
- "num_tokens": 728126.0,
1522
  "step": 149
1523
  },
1524
  {
1525
- "entropy": 0.3599753547459841,
1526
  "epoch": 1.8330781010719757,
1527
- "grad_norm": 0.00982666015625,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
- "loss": 0.0008729367982596159,
1530
- "mean_token_accuracy": 0.9996936284005642,
1531
- "num_tokens": 733917.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
- "eval_entropy": 0.3504998228256253,
1537
- "eval_loss": 0.0005272864946164191,
1538
- "eval_mean_token_accuracy": 0.9998166846192401,
1539
- "eval_num_tokens": 733917.0,
1540
- "eval_runtime": 51.0847,
1541
- "eval_samples_per_second": 1.351,
1542
- "eval_steps_per_second": 1.351,
1543
  "step": 150
1544
  },
1545
  {
1546
- "entropy": 0.33359322790056467,
1547
  "epoch": 1.8453292496171516,
1548
- "grad_norm": 0.025146484375,
1549
  "learning_rate": 1.707317073170732e-05,
1550
- "loss": 0.0006189637933857739,
1551
- "mean_token_accuracy": 0.9997438527643681,
1552
- "num_tokens": 738160.0,
1553
  "step": 151
1554
  },
1555
  {
1556
- "entropy": 0.3766339849680662,
1557
  "epoch": 1.8575803981623276,
1558
- "grad_norm": 0.00141143798828125,
1559
  "learning_rate": 1.5853658536585366e-05,
1560
- "loss": 0.0001552235771669075,
1561
- "mean_token_accuracy": 1.0,
1562
- "num_tokens": 743916.0,
1563
  "step": 152
1564
  },
1565
  {
1566
- "entropy": 0.3593102600425482,
1567
  "epoch": 1.8698315467075037,
1568
- "grad_norm": 0.0005035400390625,
1569
  "learning_rate": 1.4634146341463415e-05,
1570
- "loss": 0.00010784749611048028,
1571
- "mean_token_accuracy": 1.0,
1572
- "num_tokens": 749557.0,
1573
  "step": 153
1574
  },
1575
  {
1576
- "entropy": 0.35354321263730526,
1577
  "epoch": 1.88208269525268,
1578
- "grad_norm": 0.006622314453125,
1579
  "learning_rate": 1.3414634146341466e-05,
1580
- "loss": 0.00018536817515268922,
1581
- "mean_token_accuracy": 1.0,
1582
- "num_tokens": 755349.0,
1583
  "step": 154
1584
  },
1585
  {
1586
- "entropy": 0.38418914191424847,
1587
  "epoch": 1.894333843797856,
1588
- "grad_norm": 0.0048828125,
1589
  "learning_rate": 1.2195121951219513e-05,
1590
- "loss": 0.00016054415027610958,
1591
- "mean_token_accuracy": 1.0,
1592
- "num_tokens": 760507.0,
1593
  "step": 155
1594
  },
1595
  {
1596
- "entropy": 0.36554452031850815,
1597
  "epoch": 1.9065849923430322,
1598
- "grad_norm": 0.000640869140625,
1599
  "learning_rate": 1.0975609756097562e-05,
1600
- "loss": 0.0001385942887281999,
1601
- "mean_token_accuracy": 1.0,
1602
- "num_tokens": 765415.0,
1603
  "step": 156
1604
  },
1605
  {
1606
- "entropy": 0.3568859798833728,
1607
  "epoch": 1.9188361408882084,
1608
- "grad_norm": 0.005401611328125,
1609
  "learning_rate": 9.756097560975611e-06,
1610
- "loss": 0.00035842141369357705,
1611
- "mean_token_accuracy": 1.0,
1612
- "num_tokens": 770652.0,
1613
  "step": 157
1614
  },
1615
  {
1616
- "entropy": 0.3693056581541896,
1617
  "epoch": 1.9310872894333844,
1618
- "grad_norm": 0.0006561279296875,
1619
  "learning_rate": 8.53658536585366e-06,
1620
- "loss": 0.00012641935609281063,
1621
- "mean_token_accuracy": 1.0,
1622
- "num_tokens": 775652.0,
1623
  "step": 158
1624
  },
1625
  {
1626
- "entropy": 0.3441598182544112,
1627
  "epoch": 1.9433384379785605,
1628
- "grad_norm": 0.0030364990234375,
1629
  "learning_rate": 7.317073170731707e-06,
1630
- "loss": 0.00021011351782362908,
1631
- "mean_token_accuracy": 1.0,
1632
- "num_tokens": 779850.0,
1633
  "step": 159
1634
  },
1635
  {
1636
- "entropy": 0.3504209266975522,
1637
  "epoch": 1.9555895865237365,
1638
- "grad_norm": 0.0021514892578125,
1639
  "learning_rate": 6.0975609756097564e-06,
1640
- "loss": 0.0001835815783124417,
1641
- "mean_token_accuracy": 1.0,
1642
- "num_tokens": 784524.0,
1643
  "step": 160
1644
  },
1645
  {
1646
- "entropy": 0.3750592265278101,
1647
  "epoch": 1.9678407350689127,
1648
- "grad_norm": 0.000492095947265625,
1649
  "learning_rate": 4.8780487804878055e-06,
1650
- "loss": 0.00013111173757351935,
1651
- "mean_token_accuracy": 1.0,
1652
- "num_tokens": 790219.0,
1653
  "step": 161
1654
  },
1655
  {
1656
- "entropy": 0.35673493705689907,
1657
  "epoch": 1.9800918836140888,
1658
- "grad_norm": 0.003204345703125,
1659
  "learning_rate": 3.6585365853658537e-06,
1660
- "loss": 0.00022010535758454353,
1661
- "mean_token_accuracy": 1.0,
1662
- "num_tokens": 794984.0,
1663
  "step": 162
1664
  },
1665
  {
1666
- "entropy": 0.3832458099350333,
1667
  "epoch": 1.992343032159265,
1668
- "grad_norm": 0.00061798095703125,
1669
  "learning_rate": 2.4390243902439027e-06,
1670
- "loss": 0.00012069179501850158,
1671
- "mean_token_accuracy": 1.0,
1672
- "num_tokens": 800604.0,
1673
  "step": 163
1674
  },
1675
  {
1676
- "entropy": 0.37493912875652313,
1677
  "epoch": 2.0,
1678
- "grad_norm": 0.023193359375,
1679
  "learning_rate": 1.2195121951219514e-06,
1680
- "loss": 0.002225137548521161,
1681
- "mean_token_accuracy": 0.999763035774231,
1682
- "num_tokens": 804260.0,
1683
  "step": 164
1684
  }
1685
  ],
@@ -1700,7 +1700,7 @@
1700
  "attributes": {}
1701
  }
1702
  },
1703
- "total_flos": 3.641786295631872e+16,
1704
  "train_batch_size": 1,
1705
  "trial_name": null,
1706
  "trial_params": null
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 0.3041980676352978,
14
  "epoch": 0.01225114854517611,
15
+ "grad_norm": 0.65234375,
16
  "learning_rate": 0.0002,
17
+ "loss": 0.12987954914569855,
18
+ "mean_token_accuracy": 0.9616314880549908,
19
+ "num_tokens": 6158.0,
20
  "step": 1
21
  },
22
  {
23
+ "entropy": 0.3288959041237831,
24
  "epoch": 0.02450229709035222,
25
+ "grad_norm": 0.35546875,
26
  "learning_rate": 0.00019878048780487805,
27
+ "loss": 0.07780474424362183,
28
+ "mean_token_accuracy": 0.9729398302733898,
29
+ "num_tokens": 11874.0,
30
  "step": 2
31
  },
32
  {
33
+ "entropy": 0.36142856534570456,
34
  "epoch": 0.036753445635528334,
35
+ "grad_norm": 0.359375,
36
  "learning_rate": 0.0001975609756097561,
37
+ "loss": 0.08096732199192047,
38
+ "mean_token_accuracy": 0.9701218046247959,
39
+ "num_tokens": 17155.0,
40
  "step": 3
41
  },
42
  {
43
+ "entropy": 0.3489781664684415,
44
  "epoch": 0.04900459418070444,
45
+ "grad_norm": 0.2890625,
46
  "learning_rate": 0.00019634146341463416,
47
+ "loss": 0.07328949123620987,
48
+ "mean_token_accuracy": 0.9726539552211761,
49
+ "num_tokens": 22369.0,
50
  "step": 4
51
  },
52
  {
53
+ "entropy": 0.3293187078088522,
54
  "epoch": 0.06125574272588055,
55
+ "grad_norm": 0.248046875,
56
  "learning_rate": 0.0001951219512195122,
57
+ "loss": 0.0884413868188858,
58
+ "mean_token_accuracy": 0.9707589820027351,
59
+ "num_tokens": 28747.0,
60
  "step": 5
61
  },
62
  {
63
+ "entropy": 0.339785429649055,
64
  "epoch": 0.07350689127105667,
65
+ "grad_norm": 0.5390625,
66
  "learning_rate": 0.00019390243902439025,
67
+ "loss": 0.09005022048950195,
68
+ "mean_token_accuracy": 0.9711229205131531,
69
+ "num_tokens": 34001.0,
70
  "step": 6
71
  },
72
  {
73
+ "entropy": 0.3187539605423808,
74
  "epoch": 0.08575803981623277,
75
+ "grad_norm": 0.357421875,
76
  "learning_rate": 0.0001926829268292683,
77
+ "loss": 0.08771149814128876,
78
+ "mean_token_accuracy": 0.9713485687971115,
79
+ "num_tokens": 40149.0,
80
  "step": 7
81
  },
82
  {
83
+ "entropy": 0.30580494925379753,
84
  "epoch": 0.09800918836140889,
85
+ "grad_norm": 0.294921875,
86
  "learning_rate": 0.00019146341463414633,
87
+ "loss": 0.07060129195451736,
88
+ "mean_token_accuracy": 0.9733475148677826,
89
+ "num_tokens": 45006.0,
90
  "step": 8
91
  },
92
  {
93
+ "entropy": 0.2799514262005687,
94
  "epoch": 0.11026033690658499,
95
+ "grad_norm": 0.318359375,
96
  "learning_rate": 0.0001902439024390244,
97
+ "loss": 0.08461187779903412,
98
+ "mean_token_accuracy": 0.9710647016763687,
99
+ "num_tokens": 51060.0,
100
  "step": 9
101
  },
102
  {
103
+ "entropy": 0.28654457721859217,
104
  "epoch": 0.1225114854517611,
105
+ "grad_norm": 0.341796875,
106
  "learning_rate": 0.00018902439024390244,
107
+ "loss": 0.08227542042732239,
108
+ "mean_token_accuracy": 0.9692031815648079,
109
+ "num_tokens": 56583.0,
110
  "step": 10
111
  },
112
  {
113
+ "entropy": 0.26793921180069447,
114
  "epoch": 0.13476263399693722,
115
+ "grad_norm": 0.259765625,
116
  "learning_rate": 0.0001878048780487805,
117
+ "loss": 0.07852551341056824,
118
+ "mean_token_accuracy": 0.9747273214161396,
119
+ "num_tokens": 63136.0,
120
  "step": 11
121
  },
122
  {
123
+ "entropy": 0.30183705035597086,
124
  "epoch": 0.14701378254211334,
125
+ "grad_norm": 0.279296875,
126
  "learning_rate": 0.00018658536585365856,
127
+ "loss": 0.07575420290231705,
128
+ "mean_token_accuracy": 0.9726979620754719,
129
+ "num_tokens": 69604.0,
130
  "step": 12
131
  },
132
  {
133
+ "entropy": 0.2680633468553424,
134
  "epoch": 0.15926493108728942,
135
+ "grad_norm": 0.30078125,
136
  "learning_rate": 0.0001853658536585366,
137
+ "loss": 0.07610919326543808,
138
+ "mean_token_accuracy": 0.9748654179275036,
139
+ "num_tokens": 75871.0,
140
  "step": 13
141
  },
142
  {
143
+ "entropy": 0.27854922134429216,
144
  "epoch": 0.17151607963246554,
145
+ "grad_norm": 0.41015625,
146
  "learning_rate": 0.00018414634146341464,
147
+ "loss": 0.09525731950998306,
148
+ "mean_token_accuracy": 0.9607353135943413,
149
+ "num_tokens": 81257.0,
150
  "step": 14
151
  },
152
  {
153
+ "entropy": 0.27795467525720596,
154
  "epoch": 0.18376722817764166,
155
+ "grad_norm": 0.318359375,
156
  "learning_rate": 0.0001829268292682927,
157
+ "loss": 0.06710757315158844,
158
+ "mean_token_accuracy": 0.9751962497830391,
159
+ "num_tokens": 85769.0,
160
  "step": 15
161
  },
162
  {
163
+ "entropy": 0.2635908415541053,
164
  "epoch": 0.19601837672281777,
165
+ "grad_norm": 0.439453125,
166
  "learning_rate": 0.00018170731707317075,
167
+ "loss": 0.08850108832120895,
168
+ "mean_token_accuracy": 0.9715681448578835,
169
+ "num_tokens": 91023.0,
170
  "step": 16
171
  },
172
  {
173
+ "entropy": 0.26509028300642967,
174
  "epoch": 0.2082695252679939,
175
+ "grad_norm": 0.271484375,
176
  "learning_rate": 0.0001804878048780488,
177
+ "loss": 0.07479405403137207,
178
+ "mean_token_accuracy": 0.9715253114700317,
179
+ "num_tokens": 96339.0,
180
  "step": 17
181
  },
182
  {
183
+ "entropy": 0.2699039001017809,
184
  "epoch": 0.22052067381316998,
185
+ "grad_norm": 0.353515625,
186
  "learning_rate": 0.00017926829268292684,
187
+ "loss": 0.07519614696502686,
188
+ "mean_token_accuracy": 0.9718605615198612,
189
+ "num_tokens": 101938.0,
190
  "step": 18
191
  },
192
  {
193
+ "entropy": 0.2809357335790992,
194
  "epoch": 0.2327718223583461,
195
+ "grad_norm": 0.3046875,
196
  "learning_rate": 0.00017804878048780488,
197
+ "loss": 0.08889807015657425,
198
+ "mean_token_accuracy": 0.9662206135690212,
199
+ "num_tokens": 107971.0,
200
  "step": 19
201
  },
202
  {
203
+ "entropy": 0.29283443558961153,
204
  "epoch": 0.2450229709035222,
205
+ "grad_norm": 0.3359375,
206
  "learning_rate": 0.00017682926829268295,
207
+ "loss": 0.07666820287704468,
208
+ "mean_token_accuracy": 0.9718802459537983,
209
+ "num_tokens": 113005.0,
210
  "step": 20
211
  },
212
  {
213
+ "entropy": 0.29593323450535536,
214
  "epoch": 0.2572741194486983,
215
+ "grad_norm": 0.48046875,
216
  "learning_rate": 0.000175609756097561,
217
+ "loss": 0.0903002992272377,
218
+ "mean_token_accuracy": 0.9660120271146297,
219
+ "num_tokens": 118993.0,
220
  "step": 21
221
  },
222
  {
223
+ "entropy": 0.2751638563349843,
224
  "epoch": 0.26952526799387444,
225
+ "grad_norm": 0.27734375,
226
  "learning_rate": 0.00017439024390243903,
227
+ "loss": 0.09172362089157104,
228
+ "mean_token_accuracy": 0.9718118757009506,
229
+ "num_tokens": 125931.0,
230
  "step": 22
231
  },
232
  {
233
+ "entropy": 0.32398509234189987,
234
  "epoch": 0.28177641653905056,
235
+ "grad_norm": 0.41796875,
236
  "learning_rate": 0.00017317073170731708,
237
+ "loss": 0.08742143213748932,
238
+ "mean_token_accuracy": 0.9695158265531063,
239
+ "num_tokens": 130991.0,
240
  "step": 23
241
  },
242
  {
243
+ "entropy": 0.33191137574613094,
244
  "epoch": 0.29402756508422667,
245
+ "grad_norm": 0.322265625,
246
  "learning_rate": 0.00017195121951219512,
247
+ "loss": 0.0861031711101532,
248
+ "mean_token_accuracy": 0.9639297090470791,
249
+ "num_tokens": 136879.0,
250
  "step": 24
251
  },
252
  {
253
+ "entropy": 0.3045969307422638,
254
  "epoch": 0.30627871362940273,
255
+ "grad_norm": 0.263671875,
256
  "learning_rate": 0.0001707317073170732,
257
+ "loss": 0.07694194465875626,
258
+ "mean_token_accuracy": 0.9716509021818638,
259
+ "num_tokens": 142190.0,
260
  "step": 25
261
  },
262
  {
263
+ "entropy": 0.3009780840948224,
264
  "epoch": 0.31852986217457885,
265
+ "grad_norm": 0.33203125,
266
  "learning_rate": 0.00016951219512195123,
267
+ "loss": 0.0928640067577362,
268
+ "mean_token_accuracy": 0.9631010964512825,
269
+ "num_tokens": 147659.0,
270
  "step": 26
271
  },
272
  {
273
+ "entropy": 0.26603397261351347,
274
  "epoch": 0.33078101071975496,
275
+ "grad_norm": 0.255859375,
276
  "learning_rate": 0.00016829268292682927,
277
+ "loss": 0.07426867634057999,
278
+ "mean_token_accuracy": 0.9714149422943592,
279
+ "num_tokens": 153293.0,
280
  "step": 27
281
  },
282
  {
283
+ "entropy": 0.3046340309083462,
284
  "epoch": 0.3430321592649311,
285
+ "grad_norm": 0.275390625,
286
  "learning_rate": 0.00016707317073170731,
287
+ "loss": 0.0794563814997673,
288
+ "mean_token_accuracy": 0.9729922078549862,
289
+ "num_tokens": 159233.0,
290
  "step": 28
291
  },
292
  {
293
+ "entropy": 0.3388302018865943,
294
  "epoch": 0.3552833078101072,
295
+ "grad_norm": 0.330078125,
296
  "learning_rate": 0.00016585365853658536,
297
+ "loss": 0.09019184857606888,
298
+ "mean_token_accuracy": 0.9697935245931149,
299
+ "num_tokens": 164503.0,
300
  "step": 29
301
  },
302
  {
303
+ "entropy": 0.3342750547453761,
304
  "epoch": 0.3675344563552833,
305
+ "grad_norm": 0.294921875,
306
  "learning_rate": 0.00016463414634146343,
307
+ "loss": 0.09340573102235794,
308
+ "mean_token_accuracy": 0.9605537690222263,
309
+ "num_tokens": 170523.0,
310
  "step": 30
311
  },
312
  {
313
+ "entropy": 0.30620657559484243,
314
  "epoch": 0.37978560490045943,
315
+ "grad_norm": 0.345703125,
316
  "learning_rate": 0.00016341463414634147,
317
+ "loss": 0.07664323598146439,
318
+ "mean_token_accuracy": 0.9681061618030071,
319
+ "num_tokens": 174928.0,
320
  "step": 31
321
  },
322
  {
323
+ "entropy": 0.30496103409677744,
324
  "epoch": 0.39203675344563554,
325
+ "grad_norm": 0.2490234375,
326
  "learning_rate": 0.00016219512195121954,
327
+ "loss": 0.07825497537851334,
328
+ "mean_token_accuracy": 0.9774314574897289,
329
+ "num_tokens": 181701.0,
330
  "step": 32
331
  },
332
  {
333
+ "entropy": 0.2976566730067134,
334
  "epoch": 0.40428790199081166,
335
+ "grad_norm": 0.27734375,
336
  "learning_rate": 0.00016097560975609758,
337
+ "loss": 0.09027402102947235,
338
+ "mean_token_accuracy": 0.9658169783651829,
339
+ "num_tokens": 187412.0,
340
  "step": 33
341
  },
342
  {
343
+ "entropy": 0.27866631746292114,
344
  "epoch": 0.4165390505359878,
345
+ "grad_norm": 0.251953125,
346
  "learning_rate": 0.00015975609756097562,
347
+ "loss": 0.08274199813604355,
348
+ "mean_token_accuracy": 0.9746941477060318,
349
+ "num_tokens": 192875.0,
350
  "step": 34
351
  },
352
  {
353
+ "entropy": 0.2830528961494565,
354
  "epoch": 0.42879019908116384,
355
+ "grad_norm": 0.30859375,
356
  "learning_rate": 0.00015853658536585366,
357
+ "loss": 0.09378398954868317,
358
+ "mean_token_accuracy": 0.9666311629116535,
359
+ "num_tokens": 198242.0,
360
  "step": 35
361
  },
362
  {
363
+ "entropy": 0.27817794494330883,
364
  "epoch": 0.44104134762633995,
365
+ "grad_norm": 0.37890625,
366
  "learning_rate": 0.00015731707317073173,
367
+ "loss": 0.0867948904633522,
368
+ "mean_token_accuracy": 0.9669017046689987,
369
+ "num_tokens": 203345.0,
370
  "step": 36
371
  },
372
  {
373
+ "entropy": 0.24728088174015284,
374
  "epoch": 0.45329249617151607,
375
+ "grad_norm": 0.2734375,
376
  "learning_rate": 0.00015609756097560978,
377
+ "loss": 0.0845772847533226,
378
+ "mean_token_accuracy": 0.9704948216676712,
379
+ "num_tokens": 209548.0,
380
  "step": 37
381
  },
382
  {
383
+ "entropy": 0.2514335783198476,
384
  "epoch": 0.4655436447166922,
385
+ "grad_norm": 0.296875,
386
  "learning_rate": 0.00015487804878048782,
387
+ "loss": 0.08028042316436768,
388
+ "mean_token_accuracy": 0.9699894711375237,
389
+ "num_tokens": 215104.0,
390
  "step": 38
391
  },
392
  {
393
+ "entropy": 0.2479592664167285,
394
  "epoch": 0.4777947932618683,
395
+ "grad_norm": 0.34375,
396
  "learning_rate": 0.00015365853658536586,
397
+ "loss": 0.0773642361164093,
398
+ "mean_token_accuracy": 0.9734528213739395,
399
+ "num_tokens": 220401.0,
400
  "step": 39
401
  },
402
  {
403
+ "entropy": 0.28870310448110104,
404
  "epoch": 0.4900459418070444,
405
+ "grad_norm": 0.404296875,
406
  "learning_rate": 0.0001524390243902439,
407
+ "loss": 0.086701899766922,
408
+ "mean_token_accuracy": 0.9637217558920383,
409
+ "num_tokens": 225652.0,
410
  "step": 40
411
  },
412
  {
413
+ "entropy": 0.24501706194132566,
414
  "epoch": 0.5022970903522205,
415
+ "grad_norm": 0.25390625,
416
  "learning_rate": 0.00015121951219512197,
417
+ "loss": 0.07521235942840576,
418
+ "mean_token_accuracy": 0.9753581583499908,
419
+ "num_tokens": 230583.0,
420
  "step": 41
421
  },
422
  {
423
+ "entropy": 0.28654969297349453,
424
  "epoch": 0.5145482388973966,
425
+ "grad_norm": 0.39453125,
426
  "learning_rate": 0.00015000000000000001,
427
+ "loss": 0.09706442058086395,
428
+ "mean_token_accuracy": 0.9664160050451756,
429
+ "num_tokens": 235457.0,
430
  "step": 42
431
  },
432
  {
433
+ "entropy": 0.28464407846331596,
434
  "epoch": 0.5267993874425727,
435
+ "grad_norm": 0.306640625,
436
  "learning_rate": 0.00014878048780487806,
437
+ "loss": 0.07722343504428864,
438
+ "mean_token_accuracy": 0.9730902686715126,
439
+ "num_tokens": 241513.0,
440
  "step": 43
441
  },
442
  {
443
+ "entropy": 0.30445601511746645,
444
  "epoch": 0.5390505359877489,
445
+ "grad_norm": 0.310546875,
446
  "learning_rate": 0.0001475609756097561,
447
+ "loss": 0.0771762803196907,
448
+ "mean_token_accuracy": 0.975468497723341,
449
+ "num_tokens": 246914.0,
450
  "step": 44
451
  },
452
  {
453
+ "entropy": 0.26495161652565,
454
  "epoch": 0.5513016845329249,
455
+ "grad_norm": 0.25390625,
456
  "learning_rate": 0.00014634146341463414,
457
+ "loss": 0.0772022157907486,
458
+ "mean_token_accuracy": 0.9669279642403126,
459
+ "num_tokens": 252479.0,
460
  "step": 45
461
  },
462
  {
463
+ "entropy": 0.28918597288429737,
464
  "epoch": 0.5635528330781011,
465
+ "grad_norm": 0.294921875,
466
  "learning_rate": 0.0001451219512195122,
467
+ "loss": 0.08781749755144119,
468
+ "mean_token_accuracy": 0.9674229696393013,
469
+ "num_tokens": 257947.0,
470
  "step": 46
471
  },
472
  {
473
+ "entropy": 0.2856784025207162,
474
  "epoch": 0.5758039816232772,
475
+ "grad_norm": 0.267578125,
476
  "learning_rate": 0.00014390243902439025,
477
+ "loss": 0.0642290860414505,
478
+ "mean_token_accuracy": 0.9745214283466339,
479
+ "num_tokens": 263220.0,
480
  "step": 47
481
  },
482
  {
483
+ "entropy": 0.2811201810836792,
484
  "epoch": 0.5880551301684533,
485
+ "grad_norm": 0.34765625,
486
  "learning_rate": 0.0001426829268292683,
487
+ "loss": 0.08826867491006851,
488
+ "mean_token_accuracy": 0.9674578756093979,
489
+ "num_tokens": 268072.0,
490
  "step": 48
491
  },
492
  {
493
+ "entropy": 0.3114980049431324,
494
  "epoch": 0.6003062787136294,
495
+ "grad_norm": 0.380859375,
496
  "learning_rate": 0.00014146341463414634,
497
+ "loss": 0.08005333691835403,
498
+ "mean_token_accuracy": 0.9698234163224697,
499
+ "num_tokens": 273245.0,
500
  "step": 49
501
  },
502
  {
503
+ "entropy": 0.2891535870730877,
504
  "epoch": 0.6125574272588055,
505
+ "grad_norm": 0.2734375,
506
  "learning_rate": 0.00014024390243902438,
507
+ "loss": 0.06931450217962265,
508
+ "mean_token_accuracy": 0.9755596853792667,
509
+ "num_tokens": 278455.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
+ "eval_entropy": 0.2854771510414455,
515
+ "eval_loss": 0.07233226299285889,
516
+ "eval_mean_token_accuracy": 0.9710306654805723,
517
+ "eval_num_tokens": 278455.0,
518
+ "eval_runtime": 57.5345,
519
+ "eval_samples_per_second": 1.199,
520
+ "eval_steps_per_second": 1.199,
521
  "step": 50
522
  },
523
  {
524
+ "entropy": 0.2805565893650055,
525
  "epoch": 0.6248085758039816,
526
+ "grad_norm": 0.283203125,
527
  "learning_rate": 0.00013902439024390245,
528
+ "loss": 0.07015535980463028,
529
+ "mean_token_accuracy": 0.9736857265233994,
530
+ "num_tokens": 283905.0,
531
  "step": 51
532
  },
533
  {
534
+ "entropy": 0.2953841704875231,
535
  "epoch": 0.6370597243491577,
536
+ "grad_norm": 0.326171875,
537
  "learning_rate": 0.0001378048780487805,
538
+ "loss": 0.08345313370227814,
539
+ "mean_token_accuracy": 0.9668012037873268,
540
+ "num_tokens": 289621.0,
541
  "step": 52
542
  },
543
  {
544
+ "entropy": 0.2993172137066722,
545
  "epoch": 0.6493108728943339,
546
+ "grad_norm": 0.330078125,
547
  "learning_rate": 0.00013658536585365856,
548
+ "loss": 0.06988305598497391,
549
+ "mean_token_accuracy": 0.9744048714637756,
550
+ "num_tokens": 295589.0,
551
  "step": 53
552
  },
553
  {
554
+ "entropy": 0.24585585854947567,
555
  "epoch": 0.6615620214395099,
556
+ "grad_norm": 0.326171875,
557
  "learning_rate": 0.0001353658536585366,
558
+ "loss": 0.08018705993890762,
559
+ "mean_token_accuracy": 0.97354631498456,
560
+ "num_tokens": 301123.0,
561
  "step": 54
562
  },
563
  {
564
+ "entropy": 0.2925192918628454,
565
  "epoch": 0.6738131699846861,
566
+ "grad_norm": 0.310546875,
567
  "learning_rate": 0.00013414634146341464,
568
+ "loss": 0.08674345165491104,
569
+ "mean_token_accuracy": 0.9711452201008797,
570
+ "num_tokens": 307068.0,
571
  "step": 55
572
  },
573
  {
574
+ "entropy": 0.25901074800640345,
575
  "epoch": 0.6860643185298622,
576
+ "grad_norm": 0.28515625,
577
  "learning_rate": 0.0001329268292682927,
578
+ "loss": 0.08070839941501617,
579
+ "mean_token_accuracy": 0.9729307927191257,
580
+ "num_tokens": 312510.0,
581
  "step": 56
582
  },
583
  {
584
+ "entropy": 0.29548987187445164,
585
  "epoch": 0.6983154670750383,
586
+ "grad_norm": 0.34375,
587
  "learning_rate": 0.00013170731707317076,
588
+ "loss": 0.08920362591743469,
589
+ "mean_token_accuracy": 0.9669180549681187,
590
+ "num_tokens": 318484.0,
591
  "step": 57
592
  },
593
  {
594
+ "entropy": 0.2568454071879387,
595
  "epoch": 0.7105666156202144,
596
+ "grad_norm": 0.2734375,
597
  "learning_rate": 0.0001304878048780488,
598
+ "loss": 0.07155663520097733,
599
+ "mean_token_accuracy": 0.9701582230627537,
600
+ "num_tokens": 324813.0,
601
  "step": 58
602
  },
603
  {
604
+ "entropy": 0.2808335982263088,
605
  "epoch": 0.7228177641653905,
606
+ "grad_norm": 0.283203125,
607
  "learning_rate": 0.00012926829268292684,
608
+ "loss": 0.07705684751272202,
609
+ "mean_token_accuracy": 0.966577123850584,
610
+ "num_tokens": 330011.0,
611
  "step": 59
612
  },
613
  {
614
+ "entropy": 0.2699971180409193,
615
  "epoch": 0.7350689127105666,
616
+ "grad_norm": 0.39453125,
617
  "learning_rate": 0.00012804878048780488,
618
+ "loss": 0.10086975991725922,
619
+ "mean_token_accuracy": 0.9665980078279972,
620
+ "num_tokens": 335902.0,
621
  "step": 60
622
  },
623
  {
624
+ "entropy": 0.2485162764787674,
625
  "epoch": 0.7473200612557427,
626
+ "grad_norm": 0.310546875,
627
  "learning_rate": 0.00012682926829268293,
628
+ "loss": 0.08342916518449783,
629
+ "mean_token_accuracy": 0.9685300663113594,
630
+ "num_tokens": 342624.0,
631
  "step": 61
632
  },
633
  {
634
+ "entropy": 0.3012225143611431,
635
  "epoch": 0.7595712098009189,
636
+ "grad_norm": 0.2490234375,
637
  "learning_rate": 0.000125609756097561,
638
+ "loss": 0.06536269932985306,
639
+ "mean_token_accuracy": 0.9756054095923901,
640
+ "num_tokens": 348764.0,
641
  "step": 62
642
  },
643
  {
644
+ "entropy": 0.26388413086533546,
645
  "epoch": 0.7718223583460949,
646
+ "grad_norm": 0.384765625,
647
  "learning_rate": 0.00012439024390243904,
648
+ "loss": 0.06917200982570648,
649
+ "mean_token_accuracy": 0.9760353714227676,
650
+ "num_tokens": 353717.0,
651
  "step": 63
652
  },
653
  {
654
+ "entropy": 0.26560324616730213,
655
  "epoch": 0.7840735068912711,
656
+ "grad_norm": 0.341796875,
657
  "learning_rate": 0.00012317073170731708,
658
+ "loss": 0.07885865867137909,
659
+ "mean_token_accuracy": 0.9693707525730133,
660
+ "num_tokens": 358700.0,
661
  "step": 64
662
  },
663
  {
664
+ "entropy": 0.2870019916445017,
665
  "epoch": 0.7963246554364471,
666
+ "grad_norm": 0.296875,
667
  "learning_rate": 0.00012195121951219512,
668
+ "loss": 0.07569920271635056,
669
+ "mean_token_accuracy": 0.9747120216488838,
670
+ "num_tokens": 364367.0,
671
  "step": 65
672
  },
673
  {
674
+ "entropy": 0.30871331319212914,
675
  "epoch": 0.8085758039816233,
676
+ "grad_norm": 0.39453125,
677
  "learning_rate": 0.00012073170731707318,
678
+ "loss": 0.07961063086986542,
679
+ "mean_token_accuracy": 0.9741152077913284,
680
+ "num_tokens": 370191.0,
681
  "step": 66
682
  },
683
  {
684
+ "entropy": 0.25089073460549116,
685
  "epoch": 0.8208269525267994,
686
+ "grad_norm": 0.275390625,
687
  "learning_rate": 0.00011951219512195122,
688
+ "loss": 0.06939976662397385,
689
+ "mean_token_accuracy": 0.9737464673817158,
690
+ "num_tokens": 376432.0,
691
  "step": 67
692
  },
693
  {
694
+ "entropy": 0.2964933030307293,
695
  "epoch": 0.8330781010719756,
696
+ "grad_norm": 0.30859375,
697
  "learning_rate": 0.00011829268292682926,
698
+ "loss": 0.06059417501091957,
699
+ "mean_token_accuracy": 0.9753321446478367,
700
+ "num_tokens": 381184.0,
701
  "step": 68
702
  },
703
  {
704
+ "entropy": 0.29217866342514753,
705
  "epoch": 0.8453292496171516,
706
+ "grad_norm": 0.3046875,
707
  "learning_rate": 0.00011707317073170732,
708
+ "loss": 0.0808170959353447,
709
+ "mean_token_accuracy": 0.974962618201971,
710
+ "num_tokens": 385476.0,
711
  "step": 69
712
  },
713
  {
714
+ "entropy": 0.32675024215132,
715
  "epoch": 0.8575803981623277,
716
+ "grad_norm": 0.5078125,
717
  "learning_rate": 0.00011585365853658536,
718
+ "loss": 0.10380380600690842,
719
+ "mean_token_accuracy": 0.9621776640415192,
720
+ "num_tokens": 390246.0,
721
  "step": 70
722
  },
723
  {
724
+ "entropy": 0.3259228030219674,
725
  "epoch": 0.8698315467075038,
726
+ "grad_norm": 0.45703125,
727
  "learning_rate": 0.00011463414634146342,
728
+ "loss": 0.07719732075929642,
729
+ "mean_token_accuracy": 0.9656922854483128,
730
+ "num_tokens": 395726.0,
731
  "step": 71
732
  },
733
  {
734
+ "entropy": 0.2817502664402127,
735
  "epoch": 0.8820826952526799,
736
+ "grad_norm": 0.400390625,
737
  "learning_rate": 0.00011341463414634146,
738
+ "loss": 0.10946179181337357,
739
+ "mean_token_accuracy": 0.9588761143386364,
740
+ "num_tokens": 400824.0,
741
  "step": 72
742
  },
743
  {
744
+ "entropy": 0.28915605414658785,
745
  "epoch": 0.8943338437978561,
746
+ "grad_norm": 0.341796875,
747
  "learning_rate": 0.00011219512195121953,
748
+ "loss": 0.07160484790802002,
749
+ "mean_token_accuracy": 0.9732935056090355,
750
+ "num_tokens": 407602.0,
751
  "step": 73
752
  },
753
  {
754
+ "entropy": 0.285653960891068,
755
  "epoch": 0.9065849923430321,
756
+ "grad_norm": 0.283203125,
757
  "learning_rate": 0.00011097560975609757,
758
+ "loss": 0.08304032683372498,
759
+ "mean_token_accuracy": 0.9711127728223801,
760
+ "num_tokens": 413815.0,
761
  "step": 74
762
  },
763
  {
764
+ "entropy": 0.30409657675772905,
765
  "epoch": 0.9188361408882083,
766
+ "grad_norm": 0.322265625,
767
  "learning_rate": 0.00010975609756097563,
768
+ "loss": 0.06820105761289597,
769
+ "mean_token_accuracy": 0.9733226448297501,
770
+ "num_tokens": 418971.0,
771
  "step": 75
772
  },
773
  {
774
+ "entropy": 0.3267542561516166,
775
  "epoch": 0.9310872894333844,
776
+ "grad_norm": 0.291015625,
777
  "learning_rate": 0.00010853658536585367,
778
+ "loss": 0.07155608385801315,
779
+ "mean_token_accuracy": 0.9717761054635048,
780
+ "num_tokens": 423777.0,
781
  "step": 76
782
  },
783
  {
784
+ "entropy": 0.26458421628922224,
785
  "epoch": 0.9433384379785605,
786
+ "grad_norm": 0.271484375,
787
  "learning_rate": 0.00010731707317073172,
788
+ "loss": 0.07516152411699295,
789
+ "mean_token_accuracy": 0.9726166129112244,
790
+ "num_tokens": 430300.0,
791
  "step": 77
792
  },
793
  {
794
+ "entropy": 0.313277630135417,
795
  "epoch": 0.9555895865237366,
796
+ "grad_norm": 0.33984375,
797
  "learning_rate": 0.00010609756097560977,
798
+ "loss": 0.07090278714895248,
799
+ "mean_token_accuracy": 0.9733094871044159,
800
+ "num_tokens": 435531.0,
801
  "step": 78
802
  },
803
  {
804
+ "entropy": 0.29259978514164686,
805
  "epoch": 0.9678407350689127,
806
+ "grad_norm": 0.369140625,
807
  "learning_rate": 0.00010487804878048781,
808
+ "loss": 0.07661356031894684,
809
+ "mean_token_accuracy": 0.9706463180482388,
810
+ "num_tokens": 440264.0,
811
  "step": 79
812
  },
813
  {
814
+ "entropy": 0.2779441485181451,
815
  "epoch": 0.9800918836140888,
816
+ "grad_norm": 0.3203125,
817
  "learning_rate": 0.00010365853658536586,
818
+ "loss": 0.07106667011976242,
819
+ "mean_token_accuracy": 0.9709027595818043,
820
+ "num_tokens": 446616.0,
821
  "step": 80
822
  },
823
  {
824
+ "entropy": 0.2993398727849126,
825
  "epoch": 0.9923430321592649,
826
+ "grad_norm": 0.27734375,
827
  "learning_rate": 0.0001024390243902439,
828
+ "loss": 0.08056843280792236,
829
+ "mean_token_accuracy": 0.97335534542799,
830
+ "num_tokens": 451901.0,
831
  "step": 81
832
  },
833
  {
834
+ "entropy": 0.2985739395022392,
835
  "epoch": 1.0,
836
+ "grad_norm": 0.3515625,
837
  "learning_rate": 0.00010121951219512196,
838
+ "loss": 0.05783425644040108,
839
+ "mean_token_accuracy": 0.9816944003105164,
840
+ "num_tokens": 455183.0,
841
  "step": 82
842
  },
843
  {
844
+ "entropy": 0.25938804540783167,
845
  "epoch": 1.0122511485451762,
846
+ "grad_norm": 0.2099609375,
847
  "learning_rate": 0.0001,
848
+ "loss": 0.04025664180517197,
849
+ "mean_token_accuracy": 0.9888772070407867,
850
+ "num_tokens": 460758.0,
851
  "step": 83
852
  },
853
  {
854
+ "entropy": 0.2424000184983015,
855
  "epoch": 1.0245022970903521,
856
+ "grad_norm": 0.31640625,
857
  "learning_rate": 9.878048780487805e-05,
858
+ "loss": 0.041272781789302826,
859
+ "mean_token_accuracy": 0.9850874915719032,
860
+ "num_tokens": 466506.0,
861
  "step": 84
862
  },
863
  {
864
+ "entropy": 0.28685680869966745,
865
  "epoch": 1.0367534456355283,
866
+ "grad_norm": 0.220703125,
867
  "learning_rate": 9.75609756097561e-05,
868
+ "loss": 0.04828771948814392,
869
+ "mean_token_accuracy": 0.987192340195179,
870
+ "num_tokens": 473114.0,
871
  "step": 85
872
  },
873
  {
874
+ "entropy": 0.2173819374293089,
875
  "epoch": 1.0490045941807045,
876
+ "grad_norm": 0.212890625,
877
  "learning_rate": 9.634146341463415e-05,
878
+ "loss": 0.02984496019780636,
879
+ "mean_token_accuracy": 0.9893322959542274,
880
+ "num_tokens": 478272.0,
881
  "step": 86
882
  },
883
  {
884
+ "entropy": 0.24273153394460678,
885
  "epoch": 1.0612557427258806,
886
+ "grad_norm": 0.193359375,
887
  "learning_rate": 9.51219512195122e-05,
888
+ "loss": 0.05374791473150253,
889
+ "mean_token_accuracy": 0.9889552295207977,
890
+ "num_tokens": 484056.0,
891
  "step": 87
892
  },
893
  {
894
+ "entropy": 0.26517119724303484,
895
  "epoch": 1.0735068912710566,
896
+ "grad_norm": 0.2265625,
897
  "learning_rate": 9.390243902439024e-05,
898
+ "loss": 0.03632190451025963,
899
+ "mean_token_accuracy": 0.9914858341217041,
900
+ "num_tokens": 489018.0,
901
  "step": 88
902
  },
903
  {
904
+ "entropy": 0.24422209709882736,
905
  "epoch": 1.0857580398162328,
906
+ "grad_norm": 0.2041015625,
907
  "learning_rate": 9.26829268292683e-05,
908
+ "loss": 0.03214319050312042,
909
+ "mean_token_accuracy": 0.9906447269022465,
910
+ "num_tokens": 493923.0,
911
  "step": 89
912
  },
913
  {
914
+ "entropy": 0.221808229573071,
915
  "epoch": 1.098009188361409,
916
+ "grad_norm": 0.234375,
917
  "learning_rate": 9.146341463414635e-05,
918
+ "loss": 0.038689155131578445,
919
+ "mean_token_accuracy": 0.9870963655412197,
920
+ "num_tokens": 499718.0,
921
  "step": 90
922
  },
923
  {
924
+ "entropy": 0.25665554590523243,
925
  "epoch": 1.110260336906585,
926
+ "grad_norm": 0.26171875,
927
  "learning_rate": 9.02439024390244e-05,
928
+ "loss": 0.03570985421538353,
929
+ "mean_token_accuracy": 0.9886105321347713,
930
+ "num_tokens": 504426.0,
931
  "step": 91
932
  },
933
  {
934
+ "entropy": 0.23205960728228092,
935
  "epoch": 1.122511485451761,
936
+ "grad_norm": 0.1826171875,
937
  "learning_rate": 8.902439024390244e-05,
938
+ "loss": 0.024796659126877785,
939
+ "mean_token_accuracy": 0.9915902987122536,
940
+ "num_tokens": 509188.0,
941
  "step": 92
942
  },
943
  {
944
+ "entropy": 0.2075599799863994,
945
  "epoch": 1.1347626339969372,
946
+ "grad_norm": 0.2392578125,
947
  "learning_rate": 8.78048780487805e-05,
948
+ "loss": 0.027952998876571655,
949
+ "mean_token_accuracy": 0.9888908788561821,
950
+ "num_tokens": 514373.0,
951
  "step": 93
952
  },
953
  {
954
+ "entropy": 0.20593736693263054,
955
  "epoch": 1.1470137825421134,
956
+ "grad_norm": 0.294921875,
957
  "learning_rate": 8.658536585365854e-05,
958
+ "loss": 0.03931131958961487,
959
+ "mean_token_accuracy": 0.9884286597371101,
960
+ "num_tokens": 519285.0,
961
  "step": 94
962
  },
963
  {
964
+ "entropy": 0.18408051086589694,
965
  "epoch": 1.1592649310872893,
966
+ "grad_norm": 0.37890625,
967
  "learning_rate": 8.53658536585366e-05,
968
+ "loss": 0.029383456334471703,
969
+ "mean_token_accuracy": 0.9906960390508175,
970
+ "num_tokens": 524819.0,
971
  "step": 95
972
  },
973
  {
974
+ "entropy": 0.21047947462648153,
975
  "epoch": 1.1715160796324655,
976
+ "grad_norm": 0.328125,
977
  "learning_rate": 8.414634146341464e-05,
978
+ "loss": 0.03170277550816536,
979
+ "mean_token_accuracy": 0.9901499785482883,
980
+ "num_tokens": 529540.0,
981
  "step": 96
982
  },
983
  {
984
+ "entropy": 0.20975746307522058,
985
  "epoch": 1.1837672281776417,
986
+ "grad_norm": 0.365234375,
987
  "learning_rate": 8.292682926829268e-05,
988
+ "loss": 0.030736297369003296,
989
+ "mean_token_accuracy": 0.988069623708725,
990
+ "num_tokens": 534290.0,
991
  "step": 97
992
  },
993
  {
994
+ "entropy": 0.2099036993458867,
995
  "epoch": 1.1960183767228179,
996
+ "grad_norm": 0.298828125,
997
  "learning_rate": 8.170731707317073e-05,
998
+ "loss": 0.04264690354466438,
999
+ "mean_token_accuracy": 0.9882702529430389,
1000
+ "num_tokens": 540093.0,
1001
  "step": 98
1002
  },
1003
  {
1004
+ "entropy": 0.20470174960792065,
1005
  "epoch": 1.2082695252679938,
1006
+ "grad_norm": 0.283203125,
1007
  "learning_rate": 8.048780487804879e-05,
1008
+ "loss": 0.04001612961292267,
1009
+ "mean_token_accuracy": 0.9892623983323574,
1010
+ "num_tokens": 546621.0,
1011
  "step": 99
1012
  },
1013
  {
1014
+ "entropy": 0.19278133288025856,
1015
  "epoch": 1.22052067381317,
1016
+ "grad_norm": 0.6171875,
1017
  "learning_rate": 7.926829268292683e-05,
1018
+ "loss": 0.04199153557419777,
1019
+ "mean_token_accuracy": 0.987313587218523,
1020
+ "num_tokens": 552086.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
+ "eval_entropy": 0.20443568367888962,
1026
+ "eval_loss": 0.07565851509571075,
1027
+ "eval_mean_token_accuracy": 0.9715519590654235,
1028
+ "eval_num_tokens": 552086.0,
1029
+ "eval_runtime": 57.4717,
1030
+ "eval_samples_per_second": 1.201,
1031
+ "eval_steps_per_second": 1.201,
1032
  "step": 100
1033
  },
1034
  {
1035
+ "entropy": 0.20453590713441372,
1036
  "epoch": 1.2327718223583461,
1037
+ "grad_norm": 0.478515625,
1038
  "learning_rate": 7.804878048780489e-05,
1039
+ "loss": 0.05255145579576492,
1040
+ "mean_token_accuracy": 0.9822860956192017,
1041
+ "num_tokens": 558025.0,
1042
  "step": 101
1043
  },
1044
  {
1045
+ "entropy": 0.18952476000413299,
1046
  "epoch": 1.245022970903522,
1047
+ "grad_norm": 0.271484375,
1048
  "learning_rate": 7.682926829268293e-05,
1049
+ "loss": 0.027246126905083656,
1050
+ "mean_token_accuracy": 0.9902120418846607,
1051
+ "num_tokens": 564016.0,
1052
  "step": 102
1053
  },
1054
  {
1055
+ "entropy": 0.2112936107441783,
1056
  "epoch": 1.2572741194486983,
1057
+ "grad_norm": 0.279296875,
1058
  "learning_rate": 7.560975609756099e-05,
1059
+ "loss": 0.03221844881772995,
1060
+ "mean_token_accuracy": 0.9891848936676979,
1061
+ "num_tokens": 569749.0,
1062
  "step": 103
1063
  },
1064
  {
1065
+ "entropy": 0.21675583999603987,
1066
  "epoch": 1.2695252679938744,
1067
+ "grad_norm": 0.322265625,
1068
  "learning_rate": 7.439024390243903e-05,
1069
+ "loss": 0.0432349294424057,
1070
+ "mean_token_accuracy": 0.9877067022025585,
1071
+ "num_tokens": 575108.0,
1072
  "step": 104
1073
  },
1074
  {
1075
+ "entropy": 0.20377221517264843,
1076
  "epoch": 1.2817764165390506,
1077
+ "grad_norm": 0.298828125,
1078
  "learning_rate": 7.317073170731707e-05,
1079
+ "loss": 0.029307818040251732,
1080
+ "mean_token_accuracy": 0.9907472543418407,
1081
+ "num_tokens": 580685.0,
1082
  "step": 105
1083
  },
1084
  {
1085
+ "entropy": 0.2235541269183159,
1086
  "epoch": 1.2940275650842268,
1087
+ "grad_norm": 0.29296875,
1088
  "learning_rate": 7.195121951219513e-05,
1089
+ "loss": 0.032167330384254456,
1090
+ "mean_token_accuracy": 0.9906046241521835,
1091
+ "num_tokens": 586319.0,
1092
  "step": 106
1093
  },
1094
  {
1095
+ "entropy": 0.21635576337575912,
1096
  "epoch": 1.3062787136294027,
1097
+ "grad_norm": 0.15234375,
1098
  "learning_rate": 7.073170731707317e-05,
1099
+ "loss": 0.024916525930166245,
1100
+ "mean_token_accuracy": 0.9925986491143703,
1101
+ "num_tokens": 591791.0,
1102
  "step": 107
1103
  },
1104
  {
1105
+ "entropy": 0.23305469285696745,
1106
  "epoch": 1.318529862174579,
1107
+ "grad_norm": 0.384765625,
1108
  "learning_rate": 6.951219512195122e-05,
1109
+ "loss": 0.043021317571401596,
1110
+ "mean_token_accuracy": 0.9842446520924568,
1111
+ "num_tokens": 597135.0,
1112
  "step": 108
1113
  },
1114
  {
1115
+ "entropy": 0.2322743725962937,
1116
  "epoch": 1.3307810107197549,
1117
+ "grad_norm": 0.2392578125,
1118
  "learning_rate": 6.829268292682928e-05,
1119
+ "loss": 0.03324022889137268,
1120
+ "mean_token_accuracy": 0.9893980734050274,
1121
+ "num_tokens": 602707.0,
1122
  "step": 109
1123
  },
1124
  {
1125
+ "entropy": 0.21593647170811892,
1126
  "epoch": 1.343032159264931,
1127
+ "grad_norm": 0.244140625,
1128
  "learning_rate": 6.707317073170732e-05,
1129
+ "loss": 0.051236435770988464,
1130
+ "mean_token_accuracy": 0.9845945909619331,
1131
+ "num_tokens": 609274.0,
1132
  "step": 110
1133
  },
1134
  {
1135
+ "entropy": 0.21711204759776592,
1136
  "epoch": 1.3552833078101072,
1137
+ "grad_norm": 0.298828125,
1138
  "learning_rate": 6.585365853658538e-05,
1139
+ "loss": 0.03724904730916023,
1140
+ "mean_token_accuracy": 0.9879111871123314,
1141
+ "num_tokens": 614639.0,
1142
  "step": 111
1143
  },
1144
  {
1145
+ "entropy": 0.23972039762884378,
1146
  "epoch": 1.3675344563552834,
1147
+ "grad_norm": 0.1953125,
1148
  "learning_rate": 6.463414634146342e-05,
1149
+ "loss": 0.041368596255779266,
1150
+ "mean_token_accuracy": 0.9879214912652969,
1151
+ "num_tokens": 619839.0,
1152
  "step": 112
1153
  },
1154
  {
1155
+ "entropy": 0.2266655545681715,
1156
  "epoch": 1.3797856049004595,
1157
+ "grad_norm": 0.2734375,
1158
  "learning_rate": 6.341463414634146e-05,
1159
+ "loss": 0.04395541176199913,
1160
+ "mean_token_accuracy": 0.9881241992115974,
1161
+ "num_tokens": 626252.0,
1162
  "step": 113
1163
  },
1164
  {
1165
+ "entropy": 0.23816584516316652,
1166
  "epoch": 1.3920367534456355,
1167
+ "grad_norm": 0.205078125,
1168
  "learning_rate": 6.219512195121952e-05,
1169
+ "loss": 0.03704490512609482,
1170
+ "mean_token_accuracy": 0.9873962365090847,
1171
+ "num_tokens": 631466.0,
1172
  "step": 114
1173
  },
1174
  {
1175
+ "entropy": 0.23965218709781766,
1176
  "epoch": 1.4042879019908117,
1177
+ "grad_norm": 0.1982421875,
1178
  "learning_rate": 6.097560975609756e-05,
1179
+ "loss": 0.029025251045823097,
1180
+ "mean_token_accuracy": 0.9914826788008213,
1181
+ "num_tokens": 637746.0,
1182
  "step": 115
1183
  },
1184
  {
1185
+ "entropy": 0.25465985108166933,
1186
  "epoch": 1.4165390505359878,
1187
+ "grad_norm": 0.2490234375,
1188
  "learning_rate": 5.975609756097561e-05,
1189
+ "loss": 0.040289562195539474,
1190
+ "mean_token_accuracy": 0.9853745028376579,
1191
+ "num_tokens": 642638.0,
1192
  "step": 116
1193
  },
1194
  {
1195
+ "entropy": 0.21541998535394669,
1196
  "epoch": 1.4287901990811638,
1197
+ "grad_norm": 0.263671875,
1198
  "learning_rate": 5.853658536585366e-05,
1199
+ "loss": 0.036677829921245575,
1200
+ "mean_token_accuracy": 0.9888547360897064,
1201
+ "num_tokens": 648219.0,
1202
  "step": 117
1203
  },
1204
  {
1205
+ "entropy": 0.23220631666481495,
1206
  "epoch": 1.44104134762634,
1207
+ "grad_norm": 0.197265625,
1208
  "learning_rate": 5.731707317073171e-05,
1209
+ "loss": 0.041446540504693985,
1210
+ "mean_token_accuracy": 0.9872667863965034,
1211
+ "num_tokens": 654329.0,
1212
  "step": 118
1213
  },
1214
  {
1215
+ "entropy": 0.23642429150640965,
1216
  "epoch": 1.4532924961715161,
1217
+ "grad_norm": 0.1904296875,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
+ "loss": 0.034801121801137924,
1220
+ "mean_token_accuracy": 0.9886356219649315,
1221
+ "num_tokens": 660173.0,
1222
  "step": 119
1223
  },
1224
  {
1225
+ "entropy": 0.22930414089933038,
1226
  "epoch": 1.4655436447166923,
1227
+ "grad_norm": 0.18359375,
1228
  "learning_rate": 5.487804878048781e-05,
1229
+ "loss": 0.03842389956116676,
1230
+ "mean_token_accuracy": 0.9868512041866779,
1231
+ "num_tokens": 665808.0,
1232
  "step": 120
1233
  },
1234
  {
1235
+ "entropy": 0.20680655166506767,
1236
  "epoch": 1.4777947932618682,
1237
+ "grad_norm": 0.25,
1238
  "learning_rate": 5.365853658536586e-05,
1239
+ "loss": 0.03596107289195061,
1240
+ "mean_token_accuracy": 0.98941445723176,
1241
+ "num_tokens": 671196.0,
1242
  "step": 121
1243
  },
1244
  {
1245
+ "entropy": 0.24193469621241093,
1246
  "epoch": 1.4900459418070444,
1247
+ "grad_norm": 0.298828125,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
+ "loss": 0.039926398545503616,
1250
+ "mean_token_accuracy": 0.9893642216920853,
1251
+ "num_tokens": 676532.0,
1252
  "step": 122
1253
  },
1254
  {
1255
+ "entropy": 0.22021344117820263,
1256
  "epoch": 1.5022970903522204,
1257
+ "grad_norm": 0.16796875,
1258
  "learning_rate": 5.121951219512195e-05,
1259
+ "loss": 0.028786854818463326,
1260
+ "mean_token_accuracy": 0.9905072785913944,
1261
+ "num_tokens": 681580.0,
1262
  "step": 123
1263
  },
1264
  {
1265
+ "entropy": 0.2207544706761837,
1266
  "epoch": 1.5145482388973965,
1267
+ "grad_norm": 0.1748046875,
1268
  "learning_rate": 5e-05,
1269
+ "loss": 0.02596566267311573,
1270
+ "mean_token_accuracy": 0.9926727451384068,
1271
+ "num_tokens": 687310.0,
1272
  "step": 124
1273
  },
1274
  {
1275
+ "entropy": 0.21766360383480787,
1276
  "epoch": 1.5267993874425727,
1277
+ "grad_norm": 0.2177734375,
1278
  "learning_rate": 4.878048780487805e-05,
1279
+ "loss": 0.0336245559155941,
1280
+ "mean_token_accuracy": 0.9895498640835285,
1281
+ "num_tokens": 693584.0,
1282
  "step": 125
1283
  },
1284
  {
1285
+ "entropy": 0.20466620940715075,
1286
  "epoch": 1.5390505359877489,
1287
+ "grad_norm": 0.1669921875,
1288
  "learning_rate": 4.75609756097561e-05,
1289
+ "loss": 0.025749148800969124,
1290
+ "mean_token_accuracy": 0.9912248440086842,
1291
+ "num_tokens": 699395.0,
1292
  "step": 126
1293
  },
1294
  {
1295
+ "entropy": 0.2437387891113758,
1296
  "epoch": 1.551301684532925,
1297
+ "grad_norm": 0.193359375,
1298
  "learning_rate": 4.634146341463415e-05,
1299
+ "loss": 0.03270214796066284,
1300
+ "mean_token_accuracy": 0.9933484047651291,
1301
+ "num_tokens": 705055.0,
1302
  "step": 127
1303
  },
1304
  {
1305
+ "entropy": 0.2368676969781518,
1306
  "epoch": 1.5635528330781012,
1307
+ "grad_norm": 0.2236328125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
+ "loss": 0.03528536483645439,
1310
+ "mean_token_accuracy": 0.9893530681729317,
1311
+ "num_tokens": 710632.0,
1312
  "step": 128
1313
  },
1314
  {
1315
+ "entropy": 0.19431999698281288,
1316
  "epoch": 1.5758039816232772,
1317
+ "grad_norm": 0.43359375,
1318
  "learning_rate": 4.390243902439025e-05,
1319
+ "loss": 0.047928016632795334,
1320
+ "mean_token_accuracy": 0.9849436171352863,
1321
+ "num_tokens": 715837.0,
1322
  "step": 129
1323
  },
1324
  {
1325
+ "entropy": 0.18662631046026945,
1326
  "epoch": 1.5880551301684533,
1327
+ "grad_norm": 0.240234375,
1328
  "learning_rate": 4.26829268292683e-05,
1329
+ "loss": 0.04247990995645523,
1330
+ "mean_token_accuracy": 0.9910683631896973,
1331
+ "num_tokens": 721933.0,
1332
  "step": 130
1333
  },
1334
  {
1335
+ "entropy": 0.2159680761396885,
1336
  "epoch": 1.6003062787136293,
1337
+ "grad_norm": 0.283203125,
1338
  "learning_rate": 4.146341463414634e-05,
1339
+ "loss": 0.0323370024561882,
1340
+ "mean_token_accuracy": 0.9914376214146614,
1341
+ "num_tokens": 727148.0,
1342
  "step": 131
1343
  },
1344
  {
1345
+ "entropy": 0.19935058476403356,
1346
  "epoch": 1.6125574272588055,
1347
+ "grad_norm": 0.298828125,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
+ "loss": 0.04690408334136009,
1350
+ "mean_token_accuracy": 0.9893516302108765,
1351
+ "num_tokens": 733471.0,
1352
  "step": 132
1353
  },
1354
  {
1355
+ "entropy": 0.20206499379128218,
1356
  "epoch": 1.6248085758039816,
1357
+ "grad_norm": 0.271484375,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
+ "loss": 0.03374583646655083,
1360
+ "mean_token_accuracy": 0.9919851459562778,
1361
+ "num_tokens": 738692.0,
1362
  "step": 133
1363
  },
1364
  {
1365
+ "entropy": 0.2015545079484582,
1366
  "epoch": 1.6370597243491578,
1367
+ "grad_norm": 0.208984375,
1368
  "learning_rate": 3.780487804878049e-05,
1369
+ "loss": 0.030488884076476097,
1370
+ "mean_token_accuracy": 0.9903693087399006,
1371
+ "num_tokens": 743858.0,
1372
  "step": 134
1373
  },
1374
  {
1375
+ "entropy": 0.19723766017705202,
1376
  "epoch": 1.649310872894334,
1377
+ "grad_norm": 0.2158203125,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
+ "loss": 0.025454077869653702,
1380
+ "mean_token_accuracy": 0.9917643442749977,
1381
+ "num_tokens": 749750.0,
1382
  "step": 135
1383
  },
1384
  {
1385
+ "entropy": 0.24156489223241806,
1386
  "epoch": 1.66156202143951,
1387
+ "grad_norm": 0.248046875,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
+ "loss": 0.03519139811396599,
1390
+ "mean_token_accuracy": 0.9872167631983757,
1391
+ "num_tokens": 754490.0,
1392
  "step": 136
1393
  },
1394
  {
1395
+ "entropy": 0.21465991251170635,
1396
  "epoch": 1.673813169984686,
1397
+ "grad_norm": 0.2177734375,
1398
  "learning_rate": 3.414634146341464e-05,
1399
+ "loss": 0.035167597234249115,
1400
+ "mean_token_accuracy": 0.9907721877098083,
1401
+ "num_tokens": 760201.0,
1402
  "step": 137
1403
  },
1404
  {
1405
+ "entropy": 0.2013603514060378,
1406
  "epoch": 1.686064318529862,
1407
+ "grad_norm": 0.2275390625,
1408
  "learning_rate": 3.292682926829269e-05,
1409
+ "loss": 0.038648657500743866,
1410
+ "mean_token_accuracy": 0.9927868358790874,
1411
+ "num_tokens": 767298.0,
1412
  "step": 138
1413
  },
1414
  {
1415
+ "entropy": 0.21827432699501514,
1416
  "epoch": 1.6983154670750382,
1417
+ "grad_norm": 0.3984375,
1418
  "learning_rate": 3.170731707317073e-05,
1419
+ "loss": 0.03258686885237694,
1420
+ "mean_token_accuracy": 0.9875011034309864,
1421
+ "num_tokens": 772255.0,
1422
  "step": 139
1423
  },
1424
  {
1425
+ "entropy": 0.2216914091259241,
1426
  "epoch": 1.7105666156202144,
1427
+ "grad_norm": 0.2431640625,
1428
  "learning_rate": 3.048780487804878e-05,
1429
+ "loss": 0.02989918179810047,
1430
+ "mean_token_accuracy": 0.9894328564405441,
1431
+ "num_tokens": 777953.0,
1432
  "step": 140
1433
  },
1434
  {
1435
+ "entropy": 0.1955341473221779,
1436
  "epoch": 1.7228177641653906,
1437
+ "grad_norm": 0.19140625,
1438
  "learning_rate": 2.926829268292683e-05,
1439
+ "loss": 0.03125489503145218,
1440
+ "mean_token_accuracy": 0.990275178104639,
1441
+ "num_tokens": 783196.0,
1442
  "step": 141
1443
  },
1444
  {
1445
+ "entropy": 0.20080329850316048,
1446
  "epoch": 1.7350689127105667,
1447
+ "grad_norm": 0.205078125,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
+ "loss": 0.02998793125152588,
1450
+ "mean_token_accuracy": 0.990708488970995,
1451
+ "num_tokens": 787962.0,
1452
  "step": 142
1453
  },
1454
  {
1455
+ "entropy": 0.23265999322757125,
1456
  "epoch": 1.7473200612557427,
1457
+ "grad_norm": 0.486328125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
+ "loss": 0.05563311651349068,
1460
+ "mean_token_accuracy": 0.9868011735379696,
1461
+ "num_tokens": 792930.0,
1462
  "step": 143
1463
  },
1464
  {
1465
+ "entropy": 0.2262994982302189,
1466
  "epoch": 1.7595712098009189,
1467
+ "grad_norm": 0.3125,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
+ "loss": 0.03620155528187752,
1470
+ "mean_token_accuracy": 0.9913722947239876,
1471
+ "num_tokens": 798789.0,
1472
  "step": 144
1473
  },
1474
  {
1475
+ "entropy": 0.216988081112504,
1476
  "epoch": 1.7718223583460948,
1477
+ "grad_norm": 0.4921875,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
+ "loss": 0.03459199145436287,
1480
+ "mean_token_accuracy": 0.985675573348999,
1481
+ "num_tokens": 803970.0,
1482
  "step": 145
1483
  },
1484
  {
1485
+ "entropy": 0.2017216570675373,
1486
  "epoch": 1.784073506891271,
1487
+ "grad_norm": 0.197265625,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
+ "loss": 0.03301437944173813,
1490
+ "mean_token_accuracy": 0.9892940744757652,
1491
+ "num_tokens": 810201.0,
1492
  "step": 146
1493
  },
1494
  {
1495
+ "entropy": 0.19614959321916103,
1496
  "epoch": 1.7963246554364471,
1497
+ "grad_norm": 0.251953125,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
+ "loss": 0.03109458088874817,
1500
+ "mean_token_accuracy": 0.9879713654518127,
1501
+ "num_tokens": 816419.0,
1502
  "step": 147
1503
  },
1504
  {
1505
+ "entropy": 0.22246063826605678,
1506
  "epoch": 1.8085758039816233,
1507
+ "grad_norm": 0.28125,
1508
  "learning_rate": 2.073170731707317e-05,
1509
+ "loss": 0.03694477677345276,
1510
+ "mean_token_accuracy": 0.9848503768444061,
1511
+ "num_tokens": 822186.0,
1512
  "step": 148
1513
  },
1514
  {
1515
+ "entropy": 0.22035463713109493,
1516
  "epoch": 1.8208269525267995,
1517
+ "grad_norm": 0.171875,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
+ "loss": 0.023739267140626907,
1520
+ "mean_token_accuracy": 0.9927939847111702,
1521
+ "num_tokens": 826733.0,
1522
  "step": 149
1523
  },
1524
  {
1525
+ "entropy": 0.21801204327493906,
1526
  "epoch": 1.8330781010719757,
1527
+ "grad_norm": 0.1884765625,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
+ "loss": 0.03263175114989281,
1530
+ "mean_token_accuracy": 0.9923874475061893,
1531
+ "num_tokens": 833042.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
+ "eval_entropy": 0.21679833023876383,
1537
+ "eval_loss": 0.0704953595995903,
1538
+ "eval_mean_token_accuracy": 0.9732788464297419,
1539
+ "eval_num_tokens": 833042.0,
1540
+ "eval_runtime": 57.5678,
1541
+ "eval_samples_per_second": 1.199,
1542
+ "eval_steps_per_second": 1.199,
1543
  "step": 150
1544
  },
1545
  {
1546
+ "entropy": 0.18707641633227468,
1547
  "epoch": 1.8453292496171516,
1548
+ "grad_norm": 0.2041015625,
1549
  "learning_rate": 1.707317073170732e-05,
1550
+ "loss": 0.03016384318470955,
1551
+ "mean_token_accuracy": 0.9911616966128349,
1552
+ "num_tokens": 838329.0,
1553
  "step": 151
1554
  },
1555
  {
1556
+ "entropy": 0.23812252935022116,
1557
  "epoch": 1.8575803981623276,
1558
+ "grad_norm": 0.302734375,
1559
  "learning_rate": 1.5853658536585366e-05,
1560
+ "loss": 0.03615584969520569,
1561
+ "mean_token_accuracy": 0.987461268901825,
1562
+ "num_tokens": 844696.0,
1563
  "step": 152
1564
  },
1565
  {
1566
+ "entropy": 0.22532062884420156,
1567
  "epoch": 1.8698315467075037,
1568
+ "grad_norm": 0.248046875,
1569
  "learning_rate": 1.4634146341463415e-05,
1570
+ "loss": 0.04055720940232277,
1571
+ "mean_token_accuracy": 0.9846500307321548,
1572
+ "num_tokens": 850554.0,
1573
  "step": 153
1574
  },
1575
  {
1576
+ "entropy": 0.20814889762550592,
1577
  "epoch": 1.88208269525268,
1578
+ "grad_norm": 0.232421875,
1579
  "learning_rate": 1.3414634146341466e-05,
1580
+ "loss": 0.031764715909957886,
1581
+ "mean_token_accuracy": 0.988288339227438,
1582
+ "num_tokens": 856923.0,
1583
  "step": 154
1584
  },
1585
  {
1586
+ "entropy": 0.23297990392893553,
1587
  "epoch": 1.894333843797856,
1588
+ "grad_norm": 0.1669921875,
1589
  "learning_rate": 1.2195121951219513e-05,
1590
+ "loss": 0.030051497742533684,
1591
+ "mean_token_accuracy": 0.9910264648497105,
1592
+ "num_tokens": 862398.0,
1593
  "step": 155
1594
  },
1595
  {
1596
+ "entropy": 0.2033767681568861,
1597
  "epoch": 1.9065849923430322,
1598
+ "grad_norm": 0.2490234375,
1599
  "learning_rate": 1.0975609756097562e-05,
1600
+ "loss": 0.03712699934840202,
1601
+ "mean_token_accuracy": 0.9899817705154419,
1602
+ "num_tokens": 868074.0,
1603
  "step": 156
1604
  },
1605
  {
1606
+ "entropy": 0.22080545127391815,
1607
  "epoch": 1.9188361408882084,
1608
+ "grad_norm": 0.255859375,
1609
  "learning_rate": 9.756097560975611e-06,
1610
+ "loss": 0.034820556640625,
1611
+ "mean_token_accuracy": 0.988889068365097,
1612
+ "num_tokens": 873838.0,
1613
  "step": 157
1614
  },
1615
  {
1616
+ "entropy": 0.2254256308078766,
1617
  "epoch": 1.9310872894333844,
1618
+ "grad_norm": 0.20703125,
1619
  "learning_rate": 8.53658536585366e-06,
1620
+ "loss": 0.04543515294790268,
1621
+ "mean_token_accuracy": 0.9892100431025028,
1622
+ "num_tokens": 879603.0,
1623
  "step": 158
1624
  },
1625
  {
1626
+ "entropy": 0.20991013059392571,
1627
  "epoch": 1.9433384379785605,
1628
+ "grad_norm": 0.1884765625,
1629
  "learning_rate": 7.317073170731707e-06,
1630
+ "loss": 0.03553229942917824,
1631
+ "mean_token_accuracy": 0.9892465956509113,
1632
+ "num_tokens": 884707.0,
1633
  "step": 159
1634
  },
1635
  {
1636
+ "entropy": 0.21032756008207798,
1637
  "epoch": 1.9555895865237365,
1638
+ "grad_norm": 0.2490234375,
1639
  "learning_rate": 6.0975609756097564e-06,
1640
+ "loss": 0.03755491226911545,
1641
+ "mean_token_accuracy": 0.9903870671987534,
1642
+ "num_tokens": 889972.0,
1643
  "step": 160
1644
  },
1645
  {
1646
+ "entropy": 0.22135276068001986,
1647
  "epoch": 1.9678407350689127,
1648
+ "grad_norm": 0.328125,
1649
  "learning_rate": 4.8780487804878055e-06,
1650
+ "loss": 0.05311673879623413,
1651
+ "mean_token_accuracy": 0.987005028873682,
1652
+ "num_tokens": 896339.0,
1653
  "step": 161
1654
  },
1655
  {
1656
+ "entropy": 0.23554523009806871,
1657
  "epoch": 1.9800918836140888,
1658
+ "grad_norm": 0.296875,
1659
  "learning_rate": 3.6585365853658537e-06,
1660
+ "loss": 0.029539132490754128,
1661
+ "mean_token_accuracy": 0.989458903670311,
1662
+ "num_tokens": 901155.0,
1663
  "step": 162
1664
  },
1665
  {
1666
+ "entropy": 0.2434804094955325,
1667
  "epoch": 1.992343032159265,
1668
+ "grad_norm": 0.41796875,
1669
  "learning_rate": 2.4390243902439027e-06,
1670
+ "loss": 0.04046294465661049,
1671
+ "mean_token_accuracy": 0.98642498254776,
1672
+ "num_tokens": 906714.0,
1673
  "step": 163
1674
  },
1675
  {
1676
+ "entropy": 0.2326144739985466,
1677
  "epoch": 2.0,
1678
+ "grad_norm": 0.37109375,
1679
  "learning_rate": 1.2195121951219514e-06,
1680
+ "loss": 0.03281162679195404,
1681
+ "mean_token_accuracy": 0.9881741642951966,
1682
+ "num_tokens": 910366.0,
1683
  "step": 164
1684
  }
1685
  ],
 
1700
  "attributes": {}
1701
  }
1702
  },
1703
+ "total_flos": 4.122247062901555e+16,
1704
  "train_batch_size": 1,
1705
  "trial_name": null,
1706
  "trial_params": null