stefanocarrera committed on
Commit
3a8f033
·
verified ·
1 Parent(s): 8b67d53

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -30,11 +30,11 @@
30
  "revision": null,
31
  "target_modules": [
32
  "o_proj",
33
- "q_proj",
34
- "gate_proj",
35
- "down_proj",
36
  "k_proj",
 
 
37
  "v_proj",
 
38
  "up_proj"
39
  ],
40
  "target_parameters": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "o_proj",
 
 
 
33
  "k_proj",
34
+ "down_proj",
35
+ "q_proj",
36
  "v_proj",
37
+ "gate_proj",
38
  "up_proj"
39
  ],
40
  "target_parameters": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:978fce5d23f8e65a5e89fd4fa0a502a5a505733bc5d548832ae8e85ecf1af748
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d7799ff7ae7f290e67eada9d323c6418a3a9db26bdb2158f039838076d95d1f
3
  size 83946192
checkpoint-150/adapter_config.json CHANGED
@@ -30,11 +30,11 @@
30
  "revision": null,
31
  "target_modules": [
32
  "o_proj",
33
- "q_proj",
34
- "gate_proj",
35
- "down_proj",
36
  "k_proj",
 
 
37
  "v_proj",
 
38
  "up_proj"
39
  ],
40
  "target_parameters": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "o_proj",
 
 
 
33
  "k_proj",
34
+ "down_proj",
35
+ "q_proj",
36
  "v_proj",
37
+ "gate_proj",
38
  "up_proj"
39
  ],
40
  "target_parameters": null,
checkpoint-150/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3def52f4b83524f4576484a0a550277910026b91b8717435dde9199c86a7895
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:762e73134d8cb3ce48b4da442522cb72cca6085e8d761c4956332ea9159b92c5
3
  size 83946192
checkpoint-150/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5aa061bf921d60836e9a149e021809302178d2ccf0e44570eb44a74679fa164e
3
  size 85728997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecadd10ced5ddfdc8d4f885fbd10e4abbd52e6bb17ae093c71dec91b5ed759e5
3
  size 85728997
checkpoint-150/trainer_state.json CHANGED
@@ -10,1536 +10,1536 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 0.45975130423903465,
14
  "epoch": 0.01225114854517611,
15
- "grad_norm": 0.00689697265625,
16
  "learning_rate": 0.0002,
17
- "loss": 0.0005938471877016127,
18
- "mean_token_accuracy": 0.9997171945869923,
19
- "num_tokens": 6092.0,
20
  "step": 1
21
  },
22
  {
23
- "entropy": 0.4158535748720169,
24
  "epoch": 0.02450229709035222,
25
- "grad_norm": 0.00017833709716796875,
26
  "learning_rate": 0.00019878048780487805,
27
- "loss": 3.472402386250906e-05,
28
- "mean_token_accuracy": 1.0,
29
- "num_tokens": 11535.0,
30
  "step": 2
31
  },
32
  {
33
- "entropy": 0.4280186090618372,
34
  "epoch": 0.036753445635528334,
35
- "grad_norm": 8.20159912109375e-05,
36
  "learning_rate": 0.0001975609756097561,
37
- "loss": 2.510893318685703e-05,
38
- "mean_token_accuracy": 1.0,
39
- "num_tokens": 16432.0,
40
  "step": 3
41
  },
42
  {
43
- "entropy": 0.41829014383256435,
44
  "epoch": 0.04900459418070444,
45
- "grad_norm": 0.034912109375,
46
  "learning_rate": 0.00019634146341463416,
47
- "loss": 0.0034790209028869867,
48
- "mean_token_accuracy": 0.9971264377236366,
49
- "num_tokens": 20507.0,
50
  "step": 4
51
  },
52
  {
53
- "entropy": 0.3744635935872793,
54
  "epoch": 0.06125574272588055,
55
- "grad_norm": 0.0001983642578125,
56
  "learning_rate": 0.0001951219512195122,
57
- "loss": 1.627415622351691e-05,
58
- "mean_token_accuracy": 1.0,
59
- "num_tokens": 26122.0,
60
  "step": 5
61
  },
62
  {
63
- "entropy": 0.40895503386855125,
64
  "epoch": 0.07350689127105667,
65
- "grad_norm": 3.457069396972656e-05,
66
  "learning_rate": 0.00019390243902439025,
67
- "loss": 9.875144314719364e-06,
68
- "mean_token_accuracy": 1.0,
69
- "num_tokens": 30847.0,
70
  "step": 6
71
  },
72
  {
73
- "entropy": 0.36759823746979237,
74
  "epoch": 0.08575803981623277,
75
- "grad_norm": 8.869171142578125e-05,
76
  "learning_rate": 0.0001926829268292683,
77
- "loss": 1.5701301890658215e-05,
78
- "mean_token_accuracy": 1.0,
79
- "num_tokens": 36541.0,
80
  "step": 7
81
  },
82
  {
83
- "entropy": 0.3891780599951744,
84
  "epoch": 0.09800918836140889,
85
- "grad_norm": 5.078315734863281e-05,
86
  "learning_rate": 0.00019146341463414633,
87
- "loss": 1.2823864381061867e-05,
88
- "mean_token_accuracy": 1.0,
89
- "num_tokens": 41001.0,
90
  "step": 8
91
  },
92
  {
93
- "entropy": 0.4104680269956589,
94
  "epoch": 0.11026033690658499,
95
- "grad_norm": 0.02099609375,
96
  "learning_rate": 0.0001902439024390244,
97
- "loss": 0.0011738959001377225,
98
- "mean_token_accuracy": 0.9996279776096344,
99
- "num_tokens": 45467.0,
100
  "step": 9
101
  },
102
  {
103
- "entropy": 0.39176585152745247,
104
  "epoch": 0.1225114854517611,
105
- "grad_norm": 0.0703125,
106
  "learning_rate": 0.00018902439024390244,
107
- "loss": 0.0007126386626623571,
108
- "mean_token_accuracy": 0.9997509978711605,
109
- "num_tokens": 50478.0,
110
  "step": 10
111
  },
112
  {
113
- "entropy": 0.3562493957579136,
114
  "epoch": 0.13476263399693722,
115
- "grad_norm": 0.0004405975341796875,
116
  "learning_rate": 0.0001878048780487805,
117
- "loss": 2.2854681446915492e-05,
118
- "mean_token_accuracy": 1.0,
119
- "num_tokens": 56181.0,
120
  "step": 11
121
  },
122
  {
123
- "entropy": 0.3858679383993149,
124
  "epoch": 0.14701378254211334,
125
- "grad_norm": 0.00016307830810546875,
126
  "learning_rate": 0.00018658536585365856,
127
- "loss": 1.8136681319447234e-05,
128
- "mean_token_accuracy": 1.0,
129
- "num_tokens": 62946.0,
130
  "step": 12
131
  },
132
  {
133
- "entropy": 0.37994169630110264,
134
  "epoch": 0.15926493108728942,
135
- "grad_norm": 0.000640869140625,
136
  "learning_rate": 0.0001853658536585366,
137
- "loss": 1.9365113985259086e-05,
138
- "mean_token_accuracy": 1.0,
139
- "num_tokens": 68436.0,
140
  "step": 13
141
  },
142
  {
143
- "entropy": 0.3682236662134528,
144
  "epoch": 0.17151607963246554,
145
- "grad_norm": 4.267692565917969e-05,
146
  "learning_rate": 0.00018414634146341464,
147
- "loss": 9.09720802155789e-06,
148
- "mean_token_accuracy": 1.0,
149
- "num_tokens": 73603.0,
150
  "step": 14
151
  },
152
  {
153
- "entropy": 0.40290670469403267,
154
  "epoch": 0.18376722817764166,
155
- "grad_norm": 9.441375732421875e-05,
156
  "learning_rate": 0.0001829268292682927,
157
- "loss": 1.5181853086687624e-05,
158
- "mean_token_accuracy": 1.0,
159
- "num_tokens": 77845.0,
160
  "step": 15
161
  },
162
  {
163
- "entropy": 0.36544002406299114,
164
  "epoch": 0.19601837672281777,
165
- "grad_norm": 0.001007080078125,
166
  "learning_rate": 0.00018170731707317075,
167
- "loss": 1.5547768271062523e-05,
168
- "mean_token_accuracy": 1.0,
169
- "num_tokens": 82744.0,
170
  "step": 16
171
  },
172
  {
173
- "entropy": 0.38514361158013344,
174
  "epoch": 0.2082695252679939,
175
- "grad_norm": 3.147125244140625e-05,
176
  "learning_rate": 0.0001804878048780488,
177
- "loss": 9.32630973693449e-06,
178
- "mean_token_accuracy": 1.0,
179
- "num_tokens": 87453.0,
180
  "step": 17
181
  },
182
  {
183
- "entropy": 0.38769579119980335,
184
  "epoch": 0.22052067381316998,
185
- "grad_norm": 0.0001983642578125,
186
  "learning_rate": 0.00017926829268292684,
187
- "loss": 1.4681676475447603e-05,
188
- "mean_token_accuracy": 1.0,
189
- "num_tokens": 92321.0,
190
  "step": 18
191
  },
192
  {
193
- "entropy": 0.3753592735156417,
194
  "epoch": 0.2327718223583461,
195
- "grad_norm": 0.00019168853759765625,
196
  "learning_rate": 0.00017804878048780488,
197
- "loss": 2.8633825422730297e-05,
198
- "mean_token_accuracy": 1.0,
199
- "num_tokens": 97146.0,
200
  "step": 19
201
  },
202
  {
203
- "entropy": 0.3909287117421627,
204
  "epoch": 0.2450229709035222,
205
- "grad_norm": 0.0004482269287109375,
206
  "learning_rate": 0.00017682926829268295,
207
- "loss": 1.8875809473684058e-05,
208
- "mean_token_accuracy": 1.0,
209
- "num_tokens": 101943.0,
210
  "step": 20
211
  },
212
  {
213
- "entropy": 0.4073780719190836,
214
  "epoch": 0.2572741194486983,
215
- "grad_norm": 0.36328125,
216
  "learning_rate": 0.000175609756097561,
217
- "loss": 0.005490713287144899,
218
- "mean_token_accuracy": 0.9996448867022991,
219
- "num_tokens": 106772.0,
220
  "step": 21
221
  },
222
  {
223
- "entropy": 0.3673222251236439,
224
  "epoch": 0.26952526799387444,
225
- "grad_norm": 5.1975250244140625e-05,
226
  "learning_rate": 0.00017439024390243903,
227
- "loss": 1.0117664714925922e-05,
228
- "mean_token_accuracy": 1.0,
229
- "num_tokens": 112558.0,
230
  "step": 22
231
  },
232
  {
233
- "entropy": 0.39382114820182323,
234
  "epoch": 0.28177641653905056,
235
- "grad_norm": 0.0003662109375,
236
  "learning_rate": 0.00017317073170731708,
237
- "loss": 1.4868882317387033e-05,
238
- "mean_token_accuracy": 1.0,
239
- "num_tokens": 117489.0,
240
  "step": 23
241
  },
242
  {
243
- "entropy": 0.4107118733227253,
244
  "epoch": 0.29402756508422667,
245
- "grad_norm": 0.0009918212890625,
246
  "learning_rate": 0.00017195121951219512,
247
- "loss": 3.529411696945317e-05,
248
- "mean_token_accuracy": 1.0,
249
- "num_tokens": 123010.0,
250
  "step": 24
251
  },
252
  {
253
- "entropy": 0.3787885829806328,
254
  "epoch": 0.30627871362940273,
255
- "grad_norm": 0.005859375,
256
  "learning_rate": 0.0001707317073170732,
257
- "loss": 9.493537800153717e-05,
258
- "mean_token_accuracy": 1.0,
259
- "num_tokens": 127716.0,
260
  "step": 25
261
  },
262
  {
263
- "entropy": 0.37760412879288197,
264
  "epoch": 0.31852986217457885,
265
- "grad_norm": 0.00029754638671875,
266
  "learning_rate": 0.00016951219512195123,
267
- "loss": 1.7393856978742406e-05,
268
- "mean_token_accuracy": 1.0,
269
- "num_tokens": 132372.0,
270
  "step": 26
271
  },
272
  {
273
- "entropy": 0.38016335386782885,
274
  "epoch": 0.33078101071975496,
275
- "grad_norm": 0.0198974609375,
276
  "learning_rate": 0.00016829268292682927,
277
- "loss": 0.00031554378801956773,
278
- "mean_token_accuracy": 1.0,
279
- "num_tokens": 137028.0,
280
  "step": 27
281
  },
282
  {
283
- "entropy": 0.3974682204425335,
284
  "epoch": 0.3430321592649311,
285
- "grad_norm": 0.0546875,
286
  "learning_rate": 0.00016707317073170731,
287
- "loss": 0.0025693816132843494,
288
- "mean_token_accuracy": 0.9993556700646877,
289
- "num_tokens": 142088.0,
290
  "step": 28
291
  },
292
  {
293
- "entropy": 0.3819452077150345,
294
  "epoch": 0.3552833078101072,
295
- "grad_norm": 0.0137939453125,
296
  "learning_rate": 0.00016585365853658536,
297
- "loss": 0.0001885725650936365,
298
- "mean_token_accuracy": 1.0,
299
- "num_tokens": 147481.0,
300
  "step": 29
301
  },
302
  {
303
- "entropy": 0.41766250506043434,
304
  "epoch": 0.3675344563552833,
305
- "grad_norm": 0.000759124755859375,
306
  "learning_rate": 0.00016463414634146343,
307
- "loss": 1.8762426407192834e-05,
308
- "mean_token_accuracy": 1.0,
309
- "num_tokens": 152973.0,
310
  "step": 30
311
  },
312
  {
313
- "entropy": 0.42338451743125916,
314
  "epoch": 0.37978560490045943,
315
- "grad_norm": 0.00015735626220703125,
316
  "learning_rate": 0.00016341463414634147,
317
- "loss": 1.797903678379953e-05,
318
- "mean_token_accuracy": 1.0,
319
- "num_tokens": 156786.0,
320
  "step": 31
321
  },
322
  {
323
- "entropy": 0.41780348122119904,
324
  "epoch": 0.39203675344563554,
325
- "grad_norm": 0.00016117095947265625,
326
  "learning_rate": 0.00016219512195121954,
327
- "loss": 2.4896233298932202e-05,
328
- "mean_token_accuracy": 1.0,
329
- "num_tokens": 162859.0,
330
  "step": 32
331
  },
332
  {
333
- "entropy": 0.3986742924898863,
334
  "epoch": 0.40428790199081166,
335
- "grad_norm": 0.0003185272216796875,
336
  "learning_rate": 0.00016097560975609758,
337
- "loss": 2.1766518329968676e-05,
338
- "mean_token_accuracy": 1.0,
339
- "num_tokens": 167969.0,
340
  "step": 33
341
  },
342
  {
343
- "entropy": 0.40497588738799095,
344
  "epoch": 0.4165390505359878,
345
- "grad_norm": 0.00141143798828125,
346
  "learning_rate": 0.00015975609756097562,
347
- "loss": 5.013354166294448e-05,
348
- "mean_token_accuracy": 1.0,
349
- "num_tokens": 172518.0,
350
  "step": 34
351
  },
352
  {
353
- "entropy": 0.44378601387143135,
354
  "epoch": 0.42879019908116384,
355
- "grad_norm": 0.007415771484375,
356
  "learning_rate": 0.00015853658536585366,
357
- "loss": 0.00011341742356307805,
358
- "mean_token_accuracy": 1.0,
359
- "num_tokens": 177085.0,
360
  "step": 35
361
  },
362
  {
363
- "entropy": 0.44088135845959187,
364
  "epoch": 0.44104134762633995,
365
- "grad_norm": 0.0224609375,
366
  "learning_rate": 0.00015731707317073173,
367
- "loss": 0.0003354589862283319,
368
- "mean_token_accuracy": 1.0,
369
- "num_tokens": 181617.0,
370
  "step": 36
371
  },
372
  {
373
- "entropy": 0.40403734613209963,
374
  "epoch": 0.45329249617151607,
375
- "grad_norm": 0.09326171875,
376
  "learning_rate": 0.00015609756097560978,
377
- "loss": 0.0009270600858144462,
378
- "mean_token_accuracy": 0.9998405613005161,
379
- "num_tokens": 186836.0,
380
  "step": 37
381
  },
382
  {
383
- "entropy": 0.44129026494920254,
384
  "epoch": 0.4655436447166922,
385
- "grad_norm": 0.0001068115234375,
386
  "learning_rate": 0.00015487804878048782,
387
- "loss": 1.9685152437887155e-05,
388
- "mean_token_accuracy": 1.0,
389
- "num_tokens": 191224.0,
390
  "step": 38
391
  },
392
  {
393
- "entropy": 0.41146982461214066,
394
  "epoch": 0.4777947932618683,
395
- "grad_norm": 6.437301635742188e-05,
396
  "learning_rate": 0.00015365853658536586,
397
- "loss": 1.4887214092595968e-05,
398
- "mean_token_accuracy": 1.0,
399
- "num_tokens": 195926.0,
400
  "step": 39
401
  },
402
  {
403
- "entropy": 0.4401062335819006,
404
  "epoch": 0.4900459418070444,
405
- "grad_norm": 0.0125732421875,
406
  "learning_rate": 0.0001524390243902439,
407
- "loss": 0.0006239329231902957,
408
- "mean_token_accuracy": 0.999550361186266,
409
- "num_tokens": 200772.0,
410
  "step": 40
411
  },
412
  {
413
- "entropy": 0.4169564712792635,
414
  "epoch": 0.5022970903522205,
415
- "grad_norm": 0.000118255615234375,
416
  "learning_rate": 0.00015121951219512197,
417
- "loss": 2.6680882001528516e-05,
418
- "mean_token_accuracy": 1.0,
419
- "num_tokens": 204499.0,
420
  "step": 41
421
  },
422
  {
423
- "entropy": 0.45378032699227333,
424
  "epoch": 0.5145482388973966,
425
- "grad_norm": 0.00011491775512695312,
426
  "learning_rate": 0.00015000000000000001,
427
- "loss": 2.471652624080889e-05,
428
- "mean_token_accuracy": 1.0,
429
- "num_tokens": 208814.0,
430
  "step": 42
431
  },
432
  {
433
- "entropy": 0.4465767778456211,
434
  "epoch": 0.5267993874425727,
435
- "grad_norm": 0.000263214111328125,
436
  "learning_rate": 0.00014878048780487806,
437
- "loss": 3.366273449501023e-05,
438
- "mean_token_accuracy": 1.0,
439
- "num_tokens": 213907.0,
440
  "step": 43
441
  },
442
  {
443
- "entropy": 0.4534517452120781,
444
  "epoch": 0.5390505359877489,
445
- "grad_norm": 0.000728607177734375,
446
  "learning_rate": 0.0001475609756097561,
447
- "loss": 2.826840864145197e-05,
448
- "mean_token_accuracy": 1.0,
449
- "num_tokens": 218988.0,
450
  "step": 44
451
  },
452
  {
453
- "entropy": 0.4201868511736393,
454
  "epoch": 0.5513016845329249,
455
- "grad_norm": 0.0196533203125,
456
  "learning_rate": 0.00014634146341463414,
457
- "loss": 0.000961265352088958,
458
- "mean_token_accuracy": 0.9995967745780945,
459
- "num_tokens": 223595.0,
460
  "step": 45
461
  },
462
  {
463
- "entropy": 0.4538087658584118,
464
  "epoch": 0.5635528330781011,
465
- "grad_norm": 0.000629425048828125,
466
  "learning_rate": 0.0001451219512195122,
467
- "loss": 2.982705154863652e-05,
468
- "mean_token_accuracy": 1.0,
469
- "num_tokens": 228244.0,
470
  "step": 46
471
  },
472
  {
473
- "entropy": 0.43760119564831257,
474
  "epoch": 0.5758039816232772,
475
- "grad_norm": 6.151199340820312e-05,
476
  "learning_rate": 0.00014390243902439025,
477
- "loss": 1.6359297660528682e-05,
478
- "mean_token_accuracy": 1.0,
479
- "num_tokens": 232606.0,
480
  "step": 47
481
  },
482
  {
483
- "entropy": 0.44127281196415424,
484
  "epoch": 0.5880551301684533,
485
- "grad_norm": 9.632110595703125e-05,
486
  "learning_rate": 0.0001426829268292683,
487
- "loss": 2.9222681405371986e-05,
488
- "mean_token_accuracy": 1.0,
489
- "num_tokens": 236563.0,
490
  "step": 48
491
  },
492
  {
493
- "entropy": 0.4647264387458563,
494
  "epoch": 0.6003062787136294,
495
- "grad_norm": 6.818771362304688e-05,
496
  "learning_rate": 0.00014146341463414634,
497
- "loss": 1.6634010535199195e-05,
498
- "mean_token_accuracy": 1.0,
499
- "num_tokens": 241214.0,
500
  "step": 49
501
  },
502
  {
503
- "entropy": 0.43234376423060894,
504
  "epoch": 0.6125574272588055,
505
- "grad_norm": 9.107589721679688e-05,
506
  "learning_rate": 0.00014024390243902438,
507
- "loss": 2.512251739972271e-05,
508
- "mean_token_accuracy": 1.0,
509
- "num_tokens": 245200.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
- "eval_entropy": 0.42710635541141895,
515
- "eval_loss": 0.0009002267033793032,
516
- "eval_mean_token_accuracy": 0.9997843339823295,
517
- "eval_num_tokens": 245200.0,
518
- "eval_runtime": 51.2948,
519
- "eval_samples_per_second": 1.345,
520
- "eval_steps_per_second": 1.345,
521
  "step": 50
522
  },
523
  {
524
- "entropy": 0.436727499589324,
525
  "epoch": 0.6248085758039816,
526
- "grad_norm": 6.079673767089844e-05,
527
  "learning_rate": 0.00013902439024390245,
528
- "loss": 1.7863472749013454e-05,
529
- "mean_token_accuracy": 1.0,
530
- "num_tokens": 249761.0,
531
  "step": 51
532
  },
533
  {
534
- "entropy": 0.4489326383918524,
535
  "epoch": 0.6370597243491577,
536
- "grad_norm": 0.010009765625,
537
  "learning_rate": 0.0001378048780487805,
538
- "loss": 9.14962001843378e-05,
539
- "mean_token_accuracy": 1.0,
540
- "num_tokens": 254787.0,
541
  "step": 52
542
  },
543
  {
544
- "entropy": 0.4518893454223871,
545
  "epoch": 0.6493108728943339,
546
- "grad_norm": 0.029052734375,
547
  "learning_rate": 0.00013658536585365856,
548
- "loss": 0.002504949690774083,
549
- "mean_token_accuracy": 0.9991238303482533,
550
- "num_tokens": 260287.0,
551
  "step": 53
552
  },
553
  {
554
- "entropy": 0.4276025863364339,
555
  "epoch": 0.6615620214395099,
556
- "grad_norm": 0.00022411346435546875,
557
  "learning_rate": 0.0001353658536585366,
558
- "loss": 1.9805909687420353e-05,
559
- "mean_token_accuracy": 1.0,
560
- "num_tokens": 264810.0,
561
  "step": 54
562
  },
563
  {
564
- "entropy": 0.4455657321959734,
565
  "epoch": 0.6738131699846861,
566
- "grad_norm": 0.09912109375,
567
  "learning_rate": 0.00013414634146341464,
568
- "loss": 0.005040395073592663,
569
- "mean_token_accuracy": 0.9974489808082581,
570
- "num_tokens": 270386.0,
571
  "step": 55
572
  },
573
  {
574
- "entropy": 0.48375592939555645,
575
  "epoch": 0.6860643185298622,
576
- "grad_norm": 0.00020694732666015625,
577
  "learning_rate": 0.0001329268292682927,
578
- "loss": 3.307354199932888e-05,
579
- "mean_token_accuracy": 1.0,
580
- "num_tokens": 274391.0,
581
  "step": 56
582
  },
583
  {
584
- "entropy": 0.4558328855782747,
585
  "epoch": 0.6983154670750383,
586
- "grad_norm": 0.00011205673217773438,
587
  "learning_rate": 0.00013170731707317076,
588
- "loss": 2.9195363822509535e-05,
589
- "mean_token_accuracy": 1.0,
590
- "num_tokens": 279716.0,
591
  "step": 57
592
  },
593
  {
594
- "entropy": 0.4038175716996193,
595
  "epoch": 0.7105666156202144,
596
- "grad_norm": 0.130859375,
597
  "learning_rate": 0.0001304878048780488,
598
- "loss": 0.002872227458283305,
599
- "mean_token_accuracy": 0.9989018365740776,
600
- "num_tokens": 285404.0,
601
  "step": 58
602
  },
603
  {
604
- "entropy": 0.4584309756755829,
605
  "epoch": 0.7228177641653905,
606
- "grad_norm": 0.02294921875,
607
  "learning_rate": 0.00012926829268292684,
608
- "loss": 0.0006162020144984126,
609
- "mean_token_accuracy": 0.9997650384902954,
610
- "num_tokens": 289992.0,
611
  "step": 59
612
  },
613
  {
614
- "entropy": 0.47067076340317726,
615
  "epoch": 0.7350689127105666,
616
- "grad_norm": 5.14984130859375e-05,
617
  "learning_rate": 0.00012804878048780488,
618
- "loss": 1.8253980670124292e-05,
619
- "mean_token_accuracy": 1.0,
620
- "num_tokens": 294861.0,
621
  "step": 60
622
  },
623
  {
624
- "entropy": 0.4258435070514679,
625
  "epoch": 0.7473200612557427,
626
- "grad_norm": 6.437301635742188e-05,
627
  "learning_rate": 0.00012682926829268293,
628
- "loss": 2.3211847292259336e-05,
629
- "mean_token_accuracy": 1.0,
630
- "num_tokens": 300355.0,
631
  "step": 61
632
  },
633
  {
634
- "entropy": 0.4751600846648216,
635
  "epoch": 0.7595712098009189,
636
- "grad_norm": 0.0001201629638671875,
637
  "learning_rate": 0.000125609756097561,
638
- "loss": 2.862562905647792e-05,
639
- "mean_token_accuracy": 1.0,
640
- "num_tokens": 305776.0,
641
  "step": 62
642
  },
643
  {
644
- "entropy": 0.43714143335819244,
645
  "epoch": 0.7718223583460949,
646
- "grad_norm": 8.153915405273438e-05,
647
  "learning_rate": 0.00012439024390243904,
648
- "loss": 2.0440007574507035e-05,
649
- "mean_token_accuracy": 1.0,
650
- "num_tokens": 310204.0,
651
  "step": 63
652
  },
653
  {
654
- "entropy": 0.436653483659029,
655
  "epoch": 0.7840735068912711,
656
- "grad_norm": 9.298324584960938e-05,
657
  "learning_rate": 0.00012317073170731708,
658
- "loss": 2.5547835321049206e-05,
659
- "mean_token_accuracy": 1.0,
660
- "num_tokens": 314205.0,
661
  "step": 64
662
  },
663
  {
664
- "entropy": 0.4625023826956749,
665
  "epoch": 0.7963246554364471,
666
- "grad_norm": 4.9591064453125e-05,
667
  "learning_rate": 0.00012195121951219512,
668
- "loss": 1.6659454558975995e-05,
669
- "mean_token_accuracy": 1.0,
670
- "num_tokens": 319157.0,
671
  "step": 65
672
  },
673
  {
674
- "entropy": 0.45398022420704365,
675
  "epoch": 0.8085758039816233,
676
- "grad_norm": 0.0004730224609375,
677
  "learning_rate": 0.00012073170731707318,
678
- "loss": 2.831750134646427e-05,
679
- "mean_token_accuracy": 1.0,
680
- "num_tokens": 324681.0,
681
  "step": 66
682
  },
683
  {
684
- "entropy": 0.39901847764849663,
685
  "epoch": 0.8208269525267994,
686
- "grad_norm": 0.0113525390625,
687
  "learning_rate": 0.00011951219512195122,
688
- "loss": 0.0010163490660488605,
689
- "mean_token_accuracy": 0.9993686862289906,
690
- "num_tokens": 329929.0,
691
  "step": 67
692
  },
693
  {
694
- "entropy": 0.43489386700093746,
695
  "epoch": 0.8330781010719756,
696
- "grad_norm": 0.0002841949462890625,
697
  "learning_rate": 0.00011829268292682926,
698
- "loss": 3.556731462595053e-05,
699
- "mean_token_accuracy": 1.0,
700
- "num_tokens": 334474.0,
701
  "step": 68
702
  },
703
  {
704
- "entropy": 0.43658433854579926,
705
  "epoch": 0.8453292496171516,
706
- "grad_norm": 0.00021457672119140625,
707
  "learning_rate": 0.00011707317073170732,
708
- "loss": 3.145977098029107e-05,
709
- "mean_token_accuracy": 1.0,
710
- "num_tokens": 338171.0,
711
  "step": 69
712
  },
713
  {
714
- "entropy": 0.47345293685793877,
715
  "epoch": 0.8575803981623277,
716
- "grad_norm": 0.04052734375,
717
  "learning_rate": 0.00011585365853658536,
718
- "loss": 0.006434774026274681,
719
- "mean_token_accuracy": 0.9988360889256,
720
- "num_tokens": 342581.0,
721
  "step": 70
722
  },
723
  {
724
- "entropy": 0.47144644521176815,
725
  "epoch": 0.8698315467075038,
726
- "grad_norm": 0.03857421875,
727
  "learning_rate": 0.00011463414634146342,
728
- "loss": 0.0040056235156953335,
729
- "mean_token_accuracy": 0.9997807033360004,
730
- "num_tokens": 347785.0,
731
  "step": 71
732
  },
733
  {
734
- "entropy": 0.44001554138958454,
735
  "epoch": 0.8820826952526799,
736
- "grad_norm": 0.00081634521484375,
737
  "learning_rate": 0.00011341463414634146,
738
- "loss": 3.297243165434338e-05,
739
- "mean_token_accuracy": 1.0,
740
- "num_tokens": 352109.0,
741
  "step": 72
742
  },
743
  {
744
- "entropy": 0.44880508445203304,
745
  "epoch": 0.8943338437978561,
746
- "grad_norm": 0.0002689361572265625,
747
  "learning_rate": 0.00011219512195121953,
748
- "loss": 2.6160523702856153e-05,
749
- "mean_token_accuracy": 1.0,
750
- "num_tokens": 357931.0,
751
  "step": 73
752
  },
753
  {
754
- "entropy": 0.41770973429083824,
755
  "epoch": 0.9065849923430321,
756
- "grad_norm": 0.0002231597900390625,
757
  "learning_rate": 0.00011097560975609757,
758
- "loss": 3.9217924495460466e-05,
759
- "mean_token_accuracy": 1.0,
760
- "num_tokens": 363802.0,
761
  "step": 74
762
  },
763
  {
764
- "entropy": 0.45532275550067425,
765
  "epoch": 0.9188361408882083,
766
- "grad_norm": 6.389617919921875e-05,
767
  "learning_rate": 0.00010975609756097563,
768
- "loss": 2.482662421243731e-05,
769
- "mean_token_accuracy": 1.0,
770
- "num_tokens": 368858.0,
771
  "step": 75
772
  },
773
  {
774
- "entropy": 0.4533053319901228,
775
  "epoch": 0.9310872894333844,
776
- "grad_norm": 0.000492095947265625,
777
  "learning_rate": 0.00010853658536585367,
778
- "loss": 3.297019793535583e-05,
779
- "mean_token_accuracy": 1.0,
780
- "num_tokens": 373658.0,
781
  "step": 76
782
  },
783
  {
784
- "entropy": 0.4135119281709194,
785
  "epoch": 0.9433384379785605,
786
- "grad_norm": 0.000347137451171875,
787
  "learning_rate": 0.00010731707317073172,
788
- "loss": 3.026250487891957e-05,
789
- "mean_token_accuracy": 1.0,
790
- "num_tokens": 379025.0,
791
  "step": 77
792
  },
793
  {
794
- "entropy": 0.44705197028815746,
795
  "epoch": 0.9555895865237366,
796
- "grad_norm": 0.00067901611328125,
797
  "learning_rate": 0.00010609756097560977,
798
- "loss": 4.355545388534665e-05,
799
- "mean_token_accuracy": 1.0,
800
- "num_tokens": 384240.0,
801
  "step": 78
802
  },
803
  {
804
- "entropy": 0.459016814827919,
805
  "epoch": 0.9678407350689127,
806
- "grad_norm": 0.00098419189453125,
807
  "learning_rate": 0.00010487804878048781,
808
- "loss": 4.3970921979052946e-05,
809
- "mean_token_accuracy": 1.0,
810
- "num_tokens": 388335.0,
811
  "step": 79
812
  },
813
  {
814
- "entropy": 0.4241188894957304,
815
  "epoch": 0.9800918836140888,
816
- "grad_norm": 0.07275390625,
817
  "learning_rate": 0.00010365853658536586,
818
- "loss": 0.009294007904827595,
819
- "mean_token_accuracy": 0.9970472455024719,
820
- "num_tokens": 394201.0,
821
  "step": 80
822
  },
823
  {
824
- "entropy": 0.4442194551229477,
825
  "epoch": 0.9923430321592649,
826
- "grad_norm": 0.000377655029296875,
827
  "learning_rate": 0.0001024390243902439,
828
- "loss": 3.1872321414994076e-05,
829
- "mean_token_accuracy": 1.0,
830
- "num_tokens": 399101.0,
831
  "step": 81
832
  },
833
  {
834
- "entropy": 0.429327929019928,
835
  "epoch": 1.0,
836
- "grad_norm": 0.00064849853515625,
837
  "learning_rate": 0.00010121951219512196,
838
- "loss": 3.4027863875962794e-05,
839
- "mean_token_accuracy": 1.0,
840
- "num_tokens": 402117.0,
841
  "step": 82
842
  },
843
  {
844
- "entropy": 0.4472597725689411,
845
  "epoch": 1.0122511485451762,
846
- "grad_norm": 0.08056640625,
847
  "learning_rate": 0.0001,
848
- "loss": 0.005052679218351841,
849
- "mean_token_accuracy": 0.9986319616436958,
850
- "num_tokens": 406748.0,
851
  "step": 83
852
  },
853
  {
854
- "entropy": 0.4647933579981327,
855
  "epoch": 1.0245022970903521,
856
- "grad_norm": 0.0001888275146484375,
857
  "learning_rate": 9.878048780487805e-05,
858
- "loss": 3.911805833922699e-05,
859
- "mean_token_accuracy": 1.0,
860
- "num_tokens": 411354.0,
861
  "step": 84
862
  },
863
  {
864
- "entropy": 0.49184724502265453,
865
  "epoch": 1.0367534456355283,
866
- "grad_norm": 0.0009307861328125,
867
  "learning_rate": 9.75609756097561e-05,
868
- "loss": 6.517933798022568e-05,
869
- "mean_token_accuracy": 1.0,
870
- "num_tokens": 417755.0,
871
  "step": 85
872
  },
873
  {
874
- "entropy": 0.45203530229628086,
875
  "epoch": 1.0490045941807045,
876
- "grad_norm": 0.00017547607421875,
877
  "learning_rate": 9.634146341463415e-05,
878
- "loss": 4.648843969334848e-05,
879
- "mean_token_accuracy": 1.0,
880
- "num_tokens": 421725.0,
881
  "step": 86
882
  },
883
  {
884
- "entropy": 0.44451451301574707,
885
  "epoch": 1.0612557427258806,
886
- "grad_norm": 0.00012493133544921875,
887
  "learning_rate": 9.51219512195122e-05,
888
- "loss": 3.813640068983659e-05,
889
- "mean_token_accuracy": 1.0,
890
- "num_tokens": 426841.0,
891
  "step": 87
892
  },
893
  {
894
- "entropy": 0.5532373636960983,
895
  "epoch": 1.0735068912710566,
896
- "grad_norm": 0.0004425048828125,
897
  "learning_rate": 9.390243902439024e-05,
898
- "loss": 8.416183845838532e-05,
899
- "mean_token_accuracy": 1.0,
900
- "num_tokens": 431070.0,
901
  "step": 88
902
  },
903
  {
904
- "entropy": 0.5114028844982386,
905
  "epoch": 1.0857580398162328,
906
- "grad_norm": 0.021484375,
907
  "learning_rate": 9.26829268292683e-05,
908
- "loss": 0.0013321326114237309,
909
- "mean_token_accuracy": 0.9995535723865032,
910
- "num_tokens": 435681.0,
911
  "step": 89
912
  },
913
  {
914
- "entropy": 0.48618660122156143,
915
  "epoch": 1.098009188361409,
916
- "grad_norm": 0.0002498626708984375,
917
  "learning_rate": 9.146341463414635e-05,
918
- "loss": 6.904367910465226e-05,
919
- "mean_token_accuracy": 1.0,
920
- "num_tokens": 440565.0,
921
  "step": 90
922
  },
923
  {
924
- "entropy": 0.5016148556023836,
925
  "epoch": 1.110260336906585,
926
- "grad_norm": 0.019287109375,
927
  "learning_rate": 9.02439024390244e-05,
928
- "loss": 0.0020695198327302933,
929
- "mean_token_accuracy": 0.9993686862289906,
930
- "num_tokens": 445241.0,
931
  "step": 91
932
  },
933
  {
934
- "entropy": 0.5162393897771835,
935
  "epoch": 1.122511485451761,
936
- "grad_norm": 0.04638671875,
937
  "learning_rate": 8.902439024390244e-05,
938
- "loss": 0.0038224293384701014,
939
- "mean_token_accuracy": 0.9989322870969772,
940
- "num_tokens": 449996.0,
941
  "step": 92
942
  },
943
  {
944
- "entropy": 0.47938764840364456,
945
  "epoch": 1.1347626339969372,
946
- "grad_norm": 0.0028533935546875,
947
  "learning_rate": 8.78048780487805e-05,
948
- "loss": 0.00016397782019339502,
949
- "mean_token_accuracy": 1.0,
950
- "num_tokens": 454979.0,
951
  "step": 93
952
  },
953
  {
954
- "entropy": 0.5016432590782642,
955
  "epoch": 1.1470137825421134,
956
- "grad_norm": 0.000400543212890625,
957
  "learning_rate": 8.658536585365854e-05,
958
- "loss": 0.00010612564074108377,
959
- "mean_token_accuracy": 1.0,
960
- "num_tokens": 459674.0,
961
  "step": 94
962
  },
963
  {
964
- "entropy": 0.5095659829676151,
965
  "epoch": 1.1592649310872893,
966
- "grad_norm": 0.0004520416259765625,
967
  "learning_rate": 8.53658536585366e-05,
968
- "loss": 0.00011354458547430113,
969
- "mean_token_accuracy": 1.0,
970
- "num_tokens": 464313.0,
971
  "step": 95
972
  },
973
  {
974
- "entropy": 0.4933694824576378,
975
  "epoch": 1.1715160796324655,
976
- "grad_norm": 0.0556640625,
977
  "learning_rate": 8.414634146341464e-05,
978
- "loss": 0.004786409437656403,
979
- "mean_token_accuracy": 0.9988460540771484,
980
- "num_tokens": 468858.0,
981
  "step": 96
982
  },
983
  {
984
- "entropy": 0.5068543236702681,
985
  "epoch": 1.1837672281776417,
986
- "grad_norm": 0.000492095947265625,
987
  "learning_rate": 8.292682926829268e-05,
988
- "loss": 9.500309533905238e-05,
989
- "mean_token_accuracy": 1.0,
990
- "num_tokens": 473732.0,
991
  "step": 97
992
  },
993
  {
994
- "entropy": 0.502707714214921,
995
  "epoch": 1.1960183767228179,
996
- "grad_norm": 0.026123046875,
997
  "learning_rate": 8.170731707317073e-05,
998
- "loss": 0.002030049916356802,
999
- "mean_token_accuracy": 0.9993131868541241,
1000
- "num_tokens": 479037.0,
1001
  "step": 98
1002
  },
1003
  {
1004
- "entropy": 0.5147993545979261,
1005
  "epoch": 1.2082695252679938,
1006
- "grad_norm": 0.000354766845703125,
1007
  "learning_rate": 8.048780487804879e-05,
1008
- "loss": 6.365451554302126e-05,
1009
- "mean_token_accuracy": 1.0,
1010
- "num_tokens": 484811.0,
1011
  "step": 99
1012
  },
1013
  {
1014
- "entropy": 0.4471734017133713,
1015
  "epoch": 1.22052067381317,
1016
- "grad_norm": 0.0047607421875,
1017
  "learning_rate": 7.926829268292683e-05,
1018
- "loss": 0.0003211660368833691,
1019
- "mean_token_accuracy": 1.0,
1020
- "num_tokens": 489522.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
- "eval_entropy": 0.4696715573469798,
1026
- "eval_loss": 0.0007750109070912004,
1027
- "eval_mean_token_accuracy": 0.9997843339823295,
1028
- "eval_num_tokens": 489522.0,
1029
- "eval_runtime": 51.321,
1030
- "eval_samples_per_second": 1.344,
1031
- "eval_steps_per_second": 1.344,
1032
  "step": 100
1033
  },
1034
  {
1035
- "entropy": 0.4984112149104476,
1036
  "epoch": 1.2327718223583461,
1037
- "grad_norm": 0.0001850128173828125,
1038
  "learning_rate": 7.804878048780489e-05,
1039
- "loss": 5.6583492550998926e-05,
1040
- "mean_token_accuracy": 1.0,
1041
- "num_tokens": 494757.0,
1042
  "step": 101
1043
  },
1044
  {
1045
- "entropy": 0.46644425205886364,
1046
  "epoch": 1.245022970903522,
1047
- "grad_norm": 0.0001506805419921875,
1048
  "learning_rate": 7.682926829268293e-05,
1049
- "loss": 5.076146044302732e-05,
1050
- "mean_token_accuracy": 1.0,
1051
- "num_tokens": 499837.0,
1052
  "step": 102
1053
  },
1054
  {
1055
- "entropy": 0.4746809806674719,
1056
  "epoch": 1.2572741194486983,
1057
- "grad_norm": 0.00015354156494140625,
1058
  "learning_rate": 7.560975609756099e-05,
1059
- "loss": 5.508732647285797e-05,
1060
- "mean_token_accuracy": 1.0,
1061
- "num_tokens": 505267.0,
1062
  "step": 103
1063
  },
1064
  {
1065
- "entropy": 0.47748516872525215,
1066
  "epoch": 1.2695252679938744,
1067
- "grad_norm": 0.0001277923583984375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
- "loss": 4.464950325200334e-05,
1070
- "mean_token_accuracy": 1.0,
1071
- "num_tokens": 510260.0,
1072
  "step": 104
1073
  },
1074
  {
1075
- "entropy": 0.49103316478431225,
1076
  "epoch": 1.2817764165390506,
1077
- "grad_norm": 0.00689697265625,
1078
  "learning_rate": 7.317073170731707e-05,
1079
- "loss": 0.000652994611300528,
1080
- "mean_token_accuracy": 0.9993556700646877,
1081
- "num_tokens": 514493.0,
1082
  "step": 105
1083
  },
1084
  {
1085
- "entropy": 0.4787591751664877,
1086
  "epoch": 1.2940275650842268,
1087
- "grad_norm": 0.0003795623779296875,
1088
  "learning_rate": 7.195121951219513e-05,
1089
- "loss": 4.17455485148821e-05,
1090
- "mean_token_accuracy": 1.0,
1091
- "num_tokens": 519511.0,
1092
  "step": 106
1093
  },
1094
  {
1095
- "entropy": 0.46200828067958355,
1096
  "epoch": 1.3062787136294027,
1097
- "grad_norm": 0.0001678466796875,
1098
  "learning_rate": 7.073170731707317e-05,
1099
- "loss": 4.6432032831944525e-05,
1100
- "mean_token_accuracy": 1.0,
1101
- "num_tokens": 524373.0,
1102
  "step": 107
1103
  },
1104
  {
1105
- "entropy": 0.4632429350167513,
1106
  "epoch": 1.318529862174579,
1107
- "grad_norm": 0.00019073486328125,
1108
  "learning_rate": 6.951219512195122e-05,
1109
- "loss": 4.138273652642965e-05,
1110
- "mean_token_accuracy": 1.0,
1111
- "num_tokens": 528973.0,
1112
  "step": 108
1113
  },
1114
  {
1115
- "entropy": 0.4669873770326376,
1116
  "epoch": 1.3307810107197549,
1117
- "grad_norm": 0.000301361083984375,
1118
  "learning_rate": 6.829268292682928e-05,
1119
- "loss": 4.5484361180569977e-05,
1120
- "mean_token_accuracy": 1.0,
1121
- "num_tokens": 533941.0,
1122
  "step": 109
1123
  },
1124
  {
1125
- "entropy": 0.45179494842886925,
1126
  "epoch": 1.343032159264931,
1127
- "grad_norm": 0.00010776519775390625,
1128
  "learning_rate": 6.707317073170732e-05,
1129
- "loss": 3.3365573472110555e-05,
1130
- "mean_token_accuracy": 1.0,
1131
- "num_tokens": 539363.0,
1132
  "step": 110
1133
  },
1134
  {
1135
- "entropy": 0.438027735799551,
1136
  "epoch": 1.3552833078101072,
1137
- "grad_norm": 0.00014972686767578125,
1138
  "learning_rate": 6.585365853658538e-05,
1139
- "loss": 4.3530206312425435e-05,
1140
- "mean_token_accuracy": 1.0,
1141
- "num_tokens": 543731.0,
1142
  "step": 111
1143
  },
1144
  {
1145
- "entropy": 0.4696179609745741,
1146
  "epoch": 1.3675344563552834,
1147
- "grad_norm": 0.028076171875,
1148
  "learning_rate": 6.463414634146342e-05,
1149
- "loss": 0.00529400585219264,
1150
- "mean_token_accuracy": 0.9985632188618183,
1151
- "num_tokens": 548164.0,
1152
  "step": 112
1153
  },
1154
  {
1155
- "entropy": 0.4698081314563751,
1156
  "epoch": 1.3797856049004595,
1157
- "grad_norm": 0.00885009765625,
1158
  "learning_rate": 6.341463414634146e-05,
1159
- "loss": 0.0005042221746407449,
1160
- "mean_token_accuracy": 0.9995039664208889,
1161
- "num_tokens": 553693.0,
1162
  "step": 113
1163
  },
1164
  {
1165
- "entropy": 0.45541019923985004,
1166
  "epoch": 1.3920367534456355,
1167
- "grad_norm": 9.393692016601562e-05,
1168
  "learning_rate": 6.219512195121952e-05,
1169
- "loss": 3.189211565768346e-05,
1170
- "mean_token_accuracy": 1.0,
1171
- "num_tokens": 558477.0,
1172
  "step": 114
1173
  },
1174
  {
1175
- "entropy": 0.46046129800379276,
1176
  "epoch": 1.4042879019908117,
1177
- "grad_norm": 0.0001392364501953125,
1178
  "learning_rate": 6.097560975609756e-05,
1179
- "loss": 3.399374691070989e-05,
1180
- "mean_token_accuracy": 1.0,
1181
- "num_tokens": 563965.0,
1182
  "step": 115
1183
  },
1184
  {
1185
- "entropy": 0.49661404080688953,
1186
  "epoch": 1.4165390505359878,
1187
- "grad_norm": 0.0004062652587890625,
1188
  "learning_rate": 5.975609756097561e-05,
1189
- "loss": 5.0347538490314037e-05,
1190
- "mean_token_accuracy": 1.0,
1191
- "num_tokens": 568303.0,
1192
  "step": 116
1193
  },
1194
  {
1195
- "entropy": 0.4603871125727892,
1196
  "epoch": 1.4287901990811638,
1197
- "grad_norm": 9.870529174804688e-05,
1198
  "learning_rate": 5.853658536585366e-05,
1199
- "loss": 3.4569777199067175e-05,
1200
- "mean_token_accuracy": 1.0,
1201
- "num_tokens": 572895.0,
1202
  "step": 117
1203
  },
1204
  {
1205
- "entropy": 0.47774807177484035,
1206
  "epoch": 1.44104134762634,
1207
- "grad_norm": 0.00012063980102539062,
1208
  "learning_rate": 5.731707317073171e-05,
1209
- "loss": 4.4718148274114355e-05,
1210
- "mean_token_accuracy": 1.0,
1211
- "num_tokens": 577892.0,
1212
  "step": 118
1213
  },
1214
  {
1215
- "entropy": 0.4559262488037348,
1216
  "epoch": 1.4532924961715161,
1217
- "grad_norm": 8.440017700195312e-05,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
- "loss": 2.7120513550471514e-05,
1220
- "mean_token_accuracy": 1.0,
1221
- "num_tokens": 583128.0,
1222
  "step": 119
1223
  },
1224
  {
1225
- "entropy": 0.4927012659609318,
1226
  "epoch": 1.4655436447166923,
1227
- "grad_norm": 0.00011539459228515625,
1228
  "learning_rate": 5.487804878048781e-05,
1229
- "loss": 3.757046943064779e-05,
1230
- "mean_token_accuracy": 1.0,
1231
- "num_tokens": 587856.0,
1232
  "step": 120
1233
  },
1234
  {
1235
- "entropy": 0.43140678480267525,
1236
  "epoch": 1.4777947932618682,
1237
- "grad_norm": 0.000125885009765625,
1238
  "learning_rate": 5.365853658536586e-05,
1239
- "loss": 3.988837852375582e-05,
1240
- "mean_token_accuracy": 1.0,
1241
- "num_tokens": 592260.0,
1242
  "step": 121
1243
  },
1244
  {
1245
- "entropy": 0.46533982269465923,
1246
  "epoch": 1.4900459418070444,
1247
- "grad_norm": 9.822845458984375e-05,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
- "loss": 3.350730548845604e-05,
1250
- "mean_token_accuracy": 1.0,
1251
- "num_tokens": 597022.0,
1252
  "step": 122
1253
  },
1254
  {
1255
- "entropy": 0.4450340513139963,
1256
  "epoch": 1.5022970903522204,
1257
- "grad_norm": 0.00018596649169921875,
1258
  "learning_rate": 5.121951219512195e-05,
1259
- "loss": 4.867902316618711e-05,
1260
- "mean_token_accuracy": 1.0,
1261
- "num_tokens": 601326.0,
1262
  "step": 123
1263
  },
1264
  {
1265
- "entropy": 0.4453680943697691,
1266
  "epoch": 1.5145482388973965,
1267
- "grad_norm": 0.000270843505859375,
1268
  "learning_rate": 5e-05,
1269
- "loss": 4.58945614809636e-05,
1270
- "mean_token_accuracy": 1.0,
1271
- "num_tokens": 606619.0,
1272
  "step": 124
1273
  },
1274
  {
1275
- "entropy": 0.4738515168428421,
1276
  "epoch": 1.5267993874425727,
1277
- "grad_norm": 6.866455078125e-05,
1278
  "learning_rate": 4.878048780487805e-05,
1279
- "loss": 3.125666262349114e-05,
1280
- "mean_token_accuracy": 1.0,
1281
- "num_tokens": 612381.0,
1282
  "step": 125
1283
  },
1284
  {
1285
- "entropy": 0.4711528979241848,
1286
  "epoch": 1.5390505359877489,
1287
- "grad_norm": 0.0003032684326171875,
1288
  "learning_rate": 4.75609756097561e-05,
1289
- "loss": 4.3324482248863205e-05,
1290
- "mean_token_accuracy": 1.0,
1291
- "num_tokens": 617203.0,
1292
  "step": 126
1293
  },
1294
  {
1295
- "entropy": 0.4728289693593979,
1296
  "epoch": 1.551301684532925,
1297
- "grad_norm": 0.01611328125,
1298
  "learning_rate": 4.634146341463415e-05,
1299
- "loss": 0.00017536790983285755,
1300
- "mean_token_accuracy": 1.0,
1301
- "num_tokens": 622329.0,
1302
  "step": 127
1303
  },
1304
  {
1305
- "entropy": 0.48075354285538197,
1306
  "epoch": 1.5635528330781012,
1307
- "grad_norm": 0.000751495361328125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
- "loss": 5.28718919667881e-05,
1310
- "mean_token_accuracy": 1.0,
1311
- "num_tokens": 627243.0,
1312
  "step": 128
1313
  },
1314
  {
1315
- "entropy": 0.43419913947582245,
1316
  "epoch": 1.5758039816232772,
1317
- "grad_norm": 0.0001850128173828125,
1318
  "learning_rate": 4.390243902439025e-05,
1319
- "loss": 4.585986243910156e-05,
1320
- "mean_token_accuracy": 1.0,
1321
- "num_tokens": 631428.0,
1322
  "step": 129
1323
  },
1324
  {
1325
- "entropy": 0.4347258824855089,
1326
  "epoch": 1.5880551301684533,
1327
- "grad_norm": 0.0003814697265625,
1328
  "learning_rate": 4.26829268292683e-05,
1329
- "loss": 5.289731052471325e-05,
1330
- "mean_token_accuracy": 1.0,
1331
- "num_tokens": 636476.0,
1332
  "step": 130
1333
  },
1334
  {
1335
- "entropy": 0.44714186899363995,
1336
  "epoch": 1.6003062787136293,
1337
- "grad_norm": 0.04541015625,
1338
  "learning_rate": 4.146341463414634e-05,
1339
- "loss": 0.003742673434317112,
1340
- "mean_token_accuracy": 0.9986401423811913,
1341
- "num_tokens": 641238.0,
1342
  "step": 131
1343
  },
1344
  {
1345
- "entropy": 0.4518321752548218,
1346
  "epoch": 1.6125574272588055,
1347
- "grad_norm": 0.0751953125,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
- "loss": 0.006270918063819408,
1350
- "mean_token_accuracy": 0.999205507338047,
1351
- "num_tokens": 646351.0,
1352
  "step": 132
1353
  },
1354
  {
1355
- "entropy": 0.40802894718945026,
1356
  "epoch": 1.6248085758039816,
1357
- "grad_norm": 0.00011110305786132812,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
- "loss": 3.44005020451732e-05,
1360
- "mean_token_accuracy": 1.0,
1361
- "num_tokens": 650741.0,
1362
  "step": 133
1363
  },
1364
  {
1365
- "entropy": 0.42771636322140694,
1366
  "epoch": 1.6370597243491578,
1367
- "grad_norm": 0.0001239776611328125,
1368
  "learning_rate": 3.780487804878049e-05,
1369
- "loss": 4.249331323080696e-05,
1370
- "mean_token_accuracy": 1.0,
1371
- "num_tokens": 655143.0,
1372
  "step": 134
1373
  },
1374
  {
1375
- "entropy": 0.44244702346622944,
1376
  "epoch": 1.649310872894334,
1377
- "grad_norm": 0.00011205673217773438,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
- "loss": 3.287765503046103e-05,
1380
- "mean_token_accuracy": 1.0,
1381
- "num_tokens": 660264.0,
1382
  "step": 135
1383
  },
1384
  {
1385
- "entropy": 0.48481825925409794,
1386
  "epoch": 1.66156202143951,
1387
- "grad_norm": 0.000179290771484375,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
- "loss": 5.46249866602011e-05,
1390
- "mean_token_accuracy": 1.0,
1391
- "num_tokens": 664447.0,
1392
  "step": 136
1393
  },
1394
  {
1395
- "entropy": 0.46484761498868465,
1396
  "epoch": 1.673813169984686,
1397
- "grad_norm": 0.0002498626708984375,
1398
  "learning_rate": 3.414634146341464e-05,
1399
- "loss": 4.426595114637166e-05,
1400
- "mean_token_accuracy": 1.0,
1401
- "num_tokens": 669330.0,
1402
  "step": 137
1403
  },
1404
  {
1405
- "entropy": 0.4359226580709219,
1406
  "epoch": 1.686064318529862,
1407
- "grad_norm": 0.049560546875,
1408
  "learning_rate": 3.292682926829269e-05,
1409
- "loss": 0.004120181780308485,
1410
- "mean_token_accuracy": 0.9997727274894714,
1411
- "num_tokens": 674885.0,
1412
  "step": 138
1413
  },
1414
  {
1415
- "entropy": 0.4564925115555525,
1416
  "epoch": 1.6983154670750382,
1417
- "grad_norm": 0.0001544952392578125,
1418
  "learning_rate": 3.170731707317073e-05,
1419
- "loss": 5.280967161525041e-05,
1420
- "mean_token_accuracy": 1.0,
1421
- "num_tokens": 679666.0,
1422
  "step": 139
1423
  },
1424
  {
1425
- "entropy": 0.45392039604485035,
1426
  "epoch": 1.7105666156202144,
1427
- "grad_norm": 0.0001277923583984375,
1428
  "learning_rate": 3.048780487804878e-05,
1429
- "loss": 4.428522152011283e-05,
1430
- "mean_token_accuracy": 1.0,
1431
- "num_tokens": 685022.0,
1432
  "step": 140
1433
  },
1434
  {
1435
- "entropy": 0.4568201173096895,
1436
  "epoch": 1.7228177641653906,
1437
- "grad_norm": 0.000255584716796875,
1438
  "learning_rate": 2.926829268292683e-05,
1439
- "loss": 5.5990531109273434e-05,
1440
- "mean_token_accuracy": 1.0,
1441
- "num_tokens": 689370.0,
1442
  "step": 141
1443
  },
1444
  {
1445
- "entropy": 0.46470937319099903,
1446
  "epoch": 1.7350689127105667,
1447
- "grad_norm": 0.00020122528076171875,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
- "loss": 6.421299622161314e-05,
1450
- "mean_token_accuracy": 1.0,
1451
- "num_tokens": 693163.0,
1452
  "step": 142
1453
  },
1454
  {
1455
- "entropy": 0.47727371007204056,
1456
  "epoch": 1.7473200612557427,
1457
- "grad_norm": 0.000385284423828125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
- "loss": 7.020766497589648e-05,
1460
- "mean_token_accuracy": 1.0,
1461
- "num_tokens": 697577.0,
1462
  "step": 143
1463
  },
1464
  {
1465
- "entropy": 0.46956145390868187,
1466
  "epoch": 1.7595712098009189,
1467
- "grad_norm": 0.00017642974853515625,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
- "loss": 6.577485328307375e-05,
1470
- "mean_token_accuracy": 1.0,
1471
- "num_tokens": 703024.0,
1472
  "step": 144
1473
  },
1474
  {
1475
- "entropy": 0.4778987504541874,
1476
  "epoch": 1.7718223583460948,
1477
- "grad_norm": 0.0272216796875,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
- "loss": 0.0015227628173306584,
1480
- "mean_token_accuracy": 0.999507874250412,
1481
- "num_tokens": 707836.0,
1482
  "step": 145
1483
  },
1484
  {
1485
- "entropy": 0.4693255964666605,
1486
  "epoch": 1.784073506891271,
1487
- "grad_norm": 0.0016632080078125,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
- "loss": 8.514844375895336e-05,
1490
- "mean_token_accuracy": 1.0,
1491
- "num_tokens": 712795.0,
1492
  "step": 146
1493
  },
1494
  {
1495
- "entropy": 0.44871947541832924,
1496
  "epoch": 1.7963246554364471,
1497
- "grad_norm": 0.0001220703125,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
- "loss": 4.404923674883321e-05,
1500
- "mean_token_accuracy": 1.0,
1501
- "num_tokens": 718427.0,
1502
  "step": 147
1503
  },
1504
  {
1505
- "entropy": 0.46528770588338375,
1506
  "epoch": 1.8085758039816233,
1507
- "grad_norm": 0.00011539459228515625,
1508
  "learning_rate": 2.073170731707317e-05,
1509
- "loss": 4.299484135117382e-05,
1510
- "mean_token_accuracy": 1.0,
1511
- "num_tokens": 723784.0,
1512
  "step": 148
1513
  },
1514
  {
1515
- "entropy": 0.4871877897530794,
1516
  "epoch": 1.8208269525267995,
1517
- "grad_norm": 0.00018215179443359375,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
- "loss": 6.490876694442704e-05,
1520
- "mean_token_accuracy": 1.0,
1521
- "num_tokens": 728100.0,
1522
  "step": 149
1523
  },
1524
  {
1525
- "entropy": 0.4858295116573572,
1526
  "epoch": 1.8330781010719757,
1527
- "grad_norm": 0.004119873046875,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
- "loss": 0.0002347841509617865,
1530
- "mean_token_accuracy": 1.0,
1531
- "num_tokens": 733891.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
- "eval_entropy": 0.45632935347764386,
1537
- "eval_loss": 0.0005955203669145703,
1538
- "eval_mean_token_accuracy": 0.9997519842092542,
1539
- "eval_num_tokens": 733891.0,
1540
- "eval_runtime": 51.3196,
1541
- "eval_samples_per_second": 1.345,
1542
- "eval_steps_per_second": 1.345,
1543
  "step": 150
1544
  }
1545
  ],
@@ -1560,7 +1560,7 @@
1560
  "attributes": {}
1561
  }
1562
  },
1563
- "total_flos": 3.323146975216435e+16,
1564
  "train_batch_size": 1,
1565
  "trial_name": null,
1566
  "trial_params": null
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 0.22490596678107977,
14
  "epoch": 0.01225114854517611,
15
+ "grad_norm": 8.875,
16
  "learning_rate": 0.0002,
17
+ "loss": 0.1875426322221756,
18
+ "mean_token_accuracy": 0.9461580626666546,
19
+ "num_tokens": 6770.0,
20
  "step": 1
21
  },
22
  {
23
+ "entropy": 0.24707041680812836,
24
  "epoch": 0.02450229709035222,
25
+ "grad_norm": 8.4375,
26
  "learning_rate": 0.00019878048780487805,
27
+ "loss": 0.16050274670124054,
28
+ "mean_token_accuracy": 0.9445944800972939,
29
+ "num_tokens": 14234.0,
30
  "step": 2
31
  },
32
  {
33
+ "entropy": 0.32129648607224226,
34
  "epoch": 0.036753445635528334,
35
+ "grad_norm": 2.75,
36
  "learning_rate": 0.0001975609756097561,
37
+ "loss": 0.09863867610692978,
38
+ "mean_token_accuracy": 0.9659304060041904,
39
+ "num_tokens": 20673.0,
40
  "step": 3
41
  },
42
  {
43
+ "entropy": 0.32960685156285763,
44
  "epoch": 0.04900459418070444,
45
+ "grad_norm": 1.671875,
46
  "learning_rate": 0.00019634146341463416,
47
+ "loss": 0.08542143553495407,
48
+ "mean_token_accuracy": 0.9690693095326424,
49
+ "num_tokens": 26890.0,
50
  "step": 4
51
  },
52
  {
53
+ "entropy": 0.2677983660250902,
54
  "epoch": 0.06125574272588055,
55
+ "grad_norm": 1.359375,
56
  "learning_rate": 0.0001951219512195122,
57
+ "loss": 0.08666501939296722,
58
+ "mean_token_accuracy": 0.968298003077507,
59
+ "num_tokens": 35017.0,
60
  "step": 5
61
  },
62
  {
63
+ "entropy": 0.3096502358093858,
64
  "epoch": 0.07350689127105667,
65
+ "grad_norm": 0.66015625,
66
  "learning_rate": 0.00019390243902439025,
67
+ "loss": 0.07875043898820877,
68
+ "mean_token_accuracy": 0.969221331179142,
69
+ "num_tokens": 41478.0,
70
  "step": 6
71
  },
72
  {
73
+ "entropy": 0.3156957607716322,
74
  "epoch": 0.08575803981623277,
75
+ "grad_norm": 2.0,
76
  "learning_rate": 0.0001926829268292683,
77
+ "loss": 0.07807251811027527,
78
+ "mean_token_accuracy": 0.9681689888238907,
79
+ "num_tokens": 48204.0,
80
  "step": 7
81
  },
82
  {
83
+ "entropy": 0.2759731076657772,
84
  "epoch": 0.09800918836140889,
85
+ "grad_norm": 1.1328125,
86
  "learning_rate": 0.00019146341463414633,
87
+ "loss": 0.07681904733181,
88
+ "mean_token_accuracy": 0.9719767943024635,
89
+ "num_tokens": 54668.0,
90
  "step": 8
91
  },
92
  {
93
+ "entropy": 0.24453612882643938,
94
  "epoch": 0.11026033690658499,
95
+ "grad_norm": 0.875,
96
  "learning_rate": 0.0001902439024390244,
97
+ "loss": 0.07310224324464798,
98
+ "mean_token_accuracy": 0.96934475004673,
99
+ "num_tokens": 61929.0,
100
  "step": 9
101
  },
102
  {
103
+ "entropy": 0.25852775294333696,
104
  "epoch": 0.1225114854517611,
105
+ "grad_norm": 1.4921875,
106
  "learning_rate": 0.00018902439024390244,
107
+ "loss": 0.07384984195232391,
108
+ "mean_token_accuracy": 0.9701811708509922,
109
+ "num_tokens": 69036.0,
110
  "step": 10
111
  },
112
  {
113
+ "entropy": 0.27396084927022457,
114
  "epoch": 0.13476263399693722,
115
+ "grad_norm": 0.94140625,
116
  "learning_rate": 0.0001878048780487805,
117
+ "loss": 0.10277765244245529,
118
+ "mean_token_accuracy": 0.9634475558996201,
119
+ "num_tokens": 76394.0,
120
  "step": 11
121
  },
122
  {
123
+ "entropy": 0.3001147015020251,
124
  "epoch": 0.14701378254211334,
125
+ "grad_norm": 0.84765625,
126
  "learning_rate": 0.00018658536585365856,
127
+ "loss": 0.08927591890096664,
128
+ "mean_token_accuracy": 0.9625685028731823,
129
+ "num_tokens": 84073.0,
130
  "step": 12
131
  },
132
  {
133
+ "entropy": 0.29679975286126137,
134
  "epoch": 0.15926493108728942,
135
+ "grad_norm": 0.8359375,
136
  "learning_rate": 0.0001853658536585366,
137
+ "loss": 0.10607243329286575,
138
+ "mean_token_accuracy": 0.9608454070985317,
139
+ "num_tokens": 91135.0,
140
  "step": 13
141
  },
142
  {
143
+ "entropy": 0.28288435423746705,
144
  "epoch": 0.17151607963246554,
145
+ "grad_norm": 0.69921875,
146
  "learning_rate": 0.00018414634146341464,
147
+ "loss": 0.07875586301088333,
148
+ "mean_token_accuracy": 0.9699672348797321,
149
+ "num_tokens": 97740.0,
150
  "step": 14
151
  },
152
  {
153
+ "entropy": 0.2927901232615113,
154
  "epoch": 0.18376722817764166,
155
+ "grad_norm": 0.8515625,
156
  "learning_rate": 0.0001829268292682927,
157
+ "loss": 0.08531365543603897,
158
+ "mean_token_accuracy": 0.9704407565295696,
159
+ "num_tokens": 104242.0,
160
  "step": 15
161
  },
162
  {
163
+ "entropy": 0.2802786426618695,
164
  "epoch": 0.19601837672281777,
165
+ "grad_norm": 2.609375,
166
  "learning_rate": 0.00018170731707317075,
167
+ "loss": 0.10073477029800415,
168
+ "mean_token_accuracy": 0.9676352478563786,
169
+ "num_tokens": 112245.0,
170
  "step": 16
171
  },
172
  {
173
+ "entropy": 0.30663597770035267,
174
  "epoch": 0.2082695252679939,
175
+ "grad_norm": 2.3125,
176
  "learning_rate": 0.0001804878048780488,
177
+ "loss": 0.11375448107719421,
178
+ "mean_token_accuracy": 0.9604234844446182,
179
+ "num_tokens": 119086.0,
180
  "step": 17
181
  },
182
  {
183
+ "entropy": 0.31370354909449816,
184
  "epoch": 0.22052067381316998,
185
+ "grad_norm": 2.3125,
186
  "learning_rate": 0.00017926829268292684,
187
+ "loss": 0.10826913267374039,
188
+ "mean_token_accuracy": 0.9599097929894924,
189
+ "num_tokens": 126539.0,
190
  "step": 18
191
  },
192
  {
193
+ "entropy": 0.3203959669917822,
194
  "epoch": 0.2327718223583461,
195
+ "grad_norm": 1.3203125,
196
  "learning_rate": 0.00017804878048780488,
197
+ "loss": 0.075275719165802,
198
+ "mean_token_accuracy": 0.9775180667638779,
199
+ "num_tokens": 133104.0,
200
  "step": 19
201
  },
202
  {
203
+ "entropy": 0.32591533567756414,
204
  "epoch": 0.2450229709035222,
205
+ "grad_norm": 0.921875,
206
  "learning_rate": 0.00017682926829268295,
207
+ "loss": 0.09778374433517456,
208
+ "mean_token_accuracy": 0.9647064991295338,
209
+ "num_tokens": 139853.0,
210
  "step": 20
211
  },
212
  {
213
+ "entropy": 0.3228916050866246,
214
  "epoch": 0.2572741194486983,
215
+ "grad_norm": 1.9453125,
216
  "learning_rate": 0.000175609756097561,
217
+ "loss": 0.11495943367481232,
218
+ "mean_token_accuracy": 0.9573761746287346,
219
+ "num_tokens": 146482.0,
220
  "step": 21
221
  },
222
  {
223
+ "entropy": 0.3363859634846449,
224
  "epoch": 0.26952526799387444,
225
+ "grad_norm": 1.5,
226
  "learning_rate": 0.00017439024390243903,
227
+ "loss": 0.10473912209272385,
228
+ "mean_token_accuracy": 0.9618786759674549,
229
+ "num_tokens": 153819.0,
230
  "step": 22
231
  },
232
  {
233
+ "entropy": 0.3069695383310318,
234
  "epoch": 0.28177641653905056,
235
+ "grad_norm": 1.3203125,
236
  "learning_rate": 0.00017317073170731708,
237
+ "loss": 0.09256276488304138,
238
+ "mean_token_accuracy": 0.9625396281480789,
239
+ "num_tokens": 160972.0,
240
  "step": 23
241
  },
242
  {
243
+ "entropy": 0.3574997428804636,
244
  "epoch": 0.29402756508422667,
245
+ "grad_norm": 0.62890625,
246
  "learning_rate": 0.00017195121951219512,
247
+ "loss": 0.09489140659570694,
248
+ "mean_token_accuracy": 0.9578843042254448,
249
+ "num_tokens": 167730.0,
250
  "step": 24
251
  },
252
  {
253
+ "entropy": 0.3444826593622565,
254
  "epoch": 0.30627871362940273,
255
+ "grad_norm": 0.56640625,
256
  "learning_rate": 0.0001707317073170732,
257
+ "loss": 0.09492132067680359,
258
+ "mean_token_accuracy": 0.9603794105350971,
259
+ "num_tokens": 174078.0,
260
  "step": 25
261
  },
262
  {
263
+ "entropy": 0.328093777410686,
264
  "epoch": 0.31852986217457885,
265
+ "grad_norm": 0.97265625,
266
  "learning_rate": 0.00016951219512195123,
267
+ "loss": 0.08727280050516129,
268
+ "mean_token_accuracy": 0.9661480598151684,
269
+ "num_tokens": 180867.0,
270
  "step": 26
271
  },
272
  {
273
+ "entropy": 0.3213672311976552,
274
  "epoch": 0.33078101071975496,
275
+ "grad_norm": 1.203125,
276
  "learning_rate": 0.00016829268292682927,
277
+ "loss": 0.07705243676900864,
278
+ "mean_token_accuracy": 0.9675347730517387,
279
+ "num_tokens": 187459.0,
280
  "step": 27
281
  },
282
  {
283
+ "entropy": 0.3209801884368062,
284
  "epoch": 0.3430321592649311,
285
+ "grad_norm": 0.49609375,
286
  "learning_rate": 0.00016707317073170731,
287
+ "loss": 0.08744930475950241,
288
+ "mean_token_accuracy": 0.9658873043954372,
289
+ "num_tokens": 194265.0,
290
  "step": 28
291
  },
292
  {
293
+ "entropy": 0.2975130006670952,
294
  "epoch": 0.3552833078101072,
295
+ "grad_norm": 0.578125,
296
  "learning_rate": 0.00016585365853658536,
297
+ "loss": 0.08422811329364777,
298
+ "mean_token_accuracy": 0.9715595282614231,
299
+ "num_tokens": 201332.0,
300
  "step": 29
301
  },
302
  {
303
+ "entropy": 0.29833013843744993,
304
  "epoch": 0.3675344563552833,
305
+ "grad_norm": 0.82421875,
306
  "learning_rate": 0.00016463414634146343,
307
+ "loss": 0.08079958707094193,
308
+ "mean_token_accuracy": 0.9676232784986496,
309
+ "num_tokens": 208902.0,
310
  "step": 30
311
  },
312
  {
313
+ "entropy": 0.31810148898512125,
314
  "epoch": 0.37978560490045943,
315
+ "grad_norm": 0.6796875,
316
  "learning_rate": 0.00016341463414634147,
317
+ "loss": 0.09296617656946182,
318
+ "mean_token_accuracy": 0.9628731682896614,
319
+ "num_tokens": 214635.0,
320
  "step": 31
321
  },
322
  {
323
+ "entropy": 0.2774961022660136,
324
  "epoch": 0.39203675344563554,
325
+ "grad_norm": 1.2109375,
326
  "learning_rate": 0.00016219512195121954,
327
+ "loss": 0.08057809621095657,
328
+ "mean_token_accuracy": 0.9683544635772705,
329
+ "num_tokens": 222703.0,
330
  "step": 32
331
  },
332
  {
333
+ "entropy": 0.2500351797789335,
334
  "epoch": 0.40428790199081166,
335
+ "grad_norm": 0.6953125,
336
  "learning_rate": 0.00016097560975609758,
337
+ "loss": 0.07790188491344452,
338
+ "mean_token_accuracy": 0.9730625562369823,
339
+ "num_tokens": 230136.0,
340
  "step": 33
341
  },
342
  {
343
+ "entropy": 0.27261121198534966,
344
  "epoch": 0.4165390505359878,
345
+ "grad_norm": 1.21875,
346
  "learning_rate": 0.00015975609756097562,
347
+ "loss": 0.08459997177124023,
348
+ "mean_token_accuracy": 0.9683701656758785,
349
+ "num_tokens": 236711.0,
350
  "step": 34
351
  },
352
  {
353
+ "entropy": 0.25461648125201464,
354
  "epoch": 0.42879019908116384,
355
+ "grad_norm": 1.5078125,
356
  "learning_rate": 0.00015853658536585366,
357
+ "loss": 0.09788602590560913,
358
+ "mean_token_accuracy": 0.9601947516202927,
359
+ "num_tokens": 243492.0,
360
  "step": 35
361
  },
362
  {
363
+ "entropy": 0.250462488271296,
364
  "epoch": 0.44104134762633995,
365
+ "grad_norm": 0.62109375,
366
  "learning_rate": 0.00015731707317073173,
367
+ "loss": 0.09664106369018555,
368
+ "mean_token_accuracy": 0.9635641165077686,
369
+ "num_tokens": 250330.0,
370
  "step": 36
371
  },
372
  {
373
+ "entropy": 0.26719998102635145,
374
  "epoch": 0.45329249617151607,
375
+ "grad_norm": 0.609375,
376
  "learning_rate": 0.00015609756097560978,
377
+ "loss": 0.08978135138750076,
378
+ "mean_token_accuracy": 0.9730992764234543,
379
+ "num_tokens": 257503.0,
380
  "step": 37
381
  },
382
  {
383
+ "entropy": 0.25437645614147186,
384
  "epoch": 0.4655436447166922,
385
+ "grad_norm": 1.0859375,
386
  "learning_rate": 0.00015487804878048782,
387
+ "loss": 0.08938639611005783,
388
+ "mean_token_accuracy": 0.9675878100097179,
389
+ "num_tokens": 264436.0,
390
  "step": 38
391
  },
392
  {
393
+ "entropy": 0.2722023595124483,
394
  "epoch": 0.4777947932618683,
395
+ "grad_norm": 1.375,
396
  "learning_rate": 0.00015365853658536586,
397
+ "loss": 0.07785381376743317,
398
+ "mean_token_accuracy": 0.9736072942614555,
399
+ "num_tokens": 270483.0,
400
  "step": 39
401
  },
402
  {
403
+ "entropy": 0.3116175327450037,
404
  "epoch": 0.4900459418070444,
405
+ "grad_norm": 0.65625,
406
  "learning_rate": 0.0001524390243902439,
407
+ "loss": 0.09019558876752853,
408
+ "mean_token_accuracy": 0.9605641178786755,
409
+ "num_tokens": 276329.0,
410
  "step": 40
411
  },
412
  {
413
+ "entropy": 0.28687036503106356,
414
  "epoch": 0.5022970903522205,
415
+ "grad_norm": 0.62890625,
416
  "learning_rate": 0.00015121951219512197,
417
+ "loss": 0.0810370221734047,
418
+ "mean_token_accuracy": 0.9663555175065994,
419
+ "num_tokens": 281636.0,
420
  "step": 41
421
  },
422
  {
423
+ "entropy": 0.2999298516660929,
424
  "epoch": 0.5145482388973966,
425
+ "grad_norm": 1.1484375,
426
  "learning_rate": 0.00015000000000000001,
427
+ "loss": 0.06981078535318375,
428
+ "mean_token_accuracy": 0.9718391671776772,
429
+ "num_tokens": 287849.0,
430
  "step": 42
431
  },
432
  {
433
+ "entropy": 0.3097079414874315,
434
  "epoch": 0.5267993874425727,
435
+ "grad_norm": 1.5546875,
436
  "learning_rate": 0.00014878048780487806,
437
+ "loss": 0.09350281953811646,
438
+ "mean_token_accuracy": 0.9683773033320904,
439
+ "num_tokens": 294425.0,
440
  "step": 43
441
  },
442
  {
443
+ "entropy": 0.2796417009085417,
444
  "epoch": 0.5390505359877489,
445
+ "grad_norm": 1.25,
446
  "learning_rate": 0.0001475609756097561,
447
+ "loss": 0.09558023512363434,
448
+ "mean_token_accuracy": 0.9602576456964016,
449
+ "num_tokens": 301451.0,
450
  "step": 44
451
  },
452
  {
453
+ "entropy": 0.265599487349391,
454
  "epoch": 0.5513016845329249,
455
+ "grad_norm": 0.59375,
456
  "learning_rate": 0.00014634146341463414,
457
+ "loss": 0.07772304862737656,
458
+ "mean_token_accuracy": 0.9693298228085041,
459
+ "num_tokens": 308208.0,
460
  "step": 45
461
  },
462
  {
463
+ "entropy": 0.29693579114973545,
464
  "epoch": 0.5635528330781011,
465
+ "grad_norm": 1.2109375,
466
  "learning_rate": 0.0001451219512195122,
467
+ "loss": 0.09863201528787613,
468
+ "mean_token_accuracy": 0.963471919298172,
469
+ "num_tokens": 314427.0,
470
  "step": 46
471
  },
472
  {
473
+ "entropy": 0.2665130514651537,
474
  "epoch": 0.5758039816232772,
475
+ "grad_norm": 0.80078125,
476
  "learning_rate": 0.00014390243902439025,
477
+ "loss": 0.08794506639242172,
478
+ "mean_token_accuracy": 0.9714972339570522,
479
+ "num_tokens": 321146.0,
480
  "step": 47
481
  },
482
  {
483
+ "entropy": 0.2665897011756897,
484
  "epoch": 0.5880551301684533,
485
+ "grad_norm": 0.765625,
486
  "learning_rate": 0.0001426829268292683,
487
+ "loss": 0.07602453231811523,
488
+ "mean_token_accuracy": 0.9719848223030567,
489
+ "num_tokens": 326952.0,
490
  "step": 48
491
  },
492
  {
493
+ "entropy": 0.2814077762886882,
494
  "epoch": 0.6003062787136294,
495
+ "grad_norm": 0.74609375,
496
  "learning_rate": 0.00014146341463414634,
497
+ "loss": 0.08512163907289505,
498
+ "mean_token_accuracy": 0.9680779539048672,
499
+ "num_tokens": 333716.0,
500
  "step": 49
501
  },
502
  {
503
+ "entropy": 0.311913987621665,
504
  "epoch": 0.6125574272588055,
505
+ "grad_norm": 0.52734375,
506
  "learning_rate": 0.00014024390243902438,
507
+ "loss": 0.0735088661313057,
508
+ "mean_token_accuracy": 0.9693484716117382,
509
+ "num_tokens": 339075.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
+ "eval_entropy": 0.2772115924652072,
515
+ "eval_loss": 0.08680303394794464,
516
+ "eval_mean_token_accuracy": 0.9665399781171826,
517
+ "eval_num_tokens": 339075.0,
518
+ "eval_runtime": 64.122,
519
+ "eval_samples_per_second": 1.076,
520
+ "eval_steps_per_second": 1.076,
521
  "step": 50
522
  },
523
  {
524
+ "entropy": 0.2713254941627383,
525
  "epoch": 0.6248085758039816,
526
+ "grad_norm": 0.76953125,
527
  "learning_rate": 0.00013902439024390245,
528
+ "loss": 0.07364857941865921,
529
+ "mean_token_accuracy": 0.9685694649815559,
530
+ "num_tokens": 346211.0,
531
  "step": 51
532
  },
533
  {
534
+ "entropy": 0.27622572146356106,
535
  "epoch": 0.6370597243491577,
536
+ "grad_norm": 1.953125,
537
  "learning_rate": 0.0001378048780487805,
538
+ "loss": 0.08796638250350952,
539
+ "mean_token_accuracy": 0.9678671807050705,
540
+ "num_tokens": 353743.0,
541
  "step": 52
542
  },
543
  {
544
+ "entropy": 0.3153565675020218,
545
  "epoch": 0.6493108728943339,
546
+ "grad_norm": 1.8125,
547
  "learning_rate": 0.00013658536585365856,
548
+ "loss": 0.09189874678850174,
549
+ "mean_token_accuracy": 0.9696005284786224,
550
+ "num_tokens": 360765.0,
551
  "step": 53
552
  },
553
  {
554
+ "entropy": 0.2793878586962819,
555
  "epoch": 0.6615620214395099,
556
+ "grad_norm": 0.94140625,
557
  "learning_rate": 0.0001353658536585366,
558
+ "loss": 0.0844489261507988,
559
+ "mean_token_accuracy": 0.9678577370941639,
560
+ "num_tokens": 366800.0,
561
  "step": 54
562
  },
563
  {
564
+ "entropy": 0.31044898089021444,
565
  "epoch": 0.6738131699846861,
566
+ "grad_norm": 0.9375,
567
  "learning_rate": 0.00013414634146341464,
568
+ "loss": 0.07886157184839249,
569
+ "mean_token_accuracy": 0.9673981033265591,
570
+ "num_tokens": 373439.0,
571
  "step": 55
572
  },
573
  {
574
+ "entropy": 0.27184910606592894,
575
  "epoch": 0.6860643185298622,
576
+ "grad_norm": 1.3359375,
577
  "learning_rate": 0.0001329268292682927,
578
+ "loss": 0.0787871852517128,
579
+ "mean_token_accuracy": 0.9677317887544632,
580
+ "num_tokens": 380492.0,
581
  "step": 56
582
  },
583
  {
584
+ "entropy": 0.31349051371216774,
585
  "epoch": 0.6983154670750383,
586
+ "grad_norm": 1.59375,
587
  "learning_rate": 0.00013170731707317076,
588
+ "loss": 0.08862332254648209,
589
+ "mean_token_accuracy": 0.9652546346187592,
590
+ "num_tokens": 386711.0,
591
  "step": 57
592
  },
593
  {
594
+ "entropy": 0.2799685625359416,
595
  "epoch": 0.7105666156202144,
596
+ "grad_norm": 1.5078125,
597
  "learning_rate": 0.0001304878048780488,
598
+ "loss": 0.10028493404388428,
599
+ "mean_token_accuracy": 0.9606899172067642,
600
+ "num_tokens": 394124.0,
601
  "step": 58
602
  },
603
  {
604
+ "entropy": 0.2792940763756633,
605
  "epoch": 0.7228177641653905,
606
+ "grad_norm": 1.5859375,
607
  "learning_rate": 0.00012926829268292684,
608
+ "loss": 0.07462260127067566,
609
+ "mean_token_accuracy": 0.9740471467375755,
610
+ "num_tokens": 401499.0,
611
  "step": 59
612
  },
613
  {
614
+ "entropy": 0.29724057391285896,
615
  "epoch": 0.7350689127105666,
616
+ "grad_norm": 0.95703125,
617
  "learning_rate": 0.00012804878048780488,
618
+ "loss": 0.06339482963085175,
619
+ "mean_token_accuracy": 0.9754546955227852,
620
+ "num_tokens": 407443.0,
621
  "step": 60
622
  },
623
  {
624
+ "entropy": 0.2698040744289756,
625
  "epoch": 0.7473200612557427,
626
+ "grad_norm": 0.60546875,
627
  "learning_rate": 0.00012682926829268293,
628
+ "loss": 0.10221480578184128,
629
+ "mean_token_accuracy": 0.9670109152793884,
630
+ "num_tokens": 415471.0,
631
  "step": 61
632
  },
633
  {
634
+ "entropy": 0.2995635373517871,
635
  "epoch": 0.7595712098009189,
636
+ "grad_norm": 1.75,
637
  "learning_rate": 0.000125609756097561,
638
+ "loss": 0.08588436245918274,
639
+ "mean_token_accuracy": 0.9686382673680782,
640
+ "num_tokens": 422504.0,
641
  "step": 62
642
  },
643
  {
644
+ "entropy": 0.2458120621740818,
645
  "epoch": 0.7718223583460949,
646
+ "grad_norm": 2.1875,
647
  "learning_rate": 0.00012439024390243904,
648
+ "loss": 0.08629653602838516,
649
+ "mean_token_accuracy": 0.966422975063324,
650
+ "num_tokens": 430143.0,
651
  "step": 63
652
  },
653
  {
654
+ "entropy": 0.2900782600045204,
655
  "epoch": 0.7840735068912711,
656
+ "grad_norm": 0.9296875,
657
  "learning_rate": 0.00012317073170731708,
658
+ "loss": 0.08716308325529099,
659
+ "mean_token_accuracy": 0.965714868158102,
660
+ "num_tokens": 435664.0,
661
  "step": 64
662
  },
663
  {
664
+ "entropy": 0.29250922333449125,
665
  "epoch": 0.7963246554364471,
666
+ "grad_norm": 0.59375,
667
  "learning_rate": 0.00012195121951219512,
668
+ "loss": 0.08158774673938751,
669
+ "mean_token_accuracy": 0.9694335348904133,
670
+ "num_tokens": 442457.0,
671
  "step": 65
672
  },
673
  {
674
+ "entropy": 0.3083174014464021,
675
  "epoch": 0.8085758039816233,
676
+ "grad_norm": 0.82421875,
677
  "learning_rate": 0.00012073170731707318,
678
+ "loss": 0.0988016203045845,
679
+ "mean_token_accuracy": 0.9648039489984512,
680
+ "num_tokens": 449983.0,
681
  "step": 66
682
  },
683
  {
684
+ "entropy": 0.25693165976554155,
685
  "epoch": 0.8208269525267994,
686
+ "grad_norm": 0.74609375,
687
  "learning_rate": 0.00011951219512195122,
688
+ "loss": 0.07928164303302765,
689
+ "mean_token_accuracy": 0.9698546566069126,
690
+ "num_tokens": 457640.0,
691
  "step": 67
692
  },
693
  {
694
+ "entropy": 0.2752681290730834,
695
  "epoch": 0.8330781010719756,
696
+ "grad_norm": 0.97265625,
697
  "learning_rate": 0.00011829268292682926,
698
+ "loss": 0.07464170455932617,
699
+ "mean_token_accuracy": 0.9697864800691605,
700
+ "num_tokens": 464050.0,
701
  "step": 68
702
  },
703
  {
704
+ "entropy": 0.27110164798796177,
705
  "epoch": 0.8453292496171516,
706
+ "grad_norm": 0.71875,
707
  "learning_rate": 0.00011707317073170732,
708
+ "loss": 0.0718315988779068,
709
+ "mean_token_accuracy": 0.9709942191839218,
710
+ "num_tokens": 469546.0,
711
  "step": 69
712
  },
713
  {
714
+ "entropy": 0.3264527218416333,
715
  "epoch": 0.8575803981623277,
716
+ "grad_norm": 0.62109375,
717
  "learning_rate": 0.00011585365853658536,
718
+ "loss": 0.0866687223315239,
719
+ "mean_token_accuracy": 0.9700192771852016,
720
+ "num_tokens": 475365.0,
721
  "step": 70
722
  },
723
  {
724
+ "entropy": 0.3122966531664133,
725
  "epoch": 0.8698315467075038,
726
+ "grad_norm": 0.67578125,
727
  "learning_rate": 0.00011463414634146342,
728
+ "loss": 0.06088244915008545,
729
+ "mean_token_accuracy": 0.9754119366407394,
730
+ "num_tokens": 481830.0,
731
  "step": 71
732
  },
733
  {
734
+ "entropy": 0.3018254106864333,
735
  "epoch": 0.8820826952526799,
736
+ "grad_norm": 0.56640625,
737
  "learning_rate": 0.00011341463414634146,
738
+ "loss": 0.08657931536436081,
739
+ "mean_token_accuracy": 0.9676030017435551,
740
+ "num_tokens": 487767.0,
741
  "step": 72
742
  },
743
  {
744
+ "entropy": 0.3276115320622921,
745
  "epoch": 0.8943338437978561,
746
+ "grad_norm": 0.5078125,
747
  "learning_rate": 0.00011219512195121953,
748
+ "loss": 0.08024603128433228,
749
+ "mean_token_accuracy": 0.9690204374492168,
750
+ "num_tokens": 494428.0,
751
  "step": 73
752
  },
753
  {
754
+ "entropy": 0.32397411670535803,
755
  "epoch": 0.9065849923430321,
756
+ "grad_norm": 1.1015625,
757
  "learning_rate": 0.00011097560975609757,
758
+ "loss": 0.07867392897605896,
759
+ "mean_token_accuracy": 0.9685576297342777,
760
+ "num_tokens": 500828.0,
761
  "step": 74
762
  },
763
  {
764
+ "entropy": 0.319146528840065,
765
  "epoch": 0.9188361408882083,
766
+ "grad_norm": 0.97265625,
767
  "learning_rate": 0.00010975609756097563,
768
+ "loss": 0.08432602882385254,
769
+ "mean_token_accuracy": 0.9689616709947586,
770
+ "num_tokens": 507523.0,
771
  "step": 75
772
  },
773
  {
774
+ "entropy": 0.3080446803942323,
775
  "epoch": 0.9310872894333844,
776
+ "grad_norm": 1.265625,
777
  "learning_rate": 0.00010853658536585367,
778
+ "loss": 0.0796058252453804,
779
+ "mean_token_accuracy": 0.9683922417461872,
780
+ "num_tokens": 513607.0,
781
  "step": 76
782
  },
783
  {
784
+ "entropy": 0.2667541950941086,
785
  "epoch": 0.9433384379785605,
786
+ "grad_norm": 0.59375,
787
  "learning_rate": 0.00010731707317073172,
788
+ "loss": 0.06495777517557144,
789
+ "mean_token_accuracy": 0.977863471955061,
790
+ "num_tokens": 521376.0,
791
  "step": 77
792
  },
793
  {
794
+ "entropy": 0.27901614736765623,
795
  "epoch": 0.9555895865237366,
796
+ "grad_norm": 1.0859375,
797
  "learning_rate": 0.00010609756097560977,
798
+ "loss": 0.08389777690172195,
799
+ "mean_token_accuracy": 0.967527512460947,
800
+ "num_tokens": 528624.0,
801
  "step": 78
802
  },
803
  {
804
+ "entropy": 0.2754220822826028,
805
  "epoch": 0.9678407350689127,
806
+ "grad_norm": 1.3515625,
807
  "learning_rate": 0.00010487804878048781,
808
+ "loss": 0.0762331560254097,
809
+ "mean_token_accuracy": 0.9713698588311672,
810
+ "num_tokens": 534817.0,
811
  "step": 79
812
  },
813
  {
814
+ "entropy": 0.2981132147833705,
815
  "epoch": 0.9800918836140888,
816
+ "grad_norm": 1.4375,
817
  "learning_rate": 0.00010365853658536586,
818
+ "loss": 0.07953717559576035,
819
+ "mean_token_accuracy": 0.967929158359766,
820
+ "num_tokens": 541716.0,
821
  "step": 80
822
  },
823
  {
824
+ "entropy": 0.30576920323073864,
825
  "epoch": 0.9923430321592649,
826
+ "grad_norm": 1.0234375,
827
  "learning_rate": 0.0001024390243902439,
828
+ "loss": 0.07800528407096863,
829
+ "mean_token_accuracy": 0.971219640225172,
830
+ "num_tokens": 548000.0,
831
  "step": 81
832
  },
833
  {
834
+ "entropy": 0.24986045509576799,
835
  "epoch": 1.0,
836
+ "grad_norm": 0.58203125,
837
  "learning_rate": 0.00010121951219512196,
838
+ "loss": 0.05879032611846924,
839
+ "mean_token_accuracy": 0.9748349964618683,
840
+ "num_tokens": 552608.0,
841
  "step": 82
842
  },
843
  {
844
+ "entropy": 0.2518839007243514,
845
  "epoch": 1.0122511485451762,
846
+ "grad_norm": 0.49609375,
847
  "learning_rate": 0.0001,
848
+ "loss": 0.047237373888492584,
849
+ "mean_token_accuracy": 0.9874232485890388,
850
+ "num_tokens": 559523.0,
851
  "step": 83
852
  },
853
  {
854
+ "entropy": 0.2561075631529093,
855
  "epoch": 1.0245022970903521,
856
+ "grad_norm": 0.65234375,
857
  "learning_rate": 9.878048780487805e-05,
858
+ "loss": 0.04376941919326782,
859
+ "mean_token_accuracy": 0.9896520264446735,
860
+ "num_tokens": 566232.0,
861
  "step": 84
862
  },
863
  {
864
+ "entropy": 0.2935391655191779,
865
  "epoch": 1.0367534456355283,
866
+ "grad_norm": 0.486328125,
867
  "learning_rate": 9.75609756097561e-05,
868
+ "loss": 0.052017997950315475,
869
+ "mean_token_accuracy": 0.9823879115283489,
870
+ "num_tokens": 573965.0,
871
  "step": 85
872
  },
873
  {
874
+ "entropy": 0.21971730748191476,
875
  "epoch": 1.0490045941807045,
876
+ "grad_norm": 0.330078125,
877
  "learning_rate": 9.634146341463415e-05,
878
+ "loss": 0.04022914543747902,
879
+ "mean_token_accuracy": 0.9874378368258476,
880
+ "num_tokens": 580768.0,
881
  "step": 86
882
  },
883
  {
884
+ "entropy": 0.23719595093280077,
885
  "epoch": 1.0612557427258806,
886
+ "grad_norm": 0.68359375,
887
  "learning_rate": 9.51219512195122e-05,
888
+ "loss": 0.04782414808869362,
889
+ "mean_token_accuracy": 0.9846052750945091,
890
+ "num_tokens": 588097.0,
891
  "step": 87
892
  },
893
  {
894
+ "entropy": 0.25634779036045074,
895
  "epoch": 1.0735068912710566,
896
+ "grad_norm": 0.291015625,
897
  "learning_rate": 9.390243902439024e-05,
898
+ "loss": 0.03357430174946785,
899
+ "mean_token_accuracy": 0.9895204566419125,
900
+ "num_tokens": 594215.0,
901
  "step": 88
902
  },
903
  {
904
+ "entropy": 0.26507470663636923,
905
  "epoch": 1.0857580398162328,
906
+ "grad_norm": 0.89453125,
907
  "learning_rate": 9.26829268292683e-05,
908
+ "loss": 0.0427095852792263,
909
+ "mean_token_accuracy": 0.984734483063221,
910
+ "num_tokens": 600018.0,
911
  "step": 89
912
  },
913
  {
914
+ "entropy": 0.25531507655978203,
915
  "epoch": 1.098009188361409,
916
+ "grad_norm": 0.357421875,
917
  "learning_rate": 9.146341463414635e-05,
918
+ "loss": 0.04051242396235466,
919
+ "mean_token_accuracy": 0.9878104776144028,
920
+ "num_tokens": 606254.0,
921
  "step": 90
922
  },
923
  {
924
+ "entropy": 0.26176126673817635,
925
  "epoch": 1.110260336906585,
926
+ "grad_norm": 0.55078125,
927
  "learning_rate": 9.02439024390244e-05,
928
+ "loss": 0.03882109373807907,
929
+ "mean_token_accuracy": 0.9838540144264698,
930
+ "num_tokens": 612316.0,
931
  "step": 91
932
  },
933
  {
934
+ "entropy": 0.2165100760757923,
935
  "epoch": 1.122511485451761,
936
+ "grad_norm": 0.3671875,
937
  "learning_rate": 8.902439024390244e-05,
938
+ "loss": 0.03010629303753376,
939
+ "mean_token_accuracy": 0.9918084405362606,
940
+ "num_tokens": 619629.0,
941
  "step": 92
942
  },
943
  {
944
+ "entropy": 0.24866555724292994,
945
  "epoch": 1.1347626339969372,
946
+ "grad_norm": 0.7578125,
947
  "learning_rate": 8.78048780487805e-05,
948
+ "loss": 0.03892926499247551,
949
+ "mean_token_accuracy": 0.984953761100769,
950
+ "num_tokens": 625947.0,
951
  "step": 93
952
  },
953
  {
954
+ "entropy": 0.21699398616328835,
955
  "epoch": 1.1470137825421134,
956
+ "grad_norm": 0.53125,
957
  "learning_rate": 8.658536585365854e-05,
958
+ "loss": 0.040178049355745316,
959
+ "mean_token_accuracy": 0.986099898815155,
960
+ "num_tokens": 632906.0,
961
  "step": 94
962
  },
963
  {
964
+ "entropy": 0.2104594809934497,
965
  "epoch": 1.1592649310872893,
966
+ "grad_norm": 1.4375,
967
  "learning_rate": 8.53658536585366e-05,
968
+ "loss": 0.05103502795100212,
969
+ "mean_token_accuracy": 0.9873828142881393,
970
+ "num_tokens": 639769.0,
971
  "step": 95
972
  },
973
  {
974
+ "entropy": 0.21941981185227633,
975
  "epoch": 1.1715160796324655,
976
+ "grad_norm": 0.984375,
977
  "learning_rate": 8.414634146341464e-05,
978
+ "loss": 0.03593335300683975,
979
+ "mean_token_accuracy": 0.9901031330227852,
980
+ "num_tokens": 646347.0,
981
  "step": 96
982
  },
983
  {
984
+ "entropy": 0.23086606059223413,
985
  "epoch": 1.1837672281776417,
986
+ "grad_norm": 0.65625,
987
  "learning_rate": 8.292682926829268e-05,
988
+ "loss": 0.034123439341783524,
989
+ "mean_token_accuracy": 0.9874096475541592,
990
+ "num_tokens": 652247.0,
991
  "step": 97
992
  },
993
  {
994
+ "entropy": 0.21858725044876337,
995
  "epoch": 1.1960183767228179,
996
+ "grad_norm": 0.3515625,
997
  "learning_rate": 8.170731707317073e-05,
998
+ "loss": 0.03983831778168678,
999
+ "mean_token_accuracy": 0.9883633032441139,
1000
+ "num_tokens": 659620.0,
1001
  "step": 98
1002
  },
1003
  {
1004
+ "entropy": 0.2186456574127078,
1005
  "epoch": 1.2082695252679938,
1006
+ "grad_norm": 0.50390625,
1007
  "learning_rate": 8.048780487804879e-05,
1008
+ "loss": 0.03659169375896454,
1009
+ "mean_token_accuracy": 0.9874354675412178,
1010
+ "num_tokens": 667017.0,
1011
  "step": 99
1012
  },
1013
  {
1014
+ "entropy": 0.21289387485012412,
1015
  "epoch": 1.22052067381317,
1016
+ "grad_norm": 1.2890625,
1017
  "learning_rate": 7.926829268292683e-05,
1018
+ "loss": 0.09039004892110825,
1019
+ "mean_token_accuracy": 0.9841732494533062,
1020
+ "num_tokens": 673866.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
+ "eval_entropy": 0.22615607968275098,
1026
+ "eval_loss": 0.0748714804649353,
1027
+ "eval_mean_token_accuracy": 0.9701917439267256,
1028
+ "eval_num_tokens": 673866.0,
1029
+ "eval_runtime": 64.1728,
1030
+ "eval_samples_per_second": 1.075,
1031
+ "eval_steps_per_second": 1.075,
1032
  "step": 100
1033
  },
1034
  {
1035
+ "entropy": 0.20847708079963923,
1036
  "epoch": 1.2327718223583461,
1037
+ "grad_norm": 0.9453125,
1038
  "learning_rate": 7.804878048780489e-05,
1039
+ "loss": 0.032662514597177505,
1040
+ "mean_token_accuracy": 0.9919092357158661,
1041
+ "num_tokens": 681308.0,
1042
  "step": 101
1043
  },
1044
  {
1045
+ "entropy": 0.23787071648985147,
1046
  "epoch": 1.245022970903522,
1047
+ "grad_norm": 0.859375,
1048
  "learning_rate": 7.682926829268293e-05,
1049
+ "loss": 0.044949762523174286,
1050
+ "mean_token_accuracy": 0.987742405384779,
1051
+ "num_tokens": 687496.0,
1052
  "step": 102
1053
  },
1054
  {
1055
+ "entropy": 0.21969830617308617,
1056
  "epoch": 1.2572741194486983,
1057
+ "grad_norm": 0.8671875,
1058
  "learning_rate": 7.560975609756099e-05,
1059
+ "loss": 0.036048222333192825,
1060
+ "mean_token_accuracy": 0.98578891903162,
1061
+ "num_tokens": 694818.0,
1062
  "step": 103
1063
  },
1064
  {
1065
+ "entropy": 0.228535583242774,
1066
  "epoch": 1.2695252679938744,
1067
+ "grad_norm": 1.7109375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
+ "loss": 0.050321951508522034,
1070
+ "mean_token_accuracy": 0.9846261814236641,
1071
+ "num_tokens": 701351.0,
1072
  "step": 104
1073
  },
1074
  {
1075
+ "entropy": 0.21918219700455666,
1076
  "epoch": 1.2817764165390506,
1077
+ "grad_norm": 0.57421875,
1078
  "learning_rate": 7.317073170731707e-05,
1079
+ "loss": 0.03220512717962265,
1080
+ "mean_token_accuracy": 0.9897662363946438,
1081
+ "num_tokens": 707212.0,
1082
  "step": 105
1083
  },
1084
  {
1085
+ "entropy": 0.21648676693439484,
1086
  "epoch": 1.2940275650842268,
1087
+ "grad_norm": 0.921875,
1088
  "learning_rate": 7.195121951219513e-05,
1089
+ "loss": 0.031827542930841446,
1090
+ "mean_token_accuracy": 0.9904872179031372,
1091
+ "num_tokens": 714524.0,
1092
  "step": 106
1093
  },
1094
  {
1095
+ "entropy": 0.20004846714437008,
1096
  "epoch": 1.3062787136294027,
1097
+ "grad_norm": 1.0234375,
1098
  "learning_rate": 7.073170731707317e-05,
1099
+ "loss": 0.03981270268559456,
1100
+ "mean_token_accuracy": 0.9861926138401031,
1101
+ "num_tokens": 722033.0,
1102
  "step": 107
1103
  },
1104
  {
1105
+ "entropy": 0.21497153211385012,
1106
  "epoch": 1.318529862174579,
1107
+ "grad_norm": 0.53515625,
1108
  "learning_rate": 6.951219512195122e-05,
1109
+ "loss": 0.03612194582819939,
1110
+ "mean_token_accuracy": 0.9883794784545898,
1111
+ "num_tokens": 728835.0,
1112
  "step": 108
1113
  },
1114
  {
1115
+ "entropy": 0.22441515233367682,
1116
  "epoch": 1.3307810107197549,
1117
+ "grad_norm": 0.66796875,
1118
  "learning_rate": 6.829268292682928e-05,
1119
+ "loss": 0.037204962223768234,
1120
+ "mean_token_accuracy": 0.9865190424025059,
1121
+ "num_tokens": 735463.0,
1122
  "step": 109
1123
  },
1124
  {
1125
+ "entropy": 0.21172351390123367,
1126
  "epoch": 1.343032159264931,
1127
+ "grad_norm": 0.314453125,
1128
  "learning_rate": 6.707317073170732e-05,
1129
+ "loss": 0.03260833024978638,
1130
+ "mean_token_accuracy": 0.9877017810940742,
1131
+ "num_tokens": 742536.0,
1132
  "step": 110
1133
  },
1134
  {
1135
+ "entropy": 0.19597876677289605,
1136
  "epoch": 1.3552833078101072,
1137
+ "grad_norm": 0.419921875,
1138
  "learning_rate": 6.585365853658538e-05,
1139
+ "loss": 0.0339697040617466,
1140
+ "mean_token_accuracy": 0.990579642355442,
1141
+ "num_tokens": 749606.0,
1142
  "step": 111
1143
  },
1144
  {
1145
+ "entropy": 0.21933963894844055,
1146
  "epoch": 1.3675344563552834,
1147
+ "grad_norm": 0.53515625,
1148
  "learning_rate": 6.463414634146342e-05,
1149
+ "loss": 0.028515402227640152,
1150
+ "mean_token_accuracy": 0.9883383698761463,
1151
+ "num_tokens": 755287.0,
1152
  "step": 112
1153
  },
1154
  {
1155
+ "entropy": 0.21494697034358978,
1156
  "epoch": 1.3797856049004595,
1157
+ "grad_norm": 0.37890625,
1158
  "learning_rate": 6.341463414634146e-05,
1159
+ "loss": 0.03924579173326492,
1160
+ "mean_token_accuracy": 0.9876385144889355,
1161
+ "num_tokens": 763515.0,
1162
  "step": 113
1163
  },
1164
  {
1165
+ "entropy": 0.22842750838026404,
1166
  "epoch": 1.3920367534456355,
1167
+ "grad_norm": 1.1484375,
1168
  "learning_rate": 6.219512195121952e-05,
1169
+ "loss": 0.0367334708571434,
1170
+ "mean_token_accuracy": 0.9872251562774181,
1171
+ "num_tokens": 769660.0,
1172
  "step": 114
1173
  },
1174
  {
1175
+ "entropy": 0.2147415135987103,
1176
  "epoch": 1.4042879019908117,
1177
+ "grad_norm": 0.921875,
1178
  "learning_rate": 6.097560975609756e-05,
1179
+ "loss": 0.030023006722331047,
1180
+ "mean_token_accuracy": 0.9890519753098488,
1181
+ "num_tokens": 777068.0,
1182
  "step": 115
1183
  },
1184
  {
1185
+ "entropy": 0.2247378919273615,
1186
  "epoch": 1.4165390505359878,
1187
+ "grad_norm": 0.9375,
1188
  "learning_rate": 5.975609756097561e-05,
1189
+ "loss": 0.03915408253669739,
1190
+ "mean_token_accuracy": 0.9883266240358353,
1191
+ "num_tokens": 783422.0,
1192
  "step": 116
1193
  },
1194
  {
1195
+ "entropy": 0.19090860895812511,
1196
  "epoch": 1.4287901990811638,
1197
+ "grad_norm": 0.765625,
1198
  "learning_rate": 5.853658536585366e-05,
1199
+ "loss": 0.037202730774879456,
1200
+ "mean_token_accuracy": 0.9874398410320282,
1201
+ "num_tokens": 790851.0,
1202
  "step": 117
1203
  },
1204
  {
1205
+ "entropy": 0.2285028137266636,
1206
  "epoch": 1.44104134762634,
1207
+ "grad_norm": 1.9140625,
1208
  "learning_rate": 5.731707317073171e-05,
1209
+ "loss": 0.043229859322309494,
1210
+ "mean_token_accuracy": 0.9905107729136944,
1211
+ "num_tokens": 797801.0,
1212
  "step": 118
1213
  },
1214
  {
1215
+ "entropy": 0.2443255502730608,
1216
  "epoch": 1.4532924961715161,
1217
+ "grad_norm": 0.365234375,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
+ "loss": 0.04100143164396286,
1220
+ "mean_token_accuracy": 0.9880562499165535,
1221
+ "num_tokens": 804371.0,
1222
  "step": 119
1223
  },
1224
  {
1225
+ "entropy": 0.19626039918512106,
1226
  "epoch": 1.4655436447166923,
1227
+ "grad_norm": 0.83984375,
1228
  "learning_rate": 5.487804878048781e-05,
1229
+ "loss": 0.038516998291015625,
1230
+ "mean_token_accuracy": 0.988171175122261,
1231
+ "num_tokens": 812335.0,
1232
  "step": 120
1233
  },
1234
  {
1235
+ "entropy": 0.2181866616010666,
1236
  "epoch": 1.4777947932618682,
1237
+ "grad_norm": 0.53515625,
1238
  "learning_rate": 5.365853658536586e-05,
1239
+ "loss": 0.02816646918654442,
1240
+ "mean_token_accuracy": 0.9916124008595943,
1241
+ "num_tokens": 818577.0,
1242
  "step": 121
1243
  },
1244
  {
1245
+ "entropy": 0.20635052677243948,
1246
  "epoch": 1.4900459418070444,
1247
+ "grad_norm": 0.74609375,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
+ "loss": 0.04106622561812401,
1250
+ "mean_token_accuracy": 0.9839451834559441,
1251
+ "num_tokens": 825535.0,
1252
  "step": 122
1253
  },
1254
  {
1255
+ "entropy": 0.21835408825427294,
1256
  "epoch": 1.5022970903522204,
1257
+ "grad_norm": 0.427734375,
1258
  "learning_rate": 5.121951219512195e-05,
1259
+ "loss": 0.026341412216424942,
1260
+ "mean_token_accuracy": 0.9940293915569782,
1261
+ "num_tokens": 831505.0,
1262
  "step": 123
1263
  },
1264
  {
1265
+ "entropy": 0.21729151718318462,
1266
  "epoch": 1.5145482388973965,
1267
+ "grad_norm": 0.455078125,
1268
  "learning_rate": 5e-05,
1269
+ "loss": 0.028432821854948997,
1270
+ "mean_token_accuracy": 0.9925089627504349,
1271
+ "num_tokens": 838385.0,
1272
  "step": 124
1273
  },
1274
  {
1275
+ "entropy": 0.23625962156802416,
1276
  "epoch": 1.5267993874425727,
1277
+ "grad_norm": 0.72265625,
1278
  "learning_rate": 4.878048780487805e-05,
1279
+ "loss": 0.03885198384523392,
1280
+ "mean_token_accuracy": 0.9883155077695847,
1281
+ "num_tokens": 845433.0,
1282
  "step": 125
1283
  },
1284
  {
1285
+ "entropy": 0.21153692342340946,
1286
  "epoch": 1.5390505359877489,
1287
+ "grad_norm": 0.66796875,
1288
  "learning_rate": 4.75609756097561e-05,
1289
+ "loss": 0.03570759296417236,
1290
+ "mean_token_accuracy": 0.9910184219479561,
1291
+ "num_tokens": 852471.0,
1292
  "step": 126
1293
  },
1294
  {
1295
+ "entropy": 0.23752436228096485,
1296
  "epoch": 1.551301684532925,
1297
+ "grad_norm": 0.640625,
1298
  "learning_rate": 4.634146341463415e-05,
1299
+ "loss": 0.028638798743486404,
1300
+ "mean_token_accuracy": 0.9928638078272343,
1301
+ "num_tokens": 858702.0,
1302
  "step": 127
1303
  },
1304
  {
1305
+ "entropy": 0.2128417994827032,
1306
  "epoch": 1.5635528330781012,
1307
+ "grad_norm": 0.8828125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
+ "loss": 0.029636576771736145,
1310
+ "mean_token_accuracy": 0.9910452663898468,
1311
+ "num_tokens": 865325.0,
1312
  "step": 128
1313
  },
1314
  {
1315
+ "entropy": 0.216589767485857,
1316
  "epoch": 1.5758039816232772,
1317
+ "grad_norm": 0.453125,
1318
  "learning_rate": 4.390243902439025e-05,
1319
+ "loss": 0.03238631784915924,
1320
+ "mean_token_accuracy": 0.9904623441398144,
1321
+ "num_tokens": 871341.0,
1322
  "step": 129
1323
  },
1324
  {
1325
+ "entropy": 0.19242106284946203,
1326
  "epoch": 1.5880551301684533,
1327
+ "grad_norm": 0.392578125,
1328
  "learning_rate": 4.26829268292683e-05,
1329
+ "loss": 0.0261989776045084,
1330
+ "mean_token_accuracy": 0.9925210140645504,
1331
+ "num_tokens": 878973.0,
1332
  "step": 130
1333
  },
1334
  {
1335
+ "entropy": 0.22208478767424822,
1336
  "epoch": 1.6003062787136293,
1337
+ "grad_norm": 0.328125,
1338
  "learning_rate": 4.146341463414634e-05,
1339
+ "loss": 0.029643766582012177,
1340
+ "mean_token_accuracy": 0.9926025420427322,
1341
+ "num_tokens": 885517.0,
1342
  "step": 131
1343
  },
1344
  {
1345
+ "entropy": 0.19283092580735683,
1346
  "epoch": 1.6125574272588055,
1347
+ "grad_norm": 0.423828125,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
+ "loss": 0.03948017954826355,
1350
+ "mean_token_accuracy": 0.9875317811965942,
1351
+ "num_tokens": 893273.0,
1352
  "step": 132
1353
  },
1354
  {
1355
+ "entropy": 0.18790056556463242,
1356
  "epoch": 1.6248085758039816,
1357
+ "grad_norm": 0.625,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
+ "loss": 0.025747017934918404,
1360
+ "mean_token_accuracy": 0.9934940375387669,
1361
+ "num_tokens": 900019.0,
1362
  "step": 133
1363
  },
1364
  {
1365
+ "entropy": 0.20814241049811244,
1366
  "epoch": 1.6370597243491578,
1367
+ "grad_norm": 0.376953125,
1368
  "learning_rate": 3.780487804878049e-05,
1369
+ "loss": 0.03998865559697151,
1370
+ "mean_token_accuracy": 0.9876968078315258,
1371
+ "num_tokens": 906633.0,
1372
  "step": 134
1373
  },
1374
  {
1375
+ "entropy": 0.1975369704887271,
1376
  "epoch": 1.649310872894334,
1377
+ "grad_norm": 0.3203125,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
+ "loss": 0.031131668016314507,
1380
+ "mean_token_accuracy": 0.9915927015244961,
1381
+ "num_tokens": 913990.0,
1382
  "step": 135
1383
  },
1384
  {
1385
+ "entropy": 0.23459685500711203,
1386
  "epoch": 1.66156202143951,
1387
+ "grad_norm": 0.76171875,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
+ "loss": 0.03373259678483009,
1390
+ "mean_token_accuracy": 0.9898596629500389,
1391
+ "num_tokens": 919248.0,
1392
  "step": 136
1393
  },
1394
  {
1395
+ "entropy": 0.1909911371767521,
1396
  "epoch": 1.673813169984686,
1397
+ "grad_norm": 0.60546875,
1398
  "learning_rate": 3.414634146341464e-05,
1399
+ "loss": 0.037791188806295395,
1400
+ "mean_token_accuracy": 0.9897548258304596,
1401
+ "num_tokens": 926248.0,
1402
  "step": 137
1403
  },
1404
  {
1405
+ "entropy": 0.2332595670595765,
1406
  "epoch": 1.686064318529862,
1407
+ "grad_norm": 0.89453125,
1408
  "learning_rate": 3.292682926829269e-05,
1409
+ "loss": 0.03799242898821831,
1410
+ "mean_token_accuracy": 0.9867184162139893,
1411
+ "num_tokens": 932490.0,
1412
  "step": 138
1413
  },
1414
  {
1415
+ "entropy": 0.22243124432861805,
1416
  "epoch": 1.6983154670750382,
1417
+ "grad_norm": 0.61328125,
1418
  "learning_rate": 3.170731707317073e-05,
1419
+ "loss": 0.04291514679789543,
1420
+ "mean_token_accuracy": 0.9877815246582031,
1421
+ "num_tokens": 938756.0,
1422
  "step": 139
1423
  },
1424
  {
1425
+ "entropy": 0.20778016652911901,
1426
  "epoch": 1.7105666156202144,
1427
+ "grad_norm": 0.41796875,
1428
  "learning_rate": 3.048780487804878e-05,
1429
+ "loss": 0.023588458076119423,
1430
+ "mean_token_accuracy": 0.9942950084805489,
1431
+ "num_tokens": 945866.0,
1432
  "step": 140
1433
  },
1434
  {
1435
+ "entropy": 0.18776059616357088,
1436
  "epoch": 1.7228177641653906,
1437
+ "grad_norm": 0.41796875,
1438
  "learning_rate": 2.926829268292683e-05,
1439
+ "loss": 0.03229852020740509,
1440
+ "mean_token_accuracy": 0.9909596405923367,
1441
+ "num_tokens": 952865.0,
1442
  "step": 141
1443
  },
1444
  {
1445
+ "entropy": 0.18707702960819006,
1446
  "epoch": 1.7350689127105667,
1447
+ "grad_norm": 0.609375,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
+ "loss": 0.03691868111491203,
1450
+ "mean_token_accuracy": 0.9900590926408768,
1451
+ "num_tokens": 959190.0,
1452
  "step": 142
1453
  },
1454
  {
1455
+ "entropy": 0.1914756903424859,
1456
  "epoch": 1.7473200612557427,
1457
+ "grad_norm": 0.408203125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
+ "loss": 0.03487441688776016,
1460
+ "mean_token_accuracy": 0.9909356310963631,
1461
+ "num_tokens": 966059.0,
1462
  "step": 143
1463
  },
1464
  {
1465
+ "entropy": 0.20852853963151574,
1466
  "epoch": 1.7595712098009189,
1467
+ "grad_norm": 0.380859375,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
+ "loss": 0.03023841790854931,
1470
+ "mean_token_accuracy": 0.9922478385269642,
1471
+ "num_tokens": 973553.0,
1472
  "step": 144
1473
  },
1474
  {
1475
+ "entropy": 0.18278094567358494,
1476
  "epoch": 1.7718223583460948,
1477
+ "grad_norm": 0.65625,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
+ "loss": 0.03335383161902428,
1480
+ "mean_token_accuracy": 0.9902437664568424,
1481
+ "num_tokens": 980748.0,
1482
  "step": 145
1483
  },
1484
  {
1485
+ "entropy": 0.2156418706290424,
1486
  "epoch": 1.784073506891271,
1487
+ "grad_norm": 0.58984375,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
+ "loss": 0.026211977005004883,
1490
+ "mean_token_accuracy": 0.9913386814296246,
1491
+ "num_tokens": 987018.0,
1492
  "step": 146
1493
  },
1494
  {
1495
+ "entropy": 0.2084086169488728,
1496
  "epoch": 1.7963246554364471,
1497
+ "grad_norm": 0.37890625,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
+ "loss": 0.029074503108859062,
1500
+ "mean_token_accuracy": 0.9920879267156124,
1501
+ "num_tokens": 993841.0,
1502
  "step": 147
1503
  },
1504
  {
1505
+ "entropy": 0.2162067350000143,
1506
  "epoch": 1.8085758039816233,
1507
+ "grad_norm": 0.38671875,
1508
  "learning_rate": 2.073170731707317e-05,
1509
+ "loss": 0.027591165155172348,
1510
+ "mean_token_accuracy": 0.9916894063353539,
1511
+ "num_tokens": 1000318.0,
1512
  "step": 148
1513
  },
1514
  {
1515
+ "entropy": 0.22895692195743322,
1516
  "epoch": 1.8208269525267995,
1517
+ "grad_norm": 1.421875,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
+ "loss": 0.034101299941539764,
1520
+ "mean_token_accuracy": 0.9889856353402138,
1521
+ "num_tokens": 1005747.0,
1522
  "step": 149
1523
  },
1524
  {
1525
+ "entropy": 0.21029841899871826,
1526
  "epoch": 1.8330781010719757,
1527
+ "grad_norm": 0.59375,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
+ "loss": 0.04408642649650574,
1530
+ "mean_token_accuracy": 0.988445583730936,
1531
+ "num_tokens": 1013365.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
+ "eval_entropy": 0.21028992522885834,
1537
+ "eval_loss": 0.06481878459453583,
1538
+ "eval_mean_token_accuracy": 0.9753203677094501,
1539
+ "eval_num_tokens": 1013365.0,
1540
+ "eval_runtime": 64.1224,
1541
+ "eval_samples_per_second": 1.076,
1542
+ "eval_steps_per_second": 1.076,
1543
  "step": 150
1544
  }
1545
  ],
 
1560
  "attributes": {}
1561
  }
1562
  },
1563
+ "total_flos": 4.588638959382528e+16,
1564
  "train_batch_size": 1,
1565
  "trial_name": null,
1566
  "trial_params": null
checkpoint-164/adapter_config.json CHANGED
@@ -30,11 +30,11 @@
30
  "revision": null,
31
  "target_modules": [
32
  "o_proj",
33
- "q_proj",
34
- "gate_proj",
35
- "down_proj",
36
  "k_proj",
 
 
37
  "v_proj",
 
38
  "up_proj"
39
  ],
40
  "target_parameters": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "o_proj",
 
 
 
33
  "k_proj",
34
+ "down_proj",
35
+ "q_proj",
36
  "v_proj",
37
+ "gate_proj",
38
  "up_proj"
39
  ],
40
  "target_parameters": null,
checkpoint-164/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:978fce5d23f8e65a5e89fd4fa0a502a5a505733bc5d548832ae8e85ecf1af748
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d7799ff7ae7f290e67eada9d323c6418a3a9db26bdb2158f039838076d95d1f
3
  size 83946192
checkpoint-164/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9b61551443a68aa89ddbd1fc5688ce7159ec4929f007d4514b1ed43fd6e0b19
3
  size 85728997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d6acb58aa21bd3fa4437c7d5eee173fd6389daa33a68a747aec69733fea274a
3
  size 85728997
checkpoint-164/trainer_state.json CHANGED
@@ -10,1676 +10,1676 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 0.45975130423903465,
14
  "epoch": 0.01225114854517611,
15
- "grad_norm": 0.00689697265625,
16
  "learning_rate": 0.0002,
17
- "loss": 0.0005938471877016127,
18
- "mean_token_accuracy": 0.9997171945869923,
19
- "num_tokens": 6092.0,
20
  "step": 1
21
  },
22
  {
23
- "entropy": 0.4158535748720169,
24
  "epoch": 0.02450229709035222,
25
- "grad_norm": 0.00017833709716796875,
26
  "learning_rate": 0.00019878048780487805,
27
- "loss": 3.472402386250906e-05,
28
- "mean_token_accuracy": 1.0,
29
- "num_tokens": 11535.0,
30
  "step": 2
31
  },
32
  {
33
- "entropy": 0.4280186090618372,
34
  "epoch": 0.036753445635528334,
35
- "grad_norm": 8.20159912109375e-05,
36
  "learning_rate": 0.0001975609756097561,
37
- "loss": 2.510893318685703e-05,
38
- "mean_token_accuracy": 1.0,
39
- "num_tokens": 16432.0,
40
  "step": 3
41
  },
42
  {
43
- "entropy": 0.41829014383256435,
44
  "epoch": 0.04900459418070444,
45
- "grad_norm": 0.034912109375,
46
  "learning_rate": 0.00019634146341463416,
47
- "loss": 0.0034790209028869867,
48
- "mean_token_accuracy": 0.9971264377236366,
49
- "num_tokens": 20507.0,
50
  "step": 4
51
  },
52
  {
53
- "entropy": 0.3744635935872793,
54
  "epoch": 0.06125574272588055,
55
- "grad_norm": 0.0001983642578125,
56
  "learning_rate": 0.0001951219512195122,
57
- "loss": 1.627415622351691e-05,
58
- "mean_token_accuracy": 1.0,
59
- "num_tokens": 26122.0,
60
  "step": 5
61
  },
62
  {
63
- "entropy": 0.40895503386855125,
64
  "epoch": 0.07350689127105667,
65
- "grad_norm": 3.457069396972656e-05,
66
  "learning_rate": 0.00019390243902439025,
67
- "loss": 9.875144314719364e-06,
68
- "mean_token_accuracy": 1.0,
69
- "num_tokens": 30847.0,
70
  "step": 6
71
  },
72
  {
73
- "entropy": 0.36759823746979237,
74
  "epoch": 0.08575803981623277,
75
- "grad_norm": 8.869171142578125e-05,
76
  "learning_rate": 0.0001926829268292683,
77
- "loss": 1.5701301890658215e-05,
78
- "mean_token_accuracy": 1.0,
79
- "num_tokens": 36541.0,
80
  "step": 7
81
  },
82
  {
83
- "entropy": 0.3891780599951744,
84
  "epoch": 0.09800918836140889,
85
- "grad_norm": 5.078315734863281e-05,
86
  "learning_rate": 0.00019146341463414633,
87
- "loss": 1.2823864381061867e-05,
88
- "mean_token_accuracy": 1.0,
89
- "num_tokens": 41001.0,
90
  "step": 8
91
  },
92
  {
93
- "entropy": 0.4104680269956589,
94
  "epoch": 0.11026033690658499,
95
- "grad_norm": 0.02099609375,
96
  "learning_rate": 0.0001902439024390244,
97
- "loss": 0.0011738959001377225,
98
- "mean_token_accuracy": 0.9996279776096344,
99
- "num_tokens": 45467.0,
100
  "step": 9
101
  },
102
  {
103
- "entropy": 0.39176585152745247,
104
  "epoch": 0.1225114854517611,
105
- "grad_norm": 0.0703125,
106
  "learning_rate": 0.00018902439024390244,
107
- "loss": 0.0007126386626623571,
108
- "mean_token_accuracy": 0.9997509978711605,
109
- "num_tokens": 50478.0,
110
  "step": 10
111
  },
112
  {
113
- "entropy": 0.3562493957579136,
114
  "epoch": 0.13476263399693722,
115
- "grad_norm": 0.0004405975341796875,
116
  "learning_rate": 0.0001878048780487805,
117
- "loss": 2.2854681446915492e-05,
118
- "mean_token_accuracy": 1.0,
119
- "num_tokens": 56181.0,
120
  "step": 11
121
  },
122
  {
123
- "entropy": 0.3858679383993149,
124
  "epoch": 0.14701378254211334,
125
- "grad_norm": 0.00016307830810546875,
126
  "learning_rate": 0.00018658536585365856,
127
- "loss": 1.8136681319447234e-05,
128
- "mean_token_accuracy": 1.0,
129
- "num_tokens": 62946.0,
130
  "step": 12
131
  },
132
  {
133
- "entropy": 0.37994169630110264,
134
  "epoch": 0.15926493108728942,
135
- "grad_norm": 0.000640869140625,
136
  "learning_rate": 0.0001853658536585366,
137
- "loss": 1.9365113985259086e-05,
138
- "mean_token_accuracy": 1.0,
139
- "num_tokens": 68436.0,
140
  "step": 13
141
  },
142
  {
143
- "entropy": 0.3682236662134528,
144
  "epoch": 0.17151607963246554,
145
- "grad_norm": 4.267692565917969e-05,
146
  "learning_rate": 0.00018414634146341464,
147
- "loss": 9.09720802155789e-06,
148
- "mean_token_accuracy": 1.0,
149
- "num_tokens": 73603.0,
150
  "step": 14
151
  },
152
  {
153
- "entropy": 0.40290670469403267,
154
  "epoch": 0.18376722817764166,
155
- "grad_norm": 9.441375732421875e-05,
156
  "learning_rate": 0.0001829268292682927,
157
- "loss": 1.5181853086687624e-05,
158
- "mean_token_accuracy": 1.0,
159
- "num_tokens": 77845.0,
160
  "step": 15
161
  },
162
  {
163
- "entropy": 0.36544002406299114,
164
  "epoch": 0.19601837672281777,
165
- "grad_norm": 0.001007080078125,
166
  "learning_rate": 0.00018170731707317075,
167
- "loss": 1.5547768271062523e-05,
168
- "mean_token_accuracy": 1.0,
169
- "num_tokens": 82744.0,
170
  "step": 16
171
  },
172
  {
173
- "entropy": 0.38514361158013344,
174
  "epoch": 0.2082695252679939,
175
- "grad_norm": 3.147125244140625e-05,
176
  "learning_rate": 0.0001804878048780488,
177
- "loss": 9.32630973693449e-06,
178
- "mean_token_accuracy": 1.0,
179
- "num_tokens": 87453.0,
180
  "step": 17
181
  },
182
  {
183
- "entropy": 0.38769579119980335,
184
  "epoch": 0.22052067381316998,
185
- "grad_norm": 0.0001983642578125,
186
  "learning_rate": 0.00017926829268292684,
187
- "loss": 1.4681676475447603e-05,
188
- "mean_token_accuracy": 1.0,
189
- "num_tokens": 92321.0,
190
  "step": 18
191
  },
192
  {
193
- "entropy": 0.3753592735156417,
194
  "epoch": 0.2327718223583461,
195
- "grad_norm": 0.00019168853759765625,
196
  "learning_rate": 0.00017804878048780488,
197
- "loss": 2.8633825422730297e-05,
198
- "mean_token_accuracy": 1.0,
199
- "num_tokens": 97146.0,
200
  "step": 19
201
  },
202
  {
203
- "entropy": 0.3909287117421627,
204
  "epoch": 0.2450229709035222,
205
- "grad_norm": 0.0004482269287109375,
206
  "learning_rate": 0.00017682926829268295,
207
- "loss": 1.8875809473684058e-05,
208
- "mean_token_accuracy": 1.0,
209
- "num_tokens": 101943.0,
210
  "step": 20
211
  },
212
  {
213
- "entropy": 0.4073780719190836,
214
  "epoch": 0.2572741194486983,
215
- "grad_norm": 0.36328125,
216
  "learning_rate": 0.000175609756097561,
217
- "loss": 0.005490713287144899,
218
- "mean_token_accuracy": 0.9996448867022991,
219
- "num_tokens": 106772.0,
220
  "step": 21
221
  },
222
  {
223
- "entropy": 0.3673222251236439,
224
  "epoch": 0.26952526799387444,
225
- "grad_norm": 5.1975250244140625e-05,
226
  "learning_rate": 0.00017439024390243903,
227
- "loss": 1.0117664714925922e-05,
228
- "mean_token_accuracy": 1.0,
229
- "num_tokens": 112558.0,
230
  "step": 22
231
  },
232
  {
233
- "entropy": 0.39382114820182323,
234
  "epoch": 0.28177641653905056,
235
- "grad_norm": 0.0003662109375,
236
  "learning_rate": 0.00017317073170731708,
237
- "loss": 1.4868882317387033e-05,
238
- "mean_token_accuracy": 1.0,
239
- "num_tokens": 117489.0,
240
  "step": 23
241
  },
242
  {
243
- "entropy": 0.4107118733227253,
244
  "epoch": 0.29402756508422667,
245
- "grad_norm": 0.0009918212890625,
246
  "learning_rate": 0.00017195121951219512,
247
- "loss": 3.529411696945317e-05,
248
- "mean_token_accuracy": 1.0,
249
- "num_tokens": 123010.0,
250
  "step": 24
251
  },
252
  {
253
- "entropy": 0.3787885829806328,
254
  "epoch": 0.30627871362940273,
255
- "grad_norm": 0.005859375,
256
  "learning_rate": 0.0001707317073170732,
257
- "loss": 9.493537800153717e-05,
258
- "mean_token_accuracy": 1.0,
259
- "num_tokens": 127716.0,
260
  "step": 25
261
  },
262
  {
263
- "entropy": 0.37760412879288197,
264
  "epoch": 0.31852986217457885,
265
- "grad_norm": 0.00029754638671875,
266
  "learning_rate": 0.00016951219512195123,
267
- "loss": 1.7393856978742406e-05,
268
- "mean_token_accuracy": 1.0,
269
- "num_tokens": 132372.0,
270
  "step": 26
271
  },
272
  {
273
- "entropy": 0.38016335386782885,
274
  "epoch": 0.33078101071975496,
275
- "grad_norm": 0.0198974609375,
276
  "learning_rate": 0.00016829268292682927,
277
- "loss": 0.00031554378801956773,
278
- "mean_token_accuracy": 1.0,
279
- "num_tokens": 137028.0,
280
  "step": 27
281
  },
282
  {
283
- "entropy": 0.3974682204425335,
284
  "epoch": 0.3430321592649311,
285
- "grad_norm": 0.0546875,
286
  "learning_rate": 0.00016707317073170731,
287
- "loss": 0.0025693816132843494,
288
- "mean_token_accuracy": 0.9993556700646877,
289
- "num_tokens": 142088.0,
290
  "step": 28
291
  },
292
  {
293
- "entropy": 0.3819452077150345,
294
  "epoch": 0.3552833078101072,
295
- "grad_norm": 0.0137939453125,
296
  "learning_rate": 0.00016585365853658536,
297
- "loss": 0.0001885725650936365,
298
- "mean_token_accuracy": 1.0,
299
- "num_tokens": 147481.0,
300
  "step": 29
301
  },
302
  {
303
- "entropy": 0.41766250506043434,
304
  "epoch": 0.3675344563552833,
305
- "grad_norm": 0.000759124755859375,
306
  "learning_rate": 0.00016463414634146343,
307
- "loss": 1.8762426407192834e-05,
308
- "mean_token_accuracy": 1.0,
309
- "num_tokens": 152973.0,
310
  "step": 30
311
  },
312
  {
313
- "entropy": 0.42338451743125916,
314
  "epoch": 0.37978560490045943,
315
- "grad_norm": 0.00015735626220703125,
316
  "learning_rate": 0.00016341463414634147,
317
- "loss": 1.797903678379953e-05,
318
- "mean_token_accuracy": 1.0,
319
- "num_tokens": 156786.0,
320
  "step": 31
321
  },
322
  {
323
- "entropy": 0.41780348122119904,
324
  "epoch": 0.39203675344563554,
325
- "grad_norm": 0.00016117095947265625,
326
  "learning_rate": 0.00016219512195121954,
327
- "loss": 2.4896233298932202e-05,
328
- "mean_token_accuracy": 1.0,
329
- "num_tokens": 162859.0,
330
  "step": 32
331
  },
332
  {
333
- "entropy": 0.3986742924898863,
334
  "epoch": 0.40428790199081166,
335
- "grad_norm": 0.0003185272216796875,
336
  "learning_rate": 0.00016097560975609758,
337
- "loss": 2.1766518329968676e-05,
338
- "mean_token_accuracy": 1.0,
339
- "num_tokens": 167969.0,
340
  "step": 33
341
  },
342
  {
343
- "entropy": 0.40497588738799095,
344
  "epoch": 0.4165390505359878,
345
- "grad_norm": 0.00141143798828125,
346
  "learning_rate": 0.00015975609756097562,
347
- "loss": 5.013354166294448e-05,
348
- "mean_token_accuracy": 1.0,
349
- "num_tokens": 172518.0,
350
  "step": 34
351
  },
352
  {
353
- "entropy": 0.44378601387143135,
354
  "epoch": 0.42879019908116384,
355
- "grad_norm": 0.007415771484375,
356
  "learning_rate": 0.00015853658536585366,
357
- "loss": 0.00011341742356307805,
358
- "mean_token_accuracy": 1.0,
359
- "num_tokens": 177085.0,
360
  "step": 35
361
  },
362
  {
363
- "entropy": 0.44088135845959187,
364
  "epoch": 0.44104134762633995,
365
- "grad_norm": 0.0224609375,
366
  "learning_rate": 0.00015731707317073173,
367
- "loss": 0.0003354589862283319,
368
- "mean_token_accuracy": 1.0,
369
- "num_tokens": 181617.0,
370
  "step": 36
371
  },
372
  {
373
- "entropy": 0.40403734613209963,
374
  "epoch": 0.45329249617151607,
375
- "grad_norm": 0.09326171875,
376
  "learning_rate": 0.00015609756097560978,
377
- "loss": 0.0009270600858144462,
378
- "mean_token_accuracy": 0.9998405613005161,
379
- "num_tokens": 186836.0,
380
  "step": 37
381
  },
382
  {
383
- "entropy": 0.44129026494920254,
384
  "epoch": 0.4655436447166922,
385
- "grad_norm": 0.0001068115234375,
386
  "learning_rate": 0.00015487804878048782,
387
- "loss": 1.9685152437887155e-05,
388
- "mean_token_accuracy": 1.0,
389
- "num_tokens": 191224.0,
390
  "step": 38
391
  },
392
  {
393
- "entropy": 0.41146982461214066,
394
  "epoch": 0.4777947932618683,
395
- "grad_norm": 6.437301635742188e-05,
396
  "learning_rate": 0.00015365853658536586,
397
- "loss": 1.4887214092595968e-05,
398
- "mean_token_accuracy": 1.0,
399
- "num_tokens": 195926.0,
400
  "step": 39
401
  },
402
  {
403
- "entropy": 0.4401062335819006,
404
  "epoch": 0.4900459418070444,
405
- "grad_norm": 0.0125732421875,
406
  "learning_rate": 0.0001524390243902439,
407
- "loss": 0.0006239329231902957,
408
- "mean_token_accuracy": 0.999550361186266,
409
- "num_tokens": 200772.0,
410
  "step": 40
411
  },
412
  {
413
- "entropy": 0.4169564712792635,
414
  "epoch": 0.5022970903522205,
415
- "grad_norm": 0.000118255615234375,
416
  "learning_rate": 0.00015121951219512197,
417
- "loss": 2.6680882001528516e-05,
418
- "mean_token_accuracy": 1.0,
419
- "num_tokens": 204499.0,
420
  "step": 41
421
  },
422
  {
423
- "entropy": 0.45378032699227333,
424
  "epoch": 0.5145482388973966,
425
- "grad_norm": 0.00011491775512695312,
426
  "learning_rate": 0.00015000000000000001,
427
- "loss": 2.471652624080889e-05,
428
- "mean_token_accuracy": 1.0,
429
- "num_tokens": 208814.0,
430
  "step": 42
431
  },
432
  {
433
- "entropy": 0.4465767778456211,
434
  "epoch": 0.5267993874425727,
435
- "grad_norm": 0.000263214111328125,
436
  "learning_rate": 0.00014878048780487806,
437
- "loss": 3.366273449501023e-05,
438
- "mean_token_accuracy": 1.0,
439
- "num_tokens": 213907.0,
440
  "step": 43
441
  },
442
  {
443
- "entropy": 0.4534517452120781,
444
  "epoch": 0.5390505359877489,
445
- "grad_norm": 0.000728607177734375,
446
  "learning_rate": 0.0001475609756097561,
447
- "loss": 2.826840864145197e-05,
448
- "mean_token_accuracy": 1.0,
449
- "num_tokens": 218988.0,
450
  "step": 44
451
  },
452
  {
453
- "entropy": 0.4201868511736393,
454
  "epoch": 0.5513016845329249,
455
- "grad_norm": 0.0196533203125,
456
  "learning_rate": 0.00014634146341463414,
457
- "loss": 0.000961265352088958,
458
- "mean_token_accuracy": 0.9995967745780945,
459
- "num_tokens": 223595.0,
460
  "step": 45
461
  },
462
  {
463
- "entropy": 0.4538087658584118,
464
  "epoch": 0.5635528330781011,
465
- "grad_norm": 0.000629425048828125,
466
  "learning_rate": 0.0001451219512195122,
467
- "loss": 2.982705154863652e-05,
468
- "mean_token_accuracy": 1.0,
469
- "num_tokens": 228244.0,
470
  "step": 46
471
  },
472
  {
473
- "entropy": 0.43760119564831257,
474
  "epoch": 0.5758039816232772,
475
- "grad_norm": 6.151199340820312e-05,
476
  "learning_rate": 0.00014390243902439025,
477
- "loss": 1.6359297660528682e-05,
478
- "mean_token_accuracy": 1.0,
479
- "num_tokens": 232606.0,
480
  "step": 47
481
  },
482
  {
483
- "entropy": 0.44127281196415424,
484
  "epoch": 0.5880551301684533,
485
- "grad_norm": 9.632110595703125e-05,
486
  "learning_rate": 0.0001426829268292683,
487
- "loss": 2.9222681405371986e-05,
488
- "mean_token_accuracy": 1.0,
489
- "num_tokens": 236563.0,
490
  "step": 48
491
  },
492
  {
493
- "entropy": 0.4647264387458563,
494
  "epoch": 0.6003062787136294,
495
- "grad_norm": 6.818771362304688e-05,
496
  "learning_rate": 0.00014146341463414634,
497
- "loss": 1.6634010535199195e-05,
498
- "mean_token_accuracy": 1.0,
499
- "num_tokens": 241214.0,
500
  "step": 49
501
  },
502
  {
503
- "entropy": 0.43234376423060894,
504
  "epoch": 0.6125574272588055,
505
- "grad_norm": 9.107589721679688e-05,
506
  "learning_rate": 0.00014024390243902438,
507
- "loss": 2.512251739972271e-05,
508
- "mean_token_accuracy": 1.0,
509
- "num_tokens": 245200.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
- "eval_entropy": 0.42710635541141895,
515
- "eval_loss": 0.0009002267033793032,
516
- "eval_mean_token_accuracy": 0.9997843339823295,
517
- "eval_num_tokens": 245200.0,
518
- "eval_runtime": 51.2948,
519
- "eval_samples_per_second": 1.345,
520
- "eval_steps_per_second": 1.345,
521
  "step": 50
522
  },
523
  {
524
- "entropy": 0.436727499589324,
525
  "epoch": 0.6248085758039816,
526
- "grad_norm": 6.079673767089844e-05,
527
  "learning_rate": 0.00013902439024390245,
528
- "loss": 1.7863472749013454e-05,
529
- "mean_token_accuracy": 1.0,
530
- "num_tokens": 249761.0,
531
  "step": 51
532
  },
533
  {
534
- "entropy": 0.4489326383918524,
535
  "epoch": 0.6370597243491577,
536
- "grad_norm": 0.010009765625,
537
  "learning_rate": 0.0001378048780487805,
538
- "loss": 9.14962001843378e-05,
539
- "mean_token_accuracy": 1.0,
540
- "num_tokens": 254787.0,
541
  "step": 52
542
  },
543
  {
544
- "entropy": 0.4518893454223871,
545
  "epoch": 0.6493108728943339,
546
- "grad_norm": 0.029052734375,
547
  "learning_rate": 0.00013658536585365856,
548
- "loss": 0.002504949690774083,
549
- "mean_token_accuracy": 0.9991238303482533,
550
- "num_tokens": 260287.0,
551
  "step": 53
552
  },
553
  {
554
- "entropy": 0.4276025863364339,
555
  "epoch": 0.6615620214395099,
556
- "grad_norm": 0.00022411346435546875,
557
  "learning_rate": 0.0001353658536585366,
558
- "loss": 1.9805909687420353e-05,
559
- "mean_token_accuracy": 1.0,
560
- "num_tokens": 264810.0,
561
  "step": 54
562
  },
563
  {
564
- "entropy": 0.4455657321959734,
565
  "epoch": 0.6738131699846861,
566
- "grad_norm": 0.09912109375,
567
  "learning_rate": 0.00013414634146341464,
568
- "loss": 0.005040395073592663,
569
- "mean_token_accuracy": 0.9974489808082581,
570
- "num_tokens": 270386.0,
571
  "step": 55
572
  },
573
  {
574
- "entropy": 0.48375592939555645,
575
  "epoch": 0.6860643185298622,
576
- "grad_norm": 0.00020694732666015625,
577
  "learning_rate": 0.0001329268292682927,
578
- "loss": 3.307354199932888e-05,
579
- "mean_token_accuracy": 1.0,
580
- "num_tokens": 274391.0,
581
  "step": 56
582
  },
583
  {
584
- "entropy": 0.4558328855782747,
585
  "epoch": 0.6983154670750383,
586
- "grad_norm": 0.00011205673217773438,
587
  "learning_rate": 0.00013170731707317076,
588
- "loss": 2.9195363822509535e-05,
589
- "mean_token_accuracy": 1.0,
590
- "num_tokens": 279716.0,
591
  "step": 57
592
  },
593
  {
594
- "entropy": 0.4038175716996193,
595
  "epoch": 0.7105666156202144,
596
- "grad_norm": 0.130859375,
597
  "learning_rate": 0.0001304878048780488,
598
- "loss": 0.002872227458283305,
599
- "mean_token_accuracy": 0.9989018365740776,
600
- "num_tokens": 285404.0,
601
  "step": 58
602
  },
603
  {
604
- "entropy": 0.4584309756755829,
605
  "epoch": 0.7228177641653905,
606
- "grad_norm": 0.02294921875,
607
  "learning_rate": 0.00012926829268292684,
608
- "loss": 0.0006162020144984126,
609
- "mean_token_accuracy": 0.9997650384902954,
610
- "num_tokens": 289992.0,
611
  "step": 59
612
  },
613
  {
614
- "entropy": 0.47067076340317726,
615
  "epoch": 0.7350689127105666,
616
- "grad_norm": 5.14984130859375e-05,
617
  "learning_rate": 0.00012804878048780488,
618
- "loss": 1.8253980670124292e-05,
619
- "mean_token_accuracy": 1.0,
620
- "num_tokens": 294861.0,
621
  "step": 60
622
  },
623
  {
624
- "entropy": 0.4258435070514679,
625
  "epoch": 0.7473200612557427,
626
- "grad_norm": 6.437301635742188e-05,
627
  "learning_rate": 0.00012682926829268293,
628
- "loss": 2.3211847292259336e-05,
629
- "mean_token_accuracy": 1.0,
630
- "num_tokens": 300355.0,
631
  "step": 61
632
  },
633
  {
634
- "entropy": 0.4751600846648216,
635
  "epoch": 0.7595712098009189,
636
- "grad_norm": 0.0001201629638671875,
637
  "learning_rate": 0.000125609756097561,
638
- "loss": 2.862562905647792e-05,
639
- "mean_token_accuracy": 1.0,
640
- "num_tokens": 305776.0,
641
  "step": 62
642
  },
643
  {
644
- "entropy": 0.43714143335819244,
645
  "epoch": 0.7718223583460949,
646
- "grad_norm": 8.153915405273438e-05,
647
  "learning_rate": 0.00012439024390243904,
648
- "loss": 2.0440007574507035e-05,
649
- "mean_token_accuracy": 1.0,
650
- "num_tokens": 310204.0,
651
  "step": 63
652
  },
653
  {
654
- "entropy": 0.436653483659029,
655
  "epoch": 0.7840735068912711,
656
- "grad_norm": 9.298324584960938e-05,
657
  "learning_rate": 0.00012317073170731708,
658
- "loss": 2.5547835321049206e-05,
659
- "mean_token_accuracy": 1.0,
660
- "num_tokens": 314205.0,
661
  "step": 64
662
  },
663
  {
664
- "entropy": 0.4625023826956749,
665
  "epoch": 0.7963246554364471,
666
- "grad_norm": 4.9591064453125e-05,
667
  "learning_rate": 0.00012195121951219512,
668
- "loss": 1.6659454558975995e-05,
669
- "mean_token_accuracy": 1.0,
670
- "num_tokens": 319157.0,
671
  "step": 65
672
  },
673
  {
674
- "entropy": 0.45398022420704365,
675
  "epoch": 0.8085758039816233,
676
- "grad_norm": 0.0004730224609375,
677
  "learning_rate": 0.00012073170731707318,
678
- "loss": 2.831750134646427e-05,
679
- "mean_token_accuracy": 1.0,
680
- "num_tokens": 324681.0,
681
  "step": 66
682
  },
683
  {
684
- "entropy": 0.39901847764849663,
685
  "epoch": 0.8208269525267994,
686
- "grad_norm": 0.0113525390625,
687
  "learning_rate": 0.00011951219512195122,
688
- "loss": 0.0010163490660488605,
689
- "mean_token_accuracy": 0.9993686862289906,
690
- "num_tokens": 329929.0,
691
  "step": 67
692
  },
693
  {
694
- "entropy": 0.43489386700093746,
695
  "epoch": 0.8330781010719756,
696
- "grad_norm": 0.0002841949462890625,
697
  "learning_rate": 0.00011829268292682926,
698
- "loss": 3.556731462595053e-05,
699
- "mean_token_accuracy": 1.0,
700
- "num_tokens": 334474.0,
701
  "step": 68
702
  },
703
  {
704
- "entropy": 0.43658433854579926,
705
  "epoch": 0.8453292496171516,
706
- "grad_norm": 0.00021457672119140625,
707
  "learning_rate": 0.00011707317073170732,
708
- "loss": 3.145977098029107e-05,
709
- "mean_token_accuracy": 1.0,
710
- "num_tokens": 338171.0,
711
  "step": 69
712
  },
713
  {
714
- "entropy": 0.47345293685793877,
715
  "epoch": 0.8575803981623277,
716
- "grad_norm": 0.04052734375,
717
  "learning_rate": 0.00011585365853658536,
718
- "loss": 0.006434774026274681,
719
- "mean_token_accuracy": 0.9988360889256,
720
- "num_tokens": 342581.0,
721
  "step": 70
722
  },
723
  {
724
- "entropy": 0.47144644521176815,
725
  "epoch": 0.8698315467075038,
726
- "grad_norm": 0.03857421875,
727
  "learning_rate": 0.00011463414634146342,
728
- "loss": 0.0040056235156953335,
729
- "mean_token_accuracy": 0.9997807033360004,
730
- "num_tokens": 347785.0,
731
  "step": 71
732
  },
733
  {
734
- "entropy": 0.44001554138958454,
735
  "epoch": 0.8820826952526799,
736
- "grad_norm": 0.00081634521484375,
737
  "learning_rate": 0.00011341463414634146,
738
- "loss": 3.297243165434338e-05,
739
- "mean_token_accuracy": 1.0,
740
- "num_tokens": 352109.0,
741
  "step": 72
742
  },
743
  {
744
- "entropy": 0.44880508445203304,
745
  "epoch": 0.8943338437978561,
746
- "grad_norm": 0.0002689361572265625,
747
  "learning_rate": 0.00011219512195121953,
748
- "loss": 2.6160523702856153e-05,
749
- "mean_token_accuracy": 1.0,
750
- "num_tokens": 357931.0,
751
  "step": 73
752
  },
753
  {
754
- "entropy": 0.41770973429083824,
755
  "epoch": 0.9065849923430321,
756
- "grad_norm": 0.0002231597900390625,
757
  "learning_rate": 0.00011097560975609757,
758
- "loss": 3.9217924495460466e-05,
759
- "mean_token_accuracy": 1.0,
760
- "num_tokens": 363802.0,
761
  "step": 74
762
  },
763
  {
764
- "entropy": 0.45532275550067425,
765
  "epoch": 0.9188361408882083,
766
- "grad_norm": 6.389617919921875e-05,
767
  "learning_rate": 0.00010975609756097563,
768
- "loss": 2.482662421243731e-05,
769
- "mean_token_accuracy": 1.0,
770
- "num_tokens": 368858.0,
771
  "step": 75
772
  },
773
  {
774
- "entropy": 0.4533053319901228,
775
  "epoch": 0.9310872894333844,
776
- "grad_norm": 0.000492095947265625,
777
  "learning_rate": 0.00010853658536585367,
778
- "loss": 3.297019793535583e-05,
779
- "mean_token_accuracy": 1.0,
780
- "num_tokens": 373658.0,
781
  "step": 76
782
  },
783
  {
784
- "entropy": 0.4135119281709194,
785
  "epoch": 0.9433384379785605,
786
- "grad_norm": 0.000347137451171875,
787
  "learning_rate": 0.00010731707317073172,
788
- "loss": 3.026250487891957e-05,
789
- "mean_token_accuracy": 1.0,
790
- "num_tokens": 379025.0,
791
  "step": 77
792
  },
793
  {
794
- "entropy": 0.44705197028815746,
795
  "epoch": 0.9555895865237366,
796
- "grad_norm": 0.00067901611328125,
797
  "learning_rate": 0.00010609756097560977,
798
- "loss": 4.355545388534665e-05,
799
- "mean_token_accuracy": 1.0,
800
- "num_tokens": 384240.0,
801
  "step": 78
802
  },
803
  {
804
- "entropy": 0.459016814827919,
805
  "epoch": 0.9678407350689127,
806
- "grad_norm": 0.00098419189453125,
807
  "learning_rate": 0.00010487804878048781,
808
- "loss": 4.3970921979052946e-05,
809
- "mean_token_accuracy": 1.0,
810
- "num_tokens": 388335.0,
811
  "step": 79
812
  },
813
  {
814
- "entropy": 0.4241188894957304,
815
  "epoch": 0.9800918836140888,
816
- "grad_norm": 0.07275390625,
817
  "learning_rate": 0.00010365853658536586,
818
- "loss": 0.009294007904827595,
819
- "mean_token_accuracy": 0.9970472455024719,
820
- "num_tokens": 394201.0,
821
  "step": 80
822
  },
823
  {
824
- "entropy": 0.4442194551229477,
825
  "epoch": 0.9923430321592649,
826
- "grad_norm": 0.000377655029296875,
827
  "learning_rate": 0.0001024390243902439,
828
- "loss": 3.1872321414994076e-05,
829
- "mean_token_accuracy": 1.0,
830
- "num_tokens": 399101.0,
831
  "step": 81
832
  },
833
  {
834
- "entropy": 0.429327929019928,
835
  "epoch": 1.0,
836
- "grad_norm": 0.00064849853515625,
837
  "learning_rate": 0.00010121951219512196,
838
- "loss": 3.4027863875962794e-05,
839
- "mean_token_accuracy": 1.0,
840
- "num_tokens": 402117.0,
841
  "step": 82
842
  },
843
  {
844
- "entropy": 0.4472597725689411,
845
  "epoch": 1.0122511485451762,
846
- "grad_norm": 0.08056640625,
847
  "learning_rate": 0.0001,
848
- "loss": 0.005052679218351841,
849
- "mean_token_accuracy": 0.9986319616436958,
850
- "num_tokens": 406748.0,
851
  "step": 83
852
  },
853
  {
854
- "entropy": 0.4647933579981327,
855
  "epoch": 1.0245022970903521,
856
- "grad_norm": 0.0001888275146484375,
857
  "learning_rate": 9.878048780487805e-05,
858
- "loss": 3.911805833922699e-05,
859
- "mean_token_accuracy": 1.0,
860
- "num_tokens": 411354.0,
861
  "step": 84
862
  },
863
  {
864
- "entropy": 0.49184724502265453,
865
  "epoch": 1.0367534456355283,
866
- "grad_norm": 0.0009307861328125,
867
  "learning_rate": 9.75609756097561e-05,
868
- "loss": 6.517933798022568e-05,
869
- "mean_token_accuracy": 1.0,
870
- "num_tokens": 417755.0,
871
  "step": 85
872
  },
873
  {
874
- "entropy": 0.45203530229628086,
875
  "epoch": 1.0490045941807045,
876
- "grad_norm": 0.00017547607421875,
877
  "learning_rate": 9.634146341463415e-05,
878
- "loss": 4.648843969334848e-05,
879
- "mean_token_accuracy": 1.0,
880
- "num_tokens": 421725.0,
881
  "step": 86
882
  },
883
  {
884
- "entropy": 0.44451451301574707,
885
  "epoch": 1.0612557427258806,
886
- "grad_norm": 0.00012493133544921875,
887
  "learning_rate": 9.51219512195122e-05,
888
- "loss": 3.813640068983659e-05,
889
- "mean_token_accuracy": 1.0,
890
- "num_tokens": 426841.0,
891
  "step": 87
892
  },
893
  {
894
- "entropy": 0.5532373636960983,
895
  "epoch": 1.0735068912710566,
896
- "grad_norm": 0.0004425048828125,
897
  "learning_rate": 9.390243902439024e-05,
898
- "loss": 8.416183845838532e-05,
899
- "mean_token_accuracy": 1.0,
900
- "num_tokens": 431070.0,
901
  "step": 88
902
  },
903
  {
904
- "entropy": 0.5114028844982386,
905
  "epoch": 1.0857580398162328,
906
- "grad_norm": 0.021484375,
907
  "learning_rate": 9.26829268292683e-05,
908
- "loss": 0.0013321326114237309,
909
- "mean_token_accuracy": 0.9995535723865032,
910
- "num_tokens": 435681.0,
911
  "step": 89
912
  },
913
  {
914
- "entropy": 0.48618660122156143,
915
  "epoch": 1.098009188361409,
916
- "grad_norm": 0.0002498626708984375,
917
  "learning_rate": 9.146341463414635e-05,
918
- "loss": 6.904367910465226e-05,
919
- "mean_token_accuracy": 1.0,
920
- "num_tokens": 440565.0,
921
  "step": 90
922
  },
923
  {
924
- "entropy": 0.5016148556023836,
925
  "epoch": 1.110260336906585,
926
- "grad_norm": 0.019287109375,
927
  "learning_rate": 9.02439024390244e-05,
928
- "loss": 0.0020695198327302933,
929
- "mean_token_accuracy": 0.9993686862289906,
930
- "num_tokens": 445241.0,
931
  "step": 91
932
  },
933
  {
934
- "entropy": 0.5162393897771835,
935
  "epoch": 1.122511485451761,
936
- "grad_norm": 0.04638671875,
937
  "learning_rate": 8.902439024390244e-05,
938
- "loss": 0.0038224293384701014,
939
- "mean_token_accuracy": 0.9989322870969772,
940
- "num_tokens": 449996.0,
941
  "step": 92
942
  },
943
  {
944
- "entropy": 0.47938764840364456,
945
  "epoch": 1.1347626339969372,
946
- "grad_norm": 0.0028533935546875,
947
  "learning_rate": 8.78048780487805e-05,
948
- "loss": 0.00016397782019339502,
949
- "mean_token_accuracy": 1.0,
950
- "num_tokens": 454979.0,
951
  "step": 93
952
  },
953
  {
954
- "entropy": 0.5016432590782642,
955
  "epoch": 1.1470137825421134,
956
- "grad_norm": 0.000400543212890625,
957
  "learning_rate": 8.658536585365854e-05,
958
- "loss": 0.00010612564074108377,
959
- "mean_token_accuracy": 1.0,
960
- "num_tokens": 459674.0,
961
  "step": 94
962
  },
963
  {
964
- "entropy": 0.5095659829676151,
965
  "epoch": 1.1592649310872893,
966
- "grad_norm": 0.0004520416259765625,
967
  "learning_rate": 8.53658536585366e-05,
968
- "loss": 0.00011354458547430113,
969
- "mean_token_accuracy": 1.0,
970
- "num_tokens": 464313.0,
971
  "step": 95
972
  },
973
  {
974
- "entropy": 0.4933694824576378,
975
  "epoch": 1.1715160796324655,
976
- "grad_norm": 0.0556640625,
977
  "learning_rate": 8.414634146341464e-05,
978
- "loss": 0.004786409437656403,
979
- "mean_token_accuracy": 0.9988460540771484,
980
- "num_tokens": 468858.0,
981
  "step": 96
982
  },
983
  {
984
- "entropy": 0.5068543236702681,
985
  "epoch": 1.1837672281776417,
986
- "grad_norm": 0.000492095947265625,
987
  "learning_rate": 8.292682926829268e-05,
988
- "loss": 9.500309533905238e-05,
989
- "mean_token_accuracy": 1.0,
990
- "num_tokens": 473732.0,
991
  "step": 97
992
  },
993
  {
994
- "entropy": 0.502707714214921,
995
  "epoch": 1.1960183767228179,
996
- "grad_norm": 0.026123046875,
997
  "learning_rate": 8.170731707317073e-05,
998
- "loss": 0.002030049916356802,
999
- "mean_token_accuracy": 0.9993131868541241,
1000
- "num_tokens": 479037.0,
1001
  "step": 98
1002
  },
1003
  {
1004
- "entropy": 0.5147993545979261,
1005
  "epoch": 1.2082695252679938,
1006
- "grad_norm": 0.000354766845703125,
1007
  "learning_rate": 8.048780487804879e-05,
1008
- "loss": 6.365451554302126e-05,
1009
- "mean_token_accuracy": 1.0,
1010
- "num_tokens": 484811.0,
1011
  "step": 99
1012
  },
1013
  {
1014
- "entropy": 0.4471734017133713,
1015
  "epoch": 1.22052067381317,
1016
- "grad_norm": 0.0047607421875,
1017
  "learning_rate": 7.926829268292683e-05,
1018
- "loss": 0.0003211660368833691,
1019
- "mean_token_accuracy": 1.0,
1020
- "num_tokens": 489522.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
- "eval_entropy": 0.4696715573469798,
1026
- "eval_loss": 0.0007750109070912004,
1027
- "eval_mean_token_accuracy": 0.9997843339823295,
1028
- "eval_num_tokens": 489522.0,
1029
- "eval_runtime": 51.321,
1030
- "eval_samples_per_second": 1.344,
1031
- "eval_steps_per_second": 1.344,
1032
  "step": 100
1033
  },
1034
  {
1035
- "entropy": 0.4984112149104476,
1036
  "epoch": 1.2327718223583461,
1037
- "grad_norm": 0.0001850128173828125,
1038
  "learning_rate": 7.804878048780489e-05,
1039
- "loss": 5.6583492550998926e-05,
1040
- "mean_token_accuracy": 1.0,
1041
- "num_tokens": 494757.0,
1042
  "step": 101
1043
  },
1044
  {
1045
- "entropy": 0.46644425205886364,
1046
  "epoch": 1.245022970903522,
1047
- "grad_norm": 0.0001506805419921875,
1048
  "learning_rate": 7.682926829268293e-05,
1049
- "loss": 5.076146044302732e-05,
1050
- "mean_token_accuracy": 1.0,
1051
- "num_tokens": 499837.0,
1052
  "step": 102
1053
  },
1054
  {
1055
- "entropy": 0.4746809806674719,
1056
  "epoch": 1.2572741194486983,
1057
- "grad_norm": 0.00015354156494140625,
1058
  "learning_rate": 7.560975609756099e-05,
1059
- "loss": 5.508732647285797e-05,
1060
- "mean_token_accuracy": 1.0,
1061
- "num_tokens": 505267.0,
1062
  "step": 103
1063
  },
1064
  {
1065
- "entropy": 0.47748516872525215,
1066
  "epoch": 1.2695252679938744,
1067
- "grad_norm": 0.0001277923583984375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
- "loss": 4.464950325200334e-05,
1070
- "mean_token_accuracy": 1.0,
1071
- "num_tokens": 510260.0,
1072
  "step": 104
1073
  },
1074
  {
1075
- "entropy": 0.49103316478431225,
1076
  "epoch": 1.2817764165390506,
1077
- "grad_norm": 0.00689697265625,
1078
  "learning_rate": 7.317073170731707e-05,
1079
- "loss": 0.000652994611300528,
1080
- "mean_token_accuracy": 0.9993556700646877,
1081
- "num_tokens": 514493.0,
1082
  "step": 105
1083
  },
1084
  {
1085
- "entropy": 0.4787591751664877,
1086
  "epoch": 1.2940275650842268,
1087
- "grad_norm": 0.0003795623779296875,
1088
  "learning_rate": 7.195121951219513e-05,
1089
- "loss": 4.17455485148821e-05,
1090
- "mean_token_accuracy": 1.0,
1091
- "num_tokens": 519511.0,
1092
  "step": 106
1093
  },
1094
  {
1095
- "entropy": 0.46200828067958355,
1096
  "epoch": 1.3062787136294027,
1097
- "grad_norm": 0.0001678466796875,
1098
  "learning_rate": 7.073170731707317e-05,
1099
- "loss": 4.6432032831944525e-05,
1100
- "mean_token_accuracy": 1.0,
1101
- "num_tokens": 524373.0,
1102
  "step": 107
1103
  },
1104
  {
1105
- "entropy": 0.4632429350167513,
1106
  "epoch": 1.318529862174579,
1107
- "grad_norm": 0.00019073486328125,
1108
  "learning_rate": 6.951219512195122e-05,
1109
- "loss": 4.138273652642965e-05,
1110
- "mean_token_accuracy": 1.0,
1111
- "num_tokens": 528973.0,
1112
  "step": 108
1113
  },
1114
  {
1115
- "entropy": 0.4669873770326376,
1116
  "epoch": 1.3307810107197549,
1117
- "grad_norm": 0.000301361083984375,
1118
  "learning_rate": 6.829268292682928e-05,
1119
- "loss": 4.5484361180569977e-05,
1120
- "mean_token_accuracy": 1.0,
1121
- "num_tokens": 533941.0,
1122
  "step": 109
1123
  },
1124
  {
1125
- "entropy": 0.45179494842886925,
1126
  "epoch": 1.343032159264931,
1127
- "grad_norm": 0.00010776519775390625,
1128
  "learning_rate": 6.707317073170732e-05,
1129
- "loss": 3.3365573472110555e-05,
1130
- "mean_token_accuracy": 1.0,
1131
- "num_tokens": 539363.0,
1132
  "step": 110
1133
  },
1134
  {
1135
- "entropy": 0.438027735799551,
1136
  "epoch": 1.3552833078101072,
1137
- "grad_norm": 0.00014972686767578125,
1138
  "learning_rate": 6.585365853658538e-05,
1139
- "loss": 4.3530206312425435e-05,
1140
- "mean_token_accuracy": 1.0,
1141
- "num_tokens": 543731.0,
1142
  "step": 111
1143
  },
1144
  {
1145
- "entropy": 0.4696179609745741,
1146
  "epoch": 1.3675344563552834,
1147
- "grad_norm": 0.028076171875,
1148
  "learning_rate": 6.463414634146342e-05,
1149
- "loss": 0.00529400585219264,
1150
- "mean_token_accuracy": 0.9985632188618183,
1151
- "num_tokens": 548164.0,
1152
  "step": 112
1153
  },
1154
  {
1155
- "entropy": 0.4698081314563751,
1156
  "epoch": 1.3797856049004595,
1157
- "grad_norm": 0.00885009765625,
1158
  "learning_rate": 6.341463414634146e-05,
1159
- "loss": 0.0005042221746407449,
1160
- "mean_token_accuracy": 0.9995039664208889,
1161
- "num_tokens": 553693.0,
1162
  "step": 113
1163
  },
1164
  {
1165
- "entropy": 0.45541019923985004,
1166
  "epoch": 1.3920367534456355,
1167
- "grad_norm": 9.393692016601562e-05,
1168
  "learning_rate": 6.219512195121952e-05,
1169
- "loss": 3.189211565768346e-05,
1170
- "mean_token_accuracy": 1.0,
1171
- "num_tokens": 558477.0,
1172
  "step": 114
1173
  },
1174
  {
1175
- "entropy": 0.46046129800379276,
1176
  "epoch": 1.4042879019908117,
1177
- "grad_norm": 0.0001392364501953125,
1178
  "learning_rate": 6.097560975609756e-05,
1179
- "loss": 3.399374691070989e-05,
1180
- "mean_token_accuracy": 1.0,
1181
- "num_tokens": 563965.0,
1182
  "step": 115
1183
  },
1184
  {
1185
- "entropy": 0.49661404080688953,
1186
  "epoch": 1.4165390505359878,
1187
- "grad_norm": 0.0004062652587890625,
1188
  "learning_rate": 5.975609756097561e-05,
1189
- "loss": 5.0347538490314037e-05,
1190
- "mean_token_accuracy": 1.0,
1191
- "num_tokens": 568303.0,
1192
  "step": 116
1193
  },
1194
  {
1195
- "entropy": 0.4603871125727892,
1196
  "epoch": 1.4287901990811638,
1197
- "grad_norm": 9.870529174804688e-05,
1198
  "learning_rate": 5.853658536585366e-05,
1199
- "loss": 3.4569777199067175e-05,
1200
- "mean_token_accuracy": 1.0,
1201
- "num_tokens": 572895.0,
1202
  "step": 117
1203
  },
1204
  {
1205
- "entropy": 0.47774807177484035,
1206
  "epoch": 1.44104134762634,
1207
- "grad_norm": 0.00012063980102539062,
1208
  "learning_rate": 5.731707317073171e-05,
1209
- "loss": 4.4718148274114355e-05,
1210
- "mean_token_accuracy": 1.0,
1211
- "num_tokens": 577892.0,
1212
  "step": 118
1213
  },
1214
  {
1215
- "entropy": 0.4559262488037348,
1216
  "epoch": 1.4532924961715161,
1217
- "grad_norm": 8.440017700195312e-05,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
- "loss": 2.7120513550471514e-05,
1220
- "mean_token_accuracy": 1.0,
1221
- "num_tokens": 583128.0,
1222
  "step": 119
1223
  },
1224
  {
1225
- "entropy": 0.4927012659609318,
1226
  "epoch": 1.4655436447166923,
1227
- "grad_norm": 0.00011539459228515625,
1228
  "learning_rate": 5.487804878048781e-05,
1229
- "loss": 3.757046943064779e-05,
1230
- "mean_token_accuracy": 1.0,
1231
- "num_tokens": 587856.0,
1232
  "step": 120
1233
  },
1234
  {
1235
- "entropy": 0.43140678480267525,
1236
  "epoch": 1.4777947932618682,
1237
- "grad_norm": 0.000125885009765625,
1238
  "learning_rate": 5.365853658536586e-05,
1239
- "loss": 3.988837852375582e-05,
1240
- "mean_token_accuracy": 1.0,
1241
- "num_tokens": 592260.0,
1242
  "step": 121
1243
  },
1244
  {
1245
- "entropy": 0.46533982269465923,
1246
  "epoch": 1.4900459418070444,
1247
- "grad_norm": 9.822845458984375e-05,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
- "loss": 3.350730548845604e-05,
1250
- "mean_token_accuracy": 1.0,
1251
- "num_tokens": 597022.0,
1252
  "step": 122
1253
  },
1254
  {
1255
- "entropy": 0.4450340513139963,
1256
  "epoch": 1.5022970903522204,
1257
- "grad_norm": 0.00018596649169921875,
1258
  "learning_rate": 5.121951219512195e-05,
1259
- "loss": 4.867902316618711e-05,
1260
- "mean_token_accuracy": 1.0,
1261
- "num_tokens": 601326.0,
1262
  "step": 123
1263
  },
1264
  {
1265
- "entropy": 0.4453680943697691,
1266
  "epoch": 1.5145482388973965,
1267
- "grad_norm": 0.000270843505859375,
1268
  "learning_rate": 5e-05,
1269
- "loss": 4.58945614809636e-05,
1270
- "mean_token_accuracy": 1.0,
1271
- "num_tokens": 606619.0,
1272
  "step": 124
1273
  },
1274
  {
1275
- "entropy": 0.4738515168428421,
1276
  "epoch": 1.5267993874425727,
1277
- "grad_norm": 6.866455078125e-05,
1278
  "learning_rate": 4.878048780487805e-05,
1279
- "loss": 3.125666262349114e-05,
1280
- "mean_token_accuracy": 1.0,
1281
- "num_tokens": 612381.0,
1282
  "step": 125
1283
  },
1284
  {
1285
- "entropy": 0.4711528979241848,
1286
  "epoch": 1.5390505359877489,
1287
- "grad_norm": 0.0003032684326171875,
1288
  "learning_rate": 4.75609756097561e-05,
1289
- "loss": 4.3324482248863205e-05,
1290
- "mean_token_accuracy": 1.0,
1291
- "num_tokens": 617203.0,
1292
  "step": 126
1293
  },
1294
  {
1295
- "entropy": 0.4728289693593979,
1296
  "epoch": 1.551301684532925,
1297
- "grad_norm": 0.01611328125,
1298
  "learning_rate": 4.634146341463415e-05,
1299
- "loss": 0.00017536790983285755,
1300
- "mean_token_accuracy": 1.0,
1301
- "num_tokens": 622329.0,
1302
  "step": 127
1303
  },
1304
  {
1305
- "entropy": 0.48075354285538197,
1306
  "epoch": 1.5635528330781012,
1307
- "grad_norm": 0.000751495361328125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
- "loss": 5.28718919667881e-05,
1310
- "mean_token_accuracy": 1.0,
1311
- "num_tokens": 627243.0,
1312
  "step": 128
1313
  },
1314
  {
1315
- "entropy": 0.43419913947582245,
1316
  "epoch": 1.5758039816232772,
1317
- "grad_norm": 0.0001850128173828125,
1318
  "learning_rate": 4.390243902439025e-05,
1319
- "loss": 4.585986243910156e-05,
1320
- "mean_token_accuracy": 1.0,
1321
- "num_tokens": 631428.0,
1322
  "step": 129
1323
  },
1324
  {
1325
- "entropy": 0.4347258824855089,
1326
  "epoch": 1.5880551301684533,
1327
- "grad_norm": 0.0003814697265625,
1328
  "learning_rate": 4.26829268292683e-05,
1329
- "loss": 5.289731052471325e-05,
1330
- "mean_token_accuracy": 1.0,
1331
- "num_tokens": 636476.0,
1332
  "step": 130
1333
  },
1334
  {
1335
- "entropy": 0.44714186899363995,
1336
  "epoch": 1.6003062787136293,
1337
- "grad_norm": 0.04541015625,
1338
  "learning_rate": 4.146341463414634e-05,
1339
- "loss": 0.003742673434317112,
1340
- "mean_token_accuracy": 0.9986401423811913,
1341
- "num_tokens": 641238.0,
1342
  "step": 131
1343
  },
1344
  {
1345
- "entropy": 0.4518321752548218,
1346
  "epoch": 1.6125574272588055,
1347
- "grad_norm": 0.0751953125,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
- "loss": 0.006270918063819408,
1350
- "mean_token_accuracy": 0.999205507338047,
1351
- "num_tokens": 646351.0,
1352
  "step": 132
1353
  },
1354
  {
1355
- "entropy": 0.40802894718945026,
1356
  "epoch": 1.6248085758039816,
1357
- "grad_norm": 0.00011110305786132812,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
- "loss": 3.44005020451732e-05,
1360
- "mean_token_accuracy": 1.0,
1361
- "num_tokens": 650741.0,
1362
  "step": 133
1363
  },
1364
  {
1365
- "entropy": 0.42771636322140694,
1366
  "epoch": 1.6370597243491578,
1367
- "grad_norm": 0.0001239776611328125,
1368
  "learning_rate": 3.780487804878049e-05,
1369
- "loss": 4.249331323080696e-05,
1370
- "mean_token_accuracy": 1.0,
1371
- "num_tokens": 655143.0,
1372
  "step": 134
1373
  },
1374
  {
1375
- "entropy": 0.44244702346622944,
1376
  "epoch": 1.649310872894334,
1377
- "grad_norm": 0.00011205673217773438,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
- "loss": 3.287765503046103e-05,
1380
- "mean_token_accuracy": 1.0,
1381
- "num_tokens": 660264.0,
1382
  "step": 135
1383
  },
1384
  {
1385
- "entropy": 0.48481825925409794,
1386
  "epoch": 1.66156202143951,
1387
- "grad_norm": 0.000179290771484375,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
- "loss": 5.46249866602011e-05,
1390
- "mean_token_accuracy": 1.0,
1391
- "num_tokens": 664447.0,
1392
  "step": 136
1393
  },
1394
  {
1395
- "entropy": 0.46484761498868465,
1396
  "epoch": 1.673813169984686,
1397
- "grad_norm": 0.0002498626708984375,
1398
  "learning_rate": 3.414634146341464e-05,
1399
- "loss": 4.426595114637166e-05,
1400
- "mean_token_accuracy": 1.0,
1401
- "num_tokens": 669330.0,
1402
  "step": 137
1403
  },
1404
  {
1405
- "entropy": 0.4359226580709219,
1406
  "epoch": 1.686064318529862,
1407
- "grad_norm": 0.049560546875,
1408
  "learning_rate": 3.292682926829269e-05,
1409
- "loss": 0.004120181780308485,
1410
- "mean_token_accuracy": 0.9997727274894714,
1411
- "num_tokens": 674885.0,
1412
  "step": 138
1413
  },
1414
  {
1415
- "entropy": 0.4564925115555525,
1416
  "epoch": 1.6983154670750382,
1417
- "grad_norm": 0.0001544952392578125,
1418
  "learning_rate": 3.170731707317073e-05,
1419
- "loss": 5.280967161525041e-05,
1420
- "mean_token_accuracy": 1.0,
1421
- "num_tokens": 679666.0,
1422
  "step": 139
1423
  },
1424
  {
1425
- "entropy": 0.45392039604485035,
1426
  "epoch": 1.7105666156202144,
1427
- "grad_norm": 0.0001277923583984375,
1428
  "learning_rate": 3.048780487804878e-05,
1429
- "loss": 4.428522152011283e-05,
1430
- "mean_token_accuracy": 1.0,
1431
- "num_tokens": 685022.0,
1432
  "step": 140
1433
  },
1434
  {
1435
- "entropy": 0.4568201173096895,
1436
  "epoch": 1.7228177641653906,
1437
- "grad_norm": 0.000255584716796875,
1438
  "learning_rate": 2.926829268292683e-05,
1439
- "loss": 5.5990531109273434e-05,
1440
- "mean_token_accuracy": 1.0,
1441
- "num_tokens": 689370.0,
1442
  "step": 141
1443
  },
1444
  {
1445
- "entropy": 0.46470937319099903,
1446
  "epoch": 1.7350689127105667,
1447
- "grad_norm": 0.00020122528076171875,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
- "loss": 6.421299622161314e-05,
1450
- "mean_token_accuracy": 1.0,
1451
- "num_tokens": 693163.0,
1452
  "step": 142
1453
  },
1454
  {
1455
- "entropy": 0.47727371007204056,
1456
  "epoch": 1.7473200612557427,
1457
- "grad_norm": 0.000385284423828125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
- "loss": 7.020766497589648e-05,
1460
- "mean_token_accuracy": 1.0,
1461
- "num_tokens": 697577.0,
1462
  "step": 143
1463
  },
1464
  {
1465
- "entropy": 0.46956145390868187,
1466
  "epoch": 1.7595712098009189,
1467
- "grad_norm": 0.00017642974853515625,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
- "loss": 6.577485328307375e-05,
1470
- "mean_token_accuracy": 1.0,
1471
- "num_tokens": 703024.0,
1472
  "step": 144
1473
  },
1474
  {
1475
- "entropy": 0.4778987504541874,
1476
  "epoch": 1.7718223583460948,
1477
- "grad_norm": 0.0272216796875,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
- "loss": 0.0015227628173306584,
1480
- "mean_token_accuracy": 0.999507874250412,
1481
- "num_tokens": 707836.0,
1482
  "step": 145
1483
  },
1484
  {
1485
- "entropy": 0.4693255964666605,
1486
  "epoch": 1.784073506891271,
1487
- "grad_norm": 0.0016632080078125,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
- "loss": 8.514844375895336e-05,
1490
- "mean_token_accuracy": 1.0,
1491
- "num_tokens": 712795.0,
1492
  "step": 146
1493
  },
1494
  {
1495
- "entropy": 0.44871947541832924,
1496
  "epoch": 1.7963246554364471,
1497
- "grad_norm": 0.0001220703125,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
- "loss": 4.404923674883321e-05,
1500
- "mean_token_accuracy": 1.0,
1501
- "num_tokens": 718427.0,
1502
  "step": 147
1503
  },
1504
  {
1505
- "entropy": 0.46528770588338375,
1506
  "epoch": 1.8085758039816233,
1507
- "grad_norm": 0.00011539459228515625,
1508
  "learning_rate": 2.073170731707317e-05,
1509
- "loss": 4.299484135117382e-05,
1510
- "mean_token_accuracy": 1.0,
1511
- "num_tokens": 723784.0,
1512
  "step": 148
1513
  },
1514
  {
1515
- "entropy": 0.4871877897530794,
1516
  "epoch": 1.8208269525267995,
1517
- "grad_norm": 0.00018215179443359375,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
- "loss": 6.490876694442704e-05,
1520
- "mean_token_accuracy": 1.0,
1521
- "num_tokens": 728100.0,
1522
  "step": 149
1523
  },
1524
  {
1525
- "entropy": 0.4858295116573572,
1526
  "epoch": 1.8330781010719757,
1527
- "grad_norm": 0.004119873046875,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
- "loss": 0.0002347841509617865,
1530
- "mean_token_accuracy": 1.0,
1531
- "num_tokens": 733891.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
- "eval_entropy": 0.45632935347764386,
1537
- "eval_loss": 0.0005955203669145703,
1538
- "eval_mean_token_accuracy": 0.9997519842092542,
1539
- "eval_num_tokens": 733891.0,
1540
- "eval_runtime": 51.3196,
1541
- "eval_samples_per_second": 1.345,
1542
- "eval_steps_per_second": 1.345,
1543
  "step": 150
1544
  },
1545
  {
1546
- "entropy": 0.4611043408513069,
1547
  "epoch": 1.8453292496171516,
1548
- "grad_norm": 0.00018310546875,
1549
  "learning_rate": 1.707317073170732e-05,
1550
- "loss": 6.215952453203499e-05,
1551
- "mean_token_accuracy": 1.0,
1552
- "num_tokens": 738134.0,
1553
  "step": 151
1554
  },
1555
  {
1556
- "entropy": 0.4878769665956497,
1557
  "epoch": 1.8575803981623276,
1558
- "grad_norm": 0.000263214111328125,
1559
  "learning_rate": 1.5853658536585366e-05,
1560
- "loss": 4.827458178624511e-05,
1561
- "mean_token_accuracy": 1.0,
1562
- "num_tokens": 743890.0,
1563
  "step": 152
1564
  },
1565
  {
1566
- "entropy": 0.4312817621976137,
1567
  "epoch": 1.8698315467075037,
1568
- "grad_norm": 0.0001659393310546875,
1569
  "learning_rate": 1.4634146341463415e-05,
1570
- "loss": 4.2587878851918504e-05,
1571
- "mean_token_accuracy": 1.0,
1572
- "num_tokens": 749531.0,
1573
  "step": 153
1574
  },
1575
  {
1576
- "entropy": 0.4696353208273649,
1577
  "epoch": 1.88208269525268,
1578
- "grad_norm": 0.0001277923583984375,
1579
  "learning_rate": 1.3414634146341466e-05,
1580
- "loss": 4.060078936163336e-05,
1581
- "mean_token_accuracy": 1.0,
1582
- "num_tokens": 755323.0,
1583
  "step": 154
1584
  },
1585
  {
1586
- "entropy": 0.5023391880095005,
1587
  "epoch": 1.894333843797856,
1588
- "grad_norm": 0.05615234375,
1589
  "learning_rate": 1.2195121951219513e-05,
1590
- "loss": 0.0006454848335124552,
1591
- "mean_token_accuracy": 0.9996448867022991,
1592
- "num_tokens": 760481.0,
1593
  "step": 155
1594
  },
1595
  {
1596
- "entropy": 0.4852756280452013,
1597
  "epoch": 1.9065849923430322,
1598
- "grad_norm": 0.091796875,
1599
  "learning_rate": 1.0975609756097562e-05,
1600
- "loss": 0.0015374489594250917,
1601
- "mean_token_accuracy": 0.9998405613005161,
1602
- "num_tokens": 765389.0,
1603
  "step": 156
1604
  },
1605
  {
1606
- "entropy": 0.4805344957858324,
1607
  "epoch": 1.9188361408882084,
1608
- "grad_norm": 0.0003719329833984375,
1609
  "learning_rate": 9.756097560975611e-06,
1610
- "loss": 6.884219328640029e-05,
1611
- "mean_token_accuracy": 1.0,
1612
- "num_tokens": 770626.0,
1613
  "step": 157
1614
  },
1615
  {
1616
- "entropy": 0.4682777728885412,
1617
  "epoch": 1.9310872894333844,
1618
- "grad_norm": 0.00015163421630859375,
1619
  "learning_rate": 8.53658536585366e-06,
1620
- "loss": 5.355028042686172e-05,
1621
- "mean_token_accuracy": 1.0,
1622
- "num_tokens": 775626.0,
1623
  "step": 158
1624
  },
1625
  {
1626
- "entropy": 0.46833183616399765,
1627
  "epoch": 1.9433384379785605,
1628
- "grad_norm": 0.000244140625,
1629
  "learning_rate": 7.317073170731707e-06,
1630
- "loss": 6.813944492023438e-05,
1631
- "mean_token_accuracy": 1.0,
1632
- "num_tokens": 779824.0,
1633
  "step": 159
1634
  },
1635
  {
1636
- "entropy": 0.48119914904236794,
1637
  "epoch": 1.9555895865237365,
1638
- "grad_norm": 0.0005035400390625,
1639
  "learning_rate": 6.0975609756097564e-06,
1640
- "loss": 7.686868048040196e-05,
1641
- "mean_token_accuracy": 1.0,
1642
- "num_tokens": 784498.0,
1643
  "step": 160
1644
  },
1645
  {
1646
- "entropy": 0.49330189637839794,
1647
  "epoch": 1.9678407350689127,
1648
- "grad_norm": 0.00014972686767578125,
1649
  "learning_rate": 4.8780487804878055e-06,
1650
- "loss": 4.916799662169069e-05,
1651
- "mean_token_accuracy": 1.0,
1652
- "num_tokens": 790193.0,
1653
  "step": 161
1654
  },
1655
  {
1656
- "entropy": 0.4804691858589649,
1657
  "epoch": 1.9800918836140888,
1658
- "grad_norm": 0.000278472900390625,
1659
  "learning_rate": 3.6585365853658537e-06,
1660
- "loss": 5.59901563974563e-05,
1661
- "mean_token_accuracy": 1.0,
1662
- "num_tokens": 794958.0,
1663
  "step": 162
1664
  },
1665
  {
1666
- "entropy": 0.4819574113935232,
1667
  "epoch": 1.992343032159265,
1668
- "grad_norm": 0.0001239776611328125,
1669
  "learning_rate": 2.4390243902439027e-06,
1670
- "loss": 4.5460070396075025e-05,
1671
- "mean_token_accuracy": 1.0,
1672
- "num_tokens": 800578.0,
1673
  "step": 163
1674
  },
1675
  {
1676
- "entropy": 0.48914736807346343,
1677
  "epoch": 2.0,
1678
- "grad_norm": 0.000965118408203125,
1679
  "learning_rate": 1.2195121951219514e-06,
1680
- "loss": 0.00011097195238107815,
1681
- "mean_token_accuracy": 1.0,
1682
- "num_tokens": 804234.0,
1683
  "step": 164
1684
  }
1685
  ],
@@ -1700,7 +1700,7 @@
1700
  "attributes": {}
1701
  }
1702
  },
1703
- "total_flos": 3.641668564495565e+16,
1704
  "train_batch_size": 1,
1705
  "trial_name": null,
1706
  "trial_params": null
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 0.22490596678107977,
14
  "epoch": 0.01225114854517611,
15
+ "grad_norm": 8.875,
16
  "learning_rate": 0.0002,
17
+ "loss": 0.1875426322221756,
18
+ "mean_token_accuracy": 0.9461580626666546,
19
+ "num_tokens": 6770.0,
20
  "step": 1
21
  },
22
  {
23
+ "entropy": 0.24707041680812836,
24
  "epoch": 0.02450229709035222,
25
+ "grad_norm": 8.4375,
26
  "learning_rate": 0.00019878048780487805,
27
+ "loss": 0.16050274670124054,
28
+ "mean_token_accuracy": 0.9445944800972939,
29
+ "num_tokens": 14234.0,
30
  "step": 2
31
  },
32
  {
33
+ "entropy": 0.32129648607224226,
34
  "epoch": 0.036753445635528334,
35
+ "grad_norm": 2.75,
36
  "learning_rate": 0.0001975609756097561,
37
+ "loss": 0.09863867610692978,
38
+ "mean_token_accuracy": 0.9659304060041904,
39
+ "num_tokens": 20673.0,
40
  "step": 3
41
  },
42
  {
43
+ "entropy": 0.32960685156285763,
44
  "epoch": 0.04900459418070444,
45
+ "grad_norm": 1.671875,
46
  "learning_rate": 0.00019634146341463416,
47
+ "loss": 0.08542143553495407,
48
+ "mean_token_accuracy": 0.9690693095326424,
49
+ "num_tokens": 26890.0,
50
  "step": 4
51
  },
52
  {
53
+ "entropy": 0.2677983660250902,
54
  "epoch": 0.06125574272588055,
55
+ "grad_norm": 1.359375,
56
  "learning_rate": 0.0001951219512195122,
57
+ "loss": 0.08666501939296722,
58
+ "mean_token_accuracy": 0.968298003077507,
59
+ "num_tokens": 35017.0,
60
  "step": 5
61
  },
62
  {
63
+ "entropy": 0.3096502358093858,
64
  "epoch": 0.07350689127105667,
65
+ "grad_norm": 0.66015625,
66
  "learning_rate": 0.00019390243902439025,
67
+ "loss": 0.07875043898820877,
68
+ "mean_token_accuracy": 0.969221331179142,
69
+ "num_tokens": 41478.0,
70
  "step": 6
71
  },
72
  {
73
+ "entropy": 0.3156957607716322,
74
  "epoch": 0.08575803981623277,
75
+ "grad_norm": 2.0,
76
  "learning_rate": 0.0001926829268292683,
77
+ "loss": 0.07807251811027527,
78
+ "mean_token_accuracy": 0.9681689888238907,
79
+ "num_tokens": 48204.0,
80
  "step": 7
81
  },
82
  {
83
+ "entropy": 0.2759731076657772,
84
  "epoch": 0.09800918836140889,
85
+ "grad_norm": 1.1328125,
86
  "learning_rate": 0.00019146341463414633,
87
+ "loss": 0.07681904733181,
88
+ "mean_token_accuracy": 0.9719767943024635,
89
+ "num_tokens": 54668.0,
90
  "step": 8
91
  },
92
  {
93
+ "entropy": 0.24453612882643938,
94
  "epoch": 0.11026033690658499,
95
+ "grad_norm": 0.875,
96
  "learning_rate": 0.0001902439024390244,
97
+ "loss": 0.07310224324464798,
98
+ "mean_token_accuracy": 0.96934475004673,
99
+ "num_tokens": 61929.0,
100
  "step": 9
101
  },
102
  {
103
+ "entropy": 0.25852775294333696,
104
  "epoch": 0.1225114854517611,
105
+ "grad_norm": 1.4921875,
106
  "learning_rate": 0.00018902439024390244,
107
+ "loss": 0.07384984195232391,
108
+ "mean_token_accuracy": 0.9701811708509922,
109
+ "num_tokens": 69036.0,
110
  "step": 10
111
  },
112
  {
113
+ "entropy": 0.27396084927022457,
114
  "epoch": 0.13476263399693722,
115
+ "grad_norm": 0.94140625,
116
  "learning_rate": 0.0001878048780487805,
117
+ "loss": 0.10277765244245529,
118
+ "mean_token_accuracy": 0.9634475558996201,
119
+ "num_tokens": 76394.0,
120
  "step": 11
121
  },
122
  {
123
+ "entropy": 0.3001147015020251,
124
  "epoch": 0.14701378254211334,
125
+ "grad_norm": 0.84765625,
126
  "learning_rate": 0.00018658536585365856,
127
+ "loss": 0.08927591890096664,
128
+ "mean_token_accuracy": 0.9625685028731823,
129
+ "num_tokens": 84073.0,
130
  "step": 12
131
  },
132
  {
133
+ "entropy": 0.29679975286126137,
134
  "epoch": 0.15926493108728942,
135
+ "grad_norm": 0.8359375,
136
  "learning_rate": 0.0001853658536585366,
137
+ "loss": 0.10607243329286575,
138
+ "mean_token_accuracy": 0.9608454070985317,
139
+ "num_tokens": 91135.0,
140
  "step": 13
141
  },
142
  {
143
+ "entropy": 0.28288435423746705,
144
  "epoch": 0.17151607963246554,
145
+ "grad_norm": 0.69921875,
146
  "learning_rate": 0.00018414634146341464,
147
+ "loss": 0.07875586301088333,
148
+ "mean_token_accuracy": 0.9699672348797321,
149
+ "num_tokens": 97740.0,
150
  "step": 14
151
  },
152
  {
153
+ "entropy": 0.2927901232615113,
154
  "epoch": 0.18376722817764166,
155
+ "grad_norm": 0.8515625,
156
  "learning_rate": 0.0001829268292682927,
157
+ "loss": 0.08531365543603897,
158
+ "mean_token_accuracy": 0.9704407565295696,
159
+ "num_tokens": 104242.0,
160
  "step": 15
161
  },
162
  {
163
+ "entropy": 0.2802786426618695,
164
  "epoch": 0.19601837672281777,
165
+ "grad_norm": 2.609375,
166
  "learning_rate": 0.00018170731707317075,
167
+ "loss": 0.10073477029800415,
168
+ "mean_token_accuracy": 0.9676352478563786,
169
+ "num_tokens": 112245.0,
170
  "step": 16
171
  },
172
  {
173
+ "entropy": 0.30663597770035267,
174
  "epoch": 0.2082695252679939,
175
+ "grad_norm": 2.3125,
176
  "learning_rate": 0.0001804878048780488,
177
+ "loss": 0.11375448107719421,
178
+ "mean_token_accuracy": 0.9604234844446182,
179
+ "num_tokens": 119086.0,
180
  "step": 17
181
  },
182
  {
183
+ "entropy": 0.31370354909449816,
184
  "epoch": 0.22052067381316998,
185
+ "grad_norm": 2.3125,
186
  "learning_rate": 0.00017926829268292684,
187
+ "loss": 0.10826913267374039,
188
+ "mean_token_accuracy": 0.9599097929894924,
189
+ "num_tokens": 126539.0,
190
  "step": 18
191
  },
192
  {
193
+ "entropy": 0.3203959669917822,
194
  "epoch": 0.2327718223583461,
195
+ "grad_norm": 1.3203125,
196
  "learning_rate": 0.00017804878048780488,
197
+ "loss": 0.075275719165802,
198
+ "mean_token_accuracy": 0.9775180667638779,
199
+ "num_tokens": 133104.0,
200
  "step": 19
201
  },
202
  {
203
+ "entropy": 0.32591533567756414,
204
  "epoch": 0.2450229709035222,
205
+ "grad_norm": 0.921875,
206
  "learning_rate": 0.00017682926829268295,
207
+ "loss": 0.09778374433517456,
208
+ "mean_token_accuracy": 0.9647064991295338,
209
+ "num_tokens": 139853.0,
210
  "step": 20
211
  },
212
  {
213
+ "entropy": 0.3228916050866246,
214
  "epoch": 0.2572741194486983,
215
+ "grad_norm": 1.9453125,
216
  "learning_rate": 0.000175609756097561,
217
+ "loss": 0.11495943367481232,
218
+ "mean_token_accuracy": 0.9573761746287346,
219
+ "num_tokens": 146482.0,
220
  "step": 21
221
  },
222
  {
223
+ "entropy": 0.3363859634846449,
224
  "epoch": 0.26952526799387444,
225
+ "grad_norm": 1.5,
226
  "learning_rate": 0.00017439024390243903,
227
+ "loss": 0.10473912209272385,
228
+ "mean_token_accuracy": 0.9618786759674549,
229
+ "num_tokens": 153819.0,
230
  "step": 22
231
  },
232
  {
233
+ "entropy": 0.3069695383310318,
234
  "epoch": 0.28177641653905056,
235
+ "grad_norm": 1.3203125,
236
  "learning_rate": 0.00017317073170731708,
237
+ "loss": 0.09256276488304138,
238
+ "mean_token_accuracy": 0.9625396281480789,
239
+ "num_tokens": 160972.0,
240
  "step": 23
241
  },
242
  {
243
+ "entropy": 0.3574997428804636,
244
  "epoch": 0.29402756508422667,
245
+ "grad_norm": 0.62890625,
246
  "learning_rate": 0.00017195121951219512,
247
+ "loss": 0.09489140659570694,
248
+ "mean_token_accuracy": 0.9578843042254448,
249
+ "num_tokens": 167730.0,
250
  "step": 24
251
  },
252
  {
253
+ "entropy": 0.3444826593622565,
254
  "epoch": 0.30627871362940273,
255
+ "grad_norm": 0.56640625,
256
  "learning_rate": 0.0001707317073170732,
257
+ "loss": 0.09492132067680359,
258
+ "mean_token_accuracy": 0.9603794105350971,
259
+ "num_tokens": 174078.0,
260
  "step": 25
261
  },
262
  {
263
+ "entropy": 0.328093777410686,
264
  "epoch": 0.31852986217457885,
265
+ "grad_norm": 0.97265625,
266
  "learning_rate": 0.00016951219512195123,
267
+ "loss": 0.08727280050516129,
268
+ "mean_token_accuracy": 0.9661480598151684,
269
+ "num_tokens": 180867.0,
270
  "step": 26
271
  },
272
  {
273
+ "entropy": 0.3213672311976552,
274
  "epoch": 0.33078101071975496,
275
+ "grad_norm": 1.203125,
276
  "learning_rate": 0.00016829268292682927,
277
+ "loss": 0.07705243676900864,
278
+ "mean_token_accuracy": 0.9675347730517387,
279
+ "num_tokens": 187459.0,
280
  "step": 27
281
  },
282
  {
283
+ "entropy": 0.3209801884368062,
284
  "epoch": 0.3430321592649311,
285
+ "grad_norm": 0.49609375,
286
  "learning_rate": 0.00016707317073170731,
287
+ "loss": 0.08744930475950241,
288
+ "mean_token_accuracy": 0.9658873043954372,
289
+ "num_tokens": 194265.0,
290
  "step": 28
291
  },
292
  {
293
+ "entropy": 0.2975130006670952,
294
  "epoch": 0.3552833078101072,
295
+ "grad_norm": 0.578125,
296
  "learning_rate": 0.00016585365853658536,
297
+ "loss": 0.08422811329364777,
298
+ "mean_token_accuracy": 0.9715595282614231,
299
+ "num_tokens": 201332.0,
300
  "step": 29
301
  },
302
  {
303
+ "entropy": 0.29833013843744993,
304
  "epoch": 0.3675344563552833,
305
+ "grad_norm": 0.82421875,
306
  "learning_rate": 0.00016463414634146343,
307
+ "loss": 0.08079958707094193,
308
+ "mean_token_accuracy": 0.9676232784986496,
309
+ "num_tokens": 208902.0,
310
  "step": 30
311
  },
312
  {
313
+ "entropy": 0.31810148898512125,
314
  "epoch": 0.37978560490045943,
315
+ "grad_norm": 0.6796875,
316
  "learning_rate": 0.00016341463414634147,
317
+ "loss": 0.09296617656946182,
318
+ "mean_token_accuracy": 0.9628731682896614,
319
+ "num_tokens": 214635.0,
320
  "step": 31
321
  },
322
  {
323
+ "entropy": 0.2774961022660136,
324
  "epoch": 0.39203675344563554,
325
+ "grad_norm": 1.2109375,
326
  "learning_rate": 0.00016219512195121954,
327
+ "loss": 0.08057809621095657,
328
+ "mean_token_accuracy": 0.9683544635772705,
329
+ "num_tokens": 222703.0,
330
  "step": 32
331
  },
332
  {
333
+ "entropy": 0.2500351797789335,
334
  "epoch": 0.40428790199081166,
335
+ "grad_norm": 0.6953125,
336
  "learning_rate": 0.00016097560975609758,
337
+ "loss": 0.07790188491344452,
338
+ "mean_token_accuracy": 0.9730625562369823,
339
+ "num_tokens": 230136.0,
340
  "step": 33
341
  },
342
  {
343
+ "entropy": 0.27261121198534966,
344
  "epoch": 0.4165390505359878,
345
+ "grad_norm": 1.21875,
346
  "learning_rate": 0.00015975609756097562,
347
+ "loss": 0.08459997177124023,
348
+ "mean_token_accuracy": 0.9683701656758785,
349
+ "num_tokens": 236711.0,
350
  "step": 34
351
  },
352
  {
353
+ "entropy": 0.25461648125201464,
354
  "epoch": 0.42879019908116384,
355
+ "grad_norm": 1.5078125,
356
  "learning_rate": 0.00015853658536585366,
357
+ "loss": 0.09788602590560913,
358
+ "mean_token_accuracy": 0.9601947516202927,
359
+ "num_tokens": 243492.0,
360
  "step": 35
361
  },
362
  {
363
+ "entropy": 0.250462488271296,
364
  "epoch": 0.44104134762633995,
365
+ "grad_norm": 0.62109375,
366
  "learning_rate": 0.00015731707317073173,
367
+ "loss": 0.09664106369018555,
368
+ "mean_token_accuracy": 0.9635641165077686,
369
+ "num_tokens": 250330.0,
370
  "step": 36
371
  },
372
  {
373
+ "entropy": 0.26719998102635145,
374
  "epoch": 0.45329249617151607,
375
+ "grad_norm": 0.609375,
376
  "learning_rate": 0.00015609756097560978,
377
+ "loss": 0.08978135138750076,
378
+ "mean_token_accuracy": 0.9730992764234543,
379
+ "num_tokens": 257503.0,
380
  "step": 37
381
  },
382
  {
383
+ "entropy": 0.25437645614147186,
384
  "epoch": 0.4655436447166922,
385
+ "grad_norm": 1.0859375,
386
  "learning_rate": 0.00015487804878048782,
387
+ "loss": 0.08938639611005783,
388
+ "mean_token_accuracy": 0.9675878100097179,
389
+ "num_tokens": 264436.0,
390
  "step": 38
391
  },
392
  {
393
+ "entropy": 0.2722023595124483,
394
  "epoch": 0.4777947932618683,
395
+ "grad_norm": 1.375,
396
  "learning_rate": 0.00015365853658536586,
397
+ "loss": 0.07785381376743317,
398
+ "mean_token_accuracy": 0.9736072942614555,
399
+ "num_tokens": 270483.0,
400
  "step": 39
401
  },
402
  {
403
+ "entropy": 0.3116175327450037,
404
  "epoch": 0.4900459418070444,
405
+ "grad_norm": 0.65625,
406
  "learning_rate": 0.0001524390243902439,
407
+ "loss": 0.09019558876752853,
408
+ "mean_token_accuracy": 0.9605641178786755,
409
+ "num_tokens": 276329.0,
410
  "step": 40
411
  },
412
  {
413
+ "entropy": 0.28687036503106356,
414
  "epoch": 0.5022970903522205,
415
+ "grad_norm": 0.62890625,
416
  "learning_rate": 0.00015121951219512197,
417
+ "loss": 0.0810370221734047,
418
+ "mean_token_accuracy": 0.9663555175065994,
419
+ "num_tokens": 281636.0,
420
  "step": 41
421
  },
422
  {
423
+ "entropy": 0.2999298516660929,
424
  "epoch": 0.5145482388973966,
425
+ "grad_norm": 1.1484375,
426
  "learning_rate": 0.00015000000000000001,
427
+ "loss": 0.06981078535318375,
428
+ "mean_token_accuracy": 0.9718391671776772,
429
+ "num_tokens": 287849.0,
430
  "step": 42
431
  },
432
  {
433
+ "entropy": 0.3097079414874315,
434
  "epoch": 0.5267993874425727,
435
+ "grad_norm": 1.5546875,
436
  "learning_rate": 0.00014878048780487806,
437
+ "loss": 0.09350281953811646,
438
+ "mean_token_accuracy": 0.9683773033320904,
439
+ "num_tokens": 294425.0,
440
  "step": 43
441
  },
442
  {
443
+ "entropy": 0.2796417009085417,
444
  "epoch": 0.5390505359877489,
445
+ "grad_norm": 1.25,
446
  "learning_rate": 0.0001475609756097561,
447
+ "loss": 0.09558023512363434,
448
+ "mean_token_accuracy": 0.9602576456964016,
449
+ "num_tokens": 301451.0,
450
  "step": 44
451
  },
452
  {
453
+ "entropy": 0.265599487349391,
454
  "epoch": 0.5513016845329249,
455
+ "grad_norm": 0.59375,
456
  "learning_rate": 0.00014634146341463414,
457
+ "loss": 0.07772304862737656,
458
+ "mean_token_accuracy": 0.9693298228085041,
459
+ "num_tokens": 308208.0,
460
  "step": 45
461
  },
462
  {
463
+ "entropy": 0.29693579114973545,
464
  "epoch": 0.5635528330781011,
465
+ "grad_norm": 1.2109375,
466
  "learning_rate": 0.0001451219512195122,
467
+ "loss": 0.09863201528787613,
468
+ "mean_token_accuracy": 0.963471919298172,
469
+ "num_tokens": 314427.0,
470
  "step": 46
471
  },
472
  {
473
+ "entropy": 0.2665130514651537,
474
  "epoch": 0.5758039816232772,
475
+ "grad_norm": 0.80078125,
476
  "learning_rate": 0.00014390243902439025,
477
+ "loss": 0.08794506639242172,
478
+ "mean_token_accuracy": 0.9714972339570522,
479
+ "num_tokens": 321146.0,
480
  "step": 47
481
  },
482
  {
483
+ "entropy": 0.2665897011756897,
484
  "epoch": 0.5880551301684533,
485
+ "grad_norm": 0.765625,
486
  "learning_rate": 0.0001426829268292683,
487
+ "loss": 0.07602453231811523,
488
+ "mean_token_accuracy": 0.9719848223030567,
489
+ "num_tokens": 326952.0,
490
  "step": 48
491
  },
492
  {
493
+ "entropy": 0.2814077762886882,
494
  "epoch": 0.6003062787136294,
495
+ "grad_norm": 0.74609375,
496
  "learning_rate": 0.00014146341463414634,
497
+ "loss": 0.08512163907289505,
498
+ "mean_token_accuracy": 0.9680779539048672,
499
+ "num_tokens": 333716.0,
500
  "step": 49
501
  },
502
  {
503
+ "entropy": 0.311913987621665,
504
  "epoch": 0.6125574272588055,
505
+ "grad_norm": 0.52734375,
506
  "learning_rate": 0.00014024390243902438,
507
+ "loss": 0.0735088661313057,
508
+ "mean_token_accuracy": 0.9693484716117382,
509
+ "num_tokens": 339075.0,
510
  "step": 50
511
  },
512
  {
513
  "epoch": 0.6125574272588055,
514
+ "eval_entropy": 0.2772115924652072,
515
+ "eval_loss": 0.08680303394794464,
516
+ "eval_mean_token_accuracy": 0.9665399781171826,
517
+ "eval_num_tokens": 339075.0,
518
+ "eval_runtime": 64.122,
519
+ "eval_samples_per_second": 1.076,
520
+ "eval_steps_per_second": 1.076,
521
  "step": 50
522
  },
523
  {
524
+ "entropy": 0.2713254941627383,
525
  "epoch": 0.6248085758039816,
526
+ "grad_norm": 0.76953125,
527
  "learning_rate": 0.00013902439024390245,
528
+ "loss": 0.07364857941865921,
529
+ "mean_token_accuracy": 0.9685694649815559,
530
+ "num_tokens": 346211.0,
531
  "step": 51
532
  },
533
  {
534
+ "entropy": 0.27622572146356106,
535
  "epoch": 0.6370597243491577,
536
+ "grad_norm": 1.953125,
537
  "learning_rate": 0.0001378048780487805,
538
+ "loss": 0.08796638250350952,
539
+ "mean_token_accuracy": 0.9678671807050705,
540
+ "num_tokens": 353743.0,
541
  "step": 52
542
  },
543
  {
544
+ "entropy": 0.3153565675020218,
545
  "epoch": 0.6493108728943339,
546
+ "grad_norm": 1.8125,
547
  "learning_rate": 0.00013658536585365856,
548
+ "loss": 0.09189874678850174,
549
+ "mean_token_accuracy": 0.9696005284786224,
550
+ "num_tokens": 360765.0,
551
  "step": 53
552
  },
553
  {
554
+ "entropy": 0.2793878586962819,
555
  "epoch": 0.6615620214395099,
556
+ "grad_norm": 0.94140625,
557
  "learning_rate": 0.0001353658536585366,
558
+ "loss": 0.0844489261507988,
559
+ "mean_token_accuracy": 0.9678577370941639,
560
+ "num_tokens": 366800.0,
561
  "step": 54
562
  },
563
  {
564
+ "entropy": 0.31044898089021444,
565
  "epoch": 0.6738131699846861,
566
+ "grad_norm": 0.9375,
567
  "learning_rate": 0.00013414634146341464,
568
+ "loss": 0.07886157184839249,
569
+ "mean_token_accuracy": 0.9673981033265591,
570
+ "num_tokens": 373439.0,
571
  "step": 55
572
  },
573
  {
574
+ "entropy": 0.27184910606592894,
575
  "epoch": 0.6860643185298622,
576
+ "grad_norm": 1.3359375,
577
  "learning_rate": 0.0001329268292682927,
578
+ "loss": 0.0787871852517128,
579
+ "mean_token_accuracy": 0.9677317887544632,
580
+ "num_tokens": 380492.0,
581
  "step": 56
582
  },
583
  {
584
+ "entropy": 0.31349051371216774,
585
  "epoch": 0.6983154670750383,
586
+ "grad_norm": 1.59375,
587
  "learning_rate": 0.00013170731707317076,
588
+ "loss": 0.08862332254648209,
589
+ "mean_token_accuracy": 0.9652546346187592,
590
+ "num_tokens": 386711.0,
591
  "step": 57
592
  },
593
  {
594
+ "entropy": 0.2799685625359416,
595
  "epoch": 0.7105666156202144,
596
+ "grad_norm": 1.5078125,
597
  "learning_rate": 0.0001304878048780488,
598
+ "loss": 0.10028493404388428,
599
+ "mean_token_accuracy": 0.9606899172067642,
600
+ "num_tokens": 394124.0,
601
  "step": 58
602
  },
603
  {
604
+ "entropy": 0.2792940763756633,
605
  "epoch": 0.7228177641653905,
606
+ "grad_norm": 1.5859375,
607
  "learning_rate": 0.00012926829268292684,
608
+ "loss": 0.07462260127067566,
609
+ "mean_token_accuracy": 0.9740471467375755,
610
+ "num_tokens": 401499.0,
611
  "step": 59
612
  },
613
  {
614
+ "entropy": 0.29724057391285896,
615
  "epoch": 0.7350689127105666,
616
+ "grad_norm": 0.95703125,
617
  "learning_rate": 0.00012804878048780488,
618
+ "loss": 0.06339482963085175,
619
+ "mean_token_accuracy": 0.9754546955227852,
620
+ "num_tokens": 407443.0,
621
  "step": 60
622
  },
623
  {
624
+ "entropy": 0.2698040744289756,
625
  "epoch": 0.7473200612557427,
626
+ "grad_norm": 0.60546875,
627
  "learning_rate": 0.00012682926829268293,
628
+ "loss": 0.10221480578184128,
629
+ "mean_token_accuracy": 0.9670109152793884,
630
+ "num_tokens": 415471.0,
631
  "step": 61
632
  },
633
  {
634
+ "entropy": 0.2995635373517871,
635
  "epoch": 0.7595712098009189,
636
+ "grad_norm": 1.75,
637
  "learning_rate": 0.000125609756097561,
638
+ "loss": 0.08588436245918274,
639
+ "mean_token_accuracy": 0.9686382673680782,
640
+ "num_tokens": 422504.0,
641
  "step": 62
642
  },
643
  {
644
+ "entropy": 0.2458120621740818,
645
  "epoch": 0.7718223583460949,
646
+ "grad_norm": 2.1875,
647
  "learning_rate": 0.00012439024390243904,
648
+ "loss": 0.08629653602838516,
649
+ "mean_token_accuracy": 0.966422975063324,
650
+ "num_tokens": 430143.0,
651
  "step": 63
652
  },
653
  {
654
+ "entropy": 0.2900782600045204,
655
  "epoch": 0.7840735068912711,
656
+ "grad_norm": 0.9296875,
657
  "learning_rate": 0.00012317073170731708,
658
+ "loss": 0.08716308325529099,
659
+ "mean_token_accuracy": 0.965714868158102,
660
+ "num_tokens": 435664.0,
661
  "step": 64
662
  },
663
  {
664
+ "entropy": 0.29250922333449125,
665
  "epoch": 0.7963246554364471,
666
+ "grad_norm": 0.59375,
667
  "learning_rate": 0.00012195121951219512,
668
+ "loss": 0.08158774673938751,
669
+ "mean_token_accuracy": 0.9694335348904133,
670
+ "num_tokens": 442457.0,
671
  "step": 65
672
  },
673
  {
674
+ "entropy": 0.3083174014464021,
675
  "epoch": 0.8085758039816233,
676
+ "grad_norm": 0.82421875,
677
  "learning_rate": 0.00012073170731707318,
678
+ "loss": 0.0988016203045845,
679
+ "mean_token_accuracy": 0.9648039489984512,
680
+ "num_tokens": 449983.0,
681
  "step": 66
682
  },
683
  {
684
+ "entropy": 0.25693165976554155,
685
  "epoch": 0.8208269525267994,
686
+ "grad_norm": 0.74609375,
687
  "learning_rate": 0.00011951219512195122,
688
+ "loss": 0.07928164303302765,
689
+ "mean_token_accuracy": 0.9698546566069126,
690
+ "num_tokens": 457640.0,
691
  "step": 67
692
  },
693
  {
694
+ "entropy": 0.2752681290730834,
695
  "epoch": 0.8330781010719756,
696
+ "grad_norm": 0.97265625,
697
  "learning_rate": 0.00011829268292682926,
698
+ "loss": 0.07464170455932617,
699
+ "mean_token_accuracy": 0.9697864800691605,
700
+ "num_tokens": 464050.0,
701
  "step": 68
702
  },
703
  {
704
+ "entropy": 0.27110164798796177,
705
  "epoch": 0.8453292496171516,
706
+ "grad_norm": 0.71875,
707
  "learning_rate": 0.00011707317073170732,
708
+ "loss": 0.0718315988779068,
709
+ "mean_token_accuracy": 0.9709942191839218,
710
+ "num_tokens": 469546.0,
711
  "step": 69
712
  },
713
  {
714
+ "entropy": 0.3264527218416333,
715
  "epoch": 0.8575803981623277,
716
+ "grad_norm": 0.62109375,
717
  "learning_rate": 0.00011585365853658536,
718
+ "loss": 0.0866687223315239,
719
+ "mean_token_accuracy": 0.9700192771852016,
720
+ "num_tokens": 475365.0,
721
  "step": 70
722
  },
723
  {
724
+ "entropy": 0.3122966531664133,
725
  "epoch": 0.8698315467075038,
726
+ "grad_norm": 0.67578125,
727
  "learning_rate": 0.00011463414634146342,
728
+ "loss": 0.06088244915008545,
729
+ "mean_token_accuracy": 0.9754119366407394,
730
+ "num_tokens": 481830.0,
731
  "step": 71
732
  },
733
  {
734
+ "entropy": 0.3018254106864333,
735
  "epoch": 0.8820826952526799,
736
+ "grad_norm": 0.56640625,
737
  "learning_rate": 0.00011341463414634146,
738
+ "loss": 0.08657931536436081,
739
+ "mean_token_accuracy": 0.9676030017435551,
740
+ "num_tokens": 487767.0,
741
  "step": 72
742
  },
743
  {
744
+ "entropy": 0.3276115320622921,
745
  "epoch": 0.8943338437978561,
746
+ "grad_norm": 0.5078125,
747
  "learning_rate": 0.00011219512195121953,
748
+ "loss": 0.08024603128433228,
749
+ "mean_token_accuracy": 0.9690204374492168,
750
+ "num_tokens": 494428.0,
751
  "step": 73
752
  },
753
  {
754
+ "entropy": 0.32397411670535803,
755
  "epoch": 0.9065849923430321,
756
+ "grad_norm": 1.1015625,
757
  "learning_rate": 0.00011097560975609757,
758
+ "loss": 0.07867392897605896,
759
+ "mean_token_accuracy": 0.9685576297342777,
760
+ "num_tokens": 500828.0,
761
  "step": 74
762
  },
763
  {
764
+ "entropy": 0.319146528840065,
765
  "epoch": 0.9188361408882083,
766
+ "grad_norm": 0.97265625,
767
  "learning_rate": 0.00010975609756097563,
768
+ "loss": 0.08432602882385254,
769
+ "mean_token_accuracy": 0.9689616709947586,
770
+ "num_tokens": 507523.0,
771
  "step": 75
772
  },
773
  {
774
+ "entropy": 0.3080446803942323,
775
  "epoch": 0.9310872894333844,
776
+ "grad_norm": 1.265625,
777
  "learning_rate": 0.00010853658536585367,
778
+ "loss": 0.0796058252453804,
779
+ "mean_token_accuracy": 0.9683922417461872,
780
+ "num_tokens": 513607.0,
781
  "step": 76
782
  },
783
  {
784
+ "entropy": 0.2667541950941086,
785
  "epoch": 0.9433384379785605,
786
+ "grad_norm": 0.59375,
787
  "learning_rate": 0.00010731707317073172,
788
+ "loss": 0.06495777517557144,
789
+ "mean_token_accuracy": 0.977863471955061,
790
+ "num_tokens": 521376.0,
791
  "step": 77
792
  },
793
  {
794
+ "entropy": 0.27901614736765623,
795
  "epoch": 0.9555895865237366,
796
+ "grad_norm": 1.0859375,
797
  "learning_rate": 0.00010609756097560977,
798
+ "loss": 0.08389777690172195,
799
+ "mean_token_accuracy": 0.967527512460947,
800
+ "num_tokens": 528624.0,
801
  "step": 78
802
  },
803
  {
804
+ "entropy": 0.2754220822826028,
805
  "epoch": 0.9678407350689127,
806
+ "grad_norm": 1.3515625,
807
  "learning_rate": 0.00010487804878048781,
808
+ "loss": 0.0762331560254097,
809
+ "mean_token_accuracy": 0.9713698588311672,
810
+ "num_tokens": 534817.0,
811
  "step": 79
812
  },
813
  {
814
+ "entropy": 0.2981132147833705,
815
  "epoch": 0.9800918836140888,
816
+ "grad_norm": 1.4375,
817
  "learning_rate": 0.00010365853658536586,
818
+ "loss": 0.07953717559576035,
819
+ "mean_token_accuracy": 0.967929158359766,
820
+ "num_tokens": 541716.0,
821
  "step": 80
822
  },
823
  {
824
+ "entropy": 0.30576920323073864,
825
  "epoch": 0.9923430321592649,
826
+ "grad_norm": 1.0234375,
827
  "learning_rate": 0.0001024390243902439,
828
+ "loss": 0.07800528407096863,
829
+ "mean_token_accuracy": 0.971219640225172,
830
+ "num_tokens": 548000.0,
831
  "step": 81
832
  },
833
  {
834
+ "entropy": 0.24986045509576799,
835
  "epoch": 1.0,
836
+ "grad_norm": 0.58203125,
837
  "learning_rate": 0.00010121951219512196,
838
+ "loss": 0.05879032611846924,
839
+ "mean_token_accuracy": 0.9748349964618683,
840
+ "num_tokens": 552608.0,
841
  "step": 82
842
  },
843
  {
844
+ "entropy": 0.2518839007243514,
845
  "epoch": 1.0122511485451762,
846
+ "grad_norm": 0.49609375,
847
  "learning_rate": 0.0001,
848
+ "loss": 0.047237373888492584,
849
+ "mean_token_accuracy": 0.9874232485890388,
850
+ "num_tokens": 559523.0,
851
  "step": 83
852
  },
853
  {
854
+ "entropy": 0.2561075631529093,
855
  "epoch": 1.0245022970903521,
856
+ "grad_norm": 0.65234375,
857
  "learning_rate": 9.878048780487805e-05,
858
+ "loss": 0.04376941919326782,
859
+ "mean_token_accuracy": 0.9896520264446735,
860
+ "num_tokens": 566232.0,
861
  "step": 84
862
  },
863
  {
864
+ "entropy": 0.2935391655191779,
865
  "epoch": 1.0367534456355283,
866
+ "grad_norm": 0.486328125,
867
  "learning_rate": 9.75609756097561e-05,
868
+ "loss": 0.052017997950315475,
869
+ "mean_token_accuracy": 0.9823879115283489,
870
+ "num_tokens": 573965.0,
871
  "step": 85
872
  },
873
  {
874
+ "entropy": 0.21971730748191476,
875
  "epoch": 1.0490045941807045,
876
+ "grad_norm": 0.330078125,
877
  "learning_rate": 9.634146341463415e-05,
878
+ "loss": 0.04022914543747902,
879
+ "mean_token_accuracy": 0.9874378368258476,
880
+ "num_tokens": 580768.0,
881
  "step": 86
882
  },
883
  {
884
+ "entropy": 0.23719595093280077,
885
  "epoch": 1.0612557427258806,
886
+ "grad_norm": 0.68359375,
887
  "learning_rate": 9.51219512195122e-05,
888
+ "loss": 0.04782414808869362,
889
+ "mean_token_accuracy": 0.9846052750945091,
890
+ "num_tokens": 588097.0,
891
  "step": 87
892
  },
893
  {
894
+ "entropy": 0.25634779036045074,
895
  "epoch": 1.0735068912710566,
896
+ "grad_norm": 0.291015625,
897
  "learning_rate": 9.390243902439024e-05,
898
+ "loss": 0.03357430174946785,
899
+ "mean_token_accuracy": 0.9895204566419125,
900
+ "num_tokens": 594215.0,
901
  "step": 88
902
  },
903
  {
904
+ "entropy": 0.26507470663636923,
905
  "epoch": 1.0857580398162328,
906
+ "grad_norm": 0.89453125,
907
  "learning_rate": 9.26829268292683e-05,
908
+ "loss": 0.0427095852792263,
909
+ "mean_token_accuracy": 0.984734483063221,
910
+ "num_tokens": 600018.0,
911
  "step": 89
912
  },
913
  {
914
+ "entropy": 0.25531507655978203,
915
  "epoch": 1.098009188361409,
916
+ "grad_norm": 0.357421875,
917
  "learning_rate": 9.146341463414635e-05,
918
+ "loss": 0.04051242396235466,
919
+ "mean_token_accuracy": 0.9878104776144028,
920
+ "num_tokens": 606254.0,
921
  "step": 90
922
  },
923
  {
924
+ "entropy": 0.26176126673817635,
925
  "epoch": 1.110260336906585,
926
+ "grad_norm": 0.55078125,
927
  "learning_rate": 9.02439024390244e-05,
928
+ "loss": 0.03882109373807907,
929
+ "mean_token_accuracy": 0.9838540144264698,
930
+ "num_tokens": 612316.0,
931
  "step": 91
932
  },
933
  {
934
+ "entropy": 0.2165100760757923,
935
  "epoch": 1.122511485451761,
936
+ "grad_norm": 0.3671875,
937
  "learning_rate": 8.902439024390244e-05,
938
+ "loss": 0.03010629303753376,
939
+ "mean_token_accuracy": 0.9918084405362606,
940
+ "num_tokens": 619629.0,
941
  "step": 92
942
  },
943
  {
944
+ "entropy": 0.24866555724292994,
945
  "epoch": 1.1347626339969372,
946
+ "grad_norm": 0.7578125,
947
  "learning_rate": 8.78048780487805e-05,
948
+ "loss": 0.03892926499247551,
949
+ "mean_token_accuracy": 0.984953761100769,
950
+ "num_tokens": 625947.0,
951
  "step": 93
952
  },
953
  {
954
+ "entropy": 0.21699398616328835,
955
  "epoch": 1.1470137825421134,
956
+ "grad_norm": 0.53125,
957
  "learning_rate": 8.658536585365854e-05,
958
+ "loss": 0.040178049355745316,
959
+ "mean_token_accuracy": 0.986099898815155,
960
+ "num_tokens": 632906.0,
961
  "step": 94
962
  },
963
  {
964
+ "entropy": 0.2104594809934497,
965
  "epoch": 1.1592649310872893,
966
+ "grad_norm": 1.4375,
967
  "learning_rate": 8.53658536585366e-05,
968
+ "loss": 0.05103502795100212,
969
+ "mean_token_accuracy": 0.9873828142881393,
970
+ "num_tokens": 639769.0,
971
  "step": 95
972
  },
973
  {
974
+ "entropy": 0.21941981185227633,
975
  "epoch": 1.1715160796324655,
976
+ "grad_norm": 0.984375,
977
  "learning_rate": 8.414634146341464e-05,
978
+ "loss": 0.03593335300683975,
979
+ "mean_token_accuracy": 0.9901031330227852,
980
+ "num_tokens": 646347.0,
981
  "step": 96
982
  },
983
  {
984
+ "entropy": 0.23086606059223413,
985
  "epoch": 1.1837672281776417,
986
+ "grad_norm": 0.65625,
987
  "learning_rate": 8.292682926829268e-05,
988
+ "loss": 0.034123439341783524,
989
+ "mean_token_accuracy": 0.9874096475541592,
990
+ "num_tokens": 652247.0,
991
  "step": 97
992
  },
993
  {
994
+ "entropy": 0.21858725044876337,
995
  "epoch": 1.1960183767228179,
996
+ "grad_norm": 0.3515625,
997
  "learning_rate": 8.170731707317073e-05,
998
+ "loss": 0.03983831778168678,
999
+ "mean_token_accuracy": 0.9883633032441139,
1000
+ "num_tokens": 659620.0,
1001
  "step": 98
1002
  },
1003
  {
1004
+ "entropy": 0.2186456574127078,
1005
  "epoch": 1.2082695252679938,
1006
+ "grad_norm": 0.50390625,
1007
  "learning_rate": 8.048780487804879e-05,
1008
+ "loss": 0.03659169375896454,
1009
+ "mean_token_accuracy": 0.9874354675412178,
1010
+ "num_tokens": 667017.0,
1011
  "step": 99
1012
  },
1013
  {
1014
+ "entropy": 0.21289387485012412,
1015
  "epoch": 1.22052067381317,
1016
+ "grad_norm": 1.2890625,
1017
  "learning_rate": 7.926829268292683e-05,
1018
+ "loss": 0.09039004892110825,
1019
+ "mean_token_accuracy": 0.9841732494533062,
1020
+ "num_tokens": 673866.0,
1021
  "step": 100
1022
  },
1023
  {
1024
  "epoch": 1.22052067381317,
1025
+ "eval_entropy": 0.22615607968275098,
1026
+ "eval_loss": 0.0748714804649353,
1027
+ "eval_mean_token_accuracy": 0.9701917439267256,
1028
+ "eval_num_tokens": 673866.0,
1029
+ "eval_runtime": 64.1728,
1030
+ "eval_samples_per_second": 1.075,
1031
+ "eval_steps_per_second": 1.075,
1032
  "step": 100
1033
  },
1034
  {
1035
+ "entropy": 0.20847708079963923,
1036
  "epoch": 1.2327718223583461,
1037
+ "grad_norm": 0.9453125,
1038
  "learning_rate": 7.804878048780489e-05,
1039
+ "loss": 0.032662514597177505,
1040
+ "mean_token_accuracy": 0.9919092357158661,
1041
+ "num_tokens": 681308.0,
1042
  "step": 101
1043
  },
1044
  {
1045
+ "entropy": 0.23787071648985147,
1046
  "epoch": 1.245022970903522,
1047
+ "grad_norm": 0.859375,
1048
  "learning_rate": 7.682926829268293e-05,
1049
+ "loss": 0.044949762523174286,
1050
+ "mean_token_accuracy": 0.987742405384779,
1051
+ "num_tokens": 687496.0,
1052
  "step": 102
1053
  },
1054
  {
1055
+ "entropy": 0.21969830617308617,
1056
  "epoch": 1.2572741194486983,
1057
+ "grad_norm": 0.8671875,
1058
  "learning_rate": 7.560975609756099e-05,
1059
+ "loss": 0.036048222333192825,
1060
+ "mean_token_accuracy": 0.98578891903162,
1061
+ "num_tokens": 694818.0,
1062
  "step": 103
1063
  },
1064
  {
1065
+ "entropy": 0.228535583242774,
1066
  "epoch": 1.2695252679938744,
1067
+ "grad_norm": 1.7109375,
1068
  "learning_rate": 7.439024390243903e-05,
1069
+ "loss": 0.050321951508522034,
1070
+ "mean_token_accuracy": 0.9846261814236641,
1071
+ "num_tokens": 701351.0,
1072
  "step": 104
1073
  },
1074
  {
1075
+ "entropy": 0.21918219700455666,
1076
  "epoch": 1.2817764165390506,
1077
+ "grad_norm": 0.57421875,
1078
  "learning_rate": 7.317073170731707e-05,
1079
+ "loss": 0.03220512717962265,
1080
+ "mean_token_accuracy": 0.9897662363946438,
1081
+ "num_tokens": 707212.0,
1082
  "step": 105
1083
  },
1084
  {
1085
+ "entropy": 0.21648676693439484,
1086
  "epoch": 1.2940275650842268,
1087
+ "grad_norm": 0.921875,
1088
  "learning_rate": 7.195121951219513e-05,
1089
+ "loss": 0.031827542930841446,
1090
+ "mean_token_accuracy": 0.9904872179031372,
1091
+ "num_tokens": 714524.0,
1092
  "step": 106
1093
  },
1094
  {
1095
+ "entropy": 0.20004846714437008,
1096
  "epoch": 1.3062787136294027,
1097
+ "grad_norm": 1.0234375,
1098
  "learning_rate": 7.073170731707317e-05,
1099
+ "loss": 0.03981270268559456,
1100
+ "mean_token_accuracy": 0.9861926138401031,
1101
+ "num_tokens": 722033.0,
1102
  "step": 107
1103
  },
1104
  {
1105
+ "entropy": 0.21497153211385012,
1106
  "epoch": 1.318529862174579,
1107
+ "grad_norm": 0.53515625,
1108
  "learning_rate": 6.951219512195122e-05,
1109
+ "loss": 0.03612194582819939,
1110
+ "mean_token_accuracy": 0.9883794784545898,
1111
+ "num_tokens": 728835.0,
1112
  "step": 108
1113
  },
1114
  {
1115
+ "entropy": 0.22441515233367682,
1116
  "epoch": 1.3307810107197549,
1117
+ "grad_norm": 0.66796875,
1118
  "learning_rate": 6.829268292682928e-05,
1119
+ "loss": 0.037204962223768234,
1120
+ "mean_token_accuracy": 0.9865190424025059,
1121
+ "num_tokens": 735463.0,
1122
  "step": 109
1123
  },
1124
  {
1125
+ "entropy": 0.21172351390123367,
1126
  "epoch": 1.343032159264931,
1127
+ "grad_norm": 0.314453125,
1128
  "learning_rate": 6.707317073170732e-05,
1129
+ "loss": 0.03260833024978638,
1130
+ "mean_token_accuracy": 0.9877017810940742,
1131
+ "num_tokens": 742536.0,
1132
  "step": 110
1133
  },
1134
  {
1135
+ "entropy": 0.19597876677289605,
1136
  "epoch": 1.3552833078101072,
1137
+ "grad_norm": 0.419921875,
1138
  "learning_rate": 6.585365853658538e-05,
1139
+ "loss": 0.0339697040617466,
1140
+ "mean_token_accuracy": 0.990579642355442,
1141
+ "num_tokens": 749606.0,
1142
  "step": 111
1143
  },
1144
  {
1145
+ "entropy": 0.21933963894844055,
1146
  "epoch": 1.3675344563552834,
1147
+ "grad_norm": 0.53515625,
1148
  "learning_rate": 6.463414634146342e-05,
1149
+ "loss": 0.028515402227640152,
1150
+ "mean_token_accuracy": 0.9883383698761463,
1151
+ "num_tokens": 755287.0,
1152
  "step": 112
1153
  },
1154
  {
1155
+ "entropy": 0.21494697034358978,
1156
  "epoch": 1.3797856049004595,
1157
+ "grad_norm": 0.37890625,
1158
  "learning_rate": 6.341463414634146e-05,
1159
+ "loss": 0.03924579173326492,
1160
+ "mean_token_accuracy": 0.9876385144889355,
1161
+ "num_tokens": 763515.0,
1162
  "step": 113
1163
  },
1164
  {
1165
+ "entropy": 0.22842750838026404,
1166
  "epoch": 1.3920367534456355,
1167
+ "grad_norm": 1.1484375,
1168
  "learning_rate": 6.219512195121952e-05,
1169
+ "loss": 0.0367334708571434,
1170
+ "mean_token_accuracy": 0.9872251562774181,
1171
+ "num_tokens": 769660.0,
1172
  "step": 114
1173
  },
1174
  {
1175
+ "entropy": 0.2147415135987103,
1176
  "epoch": 1.4042879019908117,
1177
+ "grad_norm": 0.921875,
1178
  "learning_rate": 6.097560975609756e-05,
1179
+ "loss": 0.030023006722331047,
1180
+ "mean_token_accuracy": 0.9890519753098488,
1181
+ "num_tokens": 777068.0,
1182
  "step": 115
1183
  },
1184
  {
1185
+ "entropy": 0.2247378919273615,
1186
  "epoch": 1.4165390505359878,
1187
+ "grad_norm": 0.9375,
1188
  "learning_rate": 5.975609756097561e-05,
1189
+ "loss": 0.03915408253669739,
1190
+ "mean_token_accuracy": 0.9883266240358353,
1191
+ "num_tokens": 783422.0,
1192
  "step": 116
1193
  },
1194
  {
1195
+ "entropy": 0.19090860895812511,
1196
  "epoch": 1.4287901990811638,
1197
+ "grad_norm": 0.765625,
1198
  "learning_rate": 5.853658536585366e-05,
1199
+ "loss": 0.037202730774879456,
1200
+ "mean_token_accuracy": 0.9874398410320282,
1201
+ "num_tokens": 790851.0,
1202
  "step": 117
1203
  },
1204
  {
1205
+ "entropy": 0.2285028137266636,
1206
  "epoch": 1.44104134762634,
1207
+ "grad_norm": 1.9140625,
1208
  "learning_rate": 5.731707317073171e-05,
1209
+ "loss": 0.043229859322309494,
1210
+ "mean_token_accuracy": 0.9905107729136944,
1211
+ "num_tokens": 797801.0,
1212
  "step": 118
1213
  },
1214
  {
1215
+ "entropy": 0.2443255502730608,
1216
  "epoch": 1.4532924961715161,
1217
+ "grad_norm": 0.365234375,
1218
  "learning_rate": 5.6097560975609764e-05,
1219
+ "loss": 0.04100143164396286,
1220
+ "mean_token_accuracy": 0.9880562499165535,
1221
+ "num_tokens": 804371.0,
1222
  "step": 119
1223
  },
1224
  {
1225
+ "entropy": 0.19626039918512106,
1226
  "epoch": 1.4655436447166923,
1227
+ "grad_norm": 0.83984375,
1228
  "learning_rate": 5.487804878048781e-05,
1229
+ "loss": 0.038516998291015625,
1230
+ "mean_token_accuracy": 0.988171175122261,
1231
+ "num_tokens": 812335.0,
1232
  "step": 120
1233
  },
1234
  {
1235
+ "entropy": 0.2181866616010666,
1236
  "epoch": 1.4777947932618682,
1237
+ "grad_norm": 0.53515625,
1238
  "learning_rate": 5.365853658536586e-05,
1239
+ "loss": 0.02816646918654442,
1240
+ "mean_token_accuracy": 0.9916124008595943,
1241
+ "num_tokens": 818577.0,
1242
  "step": 121
1243
  },
1244
  {
1245
+ "entropy": 0.20635052677243948,
1246
  "epoch": 1.4900459418070444,
1247
+ "grad_norm": 0.74609375,
1248
  "learning_rate": 5.2439024390243904e-05,
1249
+ "loss": 0.04106622561812401,
1250
+ "mean_token_accuracy": 0.9839451834559441,
1251
+ "num_tokens": 825535.0,
1252
  "step": 122
1253
  },
1254
  {
1255
+ "entropy": 0.21835408825427294,
1256
  "epoch": 1.5022970903522204,
1257
+ "grad_norm": 0.427734375,
1258
  "learning_rate": 5.121951219512195e-05,
1259
+ "loss": 0.026341412216424942,
1260
+ "mean_token_accuracy": 0.9940293915569782,
1261
+ "num_tokens": 831505.0,
1262
  "step": 123
1263
  },
1264
  {
1265
+ "entropy": 0.21729151718318462,
1266
  "epoch": 1.5145482388973965,
1267
+ "grad_norm": 0.455078125,
1268
  "learning_rate": 5e-05,
1269
+ "loss": 0.028432821854948997,
1270
+ "mean_token_accuracy": 0.9925089627504349,
1271
+ "num_tokens": 838385.0,
1272
  "step": 124
1273
  },
1274
  {
1275
+ "entropy": 0.23625962156802416,
1276
  "epoch": 1.5267993874425727,
1277
+ "grad_norm": 0.72265625,
1278
  "learning_rate": 4.878048780487805e-05,
1279
+ "loss": 0.03885198384523392,
1280
+ "mean_token_accuracy": 0.9883155077695847,
1281
+ "num_tokens": 845433.0,
1282
  "step": 125
1283
  },
1284
  {
1285
+ "entropy": 0.21153692342340946,
1286
  "epoch": 1.5390505359877489,
1287
+ "grad_norm": 0.66796875,
1288
  "learning_rate": 4.75609756097561e-05,
1289
+ "loss": 0.03570759296417236,
1290
+ "mean_token_accuracy": 0.9910184219479561,
1291
+ "num_tokens": 852471.0,
1292
  "step": 126
1293
  },
1294
  {
1295
+ "entropy": 0.23752436228096485,
1296
  "epoch": 1.551301684532925,
1297
+ "grad_norm": 0.640625,
1298
  "learning_rate": 4.634146341463415e-05,
1299
+ "loss": 0.028638798743486404,
1300
+ "mean_token_accuracy": 0.9928638078272343,
1301
+ "num_tokens": 858702.0,
1302
  "step": 127
1303
  },
1304
  {
1305
+ "entropy": 0.2128417994827032,
1306
  "epoch": 1.5635528330781012,
1307
+ "grad_norm": 0.8828125,
1308
  "learning_rate": 4.51219512195122e-05,
1309
+ "loss": 0.029636576771736145,
1310
+ "mean_token_accuracy": 0.9910452663898468,
1311
+ "num_tokens": 865325.0,
1312
  "step": 128
1313
  },
1314
  {
1315
+ "entropy": 0.216589767485857,
1316
  "epoch": 1.5758039816232772,
1317
+ "grad_norm": 0.453125,
1318
  "learning_rate": 4.390243902439025e-05,
1319
+ "loss": 0.03238631784915924,
1320
+ "mean_token_accuracy": 0.9904623441398144,
1321
+ "num_tokens": 871341.0,
1322
  "step": 129
1323
  },
1324
  {
1325
+ "entropy": 0.19242106284946203,
1326
  "epoch": 1.5880551301684533,
1327
+ "grad_norm": 0.392578125,
1328
  "learning_rate": 4.26829268292683e-05,
1329
+ "loss": 0.0261989776045084,
1330
+ "mean_token_accuracy": 0.9925210140645504,
1331
+ "num_tokens": 878973.0,
1332
  "step": 130
1333
  },
1334
  {
1335
+ "entropy": 0.22208478767424822,
1336
  "epoch": 1.6003062787136293,
1337
+ "grad_norm": 0.328125,
1338
  "learning_rate": 4.146341463414634e-05,
1339
+ "loss": 0.029643766582012177,
1340
+ "mean_token_accuracy": 0.9926025420427322,
1341
+ "num_tokens": 885517.0,
1342
  "step": 131
1343
  },
1344
  {
1345
+ "entropy": 0.19283092580735683,
1346
  "epoch": 1.6125574272588055,
1347
+ "grad_norm": 0.423828125,
1348
  "learning_rate": 4.0243902439024395e-05,
1349
+ "loss": 0.03948017954826355,
1350
+ "mean_token_accuracy": 0.9875317811965942,
1351
+ "num_tokens": 893273.0,
1352
  "step": 132
1353
  },
1354
  {
1355
+ "entropy": 0.18790056556463242,
1356
  "epoch": 1.6248085758039816,
1357
+ "grad_norm": 0.625,
1358
  "learning_rate": 3.9024390243902444e-05,
1359
+ "loss": 0.025747017934918404,
1360
+ "mean_token_accuracy": 0.9934940375387669,
1361
+ "num_tokens": 900019.0,
1362
  "step": 133
1363
  },
1364
  {
1365
+ "entropy": 0.20814241049811244,
1366
  "epoch": 1.6370597243491578,
1367
+ "grad_norm": 0.376953125,
1368
  "learning_rate": 3.780487804878049e-05,
1369
+ "loss": 0.03998865559697151,
1370
+ "mean_token_accuracy": 0.9876968078315258,
1371
+ "num_tokens": 906633.0,
1372
  "step": 134
1373
  },
1374
  {
1375
+ "entropy": 0.1975369704887271,
1376
  "epoch": 1.649310872894334,
1377
+ "grad_norm": 0.3203125,
1378
  "learning_rate": 3.6585365853658535e-05,
1379
+ "loss": 0.031131668016314507,
1380
+ "mean_token_accuracy": 0.9915927015244961,
1381
+ "num_tokens": 913990.0,
1382
  "step": 135
1383
  },
1384
  {
1385
+ "entropy": 0.23459685500711203,
1386
  "epoch": 1.66156202143951,
1387
+ "grad_norm": 0.76171875,
1388
  "learning_rate": 3.5365853658536584e-05,
1389
+ "loss": 0.03373259678483009,
1390
+ "mean_token_accuracy": 0.9898596629500389,
1391
+ "num_tokens": 919248.0,
1392
  "step": 136
1393
  },
1394
  {
1395
+ "entropy": 0.1909911371767521,
1396
  "epoch": 1.673813169984686,
1397
+ "grad_norm": 0.60546875,
1398
  "learning_rate": 3.414634146341464e-05,
1399
+ "loss": 0.037791188806295395,
1400
+ "mean_token_accuracy": 0.9897548258304596,
1401
+ "num_tokens": 926248.0,
1402
  "step": 137
1403
  },
1404
  {
1405
+ "entropy": 0.2332595670595765,
1406
  "epoch": 1.686064318529862,
1407
+ "grad_norm": 0.89453125,
1408
  "learning_rate": 3.292682926829269e-05,
1409
+ "loss": 0.03799242898821831,
1410
+ "mean_token_accuracy": 0.9867184162139893,
1411
+ "num_tokens": 932490.0,
1412
  "step": 138
1413
  },
1414
  {
1415
+ "entropy": 0.22243124432861805,
1416
  "epoch": 1.6983154670750382,
1417
+ "grad_norm": 0.61328125,
1418
  "learning_rate": 3.170731707317073e-05,
1419
+ "loss": 0.04291514679789543,
1420
+ "mean_token_accuracy": 0.9877815246582031,
1421
+ "num_tokens": 938756.0,
1422
  "step": 139
1423
  },
1424
  {
1425
+ "entropy": 0.20778016652911901,
1426
  "epoch": 1.7105666156202144,
1427
+ "grad_norm": 0.41796875,
1428
  "learning_rate": 3.048780487804878e-05,
1429
+ "loss": 0.023588458076119423,
1430
+ "mean_token_accuracy": 0.9942950084805489,
1431
+ "num_tokens": 945866.0,
1432
  "step": 140
1433
  },
1434
  {
1435
+ "entropy": 0.18776059616357088,
1436
  "epoch": 1.7228177641653906,
1437
+ "grad_norm": 0.41796875,
1438
  "learning_rate": 2.926829268292683e-05,
1439
+ "loss": 0.03229852020740509,
1440
+ "mean_token_accuracy": 0.9909596405923367,
1441
+ "num_tokens": 952865.0,
1442
  "step": 141
1443
  },
1444
  {
1445
+ "entropy": 0.18707702960819006,
1446
  "epoch": 1.7350689127105667,
1447
+ "grad_norm": 0.609375,
1448
  "learning_rate": 2.8048780487804882e-05,
1449
+ "loss": 0.03691868111491203,
1450
+ "mean_token_accuracy": 0.9900590926408768,
1451
+ "num_tokens": 959190.0,
1452
  "step": 142
1453
  },
1454
  {
1455
+ "entropy": 0.1914756903424859,
1456
  "epoch": 1.7473200612557427,
1457
+ "grad_norm": 0.408203125,
1458
  "learning_rate": 2.682926829268293e-05,
1459
+ "loss": 0.03487441688776016,
1460
+ "mean_token_accuracy": 0.9909356310963631,
1461
+ "num_tokens": 966059.0,
1462
  "step": 143
1463
  },
1464
  {
1465
+ "entropy": 0.20852853963151574,
1466
  "epoch": 1.7595712098009189,
1467
+ "grad_norm": 0.380859375,
1468
  "learning_rate": 2.5609756097560977e-05,
1469
+ "loss": 0.03023841790854931,
1470
+ "mean_token_accuracy": 0.9922478385269642,
1471
+ "num_tokens": 973553.0,
1472
  "step": 144
1473
  },
1474
  {
1475
+ "entropy": 0.18278094567358494,
1476
  "epoch": 1.7718223583460948,
1477
+ "grad_norm": 0.65625,
1478
  "learning_rate": 2.4390243902439026e-05,
1479
+ "loss": 0.03335383161902428,
1480
+ "mean_token_accuracy": 0.9902437664568424,
1481
+ "num_tokens": 980748.0,
1482
  "step": 145
1483
  },
1484
  {
1485
+ "entropy": 0.2156418706290424,
1486
  "epoch": 1.784073506891271,
1487
+ "grad_norm": 0.58984375,
1488
  "learning_rate": 2.3170731707317075e-05,
1489
+ "loss": 0.026211977005004883,
1490
+ "mean_token_accuracy": 0.9913386814296246,
1491
+ "num_tokens": 987018.0,
1492
  "step": 146
1493
  },
1494
  {
1495
+ "entropy": 0.2084086169488728,
1496
  "epoch": 1.7963246554364471,
1497
+ "grad_norm": 0.37890625,
1498
  "learning_rate": 2.1951219512195124e-05,
1499
+ "loss": 0.029074503108859062,
1500
+ "mean_token_accuracy": 0.9920879267156124,
1501
+ "num_tokens": 993841.0,
1502
  "step": 147
1503
  },
1504
  {
1505
+ "entropy": 0.2162067350000143,
1506
  "epoch": 1.8085758039816233,
1507
+ "grad_norm": 0.38671875,
1508
  "learning_rate": 2.073170731707317e-05,
1509
+ "loss": 0.027591165155172348,
1510
+ "mean_token_accuracy": 0.9916894063353539,
1511
+ "num_tokens": 1000318.0,
1512
  "step": 148
1513
  },
1514
  {
1515
+ "entropy": 0.22895692195743322,
1516
  "epoch": 1.8208269525267995,
1517
+ "grad_norm": 1.421875,
1518
  "learning_rate": 1.9512195121951222e-05,
1519
+ "loss": 0.034101299941539764,
1520
+ "mean_token_accuracy": 0.9889856353402138,
1521
+ "num_tokens": 1005747.0,
1522
  "step": 149
1523
  },
1524
  {
1525
+ "entropy": 0.21029841899871826,
1526
  "epoch": 1.8330781010719757,
1527
+ "grad_norm": 0.59375,
1528
  "learning_rate": 1.8292682926829268e-05,
1529
+ "loss": 0.04408642649650574,
1530
+ "mean_token_accuracy": 0.988445583730936,
1531
+ "num_tokens": 1013365.0,
1532
  "step": 150
1533
  },
1534
  {
1535
  "epoch": 1.8330781010719757,
1536
+ "eval_entropy": 0.21028992522885834,
1537
+ "eval_loss": 0.06481878459453583,
1538
+ "eval_mean_token_accuracy": 0.9753203677094501,
1539
+ "eval_num_tokens": 1013365.0,
1540
+ "eval_runtime": 64.1224,
1541
+ "eval_samples_per_second": 1.076,
1542
+ "eval_steps_per_second": 1.076,
1543
  "step": 150
1544
  },
1545
  {
1546
+ "entropy": 0.23085341975092888,
1547
  "epoch": 1.8453292496171516,
1548
+ "grad_norm": 0.375,
1549
  "learning_rate": 1.707317073170732e-05,
1550
+ "loss": 0.03518415987491608,
1551
+ "mean_token_accuracy": 0.9906566366553307,
1552
+ "num_tokens": 1018758.0,
1553
  "step": 151
1554
  },
1555
  {
1556
+ "entropy": 0.21574621414765716,
1557
  "epoch": 1.8575803981623276,
1558
+ "grad_norm": 0.51953125,
1559
  "learning_rate": 1.5853658536585366e-05,
1560
+ "loss": 0.028541577979922295,
1561
+ "mean_token_accuracy": 0.9907862320542336,
1562
+ "num_tokens": 1026224.0,
1563
  "step": 152
1564
  },
1565
  {
1566
+ "entropy": 0.18795023765414953,
1567
  "epoch": 1.8698315467075037,
1568
+ "grad_norm": 0.5234375,
1569
  "learning_rate": 1.4634146341463415e-05,
1570
+ "loss": 0.02955229952931404,
1571
+ "mean_token_accuracy": 0.9923531115055084,
1572
+ "num_tokens": 1033949.0,
1573
  "step": 153
1574
  },
1575
  {
1576
+ "entropy": 0.21928654983639717,
1577
  "epoch": 1.88208269525268,
1578
+ "grad_norm": 0.2421875,
1579
  "learning_rate": 1.3414634146341466e-05,
1580
+ "loss": 0.02629980631172657,
1581
+ "mean_token_accuracy": 0.9936561770737171,
1582
+ "num_tokens": 1041050.0,
1583
  "step": 154
1584
  },
1585
  {
1586
+ "entropy": 0.22051549516618252,
1587
  "epoch": 1.894333843797856,
1588
+ "grad_norm": 0.419921875,
1589
  "learning_rate": 1.2195121951219513e-05,
1590
+ "loss": 0.03377772495150566,
1591
+ "mean_token_accuracy": 0.9897954650223255,
1592
+ "num_tokens": 1047621.0,
1593
  "step": 155
1594
  },
1595
  {
1596
+ "entropy": 0.220434432849288,
1597
  "epoch": 1.9065849923430322,
1598
+ "grad_norm": 0.640625,
1599
  "learning_rate": 1.0975609756097562e-05,
1600
+ "loss": 0.03985638543963432,
1601
+ "mean_token_accuracy": 0.9877757839858532,
1602
+ "num_tokens": 1053963.0,
1603
  "step": 156
1604
  },
1605
  {
1606
+ "entropy": 0.20754134468734264,
1607
  "epoch": 1.9188361408882084,
1608
+ "grad_norm": 0.34765625,
1609
  "learning_rate": 9.756097560975611e-06,
1610
+ "loss": 0.03090263158082962,
1611
+ "mean_token_accuracy": 0.9914594441652298,
1612
+ "num_tokens": 1061378.0,
1613
  "step": 157
1614
  },
1615
  {
1616
+ "entropy": 0.21179267205297947,
1617
  "epoch": 1.9310872894333844,
1618
+ "grad_norm": 0.5625,
1619
  "learning_rate": 8.53658536585366e-06,
1620
+ "loss": 0.028556756675243378,
1621
+ "mean_token_accuracy": 0.9921947717666626,
1622
+ "num_tokens": 1067649.0,
1623
  "step": 158
1624
  },
1625
  {
1626
+ "entropy": 0.20413818582892418,
1627
  "epoch": 1.9433384379785605,
1628
+ "grad_norm": 0.54296875,
1629
  "learning_rate": 7.317073170731707e-06,
1630
+ "loss": 0.029063822701573372,
1631
+ "mean_token_accuracy": 0.9915811195969582,
1632
+ "num_tokens": 1073729.0,
1633
  "step": 159
1634
  },
1635
  {
1636
+ "entropy": 0.19627654599025846,
1637
  "epoch": 1.9555895865237365,
1638
+ "grad_norm": 0.3125,
1639
  "learning_rate": 6.0975609756097564e-06,
1640
+ "loss": 0.03230921924114227,
1641
+ "mean_token_accuracy": 0.9897297285497189,
1642
+ "num_tokens": 1080422.0,
1643
  "step": 160
1644
  },
1645
  {
1646
+ "entropy": 0.20519507862627506,
1647
  "epoch": 1.9678407350689127,
1648
+ "grad_norm": 0.306640625,
1649
  "learning_rate": 4.8780487804878055e-06,
1650
+ "loss": 0.026061467826366425,
1651
+ "mean_token_accuracy": 0.9930403046309948,
1652
+ "num_tokens": 1087517.0,
1653
  "step": 161
1654
  },
1655
  {
1656
+ "entropy": 0.21147159207612276,
1657
  "epoch": 1.9800918836140888,
1658
+ "grad_norm": 0.458984375,
1659
  "learning_rate": 3.6585365853658537e-06,
1660
+ "loss": 0.03128533065319061,
1661
+ "mean_token_accuracy": 0.9902758039534092,
1662
+ "num_tokens": 1093989.0,
1663
  "step": 162
1664
  },
1665
  {
1666
+ "entropy": 0.2088263975456357,
1667
  "epoch": 1.992343032159265,
1668
+ "grad_norm": 0.5390625,
1669
  "learning_rate": 2.4390243902439027e-06,
1670
+ "loss": 0.047114044427871704,
1671
+ "mean_token_accuracy": 0.9882928691804409,
1672
+ "num_tokens": 1101351.0,
1673
  "step": 163
1674
  },
1675
  {
1676
+ "entropy": 0.2400845021009445,
1677
  "epoch": 2.0,
1678
+ "grad_norm": 0.61328125,
1679
  "learning_rate": 1.2195121951219514e-06,
1680
+ "loss": 0.024358952417969704,
1681
+ "mean_token_accuracy": 0.9906890511512756,
1682
+ "num_tokens": 1105216.0,
1683
  "step": 164
1684
  }
1685
  ],
 
1700
  "attributes": {}
1701
  }
1702
  },
1703
+ "total_flos": 5.004551367111475e+16,
1704
  "train_batch_size": 1,
1705
  "trial_name": null,
1706
  "trial_params": null