ChristophSchuhmann commited on
Commit
5f2a770
·
verified ·
1 Parent(s): cff7621

Initial upload of Vocalino_0.1_alpha

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2bb27a6045f2a804225e4385145d74b9fe219cb6252a075b1ba16f193e08a55
3
  size 4991037968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a71b6adcef4539fb5482f20abe9b1538f393f31a56c1eb1e15c2658ef980b217
3
  size 4991037968
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74cbeb23f27a066ede04ae5fb95da8660764c05811c5f58bb2327b55f4475a60
3
  size 1610725592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf8f47e69590cc9d5682340b5b2db04c088225dfa08d29f5289ea537a20414d4
3
  size 1610725592
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86b39713e36428f170f811389f92c827a268c0c4520f927242070d805ef09ef0
3
  size 13203690391
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7c9e30cc6ba4e695dc260982dc19384db615b96f806cd7c31f0cb7c1aee3136
3
  size 13203690391
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b093dfe59b41efeb45cc3d628d3360abaa2303bbaa489081411faf431e52941d
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8f92cf63e0989759370d24108b469c492c12202403f036015307ce49f12cedc
3
  size 16389
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:450a0ac1645503c0b14fe9c37d77060cc76b1c9942dcfdd0e779cd526b2e98d9
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ed40a0a4e9f365d2c6cc004d97e6705894eba46c8be4c160c1455bc3062dee1
3
  size 16389
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:938b37918eac9a4cbef3805f7d2abdcef094a334f848e73ac19fcdc39d38663a
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d688b304d19c260b5cfa471535ed51d7e1d60b3a0d0159dfd1a04b87904a9f42
3
  size 16389
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8b27a54988f134299ab296b95e8c1e63d476dffdba7c6f120f2076e8688f355
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9967425ebcaee80d9b518fa0244d52f739b1b983d87cda71d5fede0c073e9d3b
3
  size 16389
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d95f73d920296d5d9558e47894c5a2c0d649d7cb10a3b07a013d6bfbd3b8cf90
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:469900fd39c667ffbd49c3c407c0ba317a1e9f5f9339a99b5d38423b7d0ce6d4
3
  size 16389
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70b945bb634c9daf4a00433296ecc5245b34a2b5f09017993b5f5f03b84dabea
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:124688471ff2a6e80f2fcefedbf741fb18d08dd539d5bd07a52e81be545142a5
3
  size 16389
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bfdd1fca0dace16a59c8592c531a70661218184bb0249c5862bbfb5ab0844fc9
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e69f1ced9f992a72c948698e5eb06088610788988cdb2fdbdd624e064319d60
3
  size 16389
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d106363f9f1b0ff898c86d083a097bf22fd84de35e5670aa299504abcc99752a
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a376268a55d6ee10c371c06aa952334c4c6a1af9ea2d71b1951a57367a0c6722
3
  size 16389
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3563cb5a79f07ff892bc530b5dd0506394891bf5253615cfbbffedad4030dc36
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88dc02118a954c9e13d2edebcae6232e190a54b8a4ac26352359138b0034c512
3
  size 1465
trainer_state.json CHANGED
@@ -2,2433 +2,823 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.0,
6
  "eval_steps": 500,
7
- "global_step": 3453,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.008694707097054668,
14
- "grad_norm": 0.1494140625,
15
- "learning_rate": 3.6000000000000003e-06,
16
- "loss": 4.1806,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.017389414194109335,
21
- "grad_norm": 0.138671875,
22
- "learning_rate": 7.600000000000001e-06,
23
- "loss": 4.2081,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.026084121291164004,
28
- "grad_norm": 0.06201171875,
29
- "learning_rate": 1.16e-05,
30
- "loss": 4.1679,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.03477882838821867,
35
- "grad_norm": 0.05078125,
36
- "learning_rate": 1.5600000000000003e-05,
37
- "loss": 4.1451,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.04347353548527334,
42
- "grad_norm": 0.044677734375,
43
- "learning_rate": 1.9600000000000002e-05,
44
- "loss": 4.1028,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.05216824258232801,
49
- "grad_norm": 0.0400390625,
50
- "learning_rate": 2.36e-05,
51
- "loss": 4.0724,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.060862949679382675,
56
- "grad_norm": 0.036865234375,
57
- "learning_rate": 2.76e-05,
58
- "loss": 4.0766,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.06955765677643734,
63
- "grad_norm": 0.03662109375,
64
- "learning_rate": 3.16e-05,
65
- "loss": 4.0271,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.078252363873492,
70
- "grad_norm": 0.03515625,
71
- "learning_rate": 3.5600000000000005e-05,
72
- "loss": 4.025,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.08694707097054669,
77
- "grad_norm": 0.034912109375,
78
- "learning_rate": 3.96e-05,
79
- "loss": 4.023,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.09564177806760135,
84
- "grad_norm": 0.03515625,
85
- "learning_rate": 3.9999288925533835e-05,
86
- "loss": 4.0125,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.10433648516465602,
91
- "grad_norm": 0.033447265625,
92
- "learning_rate": 3.9996830955256915e-05,
93
- "loss": 3.996,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.11303119226171068,
98
- "grad_norm": 0.03466796875,
99
- "learning_rate": 3.9992617526198226e-05,
100
- "loss": 3.9673,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.12172589935876535,
105
- "grad_norm": 0.03466796875,
106
- "learning_rate": 3.9986649008241436e-05,
107
- "loss": 3.9865,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.13042060645582002,
112
- "grad_norm": 0.033447265625,
113
- "learning_rate": 3.9978925925343914e-05,
114
- "loss": 3.9762,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.13911531355287468,
119
- "grad_norm": 0.03369140625,
120
- "learning_rate": 3.996944895549079e-05,
121
- "loss": 3.9589,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.14781002064992935,
126
- "grad_norm": 0.037353515625,
127
- "learning_rate": 3.9958218930635366e-05,
128
- "loss": 3.9763,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.156504727746984,
133
- "grad_norm": 0.033935546875,
134
- "learning_rate": 3.994523683662616e-05,
135
- "loss": 3.9622,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.16519943484403868,
140
- "grad_norm": 0.036865234375,
141
- "learning_rate": 3.99305038131203e-05,
142
- "loss": 3.9718,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 0.17389414194109337,
147
- "grad_norm": 0.03515625,
148
- "learning_rate": 3.991402115348347e-05,
149
- "loss": 3.9812,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 0.18258884903814804,
154
- "grad_norm": 0.03466796875,
155
- "learning_rate": 3.9895790304676455e-05,
156
- "loss": 3.9584,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.1912835561352027,
161
- "grad_norm": 0.033447265625,
162
- "learning_rate": 3.987581286712802e-05,
163
- "loss": 3.9556,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.19997826323225737,
168
- "grad_norm": 0.0361328125,
169
- "learning_rate": 3.985409059459444e-05,
170
- "loss": 3.9816,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.20867297032931204,
175
- "grad_norm": 0.033203125,
176
- "learning_rate": 3.9830625394005615e-05,
177
- "loss": 3.952,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.2173676774263667,
182
- "grad_norm": 0.035400390625,
183
- "learning_rate": 3.980541932529754e-05,
184
- "loss": 3.9577,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.22606238452342137,
189
- "grad_norm": 0.03515625,
190
- "learning_rate": 3.977847460123157e-05,
191
- "loss": 3.9635,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.23475709162047603,
196
- "grad_norm": 0.03369140625,
197
- "learning_rate": 3.9749793587200133e-05,
198
- "loss": 3.9593,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.2434517987175307,
203
- "grad_norm": 0.03564453125,
204
- "learning_rate": 3.9719378801019086e-05,
205
- "loss": 3.9556,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.25214650581458536,
210
- "grad_norm": 0.034423828125,
211
- "learning_rate": 3.968723291270667e-05,
212
- "loss": 3.9426,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.26084121291164003,
217
- "grad_norm": 0.034912109375,
218
- "learning_rate": 3.965335874424912e-05,
219
- "loss": 3.9334,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.2695359200086947,
224
- "grad_norm": 0.034912109375,
225
- "learning_rate": 3.9617759269352974e-05,
226
- "loss": 3.9393,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.27823062710574936,
231
- "grad_norm": 0.034423828125,
232
- "learning_rate": 3.958043761318394e-05,
233
- "loss": 3.9261,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.286925334202804,
238
- "grad_norm": 0.0361328125,
239
- "learning_rate": 3.9541397052092634e-05,
240
- "loss": 3.9246,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.2956200412998587,
245
- "grad_norm": 0.03369140625,
246
- "learning_rate": 3.9500641013326896e-05,
247
- "loss": 3.9479,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.30431474839691336,
252
- "grad_norm": 0.035400390625,
253
- "learning_rate": 3.9458173074730946e-05,
254
- "loss": 3.9463,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.313009455493968,
259
- "grad_norm": 0.033203125,
260
- "learning_rate": 3.941399696443131e-05,
261
- "loss": 3.9378,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.3217041625910227,
266
- "grad_norm": 0.0361328125,
267
- "learning_rate": 3.9368116560509515e-05,
268
- "loss": 3.9318,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.33039886968807736,
273
- "grad_norm": 0.03515625,
274
- "learning_rate": 3.9320535890661656e-05,
275
- "loss": 3.9432,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.3390935767851321,
280
- "grad_norm": 0.0341796875,
281
- "learning_rate": 3.927125913184482e-05,
282
- "loss": 3.918,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.34778828388218674,
287
- "grad_norm": 0.035400390625,
288
- "learning_rate": 3.9220290609910384e-05,
289
- "loss": 3.9465,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 0.3564829909792414,
294
- "grad_norm": 0.03466796875,
295
- "learning_rate": 3.9167634799224314e-05,
296
- "loss": 3.936,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.3651776980762961,
301
- "grad_norm": 0.036376953125,
302
- "learning_rate": 3.911329632227431e-05,
303
- "loss": 3.9363,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 0.37387240517335074,
308
- "grad_norm": 0.036376953125,
309
- "learning_rate": 3.905727994926404e-05,
310
- "loss": 3.9353,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 0.3825671122704054,
315
- "grad_norm": 0.033203125,
316
- "learning_rate": 3.89995905976944e-05,
317
- "loss": 3.9252,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 0.3912618193674601,
322
- "grad_norm": 0.03466796875,
323
- "learning_rate": 3.894023333193178e-05,
324
- "loss": 3.9201,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 0.39995652646451474,
329
- "grad_norm": 0.034423828125,
330
- "learning_rate": 3.8879213362763524e-05,
331
- "loss": 3.9431,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 0.4086512335615694,
336
- "grad_norm": 0.0341796875,
337
- "learning_rate": 3.8816536046940454e-05,
338
- "loss": 3.9165,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 0.41734594065862407,
343
- "grad_norm": 0.035888671875,
344
- "learning_rate": 3.8752206886706644e-05,
345
- "loss": 3.9136,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 0.42604064775567874,
350
- "grad_norm": 0.036865234375,
351
- "learning_rate": 3.868623152931636e-05,
352
- "loss": 3.9326,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 0.4347353548527334,
357
- "grad_norm": 0.0361328125,
358
- "learning_rate": 3.861861576653836e-05,
359
- "loss": 3.9389,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.44343006194978807,
364
- "grad_norm": 0.033935546875,
365
- "learning_rate": 3.854936553414741e-05,
366
- "loss": 3.9158,
367
  "step": 510
368
  },
369
  {
370
  "epoch": 0.45212476904684273,
371
- "grad_norm": 0.03466796875,
372
- "learning_rate": 3.8478486911403205e-05,
373
- "loss": 3.9115,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 0.4608194761438974,
378
- "grad_norm": 0.03466796875,
379
- "learning_rate": 3.84059861205167e-05,
380
- "loss": 3.9334,
381
  "step": 530
382
  },
383
  {
384
  "epoch": 0.46951418324095207,
385
- "grad_norm": 0.035888671875,
386
- "learning_rate": 3.83318695261039e-05,
387
- "loss": 3.9061,
388
  "step": 540
389
  },
390
  {
391
  "epoch": 0.47820889033800673,
392
- "grad_norm": 0.035888671875,
393
- "learning_rate": 3.825614363462708e-05,
394
- "loss": 3.9163,
395
  "step": 550
396
  },
397
  {
398
  "epoch": 0.4869035974350614,
399
- "grad_norm": 0.033935546875,
400
- "learning_rate": 3.8178815093823667e-05,
401
- "loss": 3.9329,
402
  "step": 560
403
  },
404
  {
405
  "epoch": 0.49559830453211606,
406
- "grad_norm": 0.0341796875,
407
- "learning_rate": 3.8099890692122585e-05,
408
- "loss": 3.9045,
409
  "step": 570
410
  },
411
  {
412
  "epoch": 0.5042930116291707,
413
- "grad_norm": 0.036865234375,
414
- "learning_rate": 3.801937735804838e-05,
415
- "loss": 3.9127,
416
  "step": 580
417
  },
418
  {
419
  "epoch": 0.5129877187262254,
420
- "grad_norm": 0.03662109375,
421
- "learning_rate": 3.7937282159612975e-05,
422
- "loss": 3.9108,
423
  "step": 590
424
  },
425
  {
426
  "epoch": 0.5216824258232801,
427
  "grad_norm": 0.03662109375,
428
- "learning_rate": 3.7853612303695165e-05,
429
- "loss": 3.9157,
430
  "step": 600
431
  },
432
  {
433
  "epoch": 0.5303771329203347,
434
- "grad_norm": 0.035888671875,
435
- "learning_rate": 3.776837513540796e-05,
436
- "loss": 3.9099,
437
  "step": 610
438
  },
439
  {
440
  "epoch": 0.5390718400173894,
441
- "grad_norm": 0.034423828125,
442
- "learning_rate": 3.768157813745379e-05,
443
- "loss": 3.9199,
444
  "step": 620
445
  },
446
  {
447
  "epoch": 0.5477665471144441,
448
- "grad_norm": 0.03515625,
449
- "learning_rate": 3.7593228929467625e-05,
450
- "loss": 3.9142,
451
  "step": 630
452
  },
453
  {
454
  "epoch": 0.5564612542114987,
455
- "grad_norm": 0.036865234375,
456
- "learning_rate": 3.7503335267348047e-05,
457
- "loss": 3.9015,
458
  "step": 640
459
  },
460
  {
461
  "epoch": 0.5651559613085534,
462
- "grad_norm": 0.033447265625,
463
- "learning_rate": 3.741190504257641e-05,
464
- "loss": 3.8872,
465
  "step": 650
466
  },
467
  {
468
  "epoch": 0.573850668405608,
469
- "grad_norm": 0.03515625,
470
- "learning_rate": 3.731894628152407e-05,
471
- "loss": 3.919,
472
  "step": 660
473
  },
474
  {
475
  "epoch": 0.5825453755026627,
476
- "grad_norm": 0.037109375,
477
- "learning_rate": 3.722446714474775e-05,
478
- "loss": 3.9111,
479
  "step": 670
480
  },
481
  {
482
  "epoch": 0.5912400825997174,
483
- "grad_norm": 0.03466796875,
484
- "learning_rate": 3.712847592627317e-05,
485
- "loss": 3.8901,
486
  "step": 680
487
  },
488
  {
489
  "epoch": 0.599934789696772,
490
- "grad_norm": 0.035888671875,
491
- "learning_rate": 3.703098105286695e-05,
492
- "loss": 3.8987,
493
  "step": 690
494
  },
495
  {
496
  "epoch": 0.6086294967938267,
497
- "grad_norm": 0.0341796875,
498
- "learning_rate": 3.693199108329682e-05,
499
- "loss": 3.8957,
500
  "step": 700
501
  },
502
  {
503
  "epoch": 0.6173242038908814,
504
- "grad_norm": 0.0341796875,
505
- "learning_rate": 3.683151470758032e-05,
506
- "loss": 3.8935,
507
  "step": 710
508
  },
509
  {
510
  "epoch": 0.626018910987936,
511
- "grad_norm": 0.034423828125,
512
- "learning_rate": 3.6729560746221876e-05,
513
- "loss": 3.8897,
514
  "step": 720
515
  },
516
  {
517
  "epoch": 0.6347136180849907,
518
- "grad_norm": 0.0341796875,
519
- "learning_rate": 3.66261381494385e-05,
520
- "loss": 3.889,
521
  "step": 730
522
  },
523
  {
524
  "epoch": 0.6434083251820454,
525
- "grad_norm": 0.036865234375,
526
- "learning_rate": 3.65212559963741e-05,
527
- "loss": 3.9228,
528
  "step": 740
529
  },
530
  {
531
  "epoch": 0.6521030322791,
532
- "grad_norm": 0.03515625,
533
- "learning_rate": 3.6414923494302426e-05,
534
- "loss": 3.9131,
535
  "step": 750
536
  },
537
  {
538
  "epoch": 0.6607977393761547,
539
  "grad_norm": 0.037109375,
540
- "learning_rate": 3.630714997781878e-05,
541
- "loss": 3.8993,
542
  "step": 760
543
  },
544
  {
545
  "epoch": 0.6694924464732094,
546
- "grad_norm": 0.034912109375,
547
- "learning_rate": 3.61979449080206e-05,
548
- "loss": 3.9163,
549
  "step": 770
550
  },
551
  {
552
  "epoch": 0.6781871535702642,
553
- "grad_norm": 0.036865234375,
554
- "learning_rate": 3.608731787167687e-05,
555
- "loss": 3.929,
556
  "step": 780
557
  },
558
  {
559
  "epoch": 0.6868818606673188,
560
- "grad_norm": 0.035400390625,
561
- "learning_rate": 3.597527858038654e-05,
562
- "loss": 3.8953,
563
  "step": 790
564
  },
565
  {
566
  "epoch": 0.6955765677643735,
567
- "grad_norm": 0.0341796875,
568
- "learning_rate": 3.5861836869725964e-05,
569
- "loss": 3.9063,
570
  "step": 800
571
  },
572
  {
573
  "epoch": 0.7042712748614282,
574
  "grad_norm": 0.035400390625,
575
- "learning_rate": 3.5747002698385476e-05,
576
- "loss": 3.9082,
577
  "step": 810
578
  },
579
  {
580
  "epoch": 0.7129659819584828,
581
- "grad_norm": 0.033935546875,
582
- "learning_rate": 3.563078614729516e-05,
583
- "loss": 3.8945,
584
  "step": 820
585
  },
586
  {
587
  "epoch": 0.7216606890555375,
588
- "grad_norm": 0.03466796875,
589
- "learning_rate": 3.551319741873984e-05,
590
- "loss": 3.9141,
591
  "step": 830
592
  },
593
  {
594
  "epoch": 0.7303553961525922,
595
- "grad_norm": 0.037353515625,
596
- "learning_rate": 3.539424683546349e-05,
597
- "loss": 3.9021,
598
  "step": 840
599
  },
600
  {
601
  "epoch": 0.7390501032496468,
602
- "grad_norm": 0.03369140625,
603
- "learning_rate": 3.5273944839763014e-05,
604
- "loss": 3.8953,
605
  "step": 850
606
  },
607
  {
608
  "epoch": 0.7477448103467015,
609
- "grad_norm": 0.034423828125,
610
- "learning_rate": 3.515230199257156e-05,
611
- "loss": 3.9092,
612
  "step": 860
613
  },
614
  {
615
  "epoch": 0.7564395174437561,
616
  "grad_norm": 0.038330078125,
617
- "learning_rate": 3.502932897253139e-05,
618
- "loss": 3.9191,
619
  "step": 870
620
  },
621
  {
622
  "epoch": 0.7651342245408108,
623
- "grad_norm": 0.03466796875,
624
- "learning_rate": 3.490503657505648e-05,
625
- "loss": 3.9052,
626
  "step": 880
627
  },
628
  {
629
  "epoch": 0.7738289316378655,
630
  "grad_norm": 0.036865234375,
631
- "learning_rate": 3.477943571138475e-05,
632
- "loss": 3.8903,
633
  "step": 890
634
  },
635
  {
636
  "epoch": 0.7825236387349201,
637
- "grad_norm": 0.03466796875,
638
- "learning_rate": 3.465253740762028e-05,
639
- "loss": 3.9222,
640
  "step": 900
641
  },
642
  {
643
  "epoch": 0.7912183458319748,
644
- "grad_norm": 0.03515625,
645
- "learning_rate": 3.452435280376532e-05,
646
- "loss": 3.9004,
647
  "step": 910
648
  },
649
  {
650
  "epoch": 0.7999130529290295,
651
- "grad_norm": 0.0341796875,
652
- "learning_rate": 3.439489315274233e-05,
653
- "loss": 3.8694,
654
  "step": 920
655
  },
656
  {
657
  "epoch": 0.8086077600260841,
658
- "grad_norm": 0.036865234375,
659
- "learning_rate": 3.4264169819406166e-05,
660
- "loss": 3.9082,
661
  "step": 930
662
  },
663
  {
664
  "epoch": 0.8173024671231388,
665
- "grad_norm": 0.035400390625,
666
- "learning_rate": 3.413219427954638e-05,
667
- "loss": 3.894,
668
  "step": 940
669
  },
670
  {
671
  "epoch": 0.8259971742201935,
672
- "grad_norm": 0.037353515625,
673
- "learning_rate": 3.399897811887976e-05,
674
- "loss": 3.9075,
675
  "step": 950
676
  },
677
  {
678
  "epoch": 0.8346918813172481,
679
- "grad_norm": 0.0341796875,
680
- "learning_rate": 3.386453303203331e-05,
681
- "loss": 3.8863,
682
  "step": 960
683
  },
684
  {
685
  "epoch": 0.8433865884143028,
686
- "grad_norm": 0.034912109375,
687
- "learning_rate": 3.3728870821517584e-05,
688
- "loss": 3.9042,
689
  "step": 970
690
  },
691
  {
692
  "epoch": 0.8520812955113575,
693
- "grad_norm": 0.03515625,
694
- "learning_rate": 3.359200339669058e-05,
695
- "loss": 3.9249,
696
  "step": 980
697
  },
698
  {
699
  "epoch": 0.8607760026084121,
700
- "grad_norm": 0.03369140625,
701
- "learning_rate": 3.3453942772712274e-05,
702
- "loss": 3.8849,
703
  "step": 990
704
  },
705
  {
706
  "epoch": 0.8694707097054668,
707
- "grad_norm": 0.0361328125,
708
- "learning_rate": 3.3314701069489835e-05,
709
- "loss": 3.9124,
710
  "step": 1000
711
  },
712
  {
713
  "epoch": 0.8781654168025215,
714
- "grad_norm": 0.037109375,
715
- "learning_rate": 3.317429051061364e-05,
716
- "loss": 3.9019,
717
  "step": 1010
718
  },
719
  {
720
  "epoch": 0.8868601238995761,
721
- "grad_norm": 0.035888671875,
722
- "learning_rate": 3.303272342228425e-05,
723
- "loss": 3.9227,
724
  "step": 1020
725
  },
726
  {
727
  "epoch": 0.8955548309966308,
728
- "grad_norm": 0.0341796875,
729
- "learning_rate": 3.2890012232230276e-05,
730
- "loss": 3.8926,
731
  "step": 1030
732
  },
733
  {
734
  "epoch": 0.9042495380936855,
735
- "grad_norm": 0.035888671875,
736
- "learning_rate": 3.274616946861744e-05,
737
- "loss": 3.9011,
738
  "step": 1040
739
  },
740
  {
741
  "epoch": 0.9129442451907401,
742
- "grad_norm": 0.036865234375,
743
- "learning_rate": 3.260120775894871e-05,
744
- "loss": 3.9038,
745
  "step": 1050
746
  },
747
  {
748
  "epoch": 0.9216389522877948,
749
- "grad_norm": 0.036376953125,
750
- "learning_rate": 3.245513982895583e-05,
751
- "loss": 3.9069,
752
  "step": 1060
753
  },
754
  {
755
  "epoch": 0.9303336593848495,
756
- "grad_norm": 0.035888671875,
757
- "learning_rate": 3.2307978501482126e-05,
758
- "loss": 3.8964,
759
  "step": 1070
760
  },
761
  {
762
  "epoch": 0.9390283664819041,
763
- "grad_norm": 0.03564453125,
764
- "learning_rate": 3.2159736695356846e-05,
765
- "loss": 3.8805,
766
  "step": 1080
767
  },
768
  {
769
  "epoch": 0.9477230735789588,
770
- "grad_norm": 0.03564453125,
771
- "learning_rate": 3.2010427424261065e-05,
772
- "loss": 3.8927,
773
  "step": 1090
774
  },
775
  {
776
  "epoch": 0.9564177806760135,
777
- "grad_norm": 0.03515625,
778
- "learning_rate": 3.1860063795585225e-05,
779
- "loss": 3.8937,
780
  "step": 1100
781
  },
782
  {
783
  "epoch": 0.9651124877730681,
784
- "grad_norm": 0.033935546875,
785
- "learning_rate": 3.170865900927852e-05,
786
- "loss": 3.9086,
787
  "step": 1110
788
  },
789
  {
790
  "epoch": 0.9738071948701228,
791
- "grad_norm": 0.0341796875,
792
- "learning_rate": 3.1556226356690066e-05,
793
- "loss": 3.8795,
794
  "step": 1120
795
  },
796
  {
797
  "epoch": 0.9825019019671775,
798
- "grad_norm": 0.034912109375,
799
- "learning_rate": 3.140277921940215e-05,
800
- "loss": 3.8904,
801
  "step": 1130
802
  },
803
  {
804
  "epoch": 0.9911966090642321,
805
- "grad_norm": 0.03369140625,
806
- "learning_rate": 3.124833106805546e-05,
807
- "loss": 3.8929,
808
  "step": 1140
809
  },
810
  {
811
  "epoch": 0.9998913161612868,
812
- "grad_norm": 0.0341796875,
813
- "learning_rate": 3.1092895461166534e-05,
814
- "loss": 3.9182,
815
- "step": 1150
816
- },
817
- {
818
- "epoch": 1.0078252363873492,
819
- "grad_norm": 0.03515625,
820
- "learning_rate": 3.093648604393756e-05,
821
- "loss": 3.8922,
822
- "step": 1160
823
- },
824
- {
825
- "epoch": 1.0165199434844039,
826
- "grad_norm": 0.03515625,
827
- "learning_rate": 3.077911654705843e-05,
828
- "loss": 3.8674,
829
- "step": 1170
830
- },
831
- {
832
- "epoch": 1.0252146505814586,
833
- "grad_norm": 0.03515625,
834
- "learning_rate": 3.062080078550145e-05,
835
- "loss": 3.8918,
836
- "step": 1180
837
- },
838
- {
839
- "epoch": 1.0339093576785132,
840
- "grad_norm": 0.034423828125,
841
- "learning_rate": 3.0461552657308462e-05,
842
- "loss": 3.8896,
843
- "step": 1190
844
- },
845
- {
846
- "epoch": 1.0426040647755679,
847
- "grad_norm": 0.0341796875,
848
- "learning_rate": 3.0301386142370883e-05,
849
- "loss": 3.8919,
850
- "step": 1200
851
- },
852
- {
853
- "epoch": 1.0512987718726226,
854
- "grad_norm": 0.036376953125,
855
- "learning_rate": 3.0140315301202386e-05,
856
- "loss": 3.8883,
857
- "step": 1210
858
- },
859
- {
860
- "epoch": 1.0599934789696772,
861
- "grad_norm": 0.038330078125,
862
- "learning_rate": 2.9978354273704604e-05,
863
- "loss": 3.8872,
864
- "step": 1220
865
- },
866
- {
867
- "epoch": 1.0686881860667319,
868
- "grad_norm": 0.03515625,
869
- "learning_rate": 2.981551727792582e-05,
870
- "loss": 3.8726,
871
- "step": 1230
872
- },
873
- {
874
- "epoch": 1.0773828931637865,
875
- "grad_norm": 0.03369140625,
876
- "learning_rate": 2.9651818608812807e-05,
877
- "loss": 3.9143,
878
- "step": 1240
879
- },
880
- {
881
- "epoch": 1.0860776002608412,
882
- "grad_norm": 0.033447265625,
883
- "learning_rate": 2.948727263695594e-05,
884
- "loss": 3.8643,
885
- "step": 1250
886
- },
887
- {
888
- "epoch": 1.0947723073578959,
889
- "grad_norm": 0.035888671875,
890
- "learning_rate": 2.9321893807327613e-05,
891
- "loss": 3.8943,
892
- "step": 1260
893
- },
894
- {
895
- "epoch": 1.1034670144549505,
896
- "grad_norm": 0.03515625,
897
- "learning_rate": 2.9155696638014196e-05,
898
- "loss": 3.893,
899
- "step": 1270
900
- },
901
- {
902
- "epoch": 1.1121617215520052,
903
- "grad_norm": 0.036865234375,
904
- "learning_rate": 2.8988695718941503e-05,
905
- "loss": 3.8933,
906
- "step": 1280
907
- },
908
- {
909
- "epoch": 1.1208564286490599,
910
- "grad_norm": 0.035400390625,
911
- "learning_rate": 2.8820905710594033e-05,
912
- "loss": 3.8639,
913
- "step": 1290
914
- },
915
- {
916
- "epoch": 1.1295511357461145,
917
- "grad_norm": 0.034912109375,
918
- "learning_rate": 2.8652341342727924e-05,
919
- "loss": 3.9006,
920
- "step": 1300
921
- },
922
- {
923
- "epoch": 1.1382458428431692,
924
- "grad_norm": 0.035400390625,
925
- "learning_rate": 2.84830174130779e-05,
926
- "loss": 3.8713,
927
- "step": 1310
928
- },
929
- {
930
- "epoch": 1.1469405499402239,
931
- "grad_norm": 0.0341796875,
932
- "learning_rate": 2.8312948786058226e-05,
933
- "loss": 3.8979,
934
- "step": 1320
935
- },
936
- {
937
- "epoch": 1.1556352570372785,
938
- "grad_norm": 0.0341796875,
939
- "learning_rate": 2.81421503914578e-05,
940
- "loss": 3.8656,
941
- "step": 1330
942
- },
943
- {
944
- "epoch": 1.1643299641343332,
945
- "grad_norm": 0.0361328125,
946
- "learning_rate": 2.7970637223129523e-05,
947
- "loss": 3.8645,
948
- "step": 1340
949
- },
950
- {
951
- "epoch": 1.1730246712313879,
952
- "grad_norm": 0.03564453125,
953
- "learning_rate": 2.7798424337674018e-05,
954
- "loss": 3.8833,
955
- "step": 1350
956
- },
957
- {
958
- "epoch": 1.1817193783284425,
959
- "grad_norm": 0.033935546875,
960
- "learning_rate": 2.762552685311788e-05,
961
- "loss": 3.8935,
962
- "step": 1360
963
- },
964
- {
965
- "epoch": 1.1904140854254972,
966
- "grad_norm": 0.035888671875,
967
- "learning_rate": 2.7451959947586487e-05,
968
- "loss": 3.8626,
969
- "step": 1370
970
- },
971
- {
972
- "epoch": 1.1991087925225519,
973
  "grad_norm": 0.035400390625,
974
- "learning_rate": 2.7277738857971604e-05,
975
- "loss": 3.8667,
976
- "step": 1380
977
- },
978
- {
979
- "epoch": 1.2078034996196065,
980
- "grad_norm": 0.033203125,
981
- "learning_rate": 2.7102878878593718e-05,
982
- "loss": 3.8976,
983
- "step": 1390
984
- },
985
- {
986
- "epoch": 1.2164982067166612,
987
- "grad_norm": 0.03662109375,
988
- "learning_rate": 2.6927395359859466e-05,
989
- "loss": 3.8638,
990
- "step": 1400
991
- },
992
- {
993
- "epoch": 1.2251929138137159,
994
- "grad_norm": 0.036376953125,
995
- "learning_rate": 2.6751303706914024e-05,
996
- "loss": 3.8755,
997
- "step": 1410
998
- },
999
- {
1000
- "epoch": 1.2338876209107705,
1001
- "grad_norm": 0.035888671875,
1002
- "learning_rate": 2.6574619378288756e-05,
1003
- "loss": 3.8552,
1004
- "step": 1420
1005
- },
1006
- {
1007
- "epoch": 1.2425823280078252,
1008
- "grad_norm": 0.03564453125,
1009
- "learning_rate": 2.639735788454417e-05,
1010
- "loss": 3.8723,
1011
- "step": 1430
1012
- },
1013
- {
1014
- "epoch": 1.2512770351048799,
1015
- "grad_norm": 0.03662109375,
1016
- "learning_rate": 2.621953478690826e-05,
1017
- "loss": 3.8719,
1018
- "step": 1440
1019
- },
1020
- {
1021
- "epoch": 1.2599717422019345,
1022
- "grad_norm": 0.03564453125,
1023
- "learning_rate": 2.6041165695910474e-05,
1024
- "loss": 3.8929,
1025
- "step": 1450
1026
- },
1027
- {
1028
- "epoch": 1.2686664492989892,
1029
- "grad_norm": 0.0341796875,
1030
- "learning_rate": 2.586226627001128e-05,
1031
- "loss": 3.8787,
1032
- "step": 1460
1033
- },
1034
- {
1035
- "epoch": 1.2773611563960439,
1036
- "grad_norm": 0.0361328125,
1037
- "learning_rate": 2.5682852214227595e-05,
1038
- "loss": 3.8795,
1039
- "step": 1470
1040
- },
1041
- {
1042
- "epoch": 1.2860558634930985,
1043
- "grad_norm": 0.036376953125,
1044
- "learning_rate": 2.5502939278754056e-05,
1045
- "loss": 3.8857,
1046
- "step": 1480
1047
- },
1048
- {
1049
- "epoch": 1.2947505705901532,
1050
- "grad_norm": 0.033935546875,
1051
- "learning_rate": 2.532254325758039e-05,
1052
- "loss": 3.8849,
1053
- "step": 1490
1054
- },
1055
- {
1056
- "epoch": 1.3034452776872079,
1057
- "grad_norm": 0.033935546875,
1058
- "learning_rate": 2.5141679987104874e-05,
1059
- "loss": 3.8941,
1060
- "step": 1500
1061
- },
1062
- {
1063
- "epoch": 1.3121399847842625,
1064
- "grad_norm": 0.033935546875,
1065
- "learning_rate": 2.496036534474416e-05,
1066
- "loss": 3.9017,
1067
- "step": 1510
1068
- },
1069
- {
1070
- "epoch": 1.3208346918813172,
1071
- "grad_norm": 0.037841796875,
1072
- "learning_rate": 2.4778615247539393e-05,
1073
- "loss": 3.9013,
1074
- "step": 1520
1075
- },
1076
- {
1077
- "epoch": 1.3295293989783719,
1078
- "grad_norm": 0.03564453125,
1079
- "learning_rate": 2.459644565075893e-05,
1080
- "loss": 3.8722,
1081
- "step": 1530
1082
- },
1083
- {
1084
- "epoch": 1.3382241060754265,
1085
- "grad_norm": 0.03466796875,
1086
- "learning_rate": 2.441387254649767e-05,
1087
- "loss": 3.875,
1088
- "step": 1540
1089
- },
1090
- {
1091
- "epoch": 1.3469188131724812,
1092
- "grad_norm": 0.03466796875,
1093
- "learning_rate": 2.4230911962273174e-05,
1094
- "loss": 3.8724,
1095
- "step": 1550
1096
- },
1097
- {
1098
- "epoch": 1.3556135202695359,
1099
- "grad_norm": 0.0361328125,
1100
- "learning_rate": 2.404757995961865e-05,
1101
- "loss": 3.8942,
1102
- "step": 1560
1103
- },
1104
- {
1105
- "epoch": 1.3643082273665905,
1106
- "grad_norm": 0.035888671875,
1107
- "learning_rate": 2.3863892632672946e-05,
1108
- "loss": 3.8788,
1109
- "step": 1570
1110
- },
1111
- {
1112
- "epoch": 1.3730029344636452,
1113
- "grad_norm": 0.036865234375,
1114
- "learning_rate": 2.3679866106767717e-05,
1115
- "loss": 3.8805,
1116
- "step": 1580
1117
- },
1118
- {
1119
- "epoch": 1.3816976415606999,
1120
- "grad_norm": 0.035400390625,
1121
- "learning_rate": 2.3495516537011817e-05,
1122
- "loss": 3.8724,
1123
- "step": 1590
1124
- },
1125
- {
1126
- "epoch": 1.3903923486577545,
1127
- "grad_norm": 0.0341796875,
1128
- "learning_rate": 2.331086010687312e-05,
1129
- "loss": 3.8605,
1130
- "step": 1600
1131
- },
1132
- {
1133
- "epoch": 1.3990870557548092,
1134
- "grad_norm": 0.035888671875,
1135
- "learning_rate": 2.3125913026757795e-05,
1136
- "loss": 3.8631,
1137
- "step": 1610
1138
- },
1139
- {
1140
- "epoch": 1.4077817628518638,
1141
- "grad_norm": 0.033935546875,
1142
- "learning_rate": 2.2940691532587254e-05,
1143
- "loss": 3.8697,
1144
- "step": 1620
1145
- },
1146
- {
1147
- "epoch": 1.4164764699489185,
1148
- "grad_norm": 0.0361328125,
1149
- "learning_rate": 2.275521188437286e-05,
1150
- "loss": 3.8924,
1151
- "step": 1630
1152
- },
1153
- {
1154
- "epoch": 1.4251711770459732,
1155
- "grad_norm": 0.03515625,
1156
- "learning_rate": 2.25694903647885e-05,
1157
- "loss": 3.8735,
1158
- "step": 1640
1159
- },
1160
- {
1161
- "epoch": 1.433865884143028,
1162
- "grad_norm": 0.036865234375,
1163
- "learning_rate": 2.2383543277741203e-05,
1164
- "loss": 3.8695,
1165
- "step": 1650
1166
- },
1167
- {
1168
- "epoch": 1.4425605912400825,
1169
- "grad_norm": 0.0341796875,
1170
- "learning_rate": 2.219738694693982e-05,
1171
- "loss": 3.8616,
1172
- "step": 1660
1173
- },
1174
- {
1175
- "epoch": 1.4512552983371374,
1176
- "grad_norm": 0.034912109375,
1177
- "learning_rate": 2.2011037714462097e-05,
1178
- "loss": 3.8746,
1179
- "step": 1670
1180
- },
1181
- {
1182
- "epoch": 1.4599500054341918,
1183
- "grad_norm": 0.03564453125,
1184
- "learning_rate": 2.1824511939319973e-05,
1185
- "loss": 3.9004,
1186
- "step": 1680
1187
- },
1188
- {
1189
- "epoch": 1.4686447125312467,
1190
- "grad_norm": 0.036376953125,
1191
- "learning_rate": 2.1637825996023532e-05,
1192
- "loss": 3.8916,
1193
- "step": 1690
1194
- },
1195
- {
1196
- "epoch": 1.4773394196283012,
1197
- "grad_norm": 0.03515625,
1198
- "learning_rate": 2.1450996273143505e-05,
1199
- "loss": 3.8843,
1200
- "step": 1700
1201
- },
1202
- {
1203
- "epoch": 1.486034126725356,
1204
- "grad_norm": 0.035400390625,
1205
- "learning_rate": 2.1264039171872595e-05,
1206
- "loss": 3.8682,
1207
- "step": 1710
1208
- },
1209
- {
1210
- "epoch": 1.4947288338224105,
1211
- "grad_norm": 0.03515625,
1212
- "learning_rate": 2.1076971104585627e-05,
1213
- "loss": 3.8677,
1214
- "step": 1720
1215
- },
1216
- {
1217
- "epoch": 1.5034235409194654,
1218
- "grad_norm": 0.035400390625,
1219
- "learning_rate": 2.0889808493398806e-05,
1220
- "loss": 3.8765,
1221
- "step": 1730
1222
- },
1223
- {
1224
- "epoch": 1.5121182480165198,
1225
- "grad_norm": 0.03662109375,
1226
- "learning_rate": 2.0702567768728038e-05,
1227
- "loss": 3.8712,
1228
- "step": 1740
1229
- },
1230
- {
1231
- "epoch": 1.5208129551135747,
1232
- "grad_norm": 0.03564453125,
1233
- "learning_rate": 2.051526536784656e-05,
1234
- "loss": 3.8496,
1235
- "step": 1750
1236
- },
1237
- {
1238
- "epoch": 1.5295076622106292,
1239
- "grad_norm": 0.03564453125,
1240
- "learning_rate": 2.0327917733441992e-05,
1241
- "loss": 3.8995,
1242
- "step": 1760
1243
- },
1244
- {
1245
- "epoch": 1.538202369307684,
1246
- "grad_norm": 0.035400390625,
1247
- "learning_rate": 2.0140541312172827e-05,
1248
- "loss": 3.8507,
1249
- "step": 1770
1250
- },
1251
- {
1252
- "epoch": 1.5468970764047385,
1253
- "grad_norm": 0.03662109375,
1254
- "learning_rate": 1.99531525532247e-05,
1255
- "loss": 3.868,
1256
- "step": 1780
1257
- },
1258
- {
1259
- "epoch": 1.5555917835017934,
1260
- "grad_norm": 0.0361328125,
1261
- "learning_rate": 1.9765767906866317e-05,
1262
- "loss": 3.8813,
1263
- "step": 1790
1264
- },
1265
- {
1266
- "epoch": 1.5642864905988478,
1267
- "grad_norm": 0.03515625,
1268
- "learning_rate": 1.9578403823005347e-05,
1269
- "loss": 3.8748,
1270
- "step": 1800
1271
- },
1272
- {
1273
- "epoch": 1.5729811976959027,
1274
- "grad_norm": 0.035888671875,
1275
- "learning_rate": 1.9391076749744358e-05,
1276
- "loss": 3.8824,
1277
- "step": 1810
1278
- },
1279
- {
1280
- "epoch": 1.5816759047929572,
1281
- "grad_norm": 0.03564453125,
1282
- "learning_rate": 1.9203803131936856e-05,
1283
- "loss": 3.8608,
1284
- "step": 1820
1285
- },
1286
- {
1287
- "epoch": 1.590370611890012,
1288
- "grad_norm": 0.035888671875,
1289
- "learning_rate": 1.901659940974368e-05,
1290
- "loss": 3.881,
1291
- "step": 1830
1292
- },
1293
- {
1294
- "epoch": 1.5990653189870665,
1295
- "grad_norm": 0.03515625,
1296
- "learning_rate": 1.8829482017189728e-05,
1297
- "loss": 3.872,
1298
- "step": 1840
1299
- },
1300
- {
1301
- "epoch": 1.6077600260841214,
1302
- "grad_norm": 0.036376953125,
1303
- "learning_rate": 1.864246738072132e-05,
1304
- "loss": 3.8695,
1305
- "step": 1850
1306
- },
1307
- {
1308
- "epoch": 1.6164547331811758,
1309
- "grad_norm": 0.03515625,
1310
- "learning_rate": 1.8455571917764116e-05,
1311
- "loss": 3.8826,
1312
- "step": 1860
1313
- },
1314
- {
1315
- "epoch": 1.6251494402782307,
1316
- "grad_norm": 0.035888671875,
1317
- "learning_rate": 1.8268812035281944e-05,
1318
- "loss": 3.8739,
1319
- "step": 1870
1320
- },
1321
- {
1322
- "epoch": 1.6338441473752852,
1323
- "grad_norm": 0.03515625,
1324
- "learning_rate": 1.8082204128336403e-05,
1325
- "loss": 3.8691,
1326
- "step": 1880
1327
- },
1328
- {
1329
- "epoch": 1.64253885447234,
1330
- "grad_norm": 0.03466796875,
1331
- "learning_rate": 1.789576457864768e-05,
1332
- "loss": 3.8715,
1333
- "step": 1890
1334
- },
1335
- {
1336
- "epoch": 1.6512335615693945,
1337
- "grad_norm": 0.03466796875,
1338
- "learning_rate": 1.770950975315638e-05,
1339
- "loss": 3.8714,
1340
- "step": 1900
1341
- },
1342
- {
1343
- "epoch": 1.6599282686664494,
1344
- "grad_norm": 0.03759765625,
1345
- "learning_rate": 1.7523456002586773e-05,
1346
- "loss": 3.8646,
1347
- "step": 1910
1348
- },
1349
- {
1350
- "epoch": 1.6686229757635038,
1351
- "grad_norm": 0.033935546875,
1352
- "learning_rate": 1.733761966001138e-05,
1353
- "loss": 3.8885,
1354
- "step": 1920
1355
- },
1356
- {
1357
- "epoch": 1.6773176828605587,
1358
- "grad_norm": 0.033935546875,
1359
- "learning_rate": 1.7152017039417174e-05,
1360
- "loss": 3.881,
1361
- "step": 1930
1362
- },
1363
- {
1364
- "epoch": 1.6860123899576132,
1365
- "grad_norm": 0.0361328125,
1366
- "learning_rate": 1.69666644342734e-05,
1367
- "loss": 3.8763,
1368
- "step": 1940
1369
- },
1370
- {
1371
- "epoch": 1.694707097054668,
1372
- "grad_norm": 0.036865234375,
1373
- "learning_rate": 1.678157811610124e-05,
1374
- "loss": 3.9071,
1375
- "step": 1950
1376
- },
1377
- {
1378
- "epoch": 1.7034018041517225,
1379
- "grad_norm": 0.03564453125,
1380
- "learning_rate": 1.6596774333045368e-05,
1381
- "loss": 3.8678,
1382
- "step": 1960
1383
- },
1384
- {
1385
- "epoch": 1.7120965112487774,
1386
- "grad_norm": 0.034912109375,
1387
- "learning_rate": 1.641226930844762e-05,
1388
- "loss": 3.8607,
1389
- "step": 1970
1390
- },
1391
- {
1392
- "epoch": 1.7207912183458318,
1393
- "grad_norm": 0.035400390625,
1394
- "learning_rate": 1.622807923942274e-05,
1395
- "loss": 3.8679,
1396
- "step": 1980
1397
- },
1398
- {
1399
- "epoch": 1.7294859254428867,
1400
- "grad_norm": 0.03662109375,
1401
- "learning_rate": 1.6044220295436533e-05,
1402
- "loss": 3.8905,
1403
- "step": 1990
1404
- },
1405
- {
1406
- "epoch": 1.7381806325399412,
1407
- "grad_norm": 0.037841796875,
1408
- "learning_rate": 1.586070861688636e-05,
1409
- "loss": 3.8872,
1410
- "step": 2000
1411
- },
1412
- {
1413
- "epoch": 1.746875339636996,
1414
- "grad_norm": 0.03564453125,
1415
- "learning_rate": 1.5677560313684267e-05,
1416
- "loss": 3.8655,
1417
- "step": 2010
1418
- },
1419
- {
1420
- "epoch": 1.7555700467340505,
1421
- "grad_norm": 0.0341796875,
1422
- "learning_rate": 1.5494791463842707e-05,
1423
- "loss": 3.8652,
1424
- "step": 2020
1425
- },
1426
- {
1427
- "epoch": 1.7642647538311054,
1428
- "grad_norm": 0.0341796875,
1429
- "learning_rate": 1.531241811206313e-05,
1430
- "loss": 3.8559,
1431
- "step": 2030
1432
- },
1433
- {
1434
- "epoch": 1.77295946092816,
1435
- "grad_norm": 0.035400390625,
1436
- "learning_rate": 1.5130456268327439e-05,
1437
- "loss": 3.9012,
1438
- "step": 2040
1439
- },
1440
- {
1441
- "epoch": 1.7816541680252147,
1442
- "grad_norm": 0.03369140625,
1443
- "learning_rate": 1.4948921906492566e-05,
1444
- "loss": 3.8698,
1445
- "step": 2050
1446
- },
1447
- {
1448
- "epoch": 1.7903488751222694,
1449
- "grad_norm": 0.03466796875,
1450
- "learning_rate": 1.4767830962888128e-05,
1451
- "loss": 3.8875,
1452
- "step": 2060
1453
- },
1454
- {
1455
- "epoch": 1.799043582219324,
1456
- "grad_norm": 0.03564453125,
1457
- "learning_rate": 1.4587199334917485e-05,
1458
- "loss": 3.8823,
1459
- "step": 2070
1460
- },
1461
- {
1462
- "epoch": 1.8077382893163787,
1463
- "grad_norm": 0.034423828125,
1464
- "learning_rate": 1.4407042879662086e-05,
1465
- "loss": 3.8793,
1466
- "step": 2080
1467
- },
1468
- {
1469
- "epoch": 1.8164329964134334,
1470
- "grad_norm": 0.035888671875,
1471
- "learning_rate": 1.4227377412489505e-05,
1472
- "loss": 3.8899,
1473
- "step": 2090
1474
- },
1475
- {
1476
- "epoch": 1.825127703510488,
1477
- "grad_norm": 0.03515625,
1478
- "learning_rate": 1.4048218705664983e-05,
1479
- "loss": 3.866,
1480
- "step": 2100
1481
- },
1482
- {
1483
- "epoch": 1.8338224106075427,
1484
- "grad_norm": 0.035400390625,
1485
- "learning_rate": 1.3869582486966891e-05,
1486
- "loss": 3.888,
1487
- "step": 2110
1488
- },
1489
- {
1490
- "epoch": 1.8425171177045974,
1491
- "grad_norm": 0.03515625,
1492
- "learning_rate": 1.3691484438306e-05,
1493
- "loss": 3.8963,
1494
- "step": 2120
1495
- },
1496
- {
1497
- "epoch": 1.851211824801652,
1498
- "grad_norm": 0.036376953125,
1499
- "learning_rate": 1.3513940194348844e-05,
1500
- "loss": 3.8881,
1501
- "step": 2130
1502
- },
1503
- {
1504
- "epoch": 1.8599065318987067,
1505
- "grad_norm": 0.036376953125,
1506
- "learning_rate": 1.333696534114517e-05,
1507
- "loss": 3.8843,
1508
- "step": 2140
1509
- },
1510
- {
1511
- "epoch": 1.8686012389957614,
1512
- "grad_norm": 0.03564453125,
1513
- "learning_rate": 1.3160575414759716e-05,
1514
- "loss": 3.8938,
1515
- "step": 2150
1516
- },
1517
- {
1518
- "epoch": 1.877295946092816,
1519
- "grad_norm": 0.035888671875,
1520
- "learning_rate": 1.298478589990833e-05,
1521
- "loss": 3.8784,
1522
- "step": 2160
1523
- },
1524
- {
1525
- "epoch": 1.8859906531898707,
1526
- "grad_norm": 0.035400390625,
1527
- "learning_rate": 1.2809612228598635e-05,
1528
- "loss": 3.8603,
1529
- "step": 2170
1530
- },
1531
- {
1532
- "epoch": 1.8946853602869254,
1533
- "grad_norm": 0.034912109375,
1534
- "learning_rate": 1.2635069778775276e-05,
1535
- "loss": 3.8714,
1536
- "step": 2180
1537
- },
1538
- {
1539
- "epoch": 1.90338006738398,
1540
- "grad_norm": 0.035400390625,
1541
- "learning_rate": 1.2461173872969963e-05,
1542
- "loss": 3.8824,
1543
- "step": 2190
1544
- },
1545
- {
1546
- "epoch": 1.9120747744810347,
1547
- "grad_norm": 0.03466796875,
1548
- "learning_rate": 1.2287939776956334e-05,
1549
- "loss": 3.8788,
1550
- "step": 2200
1551
- },
1552
- {
1553
- "epoch": 1.9207694815780894,
1554
- "grad_norm": 0.036376953125,
1555
- "learning_rate": 1.211538269840985e-05,
1556
- "loss": 3.8888,
1557
- "step": 2210
1558
- },
1559
- {
1560
- "epoch": 1.929464188675144,
1561
- "grad_norm": 0.03369140625,
1562
- "learning_rate": 1.1943517785572714e-05,
1563
- "loss": 3.8742,
1564
- "step": 2220
1565
- },
1566
- {
1567
- "epoch": 1.9381588957721987,
1568
- "grad_norm": 0.03662109375,
1569
- "learning_rate": 1.1772360125924102e-05,
1570
- "loss": 3.9104,
1571
- "step": 2230
1572
- },
1573
- {
1574
- "epoch": 1.9468536028692534,
1575
- "grad_norm": 0.03564453125,
1576
- "learning_rate": 1.1601924744855655e-05,
1577
- "loss": 3.87,
1578
- "step": 2240
1579
- },
1580
- {
1581
- "epoch": 1.955548309966308,
1582
- "grad_norm": 0.035400390625,
1583
- "learning_rate": 1.1432226604352463e-05,
1584
- "loss": 3.8769,
1585
- "step": 2250
1586
- },
1587
- {
1588
- "epoch": 1.9642430170633627,
1589
- "grad_norm": 0.03564453125,
1590
- "learning_rate": 1.1263280601679567e-05,
1591
- "loss": 3.8673,
1592
- "step": 2260
1593
- },
1594
- {
1595
- "epoch": 1.9729377241604173,
1596
- "grad_norm": 0.03515625,
1597
- "learning_rate": 1.1095101568074216e-05,
1598
- "loss": 3.8609,
1599
- "step": 2270
1600
- },
1601
- {
1602
- "epoch": 1.981632431257472,
1603
- "grad_norm": 0.03466796875,
1604
- "learning_rate": 1.092770426744386e-05,
1605
- "loss": 3.8815,
1606
- "step": 2280
1607
- },
1608
- {
1609
- "epoch": 1.9903271383545267,
1610
- "grad_norm": 0.035888671875,
1611
- "learning_rate": 1.0761103395070074e-05,
1612
- "loss": 3.8784,
1613
- "step": 2290
1614
- },
1615
- {
1616
- "epoch": 1.9990218454515813,
1617
- "grad_norm": 0.035400390625,
1618
- "learning_rate": 1.059531357631849e-05,
1619
- "loss": 3.8979,
1620
- "step": 2300
1621
- },
1622
- {
1623
- "epoch": 2.0069557656776436,
1624
- "grad_norm": 0.035400390625,
1625
- "learning_rate": 1.0430349365354922e-05,
1626
- "loss": 3.8533,
1627
- "step": 2310
1628
- },
1629
- {
1630
- "epoch": 2.0156504727746984,
1631
- "grad_norm": 0.03369140625,
1632
- "learning_rate": 1.0266225243867662e-05,
1633
- "loss": 3.8844,
1634
- "step": 2320
1635
- },
1636
- {
1637
- "epoch": 2.024345179871753,
1638
- "grad_norm": 0.03515625,
1639
- "learning_rate": 1.0102955619796208e-05,
1640
- "loss": 3.873,
1641
- "step": 2330
1642
- },
1643
- {
1644
- "epoch": 2.0330398869688078,
1645
- "grad_norm": 0.03466796875,
1646
- "learning_rate": 9.9405548260664e-06,
1647
- "loss": 3.8934,
1648
- "step": 2340
1649
- },
1650
- {
1651
- "epoch": 2.041734594065862,
1652
- "grad_norm": 0.03515625,
1653
- "learning_rate": 9.779037119332223e-06,
1654
- "loss": 3.8802,
1655
- "step": 2350
1656
- },
1657
- {
1658
- "epoch": 2.050429301162917,
1659
- "grad_norm": 0.036865234375,
1660
- "learning_rate": 9.618416678724205e-06,
1661
- "loss": 3.889,
1662
- "step": 2360
1663
- },
1664
- {
1665
- "epoch": 2.0591240082599715,
1666
- "grad_norm": 0.0361328125,
1667
- "learning_rate": 9.458707604604764e-06,
1668
- "loss": 3.8743,
1669
- "step": 2370
1670
- },
1671
- {
1672
- "epoch": 2.0678187153570264,
1673
- "grad_norm": 0.035400390625,
1674
- "learning_rate": 9.299923917330265e-06,
1675
- "loss": 3.8706,
1676
- "step": 2380
1677
- },
1678
- {
1679
- "epoch": 2.076513422454081,
1680
- "grad_norm": 0.03466796875,
1681
- "learning_rate": 9.14207955602032e-06,
1682
- "loss": 3.8572,
1683
- "step": 2390
1684
- },
1685
- {
1686
- "epoch": 2.0852081295511358,
1687
- "grad_norm": 0.03369140625,
1688
- "learning_rate": 8.985188377334044e-06,
1689
- "loss": 3.8926,
1690
- "step": 2400
1691
- },
1692
- {
1693
- "epoch": 2.09390283664819,
1694
- "grad_norm": 0.03515625,
1695
- "learning_rate": 8.829264154253673e-06,
1696
- "loss": 3.8978,
1697
- "step": 2410
1698
- },
1699
- {
1700
- "epoch": 2.102597543745245,
1701
- "grad_norm": 0.0341796875,
1702
- "learning_rate": 8.674320574875456e-06,
1703
- "loss": 3.8852,
1704
- "step": 2420
1705
- },
1706
- {
1707
- "epoch": 2.1112922508422995,
1708
- "grad_norm": 0.03564453125,
1709
- "learning_rate": 8.520371241208019e-06,
1710
- "loss": 3.8864,
1711
- "step": 2430
1712
- },
1713
- {
1714
- "epoch": 2.1199869579393544,
1715
- "grad_norm": 0.035400390625,
1716
- "learning_rate": 8.367429667978275e-06,
1717
- "loss": 3.8833,
1718
- "step": 2440
1719
- },
1720
- {
1721
- "epoch": 2.1286816650364093,
1722
- "grad_norm": 0.036865234375,
1723
- "learning_rate": 8.215509281445043e-06,
1724
- "loss": 3.8611,
1725
- "step": 2450
1726
- },
1727
- {
1728
- "epoch": 2.1373763721334638,
1729
- "grad_norm": 0.03564453125,
1730
- "learning_rate": 8.06462341822037e-06,
1731
- "loss": 3.8443,
1732
- "step": 2460
1733
- },
1734
- {
1735
- "epoch": 2.146071079230518,
1736
- "grad_norm": 0.035400390625,
1737
- "learning_rate": 7.914785324098775e-06,
1738
- "loss": 3.8879,
1739
- "step": 2470
1740
- },
1741
- {
1742
- "epoch": 2.154765786327573,
1743
- "grad_norm": 0.034423828125,
1744
- "learning_rate": 7.7660081528944e-06,
1745
- "loss": 3.8596,
1746
- "step": 2480
1747
- },
1748
- {
1749
- "epoch": 2.163460493424628,
1750
- "grad_norm": 0.03515625,
1751
- "learning_rate": 7.618304965286334e-06,
1752
- "loss": 3.8491,
1753
- "step": 2490
1754
- },
1755
- {
1756
- "epoch": 2.1721552005216824,
1757
- "grad_norm": 0.037109375,
1758
- "learning_rate": 7.471688727672022e-06,
1759
- "loss": 3.901,
1760
- "step": 2500
1761
- },
1762
- {
1763
- "epoch": 2.180849907618737,
1764
- "grad_norm": 0.0390625,
1765
- "learning_rate": 7.326172311028996e-06,
1766
- "loss": 3.865,
1767
- "step": 2510
1768
- },
1769
- {
1770
- "epoch": 2.1895446147157918,
1771
- "grad_norm": 0.03515625,
1772
- "learning_rate": 7.181768489784957e-06,
1773
- "loss": 3.8544,
1774
- "step": 2520
1775
- },
1776
- {
1777
- "epoch": 2.1982393218128466,
1778
- "grad_norm": 0.035400390625,
1779
- "learning_rate": 7.038489940696375e-06,
1780
- "loss": 3.8794,
1781
- "step": 2530
1782
- },
1783
- {
1784
- "epoch": 2.206934028909901,
1785
- "grad_norm": 0.03662109375,
1786
- "learning_rate": 6.8963492417356224e-06,
1787
- "loss": 3.863,
1788
- "step": 2540
1789
- },
1790
- {
1791
- "epoch": 2.2156287360069555,
1792
- "grad_norm": 0.03515625,
1793
- "learning_rate": 6.755358870986797e-06,
1794
- "loss": 3.8727,
1795
- "step": 2550
1796
- },
1797
- {
1798
- "epoch": 2.2243234431040104,
1799
- "grad_norm": 0.03515625,
1800
- "learning_rate": 6.615531205550288e-06,
1801
- "loss": 3.8702,
1802
- "step": 2560
1803
- },
1804
- {
1805
- "epoch": 2.2330181502010653,
1806
- "grad_norm": 0.0361328125,
1807
- "learning_rate": 6.476878520456278e-06,
1808
- "loss": 3.8891,
1809
- "step": 2570
1810
- },
1811
- {
1812
- "epoch": 2.2417128572981198,
1813
- "grad_norm": 0.0341796875,
1814
- "learning_rate": 6.339412987587088e-06,
1815
- "loss": 3.8554,
1816
- "step": 2580
1817
- },
1818
- {
1819
- "epoch": 2.250407564395174,
1820
- "grad_norm": 0.034423828125,
1821
- "learning_rate": 6.203146674608742e-06,
1822
- "loss": 3.8889,
1823
- "step": 2590
1824
- },
1825
- {
1826
- "epoch": 2.259102271492229,
1827
- "grad_norm": 0.03466796875,
1828
- "learning_rate": 6.068091543911472e-06,
1829
- "loss": 3.875,
1830
- "step": 2600
1831
- },
1832
- {
1833
- "epoch": 2.267796978589284,
1834
- "grad_norm": 0.033203125,
1835
- "learning_rate": 5.9342594515596716e-06,
1836
- "loss": 3.8745,
1837
- "step": 2610
1838
- },
1839
- {
1840
- "epoch": 2.2764916856863384,
1841
- "grad_norm": 0.035888671875,
1842
- "learning_rate": 5.8016621462510305e-06,
1843
- "loss": 3.9004,
1844
- "step": 2620
1845
- },
1846
- {
1847
- "epoch": 2.285186392783393,
1848
- "grad_norm": 0.033935546875,
1849
- "learning_rate": 5.670311268285183e-06,
1850
- "loss": 3.8669,
1851
- "step": 2630
1852
- },
1853
- {
1854
- "epoch": 2.2938810998804477,
1855
- "grad_norm": 0.0361328125,
1856
- "learning_rate": 5.540218348541837e-06,
1857
- "loss": 3.8654,
1858
- "step": 2640
1859
- },
1860
- {
1861
- "epoch": 2.3025758069775026,
1862
- "grad_norm": 0.0341796875,
1863
- "learning_rate": 5.411394807468513e-06,
1864
- "loss": 3.871,
1865
- "step": 2650
1866
- },
1867
- {
1868
- "epoch": 2.311270514074557,
1869
- "grad_norm": 0.03515625,
1870
- "learning_rate": 5.283851954077961e-06,
1871
- "loss": 3.8598,
1872
- "step": 2660
1873
- },
1874
- {
1875
- "epoch": 2.319965221171612,
1876
- "grad_norm": 0.035400390625,
1877
- "learning_rate": 5.157600984955413e-06,
1878
- "loss": 3.8763,
1879
- "step": 2670
1880
- },
1881
- {
1882
- "epoch": 2.3286599282686664,
1883
- "grad_norm": 0.034912109375,
1884
- "learning_rate": 5.032652983275645e-06,
1885
- "loss": 3.8473,
1886
- "step": 2680
1887
- },
1888
- {
1889
- "epoch": 2.3373546353657213,
1890
- "grad_norm": 0.03662109375,
1891
- "learning_rate": 4.909018917830035e-06,
1892
- "loss": 3.861,
1893
- "step": 2690
1894
- },
1895
- {
1896
- "epoch": 2.3460493424627757,
1897
- "grad_norm": 0.03564453125,
1898
- "learning_rate": 4.786709642063618e-06,
1899
- "loss": 3.8676,
1900
- "step": 2700
1901
- },
1902
- {
1903
- "epoch": 2.3547440495598306,
1904
- "grad_norm": 0.035888671875,
1905
- "learning_rate": 4.665735893122338e-06,
1906
- "loss": 3.858,
1907
- "step": 2710
1908
- },
1909
- {
1910
- "epoch": 2.363438756656885,
1911
- "grad_norm": 0.033935546875,
1912
- "learning_rate": 4.546108290910438e-06,
1913
- "loss": 3.8869,
1914
- "step": 2720
1915
- },
1916
- {
1917
- "epoch": 2.37213346375394,
1918
- "grad_norm": 0.036376953125,
1919
- "learning_rate": 4.427837337158187e-06,
1920
- "loss": 3.882,
1921
- "step": 2730
1922
- },
1923
- {
1924
- "epoch": 2.3808281708509944,
1925
- "grad_norm": 0.0341796875,
1926
- "learning_rate": 4.310933414499949e-06,
1927
- "loss": 3.8742,
1928
- "step": 2740
1929
- },
1930
- {
1931
- "epoch": 2.3895228779480493,
1932
- "grad_norm": 0.035888671875,
1933
- "learning_rate": 4.1954067855627476e-06,
1934
- "loss": 3.858,
1935
- "step": 2750
1936
- },
1937
- {
1938
- "epoch": 2.3982175850451037,
1939
- "grad_norm": 0.035888671875,
1940
- "learning_rate": 4.0812675920653236e-06,
1941
- "loss": 3.8501,
1942
- "step": 2760
1943
- },
1944
- {
1945
- "epoch": 2.4069122921421586,
1946
- "grad_norm": 0.03662109375,
1947
- "learning_rate": 3.968525853927841e-06,
1948
- "loss": 3.8677,
1949
- "step": 2770
1950
- },
1951
- {
1952
- "epoch": 2.415606999239213,
1953
- "grad_norm": 0.035400390625,
1954
- "learning_rate": 3.857191468392241e-06,
1955
- "loss": 3.8632,
1956
- "step": 2780
1957
- },
1958
- {
1959
- "epoch": 2.424301706336268,
1960
- "grad_norm": 0.03564453125,
1961
- "learning_rate": 3.747274209153442e-06,
1962
- "loss": 3.8909,
1963
- "step": 2790
1964
- },
1965
- {
1966
- "epoch": 2.4329964134333224,
1967
- "grad_norm": 0.036376953125,
1968
- "learning_rate": 3.6387837255012783e-06,
1969
- "loss": 3.8596,
1970
- "step": 2800
1971
- },
1972
- {
1973
- "epoch": 2.4416911205303773,
1974
- "grad_norm": 0.034423828125,
1975
- "learning_rate": 3.5317295414734875e-06,
1976
- "loss": 3.8785,
1977
- "step": 2810
1978
- },
1979
- {
1980
- "epoch": 2.4503858276274317,
1981
- "grad_norm": 0.035400390625,
1982
- "learning_rate": 3.426121055019556e-06,
1983
- "loss": 3.876,
1984
- "step": 2820
1985
- },
1986
- {
1987
- "epoch": 2.4590805347244866,
1988
- "grad_norm": 0.03466796875,
1989
- "learning_rate": 3.321967537175759e-06,
1990
- "loss": 3.8893,
1991
- "step": 2830
1992
- },
1993
- {
1994
- "epoch": 2.467775241821541,
1995
- "grad_norm": 0.035400390625,
1996
- "learning_rate": 3.21927813125124e-06,
1997
- "loss": 3.8601,
1998
- "step": 2840
1999
- },
2000
- {
2001
- "epoch": 2.476469948918596,
2002
- "grad_norm": 0.03515625,
2003
- "learning_rate": 3.1180618520253893e-06,
2004
- "loss": 3.8608,
2005
- "step": 2850
2006
- },
2007
- {
2008
- "epoch": 2.4851646560156504,
2009
- "grad_norm": 0.03515625,
2010
- "learning_rate": 3.0183275849564396e-06,
2011
- "loss": 3.8894,
2012
- "step": 2860
2013
- },
2014
- {
2015
- "epoch": 2.4938593631127053,
2016
- "grad_norm": 0.03515625,
2017
- "learning_rate": 2.9200840854014466e-06,
2018
- "loss": 3.8522,
2019
- "step": 2870
2020
- },
2021
- {
2022
- "epoch": 2.5025540702097597,
2023
- "grad_norm": 0.03515625,
2024
- "learning_rate": 2.8233399778476744e-06,
2025
- "loss": 3.8801,
2026
- "step": 2880
2027
- },
2028
- {
2029
- "epoch": 2.5112487773068146,
2030
- "grad_norm": 0.0361328125,
2031
- "learning_rate": 2.7281037551555e-06,
2032
- "loss": 3.8751,
2033
- "step": 2890
2034
- },
2035
- {
2036
- "epoch": 2.519943484403869,
2037
- "grad_norm": 0.03564453125,
2038
- "learning_rate": 2.6343837778128366e-06,
2039
- "loss": 3.863,
2040
- "step": 2900
2041
- },
2042
- {
2043
- "epoch": 2.528638191500924,
2044
- "grad_norm": 0.03515625,
2045
- "learning_rate": 2.5421882732011937e-06,
2046
- "loss": 3.8768,
2047
- "step": 2910
2048
- },
2049
- {
2050
- "epoch": 2.5373328985979784,
2051
- "grad_norm": 0.0361328125,
2052
- "learning_rate": 2.4515253348734192e-06,
2053
- "loss": 3.911,
2054
- "step": 2920
2055
- },
2056
- {
2057
- "epoch": 2.5460276056950333,
2058
- "grad_norm": 0.034912109375,
2059
- "learning_rate": 2.362402921843201e-06,
2060
- "loss": 3.8806,
2061
- "step": 2930
2062
- },
2063
- {
2064
- "epoch": 2.5547223127920877,
2065
- "grad_norm": 0.034912109375,
2066
- "learning_rate": 2.2748288578863664e-06,
2067
- "loss": 3.8654,
2068
- "step": 2940
2069
- },
2070
- {
2071
- "epoch": 2.5634170198891426,
2072
- "grad_norm": 0.035888671875,
2073
- "learning_rate": 2.188810830854058e-06,
2074
- "loss": 3.8697,
2075
- "step": 2950
2076
- },
2077
- {
2078
- "epoch": 2.572111726986197,
2079
- "grad_norm": 0.034423828125,
2080
- "learning_rate": 2.104356391997826e-06,
2081
- "loss": 3.8804,
2082
- "step": 2960
2083
- },
2084
- {
2085
- "epoch": 2.580806434083252,
2086
- "grad_norm": 0.035888671875,
2087
- "learning_rate": 2.02147295530676e-06,
2088
- "loss": 3.8637,
2089
- "step": 2970
2090
- },
2091
- {
2092
- "epoch": 2.5895011411803064,
2093
- "grad_norm": 0.03564453125,
2094
- "learning_rate": 1.940167796856598e-06,
2095
- "loss": 3.8623,
2096
- "step": 2980
2097
- },
2098
- {
2099
- "epoch": 2.5981958482773613,
2100
- "grad_norm": 0.03564453125,
2101
- "learning_rate": 1.8604480541710357e-06,
2102
- "loss": 3.8593,
2103
- "step": 2990
2104
- },
2105
- {
2106
- "epoch": 2.6068905553744157,
2107
- "grad_norm": 0.0341796875,
2108
- "learning_rate": 1.7823207255950792e-06,
2109
- "loss": 3.8711,
2110
- "step": 3000
2111
- },
2112
- {
2113
- "epoch": 2.6155852624714706,
2114
- "grad_norm": 0.035888671875,
2115
- "learning_rate": 1.7057926696807437e-06,
2116
- "loss": 3.8767,
2117
- "step": 3010
2118
- },
2119
- {
2120
- "epoch": 2.624279969568525,
2121
- "grad_norm": 0.036376953125,
2122
- "learning_rate": 1.6308706045849154e-06,
2123
- "loss": 3.8813,
2124
- "step": 3020
2125
- },
2126
- {
2127
- "epoch": 2.63297467666558,
2128
- "grad_norm": 0.034912109375,
2129
- "learning_rate": 1.5575611074796326e-06,
2130
- "loss": 3.8709,
2131
- "step": 3030
2132
- },
2133
- {
2134
- "epoch": 2.6416693837626344,
2135
- "grad_norm": 0.0361328125,
2136
- "learning_rate": 1.4858706139746449e-06,
2137
- "loss": 3.8536,
2138
- "step": 3040
2139
- },
2140
- {
2141
- "epoch": 2.6503640908596893,
2142
- "grad_norm": 0.03515625,
2143
- "learning_rate": 1.415805417552496e-06,
2144
- "loss": 3.8704,
2145
- "step": 3050
2146
- },
2147
- {
2148
- "epoch": 2.6590587979567437,
2149
- "grad_norm": 0.03662109375,
2150
- "learning_rate": 1.3473716690160022e-06,
2151
- "loss": 3.8812,
2152
- "step": 3060
2153
- },
2154
- {
2155
- "epoch": 2.6677535050537986,
2156
- "grad_norm": 0.03515625,
2157
- "learning_rate": 1.2805753759483296e-06,
2158
- "loss": 3.8587,
2159
- "step": 3070
2160
- },
2161
- {
2162
- "epoch": 2.676448212150853,
2163
- "grad_norm": 0.03515625,
2164
- "learning_rate": 1.2154224021855733e-06,
2165
- "loss": 3.855,
2166
- "step": 3080
2167
- },
2168
- {
2169
- "epoch": 2.685142919247908,
2170
- "grad_norm": 0.0361328125,
2171
- "learning_rate": 1.1519184673020156e-06,
2172
- "loss": 3.892,
2173
- "step": 3090
2174
- },
2175
- {
2176
- "epoch": 2.6938376263449624,
2177
- "grad_norm": 0.032958984375,
2178
- "learning_rate": 1.0900691461080016e-06,
2179
- "loss": 3.8608,
2180
- "step": 3100
2181
- },
2182
- {
2183
- "epoch": 2.7025323334420173,
2184
- "grad_norm": 0.0341796875,
2185
- "learning_rate": 1.0298798681605592e-06,
2186
- "loss": 3.8663,
2187
- "step": 3110
2188
- },
2189
- {
2190
- "epoch": 2.7112270405390717,
2191
- "grad_norm": 0.034423828125,
2192
- "learning_rate": 9.713559172867492e-07,
2193
- "loss": 3.8662,
2194
- "step": 3120
2195
- },
2196
- {
2197
- "epoch": 2.7199217476361266,
2198
- "grad_norm": 0.035400390625,
2199
- "learning_rate": 9.145024311198214e-07,
2200
- "loss": 3.8786,
2201
- "step": 3130
2202
- },
2203
- {
2204
- "epoch": 2.728616454733181,
2205
- "grad_norm": 0.03662109375,
2206
- "learning_rate": 8.593244006481805e-07,
2207
- "loss": 3.8842,
2208
- "step": 3140
2209
- },
2210
- {
2211
- "epoch": 2.737311161830236,
2212
- "grad_norm": 0.035400390625,
2213
- "learning_rate": 8.058266697772654e-07,
2214
- "loss": 3.8759,
2215
- "step": 3150
2216
- },
2217
- {
2218
- "epoch": 2.7460058689272904,
2219
- "grad_norm": 0.036865234375,
2220
- "learning_rate": 7.54013934904303e-07,
2221
- "loss": 3.8871,
2222
- "step": 3160
2223
- },
2224
- {
2225
- "epoch": 2.7547005760243453,
2226
- "grad_norm": 0.0361328125,
2227
- "learning_rate": 7.038907445060328e-07,
2228
- "loss": 3.8612,
2229
- "step": 3170
2230
- },
2231
- {
2232
- "epoch": 2.7633952831213997,
2233
- "grad_norm": 0.03466796875,
2234
- "learning_rate": 6.55461498739407e-07,
2235
- "loss": 3.8801,
2236
- "step": 3180
2237
- },
2238
- {
2239
- "epoch": 2.7720899902184546,
2240
- "grad_norm": 0.035400390625,
2241
- "learning_rate": 6.087304490553148e-07,
2242
- "loss": 3.8663,
2243
- "step": 3190
2244
- },
2245
- {
2246
- "epoch": 2.780784697315509,
2247
- "grad_norm": 0.03759765625,
2248
- "learning_rate": 5.637016978253607e-07,
2249
- "loss": 3.8537,
2250
- "step": 3200
2251
- },
2252
- {
2253
- "epoch": 2.789479404412564,
2254
- "grad_norm": 0.03564453125,
2255
- "learning_rate": 5.203791979817441e-07,
2256
- "loss": 3.8857,
2257
- "step": 3210
2258
- },
2259
- {
2260
- "epoch": 2.7981741115096184,
2261
- "grad_norm": 0.034423828125,
2262
- "learning_rate": 4.787667526702122e-07,
2263
- "loss": 3.8555,
2264
- "step": 3220
2265
- },
2266
- {
2267
- "epoch": 2.8068688186066733,
2268
- "grad_norm": 0.03662109375,
2269
- "learning_rate": 4.3886801491622946e-07,
2270
- "loss": 3.9065,
2271
- "step": 3230
2272
- },
2273
- {
2274
- "epoch": 2.8155635257037277,
2275
- "grad_norm": 0.03515625,
2276
- "learning_rate": 4.0068648730426264e-07,
2277
- "loss": 3.8765,
2278
- "step": 3240
2279
- },
2280
- {
2281
- "epoch": 2.8242582328007826,
2282
- "grad_norm": 0.03466796875,
2283
- "learning_rate": 3.642255216703161e-07,
2284
- "loss": 3.8651,
2285
- "step": 3250
2286
- },
2287
- {
2288
- "epoch": 2.832952939897837,
2289
- "grad_norm": 0.034912109375,
2290
- "learning_rate": 3.2948831880767806e-07,
2291
- "loss": 3.8236,
2292
- "step": 3260
2293
- },
2294
- {
2295
- "epoch": 2.841647646994892,
2296
- "grad_norm": 0.037353515625,
2297
- "learning_rate": 2.9647792818593646e-07,
2298
- "loss": 3.8867,
2299
- "step": 3270
2300
- },
2301
- {
2302
- "epoch": 2.8503423540919464,
2303
- "grad_norm": 0.0361328125,
2304
- "learning_rate": 2.651972476832709e-07,
2305
- "loss": 3.8663,
2306
- "step": 3280
2307
- },
2308
- {
2309
- "epoch": 2.8590370611890012,
2310
- "grad_norm": 0.0361328125,
2311
- "learning_rate": 2.3564902333205851e-07,
2312
- "loss": 3.8985,
2313
- "step": 3290
2314
- },
2315
- {
2316
- "epoch": 2.867731768286056,
2317
- "grad_norm": 0.035888671875,
2318
- "learning_rate": 2.078358490778154e-07,
2319
- "loss": 3.8638,
2320
- "step": 3300
2321
- },
2322
- {
2323
- "epoch": 2.8764264753831106,
2324
- "grad_norm": 0.036865234375,
2325
- "learning_rate": 1.8176016655147454e-07,
2326
- "loss": 3.8679,
2327
- "step": 3310
2328
- },
2329
- {
2330
- "epoch": 2.885121182480165,
2331
- "grad_norm": 0.035400390625,
2332
- "learning_rate": 1.5742426485503725e-07,
2333
- "loss": 3.8655,
2334
- "step": 3320
2335
- },
2336
- {
2337
- "epoch": 2.89381588957722,
2338
- "grad_norm": 0.03515625,
2339
- "learning_rate": 1.3483028036063829e-07,
2340
- "loss": 3.8847,
2341
- "step": 3330
2342
- },
2343
- {
2344
- "epoch": 2.902510596674275,
2345
- "grad_norm": 0.034423828125,
2346
- "learning_rate": 1.1398019652298254e-07,
2347
- "loss": 3.8767,
2348
- "step": 3340
2349
- },
2350
- {
2351
- "epoch": 2.9112053037713292,
2352
- "grad_norm": 0.03369140625,
2353
- "learning_rate": 9.487584370523772e-08,
2354
- "loss": 3.8769,
2355
- "step": 3350
2356
- },
2357
- {
2358
- "epoch": 2.9199000108683837,
2359
- "grad_norm": 0.034912109375,
2360
- "learning_rate": 7.751889901833842e-08,
2361
- "loss": 3.8587,
2362
- "step": 3360
2363
- },
2364
- {
2365
- "epoch": 2.9285947179654386,
2366
- "grad_norm": 0.03564453125,
2367
- "learning_rate": 6.191088617376828e-08,
2368
- "loss": 3.8742,
2369
- "step": 3370
2370
- },
2371
- {
2372
- "epoch": 2.9372894250624935,
2373
- "grad_norm": 0.03466796875,
2374
- "learning_rate": 4.805317534980036e-08,
2375
- "loss": 3.8757,
2376
- "step": 3380
2377
- },
2378
- {
2379
- "epoch": 2.945984132159548,
2380
- "grad_norm": 0.03466796875,
2381
- "learning_rate": 3.5946983071200037e-08,
2382
- "loss": 3.8686,
2383
- "step": 3390
2384
- },
2385
- {
2386
- "epoch": 2.9546788392566024,
2387
- "grad_norm": 0.03466796875,
2388
- "learning_rate": 2.5593372102443724e-08,
2389
- "loss": 3.8705,
2390
- "step": 3400
2391
- },
2392
- {
2393
- "epoch": 2.9633735463536572,
2394
- "grad_norm": 0.034912109375,
2395
- "learning_rate": 1.6993251354415763e-08,
2396
- "loss": 3.8715,
2397
- "step": 3410
2398
- },
2399
- {
2400
- "epoch": 2.972068253450712,
2401
- "grad_norm": 0.034423828125,
2402
- "learning_rate": 1.0147375804614446e-08,
2403
- "loss": 3.8607,
2404
- "step": 3420
2405
- },
2406
- {
2407
- "epoch": 2.9807629605477666,
2408
- "grad_norm": 0.03564453125,
2409
- "learning_rate": 5.05634643088948e-09,
2410
- "loss": 3.8656,
2411
- "step": 3430
2412
- },
2413
- {
2414
- "epoch": 2.989457667644821,
2415
- "grad_norm": 0.035400390625,
2416
- "learning_rate": 1.7206101586664248e-09,
2417
- "loss": 3.8606,
2418
- "step": 3440
2419
- },
2420
- {
2421
- "epoch": 2.998152374741876,
2422
- "grad_norm": 0.034912109375,
2423
- "learning_rate": 1.4045982171806416e-10,
2424
- "loss": 3.8691,
2425
- "step": 3450
2426
  }
2427
  ],
2428
  "logging_steps": 10,
2429
- "max_steps": 3453,
2430
  "num_input_tokens_seen": 0,
2431
- "num_train_epochs": 3,
2432
  "save_steps": 157,
2433
  "stateful_callbacks": {
2434
  "TrainerControl": {
@@ -2442,7 +832,7 @@
2442
  "attributes": {}
2443
  }
2444
  },
2445
- "total_flos": 1.5697437216944947e+19,
2446
  "train_batch_size": 2,
2447
  "trial_name": null,
2448
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 1151,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.008694707097054668,
14
+ "grad_norm": 0.1513671875,
15
+ "learning_rate": 9.000000000000001e-07,
16
+ "loss": 4.1809,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.017389414194109335,
21
+ "grad_norm": 0.1630859375,
22
+ "learning_rate": 1.9000000000000002e-06,
23
+ "loss": 4.214,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.026084121291164004,
28
+ "grad_norm": 0.158203125,
29
+ "learning_rate": 2.9e-06,
30
+ "loss": 4.1983,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.03477882838821867,
35
+ "grad_norm": 0.1376953125,
36
+ "learning_rate": 3.900000000000001e-06,
37
+ "loss": 4.2097,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.04347353548527334,
42
+ "grad_norm": 0.1494140625,
43
+ "learning_rate": 4.9000000000000005e-06,
44
+ "loss": 4.1934,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.05216824258232801,
49
+ "grad_norm": 0.130859375,
50
+ "learning_rate": 5.9e-06,
51
+ "loss": 4.1777,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.060862949679382675,
56
+ "grad_norm": 0.09228515625,
57
+ "learning_rate": 6.9e-06,
58
+ "loss": 4.1902,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.06955765677643734,
63
+ "grad_norm": 0.11181640625,
64
+ "learning_rate": 7.9e-06,
65
+ "loss": 4.1578,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.078252363873492,
70
+ "grad_norm": 0.10546875,
71
+ "learning_rate": 8.900000000000001e-06,
72
+ "loss": 4.1555,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.08694707097054669,
77
+ "grad_norm": 0.07421875,
78
+ "learning_rate": 9.9e-06,
79
+ "loss": 4.1578,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.09564177806760135,
84
+ "grad_norm": 0.059326171875,
85
+ "learning_rate": 9.998190772635151e-06,
86
+ "loss": 4.1387,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.10433648516465602,
91
+ "grad_norm": 0.05712890625,
92
+ "learning_rate": 9.991938334301789e-06,
93
+ "loss": 4.121,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.11303119226171068,
98
+ "grad_norm": 0.04931640625,
99
+ "learning_rate": 9.981225933663634e-06,
100
+ "loss": 4.0872,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.12172589935876535,
105
+ "grad_norm": 0.05029296875,
106
+ "learning_rate": 9.966063141532634e-06,
107
+ "loss": 4.1091,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.13042060645582002,
112
+ "grad_norm": 0.04541015625,
113
+ "learning_rate": 9.946463504847235e-06,
114
+ "loss": 4.0941,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.13911531355287468,
119
+ "grad_norm": 0.041748046875,
120
+ "learning_rate": 9.9224445345691e-06,
121
+ "loss": 4.0769,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.14781002064992935,
126
+ "grad_norm": 0.044189453125,
127
+ "learning_rate": 9.894027690038244e-06,
128
+ "loss": 4.0895,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.156504727746984,
133
+ "grad_norm": 0.042724609375,
134
+ "learning_rate": 9.861238359800543e-06,
135
+ "loss": 4.0772,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.16519943484403868,
140
+ "grad_norm": 0.043212890625,
141
+ "learning_rate": 9.824105838924784e-06,
142
+ "loss": 4.0843,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 0.17389414194109337,
147
+ "grad_norm": 0.040283203125,
148
+ "learning_rate": 9.782663302829467e-06,
149
+ "loss": 4.0945,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 0.18258884903814804,
154
+ "grad_norm": 0.0419921875,
155
+ "learning_rate": 9.736947777642809e-06,
156
+ "loss": 4.0721,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.1912835561352027,
161
+ "grad_norm": 0.039794921875,
162
+ "learning_rate": 9.687000107122367e-06,
163
+ "loss": 4.0701,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.19997826323225737,
168
+ "grad_norm": 0.041259765625,
169
+ "learning_rate": 9.632864916163886e-06,
170
+ "loss": 4.0938,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.20867297032931204,
175
+ "grad_norm": 0.038330078125,
176
+ "learning_rate": 9.574590570931944e-06,
177
+ "loss": 4.0663,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.2173676774263667,
182
+ "grad_norm": 0.03857421875,
183
+ "learning_rate": 9.512229135648023e-06,
184
+ "loss": 4.0691,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.22606238452342137,
189
+ "grad_norm": 0.039306640625,
190
+ "learning_rate": 9.445836326074625e-06,
191
+ "loss": 4.0786,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.23475709162047603,
196
+ "grad_norm": 0.039794921875,
197
+ "learning_rate": 9.37547145973696e-06,
198
+ "loss": 4.0746,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.2434517987175307,
203
+ "grad_norm": 0.03955078125,
204
+ "learning_rate": 9.301197402926726e-06,
205
+ "loss": 4.0718,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.25214650581458536,
210
+ "grad_norm": 0.041259765625,
211
+ "learning_rate": 9.223080514535277e-06,
212
+ "loss": 4.0605,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.26084121291164003,
217
+ "grad_norm": 0.03955078125,
218
+ "learning_rate": 9.141190586766418e-06,
219
+ "loss": 4.0513,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.2695359200086947,
224
+ "grad_norm": 0.041259765625,
225
+ "learning_rate": 9.055600782781738e-06,
226
+ "loss": 4.0529,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.27823062710574936,
231
+ "grad_norm": 0.037841796875,
232
+ "learning_rate": 8.96638757133423e-06,
233
+ "loss": 4.0448,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.286925334202804,
238
+ "grad_norm": 0.041015625,
239
+ "learning_rate": 8.873630658448586e-06,
240
+ "loss": 4.042,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.2956200412998587,
245
+ "grad_norm": 0.03857421875,
246
+ "learning_rate": 8.777412916209214e-06,
247
+ "loss": 4.0662,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.30431474839691336,
252
+ "grad_norm": 0.03857421875,
253
+ "learning_rate": 8.677820308719572e-06,
254
+ "loss": 4.065,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.313009455493968,
259
+ "grad_norm": 0.0361328125,
260
+ "learning_rate": 8.574941815299012e-06,
261
+ "loss": 4.0534,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.3217041625910227,
266
+ "grad_norm": 0.039306640625,
267
+ "learning_rate": 8.468869350985725e-06,
268
+ "loss": 4.0515,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.33039886968807736,
273
+ "grad_norm": 0.037353515625,
274
+ "learning_rate": 8.359697684416805e-06,
275
+ "loss": 4.0615,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.3390935767851321,
280
+ "grad_norm": 0.036865234375,
281
+ "learning_rate": 8.247524353158836e-06,
282
+ "loss": 4.038,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.34778828388218674,
287
+ "grad_norm": 0.037109375,
288
+ "learning_rate": 8.132449576564603e-06,
289
+ "loss": 4.0643,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 0.3564829909792414,
294
+ "grad_norm": 0.036376953125,
295
+ "learning_rate": 8.014576166233823e-06,
296
+ "loss": 4.0567,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.3651776980762961,
301
+ "grad_norm": 0.038818359375,
302
+ "learning_rate": 7.894009434157873e-06,
303
+ "loss": 4.0569,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 0.37387240517335074,
308
+ "grad_norm": 0.037841796875,
309
+ "learning_rate": 7.77085709863058e-06,
310
+ "loss": 4.0561,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 0.3825671122704054,
315
+ "grad_norm": 0.035400390625,
316
+ "learning_rate": 7.645229188009153e-06,
317
+ "loss": 4.0437,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 0.3912618193674601,
322
+ "grad_norm": 0.037109375,
323
+ "learning_rate": 7.517237942411213e-06,
324
+ "loss": 4.0419,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 0.39995652646451474,
329
+ "grad_norm": 0.036376953125,
330
+ "learning_rate": 7.386997713435774e-06,
331
+ "loss": 4.0653,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 0.4086512335615694,
336
+ "grad_norm": 0.035888671875,
337
+ "learning_rate": 7.254624861997754e-06,
338
+ "loss": 4.0415,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 0.41734594065862407,
343
+ "grad_norm": 0.03662109375,
344
+ "learning_rate": 7.120237654367301e-06,
345
+ "loss": 4.0362,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 0.42604064775567874,
350
+ "grad_norm": 0.038818359375,
351
+ "learning_rate": 6.983956156506798e-06,
352
+ "loss": 4.0543,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 0.4347353548527334,
357
+ "grad_norm": 0.03759765625,
358
+ "learning_rate": 6.845902126799981e-06,
359
+ "loss": 4.0623,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.44343006194978807,
364
+ "grad_norm": 0.035400390625,
365
+ "learning_rate": 6.706198907268986e-06,
366
+ "loss": 4.0398,
367
  "step": 510
368
  },
369
  {
370
  "epoch": 0.45212476904684273,
371
+ "grad_norm": 0.035888671875,
372
+ "learning_rate": 6.5649713133765115e-06,
373
+ "loss": 4.0346,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 0.4608194761438974,
378
+ "grad_norm": 0.036865234375,
379
+ "learning_rate": 6.422345522511575e-06,
380
+ "loss": 4.0574,
381
  "step": 530
382
  },
383
  {
384
  "epoch": 0.46951418324095207,
385
+ "grad_norm": 0.036865234375,
386
+ "learning_rate": 6.2784489612584695e-06,
387
+ "loss": 4.0289,
388
  "step": 540
389
  },
390
  {
391
  "epoch": 0.47820889033800673,
392
+ "grad_norm": 0.03564453125,
393
+ "learning_rate": 6.133410191549658e-06,
394
+ "loss": 4.0427,
395
  "step": 550
396
  },
397
  {
398
  "epoch": 0.4869035974350614,
399
+ "grad_norm": 0.0361328125,
400
+ "learning_rate": 5.987358795804294e-06,
401
+ "loss": 4.0587,
402
  "step": 560
403
  },
404
  {
405
  "epoch": 0.49559830453211606,
406
+ "grad_norm": 0.036376953125,
407
+ "learning_rate": 5.840425261155022e-06,
408
+ "loss": 4.0301,
409
  "step": 570
410
  },
411
  {
412
  "epoch": 0.5042930116291707,
413
+ "grad_norm": 0.0380859375,
414
+ "learning_rate": 5.692740862866472e-06,
415
+ "loss": 4.0396,
416
  "step": 580
417
  },
418
  {
419
  "epoch": 0.5129877187262254,
420
+ "grad_norm": 0.03759765625,
421
+ "learning_rate": 5.544437547049608e-06,
422
+ "loss": 4.0401,
423
  "step": 590
424
  },
425
  {
426
  "epoch": 0.5216824258232801,
427
  "grad_norm": 0.03662109375,
428
+ "learning_rate": 5.3956478127767155e-06,
429
+ "loss": 4.0432,
430
  "step": 600
431
  },
432
  {
433
  "epoch": 0.5303771329203347,
434
+ "grad_norm": 0.03662109375,
435
+ "learning_rate": 5.2465045937023704e-06,
436
+ "loss": 4.0351,
437
  "step": 610
438
  },
439
  {
440
  "epoch": 0.5390718400173894,
441
+ "grad_norm": 0.0361328125,
442
+ "learning_rate": 5.097141139296129e-06,
443
+ "loss": 4.0482,
444
  "step": 620
445
  },
446
  {
447
  "epoch": 0.5477665471144441,
448
+ "grad_norm": 0.035888671875,
449
+ "learning_rate": 4.947690895793049e-06,
450
+ "loss": 4.044,
451
  "step": 630
452
  },
453
  {
454
  "epoch": 0.5564612542114987,
455
+ "grad_norm": 0.0400390625,
456
+ "learning_rate": 4.7982873869684315e-06,
457
+ "loss": 4.0323,
458
  "step": 640
459
  },
460
  {
461
  "epoch": 0.5651559613085534,
462
+ "grad_norm": 0.034912109375,
463
+ "learning_rate": 4.649064094843274e-06,
464
+ "loss": 4.0172,
465
  "step": 650
466
  },
467
  {
468
  "epoch": 0.573850668405608,
469
+ "grad_norm": 0.03564453125,
470
+ "learning_rate": 4.500154340427037e-06,
471
+ "loss": 4.0479,
472
  "step": 660
473
  },
474
  {
475
  "epoch": 0.5825453755026627,
476
+ "grad_norm": 0.036376953125,
477
+ "learning_rate": 4.351691164604247e-06,
478
+ "loss": 4.0412,
479
  "step": 670
480
  },
481
  {
482
  "epoch": 0.5912400825997174,
483
+ "grad_norm": 0.035888671875,
484
+ "learning_rate": 4.203807209271393e-06,
485
+ "loss": 4.0207,
486
  "step": 680
487
  },
488
  {
489
  "epoch": 0.599934789696772,
490
+ "grad_norm": 0.0361328125,
491
+ "learning_rate": 4.056634598830282e-06,
492
+ "loss": 4.0281,
493
  "step": 690
494
  },
495
  {
496
  "epoch": 0.6086294967938267,
497
+ "grad_norm": 0.03564453125,
498
+ "learning_rate": 3.910304822143734e-06,
499
+ "loss": 4.0252,
500
  "step": 700
501
  },
502
  {
503
  "epoch": 0.6173242038908814,
504
+ "grad_norm": 0.03466796875,
505
+ "learning_rate": 3.7649486150591115e-06,
506
+ "loss": 4.0236,
507
  "step": 710
508
  },
509
  {
510
  "epoch": 0.626018910987936,
511
+ "grad_norm": 0.0361328125,
512
+ "learning_rate": 3.6206958436045856e-06,
513
+ "loss": 4.0225,
514
  "step": 720
515
  },
516
  {
517
  "epoch": 0.6347136180849907,
518
+ "grad_norm": 0.036865234375,
519
+ "learning_rate": 3.4776753879625563e-06,
520
+ "loss": 4.0214,
521
  "step": 730
522
  },
523
  {
524
  "epoch": 0.6434083251820454,
525
+ "grad_norm": 0.0361328125,
526
+ "learning_rate": 3.3360150273238413e-06,
527
+ "loss": 4.0541,
528
  "step": 740
529
  },
530
  {
531
  "epoch": 0.6521030322791,
532
+ "grad_norm": 0.037109375,
533
+ "learning_rate": 3.1958413257255403e-06,
534
+ "loss": 4.0453,
535
  "step": 750
536
  },
537
  {
538
  "epoch": 0.6607977393761547,
539
  "grad_norm": 0.037109375,
540
+ "learning_rate": 3.057279518974544e-06,
541
+ "loss": 4.0308,
542
  "step": 760
543
  },
544
  {
545
  "epoch": 0.6694924464732094,
546
+ "grad_norm": 0.03515625,
547
+ "learning_rate": 2.9204534027577387e-06,
548
+ "loss": 4.0492,
549
  "step": 770
550
  },
551
  {
552
  "epoch": 0.6781871535702642,
553
+ "grad_norm": 0.041748046875,
554
+ "learning_rate": 2.7854852220388617e-06,
555
+ "loss": 4.0639,
556
  "step": 780
557
  },
558
  {
559
  "epoch": 0.6868818606673188,
560
+ "grad_norm": 0.035888671875,
561
+ "learning_rate": 2.6524955618408093e-06,
562
+ "loss": 4.0283,
563
  "step": 790
564
  },
565
  {
566
  "epoch": 0.6955765677643735,
567
+ "grad_norm": 0.035400390625,
568
+ "learning_rate": 2.521603239511011e-06,
569
+ "loss": 4.0418,
570
  "step": 800
571
  },
572
  {
573
  "epoch": 0.7042712748614282,
574
  "grad_norm": 0.035400390625,
575
+ "learning_rate": 2.3929251985660866e-06,
576
+ "loss": 4.0417,
577
  "step": 810
578
  },
579
  {
580
  "epoch": 0.7129659819584828,
581
+ "grad_norm": 0.03564453125,
582
+ "learning_rate": 2.2665764042106647e-06,
583
+ "loss": 4.0288,
584
  "step": 820
585
  },
586
  {
587
  "epoch": 0.7216606890555375,
588
+ "grad_norm": 0.035888671875,
589
+ "learning_rate": 2.142669740623661e-06,
590
+ "loss": 4.0501,
591
  "step": 830
592
  },
593
  {
594
  "epoch": 0.7303553961525922,
595
+ "grad_norm": 0.036376953125,
596
+ "learning_rate": 2.021315910103841e-06,
597
+ "loss": 4.0402,
598
  "step": 840
599
  },
600
  {
601
  "epoch": 0.7390501032496468,
602
+ "grad_norm": 0.03466796875,
603
+ "learning_rate": 1.9026233341647398e-06,
604
+ "loss": 4.0303,
605
  "step": 850
606
  },
607
  {
608
  "epoch": 0.7477448103467015,
609
+ "grad_norm": 0.036865234375,
610
+ "learning_rate": 1.786698056667297e-06,
611
+ "loss": 4.0461,
612
  "step": 860
613
  },
614
  {
615
  "epoch": 0.7564395174437561,
616
  "grad_norm": 0.038330078125,
617
+ "learning_rate": 1.6736436490767793e-06,
618
+ "loss": 4.0543,
619
  "step": 870
620
  },
621
  {
622
  "epoch": 0.7651342245408108,
623
+ "grad_norm": 0.035888671875,
624
+ "learning_rate": 1.5635611179286203e-06,
625
+ "loss": 4.0414,
626
  "step": 880
627
  },
628
  {
629
  "epoch": 0.7738289316378655,
630
  "grad_norm": 0.036865234375,
631
+ "learning_rate": 1.4565488145858497e-06,
632
+ "loss": 4.028,
633
  "step": 890
634
  },
635
  {
636
  "epoch": 0.7825236387349201,
637
+ "grad_norm": 0.035888671875,
638
+ "learning_rate": 1.3527023473687417e-06,
639
+ "loss": 4.0582,
640
  "step": 900
641
  },
642
  {
643
  "epoch": 0.7912183458319748,
644
+ "grad_norm": 0.037353515625,
645
+ "learning_rate": 1.2521144961351893e-06,
646
+ "loss": 4.0381,
647
  "step": 910
648
  },
649
  {
650
  "epoch": 0.7999130529290295,
651
+ "grad_norm": 0.0361328125,
652
+ "learning_rate": 1.154875129388126e-06,
653
+ "loss": 4.0082,
654
  "step": 920
655
  },
656
  {
657
  "epoch": 0.8086077600260841,
658
+ "grad_norm": 0.038330078125,
659
+ "learning_rate": 1.061071123984031e-06,
660
+ "loss": 4.0471,
661
  "step": 930
662
  },
663
  {
664
  "epoch": 0.8173024671231388,
665
+ "grad_norm": 0.035888671875,
666
+ "learning_rate": 9.707862875142898e-07,
667
+ "loss": 4.0324,
668
  "step": 940
669
  },
670
  {
671
  "epoch": 0.8259971742201935,
672
+ "grad_norm": 0.037841796875,
673
+ "learning_rate": 8.841012834287254e-07,
674
+ "loss": 4.0469,
675
  "step": 950
676
  },
677
  {
678
  "epoch": 0.8346918813172481,
679
+ "grad_norm": 0.035400390625,
680
+ "learning_rate": 8.010935589682134e-07,
681
+ "loss": 4.0267,
682
  "step": 960
683
  },
684
  {
685
  "epoch": 0.8433865884143028,
686
+ "grad_norm": 0.036376953125,
687
+ "learning_rate": 7.218372759707626e-07,
688
+ "loss": 4.0408,
689
  "step": 970
690
  },
691
  {
692
  "epoch": 0.8520812955113575,
693
+ "grad_norm": 0.035888671875,
694
+ "learning_rate": 6.464032446128837e-07,
695
+ "loss": 4.0647,
696
  "step": 980
697
  },
698
  {
699
  "epoch": 0.8607760026084121,
700
+ "grad_norm": 0.03466796875,
701
+ "learning_rate": 5.748588601454463e-07,
702
+ "loss": 4.0245,
703
  "step": 990
704
  },
705
  {
706
  "epoch": 0.8694707097054668,
707
+ "grad_norm": 0.03759765625,
708
+ "learning_rate": 5.072680426805332e-07,
709
+ "loss": 4.0502,
710
  "step": 1000
711
  },
712
  {
713
  "epoch": 0.8781654168025215,
714
+ "grad_norm": 0.037841796875,
715
+ "learning_rate": 4.436911800831084e-07,
716
+ "loss": 4.0439,
717
  "step": 1010
718
  },
719
  {
720
  "epoch": 0.8868601238995761,
721
+ "grad_norm": 0.036865234375,
722
+ "learning_rate": 3.841850740185088e-07,
723
+ "loss": 4.0625,
724
  "step": 1020
725
  },
726
  {
727
  "epoch": 0.8955548309966308,
728
+ "grad_norm": 0.03515625,
729
+ "learning_rate": 3.288028892039585e-07,
730
+ "loss": 4.0363,
731
  "step": 1030
732
  },
733
  {
734
  "epoch": 0.9042495380936855,
735
+ "grad_norm": 0.036865234375,
736
+ "learning_rate": 2.7759410590946446e-07,
737
+ "loss": 4.0429,
738
  "step": 1040
739
  },
740
  {
741
  "epoch": 0.9129442451907401,
742
+ "grad_norm": 0.0380859375,
743
+ "learning_rate": 2.306044757505055e-07,
744
+ "loss": 4.0474,
745
  "step": 1050
746
  },
747
  {
748
  "epoch": 0.9216389522877948,
749
+ "grad_norm": 0.037109375,
750
+ "learning_rate": 1.8787598081203285e-07,
751
+ "loss": 4.0502,
752
  "step": 1060
753
  },
754
  {
755
  "epoch": 0.9303336593848495,
756
+ "grad_norm": 0.036376953125,
757
+ "learning_rate": 1.4944679614029346e-07,
758
+ "loss": 4.038,
759
  "step": 1070
760
  },
761
  {
762
  "epoch": 0.9390283664819041,
763
+ "grad_norm": 0.0390625,
764
+ "learning_rate": 1.1535125563597927e-07,
765
+ "loss": 4.0239,
766
  "step": 1080
767
  },
768
  {
769
  "epoch": 0.9477230735789588,
770
+ "grad_norm": 0.036865234375,
771
+ "learning_rate": 8.561982137919478e-08,
772
+ "loss": 4.0352,
773
  "step": 1090
774
  },
775
  {
776
  "epoch": 0.9564177806760135,
777
+ "grad_norm": 0.03564453125,
778
+ "learning_rate": 6.02790564136263e-08,
779
+ "loss": 4.0371,
780
  "step": 1100
781
  },
782
  {
783
  "epoch": 0.9651124877730681,
784
+ "grad_norm": 0.03466796875,
785
+ "learning_rate": 3.935160101424251e-08,
786
+ "loss": 4.0525,
787
  "step": 1110
788
  },
789
  {
790
  "epoch": 0.9738071948701228,
791
+ "grad_norm": 0.034912109375,
792
+ "learning_rate": 2.285615245972983e-08,
793
+ "loss": 4.0236,
794
  "step": 1120
795
  },
796
  {
797
  "epoch": 0.9825019019671775,
798
+ "grad_norm": 0.035888671875,
799
+ "learning_rate": 1.0807448327723735e-08,
800
+ "loss": 4.0348,
801
  "step": 1130
802
  },
803
  {
804
  "epoch": 0.9911966090642321,
805
+ "grad_norm": 0.035400390625,
806
+ "learning_rate": 3.2162533277713125e-09,
807
+ "loss": 4.0388,
808
  "step": 1140
809
  },
810
  {
811
  "epoch": 0.9998913161612868,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
812
  "grad_norm": 0.035400390625,
813
+ "learning_rate": 8.93496837878205e-11,
814
+ "loss": 4.0638,
815
+ "step": 1150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
  }
817
  ],
818
  "logging_steps": 10,
819
+ "max_steps": 1151,
820
  "num_input_tokens_seen": 0,
821
+ "num_train_epochs": 1,
822
  "save_steps": 157,
823
  "stateful_callbacks": {
824
  "TrainerControl": {
 
832
  "attributes": {}
833
  }
834
  },
835
+ "total_flos": 5.234099929420399e+18,
836
  "train_batch_size": 2,
837
  "trial_name": null,
838
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26ebdf4bff438801370fc8e7baa4a985d5ad2a84ab4f08832492e4066609b96a
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:301d89f1700999978d1aab66fccdff9692ce49fe1df2946ff54d29752274abac
3
  size 5905