dq158 commited on
Commit
00dc058
1 Parent(s): a009eed

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -16,8 +16,8 @@
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
- "v",
20
- "q"
21
  ],
22
  "task_type": "SEQ_2_SEQ_LM"
23
  }
 
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
+ "q",
20
+ "v"
21
  ],
22
  "task_type": "SEQ_2_SEQ_LM"
23
  }
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d59a1bdda24e44f2444043aebd61cb910e5ddc3d63b85b414b042a835e9b5e5
3
  size 37789864
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:021b25b4ae02b4c67260af9aaa828a04ad34d01af99d0da4a565af18e7741c65
3
  size 37789864
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b8f327ba865f24dfea32c7122116687661ed960bb9cd3e50057b7cac4357fd7
3
  size 2622266
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f1f2c45e7d776d5a5ad5cc885d98e86d0067fec7d8d3681868cb22bdd658413
3
  size 2622266
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36a7109c13b0c541d186aea78a4bfc1b1fd2a88b673eb5bdc0ce4cb92eeeacf6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:705f5b8f17ce338386080ca0e9df02a5405f987bdc5acf7da6fc0a4db61ba5c6
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c005869a93a2083e266662b043cce087b86c03eccc6fd40eecb44f82b7664e2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:414072090aace5de1cfd92c248a20ab1aa35254978e9abbeb3d9bae9b3088281
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 3.0774757862091064,
3
- "best_model_checkpoint": "dq158/pingusPongus/checkpoint-80324",
4
- "epoch": 2.0,
5
  "eval_steps": 500,
6
- "global_step": 80324,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,1007 +11,850 @@
11
  {
12
  "epoch": 0.01,
13
  "learning_rate": 0.0001,
14
- "loss": 4.5056,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.02,
19
- "learning_rate": 9.9999957472774e-05,
20
- "loss": 3.8512,
21
  "step": 1000
22
  },
23
  {
24
- "epoch": 0.04,
25
- "learning_rate": 9.999982989116833e-05,
26
- "loss": 3.6498,
27
  "step": 1500
28
  },
29
  {
30
- "epoch": 0.05,
31
- "learning_rate": 9.999961725540003e-05,
32
- "loss": 3.6328,
33
  "step": 2000
34
  },
35
  {
36
- "epoch": 0.06,
37
- "learning_rate": 9.999931956583082e-05,
38
- "loss": 3.6162,
39
  "step": 2500
40
  },
41
  {
42
- "epoch": 0.07,
43
- "learning_rate": 9.999893682296706e-05,
44
- "loss": 3.6415,
45
  "step": 3000
46
  },
47
  {
48
- "epoch": 0.09,
49
- "learning_rate": 9.999846902745986e-05,
50
- "loss": 3.6273,
51
  "step": 3500
52
  },
53
  {
54
- "epoch": 0.1,
55
- "learning_rate": 9.999791618010498e-05,
56
- "loss": 3.5555,
57
  "step": 4000
58
  },
59
  {
60
- "epoch": 0.11,
61
- "learning_rate": 9.999727828184285e-05,
62
- "loss": 3.5176,
63
  "step": 4500
64
  },
65
  {
66
- "epoch": 0.12,
67
- "learning_rate": 9.99965553337586e-05,
68
- "loss": 3.4374,
69
  "step": 5000
70
  },
71
  {
72
- "epoch": 0.14,
73
- "learning_rate": 9.999574733708204e-05,
74
- "loss": 3.5565,
75
  "step": 5500
76
  },
77
  {
78
- "epoch": 0.15,
79
- "learning_rate": 9.999485429318763e-05,
80
- "loss": 3.4639,
81
  "step": 6000
82
  },
83
  {
84
- "epoch": 0.16,
85
- "learning_rate": 9.99938762035945e-05,
86
- "loss": 3.3833,
87
  "step": 6500
88
  },
89
  {
90
- "epoch": 0.17,
91
- "learning_rate": 9.999281306996651e-05,
92
- "loss": 3.4726,
93
  "step": 7000
94
  },
95
  {
96
- "epoch": 0.19,
97
- "learning_rate": 9.999166489411211e-05,
98
- "loss": 3.4132,
99
  "step": 7500
100
  },
101
  {
102
- "epoch": 0.2,
103
- "learning_rate": 9.999043167798448e-05,
104
- "loss": 3.3754,
105
  "step": 8000
106
  },
107
  {
108
- "epoch": 0.21,
109
- "learning_rate": 9.99891134236814e-05,
110
- "loss": 3.4333,
111
  "step": 8500
112
  },
113
  {
114
- "epoch": 0.22,
115
- "learning_rate": 9.998771013344535e-05,
116
- "loss": 3.3313,
117
  "step": 9000
118
  },
119
  {
120
- "epoch": 0.24,
121
- "learning_rate": 9.998622180966344e-05,
122
- "loss": 3.3758,
123
  "step": 9500
124
  },
125
  {
126
- "epoch": 0.25,
127
- "learning_rate": 9.998464845486746e-05,
128
- "loss": 3.3814,
129
  "step": 10000
130
  },
131
  {
132
- "epoch": 0.26,
133
- "learning_rate": 9.998299007173383e-05,
134
- "loss": 3.429,
135
  "step": 10500
136
  },
137
  {
138
- "epoch": 0.27,
139
- "learning_rate": 9.99812466630836e-05,
140
- "loss": 3.4547,
141
  "step": 11000
142
  },
143
  {
144
- "epoch": 0.29,
145
- "learning_rate": 9.997941823188243e-05,
146
- "loss": 3.4204,
147
  "step": 11500
148
  },
149
  {
150
- "epoch": 0.3,
151
- "learning_rate": 9.99775047812407e-05,
152
- "loss": 3.3807,
153
  "step": 12000
154
  },
155
  {
156
- "epoch": 0.31,
157
- "learning_rate": 9.997550631441332e-05,
158
- "loss": 3.3749,
159
  "step": 12500
160
  },
161
  {
162
- "epoch": 0.32,
163
- "learning_rate": 9.997342283479989e-05,
164
- "loss": 3.326,
165
  "step": 13000
166
  },
167
  {
168
- "epoch": 0.34,
169
- "learning_rate": 9.997125434594458e-05,
170
- "loss": 3.3626,
171
  "step": 13500
172
  },
173
  {
174
- "epoch": 0.35,
175
- "learning_rate": 9.996900085153617e-05,
176
- "loss": 3.304,
177
  "step": 14000
178
  },
179
  {
180
- "epoch": 0.36,
181
- "learning_rate": 9.996666235540808e-05,
182
- "loss": 3.3671,
183
  "step": 14500
184
  },
185
  {
186
- "epoch": 0.37,
187
- "learning_rate": 9.996423886153828e-05,
188
- "loss": 3.3667,
189
  "step": 15000
190
  },
191
  {
192
- "epoch": 0.39,
193
- "learning_rate": 9.996173037404934e-05,
194
- "loss": 3.321,
195
  "step": 15500
196
  },
197
  {
198
- "epoch": 0.4,
199
- "learning_rate": 9.995913689720844e-05,
200
- "loss": 3.2639,
201
  "step": 16000
202
  },
203
  {
204
- "epoch": 0.41,
205
- "learning_rate": 9.995645843542732e-05,
206
- "loss": 3.3529,
207
  "step": 16500
208
  },
209
  {
210
- "epoch": 0.42,
211
- "learning_rate": 9.995369499326228e-05,
212
- "loss": 3.2607,
213
  "step": 17000
214
  },
215
  {
216
- "epoch": 0.44,
217
- "learning_rate": 9.995084657541416e-05,
218
- "loss": 3.2858,
219
  "step": 17500
220
  },
221
  {
222
- "epoch": 0.45,
223
- "learning_rate": 9.994791318672838e-05,
224
- "loss": 3.3516,
225
  "step": 18000
226
  },
227
  {
228
- "epoch": 0.46,
229
- "learning_rate": 9.994489483219492e-05,
230
- "loss": 3.2944,
231
  "step": 18500
232
  },
233
  {
234
- "epoch": 0.47,
235
- "learning_rate": 9.994179151694824e-05,
236
- "loss": 3.278,
237
  "step": 19000
238
  },
239
  {
240
- "epoch": 0.49,
241
- "learning_rate": 9.993860324626737e-05,
242
- "loss": 3.2557,
243
  "step": 19500
244
  },
245
  {
246
- "epoch": 0.5,
247
- "learning_rate": 9.993533002557585e-05,
248
- "loss": 3.366,
249
  "step": 20000
250
  },
251
  {
252
- "epoch": 0.51,
253
- "learning_rate": 9.99319718604417e-05,
254
- "loss": 3.2581,
255
  "step": 20500
256
  },
257
  {
258
- "epoch": 0.52,
259
- "learning_rate": 9.992852875657746e-05,
260
- "loss": 3.3302,
261
  "step": 21000
262
  },
263
  {
264
- "epoch": 0.54,
265
- "learning_rate": 9.992500071984017e-05,
266
- "loss": 3.334,
267
  "step": 21500
268
  },
269
  {
270
- "epoch": 0.55,
271
- "learning_rate": 9.992138775623132e-05,
272
- "loss": 3.3146,
273
  "step": 22000
274
  },
275
  {
276
- "epoch": 0.56,
277
- "learning_rate": 9.991768987189688e-05,
278
- "loss": 3.3315,
279
  "step": 22500
280
  },
281
  {
282
- "epoch": 0.57,
283
- "learning_rate": 9.991390707312733e-05,
284
- "loss": 3.3853,
285
  "step": 23000
286
  },
287
  {
288
- "epoch": 0.59,
289
- "learning_rate": 9.991003936635747e-05,
290
- "loss": 3.3447,
291
  "step": 23500
292
  },
293
  {
294
- "epoch": 0.6,
295
- "learning_rate": 9.990608675816668e-05,
296
- "loss": 3.1906,
297
  "step": 24000
298
  },
299
  {
300
- "epoch": 0.61,
301
- "learning_rate": 9.990204925527867e-05,
302
- "loss": 3.3639,
303
  "step": 24500
304
  },
305
  {
306
- "epoch": 0.62,
307
- "learning_rate": 9.989792686456158e-05,
308
- "loss": 3.2723,
309
  "step": 25000
310
  },
311
  {
312
- "epoch": 0.63,
313
- "learning_rate": 9.989371959302797e-05,
314
- "loss": 3.2156,
315
  "step": 25500
316
  },
317
  {
318
- "epoch": 0.65,
319
- "learning_rate": 9.988942744783481e-05,
320
- "loss": 3.3264,
321
  "step": 26000
322
  },
323
  {
324
- "epoch": 0.66,
325
- "learning_rate": 9.988505043628337e-05,
326
- "loss": 3.2336,
327
  "step": 26500
328
  },
329
  {
330
- "epoch": 0.67,
331
- "learning_rate": 9.98805885658194e-05,
332
- "loss": 3.2806,
333
  "step": 27000
334
  },
335
  {
336
- "epoch": 0.68,
337
- "learning_rate": 9.98760418440329e-05,
338
- "loss": 3.3251,
339
  "step": 27500
340
  },
341
  {
342
- "epoch": 0.7,
343
- "learning_rate": 9.987141027865825e-05,
344
- "loss": 3.2188,
345
  "step": 28000
346
  },
347
  {
348
- "epoch": 0.71,
349
- "learning_rate": 9.986669387757414e-05,
350
- "loss": 3.2981,
351
  "step": 28500
352
  },
353
  {
354
- "epoch": 0.72,
355
- "learning_rate": 9.986189264880364e-05,
356
- "loss": 3.3023,
357
  "step": 29000
358
  },
359
  {
360
- "epoch": 0.73,
361
- "learning_rate": 9.985700660051403e-05,
362
- "loss": 3.3271,
363
  "step": 29500
364
  },
365
  {
366
- "epoch": 0.75,
367
- "learning_rate": 9.985203574101691e-05,
368
- "loss": 3.2955,
369
  "step": 30000
370
  },
371
  {
372
- "epoch": 0.76,
373
- "learning_rate": 9.984698007876816e-05,
374
- "loss": 3.3756,
375
  "step": 30500
376
  },
377
  {
378
- "epoch": 0.77,
379
- "learning_rate": 9.984183962236792e-05,
380
- "loss": 3.2936,
381
  "step": 31000
382
  },
383
  {
384
- "epoch": 0.78,
385
- "learning_rate": 9.983661438056056e-05,
386
- "loss": 3.3248,
387
  "step": 31500
388
  },
389
  {
390
- "epoch": 0.8,
391
- "learning_rate": 9.983130436223469e-05,
392
- "loss": 3.2569,
393
  "step": 32000
394
  },
395
  {
396
- "epoch": 0.81,
397
- "learning_rate": 9.98259095764231e-05,
398
- "loss": 3.3616,
399
  "step": 32500
400
  },
401
  {
402
- "epoch": 0.82,
403
- "learning_rate": 9.982043003230282e-05,
404
- "loss": 3.2892,
405
  "step": 33000
406
  },
407
  {
408
- "epoch": 0.83,
409
- "learning_rate": 9.981486573919504e-05,
410
- "loss": 3.2331,
411
  "step": 33500
412
  },
413
  {
414
- "epoch": 0.85,
415
- "learning_rate": 9.98092167065651e-05,
416
- "loss": 3.3242,
417
  "step": 34000
418
  },
419
  {
420
- "epoch": 0.86,
421
- "learning_rate": 9.980348294402255e-05,
422
- "loss": 3.3123,
423
  "step": 34500
424
  },
425
  {
426
- "epoch": 0.87,
427
- "learning_rate": 9.9797664461321e-05,
428
- "loss": 3.2869,
429
  "step": 35000
430
  },
431
  {
432
- "epoch": 0.88,
433
- "learning_rate": 9.979176126835821e-05,
434
- "loss": 3.2566,
435
  "step": 35500
436
  },
437
  {
438
- "epoch": 0.9,
439
- "learning_rate": 9.978577337517603e-05,
440
- "loss": 3.3036,
441
  "step": 36000
442
  },
443
  {
444
- "epoch": 0.91,
445
- "learning_rate": 9.977970079196041e-05,
446
- "loss": 3.299,
447
  "step": 36500
448
  },
449
  {
450
- "epoch": 0.92,
451
- "learning_rate": 9.977354352904136e-05,
452
- "loss": 3.31,
453
  "step": 37000
454
  },
455
  {
456
- "epoch": 0.93,
457
- "learning_rate": 9.976730159689292e-05,
458
- "loss": 3.2459,
459
  "step": 37500
460
  },
461
  {
462
- "epoch": 0.95,
463
- "learning_rate": 9.976097500613318e-05,
464
- "loss": 3.3162,
465
  "step": 38000
466
  },
467
  {
468
- "epoch": 0.96,
469
- "learning_rate": 9.975456376752424e-05,
470
- "loss": 3.3056,
471
  "step": 38500
472
  },
473
  {
474
- "epoch": 0.97,
475
- "learning_rate": 9.974806789197216e-05,
476
- "loss": 3.2404,
477
  "step": 39000
478
  },
479
  {
480
- "epoch": 0.98,
481
- "learning_rate": 9.974148739052703e-05,
482
- "loss": 3.2376,
483
  "step": 39500
484
  },
485
  {
486
- "epoch": 1.0,
487
- "learning_rate": 9.973482227438287e-05,
488
- "loss": 3.356,
489
  "step": 40000
490
  },
491
  {
492
- "epoch": 1.0,
493
- "eval_bleu": 1.0,
494
- "eval_brevity_penalty": 1.0,
495
- "eval_length_ratio": 1.0,
496
- "eval_loss": 3.1455512046813965,
497
- "eval_precisions": [
498
- 1.0,
499
- 1.0,
500
- 1.0,
501
- 1.0
502
- ],
503
- "eval_reference_length": 4569600,
504
- "eval_runtime": 8741.384,
505
- "eval_samples_per_second": 1.021,
506
- "eval_steps_per_second": 0.511,
507
- "eval_translation_length": 4569600,
508
- "step": 40162
509
- },
510
- {
511
- "epoch": 1.01,
512
- "learning_rate": 9.972807255487761e-05,
513
- "loss": 3.2504,
514
  "step": 40500
515
  },
516
  {
517
- "epoch": 1.02,
518
- "learning_rate": 9.972123824349316e-05,
519
- "loss": 3.23,
520
  "step": 41000
521
  },
522
  {
523
- "epoch": 1.03,
524
- "learning_rate": 9.971431935185526e-05,
525
- "loss": 3.1975,
526
  "step": 41500
527
  },
528
  {
529
- "epoch": 1.05,
530
- "learning_rate": 9.970731589173359e-05,
531
- "loss": 3.2147,
532
  "step": 42000
533
  },
534
  {
535
- "epoch": 1.06,
536
- "learning_rate": 9.970022787504163e-05,
537
- "loss": 3.1061,
538
  "step": 42500
539
  },
540
  {
541
- "epoch": 1.07,
542
- "learning_rate": 9.969305531383673e-05,
543
- "loss": 3.1693,
544
  "step": 43000
545
  },
546
  {
547
- "epoch": 1.08,
548
- "learning_rate": 9.968579822032009e-05,
549
- "loss": 3.1406,
550
  "step": 43500
551
  },
552
  {
553
- "epoch": 1.1,
554
- "learning_rate": 9.967845660683664e-05,
555
- "loss": 3.28,
556
  "step": 44000
557
  },
558
  {
559
- "epoch": 1.11,
560
- "learning_rate": 9.967103048587511e-05,
561
- "loss": 3.1341,
562
  "step": 44500
563
  },
564
  {
565
- "epoch": 1.12,
566
- "learning_rate": 9.966351987006803e-05,
567
- "loss": 3.1911,
568
  "step": 45000
569
  },
570
  {
571
- "epoch": 1.13,
572
- "learning_rate": 9.965592477219158e-05,
573
- "loss": 3.2183,
574
  "step": 45500
575
  },
576
  {
577
- "epoch": 1.15,
578
- "learning_rate": 9.964824520516576e-05,
579
- "loss": 3.1452,
580
  "step": 46000
581
  },
582
  {
583
- "epoch": 1.16,
584
- "learning_rate": 9.964048118205414e-05,
585
- "loss": 3.2294,
586
  "step": 46500
587
  },
588
  {
589
- "epoch": 1.17,
590
- "learning_rate": 9.963263271606403e-05,
591
- "loss": 3.2148,
592
  "step": 47000
593
  },
594
  {
595
- "epoch": 1.18,
596
- "learning_rate": 9.962469982054638e-05,
597
- "loss": 3.2715,
598
  "step": 47500
599
  },
600
  {
601
- "epoch": 1.2,
602
- "learning_rate": 9.961668250899575e-05,
603
- "loss": 3.1949,
604
  "step": 48000
605
  },
606
  {
607
- "epoch": 1.21,
608
- "learning_rate": 9.96085807950503e-05,
609
- "loss": 3.1926,
610
  "step": 48500
611
  },
612
  {
613
- "epoch": 1.22,
614
- "learning_rate": 9.960039469249177e-05,
615
- "loss": 3.2328,
616
  "step": 49000
617
  },
618
  {
619
- "epoch": 1.23,
620
- "learning_rate": 9.959212421524542e-05,
621
- "loss": 3.121,
622
  "step": 49500
623
  },
624
  {
625
- "epoch": 1.24,
626
- "learning_rate": 9.95837693773801e-05,
627
- "loss": 3.1933,
628
  "step": 50000
629
  },
630
  {
631
- "epoch": 1.26,
632
- "learning_rate": 9.957533019310813e-05,
633
- "loss": 3.1807,
634
  "step": 50500
635
  },
636
  {
637
- "epoch": 1.27,
638
- "learning_rate": 9.956680667678531e-05,
639
- "loss": 3.2696,
640
  "step": 51000
641
  },
642
  {
643
- "epoch": 1.28,
644
- "learning_rate": 9.955819884291088e-05,
645
- "loss": 3.1498,
646
  "step": 51500
647
  },
648
  {
649
- "epoch": 1.29,
650
- "learning_rate": 9.954950670612758e-05,
651
- "loss": 3.1895,
652
  "step": 52000
653
  },
654
  {
655
- "epoch": 1.31,
656
- "learning_rate": 9.954073028122147e-05,
657
- "loss": 3.2206,
658
  "step": 52500
659
  },
660
  {
661
- "epoch": 1.32,
662
- "learning_rate": 9.953186958312204e-05,
663
- "loss": 3.1473,
664
  "step": 53000
665
  },
666
  {
667
- "epoch": 1.33,
668
- "learning_rate": 9.952292462690212e-05,
669
- "loss": 3.2062,
670
  "step": 53500
671
  },
672
  {
673
- "epoch": 1.34,
674
- "learning_rate": 9.951389542777789e-05,
675
- "loss": 3.2043,
676
  "step": 54000
677
  },
678
  {
679
- "epoch": 1.36,
680
- "learning_rate": 9.95047820011088e-05,
681
- "loss": 3.1293,
682
  "step": 54500
683
  },
684
  {
685
- "epoch": 1.37,
686
- "learning_rate": 9.949558436239762e-05,
687
- "loss": 3.2194,
688
  "step": 55000
689
  },
690
  {
691
- "epoch": 1.38,
692
- "learning_rate": 9.948630252729036e-05,
693
- "loss": 3.2101,
694
  "step": 55500
695
  },
696
  {
697
- "epoch": 1.39,
698
- "learning_rate": 9.947693651157621e-05,
699
- "loss": 3.2321,
700
  "step": 56000
701
  },
702
  {
703
- "epoch": 1.41,
704
- "learning_rate": 9.946748633118766e-05,
705
- "loss": 3.2132,
706
  "step": 56500
707
  },
708
  {
709
- "epoch": 1.42,
710
- "learning_rate": 9.945795200220022e-05,
711
- "loss": 3.2587,
712
  "step": 57000
713
  },
714
  {
715
- "epoch": 1.43,
716
- "learning_rate": 9.944833354083273e-05,
717
- "loss": 3.1798,
718
  "step": 57500
719
  },
720
  {
721
- "epoch": 1.44,
722
- "learning_rate": 9.943863096344698e-05,
723
- "loss": 3.2519,
724
  "step": 58000
725
  },
726
  {
727
- "epoch": 1.46,
728
- "learning_rate": 9.942884428654794e-05,
729
- "loss": 3.1065,
730
  "step": 58500
731
  },
732
  {
733
- "epoch": 1.47,
734
- "learning_rate": 9.941897352678362e-05,
735
- "loss": 3.3128,
736
  "step": 59000
737
  },
738
  {
739
- "epoch": 1.48,
740
- "learning_rate": 9.940901870094506e-05,
741
- "loss": 3.2352,
742
  "step": 59500
743
  },
744
  {
745
- "epoch": 1.49,
746
- "learning_rate": 9.939897982596631e-05,
747
- "loss": 3.1773,
748
  "step": 60000
749
  },
750
  {
751
- "epoch": 1.51,
752
- "learning_rate": 9.938885691892437e-05,
753
- "loss": 3.2472,
754
  "step": 60500
755
  },
756
  {
757
- "epoch": 1.52,
758
- "learning_rate": 9.937864999703925e-05,
759
- "loss": 3.0983,
760
  "step": 61000
761
  },
762
  {
763
- "epoch": 1.53,
764
- "learning_rate": 9.936835907767378e-05,
765
- "loss": 3.2046,
766
  "step": 61500
767
  },
768
  {
769
- "epoch": 1.54,
770
- "learning_rate": 9.935798417833376e-05,
771
- "loss": 3.1239,
772
  "step": 62000
773
  },
774
  {
775
- "epoch": 1.56,
776
- "learning_rate": 9.934752531666782e-05,
777
- "loss": 3.2518,
778
  "step": 62500
779
  },
780
  {
781
- "epoch": 1.57,
782
- "learning_rate": 9.933698251046739e-05,
783
- "loss": 3.2518,
784
  "step": 63000
785
  },
786
  {
787
- "epoch": 1.58,
788
- "learning_rate": 9.932635577766676e-05,
789
- "loss": 3.0939,
790
  "step": 63500
791
  },
792
  {
793
- "epoch": 1.59,
794
- "learning_rate": 9.931564513634291e-05,
795
- "loss": 3.2243,
796
  "step": 64000
797
  },
798
  {
799
- "epoch": 1.61,
800
- "learning_rate": 9.930485060471562e-05,
801
- "loss": 3.196,
802
  "step": 64500
803
  },
804
  {
805
- "epoch": 1.62,
806
- "learning_rate": 9.929397220114736e-05,
807
- "loss": 3.2016,
808
  "step": 65000
809
  },
810
  {
811
- "epoch": 1.63,
812
- "learning_rate": 9.928300994414321e-05,
813
- "loss": 3.1955,
814
  "step": 65500
815
  },
816
  {
817
- "epoch": 1.64,
818
- "learning_rate": 9.9271963852351e-05,
819
- "loss": 3.2093,
820
  "step": 66000
821
  },
822
  {
823
- "epoch": 1.66,
824
- "learning_rate": 9.92608339445611e-05,
825
- "loss": 3.2283,
826
  "step": 66500
827
  },
828
  {
829
- "epoch": 1.67,
830
- "learning_rate": 9.924962023970646e-05,
831
- "loss": 3.2368,
832
  "step": 67000
833
  },
834
  {
835
- "epoch": 1.68,
836
- "learning_rate": 9.92383227568626e-05,
837
- "loss": 3.1782,
838
  "step": 67500
839
  },
840
  {
841
- "epoch": 1.69,
842
- "learning_rate": 9.922694151524756e-05,
843
- "loss": 3.1762,
844
  "step": 68000
845
  },
846
  {
847
- "epoch": 1.71,
848
- "learning_rate": 9.921547653422182e-05,
849
- "loss": 3.2227,
850
  "step": 68500
851
  },
852
  {
853
- "epoch": 1.72,
854
- "learning_rate": 9.920392783328834e-05,
855
- "loss": 3.1744,
856
- "step": 69000
857
- },
858
- {
859
- "epoch": 1.73,
860
- "learning_rate": 9.91922954320925e-05,
861
- "loss": 3.1868,
862
- "step": 69500
863
- },
864
- {
865
- "epoch": 1.74,
866
- "learning_rate": 9.918057935042204e-05,
867
- "loss": 3.1313,
868
- "step": 70000
869
- },
870
- {
871
- "epoch": 1.76,
872
- "learning_rate": 9.916877960820705e-05,
873
- "loss": 3.1623,
874
- "step": 70500
875
- },
876
- {
877
- "epoch": 1.77,
878
- "learning_rate": 9.915689622551996e-05,
879
- "loss": 3.0594,
880
- "step": 71000
881
- },
882
- {
883
- "epoch": 1.78,
884
- "learning_rate": 9.914492922257546e-05,
885
- "loss": 3.2163,
886
- "step": 71500
887
- },
888
- {
889
- "epoch": 1.79,
890
- "learning_rate": 9.913287861973049e-05,
891
- "loss": 3.0811,
892
- "step": 72000
893
- },
894
- {
895
- "epoch": 1.81,
896
- "learning_rate": 9.912074443748416e-05,
897
- "loss": 3.2226,
898
- "step": 72500
899
- },
900
- {
901
- "epoch": 1.82,
902
- "learning_rate": 9.910852669647785e-05,
903
- "loss": 3.2876,
904
- "step": 73000
905
- },
906
- {
907
- "epoch": 1.83,
908
- "learning_rate": 9.909622541749499e-05,
909
- "loss": 3.2285,
910
- "step": 73500
911
- },
912
- {
913
- "epoch": 1.84,
914
- "learning_rate": 9.908384062146118e-05,
915
- "loss": 3.1274,
916
- "step": 74000
917
- },
918
- {
919
- "epoch": 1.85,
920
- "learning_rate": 9.907137232944404e-05,
921
- "loss": 3.1894,
922
- "step": 74500
923
- },
924
- {
925
- "epoch": 1.87,
926
- "learning_rate": 9.905882056265323e-05,
927
- "loss": 3.1468,
928
- "step": 75000
929
- },
930
- {
931
- "epoch": 1.88,
932
- "learning_rate": 9.904618534244044e-05,
933
- "loss": 3.2397,
934
- "step": 75500
935
- },
936
- {
937
- "epoch": 1.89,
938
- "learning_rate": 9.903346669029932e-05,
939
- "loss": 3.0135,
940
- "step": 76000
941
- },
942
- {
943
- "epoch": 1.9,
944
- "learning_rate": 9.90206646278654e-05,
945
- "loss": 3.2206,
946
- "step": 76500
947
- },
948
- {
949
- "epoch": 1.92,
950
- "learning_rate": 9.900777917691615e-05,
951
- "loss": 3.1868,
952
- "step": 77000
953
- },
954
- {
955
- "epoch": 1.93,
956
- "learning_rate": 9.899481035937086e-05,
957
- "loss": 3.2184,
958
- "step": 77500
959
- },
960
- {
961
- "epoch": 1.94,
962
- "learning_rate": 9.898175819729063e-05,
963
- "loss": 3.1739,
964
- "step": 78000
965
- },
966
- {
967
- "epoch": 1.95,
968
- "learning_rate": 9.896862271287839e-05,
969
- "loss": 3.2227,
970
- "step": 78500
971
- },
972
- {
973
- "epoch": 1.97,
974
- "learning_rate": 9.895540392847874e-05,
975
- "loss": 3.2236,
976
- "step": 79000
977
- },
978
- {
979
- "epoch": 1.98,
980
- "learning_rate": 9.8942101866578e-05,
981
- "loss": 3.1321,
982
- "step": 79500
983
- },
984
- {
985
- "epoch": 1.99,
986
- "learning_rate": 9.892871654980418e-05,
987
- "loss": 3.1849,
988
- "step": 80000
989
- },
990
- {
991
- "epoch": 2.0,
992
  "eval_bleu": 1.0,
993
  "eval_brevity_penalty": 1.0,
994
  "eval_length_ratio": 1.0,
995
- "eval_loss": 3.0774757862091064,
996
  "eval_precisions": [
997
  1.0,
998
  1.0,
999
  1.0,
1000
  1.0
1001
  ],
1002
- "eval_reference_length": 4569600,
1003
- "eval_runtime": 8605.765,
1004
- "eval_samples_per_second": 1.037,
1005
- "eval_steps_per_second": 0.519,
1006
- "eval_translation_length": 4569600,
1007
- "step": 80324
1008
  }
1009
  ],
1010
  "logging_steps": 500,
1011
- "max_steps": 1204860,
1012
  "num_train_epochs": 30,
1013
  "save_steps": 1000,
1014
- "total_flos": 1.3785697922643395e+18,
1015
  "trial_name": null,
1016
  "trial_params": null
1017
  }
 
1
  {
2
+ "best_metric": 3.0806901454925537,
3
+ "best_model_checkpoint": "dq158/pingusPongus/checkpoint-68803",
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 68803,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.01,
13
  "learning_rate": 0.0001,
14
+ "loss": 4.4427,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.01,
19
+ "learning_rate": 9.999998551451928e-05,
20
+ "loss": 3.7748,
21
  "step": 1000
22
  },
23
  {
24
+ "epoch": 0.02,
25
+ "learning_rate": 9.999994205808551e-05,
26
+ "loss": 3.6469,
27
  "step": 1500
28
  },
29
  {
30
+ "epoch": 0.03,
31
+ "learning_rate": 9.999986963072388e-05,
32
+ "loss": 3.6827,
33
  "step": 2000
34
  },
35
  {
36
+ "epoch": 0.04,
37
+ "learning_rate": 9.999976823247632e-05,
38
+ "loss": 3.6186,
39
  "step": 2500
40
  },
41
  {
42
+ "epoch": 0.04,
43
+ "learning_rate": 9.999963786340163e-05,
44
+ "loss": 3.5199,
45
  "step": 3000
46
  },
47
  {
48
+ "epoch": 0.05,
49
+ "learning_rate": 9.999947852357531e-05,
50
+ "loss": 3.5221,
51
  "step": 3500
52
  },
53
  {
54
+ "epoch": 0.06,
55
+ "learning_rate": 9.999929021308971e-05,
56
+ "loss": 3.469,
57
  "step": 4000
58
  },
59
  {
60
+ "epoch": 0.07,
61
+ "learning_rate": 9.999907293205393e-05,
62
+ "loss": 3.5388,
63
  "step": 4500
64
  },
65
  {
66
+ "epoch": 0.07,
67
+ "learning_rate": 9.999882668059387e-05,
68
+ "loss": 3.4063,
69
  "step": 5000
70
  },
71
  {
72
+ "epoch": 0.08,
73
+ "learning_rate": 9.99985514588522e-05,
74
+ "loss": 3.4314,
75
  "step": 5500
76
  },
77
  {
78
+ "epoch": 0.09,
79
+ "learning_rate": 9.99982472669884e-05,
80
+ "loss": 3.4252,
81
  "step": 6000
82
  },
83
  {
84
+ "epoch": 0.09,
85
+ "learning_rate": 9.999791410517874e-05,
86
+ "loss": 3.4736,
87
  "step": 6500
88
  },
89
  {
90
+ "epoch": 0.1,
91
+ "learning_rate": 9.999755197361624e-05,
92
+ "loss": 3.4011,
93
  "step": 7000
94
  },
95
  {
96
+ "epoch": 0.11,
97
+ "learning_rate": 9.999716087251072e-05,
98
+ "loss": 3.4709,
99
  "step": 7500
100
  },
101
  {
102
+ "epoch": 0.12,
103
+ "learning_rate": 9.99967408020888e-05,
104
+ "loss": 3.401,
105
  "step": 8000
106
  },
107
  {
108
+ "epoch": 0.12,
109
+ "learning_rate": 9.999629176259391e-05,
110
+ "loss": 3.4339,
111
  "step": 8500
112
  },
113
  {
114
+ "epoch": 0.13,
115
+ "learning_rate": 9.999581375428617e-05,
116
+ "loss": 3.3573,
117
  "step": 9000
118
  },
119
  {
120
+ "epoch": 0.14,
121
+ "learning_rate": 9.999530677744258e-05,
122
+ "loss": 3.364,
123
  "step": 9500
124
  },
125
  {
126
+ "epoch": 0.15,
127
+ "learning_rate": 9.999477083235691e-05,
128
+ "loss": 3.4358,
129
  "step": 10000
130
  },
131
  {
132
+ "epoch": 0.15,
133
+ "learning_rate": 9.999420591933965e-05,
134
+ "loss": 3.369,
135
  "step": 10500
136
  },
137
  {
138
+ "epoch": 0.16,
139
+ "learning_rate": 9.999361203871817e-05,
140
+ "loss": 3.3874,
141
  "step": 11000
142
  },
143
  {
144
+ "epoch": 0.17,
145
+ "learning_rate": 9.999298919083656e-05,
146
+ "loss": 3.3604,
147
  "step": 11500
148
  },
149
  {
150
+ "epoch": 0.17,
151
+ "learning_rate": 9.99923373760557e-05,
152
+ "loss": 3.3134,
153
  "step": 12000
154
  },
155
  {
156
+ "epoch": 0.18,
157
+ "learning_rate": 9.999165659475324e-05,
158
+ "loss": 3.3988,
159
  "step": 12500
160
  },
161
  {
162
+ "epoch": 0.19,
163
+ "learning_rate": 9.999094684732369e-05,
164
+ "loss": 3.3562,
165
  "step": 13000
166
  },
167
  {
168
+ "epoch": 0.2,
169
+ "learning_rate": 9.999020813417826e-05,
170
+ "loss": 3.4156,
171
  "step": 13500
172
  },
173
  {
174
+ "epoch": 0.2,
175
+ "learning_rate": 9.998944045574499e-05,
176
+ "loss": 3.2524,
177
  "step": 14000
178
  },
179
  {
180
+ "epoch": 0.21,
181
+ "learning_rate": 9.998864381246869e-05,
182
+ "loss": 3.4463,
183
  "step": 14500
184
  },
185
  {
186
+ "epoch": 0.22,
187
+ "learning_rate": 9.998781820481091e-05,
188
+ "loss": 3.3492,
189
  "step": 15000
190
  },
191
  {
192
+ "epoch": 0.23,
193
+ "learning_rate": 9.998696363325009e-05,
194
+ "loss": 3.4512,
195
  "step": 15500
196
  },
197
  {
198
+ "epoch": 0.23,
199
+ "learning_rate": 9.998608009828132e-05,
200
+ "loss": 3.3218,
201
  "step": 16000
202
  },
203
  {
204
+ "epoch": 0.24,
205
+ "learning_rate": 9.998516760041659e-05,
206
+ "loss": 3.2985,
207
  "step": 16500
208
  },
209
  {
210
+ "epoch": 0.25,
211
+ "learning_rate": 9.998422614018456e-05,
212
+ "loss": 3.3771,
213
  "step": 17000
214
  },
215
  {
216
+ "epoch": 0.25,
217
+ "learning_rate": 9.998325571813079e-05,
218
+ "loss": 3.3023,
219
  "step": 17500
220
  },
221
  {
222
+ "epoch": 0.26,
223
+ "learning_rate": 9.998225633481753e-05,
224
+ "loss": 3.2226,
225
  "step": 18000
226
  },
227
  {
228
+ "epoch": 0.27,
229
+ "learning_rate": 9.998122799082386e-05,
230
+ "loss": 3.3422,
231
  "step": 18500
232
  },
233
  {
234
+ "epoch": 0.28,
235
+ "learning_rate": 9.998017068674558e-05,
236
+ "loss": 3.3089,
237
  "step": 19000
238
  },
239
  {
240
+ "epoch": 0.28,
241
+ "learning_rate": 9.997908442319536e-05,
242
+ "loss": 3.2337,
243
  "step": 19500
244
  },
245
  {
246
+ "epoch": 0.29,
247
+ "learning_rate": 9.99779692008026e-05,
248
+ "loss": 3.3586,
249
  "step": 20000
250
  },
251
  {
252
+ "epoch": 0.3,
253
+ "learning_rate": 9.997682502021345e-05,
254
+ "loss": 3.2019,
255
  "step": 20500
256
  },
257
  {
258
+ "epoch": 0.31,
259
+ "learning_rate": 9.997565188209089e-05,
260
+ "loss": 3.2937,
261
  "step": 21000
262
  },
263
  {
264
+ "epoch": 0.31,
265
+ "learning_rate": 9.997444978711465e-05,
266
+ "loss": 3.4064,
267
  "step": 21500
268
  },
269
  {
270
+ "epoch": 0.32,
271
+ "learning_rate": 9.997321873598125e-05,
272
+ "loss": 3.339,
273
  "step": 22000
274
  },
275
  {
276
+ "epoch": 0.33,
277
+ "learning_rate": 9.9971958729404e-05,
278
+ "loss": 3.3237,
279
  "step": 22500
280
  },
281
  {
282
+ "epoch": 0.33,
283
+ "learning_rate": 9.997066976811294e-05,
284
+ "loss": 3.3782,
285
  "step": 23000
286
  },
287
  {
288
+ "epoch": 0.34,
289
+ "learning_rate": 9.996935185285495e-05,
290
+ "loss": 3.336,
291
  "step": 23500
292
  },
293
  {
294
+ "epoch": 0.35,
295
+ "learning_rate": 9.996800498439362e-05,
296
+ "loss": 3.1749,
297
  "step": 24000
298
  },
299
  {
300
+ "epoch": 0.36,
301
+ "learning_rate": 9.99666291635094e-05,
302
+ "loss": 3.252,
303
  "step": 24500
304
  },
305
  {
306
+ "epoch": 0.36,
307
+ "learning_rate": 9.996522439099943e-05,
308
+ "loss": 3.3544,
309
  "step": 25000
310
  },
311
  {
312
+ "epoch": 0.37,
313
+ "learning_rate": 9.99637906676777e-05,
314
+ "loss": 3.2898,
315
  "step": 25500
316
  },
317
  {
318
+ "epoch": 0.38,
319
+ "learning_rate": 9.996232799437487e-05,
320
+ "loss": 3.2753,
321
  "step": 26000
322
  },
323
  {
324
+ "epoch": 0.39,
325
+ "learning_rate": 9.996083637193849e-05,
326
+ "loss": 3.3539,
327
  "step": 26500
328
  },
329
  {
330
+ "epoch": 0.39,
331
+ "learning_rate": 9.995931580123284e-05,
332
+ "loss": 3.2567,
333
  "step": 27000
334
  },
335
  {
336
+ "epoch": 0.4,
337
+ "learning_rate": 9.995776628313896e-05,
338
+ "loss": 3.1842,
339
  "step": 27500
340
  },
341
  {
342
+ "epoch": 0.41,
343
+ "learning_rate": 9.995618781855464e-05,
344
+ "loss": 3.2446,
345
  "step": 28000
346
  },
347
  {
348
+ "epoch": 0.41,
349
+ "learning_rate": 9.995458040839452e-05,
350
+ "loss": 3.2132,
351
  "step": 28500
352
  },
353
  {
354
+ "epoch": 0.42,
355
+ "learning_rate": 9.995294405358993e-05,
356
+ "loss": 3.2992,
357
  "step": 29000
358
  },
359
  {
360
+ "epoch": 0.43,
361
+ "learning_rate": 9.995127875508903e-05,
362
+ "loss": 3.2555,
363
  "step": 29500
364
  },
365
  {
366
+ "epoch": 0.44,
367
+ "learning_rate": 9.99495845138567e-05,
368
+ "loss": 3.3704,
369
  "step": 30000
370
  },
371
  {
372
+ "epoch": 0.44,
373
+ "learning_rate": 9.994786133087464e-05,
374
+ "loss": 3.2629,
375
  "step": 30500
376
  },
377
  {
378
+ "epoch": 0.45,
379
+ "learning_rate": 9.994610920714126e-05,
380
+ "loss": 3.224,
381
  "step": 31000
382
  },
383
  {
384
+ "epoch": 0.46,
385
+ "learning_rate": 9.994432814367183e-05,
386
+ "loss": 3.31,
387
  "step": 31500
388
  },
389
  {
390
+ "epoch": 0.47,
391
+ "learning_rate": 9.99425181414983e-05,
392
+ "loss": 3.2763,
393
  "step": 32000
394
  },
395
  {
396
+ "epoch": 0.47,
397
+ "learning_rate": 9.994067920166939e-05,
398
+ "loss": 3.2862,
399
  "step": 32500
400
  },
401
  {
402
+ "epoch": 0.48,
403
+ "learning_rate": 9.993881132525067e-05,
404
+ "loss": 3.3125,
405
  "step": 33000
406
  },
407
  {
408
+ "epoch": 0.49,
409
+ "learning_rate": 9.993691451332439e-05,
410
+ "loss": 3.1288,
411
  "step": 33500
412
  },
413
  {
414
+ "epoch": 0.49,
415
+ "learning_rate": 9.993498876698963e-05,
416
+ "loss": 3.2958,
417
  "step": 34000
418
  },
419
  {
420
+ "epoch": 0.5,
421
+ "learning_rate": 9.993303408736217e-05,
422
+ "loss": 3.1933,
423
  "step": 34500
424
  },
425
  {
426
+ "epoch": 0.51,
427
+ "learning_rate": 9.993105047557461e-05,
428
+ "loss": 3.2504,
429
  "step": 35000
430
  },
431
  {
432
+ "epoch": 0.52,
433
+ "learning_rate": 9.992903793277628e-05,
434
+ "loss": 3.3293,
435
  "step": 35500
436
  },
437
  {
438
+ "epoch": 0.52,
439
+ "learning_rate": 9.99269964601333e-05,
440
+ "loss": 3.1969,
441
  "step": 36000
442
  },
443
  {
444
+ "epoch": 0.53,
445
+ "learning_rate": 9.992492605882853e-05,
446
+ "loss": 3.2087,
447
  "step": 36500
448
  },
449
  {
450
+ "epoch": 0.54,
451
+ "learning_rate": 9.99228267300616e-05,
452
+ "loss": 3.3125,
453
  "step": 37000
454
  },
455
  {
456
+ "epoch": 0.55,
457
+ "learning_rate": 9.992069847504891e-05,
458
+ "loss": 3.2677,
459
  "step": 37500
460
  },
461
  {
462
+ "epoch": 0.55,
463
+ "learning_rate": 9.99185412950236e-05,
464
+ "loss": 3.2132,
465
  "step": 38000
466
  },
467
  {
468
+ "epoch": 0.56,
469
+ "learning_rate": 9.991635519123559e-05,
470
+ "loss": 3.2534,
471
  "step": 38500
472
  },
473
  {
474
+ "epoch": 0.57,
475
+ "learning_rate": 9.991414016495155e-05,
476
+ "loss": 3.1735,
477
  "step": 39000
478
  },
479
  {
480
+ "epoch": 0.57,
481
+ "learning_rate": 9.99118962174549e-05,
482
+ "loss": 3.2422,
483
  "step": 39500
484
  },
485
  {
486
+ "epoch": 0.58,
487
+ "learning_rate": 9.990962335004584e-05,
488
+ "loss": 3.1625,
489
  "step": 40000
490
  },
491
  {
492
+ "epoch": 0.59,
493
+ "learning_rate": 9.99073215640413e-05,
494
+ "loss": 3.1163,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  "step": 40500
496
  },
497
  {
498
+ "epoch": 0.6,
499
+ "learning_rate": 9.990499086077498e-05,
500
+ "loss": 3.1521,
501
  "step": 41000
502
  },
503
  {
504
+ "epoch": 0.6,
505
+ "learning_rate": 9.990263124159736e-05,
506
+ "loss": 3.2036,
507
  "step": 41500
508
  },
509
  {
510
+ "epoch": 0.61,
511
+ "learning_rate": 9.990024270787561e-05,
512
+ "loss": 3.181,
513
  "step": 42000
514
  },
515
  {
516
+ "epoch": 0.62,
517
+ "learning_rate": 9.989782526099372e-05,
518
+ "loss": 3.1672,
519
  "step": 42500
520
  },
521
  {
522
+ "epoch": 0.62,
523
+ "learning_rate": 9.989537890235238e-05,
524
+ "loss": 3.2336,
525
  "step": 43000
526
  },
527
  {
528
+ "epoch": 0.63,
529
+ "learning_rate": 9.989290363336908e-05,
530
+ "loss": 3.1455,
531
  "step": 43500
532
  },
533
  {
534
+ "epoch": 0.64,
535
+ "learning_rate": 9.989039945547803e-05,
536
+ "loss": 3.1859,
537
  "step": 44000
538
  },
539
  {
540
+ "epoch": 0.65,
541
+ "learning_rate": 9.98878663701302e-05,
542
+ "loss": 3.1396,
543
  "step": 44500
544
  },
545
  {
546
+ "epoch": 0.65,
547
+ "learning_rate": 9.988530437879333e-05,
548
+ "loss": 3.2585,
549
  "step": 45000
550
  },
551
  {
552
+ "epoch": 0.66,
553
+ "learning_rate": 9.988271348295184e-05,
554
+ "loss": 3.2201,
555
  "step": 45500
556
  },
557
  {
558
+ "epoch": 0.67,
559
+ "learning_rate": 9.988009368410698e-05,
560
+ "loss": 3.2758,
561
  "step": 46000
562
  },
563
  {
564
+ "epoch": 0.68,
565
+ "learning_rate": 9.98774449837767e-05,
566
+ "loss": 3.1742,
567
  "step": 46500
568
  },
569
  {
570
+ "epoch": 0.68,
571
+ "learning_rate": 9.987476738349571e-05,
572
+ "loss": 3.3212,
573
  "step": 47000
574
  },
575
  {
576
+ "epoch": 0.69,
577
+ "learning_rate": 9.987206088481545e-05,
578
+ "loss": 3.1915,
579
  "step": 47500
580
  },
581
  {
582
+ "epoch": 0.7,
583
+ "learning_rate": 9.986932548930414e-05,
584
+ "loss": 3.1608,
585
  "step": 48000
586
  },
587
  {
588
+ "epoch": 0.7,
589
+ "learning_rate": 9.986656119854672e-05,
590
+ "loss": 3.217,
591
  "step": 48500
592
  },
593
  {
594
+ "epoch": 0.71,
595
+ "learning_rate": 9.986376801414485e-05,
596
+ "loss": 3.1989,
597
  "step": 49000
598
  },
599
  {
600
+ "epoch": 0.72,
601
+ "learning_rate": 9.986094593771699e-05,
602
+ "loss": 3.3067,
603
  "step": 49500
604
  },
605
  {
606
+ "epoch": 0.73,
607
+ "learning_rate": 9.985809497089827e-05,
608
+ "loss": 3.2195,
609
  "step": 50000
610
  },
611
  {
612
+ "epoch": 0.73,
613
+ "learning_rate": 9.985521511534062e-05,
614
+ "loss": 3.148,
615
  "step": 50500
616
  },
617
  {
618
+ "epoch": 0.74,
619
+ "learning_rate": 9.985230637271266e-05,
620
+ "loss": 3.1987,
621
  "step": 51000
622
  },
623
  {
624
+ "epoch": 0.75,
625
+ "learning_rate": 9.984936874469979e-05,
626
+ "loss": 3.1153,
627
  "step": 51500
628
  },
629
  {
630
+ "epoch": 0.76,
631
+ "learning_rate": 9.984640223300413e-05,
632
+ "loss": 3.2841,
633
  "step": 52000
634
  },
635
  {
636
+ "epoch": 0.76,
637
+ "learning_rate": 9.98434068393445e-05,
638
+ "loss": 3.2033,
639
  "step": 52500
640
  },
641
  {
642
+ "epoch": 0.77,
643
+ "learning_rate": 9.984038256545653e-05,
644
+ "loss": 3.2102,
645
  "step": 53000
646
  },
647
  {
648
+ "epoch": 0.78,
649
+ "learning_rate": 9.983732941309253e-05,
650
+ "loss": 3.1817,
651
  "step": 53500
652
  },
653
  {
654
+ "epoch": 0.78,
655
+ "learning_rate": 9.983424738402156e-05,
656
+ "loss": 3.1485,
657
  "step": 54000
658
  },
659
  {
660
+ "epoch": 0.79,
661
+ "learning_rate": 9.98311364800294e-05,
662
+ "loss": 3.2417,
663
  "step": 54500
664
  },
665
  {
666
+ "epoch": 0.8,
667
+ "learning_rate": 9.982799670291857e-05,
668
+ "loss": 3.2174,
669
  "step": 55000
670
  },
671
  {
672
+ "epoch": 0.81,
673
+ "learning_rate": 9.98248280545083e-05,
674
+ "loss": 3.2862,
675
  "step": 55500
676
  },
677
  {
678
+ "epoch": 0.81,
679
+ "learning_rate": 9.982163053663459e-05,
680
+ "loss": 3.201,
681
  "step": 56000
682
  },
683
  {
684
+ "epoch": 0.82,
685
+ "learning_rate": 9.981840415115014e-05,
686
+ "loss": 3.3873,
687
  "step": 56500
688
  },
689
  {
690
+ "epoch": 0.83,
691
+ "learning_rate": 9.981514889992436e-05,
692
+ "loss": 3.1844,
693
  "step": 57000
694
  },
695
  {
696
+ "epoch": 0.84,
697
+ "learning_rate": 9.981186478484344e-05,
698
+ "loss": 3.1807,
699
  "step": 57500
700
  },
701
  {
702
+ "epoch": 0.84,
703
+ "learning_rate": 9.980855180781021e-05,
704
+ "loss": 3.2758,
705
  "step": 58000
706
  },
707
  {
708
+ "epoch": 0.85,
709
+ "learning_rate": 9.980520997074432e-05,
710
+ "loss": 3.1406,
711
  "step": 58500
712
  },
713
  {
714
+ "epoch": 0.86,
715
+ "learning_rate": 9.980183927558207e-05,
716
+ "loss": 3.1607,
717
  "step": 59000
718
  },
719
  {
720
+ "epoch": 0.86,
721
+ "learning_rate": 9.97984397242765e-05,
722
+ "loss": 3.1829,
723
  "step": 59500
724
  },
725
  {
726
+ "epoch": 0.87,
727
+ "learning_rate": 9.979501131879741e-05,
728
+ "loss": 3.2238,
729
  "step": 60000
730
  },
731
  {
732
+ "epoch": 0.88,
733
+ "learning_rate": 9.979155406113124e-05,
734
+ "loss": 3.2348,
735
  "step": 60500
736
  },
737
  {
738
+ "epoch": 0.89,
739
+ "learning_rate": 9.978806795328121e-05,
740
+ "loss": 3.2933,
741
  "step": 61000
742
  },
743
  {
744
+ "epoch": 0.89,
745
+ "learning_rate": 9.978455299726726e-05,
746
+ "loss": 3.2051,
747
  "step": 61500
748
  },
749
  {
750
+ "epoch": 0.9,
751
+ "learning_rate": 9.978100919512598e-05,
752
+ "loss": 3.1736,
753
  "step": 62000
754
  },
755
  {
756
+ "epoch": 0.91,
757
+ "learning_rate": 9.977743654891077e-05,
758
+ "loss": 3.173,
759
  "step": 62500
760
  },
761
  {
762
+ "epoch": 0.92,
763
+ "learning_rate": 9.977383506069164e-05,
764
+ "loss": 3.2732,
765
  "step": 63000
766
  },
767
  {
768
+ "epoch": 0.92,
769
+ "learning_rate": 9.977020473255539e-05,
770
+ "loss": 3.2447,
771
  "step": 63500
772
  },
773
  {
774
+ "epoch": 0.93,
775
+ "learning_rate": 9.976654556660548e-05,
776
+ "loss": 3.2526,
777
  "step": 64000
778
  },
779
  {
780
+ "epoch": 0.94,
781
+ "learning_rate": 9.976285756496211e-05,
782
+ "loss": 3.1814,
783
  "step": 64500
784
  },
785
  {
786
+ "epoch": 0.94,
787
+ "learning_rate": 9.97591407297622e-05,
788
+ "loss": 3.1533,
789
  "step": 65000
790
  },
791
  {
792
+ "epoch": 0.95,
793
+ "learning_rate": 9.975539506315933e-05,
794
+ "loss": 3.0491,
795
  "step": 65500
796
  },
797
  {
798
+ "epoch": 0.96,
799
+ "learning_rate": 9.975162056732385e-05,
800
+ "loss": 3.1918,
801
  "step": 66000
802
  },
803
  {
804
+ "epoch": 0.97,
805
+ "learning_rate": 9.974781724444272e-05,
806
+ "loss": 3.1918,
807
  "step": 66500
808
  },
809
  {
810
+ "epoch": 0.97,
811
+ "learning_rate": 9.974398509671969e-05,
812
+ "loss": 3.1769,
813
  "step": 67000
814
  },
815
  {
816
+ "epoch": 0.98,
817
+ "learning_rate": 9.974012412637517e-05,
818
+ "loss": 3.196,
819
  "step": 67500
820
  },
821
  {
822
+ "epoch": 0.99,
823
+ "learning_rate": 9.97362343356463e-05,
824
+ "loss": 3.1799,
825
  "step": 68000
826
  },
827
  {
828
+ "epoch": 1.0,
829
+ "learning_rate": 9.973231572678686e-05,
830
+ "loss": 3.2516,
831
  "step": 68500
832
  },
833
  {
834
+ "epoch": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
835
  "eval_bleu": 1.0,
836
  "eval_brevity_penalty": 1.0,
837
  "eval_length_ratio": 1.0,
838
+ "eval_loss": 3.0806901454925537,
839
  "eval_precisions": [
840
  1.0,
841
  1.0,
842
  1.0,
843
  1.0
844
  ],
845
+ "eval_reference_length": 7828480,
846
+ "eval_runtime": 21022.3759,
847
+ "eval_samples_per_second": 0.727,
848
+ "eval_steps_per_second": 0.364,
849
+ "eval_translation_length": 7828480,
850
+ "step": 68803
851
  }
852
  ],
853
  "logging_steps": 500,
854
+ "max_steps": 2064090,
855
  "num_train_epochs": 30,
856
  "save_steps": 1000,
857
+ "total_flos": 1.1808307371678106e+18,
858
  "trial_name": null,
859
  "trial_params": null
860
  }