dq158 committed on
Commit
597681f
1 Parent(s): a5726dc

Training in progress, step 5000, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -8,16 +8,16 @@
8
  "init_lora_weights": true,
9
  "layers_pattern": null,
10
  "layers_to_transform": null,
11
- "lora_alpha": 16,
12
  "lora_dropout": 0.1,
13
  "modules_to_save": null,
14
  "peft_type": "LORA",
15
- "r": 16,
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
- "q",
20
- "v"
21
  ],
22
  "task_type": "SEQ_2_SEQ_LM"
23
  }
 
8
  "init_lora_weights": true,
9
  "layers_pattern": null,
10
  "layers_to_transform": null,
11
+ "lora_alpha": 32,
12
  "lora_dropout": 0.1,
13
  "modules_to_save": null,
14
  "peft_type": "LORA",
15
+ "r": 8,
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
+ "v",
20
+ "q"
21
  ],
22
  "task_type": "SEQ_2_SEQ_LM"
23
  }
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f805d53d7dddec9940b8dda0a4fc6f84c5e194e588072b3124f4b98d4dba6b2d
3
- size 37789864
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99bb7dd9f9d339c1b6ea8b7c4913f5e5a59ddcf1d651ea8af246503e39076873
3
+ size 18915040
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:138083fcab266a12c8bdf751a269df328c6c9ed84e6b49b0dd7314b5b256a7c2
3
- size 2622266
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce6d1c7f359ee86eddcfca26e584b2e996e19c9d2397f3e83bb6b92e27e4a9a8
3
+ size 37990394
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:705f5b8f17ce338386080ca0e9df02a5405f987bdc5acf7da6fc0a4db61ba5c6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82fcab858db719cf57493411f22ed07e25f5e61522b33c35190b4108d341449e
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:414072090aace5de1cfd92c248a20ab1aa35254978e9abbeb3d9bae9b3088281
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bafc5641cb904ce836a25edd0b40d466bd94a372c40e8a43b0b01499a11c7b0c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,860 +1,79 @@
1
  {
2
- "best_metric": 3.016300916671753,
3
- "best_model_checkpoint": "dq158/pingusPongus/checkpoint-68803",
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 68803,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "learning_rate": 0.0001,
14
- "loss": 4.5647,
15
  "step": 500
16
  },
17
- {
18
- "epoch": 0.01,
19
- "learning_rate": 9.999998551451928e-05,
20
- "loss": 3.7798,
21
- "step": 1000
22
- },
23
- {
24
- "epoch": 0.02,
25
- "learning_rate": 9.999994205808551e-05,
26
- "loss": 3.6154,
27
- "step": 1500
28
- },
29
  {
30
  "epoch": 0.03,
31
- "learning_rate": 9.999986963072388e-05,
32
- "loss": 3.629,
33
- "step": 2000
34
- },
35
- {
36
- "epoch": 0.04,
37
- "learning_rate": 9.999976823247632e-05,
38
- "loss": 3.5877,
39
- "step": 2500
40
  },
41
  {
42
  "epoch": 0.04,
43
- "learning_rate": 9.999963786340163e-05,
44
- "loss": 3.5401,
45
- "step": 3000
46
- },
47
- {
48
- "epoch": 0.05,
49
- "learning_rate": 9.999947852357531e-05,
50
- "loss": 3.5804,
51
- "step": 3500
52
  },
53
  {
54
  "epoch": 0.06,
55
- "learning_rate": 9.999929021308971e-05,
56
- "loss": 3.4367,
57
- "step": 4000
58
- },
59
- {
60
- "epoch": 0.07,
61
- "learning_rate": 9.999907293205393e-05,
62
- "loss": 3.4906,
63
- "step": 4500
64
  },
65
  {
66
  "epoch": 0.07,
67
- "learning_rate": 9.999882668059387e-05,
68
- "loss": 3.4193,
69
- "step": 5000
70
- },
71
- {
72
- "epoch": 0.08,
73
- "learning_rate": 9.99985514588522e-05,
74
- "loss": 3.4256,
75
- "step": 5500
76
- },
77
- {
78
- "epoch": 0.09,
79
- "learning_rate": 9.99982472669884e-05,
80
- "loss": 3.4774,
81
- "step": 6000
82
  },
83
  {
84
  "epoch": 0.09,
85
- "learning_rate": 9.999791410517874e-05,
86
- "loss": 3.4629,
87
- "step": 6500
88
  },
89
  {
90
  "epoch": 0.1,
91
- "learning_rate": 9.999755197361624e-05,
92
- "loss": 3.3818,
93
- "step": 7000
94
- },
95
- {
96
- "epoch": 0.11,
97
- "learning_rate": 9.999716087251072e-05,
98
- "loss": 3.4648,
99
- "step": 7500
100
- },
101
- {
102
- "epoch": 0.12,
103
- "learning_rate": 9.99967408020888e-05,
104
- "loss": 3.4705,
105
- "step": 8000
106
  },
107
  {
108
  "epoch": 0.12,
109
- "learning_rate": 9.999629176259391e-05,
110
- "loss": 3.4356,
111
- "step": 8500
112
  },
113
  {
114
  "epoch": 0.13,
115
- "learning_rate": 9.999581375428617e-05,
116
- "loss": 3.4238,
117
- "step": 9000
118
- },
119
- {
120
- "epoch": 0.14,
121
- "learning_rate": 9.999530677744258e-05,
122
- "loss": 3.4732,
123
- "step": 9500
124
- },
125
- {
126
- "epoch": 0.15,
127
- "learning_rate": 9.999477083235691e-05,
128
- "loss": 3.3216,
129
- "step": 10000
130
  },
131
  {
132
  "epoch": 0.15,
133
- "learning_rate": 9.999420591933965e-05,
134
- "loss": 3.465,
135
- "step": 10500
136
- },
137
- {
138
- "epoch": 0.16,
139
- "learning_rate": 9.999361203871817e-05,
140
- "loss": 3.3641,
141
- "step": 11000
142
- },
143
- {
144
- "epoch": 0.17,
145
- "learning_rate": 9.999298919083656e-05,
146
- "loss": 3.4407,
147
- "step": 11500
148
- },
149
- {
150
- "epoch": 0.17,
151
- "learning_rate": 9.99923373760557e-05,
152
- "loss": 3.3962,
153
- "step": 12000
154
- },
155
- {
156
- "epoch": 0.18,
157
- "learning_rate": 9.999165659475324e-05,
158
- "loss": 3.3776,
159
- "step": 12500
160
- },
161
- {
162
- "epoch": 0.19,
163
- "learning_rate": 9.999094684732369e-05,
164
- "loss": 3.3157,
165
- "step": 13000
166
- },
167
- {
168
- "epoch": 0.2,
169
- "learning_rate": 9.999020813417826e-05,
170
- "loss": 3.2517,
171
- "step": 13500
172
- },
173
- {
174
- "epoch": 0.2,
175
- "learning_rate": 9.998944045574499e-05,
176
- "loss": 3.4232,
177
- "step": 14000
178
- },
179
- {
180
- "epoch": 0.21,
181
- "learning_rate": 9.998864381246869e-05,
182
- "loss": 3.3539,
183
- "step": 14500
184
- },
185
- {
186
- "epoch": 0.22,
187
- "learning_rate": 9.998781820481091e-05,
188
- "loss": 3.2431,
189
- "step": 15000
190
- },
191
- {
192
- "epoch": 0.23,
193
- "learning_rate": 9.998696363325009e-05,
194
- "loss": 3.2796,
195
- "step": 15500
196
- },
197
- {
198
- "epoch": 0.23,
199
- "learning_rate": 9.998608009828132e-05,
200
- "loss": 3.3468,
201
- "step": 16000
202
- },
203
- {
204
- "epoch": 0.24,
205
- "learning_rate": 9.998516760041659e-05,
206
- "loss": 3.3159,
207
- "step": 16500
208
- },
209
- {
210
- "epoch": 0.25,
211
- "learning_rate": 9.998422614018456e-05,
212
- "loss": 3.3635,
213
- "step": 17000
214
- },
215
- {
216
- "epoch": 0.25,
217
- "learning_rate": 9.998325571813079e-05,
218
- "loss": 3.3708,
219
- "step": 17500
220
- },
221
- {
222
- "epoch": 0.26,
223
- "learning_rate": 9.998225633481753e-05,
224
- "loss": 3.3435,
225
- "step": 18000
226
- },
227
- {
228
- "epoch": 0.27,
229
- "learning_rate": 9.998122799082386e-05,
230
- "loss": 3.3649,
231
- "step": 18500
232
- },
233
- {
234
- "epoch": 0.28,
235
- "learning_rate": 9.998017068674558e-05,
236
- "loss": 3.3519,
237
- "step": 19000
238
- },
239
- {
240
- "epoch": 0.28,
241
- "learning_rate": 9.997908442319536e-05,
242
- "loss": 3.3408,
243
- "step": 19500
244
- },
245
- {
246
- "epoch": 0.29,
247
- "learning_rate": 9.99779692008026e-05,
248
- "loss": 3.2678,
249
- "step": 20000
250
- },
251
- {
252
- "epoch": 0.3,
253
- "learning_rate": 9.997682502021345e-05,
254
- "loss": 3.2453,
255
- "step": 20500
256
- },
257
- {
258
- "epoch": 0.31,
259
- "learning_rate": 9.997565188209089e-05,
260
- "loss": 3.2948,
261
- "step": 21000
262
- },
263
- {
264
- "epoch": 0.31,
265
- "learning_rate": 9.997444978711465e-05,
266
- "loss": 3.2849,
267
- "step": 21500
268
- },
269
- {
270
- "epoch": 0.32,
271
- "learning_rate": 9.997321873598125e-05,
272
- "loss": 3.3274,
273
- "step": 22000
274
- },
275
- {
276
- "epoch": 0.33,
277
- "learning_rate": 9.9971958729404e-05,
278
- "loss": 3.3084,
279
- "step": 22500
280
- },
281
- {
282
- "epoch": 0.33,
283
- "learning_rate": 9.997066976811294e-05,
284
- "loss": 3.3019,
285
- "step": 23000
286
- },
287
- {
288
- "epoch": 0.34,
289
- "learning_rate": 9.996935185285495e-05,
290
- "loss": 3.2998,
291
- "step": 23500
292
- },
293
- {
294
- "epoch": 0.35,
295
- "learning_rate": 9.996800498439362e-05,
296
- "loss": 3.268,
297
- "step": 24000
298
- },
299
- {
300
- "epoch": 0.36,
301
- "learning_rate": 9.99666291635094e-05,
302
- "loss": 3.248,
303
- "step": 24500
304
- },
305
- {
306
- "epoch": 0.36,
307
- "learning_rate": 9.996522439099943e-05,
308
- "loss": 3.2204,
309
- "step": 25000
310
- },
311
- {
312
- "epoch": 0.37,
313
- "learning_rate": 9.99637906676777e-05,
314
- "loss": 3.2922,
315
- "step": 25500
316
- },
317
- {
318
- "epoch": 0.38,
319
- "learning_rate": 9.996232799437487e-05,
320
- "loss": 3.2716,
321
- "step": 26000
322
- },
323
- {
324
- "epoch": 0.39,
325
- "learning_rate": 9.996083637193849e-05,
326
- "loss": 3.3181,
327
- "step": 26500
328
- },
329
- {
330
- "epoch": 0.39,
331
- "learning_rate": 9.995931580123284e-05,
332
- "loss": 3.2899,
333
- "step": 27000
334
- },
335
- {
336
- "epoch": 0.4,
337
- "learning_rate": 9.995776628313896e-05,
338
- "loss": 3.266,
339
- "step": 27500
340
- },
341
- {
342
- "epoch": 0.41,
343
- "learning_rate": 9.995618781855464e-05,
344
- "loss": 3.3644,
345
- "step": 28000
346
- },
347
- {
348
- "epoch": 0.41,
349
- "learning_rate": 9.995458040839452e-05,
350
- "loss": 3.2413,
351
- "step": 28500
352
- },
353
- {
354
- "epoch": 0.42,
355
- "learning_rate": 9.995294405358993e-05,
356
- "loss": 3.2759,
357
- "step": 29000
358
- },
359
- {
360
- "epoch": 0.43,
361
- "learning_rate": 9.995127875508903e-05,
362
- "loss": 3.3065,
363
- "step": 29500
364
- },
365
- {
366
- "epoch": 0.44,
367
- "learning_rate": 9.99495845138567e-05,
368
- "loss": 3.3028,
369
- "step": 30000
370
- },
371
- {
372
- "epoch": 0.44,
373
- "learning_rate": 9.994786133087464e-05,
374
- "loss": 3.2613,
375
- "step": 30500
376
- },
377
- {
378
- "epoch": 0.45,
379
- "learning_rate": 9.994610920714126e-05,
380
- "loss": 3.2137,
381
- "step": 31000
382
- },
383
- {
384
- "epoch": 0.46,
385
- "learning_rate": 9.994432814367183e-05,
386
- "loss": 3.2899,
387
- "step": 31500
388
- },
389
- {
390
- "epoch": 0.47,
391
- "learning_rate": 9.99425181414983e-05,
392
- "loss": 3.2877,
393
- "step": 32000
394
- },
395
- {
396
- "epoch": 0.47,
397
- "learning_rate": 9.994067920166939e-05,
398
- "loss": 3.2139,
399
- "step": 32500
400
- },
401
- {
402
- "epoch": 0.48,
403
- "learning_rate": 9.993881132525067e-05,
404
- "loss": 3.2407,
405
- "step": 33000
406
- },
407
- {
408
- "epoch": 0.49,
409
- "learning_rate": 9.993691451332439e-05,
410
- "loss": 3.3226,
411
- "step": 33500
412
- },
413
- {
414
- "epoch": 0.49,
415
- "learning_rate": 9.993498876698963e-05,
416
- "loss": 3.2004,
417
- "step": 34000
418
- },
419
- {
420
- "epoch": 0.5,
421
- "learning_rate": 9.993303408736217e-05,
422
- "loss": 3.2773,
423
- "step": 34500
424
- },
425
- {
426
- "epoch": 0.51,
427
- "learning_rate": 9.993105047557461e-05,
428
- "loss": 3.2183,
429
- "step": 35000
430
- },
431
- {
432
- "epoch": 0.52,
433
- "learning_rate": 9.992903793277628e-05,
434
- "loss": 3.2648,
435
- "step": 35500
436
- },
437
- {
438
- "epoch": 0.52,
439
- "learning_rate": 9.99269964601333e-05,
440
- "loss": 3.1963,
441
- "step": 36000
442
- },
443
- {
444
- "epoch": 0.53,
445
- "learning_rate": 9.992492605882853e-05,
446
- "loss": 3.2825,
447
- "step": 36500
448
- },
449
- {
450
- "epoch": 0.54,
451
- "learning_rate": 9.99228267300616e-05,
452
- "loss": 3.2658,
453
- "step": 37000
454
- },
455
- {
456
- "epoch": 0.55,
457
- "learning_rate": 9.992069847504891e-05,
458
- "loss": 3.1058,
459
- "step": 37500
460
- },
461
- {
462
- "epoch": 0.55,
463
- "learning_rate": 9.99185412950236e-05,
464
- "loss": 3.1375,
465
- "step": 38000
466
- },
467
- {
468
- "epoch": 0.56,
469
- "learning_rate": 9.991635519123559e-05,
470
- "loss": 3.2075,
471
- "step": 38500
472
- },
473
- {
474
- "epoch": 0.57,
475
- "learning_rate": 9.991414016495155e-05,
476
- "loss": 3.2228,
477
- "step": 39000
478
- },
479
- {
480
- "epoch": 0.57,
481
- "learning_rate": 9.99118962174549e-05,
482
- "loss": 3.2004,
483
- "step": 39500
484
- },
485
- {
486
- "epoch": 0.58,
487
- "learning_rate": 9.990962335004584e-05,
488
- "loss": 3.1924,
489
- "step": 40000
490
- },
491
- {
492
- "epoch": 0.59,
493
- "learning_rate": 9.99073215640413e-05,
494
- "loss": 3.2956,
495
- "step": 40500
496
- },
497
- {
498
- "epoch": 0.6,
499
- "learning_rate": 9.990499086077498e-05,
500
- "loss": 3.2698,
501
- "step": 41000
502
- },
503
- {
504
- "epoch": 0.6,
505
- "learning_rate": 9.990263124159736e-05,
506
- "loss": 3.2863,
507
- "step": 41500
508
- },
509
- {
510
- "epoch": 0.61,
511
- "learning_rate": 9.990024270787561e-05,
512
- "loss": 3.3611,
513
- "step": 42000
514
- },
515
- {
516
- "epoch": 0.62,
517
- "learning_rate": 9.989782526099372e-05,
518
- "loss": 3.1691,
519
- "step": 42500
520
- },
521
- {
522
- "epoch": 0.62,
523
- "learning_rate": 9.989537890235238e-05,
524
- "loss": 3.1085,
525
- "step": 43000
526
- },
527
- {
528
- "epoch": 0.63,
529
- "learning_rate": 9.989290363336908e-05,
530
- "loss": 3.1825,
531
- "step": 43500
532
- },
533
- {
534
- "epoch": 0.64,
535
- "learning_rate": 9.989039945547803e-05,
536
- "loss": 3.2333,
537
- "step": 44000
538
- },
539
- {
540
- "epoch": 0.65,
541
- "learning_rate": 9.98878663701302e-05,
542
- "loss": 3.2635,
543
- "step": 44500
544
- },
545
- {
546
- "epoch": 0.65,
547
- "learning_rate": 9.988530437879333e-05,
548
- "loss": 3.2907,
549
- "step": 45000
550
- },
551
- {
552
- "epoch": 0.66,
553
- "learning_rate": 9.988271348295184e-05,
554
- "loss": 3.1334,
555
- "step": 45500
556
- },
557
- {
558
- "epoch": 0.67,
559
- "learning_rate": 9.988009368410698e-05,
560
- "loss": 3.2239,
561
- "step": 46000
562
- },
563
- {
564
- "epoch": 0.68,
565
- "learning_rate": 9.98774449837767e-05,
566
- "loss": 3.1904,
567
- "step": 46500
568
- },
569
- {
570
- "epoch": 0.68,
571
- "learning_rate": 9.987476738349571e-05,
572
- "loss": 3.2781,
573
- "step": 47000
574
- },
575
- {
576
- "epoch": 0.69,
577
- "learning_rate": 9.987206088481545e-05,
578
- "loss": 3.19,
579
- "step": 47500
580
- },
581
- {
582
- "epoch": 0.7,
583
- "learning_rate": 9.986932548930414e-05,
584
- "loss": 3.2235,
585
- "step": 48000
586
- },
587
- {
588
- "epoch": 0.7,
589
- "learning_rate": 9.986656119854672e-05,
590
- "loss": 3.2302,
591
- "step": 48500
592
- },
593
- {
594
- "epoch": 0.71,
595
- "learning_rate": 9.986376801414485e-05,
596
- "loss": 3.289,
597
- "step": 49000
598
- },
599
- {
600
- "epoch": 0.72,
601
- "learning_rate": 9.986094593771699e-05,
602
- "loss": 3.2874,
603
- "step": 49500
604
- },
605
- {
606
- "epoch": 0.73,
607
- "learning_rate": 9.985809497089827e-05,
608
- "loss": 3.144,
609
- "step": 50000
610
- },
611
- {
612
- "epoch": 0.73,
613
- "learning_rate": 9.985521511534062e-05,
614
- "loss": 3.1967,
615
- "step": 50500
616
- },
617
- {
618
- "epoch": 0.74,
619
- "learning_rate": 9.985230637271266e-05,
620
- "loss": 3.248,
621
- "step": 51000
622
- },
623
- {
624
- "epoch": 0.75,
625
- "learning_rate": 9.984936874469979e-05,
626
- "loss": 3.1933,
627
- "step": 51500
628
- },
629
- {
630
- "epoch": 0.76,
631
- "learning_rate": 9.984640223300413e-05,
632
- "loss": 3.2108,
633
- "step": 52000
634
- },
635
- {
636
- "epoch": 0.76,
637
- "learning_rate": 9.98434068393445e-05,
638
- "loss": 3.0986,
639
- "step": 52500
640
- },
641
- {
642
- "epoch": 0.77,
643
- "learning_rate": 9.984038256545653e-05,
644
- "loss": 3.2016,
645
- "step": 53000
646
- },
647
- {
648
- "epoch": 0.78,
649
- "learning_rate": 9.983732941309253e-05,
650
- "loss": 3.1967,
651
- "step": 53500
652
- },
653
- {
654
- "epoch": 0.78,
655
- "learning_rate": 9.983424738402156e-05,
656
- "loss": 3.2803,
657
- "step": 54000
658
- },
659
- {
660
- "epoch": 0.79,
661
- "learning_rate": 9.98311364800294e-05,
662
- "loss": 3.2952,
663
- "step": 54500
664
- },
665
- {
666
- "epoch": 0.8,
667
- "learning_rate": 9.982799670291857e-05,
668
- "loss": 3.18,
669
- "step": 55000
670
- },
671
- {
672
- "epoch": 0.81,
673
- "learning_rate": 9.98248280545083e-05,
674
- "loss": 3.1768,
675
- "step": 55500
676
- },
677
- {
678
- "epoch": 0.81,
679
- "learning_rate": 9.982163053663459e-05,
680
- "loss": 3.1986,
681
- "step": 56000
682
- },
683
- {
684
- "epoch": 0.82,
685
- "learning_rate": 9.981840415115014e-05,
686
- "loss": 3.2241,
687
- "step": 56500
688
- },
689
- {
690
- "epoch": 0.83,
691
- "learning_rate": 9.981514889992436e-05,
692
- "loss": 3.2754,
693
- "step": 57000
694
- },
695
- {
696
- "epoch": 0.84,
697
- "learning_rate": 9.981186478484344e-05,
698
- "loss": 3.2162,
699
- "step": 57500
700
- },
701
- {
702
- "epoch": 0.84,
703
- "learning_rate": 9.980855180781021e-05,
704
- "loss": 3.2659,
705
- "step": 58000
706
- },
707
- {
708
- "epoch": 0.85,
709
- "learning_rate": 9.980520997074432e-05,
710
- "loss": 3.172,
711
- "step": 58500
712
- },
713
- {
714
- "epoch": 0.86,
715
- "learning_rate": 9.980183927558207e-05,
716
- "loss": 3.3035,
717
- "step": 59000
718
- },
719
- {
720
- "epoch": 0.86,
721
- "learning_rate": 9.97984397242765e-05,
722
- "loss": 3.1746,
723
- "step": 59500
724
- },
725
- {
726
- "epoch": 0.87,
727
- "learning_rate": 9.979501131879741e-05,
728
- "loss": 3.1678,
729
- "step": 60000
730
- },
731
- {
732
- "epoch": 0.88,
733
- "learning_rate": 9.979155406113124e-05,
734
- "loss": 3.162,
735
- "step": 60500
736
- },
737
- {
738
- "epoch": 0.89,
739
- "learning_rate": 9.978806795328121e-05,
740
- "loss": 3.3041,
741
- "step": 61000
742
- },
743
- {
744
- "epoch": 0.89,
745
- "learning_rate": 9.978455299726726e-05,
746
- "loss": 3.2086,
747
- "step": 61500
748
- },
749
- {
750
- "epoch": 0.9,
751
- "learning_rate": 9.978100919512598e-05,
752
- "loss": 3.1832,
753
- "step": 62000
754
- },
755
- {
756
- "epoch": 0.91,
757
- "learning_rate": 9.977743654891077e-05,
758
- "loss": 3.1839,
759
- "step": 62500
760
- },
761
- {
762
- "epoch": 0.92,
763
- "learning_rate": 9.977383506069164e-05,
764
- "loss": 3.123,
765
- "step": 63000
766
- },
767
- {
768
- "epoch": 0.92,
769
- "learning_rate": 9.977020473255539e-05,
770
- "loss": 3.2868,
771
- "step": 63500
772
- },
773
- {
774
- "epoch": 0.93,
775
- "learning_rate": 9.976654556660548e-05,
776
- "loss": 3.1488,
777
- "step": 64000
778
- },
779
- {
780
- "epoch": 0.94,
781
- "learning_rate": 9.976285756496211e-05,
782
- "loss": 3.2116,
783
- "step": 64500
784
- },
785
- {
786
- "epoch": 0.94,
787
- "learning_rate": 9.97591407297622e-05,
788
- "loss": 3.2233,
789
- "step": 65000
790
- },
791
- {
792
- "epoch": 0.95,
793
- "learning_rate": 9.975539506315933e-05,
794
- "loss": 3.1018,
795
- "step": 65500
796
- },
797
- {
798
- "epoch": 0.96,
799
- "learning_rate": 9.975162056732385e-05,
800
- "loss": 3.1404,
801
- "step": 66000
802
- },
803
- {
804
- "epoch": 0.97,
805
- "learning_rate": 9.974781724444272e-05,
806
- "loss": 3.1984,
807
- "step": 66500
808
- },
809
- {
810
- "epoch": 0.97,
811
- "learning_rate": 9.974398509671969e-05,
812
- "loss": 3.2421,
813
- "step": 67000
814
- },
815
- {
816
- "epoch": 0.98,
817
- "learning_rate": 9.974012412637517e-05,
818
- "loss": 3.1499,
819
- "step": 67500
820
- },
821
- {
822
- "epoch": 0.99,
823
- "learning_rate": 9.97362343356463e-05,
824
- "loss": 3.174,
825
- "step": 68000
826
- },
827
- {
828
- "epoch": 1.0,
829
- "learning_rate": 9.973231572678686e-05,
830
- "loss": 3.1816,
831
- "step": 68500
832
- },
833
- {
834
- "epoch": 1.0,
835
- "eval_bleu": 1.0,
836
- "eval_brevity_penalty": 1.0,
837
- "eval_length_ratio": 1.0,
838
- "eval_loss": 3.016300916671753,
839
- "eval_precisions": [
840
- 1.0,
841
- 1.0,
842
- 1.0,
843
- 1.0
844
- ],
845
- "eval_reference_length": 7828480,
846
- "eval_runtime": 20103.3013,
847
- "eval_samples_per_second": 0.761,
848
- "eval_steps_per_second": 0.38,
849
- "eval_translation_length": 7828480,
850
- "step": 68803
851
  }
852
  ],
853
  "logging_steps": 500,
854
- "max_steps": 2064090,
855
- "num_train_epochs": 30,
856
- "save_steps": 1000,
857
- "total_flos": 1.1808307371678106e+18,
858
  "trial_name": null,
859
  "trial_params": null
860
  }
 
1
  {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.14534249960030812,
5
  "eval_steps": 500,
6
+ "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "learning_rate": 4e-05,
14
+ "loss": 4.4951,
15
  "step": 500
16
  },
 
 
 
 
 
 
 
 
 
 
 
 
17
  {
18
  "epoch": 0.03,
19
+ "learning_rate": 8e-05,
20
+ "loss": 3.777,
21
+ "step": 1000
 
 
 
 
 
 
22
  },
23
  {
24
  "epoch": 0.04,
25
+ "learning_rate": 7.999831247941866e-05,
26
+ "loss": 3.6246,
27
+ "step": 1500
 
 
 
 
 
 
28
  },
29
  {
30
  "epoch": 0.06,
31
+ "learning_rate": 7.99932500600609e-05,
32
+ "loss": 3.5067,
33
+ "step": 2000
 
 
 
 
 
 
34
  },
35
  {
36
  "epoch": 0.07,
37
+ "learning_rate": 7.998481316907362e-05,
38
+ "loss": 3.4947,
39
+ "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
40
  },
41
  {
42
  "epoch": 0.09,
43
+ "learning_rate": 7.99730025183281e-05,
44
+ "loss": 3.4452,
45
+ "step": 3000
46
  },
47
  {
48
  "epoch": 0.1,
49
+ "learning_rate": 7.995781910436019e-05,
50
+ "loss": 3.3696,
51
+ "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 0.12,
55
+ "learning_rate": 7.993926420828609e-05,
56
+ "loss": 3.4226,
57
+ "step": 4000
58
  },
59
  {
60
  "epoch": 0.13,
61
+ "learning_rate": 7.991733939569422e-05,
62
+ "loss": 3.3765,
63
+ "step": 4500
 
 
 
 
 
 
 
 
 
 
 
 
64
  },
65
  {
66
  "epoch": 0.15,
67
+ "learning_rate": 7.989204651651322e-05,
68
+ "loss": 3.4237,
69
+ "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  }
71
  ],
72
  "logging_steps": 500,
73
+ "max_steps": 172005,
74
+ "num_train_epochs": 5,
75
+ "save_steps": 5000,
76
+ "total_flos": 1.7133622788096e+17,
77
  "trial_name": null,
78
  "trial_params": null
79
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b62efb5e27cf22233e7e3d90ba4b9b06ff50053e55dc9f07a3ad40dc5b14b43a
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac93b83e20cb3266b0249c2b9cb223a898cd7a840eb97f009fc035d490eabb4d
3
  size 4728