siddharth963 commited on
Commit
f9fa1e3
1 Parent(s): 0400a46

End of training

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 9.99,
3
- "eval_accuracy": 0.8644859813084113,
4
- "eval_loss": 0.4172683358192444,
5
- "eval_runtime": 69.3033,
6
- "eval_samples_per_second": 61.758,
7
- "eval_steps_per_second": 1.934,
8
- "total_flos": 1.3257453564799912e+19,
9
- "train_loss": 0.3000902961967583,
10
- "train_runtime": 6004.4248,
11
- "train_samples_per_second": 28.507,
12
- "train_steps_per_second": 0.222
13
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.8761682242990654,
4
+ "eval_loss": 0.38753223419189453,
5
+ "eval_runtime": 92.7247,
6
+ "eval_samples_per_second": 46.158,
7
+ "eval_steps_per_second": 5.77,
8
+ "total_flos": 1.3264660513609667e+19,
9
+ "train_loss": 0.36025063641717503,
10
+ "train_runtime": 7147.2488,
11
+ "train_samples_per_second": 23.949,
12
+ "train_steps_per_second": 0.749
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.99,
3
- "eval_accuracy": 0.8644859813084113,
4
- "eval_loss": 0.4172683358192444,
5
- "eval_runtime": 69.3033,
6
- "eval_samples_per_second": 61.758,
7
- "eval_steps_per_second": 1.934
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.8761682242990654,
4
+ "eval_loss": 0.38753223419189453,
5
+ "eval_runtime": 92.7247,
6
+ "eval_samples_per_second": 46.158,
7
+ "eval_steps_per_second": 5.77
8
  }
runs/Oct21_08-44-12_fd488c9ba56a/events.out.tfevents.1666349321.fd488c9ba56a.17.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90e63c3e5c01b29b3229ad22ba8f5dbf00a58c6ddcad2384170ef90d6105c5f4
3
+ size 363
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.99,
3
- "total_flos": 1.3257453564799912e+19,
4
- "train_loss": 0.3000902961967583,
5
- "train_runtime": 6004.4248,
6
- "train_samples_per_second": 28.507,
7
- "train_steps_per_second": 0.222
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.3264660513609667e+19,
4
+ "train_loss": 0.36025063641717503,
5
+ "train_runtime": 7147.2488,
6
+ "train_samples_per_second": 23.949,
7
+ "train_steps_per_second": 0.749
8
  }
trainer_state.json CHANGED
@@ -1,913 +1,3325 @@
1
  {
2
- "best_metric": 0.8644859813084113,
3
- "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-cassava/checkpoint-399",
4
- "epoch": 9.994392523364485,
5
- "global_step": 1330,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.07,
12
- "learning_rate": 3.7593984962406014e-06,
13
- "loss": 1.6081,
14
  "step": 10
15
  },
16
  {
17
- "epoch": 0.15,
18
- "learning_rate": 7.518796992481203e-06,
19
- "loss": 1.5302,
20
  "step": 20
21
  },
22
  {
23
- "epoch": 0.22,
24
- "learning_rate": 1.1278195488721805e-05,
25
- "loss": 1.3659,
26
  "step": 30
27
  },
28
  {
29
- "epoch": 0.3,
30
- "learning_rate": 1.5037593984962406e-05,
31
- "loss": 1.2237,
32
  "step": 40
33
  },
34
  {
35
- "epoch": 0.37,
36
- "learning_rate": 1.8796992481203007e-05,
37
- "loss": 1.0832,
38
  "step": 50
39
  },
40
  {
41
- "epoch": 0.45,
42
- "learning_rate": 2.255639097744361e-05,
43
- "loss": 1.02,
44
  "step": 60
45
  },
46
  {
47
- "epoch": 0.52,
48
- "learning_rate": 2.6315789473684212e-05,
49
- "loss": 0.8901,
50
  "step": 70
51
  },
52
  {
53
- "epoch": 0.6,
54
- "learning_rate": 3.007518796992481e-05,
55
- "loss": 0.8331,
56
  "step": 80
57
  },
58
  {
59
- "epoch": 0.67,
60
- "learning_rate": 3.3834586466165414e-05,
61
- "loss": 0.751,
62
  "step": 90
63
  },
64
  {
65
- "epoch": 0.75,
66
- "learning_rate": 3.759398496240601e-05,
67
- "loss": 0.7102,
68
  "step": 100
69
  },
70
  {
71
- "epoch": 0.82,
72
- "learning_rate": 4.135338345864662e-05,
73
- "loss": 0.6395,
74
  "step": 110
75
  },
76
  {
77
- "epoch": 0.9,
78
- "learning_rate": 4.511278195488722e-05,
79
- "loss": 0.6145,
80
  "step": 120
81
  },
82
  {
83
- "epoch": 0.97,
84
- "learning_rate": 4.887218045112782e-05,
85
- "loss": 0.5832,
86
  "step": 130
87
  },
88
  {
89
- "epoch": 0.99,
90
- "eval_accuracy": 0.8299065420560747,
91
- "eval_loss": 0.5485312342643738,
92
- "eval_runtime": 73.2179,
93
- "eval_samples_per_second": 58.456,
94
- "eval_steps_per_second": 1.83,
95
- "step": 133
96
- },
97
- {
98
- "epoch": 1.05,
99
- "learning_rate": 4.970760233918128e-05,
100
- "loss": 0.5927,
101
  "step": 140
102
  },
103
  {
104
- "epoch": 1.13,
105
- "learning_rate": 4.928989139515455e-05,
106
- "loss": 0.4566,
107
  "step": 150
108
  },
109
  {
110
- "epoch": 1.2,
111
- "learning_rate": 4.887218045112782e-05,
112
- "loss": 0.5143,
113
  "step": 160
114
  },
115
  {
116
- "epoch": 1.28,
117
- "learning_rate": 4.8454469507101085e-05,
118
- "loss": 0.4993,
119
  "step": 170
120
  },
121
  {
122
- "epoch": 1.35,
123
- "learning_rate": 4.803675856307435e-05,
124
- "loss": 0.4912,
125
  "step": 180
126
  },
127
  {
128
- "epoch": 1.43,
129
- "learning_rate": 4.761904761904762e-05,
130
- "loss": 0.4608,
131
  "step": 190
132
  },
133
  {
134
- "epoch": 1.5,
135
- "learning_rate": 4.720133667502089e-05,
136
- "loss": 0.4569,
137
  "step": 200
138
  },
139
  {
140
- "epoch": 1.58,
141
- "learning_rate": 4.678362573099415e-05,
142
- "loss": 0.4649,
143
  "step": 210
144
  },
145
  {
146
- "epoch": 1.65,
147
- "learning_rate": 4.6365914786967416e-05,
148
- "loss": 0.4428,
149
  "step": 220
150
  },
151
  {
152
- "epoch": 1.73,
153
- "learning_rate": 4.5948203842940684e-05,
154
- "loss": 0.4654,
155
  "step": 230
156
  },
157
  {
158
- "epoch": 1.8,
159
- "learning_rate": 4.553049289891395e-05,
160
- "loss": 0.4404,
161
  "step": 240
162
  },
163
  {
164
- "epoch": 1.87,
165
- "learning_rate": 4.511278195488722e-05,
166
- "loss": 0.4231,
167
  "step": 250
168
  },
169
  {
170
- "epoch": 1.95,
171
- "learning_rate": 4.4695071010860486e-05,
172
- "loss": 0.4638,
173
  "step": 260
174
  },
175
  {
176
- "epoch": 1.99,
177
- "eval_accuracy": 0.8574766355140186,
178
- "eval_loss": 0.44362181425094604,
179
- "eval_runtime": 75.8626,
180
- "eval_samples_per_second": 56.418,
181
- "eval_steps_per_second": 1.766,
182
- "step": 266
183
- },
184
- {
185
- "epoch": 2.03,
186
- "learning_rate": 4.4277360066833754e-05,
187
- "loss": 0.4435,
188
  "step": 270
189
  },
190
  {
191
- "epoch": 2.1,
192
- "learning_rate": 4.3859649122807014e-05,
193
- "loss": 0.3596,
194
  "step": 280
195
  },
196
  {
197
- "epoch": 2.18,
198
- "learning_rate": 4.344193817878028e-05,
199
- "loss": 0.3644,
200
  "step": 290
201
  },
202
  {
203
- "epoch": 2.25,
204
- "learning_rate": 4.302422723475355e-05,
205
- "loss": 0.3731,
206
  "step": 300
207
  },
208
  {
209
- "epoch": 2.33,
210
- "learning_rate": 4.260651629072682e-05,
211
- "loss": 0.3682,
212
  "step": 310
213
  },
214
  {
215
- "epoch": 2.4,
216
- "learning_rate": 4.2188805346700084e-05,
217
- "loss": 0.3619,
218
  "step": 320
219
  },
220
  {
221
- "epoch": 2.48,
222
- "learning_rate": 4.177109440267335e-05,
223
- "loss": 0.3729,
224
  "step": 330
225
  },
226
  {
227
- "epoch": 2.55,
228
- "learning_rate": 4.135338345864662e-05,
229
- "loss": 0.4054,
230
  "step": 340
231
  },
232
  {
233
- "epoch": 2.63,
234
- "learning_rate": 4.093567251461988e-05,
235
- "loss": 0.339,
236
  "step": 350
237
  },
238
  {
239
- "epoch": 2.7,
240
- "learning_rate": 4.051796157059315e-05,
241
- "loss": 0.3598,
242
  "step": 360
243
  },
244
  {
245
- "epoch": 2.78,
246
- "learning_rate": 4.0100250626566415e-05,
247
- "loss": 0.3417,
248
  "step": 370
249
  },
250
  {
251
- "epoch": 2.85,
252
- "learning_rate": 3.968253968253968e-05,
253
- "loss": 0.3379,
254
  "step": 380
255
  },
256
  {
257
- "epoch": 2.93,
258
- "learning_rate": 3.926482873851295e-05,
259
- "loss": 0.3115,
260
  "step": 390
261
  },
262
  {
263
- "epoch": 2.99,
264
- "eval_accuracy": 0.8644859813084113,
265
- "eval_loss": 0.4172683358192444,
266
- "eval_runtime": 70.472,
267
- "eval_samples_per_second": 60.733,
268
- "eval_steps_per_second": 1.901,
269
- "step": 399
270
- },
271
- {
272
- "epoch": 3.01,
273
- "learning_rate": 3.884711779448622e-05,
274
- "loss": 0.4131,
275
  "step": 400
276
  },
277
  {
278
- "epoch": 3.08,
279
- "learning_rate": 3.8429406850459485e-05,
280
- "loss": 0.2941,
281
  "step": 410
282
  },
283
  {
284
- "epoch": 3.16,
285
- "learning_rate": 3.8011695906432746e-05,
286
- "loss": 0.3021,
287
  "step": 420
288
  },
289
  {
290
- "epoch": 3.23,
291
- "learning_rate": 3.759398496240601e-05,
292
- "loss": 0.2967,
293
  "step": 430
294
  },
295
  {
296
- "epoch": 3.31,
297
- "learning_rate": 3.717627401837928e-05,
298
- "loss": 0.2906,
299
  "step": 440
300
  },
301
  {
302
- "epoch": 3.38,
303
- "learning_rate": 3.675856307435255e-05,
304
- "loss": 0.2741,
305
  "step": 450
306
  },
307
  {
308
- "epoch": 3.46,
309
- "learning_rate": 3.6340852130325816e-05,
310
- "loss": 0.3069,
311
  "step": 460
312
  },
313
  {
314
- "epoch": 3.53,
315
- "learning_rate": 3.592314118629908e-05,
316
- "loss": 0.2697,
317
  "step": 470
318
  },
319
  {
320
- "epoch": 3.61,
321
- "learning_rate": 3.5505430242272344e-05,
322
- "loss": 0.2777,
323
  "step": 480
324
  },
325
  {
326
- "epoch": 3.68,
327
- "learning_rate": 3.508771929824561e-05,
328
- "loss": 0.2863,
329
  "step": 490
330
  },
331
  {
332
- "epoch": 3.76,
333
- "learning_rate": 3.467000835421888e-05,
334
- "loss": 0.3172,
335
  "step": 500
336
  },
337
  {
338
- "epoch": 3.83,
339
- "learning_rate": 3.4252297410192146e-05,
340
- "loss": 0.2982,
341
  "step": 510
342
  },
343
  {
344
- "epoch": 3.9,
345
- "learning_rate": 3.3834586466165414e-05,
346
- "loss": 0.2902,
347
  "step": 520
348
  },
349
  {
350
- "epoch": 3.98,
351
- "learning_rate": 3.341687552213868e-05,
352
- "loss": 0.2926,
353
  "step": 530
354
  },
355
  {
356
- "epoch": 3.99,
357
- "eval_accuracy": 0.8476635514018691,
358
- "eval_loss": 0.44750651717185974,
359
- "eval_runtime": 69.0564,
360
- "eval_samples_per_second": 61.978,
361
- "eval_steps_per_second": 1.94,
362
- "step": 532
363
  },
364
  {
365
- "epoch": 4.06,
366
- "learning_rate": 3.299916457811195e-05,
367
- "loss": 0.2541,
368
  "step": 540
369
  },
370
  {
371
- "epoch": 4.13,
372
- "learning_rate": 3.258145363408521e-05,
373
- "loss": 0.2287,
374
  "step": 550
375
  },
376
  {
377
- "epoch": 4.21,
378
- "learning_rate": 3.216374269005848e-05,
379
- "loss": 0.248,
380
  "step": 560
381
  },
382
  {
383
- "epoch": 4.28,
384
- "learning_rate": 3.1746031746031745e-05,
385
- "loss": 0.2502,
386
  "step": 570
387
  },
388
  {
389
- "epoch": 4.36,
390
- "learning_rate": 3.132832080200501e-05,
391
- "loss": 0.2053,
392
  "step": 580
393
  },
394
  {
395
- "epoch": 4.43,
396
- "learning_rate": 3.091060985797828e-05,
397
- "loss": 0.2359,
398
  "step": 590
399
  },
400
  {
401
- "epoch": 4.51,
402
- "learning_rate": 3.0492898913951544e-05,
403
- "loss": 0.2538,
404
  "step": 600
405
  },
406
  {
407
- "epoch": 4.58,
408
- "learning_rate": 3.007518796992481e-05,
409
- "loss": 0.2371,
410
  "step": 610
411
  },
412
  {
413
- "epoch": 4.66,
414
- "learning_rate": 2.965747702589808e-05,
415
- "loss": 0.2402,
416
  "step": 620
417
  },
418
  {
419
- "epoch": 4.73,
420
- "learning_rate": 2.9239766081871346e-05,
421
- "loss": 0.2418,
422
  "step": 630
423
  },
424
  {
425
- "epoch": 4.81,
426
- "learning_rate": 2.882205513784461e-05,
427
- "loss": 0.2465,
428
  "step": 640
429
  },
430
  {
431
- "epoch": 4.88,
432
- "learning_rate": 2.8404344193817878e-05,
433
- "loss": 0.2241,
434
  "step": 650
435
  },
436
  {
437
- "epoch": 4.96,
438
- "learning_rate": 2.7986633249791145e-05,
439
- "loss": 0.2127,
440
  "step": 660
441
  },
442
  {
443
- "epoch": 4.99,
444
- "eval_accuracy": 0.8574766355140186,
445
- "eval_loss": 0.4497027099132538,
446
- "eval_runtime": 74.3222,
447
- "eval_samples_per_second": 57.587,
448
- "eval_steps_per_second": 1.803,
449
- "step": 665
450
- },
451
- {
452
- "epoch": 5.04,
453
- "learning_rate": 2.756892230576441e-05,
454
- "loss": 0.2127,
455
  "step": 670
456
  },
457
  {
458
- "epoch": 5.11,
459
- "learning_rate": 2.7151211361737677e-05,
460
- "loss": 0.1995,
461
  "step": 680
462
  },
463
  {
464
- "epoch": 5.19,
465
- "learning_rate": 2.6733500417710944e-05,
466
- "loss": 0.184,
467
  "step": 690
468
  },
469
  {
470
- "epoch": 5.26,
471
- "learning_rate": 2.6315789473684212e-05,
472
- "loss": 0.1686,
473
  "step": 700
474
  },
475
  {
476
- "epoch": 5.34,
477
- "learning_rate": 2.5898078529657476e-05,
478
- "loss": 0.1944,
479
  "step": 710
480
  },
481
  {
482
- "epoch": 5.41,
483
- "learning_rate": 2.5480367585630744e-05,
484
- "loss": 0.1827,
485
  "step": 720
486
  },
487
  {
488
- "epoch": 5.49,
489
- "learning_rate": 2.506265664160401e-05,
490
- "loss": 0.2056,
491
  "step": 730
492
  },
493
  {
494
- "epoch": 5.56,
495
- "learning_rate": 2.4644945697577275e-05,
496
- "loss": 0.1706,
497
  "step": 740
498
  },
499
  {
500
- "epoch": 5.64,
501
- "learning_rate": 2.4227234753550543e-05,
502
- "loss": 0.1585,
503
  "step": 750
504
  },
505
  {
506
- "epoch": 5.71,
507
- "learning_rate": 2.380952380952381e-05,
508
- "loss": 0.1805,
509
  "step": 760
510
  },
511
  {
512
- "epoch": 5.79,
513
- "learning_rate": 2.3391812865497074e-05,
514
- "loss": 0.2051,
515
  "step": 770
516
  },
517
  {
518
- "epoch": 5.86,
519
- "learning_rate": 2.2974101921470342e-05,
520
- "loss": 0.1997,
521
  "step": 780
522
  },
523
  {
524
- "epoch": 5.93,
525
- "learning_rate": 2.255639097744361e-05,
526
- "loss": 0.1934,
527
  "step": 790
528
  },
529
  {
530
- "epoch": 5.99,
531
- "eval_accuracy": 0.858177570093458,
532
- "eval_loss": 0.4547787010669708,
533
- "eval_runtime": 69.4508,
534
- "eval_samples_per_second": 61.626,
535
- "eval_steps_per_second": 1.929,
536
- "step": 798
537
- },
538
- {
539
- "epoch": 6.01,
540
- "learning_rate": 2.2138680033416877e-05,
541
- "loss": 0.2103,
542
  "step": 800
543
  },
544
  {
545
- "epoch": 6.09,
546
- "learning_rate": 2.172096908939014e-05,
547
- "loss": 0.1442,
548
  "step": 810
549
  },
550
  {
551
- "epoch": 6.16,
552
- "learning_rate": 2.130325814536341e-05,
553
- "loss": 0.144,
554
  "step": 820
555
  },
556
  {
557
- "epoch": 6.24,
558
- "learning_rate": 2.0885547201336676e-05,
559
- "loss": 0.1844,
560
  "step": 830
561
  },
562
  {
563
- "epoch": 6.31,
564
- "learning_rate": 2.046783625730994e-05,
565
- "loss": 0.1574,
566
  "step": 840
567
  },
568
  {
569
- "epoch": 6.39,
570
- "learning_rate": 2.0050125313283208e-05,
571
- "loss": 0.1594,
572
  "step": 850
573
  },
574
  {
575
- "epoch": 6.46,
576
- "learning_rate": 1.9632414369256475e-05,
577
- "loss": 0.1394,
578
  "step": 860
579
  },
580
  {
581
- "epoch": 6.54,
582
- "learning_rate": 1.9214703425229743e-05,
583
- "loss": 0.1556,
584
  "step": 870
585
  },
586
  {
587
- "epoch": 6.61,
588
- "learning_rate": 1.8796992481203007e-05,
589
- "loss": 0.1181,
590
  "step": 880
591
  },
592
  {
593
- "epoch": 6.69,
594
- "learning_rate": 1.8379281537176274e-05,
595
- "loss": 0.1504,
596
  "step": 890
597
  },
598
  {
599
- "epoch": 6.76,
600
- "learning_rate": 1.796157059314954e-05,
601
- "loss": 0.1296,
602
  "step": 900
603
  },
604
  {
605
- "epoch": 6.84,
606
- "learning_rate": 1.7543859649122806e-05,
607
- "loss": 0.1444,
608
  "step": 910
609
  },
610
  {
611
- "epoch": 6.91,
612
- "learning_rate": 1.7126148705096073e-05,
613
- "loss": 0.1789,
614
  "step": 920
615
  },
616
  {
617
- "epoch": 6.99,
618
- "learning_rate": 1.670843776106934e-05,
619
- "loss": 0.1777,
620
  "step": 930
621
  },
622
  {
623
- "epoch": 6.99,
624
- "eval_accuracy": 0.8560747663551402,
625
- "eval_loss": 0.46795374155044556,
626
- "eval_runtime": 69.8404,
627
- "eval_samples_per_second": 61.283,
628
- "eval_steps_per_second": 1.919,
629
- "step": 931
630
- },
631
- {
632
- "epoch": 7.07,
633
- "learning_rate": 1.6290726817042605e-05,
634
- "loss": 0.132,
635
  "step": 940
636
  },
637
  {
638
- "epoch": 7.14,
639
- "learning_rate": 1.5873015873015872e-05,
640
- "loss": 0.1483,
641
  "step": 950
642
  },
643
  {
644
- "epoch": 7.22,
645
- "learning_rate": 1.545530492898914e-05,
646
- "loss": 0.1322,
647
  "step": 960
648
  },
649
  {
650
- "epoch": 7.29,
651
- "learning_rate": 1.5037593984962406e-05,
652
- "loss": 0.1382,
653
  "step": 970
654
  },
655
  {
656
- "epoch": 7.37,
657
- "learning_rate": 1.4619883040935673e-05,
658
- "loss": 0.1041,
659
  "step": 980
660
  },
661
  {
662
- "epoch": 7.44,
663
- "learning_rate": 1.4202172096908939e-05,
664
- "loss": 0.096,
665
  "step": 990
666
  },
667
  {
668
- "epoch": 7.52,
669
- "learning_rate": 1.3784461152882205e-05,
670
- "loss": 0.1336,
671
  "step": 1000
672
  },
673
  {
674
- "epoch": 7.59,
675
- "learning_rate": 1.3366750208855472e-05,
676
- "loss": 0.1269,
677
  "step": 1010
678
  },
679
  {
680
- "epoch": 7.67,
681
- "learning_rate": 1.2949039264828738e-05,
682
- "loss": 0.116,
683
  "step": 1020
684
  },
685
  {
686
- "epoch": 7.74,
687
- "learning_rate": 1.2531328320802006e-05,
688
- "loss": 0.1386,
689
  "step": 1030
690
  },
691
  {
692
- "epoch": 7.81,
693
- "learning_rate": 1.2113617376775271e-05,
694
- "loss": 0.1228,
695
  "step": 1040
696
  },
697
  {
698
- "epoch": 7.89,
699
- "learning_rate": 1.1695906432748537e-05,
700
- "loss": 0.103,
701
  "step": 1050
702
  },
703
  {
704
- "epoch": 7.96,
705
- "learning_rate": 1.1278195488721805e-05,
706
- "loss": 0.1187,
707
  "step": 1060
708
  },
709
  {
710
- "epoch": 7.99,
711
- "eval_accuracy": 0.8591121495327103,
712
- "eval_loss": 0.4879584014415741,
713
- "eval_runtime": 69.5594,
714
- "eval_samples_per_second": 61.53,
715
- "eval_steps_per_second": 1.926,
716
- "step": 1064
717
  },
718
  {
719
- "epoch": 8.04,
720
- "learning_rate": 1.086048454469507e-05,
721
- "loss": 0.1422,
 
 
 
722
  "step": 1070
723
  },
724
  {
725
- "epoch": 8.12,
726
- "learning_rate": 1.0442773600668338e-05,
727
- "loss": 0.0984,
728
  "step": 1080
729
  },
730
  {
731
- "epoch": 8.19,
732
- "learning_rate": 1.0025062656641604e-05,
733
- "loss": 0.1016,
734
  "step": 1090
735
  },
736
  {
737
- "epoch": 8.27,
738
- "learning_rate": 9.607351712614871e-06,
739
- "loss": 0.097,
740
  "step": 1100
741
  },
742
  {
743
- "epoch": 8.34,
744
- "learning_rate": 9.189640768588137e-06,
745
- "loss": 0.1138,
746
  "step": 1110
747
  },
748
  {
749
- "epoch": 8.42,
750
- "learning_rate": 8.771929824561403e-06,
751
- "loss": 0.0806,
752
  "step": 1120
753
  },
754
  {
755
- "epoch": 8.49,
756
- "learning_rate": 8.35421888053467e-06,
757
- "loss": 0.0968,
758
  "step": 1130
759
  },
760
  {
761
- "epoch": 8.57,
762
- "learning_rate": 7.936507936507936e-06,
763
- "loss": 0.0855,
764
  "step": 1140
765
  },
766
  {
767
- "epoch": 8.64,
768
- "learning_rate": 7.518796992481203e-06,
769
- "loss": 0.1295,
770
  "step": 1150
771
  },
772
  {
773
- "epoch": 8.72,
774
- "learning_rate": 7.1010860484544695e-06,
775
- "loss": 0.1062,
776
  "step": 1160
777
  },
778
  {
779
- "epoch": 8.79,
780
- "learning_rate": 6.683375104427736e-06,
781
- "loss": 0.0931,
782
  "step": 1170
783
  },
784
  {
785
- "epoch": 8.87,
786
- "learning_rate": 6.265664160401003e-06,
787
- "loss": 0.118,
788
  "step": 1180
789
  },
790
  {
791
- "epoch": 8.94,
792
- "learning_rate": 5.8479532163742686e-06,
793
- "loss": 0.0801,
794
  "step": 1190
795
  },
796
  {
797
- "epoch": 8.99,
798
- "eval_accuracy": 0.8556074766355141,
799
- "eval_loss": 0.5013762712478638,
800
- "eval_runtime": 69.6837,
801
- "eval_samples_per_second": 61.42,
802
- "eval_steps_per_second": 1.923,
803
- "step": 1197
804
- },
805
- {
806
- "epoch": 9.02,
807
- "learning_rate": 5.430242272347535e-06,
808
- "loss": 0.1025,
809
  "step": 1200
810
  },
811
  {
812
- "epoch": 9.1,
813
- "learning_rate": 5.012531328320802e-06,
814
- "loss": 0.0854,
815
  "step": 1210
816
  },
817
  {
818
- "epoch": 9.17,
819
- "learning_rate": 4.5948203842940685e-06,
820
- "loss": 0.1032,
821
  "step": 1220
822
  },
823
  {
824
- "epoch": 9.25,
825
- "learning_rate": 4.177109440267335e-06,
826
- "loss": 0.108,
827
  "step": 1230
828
  },
829
  {
830
- "epoch": 9.32,
831
- "learning_rate": 3.7593984962406014e-06,
832
- "loss": 0.0933,
833
  "step": 1240
834
  },
835
  {
836
- "epoch": 9.4,
837
- "learning_rate": 3.341687552213868e-06,
838
- "loss": 0.0795,
839
  "step": 1250
840
  },
841
  {
842
- "epoch": 9.47,
843
- "learning_rate": 2.9239766081871343e-06,
844
- "loss": 0.0882,
845
  "step": 1260
846
  },
847
  {
848
- "epoch": 9.55,
849
- "learning_rate": 2.506265664160401e-06,
850
- "loss": 0.0907,
851
  "step": 1270
852
  },
853
  {
854
- "epoch": 9.62,
855
- "learning_rate": 2.0885547201336676e-06,
856
- "loss": 0.0775,
857
  "step": 1280
858
  },
859
  {
860
- "epoch": 9.7,
861
- "learning_rate": 1.670843776106934e-06,
862
- "loss": 0.091,
863
  "step": 1290
864
  },
865
  {
866
- "epoch": 9.77,
867
- "learning_rate": 1.2531328320802005e-06,
868
- "loss": 0.0741,
869
  "step": 1300
870
  },
871
  {
872
- "epoch": 9.84,
873
- "learning_rate": 8.35421888053467e-07,
874
- "loss": 0.0728,
875
  "step": 1310
876
  },
877
  {
878
- "epoch": 9.92,
879
- "learning_rate": 4.177109440267335e-07,
880
- "loss": 0.1036,
881
  "step": 1320
882
  },
883
  {
884
- "epoch": 9.99,
885
- "learning_rate": 0.0,
886
- "loss": 0.088,
887
  "step": 1330
888
  },
889
  {
890
- "epoch": 9.99,
891
- "eval_accuracy": 0.8558411214953271,
892
- "eval_loss": 0.5052832365036011,
893
- "eval_runtime": 69.3744,
894
- "eval_samples_per_second": 61.694,
895
- "eval_steps_per_second": 1.932,
896
- "step": 1330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
  },
898
  {
899
- "epoch": 9.99,
900
- "step": 1330,
901
- "total_flos": 1.3257453564799912e+19,
902
- "train_loss": 0.3000902961967583,
903
- "train_runtime": 6004.4248,
904
- "train_samples_per_second": 28.507,
905
- "train_steps_per_second": 0.222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
906
  }
907
  ],
908
- "max_steps": 1330,
909
  "num_train_epochs": 10,
910
- "total_flos": 1.3257453564799912e+19,
911
  "trial_name": null,
912
  "trial_params": null
913
  }
 
1
  {
2
+ "best_metric": 0.8761682242990654,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-cassava/checkpoint-4280",
4
+ "epoch": 10.0,
5
+ "global_step": 5350,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.02,
12
+ "learning_rate": 9.345794392523364e-07,
13
+ "loss": 1.6171,
14
  "step": 10
15
  },
16
  {
17
+ "epoch": 0.04,
18
+ "learning_rate": 1.8691588785046728e-06,
19
+ "loss": 1.6057,
20
  "step": 20
21
  },
22
  {
23
+ "epoch": 0.06,
24
+ "learning_rate": 2.8037383177570094e-06,
25
+ "loss": 1.5864,
26
  "step": 30
27
  },
28
  {
29
+ "epoch": 0.07,
30
+ "learning_rate": 3.7383177570093455e-06,
31
+ "loss": 1.5484,
32
  "step": 40
33
  },
34
  {
35
+ "epoch": 0.09,
36
+ "learning_rate": 4.6728971962616825e-06,
37
+ "loss": 1.4915,
38
  "step": 50
39
  },
40
  {
41
+ "epoch": 0.11,
42
+ "learning_rate": 5.607476635514019e-06,
43
+ "loss": 1.4295,
44
  "step": 60
45
  },
46
  {
47
+ "epoch": 0.13,
48
+ "learning_rate": 6.542056074766355e-06,
49
+ "loss": 1.306,
50
  "step": 70
51
  },
52
  {
53
+ "epoch": 0.15,
54
+ "learning_rate": 7.476635514018691e-06,
55
+ "loss": 1.2262,
56
  "step": 80
57
  },
58
  {
59
+ "epoch": 0.17,
60
+ "learning_rate": 8.411214953271028e-06,
61
+ "loss": 1.1393,
62
  "step": 90
63
  },
64
  {
65
+ "epoch": 0.19,
66
+ "learning_rate": 9.345794392523365e-06,
67
+ "loss": 1.0983,
68
  "step": 100
69
  },
70
  {
71
+ "epoch": 0.21,
72
+ "learning_rate": 1.02803738317757e-05,
73
+ "loss": 1.1016,
74
  "step": 110
75
  },
76
  {
77
+ "epoch": 0.22,
78
+ "learning_rate": 1.1214953271028037e-05,
79
+ "loss": 1.0129,
80
  "step": 120
81
  },
82
  {
83
+ "epoch": 0.24,
84
+ "learning_rate": 1.2149532710280374e-05,
85
+ "loss": 1.0331,
86
  "step": 130
87
  },
88
  {
89
+ "epoch": 0.26,
90
+ "learning_rate": 1.308411214953271e-05,
91
+ "loss": 0.9059,
 
 
 
 
 
 
 
 
 
92
  "step": 140
93
  },
94
  {
95
+ "epoch": 0.28,
96
+ "learning_rate": 1.4018691588785047e-05,
97
+ "loss": 0.9411,
98
  "step": 150
99
  },
100
  {
101
+ "epoch": 0.3,
102
+ "learning_rate": 1.4953271028037382e-05,
103
+ "loss": 0.8249,
104
  "step": 160
105
  },
106
  {
107
+ "epoch": 0.32,
108
+ "learning_rate": 1.588785046728972e-05,
109
+ "loss": 0.8729,
110
  "step": 170
111
  },
112
  {
113
+ "epoch": 0.34,
114
+ "learning_rate": 1.6822429906542056e-05,
115
+ "loss": 0.8896,
116
  "step": 180
117
  },
118
  {
119
+ "epoch": 0.36,
120
+ "learning_rate": 1.775700934579439e-05,
121
+ "loss": 0.8313,
122
  "step": 190
123
  },
124
  {
125
+ "epoch": 0.37,
126
+ "learning_rate": 1.869158878504673e-05,
127
+ "loss": 0.7836,
128
  "step": 200
129
  },
130
  {
131
+ "epoch": 0.39,
132
+ "learning_rate": 1.9626168224299065e-05,
133
+ "loss": 0.7587,
134
  "step": 210
135
  },
136
  {
137
+ "epoch": 0.41,
138
+ "learning_rate": 2.05607476635514e-05,
139
+ "loss": 0.6824,
140
  "step": 220
141
  },
142
  {
143
+ "epoch": 0.43,
144
+ "learning_rate": 2.149532710280374e-05,
145
+ "loss": 0.7291,
146
  "step": 230
147
  },
148
  {
149
+ "epoch": 0.45,
150
+ "learning_rate": 2.2429906542056075e-05,
151
+ "loss": 0.7577,
152
  "step": 240
153
  },
154
  {
155
+ "epoch": 0.47,
156
+ "learning_rate": 2.3364485981308414e-05,
157
+ "loss": 0.7538,
158
  "step": 250
159
  },
160
  {
161
+ "epoch": 0.49,
162
+ "learning_rate": 2.429906542056075e-05,
163
+ "loss": 0.6234,
164
  "step": 260
165
  },
166
  {
167
+ "epoch": 0.5,
168
+ "learning_rate": 2.5233644859813084e-05,
169
+ "loss": 0.6444,
 
 
 
 
 
 
 
 
 
170
  "step": 270
171
  },
172
  {
173
+ "epoch": 0.52,
174
+ "learning_rate": 2.616822429906542e-05,
175
+ "loss": 0.6348,
176
  "step": 280
177
  },
178
  {
179
+ "epoch": 0.54,
180
+ "learning_rate": 2.7102803738317755e-05,
181
+ "loss": 0.6086,
182
  "step": 290
183
  },
184
  {
185
+ "epoch": 0.56,
186
+ "learning_rate": 2.8037383177570094e-05,
187
+ "loss": 0.6258,
188
  "step": 300
189
  },
190
  {
191
+ "epoch": 0.58,
192
+ "learning_rate": 2.897196261682243e-05,
193
+ "loss": 0.6055,
194
  "step": 310
195
  },
196
  {
197
+ "epoch": 0.6,
198
+ "learning_rate": 2.9906542056074764e-05,
199
+ "loss": 0.5749,
200
  "step": 320
201
  },
202
  {
203
+ "epoch": 0.62,
204
+ "learning_rate": 3.08411214953271e-05,
205
+ "loss": 0.595,
206
  "step": 330
207
  },
208
  {
209
+ "epoch": 0.64,
210
+ "learning_rate": 3.177570093457944e-05,
211
+ "loss": 0.541,
212
  "step": 340
213
  },
214
  {
215
+ "epoch": 0.65,
216
+ "learning_rate": 3.2710280373831774e-05,
217
+ "loss": 0.597,
218
  "step": 350
219
  },
220
  {
221
+ "epoch": 0.67,
222
+ "learning_rate": 3.364485981308411e-05,
223
+ "loss": 0.465,
224
  "step": 360
225
  },
226
  {
227
+ "epoch": 0.69,
228
+ "learning_rate": 3.457943925233645e-05,
229
+ "loss": 0.5728,
230
  "step": 370
231
  },
232
  {
233
+ "epoch": 0.71,
234
+ "learning_rate": 3.551401869158878e-05,
235
+ "loss": 0.5068,
236
  "step": 380
237
  },
238
  {
239
+ "epoch": 0.73,
240
+ "learning_rate": 3.644859813084112e-05,
241
+ "loss": 0.5928,
242
  "step": 390
243
  },
244
  {
245
+ "epoch": 0.75,
246
+ "learning_rate": 3.738317757009346e-05,
247
+ "loss": 0.5488,
 
 
 
 
 
 
 
 
 
248
  "step": 400
249
  },
250
  {
251
+ "epoch": 0.77,
252
+ "learning_rate": 3.831775700934579e-05,
253
+ "loss": 0.5565,
254
  "step": 410
255
  },
256
  {
257
+ "epoch": 0.79,
258
+ "learning_rate": 3.925233644859813e-05,
259
+ "loss": 0.5147,
260
  "step": 420
261
  },
262
  {
263
+ "epoch": 0.8,
264
+ "learning_rate": 4.018691588785047e-05,
265
+ "loss": 0.5709,
266
  "step": 430
267
  },
268
  {
269
+ "epoch": 0.82,
270
+ "learning_rate": 4.11214953271028e-05,
271
+ "loss": 0.5086,
272
  "step": 440
273
  },
274
  {
275
+ "epoch": 0.84,
276
+ "learning_rate": 4.205607476635514e-05,
277
+ "loss": 0.4941,
278
  "step": 450
279
  },
280
  {
281
+ "epoch": 0.86,
282
+ "learning_rate": 4.299065420560748e-05,
283
+ "loss": 0.492,
284
  "step": 460
285
  },
286
  {
287
+ "epoch": 0.88,
288
+ "learning_rate": 4.392523364485982e-05,
289
+ "loss": 0.522,
290
  "step": 470
291
  },
292
  {
293
+ "epoch": 0.9,
294
+ "learning_rate": 4.485981308411215e-05,
295
+ "loss": 0.5502,
296
  "step": 480
297
  },
298
  {
299
+ "epoch": 0.92,
300
+ "learning_rate": 4.579439252336449e-05,
301
+ "loss": 0.5002,
302
  "step": 490
303
  },
304
  {
305
+ "epoch": 0.93,
306
+ "learning_rate": 4.672897196261683e-05,
307
+ "loss": 0.5522,
308
  "step": 500
309
  },
310
  {
311
+ "epoch": 0.95,
312
+ "learning_rate": 4.766355140186916e-05,
313
+ "loss": 0.5166,
314
  "step": 510
315
  },
316
  {
317
+ "epoch": 0.97,
318
+ "learning_rate": 4.85981308411215e-05,
319
+ "loss": 0.5825,
320
  "step": 520
321
  },
322
  {
323
+ "epoch": 0.99,
324
+ "learning_rate": 4.9532710280373836e-05,
325
+ "loss": 0.5531,
326
  "step": 530
327
  },
328
  {
329
+ "epoch": 1.0,
330
+ "eval_accuracy": 0.8336448598130841,
331
+ "eval_loss": 0.4937918782234192,
332
+ "eval_runtime": 96.0141,
333
+ "eval_samples_per_second": 44.577,
334
+ "eval_steps_per_second": 5.572,
335
+ "step": 535
336
  },
337
  {
338
+ "epoch": 1.01,
339
+ "learning_rate": 4.994807892004154e-05,
340
+ "loss": 0.491,
341
  "step": 540
342
  },
343
  {
344
+ "epoch": 1.03,
345
+ "learning_rate": 4.9844236760124614e-05,
346
+ "loss": 0.4679,
347
  "step": 550
348
  },
349
  {
350
+ "epoch": 1.05,
351
+ "learning_rate": 4.974039460020769e-05,
352
+ "loss": 0.4584,
353
  "step": 560
354
  },
355
  {
356
+ "epoch": 1.07,
357
+ "learning_rate": 4.963655244029076e-05,
358
+ "loss": 0.5556,
359
  "step": 570
360
  },
361
  {
362
+ "epoch": 1.08,
363
+ "learning_rate": 4.9532710280373836e-05,
364
+ "loss": 0.4374,
365
  "step": 580
366
  },
367
  {
368
+ "epoch": 1.1,
369
+ "learning_rate": 4.9428868120456904e-05,
370
+ "loss": 0.4499,
371
  "step": 590
372
  },
373
  {
374
+ "epoch": 1.12,
375
+ "learning_rate": 4.9325025960539985e-05,
376
+ "loss": 0.5201,
377
  "step": 600
378
  },
379
  {
380
+ "epoch": 1.14,
381
+ "learning_rate": 4.922118380062305e-05,
382
+ "loss": 0.6573,
383
  "step": 610
384
  },
385
  {
386
+ "epoch": 1.16,
387
+ "learning_rate": 4.9117341640706127e-05,
388
+ "loss": 0.5029,
389
  "step": 620
390
  },
391
  {
392
+ "epoch": 1.18,
393
+ "learning_rate": 4.901349948078921e-05,
394
+ "loss": 0.5457,
395
  "step": 630
396
  },
397
  {
398
+ "epoch": 1.2,
399
+ "learning_rate": 4.8909657320872275e-05,
400
+ "loss": 0.492,
401
  "step": 640
402
  },
403
  {
404
+ "epoch": 1.21,
405
+ "learning_rate": 4.880581516095535e-05,
406
+ "loss": 0.3407,
407
  "step": 650
408
  },
409
  {
410
+ "epoch": 1.23,
411
+ "learning_rate": 4.8701973001038423e-05,
412
+ "loss": 0.5129,
413
  "step": 660
414
  },
415
  {
416
+ "epoch": 1.25,
417
+ "learning_rate": 4.85981308411215e-05,
418
+ "loss": 0.4626,
 
 
 
 
 
 
 
 
 
419
  "step": 670
420
  },
421
  {
422
+ "epoch": 1.27,
423
+ "learning_rate": 4.849428868120457e-05,
424
+ "loss": 0.5528,
425
  "step": 680
426
  },
427
  {
428
+ "epoch": 1.29,
429
+ "learning_rate": 4.8390446521287646e-05,
430
+ "loss": 0.4977,
431
  "step": 690
432
  },
433
  {
434
+ "epoch": 1.31,
435
+ "learning_rate": 4.828660436137072e-05,
436
+ "loss": 0.387,
437
  "step": 700
438
  },
439
  {
440
+ "epoch": 1.33,
441
+ "learning_rate": 4.818276220145379e-05,
442
+ "loss": 0.5349,
443
  "step": 710
444
  },
445
  {
446
+ "epoch": 1.35,
447
+ "learning_rate": 4.807892004153687e-05,
448
+ "loss": 0.4265,
449
  "step": 720
450
  },
451
  {
452
+ "epoch": 1.36,
453
+ "learning_rate": 4.797507788161994e-05,
454
+ "loss": 0.4719,
455
  "step": 730
456
  },
457
  {
458
+ "epoch": 1.38,
459
+ "learning_rate": 4.787123572170301e-05,
460
+ "loss": 0.5005,
461
  "step": 740
462
  },
463
  {
464
+ "epoch": 1.4,
465
+ "learning_rate": 4.776739356178609e-05,
466
+ "loss": 0.5112,
467
  "step": 750
468
  },
469
  {
470
+ "epoch": 1.42,
471
+ "learning_rate": 4.766355140186916e-05,
472
+ "loss": 0.5112,
473
  "step": 760
474
  },
475
  {
476
+ "epoch": 1.44,
477
+ "learning_rate": 4.755970924195223e-05,
478
+ "loss": 0.3827,
479
  "step": 770
480
  },
481
  {
482
+ "epoch": 1.46,
483
+ "learning_rate": 4.745586708203531e-05,
484
+ "loss": 0.5576,
485
  "step": 780
486
  },
487
  {
488
+ "epoch": 1.48,
489
+ "learning_rate": 4.735202492211838e-05,
490
+ "loss": 0.3974,
491
  "step": 790
492
  },
493
  {
494
+ "epoch": 1.5,
495
+ "learning_rate": 4.7248182762201456e-05,
496
+ "loss": 0.5209,
 
 
 
 
 
 
 
 
 
497
  "step": 800
498
  },
499
  {
500
+ "epoch": 1.51,
501
+ "learning_rate": 4.714434060228453e-05,
502
+ "loss": 0.4721,
503
  "step": 810
504
  },
505
  {
506
+ "epoch": 1.53,
507
+ "learning_rate": 4.7040498442367604e-05,
508
+ "loss": 0.385,
509
  "step": 820
510
  },
511
  {
512
+ "epoch": 1.55,
513
+ "learning_rate": 4.693665628245067e-05,
514
+ "loss": 0.4874,
515
  "step": 830
516
  },
517
  {
518
+ "epoch": 1.57,
519
+ "learning_rate": 4.683281412253375e-05,
520
+ "loss": 0.4409,
521
  "step": 840
522
  },
523
  {
524
+ "epoch": 1.59,
525
+ "learning_rate": 4.672897196261683e-05,
526
+ "loss": 0.506,
527
  "step": 850
528
  },
529
  {
530
+ "epoch": 1.61,
531
+ "learning_rate": 4.6625129802699895e-05,
532
+ "loss": 0.4446,
533
  "step": 860
534
  },
535
  {
536
+ "epoch": 1.63,
537
+ "learning_rate": 4.6521287642782976e-05,
538
+ "loss": 0.4086,
539
  "step": 870
540
  },
541
  {
542
+ "epoch": 1.64,
543
+ "learning_rate": 4.641744548286604e-05,
544
+ "loss": 0.3869,
545
  "step": 880
546
  },
547
  {
548
+ "epoch": 1.66,
549
+ "learning_rate": 4.631360332294912e-05,
550
+ "loss": 0.395,
551
  "step": 890
552
  },
553
  {
554
+ "epoch": 1.68,
555
+ "learning_rate": 4.620976116303219e-05,
556
+ "loss": 0.464,
557
  "step": 900
558
  },
559
  {
560
+ "epoch": 1.7,
561
+ "learning_rate": 4.6105919003115266e-05,
562
+ "loss": 0.3581,
563
  "step": 910
564
  },
565
  {
566
+ "epoch": 1.72,
567
+ "learning_rate": 4.600207684319834e-05,
568
+ "loss": 0.387,
569
  "step": 920
570
  },
571
  {
572
+ "epoch": 1.74,
573
+ "learning_rate": 4.5898234683281414e-05,
574
+ "loss": 0.4066,
575
  "step": 930
576
  },
577
  {
578
+ "epoch": 1.76,
579
+ "learning_rate": 4.579439252336449e-05,
580
+ "loss": 0.3997,
 
 
 
 
 
 
 
 
 
581
  "step": 940
582
  },
583
  {
584
+ "epoch": 1.78,
585
+ "learning_rate": 4.569055036344756e-05,
586
+ "loss": 0.4705,
587
  "step": 950
588
  },
589
  {
590
+ "epoch": 1.79,
591
+ "learning_rate": 4.558670820353064e-05,
592
+ "loss": 0.4063,
593
  "step": 960
594
  },
595
  {
596
+ "epoch": 1.81,
597
+ "learning_rate": 4.548286604361371e-05,
598
+ "loss": 0.4227,
599
  "step": 970
600
  },
601
  {
602
+ "epoch": 1.83,
603
+ "learning_rate": 4.537902388369678e-05,
604
+ "loss": 0.4844,
605
  "step": 980
606
  },
607
  {
608
+ "epoch": 1.85,
609
+ "learning_rate": 4.527518172377986e-05,
610
+ "loss": 0.4631,
611
  "step": 990
612
  },
613
  {
614
+ "epoch": 1.87,
615
+ "learning_rate": 4.517133956386293e-05,
616
+ "loss": 0.4847,
617
  "step": 1000
618
  },
619
  {
620
+ "epoch": 1.89,
621
+ "learning_rate": 4.5067497403946e-05,
622
+ "loss": 0.4112,
623
  "step": 1010
624
  },
625
  {
626
+ "epoch": 1.91,
627
+ "learning_rate": 4.496365524402908e-05,
628
+ "loss": 0.3964,
629
  "step": 1020
630
  },
631
  {
632
+ "epoch": 1.93,
633
+ "learning_rate": 4.485981308411215e-05,
634
+ "loss": 0.4341,
635
  "step": 1030
636
  },
637
  {
638
+ "epoch": 1.94,
639
+ "learning_rate": 4.4755970924195224e-05,
640
+ "loss": 0.5106,
641
  "step": 1040
642
  },
643
  {
644
+ "epoch": 1.96,
645
+ "learning_rate": 4.46521287642783e-05,
646
+ "loss": 0.4269,
647
  "step": 1050
648
  },
649
  {
650
+ "epoch": 1.98,
651
+ "learning_rate": 4.454828660436137e-05,
652
+ "loss": 0.413,
653
  "step": 1060
654
  },
655
  {
656
+ "epoch": 2.0,
657
+ "learning_rate": 4.4444444444444447e-05,
658
+ "loss": 0.4139,
659
+ "step": 1070
 
 
 
660
  },
661
  {
662
+ "epoch": 2.0,
663
+ "eval_accuracy": 0.861214953271028,
664
+ "eval_loss": 0.40707165002822876,
665
+ "eval_runtime": 99.8719,
666
+ "eval_samples_per_second": 42.855,
667
+ "eval_steps_per_second": 5.357,
668
  "step": 1070
669
  },
670
  {
671
+ "epoch": 2.02,
672
+ "learning_rate": 4.434060228452752e-05,
673
+ "loss": 0.395,
674
  "step": 1080
675
  },
676
  {
677
+ "epoch": 2.04,
678
+ "learning_rate": 4.4236760124610595e-05,
679
+ "loss": 0.3672,
680
  "step": 1090
681
  },
682
  {
683
+ "epoch": 2.06,
684
+ "learning_rate": 4.413291796469366e-05,
685
+ "loss": 0.3994,
686
  "step": 1100
687
  },
688
  {
689
+ "epoch": 2.07,
690
+ "learning_rate": 4.4029075804776743e-05,
691
+ "loss": 0.3818,
692
  "step": 1110
693
  },
694
  {
695
+ "epoch": 2.09,
696
+ "learning_rate": 4.392523364485982e-05,
697
+ "loss": 0.4013,
698
  "step": 1120
699
  },
700
  {
701
+ "epoch": 2.11,
702
+ "learning_rate": 4.3821391484942885e-05,
703
+ "loss": 0.3409,
704
  "step": 1130
705
  },
706
  {
707
+ "epoch": 2.13,
708
+ "learning_rate": 4.3717549325025966e-05,
709
+ "loss": 0.4753,
710
  "step": 1140
711
  },
712
  {
713
+ "epoch": 2.15,
714
+ "learning_rate": 4.3613707165109034e-05,
715
+ "loss": 0.3671,
716
  "step": 1150
717
  },
718
  {
719
+ "epoch": 2.17,
720
+ "learning_rate": 4.350986500519211e-05,
721
+ "loss": 0.462,
722
  "step": 1160
723
  },
724
  {
725
+ "epoch": 2.19,
726
+ "learning_rate": 4.340602284527518e-05,
727
+ "loss": 0.3729,
728
  "step": 1170
729
  },
730
  {
731
+ "epoch": 2.21,
732
+ "learning_rate": 4.3302180685358256e-05,
733
+ "loss": 0.3876,
734
  "step": 1180
735
  },
736
  {
737
+ "epoch": 2.22,
738
+ "learning_rate": 4.319833852544133e-05,
739
+ "loss": 0.3794,
740
  "step": 1190
741
  },
742
  {
743
+ "epoch": 2.24,
744
+ "learning_rate": 4.3094496365524405e-05,
745
+ "loss": 0.3609,
 
 
 
 
 
 
 
 
 
746
  "step": 1200
747
  },
748
  {
749
+ "epoch": 2.26,
750
+ "learning_rate": 4.299065420560748e-05,
751
+ "loss": 0.4161,
752
  "step": 1210
753
  },
754
  {
755
+ "epoch": 2.28,
756
+ "learning_rate": 4.2886812045690546e-05,
757
+ "loss": 0.5202,
758
  "step": 1220
759
  },
760
  {
761
+ "epoch": 2.3,
762
+ "learning_rate": 4.278296988577363e-05,
763
+ "loss": 0.4579,
764
  "step": 1230
765
  },
766
  {
767
+ "epoch": 2.32,
768
+ "learning_rate": 4.26791277258567e-05,
769
+ "loss": 0.408,
770
  "step": 1240
771
  },
772
  {
773
+ "epoch": 2.34,
774
+ "learning_rate": 4.257528556593977e-05,
775
+ "loss": 0.36,
776
  "step": 1250
777
  },
778
  {
779
+ "epoch": 2.36,
780
+ "learning_rate": 4.247144340602285e-05,
781
+ "loss": 0.3109,
782
  "step": 1260
783
  },
784
  {
785
+ "epoch": 2.37,
786
+ "learning_rate": 4.236760124610592e-05,
787
+ "loss": 0.3713,
788
  "step": 1270
789
  },
790
  {
791
+ "epoch": 2.39,
792
+ "learning_rate": 4.226375908618899e-05,
793
+ "loss": 0.4329,
794
  "step": 1280
795
  },
796
  {
797
+ "epoch": 2.41,
798
+ "learning_rate": 4.2159916926272066e-05,
799
+ "loss": 0.3612,
800
  "step": 1290
801
  },
802
  {
803
+ "epoch": 2.43,
804
+ "learning_rate": 4.205607476635514e-05,
805
+ "loss": 0.387,
806
  "step": 1300
807
  },
808
  {
809
+ "epoch": 2.45,
810
+ "learning_rate": 4.1952232606438215e-05,
811
+ "loss": 0.4461,
812
  "step": 1310
813
  },
814
  {
815
+ "epoch": 2.47,
816
+ "learning_rate": 4.184839044652129e-05,
817
+ "loss": 0.2903,
818
  "step": 1320
819
  },
820
  {
821
+ "epoch": 2.49,
822
+ "learning_rate": 4.174454828660436e-05,
823
+ "loss": 0.328,
824
  "step": 1330
825
  },
826
  {
827
+ "epoch": 2.5,
828
+ "learning_rate": 4.164070612668744e-05,
829
+ "loss": 0.4151,
830
+ "step": 1340
831
+ },
832
+ {
833
+ "epoch": 2.52,
834
+ "learning_rate": 4.153686396677051e-05,
835
+ "loss": 0.3888,
836
+ "step": 1350
837
+ },
838
+ {
839
+ "epoch": 2.54,
840
+ "learning_rate": 4.1433021806853586e-05,
841
+ "loss": 0.3871,
842
+ "step": 1360
843
+ },
844
+ {
845
+ "epoch": 2.56,
846
+ "learning_rate": 4.132917964693666e-05,
847
+ "loss": 0.4304,
848
+ "step": 1370
849
+ },
850
+ {
851
+ "epoch": 2.58,
852
+ "learning_rate": 4.1225337487019734e-05,
853
+ "loss": 0.387,
854
+ "step": 1380
855
+ },
856
+ {
857
+ "epoch": 2.6,
858
+ "learning_rate": 4.11214953271028e-05,
859
+ "loss": 0.3658,
860
+ "step": 1390
861
+ },
862
+ {
863
+ "epoch": 2.62,
864
+ "learning_rate": 4.101765316718588e-05,
865
+ "loss": 0.4494,
866
+ "step": 1400
867
+ },
868
+ {
869
+ "epoch": 2.64,
870
+ "learning_rate": 4.091381100726896e-05,
871
+ "loss": 0.3355,
872
+ "step": 1410
873
+ },
874
+ {
875
+ "epoch": 2.65,
876
+ "learning_rate": 4.0809968847352024e-05,
877
+ "loss": 0.3368,
878
+ "step": 1420
879
+ },
880
+ {
881
+ "epoch": 2.67,
882
+ "learning_rate": 4.0706126687435105e-05,
883
+ "loss": 0.4087,
884
+ "step": 1430
885
+ },
886
+ {
887
+ "epoch": 2.69,
888
+ "learning_rate": 4.060228452751817e-05,
889
+ "loss": 0.4279,
890
+ "step": 1440
891
+ },
892
+ {
893
+ "epoch": 2.71,
894
+ "learning_rate": 4.049844236760125e-05,
895
+ "loss": 0.3548,
896
+ "step": 1450
897
+ },
898
+ {
899
+ "epoch": 2.73,
900
+ "learning_rate": 4.039460020768432e-05,
901
+ "loss": 0.4466,
902
+ "step": 1460
903
+ },
904
+ {
905
+ "epoch": 2.75,
906
+ "learning_rate": 4.0290758047767395e-05,
907
+ "loss": 0.4133,
908
+ "step": 1470
909
+ },
910
+ {
911
+ "epoch": 2.77,
912
+ "learning_rate": 4.018691588785047e-05,
913
+ "loss": 0.3845,
914
+ "step": 1480
915
+ },
916
+ {
917
+ "epoch": 2.79,
918
+ "learning_rate": 4.0083073727933544e-05,
919
+ "loss": 0.4471,
920
+ "step": 1490
921
+ },
922
+ {
923
+ "epoch": 2.8,
924
+ "learning_rate": 3.997923156801662e-05,
925
+ "loss": 0.4421,
926
+ "step": 1500
927
+ },
928
+ {
929
+ "epoch": 2.82,
930
+ "learning_rate": 3.987538940809969e-05,
931
+ "loss": 0.4383,
932
+ "step": 1510
933
+ },
934
+ {
935
+ "epoch": 2.84,
936
+ "learning_rate": 3.9771547248182767e-05,
937
+ "loss": 0.3326,
938
+ "step": 1520
939
+ },
940
+ {
941
+ "epoch": 2.86,
942
+ "learning_rate": 3.966770508826584e-05,
943
+ "loss": 0.4058,
944
+ "step": 1530
945
+ },
946
+ {
947
+ "epoch": 2.88,
948
+ "learning_rate": 3.956386292834891e-05,
949
+ "loss": 0.3653,
950
+ "step": 1540
951
+ },
952
+ {
953
+ "epoch": 2.9,
954
+ "learning_rate": 3.946002076843199e-05,
955
+ "loss": 0.4157,
956
+ "step": 1550
957
+ },
958
+ {
959
+ "epoch": 2.92,
960
+ "learning_rate": 3.935617860851506e-05,
961
+ "loss": 0.3944,
962
+ "step": 1560
963
+ },
964
+ {
965
+ "epoch": 2.93,
966
+ "learning_rate": 3.925233644859813e-05,
967
+ "loss": 0.4103,
968
+ "step": 1570
969
+ },
970
+ {
971
+ "epoch": 2.95,
972
+ "learning_rate": 3.914849428868121e-05,
973
+ "loss": 0.4601,
974
+ "step": 1580
975
+ },
976
+ {
977
+ "epoch": 2.97,
978
+ "learning_rate": 3.904465212876428e-05,
979
+ "loss": 0.3561,
980
+ "step": 1590
981
+ },
982
+ {
983
+ "epoch": 2.99,
984
+ "learning_rate": 3.8940809968847354e-05,
985
+ "loss": 0.287,
986
+ "step": 1600
987
+ },
988
+ {
989
+ "epoch": 3.0,
990
+ "eval_accuracy": 0.8642523364485981,
991
+ "eval_loss": 0.39544418454170227,
992
+ "eval_runtime": 95.8501,
993
+ "eval_samples_per_second": 44.653,
994
+ "eval_steps_per_second": 5.582,
995
+ "step": 1605
996
+ },
997
+ {
998
+ "epoch": 3.01,
999
+ "learning_rate": 3.883696780893043e-05,
1000
+ "loss": 0.3781,
1001
+ "step": 1610
1002
+ },
1003
+ {
1004
+ "epoch": 3.03,
1005
+ "learning_rate": 3.87331256490135e-05,
1006
+ "loss": 0.3999,
1007
+ "step": 1620
1008
+ },
1009
+ {
1010
+ "epoch": 3.05,
1011
+ "learning_rate": 3.8629283489096576e-05,
1012
+ "loss": 0.3507,
1013
+ "step": 1630
1014
+ },
1015
+ {
1016
+ "epoch": 3.07,
1017
+ "learning_rate": 3.852544132917965e-05,
1018
+ "loss": 0.354,
1019
+ "step": 1640
1020
+ },
1021
+ {
1022
+ "epoch": 3.08,
1023
+ "learning_rate": 3.8421599169262725e-05,
1024
+ "loss": 0.4233,
1025
+ "step": 1650
1026
+ },
1027
+ {
1028
+ "epoch": 3.1,
1029
+ "learning_rate": 3.831775700934579e-05,
1030
+ "loss": 0.3766,
1031
+ "step": 1660
1032
+ },
1033
+ {
1034
+ "epoch": 3.12,
1035
+ "learning_rate": 3.821391484942887e-05,
1036
+ "loss": 0.4494,
1037
+ "step": 1670
1038
+ },
1039
+ {
1040
+ "epoch": 3.14,
1041
+ "learning_rate": 3.811007268951195e-05,
1042
+ "loss": 0.3772,
1043
+ "step": 1680
1044
+ },
1045
+ {
1046
+ "epoch": 3.16,
1047
+ "learning_rate": 3.8006230529595015e-05,
1048
+ "loss": 0.4079,
1049
+ "step": 1690
1050
+ },
1051
+ {
1052
+ "epoch": 3.18,
1053
+ "learning_rate": 3.7902388369678096e-05,
1054
+ "loss": 0.3796,
1055
+ "step": 1700
1056
+ },
1057
+ {
1058
+ "epoch": 3.2,
1059
+ "learning_rate": 3.779854620976116e-05,
1060
+ "loss": 0.2899,
1061
+ "step": 1710
1062
+ },
1063
+ {
1064
+ "epoch": 3.21,
1065
+ "learning_rate": 3.769470404984424e-05,
1066
+ "loss": 0.357,
1067
+ "step": 1720
1068
+ },
1069
+ {
1070
+ "epoch": 3.23,
1071
+ "learning_rate": 3.759086188992731e-05,
1072
+ "loss": 0.3922,
1073
+ "step": 1730
1074
+ },
1075
+ {
1076
+ "epoch": 3.25,
1077
+ "learning_rate": 3.7487019730010386e-05,
1078
+ "loss": 0.3743,
1079
+ "step": 1740
1080
+ },
1081
+ {
1082
+ "epoch": 3.27,
1083
+ "learning_rate": 3.738317757009346e-05,
1084
+ "loss": 0.3275,
1085
+ "step": 1750
1086
+ },
1087
+ {
1088
+ "epoch": 3.29,
1089
+ "learning_rate": 3.7279335410176535e-05,
1090
+ "loss": 0.3128,
1091
+ "step": 1760
1092
+ },
1093
+ {
1094
+ "epoch": 3.31,
1095
+ "learning_rate": 3.717549325025961e-05,
1096
+ "loss": 0.3351,
1097
+ "step": 1770
1098
+ },
1099
+ {
1100
+ "epoch": 3.33,
1101
+ "learning_rate": 3.7071651090342676e-05,
1102
+ "loss": 0.354,
1103
+ "step": 1780
1104
+ },
1105
+ {
1106
+ "epoch": 3.35,
1107
+ "learning_rate": 3.696780893042576e-05,
1108
+ "loss": 0.3604,
1109
+ "step": 1790
1110
+ },
1111
+ {
1112
+ "epoch": 3.36,
1113
+ "learning_rate": 3.686396677050883e-05,
1114
+ "loss": 0.389,
1115
+ "step": 1800
1116
+ },
1117
+ {
1118
+ "epoch": 3.38,
1119
+ "learning_rate": 3.67601246105919e-05,
1120
+ "loss": 0.3892,
1121
+ "step": 1810
1122
+ },
1123
+ {
1124
+ "epoch": 3.4,
1125
+ "learning_rate": 3.665628245067498e-05,
1126
+ "loss": 0.3525,
1127
+ "step": 1820
1128
+ },
1129
+ {
1130
+ "epoch": 3.42,
1131
+ "learning_rate": 3.655244029075805e-05,
1132
+ "loss": 0.3725,
1133
+ "step": 1830
1134
+ },
1135
+ {
1136
+ "epoch": 3.44,
1137
+ "learning_rate": 3.644859813084112e-05,
1138
+ "loss": 0.3909,
1139
+ "step": 1840
1140
+ },
1141
+ {
1142
+ "epoch": 3.46,
1143
+ "learning_rate": 3.6344755970924196e-05,
1144
+ "loss": 0.2884,
1145
+ "step": 1850
1146
+ },
1147
+ {
1148
+ "epoch": 3.48,
1149
+ "learning_rate": 3.624091381100727e-05,
1150
+ "loss": 0.3384,
1151
+ "step": 1860
1152
+ },
1153
+ {
1154
+ "epoch": 3.5,
1155
+ "learning_rate": 3.6137071651090344e-05,
1156
+ "loss": 0.3645,
1157
+ "step": 1870
1158
+ },
1159
+ {
1160
+ "epoch": 3.51,
1161
+ "learning_rate": 3.603322949117342e-05,
1162
+ "loss": 0.3407,
1163
+ "step": 1880
1164
+ },
1165
+ {
1166
+ "epoch": 3.53,
1167
+ "learning_rate": 3.592938733125649e-05,
1168
+ "loss": 0.3847,
1169
+ "step": 1890
1170
+ },
1171
+ {
1172
+ "epoch": 3.55,
1173
+ "learning_rate": 3.582554517133957e-05,
1174
+ "loss": 0.3862,
1175
+ "step": 1900
1176
+ },
1177
+ {
1178
+ "epoch": 3.57,
1179
+ "learning_rate": 3.572170301142264e-05,
1180
+ "loss": 0.3831,
1181
+ "step": 1910
1182
+ },
1183
+ {
1184
+ "epoch": 3.59,
1185
+ "learning_rate": 3.5617860851505715e-05,
1186
+ "loss": 0.3147,
1187
+ "step": 1920
1188
+ },
1189
+ {
1190
+ "epoch": 3.61,
1191
+ "learning_rate": 3.551401869158878e-05,
1192
+ "loss": 0.3686,
1193
+ "step": 1930
1194
  },
1195
  {
1196
+ "epoch": 3.63,
1197
+ "learning_rate": 3.5410176531671864e-05,
1198
+ "loss": 0.3913,
1199
+ "step": 1940
1200
+ },
1201
+ {
1202
+ "epoch": 3.64,
1203
+ "learning_rate": 3.530633437175493e-05,
1204
+ "loss": 0.3717,
1205
+ "step": 1950
1206
+ },
1207
+ {
1208
+ "epoch": 3.66,
1209
+ "learning_rate": 3.5202492211838006e-05,
1210
+ "loss": 0.4135,
1211
+ "step": 1960
1212
+ },
1213
+ {
1214
+ "epoch": 3.68,
1215
+ "learning_rate": 3.5098650051921087e-05,
1216
+ "loss": 0.372,
1217
+ "step": 1970
1218
+ },
1219
+ {
1220
+ "epoch": 3.7,
1221
+ "learning_rate": 3.4994807892004154e-05,
1222
+ "loss": 0.3738,
1223
+ "step": 1980
1224
+ },
1225
+ {
1226
+ "epoch": 3.72,
1227
+ "learning_rate": 3.489096573208723e-05,
1228
+ "loss": 0.3847,
1229
+ "step": 1990
1230
+ },
1231
+ {
1232
+ "epoch": 3.74,
1233
+ "learning_rate": 3.47871235721703e-05,
1234
+ "loss": 0.3893,
1235
+ "step": 2000
1236
+ },
1237
+ {
1238
+ "epoch": 3.76,
1239
+ "learning_rate": 3.468328141225338e-05,
1240
+ "loss": 0.3045,
1241
+ "step": 2010
1242
+ },
1243
+ {
1244
+ "epoch": 3.78,
1245
+ "learning_rate": 3.457943925233645e-05,
1246
+ "loss": 0.3242,
1247
+ "step": 2020
1248
+ },
1249
+ {
1250
+ "epoch": 3.79,
1251
+ "learning_rate": 3.4475597092419525e-05,
1252
+ "loss": 0.3417,
1253
+ "step": 2030
1254
+ },
1255
+ {
1256
+ "epoch": 3.81,
1257
+ "learning_rate": 3.43717549325026e-05,
1258
+ "loss": 0.3568,
1259
+ "step": 2040
1260
+ },
1261
+ {
1262
+ "epoch": 3.83,
1263
+ "learning_rate": 3.426791277258567e-05,
1264
+ "loss": 0.3225,
1265
+ "step": 2050
1266
+ },
1267
+ {
1268
+ "epoch": 3.85,
1269
+ "learning_rate": 3.416407061266875e-05,
1270
+ "loss": 0.3341,
1271
+ "step": 2060
1272
+ },
1273
+ {
1274
+ "epoch": 3.87,
1275
+ "learning_rate": 3.406022845275182e-05,
1276
+ "loss": 0.326,
1277
+ "step": 2070
1278
+ },
1279
+ {
1280
+ "epoch": 3.89,
1281
+ "learning_rate": 3.395638629283489e-05,
1282
+ "loss": 0.3508,
1283
+ "step": 2080
1284
+ },
1285
+ {
1286
+ "epoch": 3.91,
1287
+ "learning_rate": 3.385254413291797e-05,
1288
+ "loss": 0.333,
1289
+ "step": 2090
1290
+ },
1291
+ {
1292
+ "epoch": 3.93,
1293
+ "learning_rate": 3.374870197300104e-05,
1294
+ "loss": 0.4373,
1295
+ "step": 2100
1296
+ },
1297
+ {
1298
+ "epoch": 3.94,
1299
+ "learning_rate": 3.364485981308411e-05,
1300
+ "loss": 0.3769,
1301
+ "step": 2110
1302
+ },
1303
+ {
1304
+ "epoch": 3.96,
1305
+ "learning_rate": 3.3541017653167186e-05,
1306
+ "loss": 0.3697,
1307
+ "step": 2120
1308
+ },
1309
+ {
1310
+ "epoch": 3.98,
1311
+ "learning_rate": 3.343717549325026e-05,
1312
+ "loss": 0.3187,
1313
+ "step": 2130
1314
+ },
1315
+ {
1316
+ "epoch": 4.0,
1317
+ "learning_rate": 3.3333333333333335e-05,
1318
+ "loss": 0.4211,
1319
+ "step": 2140
1320
+ },
1321
+ {
1322
+ "epoch": 4.0,
1323
+ "eval_accuracy": 0.8700934579439252,
1324
+ "eval_loss": 0.39060959219932556,
1325
+ "eval_runtime": 93.1195,
1326
+ "eval_samples_per_second": 45.962,
1327
+ "eval_steps_per_second": 5.745,
1328
+ "step": 2140
1329
+ },
1330
+ {
1331
+ "epoch": 4.02,
1332
+ "learning_rate": 3.322949117341641e-05,
1333
+ "loss": 0.2857,
1334
+ "step": 2150
1335
+ },
1336
+ {
1337
+ "epoch": 4.04,
1338
+ "learning_rate": 3.3125649013499483e-05,
1339
+ "loss": 0.3271,
1340
+ "step": 2160
1341
+ },
1342
+ {
1343
+ "epoch": 4.06,
1344
+ "learning_rate": 3.302180685358255e-05,
1345
+ "loss": 0.3109,
1346
+ "step": 2170
1347
+ },
1348
+ {
1349
+ "epoch": 4.07,
1350
+ "learning_rate": 3.291796469366563e-05,
1351
+ "loss": 0.2679,
1352
+ "step": 2180
1353
+ },
1354
+ {
1355
+ "epoch": 4.09,
1356
+ "learning_rate": 3.2814122533748706e-05,
1357
+ "loss": 0.3242,
1358
+ "step": 2190
1359
+ },
1360
+ {
1361
+ "epoch": 4.11,
1362
+ "learning_rate": 3.2710280373831774e-05,
1363
+ "loss": 0.3083,
1364
+ "step": 2200
1365
+ },
1366
+ {
1367
+ "epoch": 4.13,
1368
+ "learning_rate": 3.2606438213914855e-05,
1369
+ "loss": 0.3325,
1370
+ "step": 2210
1371
+ },
1372
+ {
1373
+ "epoch": 4.15,
1374
+ "learning_rate": 3.250259605399792e-05,
1375
+ "loss": 0.3989,
1376
+ "step": 2220
1377
+ },
1378
+ {
1379
+ "epoch": 4.17,
1380
+ "learning_rate": 3.2398753894080996e-05,
1381
+ "loss": 0.3044,
1382
+ "step": 2230
1383
+ },
1384
+ {
1385
+ "epoch": 4.19,
1386
+ "learning_rate": 3.229491173416408e-05,
1387
+ "loss": 0.3389,
1388
+ "step": 2240
1389
+ },
1390
+ {
1391
+ "epoch": 4.21,
1392
+ "learning_rate": 3.2191069574247145e-05,
1393
+ "loss": 0.3284,
1394
+ "step": 2250
1395
+ },
1396
+ {
1397
+ "epoch": 4.22,
1398
+ "learning_rate": 3.208722741433022e-05,
1399
+ "loss": 0.2777,
1400
+ "step": 2260
1401
+ },
1402
+ {
1403
+ "epoch": 4.24,
1404
+ "learning_rate": 3.198338525441329e-05,
1405
+ "loss": 0.3531,
1406
+ "step": 2270
1407
+ },
1408
+ {
1409
+ "epoch": 4.26,
1410
+ "learning_rate": 3.187954309449637e-05,
1411
+ "loss": 0.3578,
1412
+ "step": 2280
1413
+ },
1414
+ {
1415
+ "epoch": 4.28,
1416
+ "learning_rate": 3.177570093457944e-05,
1417
+ "loss": 0.375,
1418
+ "step": 2290
1419
+ },
1420
+ {
1421
+ "epoch": 4.3,
1422
+ "learning_rate": 3.1671858774662516e-05,
1423
+ "loss": 0.3912,
1424
+ "step": 2300
1425
+ },
1426
+ {
1427
+ "epoch": 4.32,
1428
+ "learning_rate": 3.156801661474559e-05,
1429
+ "loss": 0.3632,
1430
+ "step": 2310
1431
+ },
1432
+ {
1433
+ "epoch": 4.34,
1434
+ "learning_rate": 3.146417445482866e-05,
1435
+ "loss": 0.3453,
1436
+ "step": 2320
1437
+ },
1438
+ {
1439
+ "epoch": 4.36,
1440
+ "learning_rate": 3.136033229491174e-05,
1441
+ "loss": 0.3412,
1442
+ "step": 2330
1443
+ },
1444
+ {
1445
+ "epoch": 4.37,
1446
+ "learning_rate": 3.1256490134994806e-05,
1447
+ "loss": 0.3506,
1448
+ "step": 2340
1449
+ },
1450
+ {
1451
+ "epoch": 4.39,
1452
+ "learning_rate": 3.115264797507788e-05,
1453
+ "loss": 0.3178,
1454
+ "step": 2350
1455
+ },
1456
+ {
1457
+ "epoch": 4.41,
1458
+ "learning_rate": 3.104880581516096e-05,
1459
+ "loss": 0.3787,
1460
+ "step": 2360
1461
+ },
1462
+ {
1463
+ "epoch": 4.43,
1464
+ "learning_rate": 3.094496365524403e-05,
1465
+ "loss": 0.2522,
1466
+ "step": 2370
1467
+ },
1468
+ {
1469
+ "epoch": 4.45,
1470
+ "learning_rate": 3.08411214953271e-05,
1471
+ "loss": 0.2986,
1472
+ "step": 2380
1473
+ },
1474
+ {
1475
+ "epoch": 4.47,
1476
+ "learning_rate": 3.073727933541018e-05,
1477
+ "loss": 0.3647,
1478
+ "step": 2390
1479
+ },
1480
+ {
1481
+ "epoch": 4.49,
1482
+ "learning_rate": 3.063343717549325e-05,
1483
+ "loss": 0.2927,
1484
+ "step": 2400
1485
+ },
1486
+ {
1487
+ "epoch": 4.5,
1488
+ "learning_rate": 3.0529595015576326e-05,
1489
+ "loss": 0.3868,
1490
+ "step": 2410
1491
+ },
1492
+ {
1493
+ "epoch": 4.52,
1494
+ "learning_rate": 3.04257528556594e-05,
1495
+ "loss": 0.2927,
1496
+ "step": 2420
1497
+ },
1498
+ {
1499
+ "epoch": 4.54,
1500
+ "learning_rate": 3.0321910695742474e-05,
1501
+ "loss": 0.2755,
1502
+ "step": 2430
1503
+ },
1504
+ {
1505
+ "epoch": 4.56,
1506
+ "learning_rate": 3.0218068535825545e-05,
1507
+ "loss": 0.2953,
1508
+ "step": 2440
1509
+ },
1510
+ {
1511
+ "epoch": 4.58,
1512
+ "learning_rate": 3.0114226375908622e-05,
1513
+ "loss": 0.2946,
1514
+ "step": 2450
1515
+ },
1516
+ {
1517
+ "epoch": 4.6,
1518
+ "learning_rate": 3.0010384215991693e-05,
1519
+ "loss": 0.3302,
1520
+ "step": 2460
1521
+ },
1522
+ {
1523
+ "epoch": 4.62,
1524
+ "learning_rate": 2.9906542056074764e-05,
1525
+ "loss": 0.406,
1526
+ "step": 2470
1527
+ },
1528
+ {
1529
+ "epoch": 4.64,
1530
+ "learning_rate": 2.9802699896157842e-05,
1531
+ "loss": 0.323,
1532
+ "step": 2480
1533
+ },
1534
+ {
1535
+ "epoch": 4.65,
1536
+ "learning_rate": 2.9698857736240916e-05,
1537
+ "loss": 0.3808,
1538
+ "step": 2490
1539
+ },
1540
+ {
1541
+ "epoch": 4.67,
1542
+ "learning_rate": 2.9595015576323987e-05,
1543
+ "loss": 0.3269,
1544
+ "step": 2500
1545
+ },
1546
+ {
1547
+ "epoch": 4.69,
1548
+ "learning_rate": 2.9491173416407064e-05,
1549
+ "loss": 0.3703,
1550
+ "step": 2510
1551
+ },
1552
+ {
1553
+ "epoch": 4.71,
1554
+ "learning_rate": 2.9387331256490135e-05,
1555
+ "loss": 0.3173,
1556
+ "step": 2520
1557
+ },
1558
+ {
1559
+ "epoch": 4.73,
1560
+ "learning_rate": 2.9283489096573206e-05,
1561
+ "loss": 0.2749,
1562
+ "step": 2530
1563
+ },
1564
+ {
1565
+ "epoch": 4.75,
1566
+ "learning_rate": 2.9179646936656284e-05,
1567
+ "loss": 0.3128,
1568
+ "step": 2540
1569
+ },
1570
+ {
1571
+ "epoch": 4.77,
1572
+ "learning_rate": 2.9075804776739358e-05,
1573
+ "loss": 0.3725,
1574
+ "step": 2550
1575
+ },
1576
+ {
1577
+ "epoch": 4.79,
1578
+ "learning_rate": 2.897196261682243e-05,
1579
+ "loss": 0.3052,
1580
+ "step": 2560
1581
+ },
1582
+ {
1583
+ "epoch": 4.8,
1584
+ "learning_rate": 2.8868120456905506e-05,
1585
+ "loss": 0.3159,
1586
+ "step": 2570
1587
+ },
1588
+ {
1589
+ "epoch": 4.82,
1590
+ "learning_rate": 2.8764278296988577e-05,
1591
+ "loss": 0.27,
1592
+ "step": 2580
1593
+ },
1594
+ {
1595
+ "epoch": 4.84,
1596
+ "learning_rate": 2.866043613707165e-05,
1597
+ "loss": 0.3566,
1598
+ "step": 2590
1599
+ },
1600
+ {
1601
+ "epoch": 4.86,
1602
+ "learning_rate": 2.855659397715473e-05,
1603
+ "loss": 0.3622,
1604
+ "step": 2600
1605
+ },
1606
+ {
1607
+ "epoch": 4.88,
1608
+ "learning_rate": 2.84527518172378e-05,
1609
+ "loss": 0.3318,
1610
+ "step": 2610
1611
+ },
1612
+ {
1613
+ "epoch": 4.9,
1614
+ "learning_rate": 2.834890965732087e-05,
1615
+ "loss": 0.334,
1616
+ "step": 2620
1617
+ },
1618
+ {
1619
+ "epoch": 4.92,
1620
+ "learning_rate": 2.824506749740395e-05,
1621
+ "loss": 0.3084,
1622
+ "step": 2630
1623
+ },
1624
+ {
1625
+ "epoch": 4.93,
1626
+ "learning_rate": 2.814122533748702e-05,
1627
+ "loss": 0.3524,
1628
+ "step": 2640
1629
+ },
1630
+ {
1631
+ "epoch": 4.95,
1632
+ "learning_rate": 2.8037383177570094e-05,
1633
+ "loss": 0.3473,
1634
+ "step": 2650
1635
+ },
1636
+ {
1637
+ "epoch": 4.97,
1638
+ "learning_rate": 2.793354101765317e-05,
1639
+ "loss": 0.3593,
1640
+ "step": 2660
1641
+ },
1642
+ {
1643
+ "epoch": 4.99,
1644
+ "learning_rate": 2.7829698857736242e-05,
1645
+ "loss": 0.316,
1646
+ "step": 2670
1647
+ },
1648
+ {
1649
+ "epoch": 5.0,
1650
+ "eval_accuracy": 0.8754672897196262,
1651
+ "eval_loss": 0.37160804867744446,
1652
+ "eval_runtime": 95.1891,
1653
+ "eval_samples_per_second": 44.963,
1654
+ "eval_steps_per_second": 5.62,
1655
+ "step": 2675
1656
+ },
1657
+ {
1658
+ "epoch": 5.01,
1659
+ "learning_rate": 2.7725856697819313e-05,
1660
+ "loss": 0.2738,
1661
+ "step": 2680
1662
+ },
1663
+ {
1664
+ "epoch": 5.03,
1665
+ "learning_rate": 2.762201453790239e-05,
1666
+ "loss": 0.342,
1667
+ "step": 2690
1668
+ },
1669
+ {
1670
+ "epoch": 5.05,
1671
+ "learning_rate": 2.751817237798546e-05,
1672
+ "loss": 0.3448,
1673
+ "step": 2700
1674
+ },
1675
+ {
1676
+ "epoch": 5.07,
1677
+ "learning_rate": 2.7414330218068536e-05,
1678
+ "loss": 0.3099,
1679
+ "step": 2710
1680
+ },
1681
+ {
1682
+ "epoch": 5.08,
1683
+ "learning_rate": 2.7310488058151613e-05,
1684
+ "loss": 0.3368,
1685
+ "step": 2720
1686
+ },
1687
+ {
1688
+ "epoch": 5.1,
1689
+ "learning_rate": 2.7206645898234684e-05,
1690
+ "loss": 0.2743,
1691
+ "step": 2730
1692
+ },
1693
+ {
1694
+ "epoch": 5.12,
1695
+ "learning_rate": 2.7102803738317755e-05,
1696
+ "loss": 0.3064,
1697
+ "step": 2740
1698
+ },
1699
+ {
1700
+ "epoch": 5.14,
1701
+ "learning_rate": 2.6998961578400832e-05,
1702
+ "loss": 0.2109,
1703
+ "step": 2750
1704
+ },
1705
+ {
1706
+ "epoch": 5.16,
1707
+ "learning_rate": 2.6895119418483907e-05,
1708
+ "loss": 0.3611,
1709
+ "step": 2760
1710
+ },
1711
+ {
1712
+ "epoch": 5.18,
1713
+ "learning_rate": 2.6791277258566978e-05,
1714
+ "loss": 0.3456,
1715
+ "step": 2770
1716
+ },
1717
+ {
1718
+ "epoch": 5.2,
1719
+ "learning_rate": 2.6687435098650055e-05,
1720
+ "loss": 0.2599,
1721
+ "step": 2780
1722
+ },
1723
+ {
1724
+ "epoch": 5.21,
1725
+ "learning_rate": 2.6583592938733126e-05,
1726
+ "loss": 0.2951,
1727
+ "step": 2790
1728
+ },
1729
+ {
1730
+ "epoch": 5.23,
1731
+ "learning_rate": 2.6479750778816197e-05,
1732
+ "loss": 0.1794,
1733
+ "step": 2800
1734
+ },
1735
+ {
1736
+ "epoch": 5.25,
1737
+ "learning_rate": 2.6375908618899274e-05,
1738
+ "loss": 0.2954,
1739
+ "step": 2810
1740
+ },
1741
+ {
1742
+ "epoch": 5.27,
1743
+ "learning_rate": 2.627206645898235e-05,
1744
+ "loss": 0.298,
1745
+ "step": 2820
1746
+ },
1747
+ {
1748
+ "epoch": 5.29,
1749
+ "learning_rate": 2.616822429906542e-05,
1750
+ "loss": 0.3187,
1751
+ "step": 2830
1752
+ },
1753
+ {
1754
+ "epoch": 5.31,
1755
+ "learning_rate": 2.6064382139148497e-05,
1756
+ "loss": 0.3297,
1757
+ "step": 2840
1758
+ },
1759
+ {
1760
+ "epoch": 5.33,
1761
+ "learning_rate": 2.5960539979231568e-05,
1762
+ "loss": 0.2431,
1763
+ "step": 2850
1764
+ },
1765
+ {
1766
+ "epoch": 5.35,
1767
+ "learning_rate": 2.585669781931464e-05,
1768
+ "loss": 0.3967,
1769
+ "step": 2860
1770
+ },
1771
+ {
1772
+ "epoch": 5.36,
1773
+ "learning_rate": 2.5752855659397716e-05,
1774
+ "loss": 0.3722,
1775
+ "step": 2870
1776
+ },
1777
+ {
1778
+ "epoch": 5.38,
1779
+ "learning_rate": 2.564901349948079e-05,
1780
+ "loss": 0.2752,
1781
+ "step": 2880
1782
+ },
1783
+ {
1784
+ "epoch": 5.4,
1785
+ "learning_rate": 2.554517133956386e-05,
1786
+ "loss": 0.2971,
1787
+ "step": 2890
1788
+ },
1789
+ {
1790
+ "epoch": 5.42,
1791
+ "learning_rate": 2.544132917964694e-05,
1792
+ "loss": 0.2804,
1793
+ "step": 2900
1794
+ },
1795
+ {
1796
+ "epoch": 5.44,
1797
+ "learning_rate": 2.533748701973001e-05,
1798
+ "loss": 0.3715,
1799
+ "step": 2910
1800
+ },
1801
+ {
1802
+ "epoch": 5.46,
1803
+ "learning_rate": 2.5233644859813084e-05,
1804
+ "loss": 0.2833,
1805
+ "step": 2920
1806
+ },
1807
+ {
1808
+ "epoch": 5.48,
1809
+ "learning_rate": 2.512980269989616e-05,
1810
+ "loss": 0.2891,
1811
+ "step": 2930
1812
+ },
1813
+ {
1814
+ "epoch": 5.5,
1815
+ "learning_rate": 2.5025960539979233e-05,
1816
+ "loss": 0.3122,
1817
+ "step": 2940
1818
+ },
1819
+ {
1820
+ "epoch": 5.51,
1821
+ "learning_rate": 2.4922118380062307e-05,
1822
+ "loss": 0.2891,
1823
+ "step": 2950
1824
+ },
1825
+ {
1826
+ "epoch": 5.53,
1827
+ "learning_rate": 2.481827622014538e-05,
1828
+ "loss": 0.3167,
1829
+ "step": 2960
1830
+ },
1831
+ {
1832
+ "epoch": 5.55,
1833
+ "learning_rate": 2.4714434060228452e-05,
1834
+ "loss": 0.3505,
1835
+ "step": 2970
1836
+ },
1837
+ {
1838
+ "epoch": 5.57,
1839
+ "learning_rate": 2.4610591900311526e-05,
1840
+ "loss": 0.3255,
1841
+ "step": 2980
1842
+ },
1843
+ {
1844
+ "epoch": 5.59,
1845
+ "learning_rate": 2.4506749740394604e-05,
1846
+ "loss": 0.2708,
1847
+ "step": 2990
1848
+ },
1849
+ {
1850
+ "epoch": 5.61,
1851
+ "learning_rate": 2.4402907580477675e-05,
1852
+ "loss": 0.3709,
1853
+ "step": 3000
1854
+ },
1855
+ {
1856
+ "epoch": 5.63,
1857
+ "learning_rate": 2.429906542056075e-05,
1858
+ "loss": 0.3623,
1859
+ "step": 3010
1860
+ },
1861
+ {
1862
+ "epoch": 5.64,
1863
+ "learning_rate": 2.4195223260643823e-05,
1864
+ "loss": 0.2488,
1865
+ "step": 3020
1866
+ },
1867
+ {
1868
+ "epoch": 5.66,
1869
+ "learning_rate": 2.4091381100726894e-05,
1870
+ "loss": 0.3012,
1871
+ "step": 3030
1872
+ },
1873
+ {
1874
+ "epoch": 5.68,
1875
+ "learning_rate": 2.398753894080997e-05,
1876
+ "loss": 0.318,
1877
+ "step": 3040
1878
+ },
1879
+ {
1880
+ "epoch": 5.7,
1881
+ "learning_rate": 2.3883696780893046e-05,
1882
+ "loss": 0.3596,
1883
+ "step": 3050
1884
+ },
1885
+ {
1886
+ "epoch": 5.72,
1887
+ "learning_rate": 2.3779854620976117e-05,
1888
+ "loss": 0.2923,
1889
+ "step": 3060
1890
+ },
1891
+ {
1892
+ "epoch": 5.74,
1893
+ "learning_rate": 2.367601246105919e-05,
1894
+ "loss": 0.3677,
1895
+ "step": 3070
1896
+ },
1897
+ {
1898
+ "epoch": 5.76,
1899
+ "learning_rate": 2.3572170301142265e-05,
1900
+ "loss": 0.2508,
1901
+ "step": 3080
1902
+ },
1903
+ {
1904
+ "epoch": 5.78,
1905
+ "learning_rate": 2.3468328141225336e-05,
1906
+ "loss": 0.2464,
1907
+ "step": 3090
1908
+ },
1909
+ {
1910
+ "epoch": 5.79,
1911
+ "learning_rate": 2.3364485981308414e-05,
1912
+ "loss": 0.3364,
1913
+ "step": 3100
1914
+ },
1915
+ {
1916
+ "epoch": 5.81,
1917
+ "learning_rate": 2.3260643821391488e-05,
1918
+ "loss": 0.2931,
1919
+ "step": 3110
1920
+ },
1921
+ {
1922
+ "epoch": 5.83,
1923
+ "learning_rate": 2.315680166147456e-05,
1924
+ "loss": 0.2962,
1925
+ "step": 3120
1926
+ },
1927
+ {
1928
+ "epoch": 5.85,
1929
+ "learning_rate": 2.3052959501557633e-05,
1930
+ "loss": 0.282,
1931
+ "step": 3130
1932
+ },
1933
+ {
1934
+ "epoch": 5.87,
1935
+ "learning_rate": 2.2949117341640707e-05,
1936
+ "loss": 0.2978,
1937
+ "step": 3140
1938
+ },
1939
+ {
1940
+ "epoch": 5.89,
1941
+ "learning_rate": 2.284527518172378e-05,
1942
+ "loss": 0.2524,
1943
+ "step": 3150
1944
+ },
1945
+ {
1946
+ "epoch": 5.91,
1947
+ "learning_rate": 2.2741433021806856e-05,
1948
+ "loss": 0.3384,
1949
+ "step": 3160
1950
+ },
1951
+ {
1952
+ "epoch": 5.93,
1953
+ "learning_rate": 2.263759086188993e-05,
1954
+ "loss": 0.2936,
1955
+ "step": 3170
1956
+ },
1957
+ {
1958
+ "epoch": 5.94,
1959
+ "learning_rate": 2.2533748701973e-05,
1960
+ "loss": 0.2521,
1961
+ "step": 3180
1962
+ },
1963
+ {
1964
+ "epoch": 5.96,
1965
+ "learning_rate": 2.2429906542056075e-05,
1966
+ "loss": 0.3685,
1967
+ "step": 3190
1968
+ },
1969
+ {
1970
+ "epoch": 5.98,
1971
+ "learning_rate": 2.232606438213915e-05,
1972
+ "loss": 0.2391,
1973
+ "step": 3200
1974
+ },
1975
+ {
1976
+ "epoch": 6.0,
1977
+ "learning_rate": 2.2222222222222223e-05,
1978
+ "loss": 0.2709,
1979
+ "step": 3210
1980
+ },
1981
+ {
1982
+ "epoch": 6.0,
1983
+ "eval_accuracy": 0.8735981308411215,
1984
+ "eval_loss": 0.37839841842651367,
1985
+ "eval_runtime": 92.127,
1986
+ "eval_samples_per_second": 46.458,
1987
+ "eval_steps_per_second": 5.807,
1988
+ "step": 3210
1989
+ },
1990
+ {
1991
+ "epoch": 6.02,
1992
+ "learning_rate": 2.2118380062305298e-05,
1993
+ "loss": 0.2305,
1994
+ "step": 3220
1995
+ },
1996
+ {
1997
+ "epoch": 6.04,
1998
+ "learning_rate": 2.2014537902388372e-05,
1999
+ "loss": 0.2758,
2000
+ "step": 3230
2001
+ },
2002
+ {
2003
+ "epoch": 6.06,
2004
+ "learning_rate": 2.1910695742471443e-05,
2005
+ "loss": 0.2602,
2006
+ "step": 3240
2007
+ },
2008
+ {
2009
+ "epoch": 6.07,
2010
+ "learning_rate": 2.1806853582554517e-05,
2011
+ "loss": 0.311,
2012
+ "step": 3250
2013
+ },
2014
+ {
2015
+ "epoch": 6.09,
2016
+ "learning_rate": 2.170301142263759e-05,
2017
+ "loss": 0.3315,
2018
+ "step": 3260
2019
+ },
2020
+ {
2021
+ "epoch": 6.11,
2022
+ "learning_rate": 2.1599169262720665e-05,
2023
+ "loss": 0.3078,
2024
+ "step": 3270
2025
+ },
2026
+ {
2027
+ "epoch": 6.13,
2028
+ "learning_rate": 2.149532710280374e-05,
2029
+ "loss": 0.319,
2030
+ "step": 3280
2031
+ },
2032
+ {
2033
+ "epoch": 6.15,
2034
+ "learning_rate": 2.1391484942886814e-05,
2035
+ "loss": 0.3326,
2036
+ "step": 3290
2037
+ },
2038
+ {
2039
+ "epoch": 6.17,
2040
+ "learning_rate": 2.1287642782969885e-05,
2041
+ "loss": 0.2892,
2042
+ "step": 3300
2043
+ },
2044
+ {
2045
+ "epoch": 6.19,
2046
+ "learning_rate": 2.118380062305296e-05,
2047
+ "loss": 0.2669,
2048
+ "step": 3310
2049
+ },
2050
+ {
2051
+ "epoch": 6.21,
2052
+ "learning_rate": 2.1079958463136033e-05,
2053
+ "loss": 0.2829,
2054
+ "step": 3320
2055
+ },
2056
+ {
2057
+ "epoch": 6.22,
2058
+ "learning_rate": 2.0976116303219107e-05,
2059
+ "loss": 0.2434,
2060
+ "step": 3330
2061
+ },
2062
+ {
2063
+ "epoch": 6.24,
2064
+ "learning_rate": 2.087227414330218e-05,
2065
+ "loss": 0.2422,
2066
+ "step": 3340
2067
+ },
2068
+ {
2069
+ "epoch": 6.26,
2070
+ "learning_rate": 2.0768431983385256e-05,
2071
+ "loss": 0.3655,
2072
+ "step": 3350
2073
+ },
2074
+ {
2075
+ "epoch": 6.28,
2076
+ "learning_rate": 2.066458982346833e-05,
2077
+ "loss": 0.2772,
2078
+ "step": 3360
2079
+ },
2080
+ {
2081
+ "epoch": 6.3,
2082
+ "learning_rate": 2.05607476635514e-05,
2083
+ "loss": 0.2542,
2084
+ "step": 3370
2085
+ },
2086
+ {
2087
+ "epoch": 6.32,
2088
+ "learning_rate": 2.045690550363448e-05,
2089
+ "loss": 0.2978,
2090
+ "step": 3380
2091
+ },
2092
+ {
2093
+ "epoch": 6.34,
2094
+ "learning_rate": 2.0353063343717553e-05,
2095
+ "loss": 0.2518,
2096
+ "step": 3390
2097
+ },
2098
+ {
2099
+ "epoch": 6.36,
2100
+ "learning_rate": 2.0249221183800623e-05,
2101
+ "loss": 0.2697,
2102
+ "step": 3400
2103
+ },
2104
+ {
2105
+ "epoch": 6.37,
2106
+ "learning_rate": 2.0145379023883698e-05,
2107
+ "loss": 0.2326,
2108
+ "step": 3410
2109
+ },
2110
+ {
2111
+ "epoch": 6.39,
2112
+ "learning_rate": 2.0041536863966772e-05,
2113
+ "loss": 0.29,
2114
+ "step": 3420
2115
+ },
2116
+ {
2117
+ "epoch": 6.41,
2118
+ "learning_rate": 1.9937694704049846e-05,
2119
+ "loss": 0.2858,
2120
+ "step": 3430
2121
+ },
2122
+ {
2123
+ "epoch": 6.43,
2124
+ "learning_rate": 1.983385254413292e-05,
2125
+ "loss": 0.2665,
2126
+ "step": 3440
2127
+ },
2128
+ {
2129
+ "epoch": 6.45,
2130
+ "learning_rate": 1.9730010384215995e-05,
2131
+ "loss": 0.2583,
2132
+ "step": 3450
2133
+ },
2134
+ {
2135
+ "epoch": 6.47,
2136
+ "learning_rate": 1.9626168224299065e-05,
2137
+ "loss": 0.3213,
2138
+ "step": 3460
2139
+ },
2140
+ {
2141
+ "epoch": 6.49,
2142
+ "learning_rate": 1.952232606438214e-05,
2143
+ "loss": 0.3385,
2144
+ "step": 3470
2145
+ },
2146
+ {
2147
+ "epoch": 6.5,
2148
+ "learning_rate": 1.9418483904465214e-05,
2149
+ "loss": 0.2492,
2150
+ "step": 3480
2151
+ },
2152
+ {
2153
+ "epoch": 6.52,
2154
+ "learning_rate": 1.9314641744548288e-05,
2155
+ "loss": 0.2829,
2156
+ "step": 3490
2157
+ },
2158
+ {
2159
+ "epoch": 6.54,
2160
+ "learning_rate": 1.9210799584631362e-05,
2161
+ "loss": 0.2651,
2162
+ "step": 3500
2163
+ },
2164
+ {
2165
+ "epoch": 6.56,
2166
+ "learning_rate": 1.9106957424714437e-05,
2167
+ "loss": 0.2671,
2168
+ "step": 3510
2169
+ },
2170
+ {
2171
+ "epoch": 6.58,
2172
+ "learning_rate": 1.9003115264797507e-05,
2173
+ "loss": 0.3116,
2174
+ "step": 3520
2175
+ },
2176
+ {
2177
+ "epoch": 6.6,
2178
+ "learning_rate": 1.889927310488058e-05,
2179
+ "loss": 0.3116,
2180
+ "step": 3530
2181
+ },
2182
+ {
2183
+ "epoch": 6.62,
2184
+ "learning_rate": 1.8795430944963656e-05,
2185
+ "loss": 0.2305,
2186
+ "step": 3540
2187
+ },
2188
+ {
2189
+ "epoch": 6.64,
2190
+ "learning_rate": 1.869158878504673e-05,
2191
+ "loss": 0.3035,
2192
+ "step": 3550
2193
+ },
2194
+ {
2195
+ "epoch": 6.65,
2196
+ "learning_rate": 1.8587746625129804e-05,
2197
+ "loss": 0.2388,
2198
+ "step": 3560
2199
+ },
2200
+ {
2201
+ "epoch": 6.67,
2202
+ "learning_rate": 1.848390446521288e-05,
2203
+ "loss": 0.2424,
2204
+ "step": 3570
2205
+ },
2206
+ {
2207
+ "epoch": 6.69,
2208
+ "learning_rate": 1.838006230529595e-05,
2209
+ "loss": 0.219,
2210
+ "step": 3580
2211
+ },
2212
+ {
2213
+ "epoch": 6.71,
2214
+ "learning_rate": 1.8276220145379024e-05,
2215
+ "loss": 0.212,
2216
+ "step": 3590
2217
+ },
2218
+ {
2219
+ "epoch": 6.73,
2220
+ "learning_rate": 1.8172377985462098e-05,
2221
+ "loss": 0.3682,
2222
+ "step": 3600
2223
+ },
2224
+ {
2225
+ "epoch": 6.75,
2226
+ "learning_rate": 1.8068535825545172e-05,
2227
+ "loss": 0.2882,
2228
+ "step": 3610
2229
+ },
2230
+ {
2231
+ "epoch": 6.77,
2232
+ "learning_rate": 1.7964693665628246e-05,
2233
+ "loss": 0.2846,
2234
+ "step": 3620
2235
+ },
2236
+ {
2237
+ "epoch": 6.79,
2238
+ "learning_rate": 1.786085150571132e-05,
2239
+ "loss": 0.276,
2240
+ "step": 3630
2241
+ },
2242
+ {
2243
+ "epoch": 6.8,
2244
+ "learning_rate": 1.775700934579439e-05,
2245
+ "loss": 0.214,
2246
+ "step": 3640
2247
+ },
2248
+ {
2249
+ "epoch": 6.82,
2250
+ "learning_rate": 1.7653167185877466e-05,
2251
+ "loss": 0.2351,
2252
+ "step": 3650
2253
+ },
2254
+ {
2255
+ "epoch": 6.84,
2256
+ "learning_rate": 1.7549325025960543e-05,
2257
+ "loss": 0.2428,
2258
+ "step": 3660
2259
+ },
2260
+ {
2261
+ "epoch": 6.86,
2262
+ "learning_rate": 1.7445482866043614e-05,
2263
+ "loss": 0.2289,
2264
+ "step": 3670
2265
+ },
2266
+ {
2267
+ "epoch": 6.88,
2268
+ "learning_rate": 1.734164070612669e-05,
2269
+ "loss": 0.2087,
2270
+ "step": 3680
2271
+ },
2272
+ {
2273
+ "epoch": 6.9,
2274
+ "learning_rate": 1.7237798546209763e-05,
2275
+ "loss": 0.2492,
2276
+ "step": 3690
2277
+ },
2278
+ {
2279
+ "epoch": 6.92,
2280
+ "learning_rate": 1.7133956386292833e-05,
2281
+ "loss": 0.2855,
2282
+ "step": 3700
2283
+ },
2284
+ {
2285
+ "epoch": 6.93,
2286
+ "learning_rate": 1.703011422637591e-05,
2287
+ "loss": 0.2701,
2288
+ "step": 3710
2289
+ },
2290
+ {
2291
+ "epoch": 6.95,
2292
+ "learning_rate": 1.6926272066458985e-05,
2293
+ "loss": 0.2635,
2294
+ "step": 3720
2295
+ },
2296
+ {
2297
+ "epoch": 6.97,
2298
+ "learning_rate": 1.6822429906542056e-05,
2299
+ "loss": 0.267,
2300
+ "step": 3730
2301
+ },
2302
+ {
2303
+ "epoch": 6.99,
2304
+ "learning_rate": 1.671858774662513e-05,
2305
+ "loss": 0.177,
2306
+ "step": 3740
2307
+ },
2308
+ {
2309
+ "epoch": 7.0,
2310
+ "eval_accuracy": 0.8745327102803738,
2311
+ "eval_loss": 0.37715595960617065,
2312
+ "eval_runtime": 95.1073,
2313
+ "eval_samples_per_second": 45.002,
2314
+ "eval_steps_per_second": 5.625,
2315
+ "step": 3745
2316
+ },
2317
+ {
2318
+ "epoch": 7.01,
2319
+ "learning_rate": 1.6614745586708205e-05,
2320
+ "loss": 0.307,
2321
+ "step": 3750
2322
+ },
2323
+ {
2324
+ "epoch": 7.03,
2325
+ "learning_rate": 1.6510903426791275e-05,
2326
+ "loss": 0.258,
2327
+ "step": 3760
2328
+ },
2329
+ {
2330
+ "epoch": 7.05,
2331
+ "learning_rate": 1.6407061266874353e-05,
2332
+ "loss": 0.2322,
2333
+ "step": 3770
2334
+ },
2335
+ {
2336
+ "epoch": 7.07,
2337
+ "learning_rate": 1.6303219106957427e-05,
2338
+ "loss": 0.187,
2339
+ "step": 3780
2340
+ },
2341
+ {
2342
+ "epoch": 7.08,
2343
+ "learning_rate": 1.6199376947040498e-05,
2344
+ "loss": 0.2838,
2345
+ "step": 3790
2346
+ },
2347
+ {
2348
+ "epoch": 7.1,
2349
+ "learning_rate": 1.6095534787123572e-05,
2350
+ "loss": 0.2353,
2351
+ "step": 3800
2352
+ },
2353
+ {
2354
+ "epoch": 7.12,
2355
+ "learning_rate": 1.5991692627206647e-05,
2356
+ "loss": 0.2661,
2357
+ "step": 3810
2358
+ },
2359
+ {
2360
+ "epoch": 7.14,
2361
+ "learning_rate": 1.588785046728972e-05,
2362
+ "loss": 0.1917,
2363
+ "step": 3820
2364
+ },
2365
+ {
2366
+ "epoch": 7.16,
2367
+ "learning_rate": 1.5784008307372795e-05,
2368
+ "loss": 0.1978,
2369
+ "step": 3830
2370
+ },
2371
+ {
2372
+ "epoch": 7.18,
2373
+ "learning_rate": 1.568016614745587e-05,
2374
+ "loss": 0.2688,
2375
+ "step": 3840
2376
+ },
2377
+ {
2378
+ "epoch": 7.2,
2379
+ "learning_rate": 1.557632398753894e-05,
2380
+ "loss": 0.2383,
2381
+ "step": 3850
2382
+ },
2383
+ {
2384
+ "epoch": 7.21,
2385
+ "learning_rate": 1.5472481827622014e-05,
2386
+ "loss": 0.2874,
2387
+ "step": 3860
2388
+ },
2389
+ {
2390
+ "epoch": 7.23,
2391
+ "learning_rate": 1.536863966770509e-05,
2392
+ "loss": 0.265,
2393
+ "step": 3870
2394
+ },
2395
+ {
2396
+ "epoch": 7.25,
2397
+ "learning_rate": 1.5264797507788163e-05,
2398
+ "loss": 0.1821,
2399
+ "step": 3880
2400
+ },
2401
+ {
2402
+ "epoch": 7.27,
2403
+ "learning_rate": 1.5160955347871237e-05,
2404
+ "loss": 0.193,
2405
+ "step": 3890
2406
+ },
2407
+ {
2408
+ "epoch": 7.29,
2409
+ "learning_rate": 1.5057113187954311e-05,
2410
+ "loss": 0.2681,
2411
+ "step": 3900
2412
+ },
2413
+ {
2414
+ "epoch": 7.31,
2415
+ "learning_rate": 1.4953271028037382e-05,
2416
+ "loss": 0.2406,
2417
+ "step": 3910
2418
+ },
2419
+ {
2420
+ "epoch": 7.33,
2421
+ "learning_rate": 1.4849428868120458e-05,
2422
+ "loss": 0.3214,
2423
+ "step": 3920
2424
+ },
2425
+ {
2426
+ "epoch": 7.35,
2427
+ "learning_rate": 1.4745586708203532e-05,
2428
+ "loss": 0.2521,
2429
+ "step": 3930
2430
+ },
2431
+ {
2432
+ "epoch": 7.36,
2433
+ "learning_rate": 1.4641744548286603e-05,
2434
+ "loss": 0.3427,
2435
+ "step": 3940
2436
+ },
2437
+ {
2438
+ "epoch": 7.38,
2439
+ "learning_rate": 1.4537902388369679e-05,
2440
+ "loss": 0.3272,
2441
+ "step": 3950
2442
+ },
2443
+ {
2444
+ "epoch": 7.4,
2445
+ "learning_rate": 1.4434060228452753e-05,
2446
+ "loss": 0.2336,
2447
+ "step": 3960
2448
+ },
2449
+ {
2450
+ "epoch": 7.42,
2451
+ "learning_rate": 1.4330218068535826e-05,
2452
+ "loss": 0.2209,
2453
+ "step": 3970
2454
+ },
2455
+ {
2456
+ "epoch": 7.44,
2457
+ "learning_rate": 1.42263759086189e-05,
2458
+ "loss": 0.222,
2459
+ "step": 3980
2460
+ },
2461
+ {
2462
+ "epoch": 7.46,
2463
+ "learning_rate": 1.4122533748701974e-05,
2464
+ "loss": 0.236,
2465
+ "step": 3990
2466
+ },
2467
+ {
2468
+ "epoch": 7.48,
2469
+ "learning_rate": 1.4018691588785047e-05,
2470
+ "loss": 0.2875,
2471
+ "step": 4000
2472
+ },
2473
+ {
2474
+ "epoch": 7.5,
2475
+ "learning_rate": 1.3914849428868121e-05,
2476
+ "loss": 0.2098,
2477
+ "step": 4010
2478
+ },
2479
+ {
2480
+ "epoch": 7.51,
2481
+ "learning_rate": 1.3811007268951195e-05,
2482
+ "loss": 0.324,
2483
+ "step": 4020
2484
+ },
2485
+ {
2486
+ "epoch": 7.53,
2487
+ "learning_rate": 1.3707165109034268e-05,
2488
+ "loss": 0.2038,
2489
+ "step": 4030
2490
+ },
2491
+ {
2492
+ "epoch": 7.55,
2493
+ "learning_rate": 1.3603322949117342e-05,
2494
+ "loss": 0.271,
2495
+ "step": 4040
2496
+ },
2497
+ {
2498
+ "epoch": 7.57,
2499
+ "learning_rate": 1.3499480789200416e-05,
2500
+ "loss": 0.296,
2501
+ "step": 4050
2502
+ },
2503
+ {
2504
+ "epoch": 7.59,
2505
+ "learning_rate": 1.3395638629283489e-05,
2506
+ "loss": 0.2557,
2507
+ "step": 4060
2508
+ },
2509
+ {
2510
+ "epoch": 7.61,
2511
+ "learning_rate": 1.3291796469366563e-05,
2512
+ "loss": 0.2788,
2513
+ "step": 4070
2514
+ },
2515
+ {
2516
+ "epoch": 7.63,
2517
+ "learning_rate": 1.3187954309449637e-05,
2518
+ "loss": 0.2684,
2519
+ "step": 4080
2520
+ },
2521
+ {
2522
+ "epoch": 7.64,
2523
+ "learning_rate": 1.308411214953271e-05,
2524
+ "loss": 0.2447,
2525
+ "step": 4090
2526
+ },
2527
+ {
2528
+ "epoch": 7.66,
2529
+ "learning_rate": 1.2980269989615784e-05,
2530
+ "loss": 0.287,
2531
+ "step": 4100
2532
+ },
2533
+ {
2534
+ "epoch": 7.68,
2535
+ "learning_rate": 1.2876427829698858e-05,
2536
+ "loss": 0.2186,
2537
+ "step": 4110
2538
+ },
2539
+ {
2540
+ "epoch": 7.7,
2541
+ "learning_rate": 1.277258566978193e-05,
2542
+ "loss": 0.1978,
2543
+ "step": 4120
2544
+ },
2545
+ {
2546
+ "epoch": 7.72,
2547
+ "learning_rate": 1.2668743509865005e-05,
2548
+ "loss": 0.2698,
2549
+ "step": 4130
2550
+ },
2551
+ {
2552
+ "epoch": 7.74,
2553
+ "learning_rate": 1.256490134994808e-05,
2554
+ "loss": 0.1624,
2555
+ "step": 4140
2556
+ },
2557
+ {
2558
+ "epoch": 7.76,
2559
+ "learning_rate": 1.2461059190031153e-05,
2560
+ "loss": 0.2591,
2561
+ "step": 4150
2562
+ },
2563
+ {
2564
+ "epoch": 7.78,
2565
+ "learning_rate": 1.2357217030114226e-05,
2566
+ "loss": 0.2035,
2567
+ "step": 4160
2568
+ },
2569
+ {
2570
+ "epoch": 7.79,
2571
+ "learning_rate": 1.2253374870197302e-05,
2572
+ "loss": 0.2285,
2573
+ "step": 4170
2574
+ },
2575
+ {
2576
+ "epoch": 7.81,
2577
+ "learning_rate": 1.2149532710280374e-05,
2578
+ "loss": 0.2257,
2579
+ "step": 4180
2580
+ },
2581
+ {
2582
+ "epoch": 7.83,
2583
+ "learning_rate": 1.2045690550363447e-05,
2584
+ "loss": 0.2555,
2585
+ "step": 4190
2586
+ },
2587
+ {
2588
+ "epoch": 7.85,
2589
+ "learning_rate": 1.1941848390446523e-05,
2590
+ "loss": 0.2192,
2591
+ "step": 4200
2592
+ },
2593
+ {
2594
+ "epoch": 7.87,
2595
+ "learning_rate": 1.1838006230529595e-05,
2596
+ "loss": 0.2169,
2597
+ "step": 4210
2598
+ },
2599
+ {
2600
+ "epoch": 7.89,
2601
+ "learning_rate": 1.1734164070612668e-05,
2602
+ "loss": 0.2914,
2603
+ "step": 4220
2604
+ },
2605
+ {
2606
+ "epoch": 7.91,
2607
+ "learning_rate": 1.1630321910695744e-05,
2608
+ "loss": 0.253,
2609
+ "step": 4230
2610
+ },
2611
+ {
2612
+ "epoch": 7.93,
2613
+ "learning_rate": 1.1526479750778816e-05,
2614
+ "loss": 0.2067,
2615
+ "step": 4240
2616
+ },
2617
+ {
2618
+ "epoch": 7.94,
2619
+ "learning_rate": 1.142263759086189e-05,
2620
+ "loss": 0.1995,
2621
+ "step": 4250
2622
+ },
2623
+ {
2624
+ "epoch": 7.96,
2625
+ "learning_rate": 1.1318795430944965e-05,
2626
+ "loss": 0.2902,
2627
+ "step": 4260
2628
+ },
2629
+ {
2630
+ "epoch": 7.98,
2631
+ "learning_rate": 1.1214953271028037e-05,
2632
+ "loss": 0.2051,
2633
+ "step": 4270
2634
+ },
2635
+ {
2636
+ "epoch": 8.0,
2637
+ "learning_rate": 1.1111111111111112e-05,
2638
+ "loss": 0.2409,
2639
+ "step": 4280
2640
+ },
2641
+ {
2642
+ "epoch": 8.0,
2643
+ "eval_accuracy": 0.8761682242990654,
2644
+ "eval_loss": 0.38753223419189453,
2645
+ "eval_runtime": 91.8924,
2646
+ "eval_samples_per_second": 46.576,
2647
+ "eval_steps_per_second": 5.822,
2648
+ "step": 4280
2649
+ },
2650
+ {
2651
+ "epoch": 8.02,
2652
+ "learning_rate": 1.1007268951194186e-05,
2653
+ "loss": 0.2436,
2654
+ "step": 4290
2655
+ },
2656
+ {
2657
+ "epoch": 8.04,
2658
+ "learning_rate": 1.0903426791277258e-05,
2659
+ "loss": 0.2468,
2660
+ "step": 4300
2661
+ },
2662
+ {
2663
+ "epoch": 8.06,
2664
+ "learning_rate": 1.0799584631360333e-05,
2665
+ "loss": 0.168,
2666
+ "step": 4310
2667
+ },
2668
+ {
2669
+ "epoch": 8.07,
2670
+ "learning_rate": 1.0695742471443407e-05,
2671
+ "loss": 0.2093,
2672
+ "step": 4320
2673
+ },
2674
+ {
2675
+ "epoch": 8.09,
2676
+ "learning_rate": 1.059190031152648e-05,
2677
+ "loss": 0.1616,
2678
+ "step": 4330
2679
+ },
2680
+ {
2681
+ "epoch": 8.11,
2682
+ "learning_rate": 1.0488058151609554e-05,
2683
+ "loss": 0.1912,
2684
+ "step": 4340
2685
+ },
2686
+ {
2687
+ "epoch": 8.13,
2688
+ "learning_rate": 1.0384215991692628e-05,
2689
+ "loss": 0.3157,
2690
+ "step": 4350
2691
+ },
2692
+ {
2693
+ "epoch": 8.15,
2694
+ "learning_rate": 1.02803738317757e-05,
2695
+ "loss": 0.2213,
2696
+ "step": 4360
2697
+ },
2698
+ {
2699
+ "epoch": 8.17,
2700
+ "learning_rate": 1.0176531671858776e-05,
2701
+ "loss": 0.2473,
2702
+ "step": 4370
2703
+ },
2704
+ {
2705
+ "epoch": 8.19,
2706
+ "learning_rate": 1.0072689511941849e-05,
2707
+ "loss": 0.2155,
2708
+ "step": 4380
2709
+ },
2710
+ {
2711
+ "epoch": 8.21,
2712
+ "learning_rate": 9.968847352024923e-06,
2713
+ "loss": 0.209,
2714
+ "step": 4390
2715
+ },
2716
+ {
2717
+ "epoch": 8.22,
2718
+ "learning_rate": 9.865005192107997e-06,
2719
+ "loss": 0.2552,
2720
+ "step": 4400
2721
+ },
2722
+ {
2723
+ "epoch": 8.24,
2724
+ "learning_rate": 9.76116303219107e-06,
2725
+ "loss": 0.2469,
2726
+ "step": 4410
2727
+ },
2728
+ {
2729
+ "epoch": 8.26,
2730
+ "learning_rate": 9.657320872274144e-06,
2731
+ "loss": 0.2195,
2732
+ "step": 4420
2733
+ },
2734
+ {
2735
+ "epoch": 8.28,
2736
+ "learning_rate": 9.553478712357218e-06,
2737
+ "loss": 0.2094,
2738
+ "step": 4430
2739
+ },
2740
+ {
2741
+ "epoch": 8.3,
2742
+ "learning_rate": 9.44963655244029e-06,
2743
+ "loss": 0.2593,
2744
+ "step": 4440
2745
+ },
2746
+ {
2747
+ "epoch": 8.32,
2748
+ "learning_rate": 9.345794392523365e-06,
2749
+ "loss": 0.254,
2750
+ "step": 4450
2751
+ },
2752
+ {
2753
+ "epoch": 8.34,
2754
+ "learning_rate": 9.24195223260644e-06,
2755
+ "loss": 0.1353,
2756
+ "step": 4460
2757
+ },
2758
+ {
2759
+ "epoch": 8.36,
2760
+ "learning_rate": 9.138110072689512e-06,
2761
+ "loss": 0.2152,
2762
+ "step": 4470
2763
+ },
2764
+ {
2765
+ "epoch": 8.37,
2766
+ "learning_rate": 9.034267912772586e-06,
2767
+ "loss": 0.2258,
2768
+ "step": 4480
2769
+ },
2770
+ {
2771
+ "epoch": 8.39,
2772
+ "learning_rate": 8.93042575285566e-06,
2773
+ "loss": 0.1593,
2774
+ "step": 4490
2775
+ },
2776
+ {
2777
+ "epoch": 8.41,
2778
+ "learning_rate": 8.826583592938733e-06,
2779
+ "loss": 0.1939,
2780
+ "step": 4500
2781
+ },
2782
+ {
2783
+ "epoch": 8.43,
2784
+ "learning_rate": 8.722741433021807e-06,
2785
+ "loss": 0.2717,
2786
+ "step": 4510
2787
+ },
2788
+ {
2789
+ "epoch": 8.45,
2790
+ "learning_rate": 8.618899273104881e-06,
2791
+ "loss": 0.2263,
2792
+ "step": 4520
2793
+ },
2794
+ {
2795
+ "epoch": 8.47,
2796
+ "learning_rate": 8.515057113187956e-06,
2797
+ "loss": 0.281,
2798
+ "step": 4530
2799
+ },
2800
+ {
2801
+ "epoch": 8.49,
2802
+ "learning_rate": 8.411214953271028e-06,
2803
+ "loss": 0.2438,
2804
+ "step": 4540
2805
+ },
2806
+ {
2807
+ "epoch": 8.5,
2808
+ "learning_rate": 8.307372793354102e-06,
2809
+ "loss": 0.1487,
2810
+ "step": 4550
2811
+ },
2812
+ {
2813
+ "epoch": 8.52,
2814
+ "learning_rate": 8.203530633437177e-06,
2815
+ "loss": 0.1811,
2816
+ "step": 4560
2817
+ },
2818
+ {
2819
+ "epoch": 8.54,
2820
+ "learning_rate": 8.099688473520249e-06,
2821
+ "loss": 0.2803,
2822
+ "step": 4570
2823
+ },
2824
+ {
2825
+ "epoch": 8.56,
2826
+ "learning_rate": 7.995846313603323e-06,
2827
+ "loss": 0.2339,
2828
+ "step": 4580
2829
+ },
2830
+ {
2831
+ "epoch": 8.58,
2832
+ "learning_rate": 7.892004153686398e-06,
2833
+ "loss": 0.2617,
2834
+ "step": 4590
2835
+ },
2836
+ {
2837
+ "epoch": 8.6,
2838
+ "learning_rate": 7.78816199376947e-06,
2839
+ "loss": 0.3019,
2840
+ "step": 4600
2841
+ },
2842
+ {
2843
+ "epoch": 8.62,
2844
+ "learning_rate": 7.684319833852544e-06,
2845
+ "loss": 0.2083,
2846
+ "step": 4610
2847
+ },
2848
+ {
2849
+ "epoch": 8.64,
2850
+ "learning_rate": 7.5804776739356185e-06,
2851
+ "loss": 0.24,
2852
+ "step": 4620
2853
+ },
2854
+ {
2855
+ "epoch": 8.65,
2856
+ "learning_rate": 7.476635514018691e-06,
2857
+ "loss": 0.2378,
2858
+ "step": 4630
2859
+ },
2860
+ {
2861
+ "epoch": 8.67,
2862
+ "learning_rate": 7.372793354101766e-06,
2863
+ "loss": 0.2475,
2864
+ "step": 4640
2865
+ },
2866
+ {
2867
+ "epoch": 8.69,
2868
+ "learning_rate": 7.2689511941848395e-06,
2869
+ "loss": 0.197,
2870
+ "step": 4650
2871
+ },
2872
+ {
2873
+ "epoch": 8.71,
2874
+ "learning_rate": 7.165109034267913e-06,
2875
+ "loss": 0.2302,
2876
+ "step": 4660
2877
+ },
2878
+ {
2879
+ "epoch": 8.73,
2880
+ "learning_rate": 7.061266874350987e-06,
2881
+ "loss": 0.2186,
2882
+ "step": 4670
2883
+ },
2884
+ {
2885
+ "epoch": 8.75,
2886
+ "learning_rate": 6.9574247144340605e-06,
2887
+ "loss": 0.2172,
2888
+ "step": 4680
2889
+ },
2890
+ {
2891
+ "epoch": 8.77,
2892
+ "learning_rate": 6.853582554517134e-06,
2893
+ "loss": 0.2189,
2894
+ "step": 4690
2895
+ },
2896
+ {
2897
+ "epoch": 8.79,
2898
+ "learning_rate": 6.749740394600208e-06,
2899
+ "loss": 0.2137,
2900
+ "step": 4700
2901
+ },
2902
+ {
2903
+ "epoch": 8.8,
2904
+ "learning_rate": 6.6458982346832815e-06,
2905
+ "loss": 0.2259,
2906
+ "step": 4710
2907
+ },
2908
+ {
2909
+ "epoch": 8.82,
2910
+ "learning_rate": 6.542056074766355e-06,
2911
+ "loss": 0.15,
2912
+ "step": 4720
2913
+ },
2914
+ {
2915
+ "epoch": 8.84,
2916
+ "learning_rate": 6.438213914849429e-06,
2917
+ "loss": 0.2622,
2918
+ "step": 4730
2919
+ },
2920
+ {
2921
+ "epoch": 8.86,
2922
+ "learning_rate": 6.3343717549325025e-06,
2923
+ "loss": 0.2009,
2924
+ "step": 4740
2925
+ },
2926
+ {
2927
+ "epoch": 8.88,
2928
+ "learning_rate": 6.230529595015577e-06,
2929
+ "loss": 0.2095,
2930
+ "step": 4750
2931
+ },
2932
+ {
2933
+ "epoch": 8.9,
2934
+ "learning_rate": 6.126687435098651e-06,
2935
+ "loss": 0.1766,
2936
+ "step": 4760
2937
+ },
2938
+ {
2939
+ "epoch": 8.92,
2940
+ "learning_rate": 6.0228452751817235e-06,
2941
+ "loss": 0.1982,
2942
+ "step": 4770
2943
+ },
2944
+ {
2945
+ "epoch": 8.93,
2946
+ "learning_rate": 5.919003115264798e-06,
2947
+ "loss": 0.2287,
2948
+ "step": 4780
2949
+ },
2950
+ {
2951
+ "epoch": 8.95,
2952
+ "learning_rate": 5.815160955347872e-06,
2953
+ "loss": 0.266,
2954
+ "step": 4790
2955
+ },
2956
+ {
2957
+ "epoch": 8.97,
2958
+ "learning_rate": 5.711318795430945e-06,
2959
+ "loss": 0.2196,
2960
+ "step": 4800
2961
+ },
2962
+ {
2963
+ "epoch": 8.99,
2964
+ "learning_rate": 5.607476635514019e-06,
2965
+ "loss": 0.1929,
2966
+ "step": 4810
2967
+ },
2968
+ {
2969
+ "epoch": 9.0,
2970
+ "eval_accuracy": 0.8707943925233644,
2971
+ "eval_loss": 0.3915020227432251,
2972
+ "eval_runtime": 93.367,
2973
+ "eval_samples_per_second": 45.841,
2974
+ "eval_steps_per_second": 5.73,
2975
+ "step": 4815
2976
+ },
2977
+ {
2978
+ "epoch": 9.01,
2979
+ "learning_rate": 5.503634475597093e-06,
2980
+ "loss": 0.2137,
2981
+ "step": 4820
2982
+ },
2983
+ {
2984
+ "epoch": 9.03,
2985
+ "learning_rate": 5.399792315680166e-06,
2986
+ "loss": 0.1994,
2987
+ "step": 4830
2988
+ },
2989
+ {
2990
+ "epoch": 9.05,
2991
+ "learning_rate": 5.29595015576324e-06,
2992
+ "loss": 0.2243,
2993
+ "step": 4840
2994
+ },
2995
+ {
2996
+ "epoch": 9.07,
2997
+ "learning_rate": 5.192107995846314e-06,
2998
+ "loss": 0.282,
2999
+ "step": 4850
3000
+ },
3001
+ {
3002
+ "epoch": 9.08,
3003
+ "learning_rate": 5.088265835929388e-06,
3004
+ "loss": 0.1425,
3005
+ "step": 4860
3006
+ },
3007
+ {
3008
+ "epoch": 9.1,
3009
+ "learning_rate": 4.9844236760124615e-06,
3010
+ "loss": 0.2213,
3011
+ "step": 4870
3012
+ },
3013
+ {
3014
+ "epoch": 9.12,
3015
+ "learning_rate": 4.880581516095535e-06,
3016
+ "loss": 0.1939,
3017
+ "step": 4880
3018
+ },
3019
+ {
3020
+ "epoch": 9.14,
3021
+ "learning_rate": 4.776739356178609e-06,
3022
+ "loss": 0.1753,
3023
+ "step": 4890
3024
+ },
3025
+ {
3026
+ "epoch": 9.16,
3027
+ "learning_rate": 4.6728971962616825e-06,
3028
+ "loss": 0.1758,
3029
+ "step": 4900
3030
+ },
3031
+ {
3032
+ "epoch": 9.18,
3033
+ "learning_rate": 4.569055036344756e-06,
3034
+ "loss": 0.1597,
3035
+ "step": 4910
3036
+ },
3037
+ {
3038
+ "epoch": 9.2,
3039
+ "learning_rate": 4.46521287642783e-06,
3040
+ "loss": 0.2269,
3041
+ "step": 4920
3042
+ },
3043
+ {
3044
+ "epoch": 9.21,
3045
+ "learning_rate": 4.3613707165109035e-06,
3046
+ "loss": 0.2723,
3047
+ "step": 4930
3048
+ },
3049
+ {
3050
+ "epoch": 9.23,
3051
+ "learning_rate": 4.257528556593978e-06,
3052
+ "loss": 0.2012,
3053
+ "step": 4940
3054
+ },
3055
+ {
3056
+ "epoch": 9.25,
3057
+ "learning_rate": 4.153686396677051e-06,
3058
+ "loss": 0.1863,
3059
+ "step": 4950
3060
+ },
3061
+ {
3062
+ "epoch": 9.27,
3063
+ "learning_rate": 4.0498442367601245e-06,
3064
+ "loss": 0.2029,
3065
+ "step": 4960
3066
+ },
3067
+ {
3068
+ "epoch": 9.29,
3069
+ "learning_rate": 3.946002076843199e-06,
3070
+ "loss": 0.2067,
3071
+ "step": 4970
3072
+ },
3073
+ {
3074
+ "epoch": 9.31,
3075
+ "learning_rate": 3.842159916926272e-06,
3076
+ "loss": 0.2359,
3077
+ "step": 4980
3078
+ },
3079
+ {
3080
+ "epoch": 9.33,
3081
+ "learning_rate": 3.7383177570093455e-06,
3082
+ "loss": 0.2398,
3083
+ "step": 4990
3084
+ },
3085
+ {
3086
+ "epoch": 9.35,
3087
+ "learning_rate": 3.6344755970924198e-06,
3088
+ "loss": 0.2233,
3089
+ "step": 5000
3090
+ },
3091
+ {
3092
+ "epoch": 9.36,
3093
+ "learning_rate": 3.5306334371754936e-06,
3094
+ "loss": 0.1934,
3095
+ "step": 5010
3096
+ },
3097
+ {
3098
+ "epoch": 9.38,
3099
+ "learning_rate": 3.426791277258567e-06,
3100
+ "loss": 0.2625,
3101
+ "step": 5020
3102
+ },
3103
+ {
3104
+ "epoch": 9.4,
3105
+ "learning_rate": 3.3229491173416407e-06,
3106
+ "loss": 0.2548,
3107
+ "step": 5030
3108
+ },
3109
+ {
3110
+ "epoch": 9.42,
3111
+ "learning_rate": 3.2191069574247146e-06,
3112
+ "loss": 0.1718,
3113
+ "step": 5040
3114
+ },
3115
+ {
3116
+ "epoch": 9.44,
3117
+ "learning_rate": 3.1152647975077884e-06,
3118
+ "loss": 0.1668,
3119
+ "step": 5050
3120
+ },
3121
+ {
3122
+ "epoch": 9.46,
3123
+ "learning_rate": 3.0114226375908617e-06,
3124
+ "loss": 0.2202,
3125
+ "step": 5060
3126
+ },
3127
+ {
3128
+ "epoch": 9.48,
3129
+ "learning_rate": 2.907580477673936e-06,
3130
+ "loss": 0.2091,
3131
+ "step": 5070
3132
+ },
3133
+ {
3134
+ "epoch": 9.5,
3135
+ "learning_rate": 2.8037383177570094e-06,
3136
+ "loss": 0.2213,
3137
+ "step": 5080
3138
+ },
3139
+ {
3140
+ "epoch": 9.51,
3141
+ "learning_rate": 2.699896157840083e-06,
3142
+ "loss": 0.2857,
3143
+ "step": 5090
3144
+ },
3145
+ {
3146
+ "epoch": 9.53,
3147
+ "learning_rate": 2.596053997923157e-06,
3148
+ "loss": 0.2209,
3149
+ "step": 5100
3150
+ },
3151
+ {
3152
+ "epoch": 9.55,
3153
+ "learning_rate": 2.4922118380062308e-06,
3154
+ "loss": 0.2341,
3155
+ "step": 5110
3156
+ },
3157
+ {
3158
+ "epoch": 9.57,
3159
+ "learning_rate": 2.3883696780893046e-06,
3160
+ "loss": 0.2111,
3161
+ "step": 5120
3162
+ },
3163
+ {
3164
+ "epoch": 9.59,
3165
+ "learning_rate": 2.284527518172378e-06,
3166
+ "loss": 0.1869,
3167
+ "step": 5130
3168
+ },
3169
+ {
3170
+ "epoch": 9.61,
3171
+ "learning_rate": 2.1806853582554518e-06,
3172
+ "loss": 0.2229,
3173
+ "step": 5140
3174
+ },
3175
+ {
3176
+ "epoch": 9.63,
3177
+ "learning_rate": 2.0768431983385256e-06,
3178
+ "loss": 0.2251,
3179
+ "step": 5150
3180
+ },
3181
+ {
3182
+ "epoch": 9.64,
3183
+ "learning_rate": 1.9730010384215994e-06,
3184
+ "loss": 0.183,
3185
+ "step": 5160
3186
+ },
3187
+ {
3188
+ "epoch": 9.66,
3189
+ "learning_rate": 1.8691588785046728e-06,
3190
+ "loss": 0.1606,
3191
+ "step": 5170
3192
+ },
3193
+ {
3194
+ "epoch": 9.68,
3195
+ "learning_rate": 1.7653167185877468e-06,
3196
+ "loss": 0.2067,
3197
+ "step": 5180
3198
+ },
3199
+ {
3200
+ "epoch": 9.7,
3201
+ "learning_rate": 1.6614745586708204e-06,
3202
+ "loss": 0.2346,
3203
+ "step": 5190
3204
+ },
3205
+ {
3206
+ "epoch": 9.72,
3207
+ "learning_rate": 1.5576323987538942e-06,
3208
+ "loss": 0.2161,
3209
+ "step": 5200
3210
+ },
3211
+ {
3212
+ "epoch": 9.74,
3213
+ "learning_rate": 1.453790238836968e-06,
3214
+ "loss": 0.2893,
3215
+ "step": 5210
3216
+ },
3217
+ {
3218
+ "epoch": 9.76,
3219
+ "learning_rate": 1.3499480789200416e-06,
3220
+ "loss": 0.2196,
3221
+ "step": 5220
3222
+ },
3223
+ {
3224
+ "epoch": 9.78,
3225
+ "learning_rate": 1.2461059190031154e-06,
3226
+ "loss": 0.2035,
3227
+ "step": 5230
3228
+ },
3229
+ {
3230
+ "epoch": 9.79,
3231
+ "learning_rate": 1.142263759086189e-06,
3232
+ "loss": 0.1541,
3233
+ "step": 5240
3234
+ },
3235
+ {
3236
+ "epoch": 9.81,
3237
+ "learning_rate": 1.0384215991692628e-06,
3238
+ "loss": 0.2105,
3239
+ "step": 5250
3240
+ },
3241
+ {
3242
+ "epoch": 9.83,
3243
+ "learning_rate": 9.345794392523364e-07,
3244
+ "loss": 0.2065,
3245
+ "step": 5260
3246
+ },
3247
+ {
3248
+ "epoch": 9.85,
3249
+ "learning_rate": 8.307372793354102e-07,
3250
+ "loss": 0.161,
3251
+ "step": 5270
3252
+ },
3253
+ {
3254
+ "epoch": 9.87,
3255
+ "learning_rate": 7.26895119418484e-07,
3256
+ "loss": 0.2285,
3257
+ "step": 5280
3258
+ },
3259
+ {
3260
+ "epoch": 9.89,
3261
+ "learning_rate": 6.230529595015577e-07,
3262
+ "loss": 0.2051,
3263
+ "step": 5290
3264
+ },
3265
+ {
3266
+ "epoch": 9.91,
3267
+ "learning_rate": 5.192107995846314e-07,
3268
+ "loss": 0.2291,
3269
+ "step": 5300
3270
+ },
3271
+ {
3272
+ "epoch": 9.93,
3273
+ "learning_rate": 4.153686396677051e-07,
3274
+ "loss": 0.2071,
3275
+ "step": 5310
3276
+ },
3277
+ {
3278
+ "epoch": 9.94,
3279
+ "learning_rate": 3.1152647975077885e-07,
3280
+ "loss": 0.2055,
3281
+ "step": 5320
3282
+ },
3283
+ {
3284
+ "epoch": 9.96,
3285
+ "learning_rate": 2.0768431983385255e-07,
3286
+ "loss": 0.1433,
3287
+ "step": 5330
3288
+ },
3289
+ {
3290
+ "epoch": 9.98,
3291
+ "learning_rate": 1.0384215991692627e-07,
3292
+ "loss": 0.2493,
3293
+ "step": 5340
3294
+ },
3295
+ {
3296
+ "epoch": 10.0,
3297
+ "learning_rate": 0.0,
3298
+ "loss": 0.1877,
3299
+ "step": 5350
3300
+ },
3301
+ {
3302
+ "epoch": 10.0,
3303
+ "eval_accuracy": 0.8714953271028038,
3304
+ "eval_loss": 0.38807550072669983,
3305
+ "eval_runtime": 92.2965,
3306
+ "eval_samples_per_second": 46.372,
3307
+ "eval_steps_per_second": 5.797,
3308
+ "step": 5350
3309
+ },
3310
+ {
3311
+ "epoch": 10.0,
3312
+ "step": 5350,
3313
+ "total_flos": 1.3264660513609667e+19,
3314
+ "train_loss": 0.36025063641717503,
3315
+ "train_runtime": 7147.2488,
3316
+ "train_samples_per_second": 23.949,
3317
+ "train_steps_per_second": 0.749
3318
  }
3319
  ],
3320
+ "max_steps": 5350,
3321
  "num_train_epochs": 10,
3322
+ "total_flos": 1.3264660513609667e+19,
3323
  "trial_name": null,
3324
  "trial_params": null
3325
  }