ihsanahakiim commited on
Commit
848df05
·
verified ·
1 Parent(s): 9ec60da

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 30.026442307692307,
3
- "eval_accuracy": 0.35570469798657717,
4
- "eval_loss": 2.520230293273926,
5
- "eval_runtime": 112.9248,
6
- "eval_samples_per_second": 3.958,
7
- "eval_steps_per_second": 0.124
8
  }
 
1
  {
2
+ "epoch": 59.00721153846154,
3
+ "eval_accuracy": 0.7319819819819819,
4
+ "eval_loss": 1.084416389465332,
5
+ "eval_runtime": 75.3126,
6
+ "eval_samples_per_second": 5.895,
7
+ "eval_steps_per_second": 0.093
8
  }
runs/Jan14_16-01-40_GAN-SVR/events.out.tfevents.1736851415.GAN-SVR.3870842.8 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a5dfbf870d294a569baa230c78ce95feed24ab80b22e9a78ba14e68699ae25d
3
- size 411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aa347f8819808b6ed0a069ac2bd27d55402bf8858e7c9bbf607cecfab1d1288
3
+ size 734
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 30.026442307692307,
3
- "eval_accuracy": 0.35570469798657717,
4
- "eval_loss": 2.520230293273926,
5
- "eval_runtime": 112.9248,
6
- "eval_samples_per_second": 3.958,
7
- "eval_steps_per_second": 0.124
8
  }
 
1
  {
2
+ "epoch": 59.00721153846154,
3
+ "eval_accuracy": 0.7319819819819819,
4
+ "eval_loss": 1.084416389465332,
5
+ "eval_runtime": 75.3126,
6
+ "eval_samples_per_second": 5.895,
7
+ "eval_steps_per_second": 0.093
8
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": 0.35570469798657717,
3
- "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-783",
4
- "epoch": 30.026442307692307,
5
  "eval_steps": 500,
6
  "global_step": 832,
7
  "is_hyper_param_search": false,
@@ -10,889 +10,1150 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01201923076923077,
13
- "grad_norm": 9.181777954101562,
14
  "learning_rate": 5.9523809523809525e-06,
15
- "loss": 4.2532,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.02403846153846154,
20
- "grad_norm": 6.192462921142578,
 
 
 
 
 
 
 
 
 
21
  "learning_rate": 1.1904761904761905e-05,
22
- "loss": 4.2583,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.03245192307692308,
27
- "eval_accuracy": 0.015659955257270694,
28
- "eval_loss": 4.225129127502441,
29
- "eval_runtime": 113.6438,
30
- "eval_samples_per_second": 3.933,
31
- "eval_steps_per_second": 0.123,
32
- "step": 27
33
  },
34
  {
35
- "epoch": 1.0036057692307692,
36
- "grad_norm": 6.222796440124512,
37
  "learning_rate": 1.785714285714286e-05,
38
- "loss": 4.2505,
39
  "step": 30
40
  },
41
  {
42
- "epoch": 1.015625,
43
- "grad_norm": 3.965486526489258,
44
  "learning_rate": 2.380952380952381e-05,
45
- "loss": 4.2489,
46
  "step": 40
47
  },
48
  {
49
- "epoch": 1.0276442307692308,
50
- "grad_norm": 5.0857744216918945,
 
 
 
 
 
 
 
 
 
51
  "learning_rate": 2.9761904761904762e-05,
52
- "loss": 4.2374,
53
  "step": 50
54
  },
55
  {
56
- "epoch": 1.0324519230769231,
57
- "eval_accuracy": 0.013422818791946308,
58
- "eval_loss": 4.226736068725586,
59
- "eval_runtime": 113.905,
60
- "eval_samples_per_second": 3.924,
61
- "eval_steps_per_second": 0.123,
62
- "step": 54
63
  },
64
  {
65
- "epoch": 2.0072115384615383,
66
- "grad_norm": 3.5929408073425293,
67
  "learning_rate": 3.571428571428572e-05,
68
- "loss": 4.2403,
69
  "step": 60
70
  },
71
  {
72
- "epoch": 2.019230769230769,
73
- "grad_norm": 3.6417300701141357,
74
  "learning_rate": 4.166666666666667e-05,
75
- "loss": 4.255,
 
 
 
 
 
 
 
 
 
76
  "step": 70
77
  },
78
  {
79
- "epoch": 2.03125,
80
- "grad_norm": 3.2748305797576904,
81
  "learning_rate": 4.761904761904762e-05,
82
- "loss": 4.2678,
83
  "step": 80
84
  },
85
  {
86
- "epoch": 2.032451923076923,
87
- "eval_accuracy": 0.013422818791946308,
88
- "eval_loss": 4.2263360023498535,
89
- "eval_runtime": 115.125,
90
- "eval_samples_per_second": 3.883,
91
- "eval_steps_per_second": 0.122,
92
- "step": 81
93
  },
94
  {
95
- "epoch": 3.0108173076923075,
96
- "grad_norm": 3.1406173706054688,
97
  "learning_rate": 4.959893048128342e-05,
98
- "loss": 4.2151,
99
  "step": 90
100
  },
101
  {
102
- "epoch": 3.0228365384615383,
103
- "grad_norm": 2.8441436290740967,
104
- "learning_rate": 4.8930481283422465e-05,
105
- "loss": 4.2537,
106
- "step": 100
 
 
107
  },
108
  {
109
- "epoch": 3.032451923076923,
110
- "eval_accuracy": 0.015659955257270694,
111
- "eval_loss": 4.221240997314453,
112
- "eval_runtime": 115.863,
113
- "eval_samples_per_second": 3.858,
114
- "eval_steps_per_second": 0.121,
115
- "step": 108
116
  },
117
  {
118
- "epoch": 4.002403846153846,
119
- "grad_norm": 2.7364041805267334,
120
  "learning_rate": 4.8262032085561496e-05,
121
- "loss": 4.2568,
122
  "step": 110
123
  },
124
  {
125
- "epoch": 4.014423076923077,
126
- "grad_norm": 2.71181058883667,
 
 
 
 
 
 
 
 
 
127
  "learning_rate": 4.759358288770054e-05,
128
- "loss": 4.2288,
129
  "step": 120
130
  },
131
  {
132
- "epoch": 4.0264423076923075,
133
- "grad_norm": 2.7206692695617676,
 
 
 
 
 
 
 
 
 
134
  "learning_rate": 4.6925133689839576e-05,
135
- "loss": 4.2401,
136
  "step": 130
137
  },
138
  {
139
- "epoch": 4.032451923076923,
140
- "eval_accuracy": 0.015659955257270694,
141
- "eval_loss": 4.206515312194824,
142
- "eval_runtime": 114.2862,
143
- "eval_samples_per_second": 3.911,
144
- "eval_steps_per_second": 0.122,
145
- "step": 135
146
  },
147
  {
148
- "epoch": 5.006009615384615,
149
- "grad_norm": 2.6117472648620605,
150
- "learning_rate": 4.625668449197861e-05,
151
- "loss": 4.2192,
 
 
152
  "step": 140
153
  },
154
  {
155
- "epoch": 5.018028846153846,
156
- "grad_norm": 2.90580677986145,
157
  "learning_rate": 4.558823529411765e-05,
158
- "loss": 4.2204,
159
  "step": 150
160
  },
161
  {
162
- "epoch": 5.030048076923077,
163
- "grad_norm": 2.9660141468048096,
 
 
 
 
 
 
 
 
 
164
  "learning_rate": 4.491978609625669e-05,
165
- "loss": 4.2519,
166
  "step": 160
167
  },
168
  {
169
- "epoch": 5.032451923076923,
170
- "eval_accuracy": 0.017897091722595078,
171
- "eval_loss": 4.207516193389893,
172
- "eval_runtime": 117.7024,
173
- "eval_samples_per_second": 3.798,
174
- "eval_steps_per_second": 0.119,
175
- "step": 162
176
  },
177
  {
178
- "epoch": 6.009615384615385,
179
- "grad_norm": 2.8973159790039062,
180
  "learning_rate": 4.4251336898395724e-05,
181
- "loss": 4.2112,
182
  "step": 170
183
  },
184
  {
185
- "epoch": 6.021634615384615,
186
- "grad_norm": 2.734694480895996,
187
  "learning_rate": 4.358288770053476e-05,
188
- "loss": 4.2198,
189
  "step": 180
190
  },
191
  {
192
- "epoch": 6.032451923076923,
193
- "eval_accuracy": 0.013422818791946308,
194
- "eval_loss": 4.205463409423828,
195
- "eval_runtime": 116.7575,
196
- "eval_samples_per_second": 3.828,
197
- "eval_steps_per_second": 0.12,
198
- "step": 189
199
  },
200
  {
201
- "epoch": 7.001201923076923,
202
- "grad_norm": 2.421506881713867,
203
  "learning_rate": 4.29144385026738e-05,
204
- "loss": 4.2292,
205
  "step": 190
206
  },
207
  {
208
- "epoch": 7.013221153846154,
209
- "grad_norm": 2.82832670211792,
 
 
 
 
 
 
 
 
 
210
  "learning_rate": 4.224598930481284e-05,
211
- "loss": 4.2019,
212
  "step": 200
213
  },
214
  {
215
- "epoch": 7.025240384615385,
216
- "grad_norm": 3.0022928714752197,
217
  "learning_rate": 4.157754010695187e-05,
218
- "loss": 4.2111,
219
  "step": 210
220
  },
221
  {
222
- "epoch": 7.032451923076923,
223
- "eval_accuracy": 0.017897091722595078,
224
- "eval_loss": 4.19577693939209,
225
- "eval_runtime": 119.1446,
226
- "eval_samples_per_second": 3.752,
227
- "eval_steps_per_second": 0.118,
228
- "step": 216
229
  },
230
  {
231
- "epoch": 8.004807692307692,
232
- "grad_norm": 2.6406443119049072,
233
  "learning_rate": 4.0909090909090915e-05,
234
- "loss": 4.2068,
235
  "step": 220
236
  },
237
  {
238
- "epoch": 8.016826923076923,
239
- "grad_norm": 3.169625759124756,
 
 
 
 
 
 
 
 
 
240
  "learning_rate": 4.024064171122995e-05,
241
- "loss": 4.1759,
242
  "step": 230
243
  },
244
  {
245
- "epoch": 8.028846153846153,
246
- "grad_norm": 2.8210370540618896,
247
- "learning_rate": 3.957219251336899e-05,
248
- "loss": 4.1871,
249
- "step": 240
 
 
250
  },
251
  {
252
- "epoch": 8.032451923076923,
253
- "eval_accuracy": 0.04697986577181208,
254
- "eval_loss": 4.147375583648682,
255
- "eval_runtime": 117.849,
256
- "eval_samples_per_second": 3.793,
257
- "eval_steps_per_second": 0.119,
258
- "step": 243
259
  },
260
  {
261
- "epoch": 9.008413461538462,
262
- "grad_norm": 3.140075206756592,
263
  "learning_rate": 3.8903743315508025e-05,
264
- "loss": 4.1314,
265
  "step": 250
266
  },
267
  {
268
- "epoch": 9.020432692307692,
269
- "grad_norm": 3.0806028842926025,
 
 
 
 
 
 
 
 
 
270
  "learning_rate": 3.8235294117647055e-05,
271
- "loss": 4.0778,
272
  "step": 260
273
  },
274
  {
275
- "epoch": 9.032451923076923,
276
- "grad_norm": 12.136305809020996,
277
- "learning_rate": 3.75668449197861e-05,
278
- "loss": 4.0891,
279
- "step": 270
 
 
280
  },
281
  {
282
- "epoch": 9.032451923076923,
283
- "eval_accuracy": 0.0447427293064877,
284
- "eval_loss": 4.032660007476807,
285
- "eval_runtime": 116.5729,
286
- "eval_samples_per_second": 3.835,
287
- "eval_steps_per_second": 0.12,
288
  "step": 270
289
  },
290
  {
291
- "epoch": 10.01201923076923,
292
- "grad_norm": 4.812855243682861,
293
  "learning_rate": 3.6898395721925136e-05,
294
- "loss": 3.9545,
295
  "step": 280
296
  },
297
  {
298
- "epoch": 10.024038461538462,
299
- "grad_norm": 5.218036651611328,
 
 
 
 
 
 
 
 
 
300
  "learning_rate": 3.622994652406417e-05,
301
- "loss": 3.7963,
302
  "step": 290
303
  },
304
  {
305
- "epoch": 10.032451923076923,
306
- "eval_accuracy": 0.08277404921700224,
307
- "eval_loss": 3.82175874710083,
308
- "eval_runtime": 115.9952,
309
- "eval_samples_per_second": 3.854,
310
- "eval_steps_per_second": 0.121,
311
- "step": 297
312
  },
313
  {
314
- "epoch": 11.00360576923077,
315
- "grad_norm": 4.765311241149902,
316
  "learning_rate": 3.556149732620321e-05,
317
- "loss": 3.7306,
318
  "step": 300
319
  },
320
  {
321
- "epoch": 11.015625,
322
- "grad_norm": 6.956701755523682,
 
 
 
 
 
 
 
 
 
323
  "learning_rate": 3.489304812834225e-05,
324
- "loss": 3.5563,
325
  "step": 310
326
  },
327
  {
328
- "epoch": 11.02764423076923,
329
- "grad_norm": 5.394344329833984,
330
  "learning_rate": 3.4224598930481284e-05,
331
- "loss": 3.4787,
332
  "step": 320
333
  },
334
  {
335
- "epoch": 11.032451923076923,
336
- "eval_accuracy": 0.11185682326621924,
337
- "eval_loss": 3.7061688899993896,
338
- "eval_runtime": 113.6071,
339
- "eval_samples_per_second": 3.935,
340
- "eval_steps_per_second": 0.123,
341
- "step": 324
342
  },
343
  {
344
- "epoch": 12.007211538461538,
345
- "grad_norm": 5.70036506652832,
346
  "learning_rate": 3.355614973262032e-05,
347
- "loss": 3.4821,
348
  "step": 330
349
  },
350
  {
351
- "epoch": 12.01923076923077,
352
- "grad_norm": 6.370093822479248,
 
 
 
 
 
 
 
 
 
353
  "learning_rate": 3.288770053475936e-05,
354
- "loss": 3.3756,
355
  "step": 340
356
  },
357
  {
358
- "epoch": 12.03125,
359
- "grad_norm": 5.8564133644104,
360
  "learning_rate": 3.22192513368984e-05,
361
- "loss": 3.1883,
362
  "step": 350
363
  },
364
  {
365
- "epoch": 12.032451923076923,
366
- "eval_accuracy": 0.11185682326621924,
367
- "eval_loss": 3.58866810798645,
368
- "eval_runtime": 112.5771,
369
- "eval_samples_per_second": 3.971,
370
- "eval_steps_per_second": 0.124,
371
- "step": 351
372
  },
373
  {
374
- "epoch": 13.010817307692308,
375
- "grad_norm": 6.884145736694336,
376
  "learning_rate": 3.155080213903743e-05,
377
- "loss": 3.1542,
378
  "step": 360
379
  },
380
  {
381
- "epoch": 13.022836538461538,
382
- "grad_norm": 7.7231059074401855,
 
 
 
 
 
 
 
 
 
383
  "learning_rate": 3.0882352941176475e-05,
384
- "loss": 3.0045,
385
  "step": 370
386
  },
387
  {
388
- "epoch": 13.032451923076923,
389
- "eval_accuracy": 0.14317673378076062,
390
- "eval_loss": 3.380981206893921,
391
- "eval_runtime": 113.3509,
392
- "eval_samples_per_second": 3.944,
393
- "eval_steps_per_second": 0.124,
394
  "step": 378
395
  },
396
  {
397
- "epoch": 14.002403846153847,
398
- "grad_norm": 6.160039901733398,
399
  "learning_rate": 3.0213903743315508e-05,
400
- "loss": 2.9704,
401
  "step": 380
402
  },
403
  {
404
- "epoch": 14.014423076923077,
405
- "grad_norm": 8.257513046264648,
406
  "learning_rate": 2.954545454545455e-05,
407
- "loss": 2.7723,
408
  "step": 390
409
  },
410
  {
411
- "epoch": 14.026442307692308,
412
- "grad_norm": 6.512996196746826,
 
 
 
 
 
 
 
 
 
413
  "learning_rate": 2.8877005347593582e-05,
414
- "loss": 2.8045,
415
  "step": 400
416
  },
417
  {
418
- "epoch": 14.032451923076923,
419
- "eval_accuracy": 0.21923937360178972,
420
- "eval_loss": 3.2211973667144775,
421
- "eval_runtime": 116.514,
422
- "eval_samples_per_second": 3.836,
423
- "eval_steps_per_second": 0.12,
424
- "step": 405
425
  },
426
  {
427
- "epoch": 15.006009615384615,
428
- "grad_norm": 6.627754211425781,
429
  "learning_rate": 2.8208556149732622e-05,
430
- "loss": 2.7247,
431
  "step": 410
432
  },
433
  {
434
- "epoch": 15.018028846153847,
435
- "grad_norm": 6.555057048797607,
436
  "learning_rate": 2.754010695187166e-05,
437
- "loss": 2.5789,
 
 
 
 
 
 
 
 
 
438
  "step": 420
439
  },
440
  {
441
- "epoch": 15.030048076923077,
442
- "grad_norm": 8.675689697265625,
443
  "learning_rate": 2.68716577540107e-05,
444
- "loss": 2.5344,
445
  "step": 430
446
  },
447
  {
448
- "epoch": 15.032451923076923,
449
- "eval_accuracy": 0.15883668903803133,
450
- "eval_loss": 3.270237922668457,
451
- "eval_runtime": 114.1519,
452
- "eval_samples_per_second": 3.916,
453
- "eval_steps_per_second": 0.123,
454
- "step": 432
455
  },
456
  {
457
- "epoch": 16.009615384615383,
458
- "grad_norm": 7.248364448547363,
459
  "learning_rate": 2.6203208556149733e-05,
460
- "loss": 2.3791,
461
  "step": 440
462
  },
463
  {
464
- "epoch": 16.021634615384617,
465
- "grad_norm": 9.409223556518555,
466
- "learning_rate": 2.5534759358288773e-05,
467
- "loss": 2.3725,
468
- "step": 450
 
 
469
  },
470
  {
471
- "epoch": 16.032451923076923,
472
- "eval_accuracy": 0.14093959731543623,
473
- "eval_loss": 3.35996413230896,
474
- "eval_runtime": 113.8728,
475
- "eval_samples_per_second": 3.925,
476
- "eval_steps_per_second": 0.123,
477
- "step": 459
478
  },
479
  {
480
- "epoch": 17.001201923076923,
481
- "grad_norm": 7.834836006164551,
482
  "learning_rate": 2.4866310160427807e-05,
483
- "loss": 2.2722,
484
  "step": 460
485
  },
486
  {
487
- "epoch": 17.013221153846153,
488
- "grad_norm": 7.643532752990723,
 
 
 
 
 
 
 
 
 
489
  "learning_rate": 2.4197860962566847e-05,
490
- "loss": 2.1434,
491
  "step": 470
492
  },
493
  {
494
- "epoch": 17.025240384615383,
495
- "grad_norm": 8.425023078918457,
 
 
 
 
 
 
 
 
 
496
  "learning_rate": 2.3529411764705884e-05,
497
- "loss": 2.2074,
498
  "step": 480
499
  },
500
  {
501
- "epoch": 17.032451923076923,
502
- "eval_accuracy": 0.2371364653243848,
503
- "eval_loss": 2.9730582237243652,
504
- "eval_runtime": 113.5853,
505
- "eval_samples_per_second": 3.935,
506
- "eval_steps_per_second": 0.123,
507
- "step": 486
508
  },
509
  {
510
- "epoch": 18.004807692307693,
511
- "grad_norm": 7.840888500213623,
512
- "learning_rate": 2.286096256684492e-05,
513
- "loss": 2.0823,
 
 
514
  "step": 490
515
  },
516
  {
517
- "epoch": 18.016826923076923,
518
- "grad_norm": 9.385211944580078,
519
  "learning_rate": 2.2192513368983957e-05,
520
- "loss": 1.9511,
521
  "step": 500
522
  },
523
  {
524
- "epoch": 18.028846153846153,
525
- "grad_norm": 8.72028636932373,
 
 
 
 
 
 
 
 
 
526
  "learning_rate": 2.1524064171122994e-05,
527
- "loss": 2.1094,
528
  "step": 510
529
  },
530
  {
531
- "epoch": 18.032451923076923,
532
- "eval_accuracy": 0.26174496644295303,
533
- "eval_loss": 2.8679935932159424,
534
- "eval_runtime": 113.6275,
535
- "eval_samples_per_second": 3.934,
536
- "eval_steps_per_second": 0.123,
537
- "step": 513
538
  },
539
  {
540
- "epoch": 19.00841346153846,
541
- "grad_norm": 9.284242630004883,
542
  "learning_rate": 2.0855614973262035e-05,
543
- "loss": 2.0278,
544
  "step": 520
545
  },
546
  {
547
- "epoch": 19.020432692307693,
548
- "grad_norm": 6.913205623626709,
549
  "learning_rate": 2.018716577540107e-05,
550
- "loss": 1.8727,
551
  "step": 530
552
  },
553
  {
554
- "epoch": 19.032451923076923,
555
- "grad_norm": 21.923686981201172,
 
 
 
 
 
 
 
 
 
556
  "learning_rate": 1.951871657754011e-05,
557
- "loss": 1.9839,
558
  "step": 540
559
  },
560
  {
561
- "epoch": 19.032451923076923,
562
- "eval_accuracy": 0.27069351230425054,
563
- "eval_loss": 2.8359620571136475,
564
- "eval_runtime": 115.0445,
565
- "eval_samples_per_second": 3.885,
566
- "eval_steps_per_second": 0.122,
567
- "step": 540
568
  },
569
  {
570
- "epoch": 20.01201923076923,
571
- "grad_norm": 8.244462013244629,
572
  "learning_rate": 1.8850267379679145e-05,
573
- "loss": 1.7584,
574
  "step": 550
575
  },
576
  {
577
- "epoch": 20.02403846153846,
578
- "grad_norm": 8.496162414550781,
579
  "learning_rate": 1.8181818181818182e-05,
580
- "loss": 1.7354,
581
  "step": 560
582
  },
583
  {
584
- "epoch": 20.032451923076923,
585
- "eval_accuracy": 0.28187919463087246,
586
- "eval_loss": 2.7890186309814453,
587
- "eval_runtime": 111.5479,
588
- "eval_samples_per_second": 4.007,
589
- "eval_steps_per_second": 0.126,
590
- "step": 567
591
  },
592
  {
593
- "epoch": 21.00360576923077,
594
- "grad_norm": 7.818769931793213,
595
  "learning_rate": 1.7513368983957222e-05,
596
- "loss": 1.701,
597
  "step": 570
598
  },
599
  {
600
- "epoch": 21.015625,
601
- "grad_norm": 9.921280860900879,
 
 
 
 
 
 
 
 
 
602
  "learning_rate": 1.684491978609626e-05,
603
- "loss": 1.648,
604
  "step": 580
605
  },
606
  {
607
- "epoch": 21.02764423076923,
608
- "grad_norm": 8.053401947021484,
609
- "learning_rate": 1.6176470588235296e-05,
610
- "loss": 1.6843,
611
- "step": 590
 
 
612
  },
613
  {
614
- "epoch": 21.032451923076923,
615
- "eval_accuracy": 0.29977628635346754,
616
- "eval_loss": 2.728635787963867,
617
- "eval_runtime": 114.938,
618
- "eval_samples_per_second": 3.889,
619
- "eval_steps_per_second": 0.122,
620
- "step": 594
621
  },
622
  {
623
- "epoch": 22.00721153846154,
624
- "grad_norm": 10.640948295593262,
625
  "learning_rate": 1.5508021390374333e-05,
626
- "loss": 1.4562,
627
  "step": 600
628
  },
629
  {
630
- "epoch": 22.01923076923077,
631
- "grad_norm": 9.333779335021973,
 
 
 
 
 
 
 
 
 
632
  "learning_rate": 1.4839572192513372e-05,
633
- "loss": 1.5091,
634
  "step": 610
635
  },
636
  {
637
- "epoch": 22.03125,
638
- "grad_norm": 9.433292388916016,
 
 
 
 
 
 
 
 
 
639
  "learning_rate": 1.4171122994652408e-05,
640
- "loss": 1.6266,
641
  "step": 620
642
  },
643
  {
644
- "epoch": 22.032451923076923,
645
- "eval_accuracy": 0.2841163310961969,
646
- "eval_loss": 2.806154489517212,
647
- "eval_runtime": 115.4195,
648
- "eval_samples_per_second": 3.873,
649
- "eval_steps_per_second": 0.121,
650
- "step": 621
651
  },
652
  {
653
- "epoch": 23.010817307692307,
654
- "grad_norm": 8.609794616699219,
655
- "learning_rate": 1.3502673796791445e-05,
656
- "loss": 1.6214,
 
 
657
  "step": 630
658
  },
659
  {
660
- "epoch": 23.02283653846154,
661
- "grad_norm": 6.943145275115967,
662
  "learning_rate": 1.2834224598930484e-05,
663
- "loss": 1.4083,
664
  "step": 640
665
  },
666
  {
667
- "epoch": 23.032451923076923,
668
- "eval_accuracy": 0.2595078299776286,
669
- "eval_loss": 2.8204569816589355,
670
- "eval_runtime": 115.0186,
671
- "eval_samples_per_second": 3.886,
672
- "eval_steps_per_second": 0.122,
673
- "step": 648
674
  },
675
  {
676
- "epoch": 24.002403846153847,
677
- "grad_norm": 10.106169700622559,
678
  "learning_rate": 1.2165775401069519e-05,
679
- "loss": 1.3264,
680
  "step": 650
681
  },
682
  {
683
- "epoch": 24.014423076923077,
684
- "grad_norm": 10.27568244934082,
 
 
 
 
 
 
 
 
 
685
  "learning_rate": 1.1497326203208558e-05,
686
- "loss": 1.3662,
687
  "step": 660
688
  },
689
  {
690
- "epoch": 24.026442307692307,
691
- "grad_norm": 10.36124038696289,
692
  "learning_rate": 1.0828877005347594e-05,
693
- "loss": 1.4422,
694
  "step": 670
695
  },
696
  {
697
- "epoch": 24.032451923076923,
698
- "eval_accuracy": 0.30648769574944074,
699
- "eval_loss": 2.6406848430633545,
700
- "eval_runtime": 116.1998,
701
- "eval_samples_per_second": 3.847,
702
- "eval_steps_per_second": 0.12,
703
- "step": 675
704
  },
705
  {
706
- "epoch": 25.006009615384617,
707
- "grad_norm": 8.807711601257324,
708
  "learning_rate": 1.0160427807486631e-05,
709
- "loss": 1.3912,
710
  "step": 680
711
  },
712
  {
713
- "epoch": 25.018028846153847,
714
- "grad_norm": 8.629064559936523,
 
 
 
 
 
 
 
 
 
715
  "learning_rate": 9.49197860962567e-06,
716
- "loss": 1.2904,
717
  "step": 690
718
  },
719
  {
720
- "epoch": 25.030048076923077,
721
- "grad_norm": 9.190649032592773,
722
  "learning_rate": 8.823529411764707e-06,
723
- "loss": 1.3897,
724
  "step": 700
725
  },
726
  {
727
- "epoch": 25.032451923076923,
728
- "eval_accuracy": 0.34675615212527966,
729
- "eval_loss": 2.5948002338409424,
730
- "eval_runtime": 118.4696,
731
- "eval_samples_per_second": 3.773,
732
- "eval_steps_per_second": 0.118,
733
- "step": 702
734
  },
735
  {
736
- "epoch": 26.009615384615383,
737
- "grad_norm": 10.941615104675293,
738
  "learning_rate": 8.155080213903744e-06,
739
- "loss": 1.2647,
740
  "step": 710
741
  },
742
  {
743
- "epoch": 26.021634615384617,
744
- "grad_norm": 9.282670021057129,
 
 
 
 
 
 
 
 
 
745
  "learning_rate": 7.4866310160427806e-06,
746
- "loss": 1.3906,
747
  "step": 720
748
  },
749
  {
750
- "epoch": 26.032451923076923,
751
- "eval_accuracy": 0.31543624161073824,
752
- "eval_loss": 2.629518985748291,
753
- "eval_runtime": 120.316,
754
- "eval_samples_per_second": 3.715,
755
- "eval_steps_per_second": 0.116,
756
- "step": 729
757
  },
758
  {
759
- "epoch": 27.001201923076923,
760
- "grad_norm": 9.254827499389648,
761
  "learning_rate": 6.818181818181818e-06,
762
- "loss": 1.3257,
763
  "step": 730
764
  },
765
  {
766
- "epoch": 27.013221153846153,
767
- "grad_norm": 7.69126558303833,
768
  "learning_rate": 6.149732620320856e-06,
769
- "loss": 1.2581,
770
  "step": 740
771
  },
772
  {
773
- "epoch": 27.025240384615383,
774
- "grad_norm": 10.26952075958252,
 
 
 
 
 
 
 
 
 
775
  "learning_rate": 5.481283422459893e-06,
776
- "loss": 1.2291,
777
  "step": 750
778
  },
779
  {
780
- "epoch": 27.032451923076923,
781
- "eval_accuracy": 0.3378076062639821,
782
- "eval_loss": 2.5539445877075195,
783
- "eval_runtime": 116.6007,
784
- "eval_samples_per_second": 3.834,
785
- "eval_steps_per_second": 0.12,
786
  "step": 756
787
  },
788
  {
789
- "epoch": 28.004807692307693,
790
- "grad_norm": 11.07676887512207,
791
  "learning_rate": 4.812834224598931e-06,
792
- "loss": 1.29,
793
  "step": 760
794
  },
795
  {
796
- "epoch": 28.016826923076923,
797
- "grad_norm": 8.940054893493652,
798
  "learning_rate": 4.144385026737968e-06,
799
- "loss": 1.1383,
 
 
 
 
 
 
 
 
 
800
  "step": 770
801
  },
802
  {
803
- "epoch": 28.028846153846153,
804
- "grad_norm": 9.17213249206543,
805
  "learning_rate": 3.4759358288770056e-06,
806
- "loss": 1.3166,
807
  "step": 780
808
  },
809
  {
810
- "epoch": 28.032451923076923,
811
- "eval_accuracy": 0.35570469798657717,
812
- "eval_loss": 2.519993305206299,
813
- "eval_runtime": 116.4901,
814
- "eval_samples_per_second": 3.837,
815
- "eval_steps_per_second": 0.12,
816
- "step": 783
817
  },
818
  {
819
- "epoch": 29.00841346153846,
820
- "grad_norm": 11.884153366088867,
821
  "learning_rate": 2.807486631016043e-06,
822
- "loss": 1.232,
823
  "step": 790
824
  },
825
  {
826
- "epoch": 29.020432692307693,
827
- "grad_norm": 9.171028137207031,
 
 
 
 
 
 
 
 
 
828
  "learning_rate": 2.1390374331550802e-06,
829
- "loss": 1.1733,
830
  "step": 800
831
  },
832
  {
833
- "epoch": 29.032451923076923,
834
- "grad_norm": 24.7429256439209,
835
  "learning_rate": 1.4705882352941177e-06,
836
- "loss": 1.2619,
837
  "step": 810
838
  },
839
  {
840
- "epoch": 29.032451923076923,
841
- "eval_accuracy": 0.35570469798657717,
842
- "eval_loss": 2.530792236328125,
843
- "eval_runtime": 118.4418,
844
- "eval_samples_per_second": 3.774,
845
- "eval_steps_per_second": 0.118,
846
- "step": 810
847
  },
848
  {
849
- "epoch": 30.01201923076923,
850
- "grad_norm": 8.288737297058105,
851
  "learning_rate": 8.021390374331552e-07,
852
- "loss": 1.1824,
853
  "step": 820
854
  },
855
  {
856
- "epoch": 30.02403846153846,
857
- "grad_norm": 8.749896049499512,
 
 
 
 
 
 
 
 
 
858
  "learning_rate": 1.3368983957219251e-07,
859
- "loss": 1.1393,
860
  "step": 830
861
  },
862
  {
863
- "epoch": 30.026442307692307,
864
- "eval_accuracy": 0.3243847874720358,
865
- "eval_loss": 2.5855681896209717,
866
- "eval_runtime": 118.0615,
867
- "eval_samples_per_second": 3.786,
868
- "eval_steps_per_second": 0.119,
869
  "step": 832
870
  },
871
  {
872
- "epoch": 30.026442307692307,
873
  "step": 832,
874
- "total_flos": 3.2147153612960956e+19,
875
- "train_loss": 2.754230235058528,
876
- "train_runtime": 10713.0029,
877
- "train_samples_per_second": 2.485,
878
- "train_steps_per_second": 0.078
879
- },
880
- {
881
- "epoch": 30.026442307692307,
882
- "eval_accuracy": 0.35570469798657717,
883
- "eval_loss": 2.5194525718688965,
884
- "eval_runtime": 114.2775,
885
- "eval_samples_per_second": 3.912,
886
- "eval_steps_per_second": 0.123,
887
  "step": 832
888
  },
889
  {
890
- "epoch": 30.026442307692307,
891
- "eval_accuracy": 0.35570469798657717,
892
- "eval_loss": 2.520230293273926,
893
- "eval_runtime": 112.9248,
894
- "eval_samples_per_second": 3.958,
895
- "eval_steps_per_second": 0.124,
896
  "step": 832
897
  }
898
  ],
@@ -913,8 +1174,8 @@
913
  "attributes": {}
914
  }
915
  },
916
- "total_flos": 3.2147153612960956e+19,
917
- "train_batch_size": 32,
918
  "trial_name": null,
919
  "trial_params": null
920
  }
 
1
  {
2
+ "best_metric": 0.7319819819819819,
3
+ "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-812",
4
+ "epoch": 59.00721153846154,
5
  "eval_steps": 500,
6
  "global_step": 832,
7
  "is_hyper_param_search": false,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01201923076923077,
13
+ "grad_norm": 4.912038326263428,
14
  "learning_rate": 5.9523809523809525e-06,
15
+ "loss": 4.238,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.016826923076923076,
20
+ "eval_accuracy": 0.018018018018018018,
21
+ "eval_loss": 4.223487377166748,
22
+ "eval_runtime": 70.4378,
23
+ "eval_samples_per_second": 6.303,
24
+ "eval_steps_per_second": 0.099,
25
+ "step": 14
26
+ },
27
+ {
28
+ "epoch": 1.0072115384615385,
29
+ "grad_norm": 5.0958333015441895,
30
  "learning_rate": 1.1904761904761905e-05,
31
+ "loss": 4.2658,
32
  "step": 20
33
  },
34
  {
35
+ "epoch": 1.0168269230769231,
36
+ "eval_accuracy": 0.02027027027027027,
37
+ "eval_loss": 4.207489490509033,
38
+ "eval_runtime": 72.9674,
39
+ "eval_samples_per_second": 6.085,
40
+ "eval_steps_per_second": 0.096,
41
+ "step": 28
42
  },
43
  {
44
+ "epoch": 2.0024038461538463,
45
+ "grad_norm": 3.8229589462280273,
46
  "learning_rate": 1.785714285714286e-05,
47
+ "loss": 4.2236,
48
  "step": 30
49
  },
50
  {
51
+ "epoch": 2.014423076923077,
52
+ "grad_norm": 3.7527875900268555,
53
  "learning_rate": 2.380952380952381e-05,
54
+ "loss": 4.219,
55
  "step": 40
56
  },
57
  {
58
+ "epoch": 2.016826923076923,
59
+ "eval_accuracy": 0.015765765765765764,
60
+ "eval_loss": 4.205322265625,
61
+ "eval_runtime": 70.576,
62
+ "eval_samples_per_second": 6.291,
63
+ "eval_steps_per_second": 0.099,
64
+ "step": 42
65
+ },
66
+ {
67
+ "epoch": 3.0096153846153846,
68
+ "grad_norm": 2.5568084716796875,
69
  "learning_rate": 2.9761904761904762e-05,
70
+ "loss": 4.2146,
71
  "step": 50
72
  },
73
  {
74
+ "epoch": 3.016826923076923,
75
+ "eval_accuracy": 0.02252252252252252,
76
+ "eval_loss": 4.209786415100098,
77
+ "eval_runtime": 73.3716,
78
+ "eval_samples_per_second": 6.051,
79
+ "eval_steps_per_second": 0.095,
80
+ "step": 56
81
  },
82
  {
83
+ "epoch": 4.0048076923076925,
84
+ "grad_norm": 2.623307943344116,
85
  "learning_rate": 3.571428571428572e-05,
86
+ "loss": 4.2083,
87
  "step": 60
88
  },
89
  {
90
+ "epoch": 4.016826923076923,
91
+ "grad_norm": 11.633626937866211,
92
  "learning_rate": 4.166666666666667e-05,
93
+ "loss": 4.1925,
94
+ "step": 70
95
+ },
96
+ {
97
+ "epoch": 4.016826923076923,
98
+ "eval_accuracy": 0.02252252252252252,
99
+ "eval_loss": 4.182406902313232,
100
+ "eval_runtime": 73.5871,
101
+ "eval_samples_per_second": 6.034,
102
+ "eval_steps_per_second": 0.095,
103
  "step": 70
104
  },
105
  {
106
+ "epoch": 5.012019230769231,
107
+ "grad_norm": 4.2888689041137695,
108
  "learning_rate": 4.761904761904762e-05,
109
+ "loss": 4.1192,
110
  "step": 80
111
  },
112
  {
113
+ "epoch": 5.016826923076923,
114
+ "eval_accuracy": 0.038288288288288286,
115
+ "eval_loss": 4.081945896148682,
116
+ "eval_runtime": 77.2884,
117
+ "eval_samples_per_second": 5.745,
118
+ "eval_steps_per_second": 0.091,
119
+ "step": 84
120
  },
121
  {
122
+ "epoch": 6.007211538461538,
123
+ "grad_norm": 3.5548884868621826,
124
  "learning_rate": 4.959893048128342e-05,
125
+ "loss": 4.0297,
126
  "step": 90
127
  },
128
  {
129
+ "epoch": 6.016826923076923,
130
+ "eval_accuracy": 0.06981981981981981,
131
+ "eval_loss": 3.981917142868042,
132
+ "eval_runtime": 72.62,
133
+ "eval_samples_per_second": 6.114,
134
+ "eval_steps_per_second": 0.096,
135
+ "step": 98
136
  },
137
  {
138
+ "epoch": 7.002403846153846,
139
+ "grad_norm": 6.507851600646973,
140
+ "learning_rate": 4.8930481283422465e-05,
141
+ "loss": 3.9499,
142
+ "step": 100
 
 
143
  },
144
  {
145
+ "epoch": 7.014423076923077,
146
+ "grad_norm": 5.1739959716796875,
147
  "learning_rate": 4.8262032085561496e-05,
148
+ "loss": 3.7134,
149
  "step": 110
150
  },
151
  {
152
+ "epoch": 7.016826923076923,
153
+ "eval_accuracy": 0.1036036036036036,
154
+ "eval_loss": 3.7339625358581543,
155
+ "eval_runtime": 73.0624,
156
+ "eval_samples_per_second": 6.077,
157
+ "eval_steps_per_second": 0.096,
158
+ "step": 112
159
+ },
160
+ {
161
+ "epoch": 8.009615384615385,
162
+ "grad_norm": 4.7554779052734375,
163
  "learning_rate": 4.759358288770054e-05,
164
+ "loss": 3.5289,
165
  "step": 120
166
  },
167
  {
168
+ "epoch": 8.016826923076923,
169
+ "eval_accuracy": 0.18018018018018017,
170
+ "eval_loss": 3.420488119125366,
171
+ "eval_runtime": 73.5384,
172
+ "eval_samples_per_second": 6.038,
173
+ "eval_steps_per_second": 0.095,
174
+ "step": 126
175
+ },
176
+ {
177
+ "epoch": 9.004807692307692,
178
+ "grad_norm": 6.714512348175049,
179
  "learning_rate": 4.6925133689839576e-05,
180
+ "loss": 3.3709,
181
  "step": 130
182
  },
183
  {
184
+ "epoch": 9.016826923076923,
185
+ "grad_norm": 18.599552154541016,
186
+ "learning_rate": 4.625668449197861e-05,
187
+ "loss": 3.0625,
188
+ "step": 140
 
 
189
  },
190
  {
191
+ "epoch": 9.016826923076923,
192
+ "eval_accuracy": 0.26126126126126126,
193
+ "eval_loss": 3.196960687637329,
194
+ "eval_runtime": 74.7592,
195
+ "eval_samples_per_second": 5.939,
196
+ "eval_steps_per_second": 0.094,
197
  "step": 140
198
  },
199
  {
200
+ "epoch": 10.01201923076923,
201
+ "grad_norm": 7.006440162658691,
202
  "learning_rate": 4.558823529411765e-05,
203
+ "loss": 2.8776,
204
  "step": 150
205
  },
206
  {
207
+ "epoch": 10.016826923076923,
208
+ "eval_accuracy": 0.30180180180180183,
209
+ "eval_loss": 2.9350106716156006,
210
+ "eval_runtime": 75.4948,
211
+ "eval_samples_per_second": 5.881,
212
+ "eval_steps_per_second": 0.093,
213
+ "step": 154
214
+ },
215
+ {
216
+ "epoch": 11.007211538461538,
217
+ "grad_norm": 6.219061374664307,
218
  "learning_rate": 4.491978609625669e-05,
219
+ "loss": 2.6375,
220
  "step": 160
221
  },
222
  {
223
+ "epoch": 11.016826923076923,
224
+ "eval_accuracy": 0.36036036036036034,
225
+ "eval_loss": 2.762986660003662,
226
+ "eval_runtime": 73.4162,
227
+ "eval_samples_per_second": 6.048,
228
+ "eval_steps_per_second": 0.095,
229
+ "step": 168
230
  },
231
  {
232
+ "epoch": 12.002403846153847,
233
+ "grad_norm": 6.574148178100586,
234
  "learning_rate": 4.4251336898395724e-05,
235
+ "loss": 2.4071,
236
  "step": 170
237
  },
238
  {
239
+ "epoch": 12.014423076923077,
240
+ "grad_norm": 8.191123008728027,
241
  "learning_rate": 4.358288770053476e-05,
242
+ "loss": 2.2954,
243
  "step": 180
244
  },
245
  {
246
+ "epoch": 12.016826923076923,
247
+ "eval_accuracy": 0.42792792792792794,
248
+ "eval_loss": 2.4990594387054443,
249
+ "eval_runtime": 80.1492,
250
+ "eval_samples_per_second": 5.54,
251
+ "eval_steps_per_second": 0.087,
252
+ "step": 182
253
  },
254
  {
255
+ "epoch": 13.009615384615385,
256
+ "grad_norm": 6.4414825439453125,
257
  "learning_rate": 4.29144385026738e-05,
258
+ "loss": 2.1337,
259
  "step": 190
260
  },
261
  {
262
+ "epoch": 13.016826923076923,
263
+ "eval_accuracy": 0.43243243243243246,
264
+ "eval_loss": 2.3827102184295654,
265
+ "eval_runtime": 73.7035,
266
+ "eval_samples_per_second": 6.024,
267
+ "eval_steps_per_second": 0.095,
268
+ "step": 196
269
+ },
270
+ {
271
+ "epoch": 14.004807692307692,
272
+ "grad_norm": 8.87990951538086,
273
  "learning_rate": 4.224598930481284e-05,
274
+ "loss": 1.9319,
275
  "step": 200
276
  },
277
  {
278
+ "epoch": 14.016826923076923,
279
+ "grad_norm": 22.130002975463867,
280
  "learning_rate": 4.157754010695187e-05,
281
+ "loss": 1.8195,
282
  "step": 210
283
  },
284
  {
285
+ "epoch": 14.016826923076923,
286
+ "eval_accuracy": 0.481981981981982,
287
+ "eval_loss": 2.2311551570892334,
288
+ "eval_runtime": 75.5729,
289
+ "eval_samples_per_second": 5.875,
290
+ "eval_steps_per_second": 0.093,
291
+ "step": 210
292
  },
293
  {
294
+ "epoch": 15.01201923076923,
295
+ "grad_norm": 7.134392261505127,
296
  "learning_rate": 4.0909090909090915e-05,
297
+ "loss": 1.6436,
298
  "step": 220
299
  },
300
  {
301
+ "epoch": 15.016826923076923,
302
+ "eval_accuracy": 0.44594594594594594,
303
+ "eval_loss": 2.3628787994384766,
304
+ "eval_runtime": 76.2152,
305
+ "eval_samples_per_second": 5.826,
306
+ "eval_steps_per_second": 0.092,
307
+ "step": 224
308
+ },
309
+ {
310
+ "epoch": 16.00721153846154,
311
+ "grad_norm": 11.737651824951172,
312
  "learning_rate": 4.024064171122995e-05,
313
+ "loss": 1.6289,
314
  "step": 230
315
  },
316
  {
317
+ "epoch": 16.016826923076923,
318
+ "eval_accuracy": 0.4752252252252252,
319
+ "eval_loss": 2.151165723800659,
320
+ "eval_runtime": 72.7861,
321
+ "eval_samples_per_second": 6.1,
322
+ "eval_steps_per_second": 0.096,
323
+ "step": 238
324
  },
325
  {
326
+ "epoch": 17.002403846153847,
327
+ "grad_norm": 12.988323211669922,
328
+ "learning_rate": 3.957219251336899e-05,
329
+ "loss": 1.423,
330
+ "step": 240
 
 
331
  },
332
  {
333
+ "epoch": 17.014423076923077,
334
+ "grad_norm": 5.394784450531006,
335
  "learning_rate": 3.8903743315508025e-05,
336
+ "loss": 1.2957,
337
  "step": 250
338
  },
339
  {
340
+ "epoch": 17.016826923076923,
341
+ "eval_accuracy": 0.5022522522522522,
342
+ "eval_loss": 2.0142312049865723,
343
+ "eval_runtime": 74.6687,
344
+ "eval_samples_per_second": 5.946,
345
+ "eval_steps_per_second": 0.094,
346
+ "step": 252
347
+ },
348
+ {
349
+ "epoch": 18.009615384615383,
350
+ "grad_norm": 6.641128063201904,
351
  "learning_rate": 3.8235294117647055e-05,
352
+ "loss": 1.2761,
353
  "step": 260
354
  },
355
  {
356
+ "epoch": 18.016826923076923,
357
+ "eval_accuracy": 0.527027027027027,
358
+ "eval_loss": 1.906096339225769,
359
+ "eval_runtime": 75.2853,
360
+ "eval_samples_per_second": 5.898,
361
+ "eval_steps_per_second": 0.093,
362
+ "step": 266
363
  },
364
  {
365
+ "epoch": 19.004807692307693,
366
+ "grad_norm": 7.859009742736816,
367
+ "learning_rate": 3.75668449197861e-05,
368
+ "loss": 1.217,
 
 
369
  "step": 270
370
  },
371
  {
372
+ "epoch": 19.016826923076923,
373
+ "grad_norm": 29.06666374206543,
374
  "learning_rate": 3.6898395721925136e-05,
375
+ "loss": 1.1118,
376
  "step": 280
377
  },
378
  {
379
+ "epoch": 19.016826923076923,
380
+ "eval_accuracy": 0.5495495495495496,
381
+ "eval_loss": 1.811281681060791,
382
+ "eval_runtime": 81.5227,
383
+ "eval_samples_per_second": 5.446,
384
+ "eval_steps_per_second": 0.086,
385
+ "step": 280
386
+ },
387
+ {
388
+ "epoch": 20.01201923076923,
389
+ "grad_norm": 8.949642181396484,
390
  "learning_rate": 3.622994652406417e-05,
391
+ "loss": 0.9642,
392
  "step": 290
393
  },
394
  {
395
+ "epoch": 20.016826923076923,
396
+ "eval_accuracy": 0.6036036036036037,
397
+ "eval_loss": 1.727989673614502,
398
+ "eval_runtime": 72.1469,
399
+ "eval_samples_per_second": 6.154,
400
+ "eval_steps_per_second": 0.097,
401
+ "step": 294
402
  },
403
  {
404
+ "epoch": 21.00721153846154,
405
+ "grad_norm": 6.73243522644043,
406
  "learning_rate": 3.556149732620321e-05,
407
+ "loss": 0.894,
408
  "step": 300
409
  },
410
  {
411
+ "epoch": 21.016826923076923,
412
+ "eval_accuracy": 0.5382882882882883,
413
+ "eval_loss": 1.8722944259643555,
414
+ "eval_runtime": 76.1286,
415
+ "eval_samples_per_second": 5.832,
416
+ "eval_steps_per_second": 0.092,
417
+ "step": 308
418
+ },
419
+ {
420
+ "epoch": 22.002403846153847,
421
+ "grad_norm": 8.856271743774414,
422
  "learning_rate": 3.489304812834225e-05,
423
+ "loss": 0.8454,
424
  "step": 310
425
  },
426
  {
427
+ "epoch": 22.014423076923077,
428
+ "grad_norm": 6.268283843994141,
429
  "learning_rate": 3.4224598930481284e-05,
430
+ "loss": 0.7974,
431
  "step": 320
432
  },
433
  {
434
+ "epoch": 22.016826923076923,
435
+ "eval_accuracy": 0.6058558558558559,
436
+ "eval_loss": 1.658478021621704,
437
+ "eval_runtime": 69.8498,
438
+ "eval_samples_per_second": 6.356,
439
+ "eval_steps_per_second": 0.1,
440
+ "step": 322
441
  },
442
  {
443
+ "epoch": 23.009615384615383,
444
+ "grad_norm": 6.027498722076416,
445
  "learning_rate": 3.355614973262032e-05,
446
+ "loss": 0.833,
447
  "step": 330
448
  },
449
  {
450
+ "epoch": 23.016826923076923,
451
+ "eval_accuracy": 0.6148648648648649,
452
+ "eval_loss": 1.599608302116394,
453
+ "eval_runtime": 76.7983,
454
+ "eval_samples_per_second": 5.781,
455
+ "eval_steps_per_second": 0.091,
456
+ "step": 336
457
+ },
458
+ {
459
+ "epoch": 24.004807692307693,
460
+ "grad_norm": 5.669070243835449,
461
  "learning_rate": 3.288770053475936e-05,
462
+ "loss": 0.6668,
463
  "step": 340
464
  },
465
  {
466
+ "epoch": 24.016826923076923,
467
+ "grad_norm": 12.669683456420898,
468
  "learning_rate": 3.22192513368984e-05,
469
+ "loss": 0.6431,
470
  "step": 350
471
  },
472
  {
473
+ "epoch": 24.016826923076923,
474
+ "eval_accuracy": 0.6148648648648649,
475
+ "eval_loss": 1.560927152633667,
476
+ "eval_runtime": 69.5019,
477
+ "eval_samples_per_second": 6.388,
478
+ "eval_steps_per_second": 0.101,
479
+ "step": 350
480
  },
481
  {
482
+ "epoch": 25.01201923076923,
483
+ "grad_norm": 5.4236626625061035,
484
  "learning_rate": 3.155080213903743e-05,
485
+ "loss": 0.5873,
486
  "step": 360
487
  },
488
  {
489
+ "epoch": 25.016826923076923,
490
+ "eval_accuracy": 0.6171171171171171,
491
+ "eval_loss": 1.6108297109603882,
492
+ "eval_runtime": 73.8756,
493
+ "eval_samples_per_second": 6.01,
494
+ "eval_steps_per_second": 0.095,
495
+ "step": 364
496
+ },
497
+ {
498
+ "epoch": 26.00721153846154,
499
+ "grad_norm": 5.605001449584961,
500
  "learning_rate": 3.0882352941176475e-05,
501
+ "loss": 0.5554,
502
  "step": 370
503
  },
504
  {
505
+ "epoch": 26.016826923076923,
506
+ "eval_accuracy": 0.6531531531531531,
507
+ "eval_loss": 1.4013562202453613,
508
+ "eval_runtime": 79.7998,
509
+ "eval_samples_per_second": 5.564,
510
+ "eval_steps_per_second": 0.088,
511
  "step": 378
512
  },
513
  {
514
+ "epoch": 27.002403846153847,
515
+ "grad_norm": 6.0180277824401855,
516
  "learning_rate": 3.0213903743315508e-05,
517
+ "loss": 0.5428,
518
  "step": 380
519
  },
520
  {
521
+ "epoch": 27.014423076923077,
522
+ "grad_norm": 7.198999404907227,
523
  "learning_rate": 2.954545454545455e-05,
524
+ "loss": 0.4786,
525
  "step": 390
526
  },
527
  {
528
+ "epoch": 27.016826923076923,
529
+ "eval_accuracy": 0.6621621621621622,
530
+ "eval_loss": 1.433493733406067,
531
+ "eval_runtime": 71.2505,
532
+ "eval_samples_per_second": 6.232,
533
+ "eval_steps_per_second": 0.098,
534
+ "step": 392
535
+ },
536
+ {
537
+ "epoch": 28.009615384615383,
538
+ "grad_norm": 4.281993389129639,
539
  "learning_rate": 2.8877005347593582e-05,
540
+ "loss": 0.4252,
541
  "step": 400
542
  },
543
  {
544
+ "epoch": 28.016826923076923,
545
+ "eval_accuracy": 0.6509009009009009,
546
+ "eval_loss": 1.444474220275879,
547
+ "eval_runtime": 71.5031,
548
+ "eval_samples_per_second": 6.21,
549
+ "eval_steps_per_second": 0.098,
550
+ "step": 406
551
  },
552
  {
553
+ "epoch": 29.004807692307693,
554
+ "grad_norm": 6.2630534172058105,
555
  "learning_rate": 2.8208556149732622e-05,
556
+ "loss": 0.4527,
557
  "step": 410
558
  },
559
  {
560
+ "epoch": 29.016826923076923,
561
+ "grad_norm": 4.163498878479004,
562
  "learning_rate": 2.754010695187166e-05,
563
+ "loss": 0.382,
564
+ "step": 420
565
+ },
566
+ {
567
+ "epoch": 29.016826923076923,
568
+ "eval_accuracy": 0.6621621621621622,
569
+ "eval_loss": 1.3914759159088135,
570
+ "eval_runtime": 72.0609,
571
+ "eval_samples_per_second": 6.161,
572
+ "eval_steps_per_second": 0.097,
573
  "step": 420
574
  },
575
  {
576
+ "epoch": 30.01201923076923,
577
+ "grad_norm": 5.012676239013672,
578
  "learning_rate": 2.68716577540107e-05,
579
+ "loss": 0.365,
580
  "step": 430
581
  },
582
  {
583
+ "epoch": 30.016826923076923,
584
+ "eval_accuracy": 0.6846846846846847,
585
+ "eval_loss": 1.297812819480896,
586
+ "eval_runtime": 73.674,
587
+ "eval_samples_per_second": 6.027,
588
+ "eval_steps_per_second": 0.095,
589
+ "step": 434
590
  },
591
  {
592
+ "epoch": 31.00721153846154,
593
+ "grad_norm": 4.618253707885742,
594
  "learning_rate": 2.6203208556149733e-05,
595
+ "loss": 0.319,
596
  "step": 440
597
  },
598
  {
599
+ "epoch": 31.016826923076923,
600
+ "eval_accuracy": 0.6824324324324325,
601
+ "eval_loss": 1.3218427896499634,
602
+ "eval_runtime": 74.4745,
603
+ "eval_samples_per_second": 5.962,
604
+ "eval_steps_per_second": 0.094,
605
+ "step": 448
606
  },
607
  {
608
+ "epoch": 32.00240384615385,
609
+ "grad_norm": 6.823300361633301,
610
+ "learning_rate": 2.5534759358288773e-05,
611
+ "loss": 0.3829,
612
+ "step": 450
 
 
613
  },
614
  {
615
+ "epoch": 32.01442307692308,
616
+ "grad_norm": 5.096348285675049,
617
  "learning_rate": 2.4866310160427807e-05,
618
+ "loss": 0.3167,
619
  "step": 460
620
  },
621
  {
622
+ "epoch": 32.01682692307692,
623
+ "eval_accuracy": 0.6644144144144144,
624
+ "eval_loss": 1.3495796918869019,
625
+ "eval_runtime": 73.0271,
626
+ "eval_samples_per_second": 6.08,
627
+ "eval_steps_per_second": 0.096,
628
+ "step": 462
629
+ },
630
+ {
631
+ "epoch": 33.00961538461539,
632
+ "grad_norm": 6.042089462280273,
633
  "learning_rate": 2.4197860962566847e-05,
634
+ "loss": 0.2797,
635
  "step": 470
636
  },
637
  {
638
+ "epoch": 33.01682692307692,
639
+ "eval_accuracy": 0.6801801801801802,
640
+ "eval_loss": 1.2806001901626587,
641
+ "eval_runtime": 72.9489,
642
+ "eval_samples_per_second": 6.086,
643
+ "eval_steps_per_second": 0.096,
644
+ "step": 476
645
+ },
646
+ {
647
+ "epoch": 34.00480769230769,
648
+ "grad_norm": 4.125849723815918,
649
  "learning_rate": 2.3529411764705884e-05,
650
+ "loss": 0.281,
651
  "step": 480
652
  },
653
  {
654
+ "epoch": 34.01682692307692,
655
+ "grad_norm": 7.1154704093933105,
656
+ "learning_rate": 2.286096256684492e-05,
657
+ "loss": 0.2864,
658
+ "step": 490
 
 
659
  },
660
  {
661
+ "epoch": 34.01682692307692,
662
+ "eval_accuracy": 0.7072072072072072,
663
+ "eval_loss": 1.219117283821106,
664
+ "eval_runtime": 74.6962,
665
+ "eval_samples_per_second": 5.944,
666
+ "eval_steps_per_second": 0.094,
667
  "step": 490
668
  },
669
  {
670
+ "epoch": 35.01201923076923,
671
+ "grad_norm": 6.536057949066162,
672
  "learning_rate": 2.2192513368983957e-05,
673
+ "loss": 0.2927,
674
  "step": 500
675
  },
676
  {
677
+ "epoch": 35.01682692307692,
678
+ "eval_accuracy": 0.7207207207207207,
679
+ "eval_loss": 1.2134795188903809,
680
+ "eval_runtime": 72.1404,
681
+ "eval_samples_per_second": 6.155,
682
+ "eval_steps_per_second": 0.097,
683
+ "step": 504
684
+ },
685
+ {
686
+ "epoch": 36.00721153846154,
687
+ "grad_norm": 4.703156471252441,
688
  "learning_rate": 2.1524064171122994e-05,
689
+ "loss": 0.2698,
690
  "step": 510
691
  },
692
  {
693
+ "epoch": 36.01682692307692,
694
+ "eval_accuracy": 0.6914414414414415,
695
+ "eval_loss": 1.250654697418213,
696
+ "eval_runtime": 73.2059,
697
+ "eval_samples_per_second": 6.065,
698
+ "eval_steps_per_second": 0.096,
699
+ "step": 518
700
  },
701
  {
702
+ "epoch": 37.00240384615385,
703
+ "grad_norm": 4.395312786102295,
704
  "learning_rate": 2.0855614973262035e-05,
705
+ "loss": 0.256,
706
  "step": 520
707
  },
708
  {
709
+ "epoch": 37.01442307692308,
710
+ "grad_norm": 3.43831729888916,
711
  "learning_rate": 2.018716577540107e-05,
712
+ "loss": 0.2333,
713
  "step": 530
714
  },
715
  {
716
+ "epoch": 37.01682692307692,
717
+ "eval_accuracy": 0.7094594594594594,
718
+ "eval_loss": 1.2037817239761353,
719
+ "eval_runtime": 74.757,
720
+ "eval_samples_per_second": 5.939,
721
+ "eval_steps_per_second": 0.094,
722
+ "step": 532
723
+ },
724
+ {
725
+ "epoch": 38.00961538461539,
726
+ "grad_norm": 4.047479152679443,
727
  "learning_rate": 1.951871657754011e-05,
728
+ "loss": 0.2366,
729
  "step": 540
730
  },
731
  {
732
+ "epoch": 38.01682692307692,
733
+ "eval_accuracy": 0.7207207207207207,
734
+ "eval_loss": 1.1517395973205566,
735
+ "eval_runtime": 73.764,
736
+ "eval_samples_per_second": 6.019,
737
+ "eval_steps_per_second": 0.095,
738
+ "step": 546
739
  },
740
  {
741
+ "epoch": 39.00480769230769,
742
+ "grad_norm": 5.707859516143799,
743
  "learning_rate": 1.8850267379679145e-05,
744
+ "loss": 0.1938,
745
  "step": 550
746
  },
747
  {
748
+ "epoch": 39.01682692307692,
749
+ "grad_norm": 3.7590107917785645,
750
  "learning_rate": 1.8181818181818182e-05,
751
+ "loss": 0.1886,
752
  "step": 560
753
  },
754
  {
755
+ "epoch": 39.01682692307692,
756
+ "eval_accuracy": 0.7094594594594594,
757
+ "eval_loss": 1.2073884010314941,
758
+ "eval_runtime": 72.0607,
759
+ "eval_samples_per_second": 6.161,
760
+ "eval_steps_per_second": 0.097,
761
+ "step": 560
762
  },
763
  {
764
+ "epoch": 40.01201923076923,
765
+ "grad_norm": 2.7063074111938477,
766
  "learning_rate": 1.7513368983957222e-05,
767
+ "loss": 0.1804,
768
  "step": 570
769
  },
770
  {
771
+ "epoch": 40.01682692307692,
772
+ "eval_accuracy": 0.7027027027027027,
773
+ "eval_loss": 1.1658011674880981,
774
+ "eval_runtime": 77.6928,
775
+ "eval_samples_per_second": 5.715,
776
+ "eval_steps_per_second": 0.09,
777
+ "step": 574
778
+ },
779
+ {
780
+ "epoch": 41.00721153846154,
781
+ "grad_norm": 6.454692363739014,
782
  "learning_rate": 1.684491978609626e-05,
783
+ "loss": 0.1778,
784
  "step": 580
785
  },
786
  {
787
+ "epoch": 41.01682692307692,
788
+ "eval_accuracy": 0.6824324324324325,
789
+ "eval_loss": 1.2350265979766846,
790
+ "eval_runtime": 73.7916,
791
+ "eval_samples_per_second": 6.017,
792
+ "eval_steps_per_second": 0.095,
793
+ "step": 588
794
  },
795
  {
796
+ "epoch": 42.00240384615385,
797
+ "grad_norm": 3.444183349609375,
798
+ "learning_rate": 1.6176470588235296e-05,
799
+ "loss": 0.2187,
800
+ "step": 590
 
 
801
  },
802
  {
803
+ "epoch": 42.01442307692308,
804
+ "grad_norm": 3.9001522064208984,
805
  "learning_rate": 1.5508021390374333e-05,
806
+ "loss": 0.1728,
807
  "step": 600
808
  },
809
  {
810
+ "epoch": 42.01682692307692,
811
+ "eval_accuracy": 0.7162162162162162,
812
+ "eval_loss": 1.1637804508209229,
813
+ "eval_runtime": 77.911,
814
+ "eval_samples_per_second": 5.699,
815
+ "eval_steps_per_second": 0.09,
816
+ "step": 602
817
+ },
818
+ {
819
+ "epoch": 43.00961538461539,
820
+ "grad_norm": 7.290647029876709,
821
  "learning_rate": 1.4839572192513372e-05,
822
+ "loss": 0.1998,
823
  "step": 610
824
  },
825
  {
826
+ "epoch": 43.01682692307692,
827
+ "eval_accuracy": 0.6959459459459459,
828
+ "eval_loss": 1.2359126806259155,
829
+ "eval_runtime": 72.9535,
830
+ "eval_samples_per_second": 6.086,
831
+ "eval_steps_per_second": 0.096,
832
+ "step": 616
833
+ },
834
+ {
835
+ "epoch": 44.00480769230769,
836
+ "grad_norm": 5.636298656463623,
837
  "learning_rate": 1.4171122994652408e-05,
838
+ "loss": 0.1639,
839
  "step": 620
840
  },
841
  {
842
+ "epoch": 44.01682692307692,
843
+ "grad_norm": 3.6947968006134033,
844
+ "learning_rate": 1.3502673796791445e-05,
845
+ "loss": 0.1727,
846
+ "step": 630
 
 
847
  },
848
  {
849
+ "epoch": 44.01682692307692,
850
+ "eval_accuracy": 0.6936936936936937,
851
+ "eval_loss": 1.232095718383789,
852
+ "eval_runtime": 81.4582,
853
+ "eval_samples_per_second": 5.451,
854
+ "eval_steps_per_second": 0.086,
855
  "step": 630
856
  },
857
  {
858
+ "epoch": 45.01201923076923,
859
+ "grad_norm": 3.2019245624542236,
860
  "learning_rate": 1.2834224598930484e-05,
861
+ "loss": 0.1564,
862
  "step": 640
863
  },
864
  {
865
+ "epoch": 45.01682692307692,
866
+ "eval_accuracy": 0.713963963963964,
867
+ "eval_loss": 1.1604844331741333,
868
+ "eval_runtime": 72.1149,
869
+ "eval_samples_per_second": 6.157,
870
+ "eval_steps_per_second": 0.097,
871
+ "step": 644
872
  },
873
  {
874
+ "epoch": 46.00721153846154,
875
+ "grad_norm": 1.9841790199279785,
876
  "learning_rate": 1.2165775401069519e-05,
877
+ "loss": 0.1888,
878
  "step": 650
879
  },
880
  {
881
+ "epoch": 46.01682692307692,
882
+ "eval_accuracy": 0.7094594594594594,
883
+ "eval_loss": 1.1609092950820923,
884
+ "eval_runtime": 71.8777,
885
+ "eval_samples_per_second": 6.177,
886
+ "eval_steps_per_second": 0.097,
887
+ "step": 658
888
+ },
889
+ {
890
+ "epoch": 47.00240384615385,
891
+ "grad_norm": 7.085150241851807,
892
  "learning_rate": 1.1497326203208558e-05,
893
+ "loss": 0.1618,
894
  "step": 660
895
  },
896
  {
897
+ "epoch": 47.01442307692308,
898
+ "grad_norm": 5.41819429397583,
899
  "learning_rate": 1.0828877005347594e-05,
900
+ "loss": 0.1227,
901
  "step": 670
902
  },
903
  {
904
+ "epoch": 47.01682692307692,
905
+ "eval_accuracy": 0.7117117117117117,
906
+ "eval_loss": 1.1588457822799683,
907
+ "eval_runtime": 82.846,
908
+ "eval_samples_per_second": 5.359,
909
+ "eval_steps_per_second": 0.084,
910
+ "step": 672
911
  },
912
  {
913
+ "epoch": 48.00961538461539,
914
+ "grad_norm": 2.3894031047821045,
915
  "learning_rate": 1.0160427807486631e-05,
916
+ "loss": 0.134,
917
  "step": 680
918
  },
919
  {
920
+ "epoch": 48.01682692307692,
921
+ "eval_accuracy": 0.7072072072072072,
922
+ "eval_loss": 1.1698147058486938,
923
+ "eval_runtime": 81.0499,
924
+ "eval_samples_per_second": 5.478,
925
+ "eval_steps_per_second": 0.086,
926
+ "step": 686
927
+ },
928
+ {
929
+ "epoch": 49.00480769230769,
930
+ "grad_norm": 4.01511287689209,
931
  "learning_rate": 9.49197860962567e-06,
932
+ "loss": 0.1234,
933
  "step": 690
934
  },
935
  {
936
+ "epoch": 49.01682692307692,
937
+ "grad_norm": 7.249420642852783,
938
  "learning_rate": 8.823529411764707e-06,
939
+ "loss": 0.1622,
940
  "step": 700
941
  },
942
  {
943
+ "epoch": 49.01682692307692,
944
+ "eval_accuracy": 0.6981981981981982,
945
+ "eval_loss": 1.2014145851135254,
946
+ "eval_runtime": 73.38,
947
+ "eval_samples_per_second": 6.051,
948
+ "eval_steps_per_second": 0.095,
949
+ "step": 700
950
  },
951
  {
952
+ "epoch": 50.01201923076923,
953
+ "grad_norm": 4.699652194976807,
954
  "learning_rate": 8.155080213903744e-06,
955
+ "loss": 0.1391,
956
  "step": 710
957
  },
958
  {
959
+ "epoch": 50.01682692307692,
960
+ "eval_accuracy": 0.7162162162162162,
961
+ "eval_loss": 1.1005959510803223,
962
+ "eval_runtime": 73.5034,
963
+ "eval_samples_per_second": 6.041,
964
+ "eval_steps_per_second": 0.095,
965
+ "step": 714
966
+ },
967
+ {
968
+ "epoch": 51.00721153846154,
969
+ "grad_norm": 3.143523931503296,
970
  "learning_rate": 7.4866310160427806e-06,
971
+ "loss": 0.1276,
972
  "step": 720
973
  },
974
  {
975
+ "epoch": 51.01682692307692,
976
+ "eval_accuracy": 0.6891891891891891,
977
+ "eval_loss": 1.1485284566879272,
978
+ "eval_runtime": 77.8096,
979
+ "eval_samples_per_second": 5.706,
980
+ "eval_steps_per_second": 0.09,
981
+ "step": 728
982
  },
983
  {
984
+ "epoch": 52.00240384615385,
985
+ "grad_norm": 1.1826622486114502,
986
  "learning_rate": 6.818181818181818e-06,
987
+ "loss": 0.0979,
988
  "step": 730
989
  },
990
  {
991
+ "epoch": 52.01442307692308,
992
+ "grad_norm": 3.218552589416504,
993
  "learning_rate": 6.149732620320856e-06,
994
+ "loss": 0.1222,
995
  "step": 740
996
  },
997
  {
998
+ "epoch": 52.01682692307692,
999
+ "eval_accuracy": 0.713963963963964,
1000
+ "eval_loss": 1.0801595449447632,
1001
+ "eval_runtime": 73.9346,
1002
+ "eval_samples_per_second": 6.005,
1003
+ "eval_steps_per_second": 0.095,
1004
+ "step": 742
1005
+ },
1006
+ {
1007
+ "epoch": 53.00961538461539,
1008
+ "grad_norm": 4.1740312576293945,
1009
  "learning_rate": 5.481283422459893e-06,
1010
+ "loss": 0.1024,
1011
  "step": 750
1012
  },
1013
  {
1014
+ "epoch": 53.01682692307692,
1015
+ "eval_accuracy": 0.704954954954955,
1016
+ "eval_loss": 1.1242848634719849,
1017
+ "eval_runtime": 72.7853,
1018
+ "eval_samples_per_second": 6.1,
1019
+ "eval_steps_per_second": 0.096,
1020
  "step": 756
1021
  },
1022
  {
1023
+ "epoch": 54.00480769230769,
1024
+ "grad_norm": 3.4602904319763184,
1025
  "learning_rate": 4.812834224598931e-06,
1026
+ "loss": 0.1014,
1027
  "step": 760
1028
  },
1029
  {
1030
+ "epoch": 54.01682692307692,
1031
+ "grad_norm": 24.67173957824707,
1032
  "learning_rate": 4.144385026737968e-06,
1033
+ "loss": 0.1186,
1034
+ "step": 770
1035
+ },
1036
+ {
1037
+ "epoch": 54.01682692307692,
1038
+ "eval_accuracy": 0.7297297297297297,
1039
+ "eval_loss": 1.0909080505371094,
1040
+ "eval_runtime": 74.1097,
1041
+ "eval_samples_per_second": 5.991,
1042
+ "eval_steps_per_second": 0.094,
1043
  "step": 770
1044
  },
1045
  {
1046
+ "epoch": 55.01201923076923,
1047
+ "grad_norm": 2.6615827083587646,
1048
  "learning_rate": 3.4759358288770056e-06,
1049
+ "loss": 0.1121,
1050
  "step": 780
1051
  },
1052
  {
1053
+ "epoch": 55.01682692307692,
1054
+ "eval_accuracy": 0.7094594594594594,
1055
+ "eval_loss": 1.1351189613342285,
1056
+ "eval_runtime": 73.5115,
1057
+ "eval_samples_per_second": 6.04,
1058
+ "eval_steps_per_second": 0.095,
1059
+ "step": 784
1060
  },
1061
  {
1062
+ "epoch": 56.00721153846154,
1063
+ "grad_norm": 3.412928581237793,
1064
  "learning_rate": 2.807486631016043e-06,
1065
+ "loss": 0.1284,
1066
  "step": 790
1067
  },
1068
  {
1069
+ "epoch": 56.01682692307692,
1070
+ "eval_accuracy": 0.7274774774774775,
1071
+ "eval_loss": 1.1095025539398193,
1072
+ "eval_runtime": 73.055,
1073
+ "eval_samples_per_second": 6.078,
1074
+ "eval_steps_per_second": 0.096,
1075
+ "step": 798
1076
+ },
1077
+ {
1078
+ "epoch": 57.00240384615385,
1079
+ "grad_norm": 4.214470386505127,
1080
  "learning_rate": 2.1390374331550802e-06,
1081
+ "loss": 0.1418,
1082
  "step": 800
1083
  },
1084
  {
1085
+ "epoch": 57.01442307692308,
1086
+ "grad_norm": 2.0941081047058105,
1087
  "learning_rate": 1.4705882352941177e-06,
1088
+ "loss": 0.0893,
1089
  "step": 810
1090
  },
1091
  {
1092
+ "epoch": 57.01682692307692,
1093
+ "eval_accuracy": 0.7319819819819819,
1094
+ "eval_loss": 1.084416389465332,
1095
+ "eval_runtime": 71.5145,
1096
+ "eval_samples_per_second": 6.209,
1097
+ "eval_steps_per_second": 0.098,
1098
+ "step": 812
1099
  },
1100
  {
1101
+ "epoch": 58.00961538461539,
1102
+ "grad_norm": 4.0398101806640625,
1103
  "learning_rate": 8.021390374331552e-07,
1104
+ "loss": 0.0878,
1105
  "step": 820
1106
  },
1107
  {
1108
+ "epoch": 58.01682692307692,
1109
+ "eval_accuracy": 0.7297297297297297,
1110
+ "eval_loss": 1.0803806781768799,
1111
+ "eval_runtime": 71.6842,
1112
+ "eval_samples_per_second": 6.194,
1113
+ "eval_steps_per_second": 0.098,
1114
+ "step": 826
1115
+ },
1116
+ {
1117
+ "epoch": 59.00480769230769,
1118
+ "grad_norm": 1.0705435276031494,
1119
  "learning_rate": 1.3368983957219251e-07,
1120
+ "loss": 0.0887,
1121
  "step": 830
1122
  },
1123
  {
1124
+ "epoch": 59.00721153846154,
1125
+ "eval_accuracy": 0.7297297297297297,
1126
+ "eval_loss": 1.0809234380722046,
1127
+ "eval_runtime": 72.7582,
1128
+ "eval_samples_per_second": 6.102,
1129
+ "eval_steps_per_second": 0.096,
1130
  "step": 832
1131
  },
1132
  {
1133
+ "epoch": 59.00721153846154,
1134
  "step": 832,
1135
+ "total_flos": 6.1975283578694664e+19,
1136
+ "train_loss": 1.175738023289551,
1137
+ "train_runtime": 12896.1846,
1138
+ "train_samples_per_second": 4.129,
1139
+ "train_steps_per_second": 0.065
1140
+ },
1141
+ {
1142
+ "epoch": 59.00721153846154,
1143
+ "eval_accuracy": 0.7319819819819819,
1144
+ "eval_loss": 1.084416389465332,
1145
+ "eval_runtime": 73.94,
1146
+ "eval_samples_per_second": 6.005,
1147
+ "eval_steps_per_second": 0.095,
1148
  "step": 832
1149
  },
1150
  {
1151
+ "epoch": 59.00721153846154,
1152
+ "eval_accuracy": 0.7319819819819819,
1153
+ "eval_loss": 1.084416389465332,
1154
+ "eval_runtime": 75.3126,
1155
+ "eval_samples_per_second": 5.895,
1156
+ "eval_steps_per_second": 0.093,
1157
  "step": 832
1158
  }
1159
  ],
 
1174
  "attributes": {}
1175
  }
1176
  },
1177
+ "total_flos": 6.1975283578694664e+19,
1178
+ "train_batch_size": 64,
1179
  "trial_name": null,
1180
  "trial_params": null
1181
  }