ihsanahakiim commited on
Commit
9cf6492
·
verified ·
1 Parent(s): b0eb596

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 59.00721153846154,
3
- "eval_accuracy": 0.7319819819819819,
4
- "eval_loss": 1.084416389465332,
5
- "eval_runtime": 75.3126,
6
- "eval_samples_per_second": 5.895,
7
- "eval_steps_per_second": 0.093
8
  }
 
1
  {
2
+ "epoch": 119.00833333333334,
3
+ "eval_accuracy": 0.5539772727272727,
4
+ "eval_loss": 1.9018956422805786,
5
+ "eval_runtime": 133.7632,
6
+ "eval_samples_per_second": 10.526,
7
+ "eval_steps_per_second": 0.164
8
  }
runs/Jan16_11-24-13_GAN-SVR/events.out.tfevents.1737080111.GAN-SVR.864097.1 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bba6adbc35ef809071241cec5b961f758e5a25c8e06a4b0f62104a6bf8356c0e
3
- size 411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b76810f99e49e3c5531f5cf094dc0380534fa9fdda0f2e9bfdf313f9262f6d4
3
+ size 734
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 59.00721153846154,
3
- "eval_accuracy": 0.7319819819819819,
4
- "eval_loss": 1.084416389465332,
5
- "eval_runtime": 75.3126,
6
- "eval_samples_per_second": 5.895,
7
- "eval_steps_per_second": 0.093
8
  }
 
1
  {
2
+ "epoch": 119.00833333333334,
3
+ "eval_accuracy": 0.5539772727272727,
4
+ "eval_loss": 1.9018956422805786,
5
+ "eval_runtime": 133.7632,
6
+ "eval_samples_per_second": 10.526,
7
+ "eval_steps_per_second": 0.164
8
  }
trainer_state.json CHANGED
@@ -1,1164 +1,2467 @@
1
  {
2
- "best_metric": 0.7319819819819819,
3
- "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-812",
4
- "epoch": 59.00721153846154,
5
  "eval_steps": 500,
6
- "global_step": 832,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01201923076923077,
13
- "grad_norm": 4.912038326263428,
14
- "learning_rate": 5.9523809523809525e-06,
15
- "loss": 4.238,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.016826923076923076,
20
- "eval_accuracy": 0.018018018018018018,
21
- "eval_loss": 4.223487377166748,
22
- "eval_runtime": 70.4378,
23
- "eval_samples_per_second": 6.303,
24
- "eval_steps_per_second": 0.099,
25
- "step": 14
26
  },
27
  {
28
- "epoch": 1.0072115384615385,
29
- "grad_norm": 5.0958333015441895,
30
- "learning_rate": 1.1904761904761905e-05,
31
- "loss": 4.2658,
32
  "step": 20
33
  },
34
  {
35
- "epoch": 1.0168269230769231,
36
- "eval_accuracy": 0.02027027027027027,
37
- "eval_loss": 4.207489490509033,
38
- "eval_runtime": 72.9674,
39
- "eval_samples_per_second": 6.085,
40
- "eval_steps_per_second": 0.096,
41
- "step": 28
42
  },
43
  {
44
- "epoch": 2.0024038461538463,
45
- "grad_norm": 3.8229589462280273,
46
- "learning_rate": 1.785714285714286e-05,
47
- "loss": 4.2236,
48
- "step": 30
 
 
49
  },
50
  {
51
- "epoch": 2.014423076923077,
52
- "grad_norm": 3.7527875900268555,
53
- "learning_rate": 2.380952380952381e-05,
54
- "loss": 4.219,
55
  "step": 40
56
  },
57
  {
58
- "epoch": 2.016826923076923,
59
- "eval_accuracy": 0.015765765765765764,
60
- "eval_loss": 4.205322265625,
61
- "eval_runtime": 70.576,
62
- "eval_samples_per_second": 6.291,
63
- "eval_steps_per_second": 0.099,
64
- "step": 42
65
  },
66
  {
67
- "epoch": 3.0096153846153846,
68
- "grad_norm": 2.5568084716796875,
69
- "learning_rate": 2.9761904761904762e-05,
70
- "loss": 4.2146,
71
  "step": 50
72
  },
73
  {
74
- "epoch": 3.016826923076923,
75
- "eval_accuracy": 0.02252252252252252,
76
- "eval_loss": 4.209786415100098,
77
- "eval_runtime": 73.3716,
78
- "eval_samples_per_second": 6.051,
79
- "eval_steps_per_second": 0.095,
80
- "step": 56
81
- },
82
- {
83
- "epoch": 4.0048076923076925,
84
- "grad_norm": 2.623307943344116,
85
- "learning_rate": 3.571428571428572e-05,
86
- "loss": 4.2083,
87
  "step": 60
88
  },
89
  {
90
- "epoch": 4.016826923076923,
91
- "grad_norm": 11.633626937866211,
92
- "learning_rate": 4.166666666666667e-05,
93
- "loss": 4.1925,
94
- "step": 70
 
 
95
  },
96
  {
97
- "epoch": 4.016826923076923,
98
- "eval_accuracy": 0.02252252252252252,
99
- "eval_loss": 4.182406902313232,
100
- "eval_runtime": 73.5871,
101
- "eval_samples_per_second": 6.034,
102
- "eval_steps_per_second": 0.095,
103
  "step": 70
104
  },
105
  {
106
- "epoch": 5.012019230769231,
107
- "grad_norm": 4.2888689041137695,
108
- "learning_rate": 4.761904761904762e-05,
109
- "loss": 4.1192,
110
  "step": 80
111
  },
112
  {
113
- "epoch": 5.016826923076923,
114
- "eval_accuracy": 0.038288288288288286,
115
- "eval_loss": 4.081945896148682,
116
- "eval_runtime": 77.2884,
117
- "eval_samples_per_second": 5.745,
118
- "eval_steps_per_second": 0.091,
119
- "step": 84
120
  },
121
  {
122
- "epoch": 6.007211538461538,
123
- "grad_norm": 3.5548884868621826,
124
- "learning_rate": 4.959893048128342e-05,
125
- "loss": 4.0297,
126
  "step": 90
127
  },
128
  {
129
- "epoch": 6.016826923076923,
130
- "eval_accuracy": 0.06981981981981981,
131
- "eval_loss": 3.981917142868042,
132
- "eval_runtime": 72.62,
133
- "eval_samples_per_second": 6.114,
134
- "eval_steps_per_second": 0.096,
135
- "step": 98
136
  },
137
  {
138
- "epoch": 7.002403846153846,
139
- "grad_norm": 6.507851600646973,
140
- "learning_rate": 4.8930481283422465e-05,
141
- "loss": 3.9499,
142
  "step": 100
143
  },
144
  {
145
- "epoch": 7.014423076923077,
146
- "grad_norm": 5.1739959716796875,
147
- "learning_rate": 4.8262032085561496e-05,
148
- "loss": 3.7134,
149
  "step": 110
150
  },
151
  {
152
- "epoch": 7.016826923076923,
153
- "eval_accuracy": 0.1036036036036036,
154
- "eval_loss": 3.7339625358581543,
155
- "eval_runtime": 73.0624,
156
- "eval_samples_per_second": 6.077,
157
- "eval_steps_per_second": 0.096,
158
  "step": 112
159
  },
160
  {
161
- "epoch": 8.009615384615385,
162
- "grad_norm": 4.7554779052734375,
163
- "learning_rate": 4.759358288770054e-05,
164
- "loss": 3.5289,
165
  "step": 120
166
  },
167
  {
168
- "epoch": 8.016826923076923,
169
- "eval_accuracy": 0.18018018018018017,
170
- "eval_loss": 3.420488119125366,
171
- "eval_runtime": 73.5384,
172
- "eval_samples_per_second": 6.038,
173
- "eval_steps_per_second": 0.095,
174
- "step": 126
175
  },
176
  {
177
- "epoch": 9.004807692307692,
178
- "grad_norm": 6.714512348175049,
179
- "learning_rate": 4.6925133689839576e-05,
180
- "loss": 3.3709,
181
  "step": 130
182
  },
183
  {
184
- "epoch": 9.016826923076923,
185
- "grad_norm": 18.599552154541016,
186
- "learning_rate": 4.625668449197861e-05,
187
- "loss": 3.0625,
188
  "step": 140
189
  },
190
  {
191
- "epoch": 9.016826923076923,
192
- "eval_accuracy": 0.26126126126126126,
193
- "eval_loss": 3.196960687637329,
194
- "eval_runtime": 74.7592,
195
- "eval_samples_per_second": 5.939,
196
- "eval_steps_per_second": 0.094,
197
- "step": 140
198
  },
199
  {
200
- "epoch": 10.01201923076923,
201
- "grad_norm": 7.006440162658691,
202
- "learning_rate": 4.558823529411765e-05,
203
- "loss": 2.8776,
204
  "step": 150
205
  },
206
  {
207
- "epoch": 10.016826923076923,
208
- "eval_accuracy": 0.30180180180180183,
209
- "eval_loss": 2.9350106716156006,
210
- "eval_runtime": 75.4948,
211
- "eval_samples_per_second": 5.881,
212
- "eval_steps_per_second": 0.093,
213
- "step": 154
214
- },
215
- {
216
- "epoch": 11.007211538461538,
217
- "grad_norm": 6.219061374664307,
218
- "learning_rate": 4.491978609625669e-05,
219
- "loss": 2.6375,
220
  "step": 160
221
  },
222
  {
223
- "epoch": 11.016826923076923,
224
- "eval_accuracy": 0.36036036036036034,
225
- "eval_loss": 2.762986660003662,
226
- "eval_runtime": 73.4162,
227
- "eval_samples_per_second": 6.048,
228
- "eval_steps_per_second": 0.095,
229
- "step": 168
230
  },
231
  {
232
- "epoch": 12.002403846153847,
233
- "grad_norm": 6.574148178100586,
234
- "learning_rate": 4.4251336898395724e-05,
235
- "loss": 2.4071,
236
  "step": 170
237
  },
238
  {
239
- "epoch": 12.014423076923077,
240
- "grad_norm": 8.191123008728027,
241
- "learning_rate": 4.358288770053476e-05,
242
- "loss": 2.2954,
243
- "step": 180
 
 
244
  },
245
  {
246
- "epoch": 12.016826923076923,
247
- "eval_accuracy": 0.42792792792792794,
248
- "eval_loss": 2.4990594387054443,
249
- "eval_runtime": 80.1492,
250
- "eval_samples_per_second": 5.54,
251
- "eval_steps_per_second": 0.087,
252
- "step": 182
253
  },
254
  {
255
- "epoch": 13.009615384615385,
256
- "grad_norm": 6.4414825439453125,
257
- "learning_rate": 4.29144385026738e-05,
258
- "loss": 2.1337,
259
  "step": 190
260
  },
261
  {
262
- "epoch": 13.016826923076923,
263
- "eval_accuracy": 0.43243243243243246,
264
- "eval_loss": 2.3827102184295654,
265
- "eval_runtime": 73.7035,
266
- "eval_samples_per_second": 6.024,
267
- "eval_steps_per_second": 0.095,
268
- "step": 196
269
  },
270
  {
271
- "epoch": 14.004807692307692,
272
- "grad_norm": 8.87990951538086,
273
- "learning_rate": 4.224598930481284e-05,
274
- "loss": 1.9319,
275
  "step": 200
276
  },
277
  {
278
- "epoch": 14.016826923076923,
279
- "grad_norm": 22.130002975463867,
280
- "learning_rate": 4.157754010695187e-05,
281
- "loss": 1.8195,
282
- "step": 210
 
 
283
  },
284
  {
285
- "epoch": 14.016826923076923,
286
- "eval_accuracy": 0.481981981981982,
287
- "eval_loss": 2.2311551570892334,
288
- "eval_runtime": 75.5729,
289
- "eval_samples_per_second": 5.875,
290
- "eval_steps_per_second": 0.093,
291
  "step": 210
292
  },
293
  {
294
- "epoch": 15.01201923076923,
295
- "grad_norm": 7.134392261505127,
296
- "learning_rate": 4.0909090909090915e-05,
297
- "loss": 1.6436,
298
  "step": 220
299
  },
300
  {
301
- "epoch": 15.016826923076923,
302
- "eval_accuracy": 0.44594594594594594,
303
- "eval_loss": 2.3628787994384766,
304
- "eval_runtime": 76.2152,
305
- "eval_samples_per_second": 5.826,
306
- "eval_steps_per_second": 0.092,
307
  "step": 224
308
  },
309
  {
310
- "epoch": 16.00721153846154,
311
- "grad_norm": 11.737651824951172,
312
- "learning_rate": 4.024064171122995e-05,
313
- "loss": 1.6289,
314
  "step": 230
315
  },
316
  {
317
- "epoch": 16.016826923076923,
318
- "eval_accuracy": 0.4752252252252252,
319
- "eval_loss": 2.151165723800659,
320
- "eval_runtime": 72.7861,
321
- "eval_samples_per_second": 6.1,
322
- "eval_steps_per_second": 0.096,
323
- "step": 238
324
  },
325
  {
326
- "epoch": 17.002403846153847,
327
- "grad_norm": 12.988323211669922,
328
- "learning_rate": 3.957219251336899e-05,
329
- "loss": 1.423,
 
 
330
  "step": 240
331
  },
332
  {
333
- "epoch": 17.014423076923077,
334
- "grad_norm": 5.394784450531006,
335
- "learning_rate": 3.8903743315508025e-05,
336
- "loss": 1.2957,
337
  "step": 250
338
  },
339
  {
340
- "epoch": 17.016826923076923,
341
- "eval_accuracy": 0.5022522522522522,
342
- "eval_loss": 2.0142312049865723,
343
- "eval_runtime": 74.6687,
344
- "eval_samples_per_second": 5.946,
345
- "eval_steps_per_second": 0.094,
346
- "step": 252
347
  },
348
  {
349
- "epoch": 18.009615384615383,
350
- "grad_norm": 6.641128063201904,
351
- "learning_rate": 3.8235294117647055e-05,
352
- "loss": 1.2761,
353
  "step": 260
354
  },
355
  {
356
- "epoch": 18.016826923076923,
357
- "eval_accuracy": 0.527027027027027,
358
- "eval_loss": 1.906096339225769,
359
- "eval_runtime": 75.2853,
360
- "eval_samples_per_second": 5.898,
361
- "eval_steps_per_second": 0.093,
362
- "step": 266
363
- },
364
- {
365
- "epoch": 19.004807692307693,
366
- "grad_norm": 7.859009742736816,
367
- "learning_rate": 3.75668449197861e-05,
368
- "loss": 1.217,
369
  "step": 270
370
  },
371
  {
372
- "epoch": 19.016826923076923,
373
- "grad_norm": 29.06666374206543,
374
- "learning_rate": 3.6898395721925136e-05,
375
- "loss": 1.1118,
376
- "step": 280
 
 
377
  },
378
  {
379
- "epoch": 19.016826923076923,
380
- "eval_accuracy": 0.5495495495495496,
381
- "eval_loss": 1.811281681060791,
382
- "eval_runtime": 81.5227,
383
- "eval_samples_per_second": 5.446,
384
- "eval_steps_per_second": 0.086,
385
  "step": 280
386
  },
387
  {
388
- "epoch": 20.01201923076923,
389
- "grad_norm": 8.949642181396484,
390
- "learning_rate": 3.622994652406417e-05,
391
- "loss": 0.9642,
392
- "step": 290
 
 
393
  },
394
  {
395
- "epoch": 20.016826923076923,
396
- "eval_accuracy": 0.6036036036036037,
397
- "eval_loss": 1.727989673614502,
398
- "eval_runtime": 72.1469,
399
- "eval_samples_per_second": 6.154,
400
- "eval_steps_per_second": 0.097,
401
- "step": 294
402
  },
403
  {
404
- "epoch": 21.00721153846154,
405
- "grad_norm": 6.73243522644043,
406
- "learning_rate": 3.556149732620321e-05,
407
- "loss": 0.894,
408
  "step": 300
409
  },
410
  {
411
- "epoch": 21.016826923076923,
412
- "eval_accuracy": 0.5382882882882883,
413
- "eval_loss": 1.8722944259643555,
414
- "eval_runtime": 76.1286,
415
- "eval_samples_per_second": 5.832,
416
- "eval_steps_per_second": 0.092,
417
- "step": 308
418
  },
419
  {
420
- "epoch": 22.002403846153847,
421
- "grad_norm": 8.856271743774414,
422
- "learning_rate": 3.489304812834225e-05,
423
- "loss": 0.8454,
424
  "step": 310
425
  },
426
  {
427
- "epoch": 22.014423076923077,
428
- "grad_norm": 6.268283843994141,
429
- "learning_rate": 3.4224598930481284e-05,
430
- "loss": 0.7974,
431
  "step": 320
432
  },
433
  {
434
- "epoch": 22.016826923076923,
435
- "eval_accuracy": 0.6058558558558559,
436
- "eval_loss": 1.658478021621704,
437
- "eval_runtime": 69.8498,
438
- "eval_samples_per_second": 6.356,
439
- "eval_steps_per_second": 0.1,
440
- "step": 322
441
  },
442
  {
443
- "epoch": 23.009615384615383,
444
- "grad_norm": 6.027498722076416,
445
- "learning_rate": 3.355614973262032e-05,
446
- "loss": 0.833,
447
  "step": 330
448
  },
449
  {
450
- "epoch": 23.016826923076923,
451
- "eval_accuracy": 0.6148648648648649,
452
- "eval_loss": 1.599608302116394,
453
- "eval_runtime": 76.7983,
454
- "eval_samples_per_second": 5.781,
455
- "eval_steps_per_second": 0.091,
456
  "step": 336
457
  },
458
  {
459
- "epoch": 24.004807692307693,
460
- "grad_norm": 5.669070243835449,
461
- "learning_rate": 3.288770053475936e-05,
462
- "loss": 0.6668,
463
  "step": 340
464
  },
465
  {
466
- "epoch": 24.016826923076923,
467
- "grad_norm": 12.669683456420898,
468
- "learning_rate": 3.22192513368984e-05,
469
- "loss": 0.6431,
470
  "step": 350
471
  },
472
  {
473
- "epoch": 24.016826923076923,
474
- "eval_accuracy": 0.6148648648648649,
475
- "eval_loss": 1.560927152633667,
476
- "eval_runtime": 69.5019,
477
- "eval_samples_per_second": 6.388,
478
- "eval_steps_per_second": 0.101,
479
- "step": 350
480
  },
481
  {
482
- "epoch": 25.01201923076923,
483
- "grad_norm": 5.4236626625061035,
484
- "learning_rate": 3.155080213903743e-05,
485
- "loss": 0.5873,
486
  "step": 360
487
  },
488
  {
489
- "epoch": 25.016826923076923,
490
- "eval_accuracy": 0.6171171171171171,
491
- "eval_loss": 1.6108297109603882,
492
- "eval_runtime": 73.8756,
493
- "eval_samples_per_second": 6.01,
494
- "eval_steps_per_second": 0.095,
495
- "step": 364
496
  },
497
  {
498
- "epoch": 26.00721153846154,
499
- "grad_norm": 5.605001449584961,
500
- "learning_rate": 3.0882352941176475e-05,
501
- "loss": 0.5554,
502
  "step": 370
503
  },
504
  {
505
- "epoch": 26.016826923076923,
506
- "eval_accuracy": 0.6531531531531531,
507
- "eval_loss": 1.4013562202453613,
508
- "eval_runtime": 79.7998,
509
- "eval_samples_per_second": 5.564,
510
- "eval_steps_per_second": 0.088,
511
- "step": 378
512
- },
513
- {
514
- "epoch": 27.002403846153847,
515
- "grad_norm": 6.0180277824401855,
516
- "learning_rate": 3.0213903743315508e-05,
517
- "loss": 0.5428,
518
  "step": 380
519
  },
520
  {
521
- "epoch": 27.014423076923077,
522
- "grad_norm": 7.198999404907227,
523
- "learning_rate": 2.954545454545455e-05,
524
- "loss": 0.4786,
525
- "step": 390
 
 
526
  },
527
  {
528
- "epoch": 27.016826923076923,
529
- "eval_accuracy": 0.6621621621621622,
530
- "eval_loss": 1.433493733406067,
531
- "eval_runtime": 71.2505,
532
- "eval_samples_per_second": 6.232,
533
- "eval_steps_per_second": 0.098,
534
- "step": 392
535
  },
536
  {
537
- "epoch": 28.009615384615383,
538
- "grad_norm": 4.281993389129639,
539
- "learning_rate": 2.8877005347593582e-05,
540
- "loss": 0.4252,
541
  "step": 400
542
  },
543
  {
544
- "epoch": 28.016826923076923,
545
- "eval_accuracy": 0.6509009009009009,
546
- "eval_loss": 1.444474220275879,
547
- "eval_runtime": 71.5031,
548
- "eval_samples_per_second": 6.21,
549
- "eval_steps_per_second": 0.098,
550
- "step": 406
551
  },
552
  {
553
- "epoch": 29.004807692307693,
554
- "grad_norm": 6.2630534172058105,
555
- "learning_rate": 2.8208556149732622e-05,
556
- "loss": 0.4527,
557
  "step": 410
558
  },
559
  {
560
- "epoch": 29.016826923076923,
561
- "grad_norm": 4.163498878479004,
562
- "learning_rate": 2.754010695187166e-05,
563
- "loss": 0.382,
564
- "step": 420
 
 
565
  },
566
  {
567
- "epoch": 29.016826923076923,
568
- "eval_accuracy": 0.6621621621621622,
569
- "eval_loss": 1.3914759159088135,
570
- "eval_runtime": 72.0609,
571
- "eval_samples_per_second": 6.161,
572
- "eval_steps_per_second": 0.097,
573
  "step": 420
574
  },
575
  {
576
- "epoch": 30.01201923076923,
577
- "grad_norm": 5.012676239013672,
578
- "learning_rate": 2.68716577540107e-05,
579
- "loss": 0.365,
580
  "step": 430
581
  },
582
  {
583
- "epoch": 30.016826923076923,
584
- "eval_accuracy": 0.6846846846846847,
585
- "eval_loss": 1.297812819480896,
586
- "eval_runtime": 73.674,
587
- "eval_samples_per_second": 6.027,
588
- "eval_steps_per_second": 0.095,
589
- "step": 434
590
  },
591
  {
592
- "epoch": 31.00721153846154,
593
- "grad_norm": 4.618253707885742,
594
- "learning_rate": 2.6203208556149733e-05,
595
- "loss": 0.319,
596
  "step": 440
597
  },
598
  {
599
- "epoch": 31.016826923076923,
600
- "eval_accuracy": 0.6824324324324325,
601
- "eval_loss": 1.3218427896499634,
602
- "eval_runtime": 74.4745,
603
- "eval_samples_per_second": 5.962,
604
- "eval_steps_per_second": 0.094,
605
  "step": 448
606
  },
607
  {
608
- "epoch": 32.00240384615385,
609
- "grad_norm": 6.823300361633301,
610
- "learning_rate": 2.5534759358288773e-05,
611
- "loss": 0.3829,
612
  "step": 450
613
  },
614
  {
615
- "epoch": 32.01442307692308,
616
- "grad_norm": 5.096348285675049,
617
- "learning_rate": 2.4866310160427807e-05,
618
- "loss": 0.3167,
619
  "step": 460
620
  },
621
  {
622
- "epoch": 32.01682692307692,
623
- "eval_accuracy": 0.6644144144144144,
624
- "eval_loss": 1.3495796918869019,
625
- "eval_runtime": 73.0271,
626
- "eval_samples_per_second": 6.08,
627
- "eval_steps_per_second": 0.096,
628
- "step": 462
629
  },
630
  {
631
- "epoch": 33.00961538461539,
632
- "grad_norm": 6.042089462280273,
633
- "learning_rate": 2.4197860962566847e-05,
634
- "loss": 0.2797,
635
  "step": 470
636
  },
637
  {
638
- "epoch": 33.01682692307692,
639
- "eval_accuracy": 0.6801801801801802,
640
- "eval_loss": 1.2806001901626587,
641
- "eval_runtime": 72.9489,
642
- "eval_samples_per_second": 6.086,
643
- "eval_steps_per_second": 0.096,
644
- "step": 476
645
  },
646
  {
647
- "epoch": 34.00480769230769,
648
- "grad_norm": 4.125849723815918,
649
- "learning_rate": 2.3529411764705884e-05,
650
- "loss": 0.281,
 
 
651
  "step": 480
652
  },
653
  {
654
- "epoch": 34.01682692307692,
655
- "grad_norm": 7.1154704093933105,
656
- "learning_rate": 2.286096256684492e-05,
657
- "loss": 0.2864,
658
  "step": 490
659
  },
660
  {
661
- "epoch": 34.01682692307692,
662
- "eval_accuracy": 0.7072072072072072,
663
- "eval_loss": 1.219117283821106,
664
- "eval_runtime": 74.6962,
665
- "eval_samples_per_second": 5.944,
666
- "eval_steps_per_second": 0.094,
667
- "step": 490
668
  },
669
  {
670
- "epoch": 35.01201923076923,
671
- "grad_norm": 6.536057949066162,
672
- "learning_rate": 2.2192513368983957e-05,
673
- "loss": 0.2927,
674
  "step": 500
675
  },
676
  {
677
- "epoch": 35.01682692307692,
678
- "eval_accuracy": 0.7207207207207207,
679
- "eval_loss": 1.2134795188903809,
680
- "eval_runtime": 72.1404,
681
- "eval_samples_per_second": 6.155,
682
- "eval_steps_per_second": 0.097,
683
- "step": 504
684
- },
685
- {
686
- "epoch": 36.00721153846154,
687
- "grad_norm": 4.703156471252441,
688
- "learning_rate": 2.1524064171122994e-05,
689
- "loss": 0.2698,
690
  "step": 510
691
  },
692
  {
693
- "epoch": 36.01682692307692,
694
- "eval_accuracy": 0.6914414414414415,
695
- "eval_loss": 1.250654697418213,
696
- "eval_runtime": 73.2059,
697
- "eval_samples_per_second": 6.065,
698
- "eval_steps_per_second": 0.096,
699
- "step": 518
700
  },
701
  {
702
- "epoch": 37.00240384615385,
703
- "grad_norm": 4.395312786102295,
704
- "learning_rate": 2.0855614973262035e-05,
705
- "loss": 0.256,
706
  "step": 520
707
  },
708
  {
709
- "epoch": 37.01442307692308,
710
- "grad_norm": 3.43831729888916,
711
- "learning_rate": 2.018716577540107e-05,
712
- "loss": 0.2333,
713
- "step": 530
 
 
714
  },
715
  {
716
- "epoch": 37.01682692307692,
717
- "eval_accuracy": 0.7094594594594594,
718
- "eval_loss": 1.2037817239761353,
719
- "eval_runtime": 74.757,
720
- "eval_samples_per_second": 5.939,
721
- "eval_steps_per_second": 0.094,
722
- "step": 532
723
  },
724
  {
725
- "epoch": 38.00961538461539,
726
- "grad_norm": 4.047479152679443,
727
- "learning_rate": 1.951871657754011e-05,
728
- "loss": 0.2366,
729
  "step": 540
730
  },
731
  {
732
- "epoch": 38.01682692307692,
733
- "eval_accuracy": 0.7207207207207207,
734
- "eval_loss": 1.1517395973205566,
735
- "eval_runtime": 73.764,
736
- "eval_samples_per_second": 6.019,
737
- "eval_steps_per_second": 0.095,
738
- "step": 546
739
  },
740
  {
741
- "epoch": 39.00480769230769,
742
- "grad_norm": 5.707859516143799,
743
- "learning_rate": 1.8850267379679145e-05,
744
- "loss": 0.1938,
745
  "step": 550
746
  },
747
  {
748
- "epoch": 39.01682692307692,
749
- "grad_norm": 3.7590107917785645,
750
- "learning_rate": 1.8181818181818182e-05,
751
- "loss": 0.1886,
752
  "step": 560
753
  },
754
  {
755
- "epoch": 39.01682692307692,
756
- "eval_accuracy": 0.7094594594594594,
757
- "eval_loss": 1.2073884010314941,
758
- "eval_runtime": 72.0607,
759
- "eval_samples_per_second": 6.161,
760
- "eval_steps_per_second": 0.097,
761
  "step": 560
762
  },
763
  {
764
- "epoch": 40.01201923076923,
765
- "grad_norm": 2.7063074111938477,
766
- "learning_rate": 1.7513368983957222e-05,
767
- "loss": 0.1804,
768
  "step": 570
769
  },
770
  {
771
- "epoch": 40.01682692307692,
772
- "eval_accuracy": 0.7027027027027027,
773
- "eval_loss": 1.1658011674880981,
774
- "eval_runtime": 77.6928,
775
- "eval_samples_per_second": 5.715,
776
- "eval_steps_per_second": 0.09,
777
- "step": 574
778
  },
779
  {
780
- "epoch": 41.00721153846154,
781
- "grad_norm": 6.454692363739014,
782
- "learning_rate": 1.684491978609626e-05,
783
- "loss": 0.1778,
784
  "step": 580
785
  },
786
  {
787
- "epoch": 41.01682692307692,
788
- "eval_accuracy": 0.6824324324324325,
789
- "eval_loss": 1.2350265979766846,
790
- "eval_runtime": 73.7916,
791
- "eval_samples_per_second": 6.017,
792
- "eval_steps_per_second": 0.095,
793
- "step": 588
794
  },
795
  {
796
- "epoch": 42.00240384615385,
797
- "grad_norm": 3.444183349609375,
798
- "learning_rate": 1.6176470588235296e-05,
799
- "loss": 0.2187,
800
- "step": 590
 
 
801
  },
802
  {
803
- "epoch": 42.01442307692308,
804
- "grad_norm": 3.9001522064208984,
805
- "learning_rate": 1.5508021390374333e-05,
806
- "loss": 0.1728,
807
  "step": 600
808
  },
809
  {
810
- "epoch": 42.01682692307692,
811
- "eval_accuracy": 0.7162162162162162,
812
- "eval_loss": 1.1637804508209229,
813
- "eval_runtime": 77.911,
814
- "eval_samples_per_second": 5.699,
815
- "eval_steps_per_second": 0.09,
816
- "step": 602
817
  },
818
  {
819
- "epoch": 43.00961538461539,
820
- "grad_norm": 7.290647029876709,
821
- "learning_rate": 1.4839572192513372e-05,
822
- "loss": 0.1998,
823
  "step": 610
824
  },
825
  {
826
- "epoch": 43.01682692307692,
827
- "eval_accuracy": 0.6959459459459459,
828
- "eval_loss": 1.2359126806259155,
829
- "eval_runtime": 72.9535,
830
- "eval_samples_per_second": 6.086,
831
- "eval_steps_per_second": 0.096,
832
- "step": 616
833
- },
834
- {
835
- "epoch": 44.00480769230769,
836
- "grad_norm": 5.636298656463623,
837
- "learning_rate": 1.4171122994652408e-05,
838
- "loss": 0.1639,
839
  "step": 620
840
  },
841
  {
842
- "epoch": 44.01682692307692,
843
- "grad_norm": 3.6947968006134033,
844
- "learning_rate": 1.3502673796791445e-05,
845
- "loss": 0.1727,
846
- "step": 630
 
 
847
  },
848
  {
849
- "epoch": 44.01682692307692,
850
- "eval_accuracy": 0.6936936936936937,
851
- "eval_loss": 1.232095718383789,
852
- "eval_runtime": 81.4582,
853
- "eval_samples_per_second": 5.451,
854
- "eval_steps_per_second": 0.086,
855
  "step": 630
856
  },
857
  {
858
- "epoch": 45.01201923076923,
859
- "grad_norm": 3.2019245624542236,
860
- "learning_rate": 1.2834224598930484e-05,
861
- "loss": 0.1564,
862
  "step": 640
863
  },
864
  {
865
- "epoch": 45.01682692307692,
866
- "eval_accuracy": 0.713963963963964,
867
- "eval_loss": 1.1604844331741333,
868
- "eval_runtime": 72.1149,
869
- "eval_samples_per_second": 6.157,
870
- "eval_steps_per_second": 0.097,
871
- "step": 644
872
  },
873
  {
874
- "epoch": 46.00721153846154,
875
- "grad_norm": 1.9841790199279785,
876
- "learning_rate": 1.2165775401069519e-05,
877
- "loss": 0.1888,
878
  "step": 650
879
  },
880
  {
881
- "epoch": 46.01682692307692,
882
- "eval_accuracy": 0.7094594594594594,
883
- "eval_loss": 1.1609092950820923,
884
- "eval_runtime": 71.8777,
885
- "eval_samples_per_second": 6.177,
886
- "eval_steps_per_second": 0.097,
887
- "step": 658
888
  },
889
  {
890
- "epoch": 47.00240384615385,
891
- "grad_norm": 7.085150241851807,
892
- "learning_rate": 1.1497326203208558e-05,
893
- "loss": 0.1618,
894
  "step": 660
895
  },
896
  {
897
- "epoch": 47.01442307692308,
898
- "grad_norm": 5.41819429397583,
899
- "learning_rate": 1.0828877005347594e-05,
900
- "loss": 0.1227,
901
  "step": 670
902
  },
903
  {
904
- "epoch": 47.01682692307692,
905
- "eval_accuracy": 0.7117117117117117,
906
- "eval_loss": 1.1588457822799683,
907
- "eval_runtime": 82.846,
908
- "eval_samples_per_second": 5.359,
909
- "eval_steps_per_second": 0.084,
910
  "step": 672
911
  },
912
  {
913
- "epoch": 48.00961538461539,
914
- "grad_norm": 2.3894031047821045,
915
- "learning_rate": 1.0160427807486631e-05,
916
- "loss": 0.134,
917
  "step": 680
918
  },
919
  {
920
- "epoch": 48.01682692307692,
921
- "eval_accuracy": 0.7072072072072072,
922
- "eval_loss": 1.1698147058486938,
923
- "eval_runtime": 81.0499,
924
- "eval_samples_per_second": 5.478,
925
- "eval_steps_per_second": 0.086,
926
- "step": 686
927
  },
928
  {
929
- "epoch": 49.00480769230769,
930
- "grad_norm": 4.01511287689209,
931
- "learning_rate": 9.49197860962567e-06,
932
- "loss": 0.1234,
933
  "step": 690
934
  },
935
  {
936
- "epoch": 49.01682692307692,
937
- "grad_norm": 7.249420642852783,
938
- "learning_rate": 8.823529411764707e-06,
939
- "loss": 0.1622,
940
  "step": 700
941
  },
942
  {
943
- "epoch": 49.01682692307692,
944
- "eval_accuracy": 0.6981981981981982,
945
- "eval_loss": 1.2014145851135254,
946
- "eval_runtime": 73.38,
947
- "eval_samples_per_second": 6.051,
948
- "eval_steps_per_second": 0.095,
949
- "step": 700
950
  },
951
  {
952
- "epoch": 50.01201923076923,
953
- "grad_norm": 4.699652194976807,
954
- "learning_rate": 8.155080213903744e-06,
955
- "loss": 0.1391,
956
  "step": 710
957
  },
958
  {
959
- "epoch": 50.01682692307692,
960
- "eval_accuracy": 0.7162162162162162,
961
- "eval_loss": 1.1005959510803223,
962
- "eval_runtime": 73.5034,
963
- "eval_samples_per_second": 6.041,
964
- "eval_steps_per_second": 0.095,
965
- "step": 714
966
- },
967
- {
968
- "epoch": 51.00721153846154,
969
- "grad_norm": 3.143523931503296,
970
- "learning_rate": 7.4866310160427806e-06,
971
- "loss": 0.1276,
972
  "step": 720
973
  },
974
  {
975
- "epoch": 51.01682692307692,
976
- "eval_accuracy": 0.6891891891891891,
977
- "eval_loss": 1.1485284566879272,
978
- "eval_runtime": 77.8096,
979
- "eval_samples_per_second": 5.706,
980
- "eval_steps_per_second": 0.09,
981
- "step": 728
982
  },
983
  {
984
- "epoch": 52.00240384615385,
985
- "grad_norm": 1.1826622486114502,
986
- "learning_rate": 6.818181818181818e-06,
987
- "loss": 0.0979,
988
  "step": 730
989
  },
990
  {
991
- "epoch": 52.01442307692308,
992
- "grad_norm": 3.218552589416504,
993
- "learning_rate": 6.149732620320856e-06,
994
- "loss": 0.1222,
995
- "step": 740
 
 
996
  },
997
  {
998
- "epoch": 52.01682692307692,
999
- "eval_accuracy": 0.713963963963964,
1000
- "eval_loss": 1.0801595449447632,
1001
- "eval_runtime": 73.9346,
1002
- "eval_samples_per_second": 6.005,
1003
- "eval_steps_per_second": 0.095,
1004
- "step": 742
1005
  },
1006
  {
1007
- "epoch": 53.00961538461539,
1008
- "grad_norm": 4.1740312576293945,
1009
- "learning_rate": 5.481283422459893e-06,
1010
- "loss": 0.1024,
1011
  "step": 750
1012
  },
1013
  {
1014
- "epoch": 53.01682692307692,
1015
- "eval_accuracy": 0.704954954954955,
1016
- "eval_loss": 1.1242848634719849,
1017
- "eval_runtime": 72.7853,
1018
- "eval_samples_per_second": 6.1,
1019
- "eval_steps_per_second": 0.096,
1020
- "step": 756
1021
  },
1022
  {
1023
- "epoch": 54.00480769230769,
1024
- "grad_norm": 3.4602904319763184,
1025
- "learning_rate": 4.812834224598931e-06,
1026
- "loss": 0.1014,
1027
  "step": 760
1028
  },
1029
  {
1030
- "epoch": 54.01682692307692,
1031
- "grad_norm": 24.67173957824707,
1032
- "learning_rate": 4.144385026737968e-06,
1033
- "loss": 0.1186,
1034
- "step": 770
 
 
1035
  },
1036
  {
1037
- "epoch": 54.01682692307692,
1038
- "eval_accuracy": 0.7297297297297297,
1039
- "eval_loss": 1.0909080505371094,
1040
- "eval_runtime": 74.1097,
1041
- "eval_samples_per_second": 5.991,
1042
- "eval_steps_per_second": 0.094,
1043
  "step": 770
1044
  },
1045
  {
1046
- "epoch": 55.01201923076923,
1047
- "grad_norm": 2.6615827083587646,
1048
- "learning_rate": 3.4759358288770056e-06,
1049
- "loss": 0.1121,
1050
  "step": 780
1051
  },
1052
  {
1053
- "epoch": 55.01682692307692,
1054
- "eval_accuracy": 0.7094594594594594,
1055
- "eval_loss": 1.1351189613342285,
1056
- "eval_runtime": 73.5115,
1057
- "eval_samples_per_second": 6.04,
1058
- "eval_steps_per_second": 0.095,
1059
  "step": 784
1060
  },
1061
  {
1062
- "epoch": 56.00721153846154,
1063
- "grad_norm": 3.412928581237793,
1064
- "learning_rate": 2.807486631016043e-06,
1065
- "loss": 0.1284,
1066
  "step": 790
1067
  },
1068
  {
1069
- "epoch": 56.01682692307692,
1070
- "eval_accuracy": 0.7274774774774775,
1071
- "eval_loss": 1.1095025539398193,
1072
- "eval_runtime": 73.055,
1073
- "eval_samples_per_second": 6.078,
1074
- "eval_steps_per_second": 0.096,
1075
- "step": 798
1076
  },
1077
  {
1078
- "epoch": 57.00240384615385,
1079
- "grad_norm": 4.214470386505127,
1080
- "learning_rate": 2.1390374331550802e-06,
1081
- "loss": 0.1418,
 
 
1082
  "step": 800
1083
  },
1084
  {
1085
- "epoch": 57.01442307692308,
1086
- "grad_norm": 2.0941081047058105,
1087
- "learning_rate": 1.4705882352941177e-06,
1088
- "loss": 0.0893,
1089
  "step": 810
1090
  },
1091
  {
1092
- "epoch": 57.01682692307692,
1093
- "eval_accuracy": 0.7319819819819819,
1094
- "eval_loss": 1.084416389465332,
1095
- "eval_runtime": 71.5145,
1096
- "eval_samples_per_second": 6.209,
1097
- "eval_steps_per_second": 0.098,
1098
- "step": 812
1099
  },
1100
  {
1101
- "epoch": 58.00961538461539,
1102
- "grad_norm": 4.0398101806640625,
1103
- "learning_rate": 8.021390374331552e-07,
1104
- "loss": 0.0878,
1105
  "step": 820
1106
  },
1107
  {
1108
- "epoch": 58.01682692307692,
1109
- "eval_accuracy": 0.7297297297297297,
1110
- "eval_loss": 1.0803806781768799,
1111
- "eval_runtime": 71.6842,
1112
- "eval_samples_per_second": 6.194,
1113
- "eval_steps_per_second": 0.098,
1114
- "step": 826
1115
- },
1116
- {
1117
- "epoch": 59.00480769230769,
1118
- "grad_norm": 1.0705435276031494,
1119
- "learning_rate": 1.3368983957219251e-07,
1120
- "loss": 0.0887,
1121
  "step": 830
1122
  },
1123
  {
1124
- "epoch": 59.00721153846154,
1125
- "eval_accuracy": 0.7297297297297297,
1126
- "eval_loss": 1.0809234380722046,
1127
- "eval_runtime": 72.7582,
1128
- "eval_samples_per_second": 6.102,
1129
- "eval_steps_per_second": 0.096,
1130
  "step": 832
1131
  },
1132
  {
1133
- "epoch": 59.00721153846154,
1134
- "step": 832,
1135
- "total_flos": 6.1975283578694664e+19,
1136
- "train_loss": 1.175738023289551,
1137
- "train_runtime": 12896.1846,
1138
- "train_samples_per_second": 4.129,
1139
- "train_steps_per_second": 0.065
1140
  },
1141
  {
1142
- "epoch": 59.00721153846154,
1143
- "eval_accuracy": 0.7319819819819819,
1144
- "eval_loss": 1.084416389465332,
1145
- "eval_runtime": 73.94,
1146
- "eval_samples_per_second": 6.005,
1147
- "eval_steps_per_second": 0.095,
1148
- "step": 832
1149
  },
1150
  {
1151
- "epoch": 59.00721153846154,
1152
- "eval_accuracy": 0.7319819819819819,
1153
- "eval_loss": 1.084416389465332,
1154
- "eval_runtime": 75.3126,
1155
- "eval_samples_per_second": 5.895,
1156
- "eval_steps_per_second": 0.093,
1157
- "step": 832
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1158
  }
1159
  ],
1160
  "logging_steps": 10,
1161
- "max_steps": 832,
1162
  "num_input_tokens_seen": 0,
1163
  "num_train_epochs": 9223372036854775807,
1164
  "save_steps": 500,
@@ -1174,7 +2477,7 @@
1174
  "attributes": {}
1175
  }
1176
  },
1177
- "total_flos": 6.1975283578694664e+19,
1178
  "train_batch_size": 64,
1179
  "trial_name": null,
1180
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5539772727272727,
3
+ "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-1600",
4
+ "epoch": 119.00833333333334,
5
  "eval_steps": 500,
6
+ "global_step": 1920,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.005208333333333333,
13
+ "grad_norm": 3.7305831909179688,
14
+ "learning_rate": 2.604166666666667e-06,
15
+ "loss": 4.2451,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.008333333333333333,
20
+ "eval_accuracy": 0.01065340909090909,
21
+ "eval_loss": 4.228378772735596,
22
+ "eval_runtime": 133.9626,
23
+ "eval_samples_per_second": 10.51,
24
+ "eval_steps_per_second": 0.164,
25
+ "step": 16
26
  },
27
  {
28
+ "epoch": 1.0020833333333334,
29
+ "grad_norm": 3.0969808101654053,
30
+ "learning_rate": 5.208333333333334e-06,
31
+ "loss": 4.231,
32
  "step": 20
33
  },
34
  {
35
+ "epoch": 1.0072916666666667,
36
+ "grad_norm": 4.606812000274658,
37
+ "learning_rate": 7.8125e-06,
38
+ "loss": 4.2251,
39
+ "step": 30
 
 
40
  },
41
  {
42
+ "epoch": 1.0083333333333333,
43
+ "eval_accuracy": 0.01065340909090909,
44
+ "eval_loss": 4.215222358703613,
45
+ "eval_runtime": 129.5768,
46
+ "eval_samples_per_second": 10.866,
47
+ "eval_steps_per_second": 0.17,
48
+ "step": 32
49
  },
50
  {
51
+ "epoch": 2.004166666666667,
52
+ "grad_norm": 3.574765682220459,
53
+ "learning_rate": 1.0416666666666668e-05,
54
+ "loss": 4.2276,
55
  "step": 40
56
  },
57
  {
58
+ "epoch": 2.0083333333333333,
59
+ "eval_accuracy": 0.012073863636363636,
60
+ "eval_loss": 4.209568500518799,
61
+ "eval_runtime": 130.2735,
62
+ "eval_samples_per_second": 10.808,
63
+ "eval_steps_per_second": 0.169,
64
+ "step": 48
65
  },
66
  {
67
+ "epoch": 3.0010416666666666,
68
+ "grad_norm": 4.630516052246094,
69
+ "learning_rate": 1.3020833333333334e-05,
70
+ "loss": 4.2126,
71
  "step": 50
72
  },
73
  {
74
+ "epoch": 3.00625,
75
+ "grad_norm": 3.0715420246124268,
76
+ "learning_rate": 1.5625e-05,
77
+ "loss": 4.2146,
 
 
 
 
 
 
 
 
 
78
  "step": 60
79
  },
80
  {
81
+ "epoch": 3.0083333333333333,
82
+ "eval_accuracy": 0.014914772727272728,
83
+ "eval_loss": 4.212440490722656,
84
+ "eval_runtime": 128.256,
85
+ "eval_samples_per_second": 10.978,
86
+ "eval_steps_per_second": 0.172,
87
+ "step": 64
88
  },
89
  {
90
+ "epoch": 4.003125,
91
+ "grad_norm": 2.934682846069336,
92
+ "learning_rate": 1.8229166666666668e-05,
93
+ "loss": 4.2235,
 
 
94
  "step": 70
95
  },
96
  {
97
+ "epoch": 4.008333333333334,
98
+ "grad_norm": 8.70511245727539,
99
+ "learning_rate": 2.0833333333333336e-05,
100
+ "loss": 4.2217,
101
  "step": 80
102
  },
103
  {
104
+ "epoch": 4.008333333333334,
105
+ "eval_accuracy": 0.01775568181818182,
106
+ "eval_loss": 4.2041802406311035,
107
+ "eval_runtime": 126.2921,
108
+ "eval_samples_per_second": 11.149,
109
+ "eval_steps_per_second": 0.174,
110
+ "step": 80
111
  },
112
  {
113
+ "epoch": 5.005208333333333,
114
+ "grad_norm": 2.5297322273254395,
115
+ "learning_rate": 2.34375e-05,
116
+ "loss": 4.2091,
117
  "step": 90
118
  },
119
  {
120
+ "epoch": 5.008333333333334,
121
+ "eval_accuracy": 0.0234375,
122
+ "eval_loss": 4.205115795135498,
123
+ "eval_runtime": 127.2102,
124
+ "eval_samples_per_second": 11.068,
125
+ "eval_steps_per_second": 0.173,
126
+ "step": 96
127
  },
128
  {
129
+ "epoch": 6.002083333333333,
130
+ "grad_norm": 2.4549720287323,
131
+ "learning_rate": 2.604166666666667e-05,
132
+ "loss": 4.2256,
133
  "step": 100
134
  },
135
  {
136
+ "epoch": 6.007291666666666,
137
+ "grad_norm": 2.3255746364593506,
138
+ "learning_rate": 2.8645833333333333e-05,
139
+ "loss": 4.2085,
140
  "step": 110
141
  },
142
  {
143
+ "epoch": 6.008333333333334,
144
+ "eval_accuracy": 0.018465909090909092,
145
+ "eval_loss": 4.193885803222656,
146
+ "eval_runtime": 130.9427,
147
+ "eval_samples_per_second": 10.753,
148
+ "eval_steps_per_second": 0.168,
149
  "step": 112
150
  },
151
  {
152
+ "epoch": 7.004166666666666,
153
+ "grad_norm": 2.676177978515625,
154
+ "learning_rate": 3.125e-05,
155
+ "loss": 4.2044,
156
  "step": 120
157
  },
158
  {
159
+ "epoch": 7.008333333333334,
160
+ "eval_accuracy": 0.0390625,
161
+ "eval_loss": 4.1791582107543945,
162
+ "eval_runtime": 125.2796,
163
+ "eval_samples_per_second": 11.239,
164
+ "eval_steps_per_second": 0.176,
165
+ "step": 128
166
  },
167
  {
168
+ "epoch": 8.001041666666667,
169
+ "grad_norm": 2.616762161254883,
170
+ "learning_rate": 3.385416666666667e-05,
171
+ "loss": 4.1911,
172
  "step": 130
173
  },
174
  {
175
+ "epoch": 8.00625,
176
+ "grad_norm": 2.983841896057129,
177
+ "learning_rate": 3.6458333333333336e-05,
178
+ "loss": 4.1624,
179
  "step": 140
180
  },
181
  {
182
+ "epoch": 8.008333333333333,
183
+ "eval_accuracy": 0.022017045454545456,
184
+ "eval_loss": 4.201480388641357,
185
+ "eval_runtime": 136.9987,
186
+ "eval_samples_per_second": 10.277,
187
+ "eval_steps_per_second": 0.161,
188
+ "step": 144
189
  },
190
  {
191
+ "epoch": 9.003125,
192
+ "grad_norm": 2.874126434326172,
193
+ "learning_rate": 3.90625e-05,
194
+ "loss": 4.1349,
195
  "step": 150
196
  },
197
  {
198
+ "epoch": 9.008333333333333,
199
+ "grad_norm": 10.345636367797852,
200
+ "learning_rate": 4.166666666666667e-05,
201
+ "loss": 4.1253,
 
 
 
 
 
 
 
 
 
202
  "step": 160
203
  },
204
  {
205
+ "epoch": 9.008333333333333,
206
+ "eval_accuracy": 0.029829545454545456,
207
+ "eval_loss": 4.1215105056762695,
208
+ "eval_runtime": 132.9763,
209
+ "eval_samples_per_second": 10.588,
210
+ "eval_steps_per_second": 0.165,
211
+ "step": 160
212
  },
213
  {
214
+ "epoch": 10.005208333333334,
215
+ "grad_norm": 3.5383424758911133,
216
+ "learning_rate": 4.4270833333333337e-05,
217
+ "loss": 4.0308,
218
  "step": 170
219
  },
220
  {
221
+ "epoch": 10.008333333333333,
222
+ "eval_accuracy": 0.07102272727272728,
223
+ "eval_loss": 4.002456188201904,
224
+ "eval_runtime": 127.9472,
225
+ "eval_samples_per_second": 11.005,
226
+ "eval_steps_per_second": 0.172,
227
+ "step": 176
228
  },
229
  {
230
+ "epoch": 11.002083333333333,
231
+ "grad_norm": 4.1187543869018555,
232
+ "learning_rate": 4.6875e-05,
233
+ "loss": 3.9438,
234
+ "step": 180
 
 
235
  },
236
  {
237
+ "epoch": 11.007291666666667,
238
+ "grad_norm": 4.370882034301758,
239
+ "learning_rate": 4.947916666666667e-05,
240
+ "loss": 3.8065,
241
  "step": 190
242
  },
243
  {
244
+ "epoch": 11.008333333333333,
245
+ "eval_accuracy": 0.08380681818181818,
246
+ "eval_loss": 3.872328758239746,
247
+ "eval_runtime": 142.2129,
248
+ "eval_samples_per_second": 9.901,
249
+ "eval_steps_per_second": 0.155,
250
+ "step": 192
251
  },
252
  {
253
+ "epoch": 12.004166666666666,
254
+ "grad_norm": 3.4101152420043945,
255
+ "learning_rate": 4.976851851851852e-05,
256
+ "loss": 3.7614,
257
  "step": 200
258
  },
259
  {
260
+ "epoch": 12.008333333333333,
261
+ "eval_accuracy": 0.09943181818181818,
262
+ "eval_loss": 3.7138783931732178,
263
+ "eval_runtime": 125.568,
264
+ "eval_samples_per_second": 11.213,
265
+ "eval_steps_per_second": 0.175,
266
+ "step": 208
267
  },
268
  {
269
+ "epoch": 13.001041666666667,
270
+ "grad_norm": 4.585361480712891,
271
+ "learning_rate": 4.947916666666667e-05,
272
+ "loss": 3.6302,
 
 
273
  "step": 210
274
  },
275
  {
276
+ "epoch": 13.00625,
277
+ "grad_norm": 4.209935665130615,
278
+ "learning_rate": 4.9189814814814815e-05,
279
+ "loss": 3.4761,
280
  "step": 220
281
  },
282
  {
283
+ "epoch": 13.008333333333333,
284
+ "eval_accuracy": 0.1434659090909091,
285
+ "eval_loss": 3.6160459518432617,
286
+ "eval_runtime": 133.4296,
287
+ "eval_samples_per_second": 10.552,
288
+ "eval_steps_per_second": 0.165,
289
  "step": 224
290
  },
291
  {
292
+ "epoch": 14.003125,
293
+ "grad_norm": 4.6234049797058105,
294
+ "learning_rate": 4.8900462962962965e-05,
295
+ "loss": 3.3764,
296
  "step": 230
297
  },
298
  {
299
+ "epoch": 14.008333333333333,
300
+ "grad_norm": 13.464088439941406,
301
+ "learning_rate": 4.8611111111111115e-05,
302
+ "loss": 3.278,
303
+ "step": 240
 
 
304
  },
305
  {
306
+ "epoch": 14.008333333333333,
307
+ "eval_accuracy": 0.1924715909090909,
308
+ "eval_loss": 3.3939576148986816,
309
+ "eval_runtime": 129.3255,
310
+ "eval_samples_per_second": 10.887,
311
+ "eval_steps_per_second": 0.17,
312
  "step": 240
313
  },
314
  {
315
+ "epoch": 15.005208333333334,
316
+ "grad_norm": 4.547211647033691,
317
+ "learning_rate": 4.8321759259259265e-05,
318
+ "loss": 3.0999,
319
  "step": 250
320
  },
321
  {
322
+ "epoch": 15.008333333333333,
323
+ "eval_accuracy": 0.2080965909090909,
324
+ "eval_loss": 3.3182637691497803,
325
+ "eval_runtime": 123.805,
326
+ "eval_samples_per_second": 11.373,
327
+ "eval_steps_per_second": 0.178,
328
+ "step": 256
329
  },
330
  {
331
+ "epoch": 16.002083333333335,
332
+ "grad_norm": 4.825346946716309,
333
+ "learning_rate": 4.803240740740741e-05,
334
+ "loss": 3.0213,
335
  "step": 260
336
  },
337
  {
338
+ "epoch": 16.007291666666667,
339
+ "grad_norm": 5.93734884262085,
340
+ "learning_rate": 4.774305555555556e-05,
341
+ "loss": 2.9721,
 
 
 
 
 
 
 
 
 
342
  "step": 270
343
  },
344
  {
345
+ "epoch": 16.008333333333333,
346
+ "eval_accuracy": 0.24857954545454544,
347
+ "eval_loss": 3.159555673599243,
348
+ "eval_runtime": 124.115,
349
+ "eval_samples_per_second": 11.344,
350
+ "eval_steps_per_second": 0.177,
351
+ "step": 272
352
  },
353
  {
354
+ "epoch": 17.004166666666666,
355
+ "grad_norm": 4.84201717376709,
356
+ "learning_rate": 4.745370370370371e-05,
357
+ "loss": 2.8064,
 
 
358
  "step": 280
359
  },
360
  {
361
+ "epoch": 17.008333333333333,
362
+ "eval_accuracy": 0.26704545454545453,
363
+ "eval_loss": 3.023179054260254,
364
+ "eval_runtime": 131.2545,
365
+ "eval_samples_per_second": 10.727,
366
+ "eval_steps_per_second": 0.168,
367
+ "step": 288
368
  },
369
  {
370
+ "epoch": 18.001041666666666,
371
+ "grad_norm": 5.273012161254883,
372
+ "learning_rate": 4.716435185185186e-05,
373
+ "loss": 2.813,
374
+ "step": 290
 
 
375
  },
376
  {
377
+ "epoch": 18.00625,
378
+ "grad_norm": 5.6201276779174805,
379
+ "learning_rate": 4.6875e-05,
380
+ "loss": 2.6554,
381
  "step": 300
382
  },
383
  {
384
+ "epoch": 18.008333333333333,
385
+ "eval_accuracy": 0.2911931818181818,
386
+ "eval_loss": 2.9448060989379883,
387
+ "eval_runtime": 123.4251,
388
+ "eval_samples_per_second": 11.408,
389
+ "eval_steps_per_second": 0.178,
390
+ "step": 304
391
  },
392
  {
393
+ "epoch": 19.003125,
394
+ "grad_norm": 7.461442470550537,
395
+ "learning_rate": 4.658564814814815e-05,
396
+ "loss": 2.6668,
397
  "step": 310
398
  },
399
  {
400
+ "epoch": 19.008333333333333,
401
+ "grad_norm": 14.555140495300293,
402
+ "learning_rate": 4.62962962962963e-05,
403
+ "loss": 2.5052,
404
  "step": 320
405
  },
406
  {
407
+ "epoch": 19.008333333333333,
408
+ "eval_accuracy": 0.3309659090909091,
409
+ "eval_loss": 2.828484535217285,
410
+ "eval_runtime": 127.0838,
411
+ "eval_samples_per_second": 11.079,
412
+ "eval_steps_per_second": 0.173,
413
+ "step": 320
414
  },
415
  {
416
+ "epoch": 20.005208333333332,
417
+ "grad_norm": 6.753479957580566,
418
+ "learning_rate": 4.6006944444444444e-05,
419
+ "loss": 2.4322,
420
  "step": 330
421
  },
422
  {
423
+ "epoch": 20.008333333333333,
424
+ "eval_accuracy": 0.34019886363636365,
425
+ "eval_loss": 2.7478535175323486,
426
+ "eval_runtime": 125.7771,
427
+ "eval_samples_per_second": 11.194,
428
+ "eval_steps_per_second": 0.175,
429
  "step": 336
430
  },
431
  {
432
+ "epoch": 21.002083333333335,
433
+ "grad_norm": 5.665611743927002,
434
+ "learning_rate": 4.5717592592592594e-05,
435
+ "loss": 2.4376,
436
  "step": 340
437
  },
438
  {
439
+ "epoch": 21.007291666666667,
440
+ "grad_norm": 5.6526570320129395,
441
+ "learning_rate": 4.5428240740740744e-05,
442
+ "loss": 2.3193,
443
  "step": 350
444
  },
445
  {
446
+ "epoch": 21.008333333333333,
447
+ "eval_accuracy": 0.3309659090909091,
448
+ "eval_loss": 2.794116735458374,
449
+ "eval_runtime": 126.5702,
450
+ "eval_samples_per_second": 11.124,
451
+ "eval_steps_per_second": 0.174,
452
+ "step": 352
453
  },
454
  {
455
+ "epoch": 22.004166666666666,
456
+ "grad_norm": 6.312890529632568,
457
+ "learning_rate": 4.5138888888888894e-05,
458
+ "loss": 2.2565,
459
  "step": 360
460
  },
461
  {
462
+ "epoch": 22.008333333333333,
463
+ "eval_accuracy": 0.3671875,
464
+ "eval_loss": 2.6383402347564697,
465
+ "eval_runtime": 123.0345,
466
+ "eval_samples_per_second": 11.444,
467
+ "eval_steps_per_second": 0.179,
468
+ "step": 368
469
  },
470
  {
471
+ "epoch": 23.001041666666666,
472
+ "grad_norm": 5.865190505981445,
473
+ "learning_rate": 4.484953703703704e-05,
474
+ "loss": 2.1257,
475
  "step": 370
476
  },
477
  {
478
+ "epoch": 23.00625,
479
+ "grad_norm": 5.813269138336182,
480
+ "learning_rate": 4.456018518518519e-05,
481
+ "loss": 2.1405,
 
 
 
 
 
 
 
 
 
482
  "step": 380
483
  },
484
  {
485
+ "epoch": 23.008333333333333,
486
+ "eval_accuracy": 0.36079545454545453,
487
+ "eval_loss": 2.5905861854553223,
488
+ "eval_runtime": 126.2698,
489
+ "eval_samples_per_second": 11.151,
490
+ "eval_steps_per_second": 0.174,
491
+ "step": 384
492
  },
493
  {
494
+ "epoch": 24.003125,
495
+ "grad_norm": 6.6013569831848145,
496
+ "learning_rate": 4.4270833333333337e-05,
497
+ "loss": 1.9715,
498
+ "step": 390
 
 
499
  },
500
  {
501
+ "epoch": 24.008333333333333,
502
+ "grad_norm": 14.786352157592773,
503
+ "learning_rate": 4.3981481481481486e-05,
504
+ "loss": 2.1049,
505
  "step": 400
506
  },
507
  {
508
+ "epoch": 24.008333333333333,
509
+ "eval_accuracy": 0.37855113636363635,
510
+ "eval_loss": 2.5515265464782715,
511
+ "eval_runtime": 134.242,
512
+ "eval_samples_per_second": 10.489,
513
+ "eval_steps_per_second": 0.164,
514
+ "step": 400
515
  },
516
  {
517
+ "epoch": 25.005208333333332,
518
+ "grad_norm": 6.399519920349121,
519
+ "learning_rate": 4.369212962962963e-05,
520
+ "loss": 1.8424,
521
  "step": 410
522
  },
523
  {
524
+ "epoch": 25.008333333333333,
525
+ "eval_accuracy": 0.39204545454545453,
526
+ "eval_loss": 2.4692277908325195,
527
+ "eval_runtime": 134.5245,
528
+ "eval_samples_per_second": 10.466,
529
+ "eval_steps_per_second": 0.164,
530
+ "step": 416
531
  },
532
  {
533
+ "epoch": 26.002083333333335,
534
+ "grad_norm": 5.429991245269775,
535
+ "learning_rate": 4.340277777777778e-05,
536
+ "loss": 2.0616,
 
 
537
  "step": 420
538
  },
539
  {
540
+ "epoch": 26.007291666666667,
541
+ "grad_norm": 6.501383304595947,
542
+ "learning_rate": 4.311342592592593e-05,
543
+ "loss": 1.8685,
544
  "step": 430
545
  },
546
  {
547
+ "epoch": 26.008333333333333,
548
+ "eval_accuracy": 0.4275568181818182,
549
+ "eval_loss": 2.4325406551361084,
550
+ "eval_runtime": 138.3084,
551
+ "eval_samples_per_second": 10.18,
552
+ "eval_steps_per_second": 0.159,
553
+ "step": 432
554
  },
555
  {
556
+ "epoch": 27.004166666666666,
557
+ "grad_norm": 6.909026622772217,
558
+ "learning_rate": 4.282407407407408e-05,
559
+ "loss": 1.7478,
560
  "step": 440
561
  },
562
  {
563
+ "epoch": 27.008333333333333,
564
+ "eval_accuracy": 0.4147727272727273,
565
+ "eval_loss": 2.416501522064209,
566
+ "eval_runtime": 128.9504,
567
+ "eval_samples_per_second": 10.919,
568
+ "eval_steps_per_second": 0.171,
569
  "step": 448
570
  },
571
  {
572
+ "epoch": 28.001041666666666,
573
+ "grad_norm": 6.359575271606445,
574
+ "learning_rate": 4.253472222222222e-05,
575
+ "loss": 2.0103,
576
  "step": 450
577
  },
578
  {
579
+ "epoch": 28.00625,
580
+ "grad_norm": 7.018951416015625,
581
+ "learning_rate": 4.224537037037037e-05,
582
+ "loss": 1.7072,
583
  "step": 460
584
  },
585
  {
586
+ "epoch": 28.008333333333333,
587
+ "eval_accuracy": 0.4268465909090909,
588
+ "eval_loss": 2.3617048263549805,
589
+ "eval_runtime": 126.6826,
590
+ "eval_samples_per_second": 11.114,
591
+ "eval_steps_per_second": 0.174,
592
+ "step": 464
593
  },
594
  {
595
+ "epoch": 29.003125,
596
+ "grad_norm": 5.399332523345947,
597
+ "learning_rate": 4.195601851851852e-05,
598
+ "loss": 1.8236,
599
  "step": 470
600
  },
601
  {
602
+ "epoch": 29.008333333333333,
603
+ "grad_norm": 14.481021881103516,
604
+ "learning_rate": 4.166666666666667e-05,
605
+ "loss": 1.7206,
606
+ "step": 480
 
 
607
  },
608
  {
609
+ "epoch": 29.008333333333333,
610
+ "eval_accuracy": 0.4303977272727273,
611
+ "eval_loss": 2.372326135635376,
612
+ "eval_runtime": 128.5457,
613
+ "eval_samples_per_second": 10.953,
614
+ "eval_steps_per_second": 0.171,
615
  "step": 480
616
  },
617
  {
618
+ "epoch": 30.005208333333332,
619
+ "grad_norm": 7.11575984954834,
620
+ "learning_rate": 4.1377314814814815e-05,
621
+ "loss": 1.693,
622
  "step": 490
623
  },
624
  {
625
+ "epoch": 30.008333333333333,
626
+ "eval_accuracy": 0.4424715909090909,
627
+ "eval_loss": 2.2890071868896484,
628
+ "eval_runtime": 128.9784,
629
+ "eval_samples_per_second": 10.917,
630
+ "eval_steps_per_second": 0.171,
631
+ "step": 496
632
  },
633
  {
634
+ "epoch": 31.002083333333335,
635
+ "grad_norm": 7.223865985870361,
636
+ "learning_rate": 4.1087962962962965e-05,
637
+ "loss": 1.6722,
638
  "step": 500
639
  },
640
  {
641
+ "epoch": 31.007291666666667,
642
+ "grad_norm": 7.608266353607178,
643
+ "learning_rate": 4.0798611111111115e-05,
644
+ "loss": 1.6347,
 
 
 
 
 
 
 
 
 
645
  "step": 510
646
  },
647
  {
648
+ "epoch": 31.008333333333333,
649
+ "eval_accuracy": 0.44105113636363635,
650
+ "eval_loss": 2.244246482849121,
651
+ "eval_runtime": 129.1752,
652
+ "eval_samples_per_second": 10.9,
653
+ "eval_steps_per_second": 0.17,
654
+ "step": 512
655
  },
656
  {
657
+ "epoch": 32.00416666666667,
658
+ "grad_norm": 7.774643898010254,
659
+ "learning_rate": 4.0509259259259265e-05,
660
+ "loss": 1.5276,
661
  "step": 520
662
  },
663
  {
664
+ "epoch": 32.00833333333333,
665
+ "eval_accuracy": 0.46732954545454547,
666
+ "eval_loss": 2.210369110107422,
667
+ "eval_runtime": 120.0766,
668
+ "eval_samples_per_second": 11.726,
669
+ "eval_steps_per_second": 0.183,
670
+ "step": 528
671
  },
672
  {
673
+ "epoch": 33.001041666666666,
674
+ "grad_norm": 6.766725063323975,
675
+ "learning_rate": 4.021990740740741e-05,
676
+ "loss": 1.5203,
677
+ "step": 530
 
 
678
  },
679
  {
680
+ "epoch": 33.00625,
681
+ "grad_norm": 5.995940208435059,
682
+ "learning_rate": 3.993055555555556e-05,
683
+ "loss": 1.4576,
684
  "step": 540
685
  },
686
  {
687
+ "epoch": 33.00833333333333,
688
+ "eval_accuracy": 0.45454545454545453,
689
+ "eval_loss": 2.227916717529297,
690
+ "eval_runtime": 124.0445,
691
+ "eval_samples_per_second": 11.351,
692
+ "eval_steps_per_second": 0.177,
693
+ "step": 544
694
  },
695
  {
696
+ "epoch": 34.003125,
697
+ "grad_norm": 6.610184669494629,
698
+ "learning_rate": 3.964120370370371e-05,
699
+ "loss": 1.5347,
700
  "step": 550
701
  },
702
  {
703
+ "epoch": 34.00833333333333,
704
+ "grad_norm": 24.694988250732422,
705
+ "learning_rate": 3.935185185185186e-05,
706
+ "loss": 1.5455,
707
  "step": 560
708
  },
709
  {
710
+ "epoch": 34.00833333333333,
711
+ "eval_accuracy": 0.4524147727272727,
712
+ "eval_loss": 2.204953908920288,
713
+ "eval_runtime": 131.2835,
714
+ "eval_samples_per_second": 10.725,
715
+ "eval_steps_per_second": 0.168,
716
  "step": 560
717
  },
718
  {
719
+ "epoch": 35.005208333333336,
720
+ "grad_norm": 6.918396472930908,
721
+ "learning_rate": 3.90625e-05,
722
+ "loss": 1.4485,
723
  "step": 570
724
  },
725
  {
726
+ "epoch": 35.00833333333333,
727
+ "eval_accuracy": 0.4737215909090909,
728
+ "eval_loss": 2.1584527492523193,
729
+ "eval_runtime": 134.6721,
730
+ "eval_samples_per_second": 10.455,
731
+ "eval_steps_per_second": 0.163,
732
+ "step": 576
733
  },
734
  {
735
+ "epoch": 36.00208333333333,
736
+ "grad_norm": 6.077505588531494,
737
+ "learning_rate": 3.877314814814815e-05,
738
+ "loss": 1.4391,
739
  "step": 580
740
  },
741
  {
742
+ "epoch": 36.00729166666667,
743
+ "grad_norm": 6.617316722869873,
744
+ "learning_rate": 3.84837962962963e-05,
745
+ "loss": 1.3896,
746
+ "step": 590
 
 
747
  },
748
  {
749
+ "epoch": 36.00833333333333,
750
+ "eval_accuracy": 0.4446022727272727,
751
+ "eval_loss": 2.1850500106811523,
752
+ "eval_runtime": 129.9914,
753
+ "eval_samples_per_second": 10.831,
754
+ "eval_steps_per_second": 0.169,
755
+ "step": 592
756
  },
757
  {
758
+ "epoch": 37.00416666666667,
759
+ "grad_norm": 7.900727272033691,
760
+ "learning_rate": 3.8194444444444444e-05,
761
+ "loss": 1.3766,
762
  "step": 600
763
  },
764
  {
765
+ "epoch": 37.00833333333333,
766
+ "eval_accuracy": 0.4872159090909091,
767
+ "eval_loss": 2.118501663208008,
768
+ "eval_runtime": 131.0307,
769
+ "eval_samples_per_second": 10.746,
770
+ "eval_steps_per_second": 0.168,
771
+ "step": 608
772
  },
773
  {
774
+ "epoch": 38.001041666666666,
775
+ "grad_norm": 6.658583641052246,
776
+ "learning_rate": 3.7905092592592594e-05,
777
+ "loss": 1.287,
778
  "step": 610
779
  },
780
  {
781
+ "epoch": 38.00625,
782
+ "grad_norm": 5.483121395111084,
783
+ "learning_rate": 3.7615740740740744e-05,
784
+ "loss": 1.4035,
 
 
 
 
 
 
 
 
 
785
  "step": 620
786
  },
787
  {
788
+ "epoch": 38.00833333333333,
789
+ "eval_accuracy": 0.4794034090909091,
790
+ "eval_loss": 2.116427183151245,
791
+ "eval_runtime": 133.3027,
792
+ "eval_samples_per_second": 10.562,
793
+ "eval_steps_per_second": 0.165,
794
+ "step": 624
795
  },
796
  {
797
+ "epoch": 39.003125,
798
+ "grad_norm": 6.027029991149902,
799
+ "learning_rate": 3.7326388888888893e-05,
800
+ "loss": 1.416,
 
 
801
  "step": 630
802
  },
803
  {
804
+ "epoch": 39.00833333333333,
805
+ "grad_norm": 17.550609588623047,
806
+ "learning_rate": 3.7037037037037037e-05,
807
+ "loss": 1.5892,
808
  "step": 640
809
  },
810
  {
811
+ "epoch": 39.00833333333333,
812
+ "eval_accuracy": 0.48011363636363635,
813
+ "eval_loss": 2.102943181991577,
814
+ "eval_runtime": 124.9558,
815
+ "eval_samples_per_second": 11.268,
816
+ "eval_steps_per_second": 0.176,
817
+ "step": 640
818
  },
819
  {
820
+ "epoch": 40.005208333333336,
821
+ "grad_norm": 8.286770820617676,
822
+ "learning_rate": 3.6747685185185186e-05,
823
+ "loss": 1.3647,
824
  "step": 650
825
  },
826
  {
827
+ "epoch": 40.00833333333333,
828
+ "eval_accuracy": 0.4928977272727273,
829
+ "eval_loss": 2.0912482738494873,
830
+ "eval_runtime": 125.7809,
831
+ "eval_samples_per_second": 11.194,
832
+ "eval_steps_per_second": 0.175,
833
+ "step": 656
834
  },
835
  {
836
+ "epoch": 41.00208333333333,
837
+ "grad_norm": 6.9772138595581055,
838
+ "learning_rate": 3.6458333333333336e-05,
839
+ "loss": 1.3769,
840
  "step": 660
841
  },
842
  {
843
+ "epoch": 41.00729166666667,
844
+ "grad_norm": 6.215446472167969,
845
+ "learning_rate": 3.6168981481481486e-05,
846
+ "loss": 1.388,
847
  "step": 670
848
  },
849
  {
850
+ "epoch": 41.00833333333333,
851
+ "eval_accuracy": 0.47301136363636365,
852
+ "eval_loss": 2.1330864429473877,
853
+ "eval_runtime": 134.2267,
854
+ "eval_samples_per_second": 10.49,
855
+ "eval_steps_per_second": 0.164,
856
  "step": 672
857
  },
858
  {
859
+ "epoch": 42.00416666666667,
860
+ "grad_norm": 6.233066082000732,
861
+ "learning_rate": 3.587962962962963e-05,
862
+ "loss": 1.3425,
863
  "step": 680
864
  },
865
  {
866
+ "epoch": 42.00833333333333,
867
+ "eval_accuracy": 0.4794034090909091,
868
+ "eval_loss": 2.143657922744751,
869
+ "eval_runtime": 126.4352,
870
+ "eval_samples_per_second": 11.136,
871
+ "eval_steps_per_second": 0.174,
872
+ "step": 688
873
  },
874
  {
875
+ "epoch": 43.001041666666666,
876
+ "grad_norm": 7.223750591278076,
877
+ "learning_rate": 3.559027777777778e-05,
878
+ "loss": 1.1958,
879
  "step": 690
880
  },
881
  {
882
+ "epoch": 43.00625,
883
+ "grad_norm": 6.594886779785156,
884
+ "learning_rate": 3.530092592592593e-05,
885
+ "loss": 1.2909,
886
  "step": 700
887
  },
888
  {
889
+ "epoch": 43.00833333333333,
890
+ "eval_accuracy": 0.4715909090909091,
891
+ "eval_loss": 2.109005928039551,
892
+ "eval_runtime": 129.7594,
893
+ "eval_samples_per_second": 10.851,
894
+ "eval_steps_per_second": 0.17,
895
+ "step": 704
896
  },
897
  {
898
+ "epoch": 44.003125,
899
+ "grad_norm": 5.431155681610107,
900
+ "learning_rate": 3.501157407407408e-05,
901
+ "loss": 1.2277,
902
  "step": 710
903
  },
904
  {
905
+ "epoch": 44.00833333333333,
906
+ "grad_norm": 23.15688133239746,
907
+ "learning_rate": 3.472222222222222e-05,
908
+ "loss": 1.2757,
 
 
 
 
 
 
 
 
 
909
  "step": 720
910
  },
911
  {
912
+ "epoch": 44.00833333333333,
913
+ "eval_accuracy": 0.4900568181818182,
914
+ "eval_loss": 2.0685956478118896,
915
+ "eval_runtime": 117.5767,
916
+ "eval_samples_per_second": 11.975,
917
+ "eval_steps_per_second": 0.187,
918
+ "step": 720
919
  },
920
  {
921
+ "epoch": 45.005208333333336,
922
+ "grad_norm": 6.826360702514648,
923
+ "learning_rate": 3.443287037037037e-05,
924
+ "loss": 1.181,
925
  "step": 730
926
  },
927
  {
928
+ "epoch": 45.00833333333333,
929
+ "eval_accuracy": 0.4893465909090909,
930
+ "eval_loss": 2.0484628677368164,
931
+ "eval_runtime": 122.183,
932
+ "eval_samples_per_second": 11.524,
933
+ "eval_steps_per_second": 0.18,
934
+ "step": 736
935
  },
936
  {
937
+ "epoch": 46.00208333333333,
938
+ "grad_norm": 6.446974277496338,
939
+ "learning_rate": 3.414351851851852e-05,
940
+ "loss": 1.2034,
941
+ "step": 740
 
 
942
  },
943
  {
944
+ "epoch": 46.00729166666667,
945
+ "grad_norm": 6.026095390319824,
946
+ "learning_rate": 3.385416666666667e-05,
947
+ "loss": 1.1825,
948
  "step": 750
949
  },
950
  {
951
+ "epoch": 46.00833333333333,
952
+ "eval_accuracy": 0.484375,
953
+ "eval_loss": 2.0560503005981445,
954
+ "eval_runtime": 129.8384,
955
+ "eval_samples_per_second": 10.844,
956
+ "eval_steps_per_second": 0.169,
957
+ "step": 752
958
  },
959
  {
960
+ "epoch": 47.00416666666667,
961
+ "grad_norm": 7.490734577178955,
962
+ "learning_rate": 3.3564814814814815e-05,
963
+ "loss": 1.1594,
964
  "step": 760
965
  },
966
  {
967
+ "epoch": 47.00833333333333,
968
+ "eval_accuracy": 0.49644886363636365,
969
+ "eval_loss": 2.0327041149139404,
970
+ "eval_runtime": 125.1147,
971
+ "eval_samples_per_second": 11.254,
972
+ "eval_steps_per_second": 0.176,
973
+ "step": 768
974
  },
975
  {
976
+ "epoch": 48.001041666666666,
977
+ "grad_norm": 6.074591636657715,
978
+ "learning_rate": 3.3275462962962965e-05,
979
+ "loss": 1.0989,
 
 
980
  "step": 770
981
  },
982
  {
983
+ "epoch": 48.00625,
984
+ "grad_norm": 7.924504280090332,
985
+ "learning_rate": 3.2986111111111115e-05,
986
+ "loss": 1.1699,
987
  "step": 780
988
  },
989
  {
990
+ "epoch": 48.00833333333333,
991
+ "eval_accuracy": 0.4765625,
992
+ "eval_loss": 2.095003366470337,
993
+ "eval_runtime": 131.8398,
994
+ "eval_samples_per_second": 10.68,
995
+ "eval_steps_per_second": 0.167,
996
  "step": 784
997
  },
998
  {
999
+ "epoch": 49.003125,
1000
+ "grad_norm": 7.409806728363037,
1001
+ "learning_rate": 3.2696759259259265e-05,
1002
+ "loss": 1.2027,
1003
  "step": 790
1004
  },
1005
  {
1006
+ "epoch": 49.00833333333333,
1007
+ "grad_norm": 23.586345672607422,
1008
+ "learning_rate": 3.240740740740741e-05,
1009
+ "loss": 1.1908,
1010
+ "step": 800
 
 
1011
  },
1012
  {
1013
+ "epoch": 49.00833333333333,
1014
+ "eval_accuracy": 0.4850852272727273,
1015
+ "eval_loss": 2.0465078353881836,
1016
+ "eval_runtime": 129.3358,
1017
+ "eval_samples_per_second": 10.886,
1018
+ "eval_steps_per_second": 0.17,
1019
  "step": 800
1020
  },
1021
  {
1022
+ "epoch": 50.005208333333336,
1023
+ "grad_norm": 7.079756736755371,
1024
+ "learning_rate": 3.211805555555556e-05,
1025
+ "loss": 1.1149,
1026
  "step": 810
1027
  },
1028
  {
1029
+ "epoch": 50.00833333333333,
1030
+ "eval_accuracy": 0.48792613636363635,
1031
+ "eval_loss": 2.0569851398468018,
1032
+ "eval_runtime": 123.867,
1033
+ "eval_samples_per_second": 11.367,
1034
+ "eval_steps_per_second": 0.178,
1035
+ "step": 816
1036
  },
1037
  {
1038
+ "epoch": 51.00208333333333,
1039
+ "grad_norm": 5.848146438598633,
1040
+ "learning_rate": 3.182870370370371e-05,
1041
+ "loss": 1.0907,
1042
  "step": 820
1043
  },
1044
  {
1045
+ "epoch": 51.00729166666667,
1046
+ "grad_norm": 7.725104808807373,
1047
+ "learning_rate": 3.153935185185186e-05,
1048
+ "loss": 1.1388,
 
 
 
 
 
 
 
 
 
1049
  "step": 830
1050
  },
1051
  {
1052
+ "epoch": 51.00833333333333,
1053
+ "eval_accuracy": 0.4978693181818182,
1054
+ "eval_loss": 2.0232162475585938,
1055
+ "eval_runtime": 124.9726,
1056
+ "eval_samples_per_second": 11.266,
1057
+ "eval_steps_per_second": 0.176,
1058
  "step": 832
1059
  },
1060
  {
1061
+ "epoch": 52.00416666666667,
1062
+ "grad_norm": 6.411397457122803,
1063
+ "learning_rate": 3.125e-05,
1064
+ "loss": 1.0421,
1065
+ "step": 840
 
 
1066
  },
1067
  {
1068
+ "epoch": 52.00833333333333,
1069
+ "eval_accuracy": 0.49857954545454547,
1070
+ "eval_loss": 2.0132648944854736,
1071
+ "eval_runtime": 130.0486,
1072
+ "eval_samples_per_second": 10.827,
1073
+ "eval_steps_per_second": 0.169,
1074
+ "step": 848
1075
  },
1076
  {
1077
+ "epoch": 53.001041666666666,
1078
+ "grad_norm": 6.417796611785889,
1079
+ "learning_rate": 3.0960648148148144e-05,
1080
+ "loss": 1.128,
1081
+ "step": 850
1082
+ },
1083
+ {
1084
+ "epoch": 53.00625,
1085
+ "grad_norm": 6.483155727386475,
1086
+ "learning_rate": 3.06712962962963e-05,
1087
+ "loss": 1.1243,
1088
+ "step": 860
1089
+ },
1090
+ {
1091
+ "epoch": 53.00833333333333,
1092
+ "eval_accuracy": 0.4900568181818182,
1093
+ "eval_loss": 2.0420944690704346,
1094
+ "eval_runtime": 131.0098,
1095
+ "eval_samples_per_second": 10.747,
1096
+ "eval_steps_per_second": 0.168,
1097
+ "step": 864
1098
+ },
1099
+ {
1100
+ "epoch": 54.003125,
1101
+ "grad_norm": 7.885180950164795,
1102
+ "learning_rate": 3.0381944444444444e-05,
1103
+ "loss": 1.0331,
1104
+ "step": 870
1105
+ },
1106
+ {
1107
+ "epoch": 54.00833333333333,
1108
+ "grad_norm": 28.23676300048828,
1109
+ "learning_rate": 3.0092592592592593e-05,
1110
+ "loss": 1.1064,
1111
+ "step": 880
1112
+ },
1113
+ {
1114
+ "epoch": 54.00833333333333,
1115
+ "eval_accuracy": 0.5042613636363636,
1116
+ "eval_loss": 1.9613640308380127,
1117
+ "eval_runtime": 129.6513,
1118
+ "eval_samples_per_second": 10.86,
1119
+ "eval_steps_per_second": 0.17,
1120
+ "step": 880
1121
+ },
1122
+ {
1123
+ "epoch": 55.005208333333336,
1124
+ "grad_norm": 7.0548319816589355,
1125
+ "learning_rate": 2.980324074074074e-05,
1126
+ "loss": 0.9778,
1127
+ "step": 890
1128
+ },
1129
+ {
1130
+ "epoch": 55.00833333333333,
1131
+ "eval_accuracy": 0.5071022727272727,
1132
+ "eval_loss": 1.9938956499099731,
1133
+ "eval_runtime": 125.6294,
1134
+ "eval_samples_per_second": 11.208,
1135
+ "eval_steps_per_second": 0.175,
1136
+ "step": 896
1137
+ },
1138
+ {
1139
+ "epoch": 56.00208333333333,
1140
+ "grad_norm": 7.993034362792969,
1141
+ "learning_rate": 2.951388888888889e-05,
1142
+ "loss": 1.0192,
1143
+ "step": 900
1144
+ },
1145
+ {
1146
+ "epoch": 56.00729166666667,
1147
+ "grad_norm": 8.833446502685547,
1148
+ "learning_rate": 2.9224537037037036e-05,
1149
+ "loss": 1.1417,
1150
+ "step": 910
1151
+ },
1152
+ {
1153
+ "epoch": 56.00833333333333,
1154
+ "eval_accuracy": 0.5106534090909091,
1155
+ "eval_loss": 1.977372646331787,
1156
+ "eval_runtime": 141.0715,
1157
+ "eval_samples_per_second": 9.981,
1158
+ "eval_steps_per_second": 0.156,
1159
+ "step": 912
1160
+ },
1161
+ {
1162
+ "epoch": 57.00416666666667,
1163
+ "grad_norm": 6.480681896209717,
1164
+ "learning_rate": 2.8935185185185186e-05,
1165
+ "loss": 1.0578,
1166
+ "step": 920
1167
+ },
1168
+ {
1169
+ "epoch": 57.00833333333333,
1170
+ "eval_accuracy": 0.5305397727272727,
1171
+ "eval_loss": 1.9625455141067505,
1172
+ "eval_runtime": 137.3354,
1173
+ "eval_samples_per_second": 10.252,
1174
+ "eval_steps_per_second": 0.16,
1175
+ "step": 928
1176
+ },
1177
+ {
1178
+ "epoch": 58.001041666666666,
1179
+ "grad_norm": 6.03174352645874,
1180
+ "learning_rate": 2.8645833333333333e-05,
1181
+ "loss": 0.9695,
1182
+ "step": 930
1183
+ },
1184
+ {
1185
+ "epoch": 58.00625,
1186
+ "grad_norm": 5.930421352386475,
1187
+ "learning_rate": 2.8356481481481483e-05,
1188
+ "loss": 1.0904,
1189
+ "step": 940
1190
+ },
1191
+ {
1192
+ "epoch": 58.00833333333333,
1193
+ "eval_accuracy": 0.5056818181818182,
1194
+ "eval_loss": 1.971279501914978,
1195
+ "eval_runtime": 124.5828,
1196
+ "eval_samples_per_second": 11.302,
1197
+ "eval_steps_per_second": 0.177,
1198
+ "step": 944
1199
+ },
1200
+ {
1201
+ "epoch": 59.003125,
1202
+ "grad_norm": 7.177443027496338,
1203
+ "learning_rate": 2.806712962962963e-05,
1204
+ "loss": 1.0493,
1205
+ "step": 950
1206
+ },
1207
+ {
1208
+ "epoch": 59.00833333333333,
1209
+ "grad_norm": 32.896873474121094,
1210
+ "learning_rate": 2.777777777777778e-05,
1211
+ "loss": 1.2569,
1212
+ "step": 960
1213
+ },
1214
+ {
1215
+ "epoch": 59.00833333333333,
1216
+ "eval_accuracy": 0.5255681818181818,
1217
+ "eval_loss": 1.9495558738708496,
1218
+ "eval_runtime": 125.0434,
1219
+ "eval_samples_per_second": 11.26,
1220
+ "eval_steps_per_second": 0.176,
1221
+ "step": 960
1222
+ },
1223
+ {
1224
+ "epoch": 60.005208333333336,
1225
+ "grad_norm": 6.914553642272949,
1226
+ "learning_rate": 2.7488425925925926e-05,
1227
+ "loss": 1.076,
1228
+ "step": 970
1229
+ },
1230
+ {
1231
+ "epoch": 60.00833333333333,
1232
+ "eval_accuracy": 0.5369318181818182,
1233
+ "eval_loss": 1.9237945079803467,
1234
+ "eval_runtime": 130.5511,
1235
+ "eval_samples_per_second": 10.785,
1236
+ "eval_steps_per_second": 0.169,
1237
+ "step": 976
1238
+ },
1239
+ {
1240
+ "epoch": 61.00208333333333,
1241
+ "grad_norm": 6.131436347961426,
1242
+ "learning_rate": 2.7199074074074076e-05,
1243
+ "loss": 0.9972,
1244
+ "step": 980
1245
+ },
1246
+ {
1247
+ "epoch": 61.00729166666667,
1248
+ "grad_norm": 8.375364303588867,
1249
+ "learning_rate": 2.6909722222222222e-05,
1250
+ "loss": 1.018,
1251
+ "step": 990
1252
+ },
1253
+ {
1254
+ "epoch": 61.00833333333333,
1255
+ "eval_accuracy": 0.515625,
1256
+ "eval_loss": 1.9578070640563965,
1257
+ "eval_runtime": 128.8964,
1258
+ "eval_samples_per_second": 10.924,
1259
+ "eval_steps_per_second": 0.171,
1260
+ "step": 992
1261
+ },
1262
+ {
1263
+ "epoch": 62.00416666666667,
1264
+ "grad_norm": 8.409364700317383,
1265
+ "learning_rate": 2.6620370370370372e-05,
1266
+ "loss": 0.8569,
1267
+ "step": 1000
1268
+ },
1269
+ {
1270
+ "epoch": 62.00833333333333,
1271
+ "eval_accuracy": 0.5184659090909091,
1272
+ "eval_loss": 1.9410430192947388,
1273
+ "eval_runtime": 126.0956,
1274
+ "eval_samples_per_second": 11.166,
1275
+ "eval_steps_per_second": 0.174,
1276
+ "step": 1008
1277
+ },
1278
+ {
1279
+ "epoch": 63.001041666666666,
1280
+ "grad_norm": 6.331228256225586,
1281
+ "learning_rate": 2.633101851851852e-05,
1282
+ "loss": 1.1689,
1283
+ "step": 1010
1284
+ },
1285
+ {
1286
+ "epoch": 63.00625,
1287
+ "grad_norm": 5.7633161544799805,
1288
+ "learning_rate": 2.604166666666667e-05,
1289
+ "loss": 0.9847,
1290
+ "step": 1020
1291
+ },
1292
+ {
1293
+ "epoch": 63.00833333333333,
1294
+ "eval_accuracy": 0.5134943181818182,
1295
+ "eval_loss": 1.965501308441162,
1296
+ "eval_runtime": 134.8984,
1297
+ "eval_samples_per_second": 10.437,
1298
+ "eval_steps_per_second": 0.163,
1299
+ "step": 1024
1300
+ },
1301
+ {
1302
+ "epoch": 64.003125,
1303
+ "grad_norm": 6.778160572052002,
1304
+ "learning_rate": 2.5752314814814815e-05,
1305
+ "loss": 1.0541,
1306
+ "step": 1030
1307
+ },
1308
+ {
1309
+ "epoch": 64.00833333333334,
1310
+ "grad_norm": 15.068023681640625,
1311
+ "learning_rate": 2.5462962962962965e-05,
1312
+ "loss": 0.8992,
1313
+ "step": 1040
1314
+ },
1315
+ {
1316
+ "epoch": 64.00833333333334,
1317
+ "eval_accuracy": 0.5184659090909091,
1318
+ "eval_loss": 1.97407865524292,
1319
+ "eval_runtime": 131.1607,
1320
+ "eval_samples_per_second": 10.735,
1321
+ "eval_steps_per_second": 0.168,
1322
+ "step": 1040
1323
+ },
1324
+ {
1325
+ "epoch": 65.00520833333333,
1326
+ "grad_norm": 6.6322126388549805,
1327
+ "learning_rate": 2.517361111111111e-05,
1328
+ "loss": 0.9781,
1329
+ "step": 1050
1330
+ },
1331
+ {
1332
+ "epoch": 65.00833333333334,
1333
+ "eval_accuracy": 0.5248579545454546,
1334
+ "eval_loss": 1.9591399431228638,
1335
+ "eval_runtime": 130.8261,
1336
+ "eval_samples_per_second": 10.762,
1337
+ "eval_steps_per_second": 0.168,
1338
+ "step": 1056
1339
+ },
1340
+ {
1341
+ "epoch": 66.00208333333333,
1342
+ "grad_norm": 6.270596504211426,
1343
+ "learning_rate": 2.488425925925926e-05,
1344
+ "loss": 1.1274,
1345
+ "step": 1060
1346
+ },
1347
+ {
1348
+ "epoch": 66.00729166666666,
1349
+ "grad_norm": 5.216790676116943,
1350
+ "learning_rate": 2.4594907407407408e-05,
1351
+ "loss": 0.9016,
1352
+ "step": 1070
1353
+ },
1354
+ {
1355
+ "epoch": 66.00833333333334,
1356
+ "eval_accuracy": 0.5134943181818182,
1357
+ "eval_loss": 1.9802237749099731,
1358
+ "eval_runtime": 138.409,
1359
+ "eval_samples_per_second": 10.173,
1360
+ "eval_steps_per_second": 0.159,
1361
+ "step": 1072
1362
+ },
1363
+ {
1364
+ "epoch": 67.00416666666666,
1365
+ "grad_norm": 4.889580249786377,
1366
+ "learning_rate": 2.4305555555555558e-05,
1367
+ "loss": 0.9443,
1368
+ "step": 1080
1369
+ },
1370
+ {
1371
+ "epoch": 67.00833333333334,
1372
+ "eval_accuracy": 0.5035511363636364,
1373
+ "eval_loss": 1.9881755113601685,
1374
+ "eval_runtime": 132.577,
1375
+ "eval_samples_per_second": 10.62,
1376
+ "eval_steps_per_second": 0.166,
1377
+ "step": 1088
1378
+ },
1379
+ {
1380
+ "epoch": 68.00104166666667,
1381
+ "grad_norm": 8.338197708129883,
1382
+ "learning_rate": 2.4016203703703704e-05,
1383
+ "loss": 0.9719,
1384
+ "step": 1090
1385
+ },
1386
+ {
1387
+ "epoch": 68.00625,
1388
+ "grad_norm": 7.402223110198975,
1389
+ "learning_rate": 2.3726851851851854e-05,
1390
+ "loss": 0.9359,
1391
+ "step": 1100
1392
+ },
1393
+ {
1394
+ "epoch": 68.00833333333334,
1395
+ "eval_accuracy": 0.5092329545454546,
1396
+ "eval_loss": 2.0045785903930664,
1397
+ "eval_runtime": 122.9735,
1398
+ "eval_samples_per_second": 11.45,
1399
+ "eval_steps_per_second": 0.179,
1400
+ "step": 1104
1401
+ },
1402
+ {
1403
+ "epoch": 69.003125,
1404
+ "grad_norm": 5.299436092376709,
1405
+ "learning_rate": 2.34375e-05,
1406
+ "loss": 0.9554,
1407
+ "step": 1110
1408
+ },
1409
+ {
1410
+ "epoch": 69.00833333333334,
1411
+ "grad_norm": 22.75337028503418,
1412
+ "learning_rate": 2.314814814814815e-05,
1413
+ "loss": 0.7735,
1414
+ "step": 1120
1415
+ },
1416
+ {
1417
+ "epoch": 69.00833333333334,
1418
+ "eval_accuracy": 0.5063920454545454,
1419
+ "eval_loss": 2.017183542251587,
1420
+ "eval_runtime": 132.6432,
1421
+ "eval_samples_per_second": 10.615,
1422
+ "eval_steps_per_second": 0.166,
1423
+ "step": 1120
1424
+ },
1425
+ {
1426
+ "epoch": 70.00520833333333,
1427
+ "grad_norm": 6.869340896606445,
1428
+ "learning_rate": 2.2858796296296297e-05,
1429
+ "loss": 0.9405,
1430
+ "step": 1130
1431
+ },
1432
+ {
1433
+ "epoch": 70.00833333333334,
1434
+ "eval_accuracy": 0.5269886363636364,
1435
+ "eval_loss": 1.955207109451294,
1436
+ "eval_runtime": 135.5316,
1437
+ "eval_samples_per_second": 10.389,
1438
+ "eval_steps_per_second": 0.162,
1439
+ "step": 1136
1440
+ },
1441
+ {
1442
+ "epoch": 71.00208333333333,
1443
+ "grad_norm": 7.737886428833008,
1444
+ "learning_rate": 2.2569444444444447e-05,
1445
+ "loss": 0.9419,
1446
+ "step": 1140
1447
+ },
1448
+ {
1449
+ "epoch": 71.00729166666666,
1450
+ "grad_norm": 6.463024139404297,
1451
+ "learning_rate": 2.2280092592592593e-05,
1452
+ "loss": 0.9709,
1453
+ "step": 1150
1454
+ },
1455
+ {
1456
+ "epoch": 71.00833333333334,
1457
+ "eval_accuracy": 0.5227272727272727,
1458
+ "eval_loss": 1.9572663307189941,
1459
+ "eval_runtime": 134.3176,
1460
+ "eval_samples_per_second": 10.483,
1461
+ "eval_steps_per_second": 0.164,
1462
+ "step": 1152
1463
+ },
1464
+ {
1465
+ "epoch": 72.00416666666666,
1466
+ "grad_norm": 6.5754923820495605,
1467
+ "learning_rate": 2.1990740740740743e-05,
1468
+ "loss": 0.9914,
1469
+ "step": 1160
1470
+ },
1471
+ {
1472
+ "epoch": 72.00833333333334,
1473
+ "eval_accuracy": 0.5248579545454546,
1474
+ "eval_loss": 1.9767735004425049,
1475
+ "eval_runtime": 125.4669,
1476
+ "eval_samples_per_second": 11.222,
1477
+ "eval_steps_per_second": 0.175,
1478
+ "step": 1168
1479
+ },
1480
+ {
1481
+ "epoch": 73.00104166666667,
1482
+ "grad_norm": 6.831247329711914,
1483
+ "learning_rate": 2.170138888888889e-05,
1484
+ "loss": 0.8804,
1485
+ "step": 1170
1486
+ },
1487
+ {
1488
+ "epoch": 73.00625,
1489
+ "grad_norm": 7.8077392578125,
1490
+ "learning_rate": 2.141203703703704e-05,
1491
+ "loss": 0.8487,
1492
+ "step": 1180
1493
+ },
1494
+ {
1495
+ "epoch": 73.00833333333334,
1496
+ "eval_accuracy": 0.5326704545454546,
1497
+ "eval_loss": 1.9569602012634277,
1498
+ "eval_runtime": 134.6123,
1499
+ "eval_samples_per_second": 10.46,
1500
+ "eval_steps_per_second": 0.163,
1501
+ "step": 1184
1502
+ },
1503
+ {
1504
+ "epoch": 74.003125,
1505
+ "grad_norm": 5.786107063293457,
1506
+ "learning_rate": 2.1122685185185186e-05,
1507
+ "loss": 0.9529,
1508
+ "step": 1190
1509
+ },
1510
+ {
1511
+ "epoch": 74.00833333333334,
1512
+ "grad_norm": 21.245386123657227,
1513
+ "learning_rate": 2.0833333333333336e-05,
1514
+ "loss": 0.835,
1515
+ "step": 1200
1516
+ },
1517
+ {
1518
+ "epoch": 74.00833333333334,
1519
+ "eval_accuracy": 0.5241477272727273,
1520
+ "eval_loss": 1.9758590459823608,
1521
+ "eval_runtime": 129.6528,
1522
+ "eval_samples_per_second": 10.86,
1523
+ "eval_steps_per_second": 0.17,
1524
+ "step": 1200
1525
+ },
1526
+ {
1527
+ "epoch": 75.00520833333333,
1528
+ "grad_norm": 5.697115421295166,
1529
+ "learning_rate": 2.0543981481481483e-05,
1530
+ "loss": 0.8914,
1531
+ "step": 1210
1532
+ },
1533
+ {
1534
+ "epoch": 75.00833333333334,
1535
+ "eval_accuracy": 0.5298295454545454,
1536
+ "eval_loss": 1.9309029579162598,
1537
+ "eval_runtime": 130.377,
1538
+ "eval_samples_per_second": 10.799,
1539
+ "eval_steps_per_second": 0.169,
1540
+ "step": 1216
1541
+ },
1542
+ {
1543
+ "epoch": 76.00208333333333,
1544
+ "grad_norm": 4.525904655456543,
1545
+ "learning_rate": 2.0254629629629632e-05,
1546
+ "loss": 0.9268,
1547
+ "step": 1220
1548
+ },
1549
+ {
1550
+ "epoch": 76.00729166666666,
1551
+ "grad_norm": 6.657559871673584,
1552
+ "learning_rate": 1.996527777777778e-05,
1553
+ "loss": 0.9242,
1554
+ "step": 1230
1555
+ },
1556
+ {
1557
+ "epoch": 76.00833333333334,
1558
+ "eval_accuracy": 0.5241477272727273,
1559
+ "eval_loss": 1.9594990015029907,
1560
+ "eval_runtime": 130.7531,
1561
+ "eval_samples_per_second": 10.768,
1562
+ "eval_steps_per_second": 0.168,
1563
+ "step": 1232
1564
+ },
1565
+ {
1566
+ "epoch": 77.00416666666666,
1567
+ "grad_norm": 6.962241172790527,
1568
+ "learning_rate": 1.967592592592593e-05,
1569
+ "loss": 0.8235,
1570
+ "step": 1240
1571
+ },
1572
+ {
1573
+ "epoch": 77.00833333333334,
1574
+ "eval_accuracy": 0.5276988636363636,
1575
+ "eval_loss": 1.9556376934051514,
1576
+ "eval_runtime": 124.6215,
1577
+ "eval_samples_per_second": 11.298,
1578
+ "eval_steps_per_second": 0.177,
1579
+ "step": 1248
1580
+ },
1581
+ {
1582
+ "epoch": 78.00104166666667,
1583
+ "grad_norm": 6.555630207061768,
1584
+ "learning_rate": 1.9386574074074075e-05,
1585
+ "loss": 1.1044,
1586
+ "step": 1250
1587
+ },
1588
+ {
1589
+ "epoch": 78.00625,
1590
+ "grad_norm": 7.0458455085754395,
1591
+ "learning_rate": 1.9097222222222222e-05,
1592
+ "loss": 0.8664,
1593
+ "step": 1260
1594
+ },
1595
+ {
1596
+ "epoch": 78.00833333333334,
1597
+ "eval_accuracy": 0.5134943181818182,
1598
+ "eval_loss": 1.978991985321045,
1599
+ "eval_runtime": 130.4341,
1600
+ "eval_samples_per_second": 10.795,
1601
+ "eval_steps_per_second": 0.169,
1602
+ "step": 1264
1603
+ },
1604
+ {
1605
+ "epoch": 79.003125,
1606
+ "grad_norm": 4.815821170806885,
1607
+ "learning_rate": 1.8807870370370372e-05,
1608
+ "loss": 0.7884,
1609
+ "step": 1270
1610
+ },
1611
+ {
1612
+ "epoch": 79.00833333333334,
1613
+ "grad_norm": 26.361722946166992,
1614
+ "learning_rate": 1.8518518518518518e-05,
1615
+ "loss": 0.8699,
1616
+ "step": 1280
1617
+ },
1618
+ {
1619
+ "epoch": 79.00833333333334,
1620
+ "eval_accuracy": 0.5227272727272727,
1621
+ "eval_loss": 1.9835097789764404,
1622
+ "eval_runtime": 119.0563,
1623
+ "eval_samples_per_second": 11.826,
1624
+ "eval_steps_per_second": 0.185,
1625
+ "step": 1280
1626
+ },
1627
+ {
1628
+ "epoch": 80.00520833333333,
1629
+ "grad_norm": 4.6080474853515625,
1630
+ "learning_rate": 1.8229166666666668e-05,
1631
+ "loss": 0.9112,
1632
+ "step": 1290
1633
+ },
1634
+ {
1635
+ "epoch": 80.00833333333334,
1636
+ "eval_accuracy": 0.5291193181818182,
1637
+ "eval_loss": 1.9426430463790894,
1638
+ "eval_runtime": 137.1174,
1639
+ "eval_samples_per_second": 10.269,
1640
+ "eval_steps_per_second": 0.16,
1641
+ "step": 1296
1642
+ },
1643
+ {
1644
+ "epoch": 81.00208333333333,
1645
+ "grad_norm": 5.045470237731934,
1646
+ "learning_rate": 1.7939814814814815e-05,
1647
+ "loss": 0.8392,
1648
+ "step": 1300
1649
+ },
1650
+ {
1651
+ "epoch": 81.00729166666666,
1652
+ "grad_norm": 6.203034400939941,
1653
+ "learning_rate": 1.7650462962962965e-05,
1654
+ "loss": 0.7901,
1655
+ "step": 1310
1656
+ },
1657
+ {
1658
+ "epoch": 81.00833333333334,
1659
+ "eval_accuracy": 0.5255681818181818,
1660
+ "eval_loss": 1.959786295890808,
1661
+ "eval_runtime": 120.2871,
1662
+ "eval_samples_per_second": 11.705,
1663
+ "eval_steps_per_second": 0.183,
1664
+ "step": 1312
1665
+ },
1666
+ {
1667
+ "epoch": 82.00416666666666,
1668
+ "grad_norm": 6.214341163635254,
1669
+ "learning_rate": 1.736111111111111e-05,
1670
+ "loss": 0.8186,
1671
+ "step": 1320
1672
+ },
1673
+ {
1674
+ "epoch": 82.00833333333334,
1675
+ "eval_accuracy": 0.5319602272727273,
1676
+ "eval_loss": 1.9397464990615845,
1677
+ "eval_runtime": 130.0941,
1678
+ "eval_samples_per_second": 10.823,
1679
+ "eval_steps_per_second": 0.169,
1680
+ "step": 1328
1681
+ },
1682
+ {
1683
+ "epoch": 83.00104166666667,
1684
+ "grad_norm": 5.813119411468506,
1685
+ "learning_rate": 1.707175925925926e-05,
1686
+ "loss": 0.7438,
1687
+ "step": 1330
1688
+ },
1689
+ {
1690
+ "epoch": 83.00625,
1691
+ "grad_norm": 5.390239715576172,
1692
+ "learning_rate": 1.6782407407407408e-05,
1693
+ "loss": 0.8229,
1694
+ "step": 1340
1695
+ },
1696
+ {
1697
+ "epoch": 83.00833333333334,
1698
+ "eval_accuracy": 0.5326704545454546,
1699
+ "eval_loss": 1.938385009765625,
1700
+ "eval_runtime": 124.8818,
1701
+ "eval_samples_per_second": 11.275,
1702
+ "eval_steps_per_second": 0.176,
1703
+ "step": 1344
1704
+ },
1705
+ {
1706
+ "epoch": 84.003125,
1707
+ "grad_norm": 5.821295738220215,
1708
+ "learning_rate": 1.6493055555555557e-05,
1709
+ "loss": 1.0251,
1710
+ "step": 1350
1711
+ },
1712
+ {
1713
+ "epoch": 84.00833333333334,
1714
+ "grad_norm": 23.789480209350586,
1715
+ "learning_rate": 1.6203703703703704e-05,
1716
+ "loss": 0.9063,
1717
+ "step": 1360
1718
+ },
1719
+ {
1720
+ "epoch": 84.00833333333334,
1721
+ "eval_accuracy": 0.5291193181818182,
1722
+ "eval_loss": 1.9323524236679077,
1723
+ "eval_runtime": 122.6737,
1724
+ "eval_samples_per_second": 11.478,
1725
+ "eval_steps_per_second": 0.179,
1726
+ "step": 1360
1727
+ },
1728
+ {
1729
+ "epoch": 85.00520833333333,
1730
+ "grad_norm": 7.094452381134033,
1731
+ "learning_rate": 1.5914351851851854e-05,
1732
+ "loss": 0.8843,
1733
+ "step": 1370
1734
+ },
1735
+ {
1736
+ "epoch": 85.00833333333334,
1737
+ "eval_accuracy": 0.5369318181818182,
1738
+ "eval_loss": 1.9315829277038574,
1739
+ "eval_runtime": 124.9222,
1740
+ "eval_samples_per_second": 11.271,
1741
+ "eval_steps_per_second": 0.176,
1742
+ "step": 1376
1743
+ },
1744
+ {
1745
+ "epoch": 86.00208333333333,
1746
+ "grad_norm": 5.776772975921631,
1747
+ "learning_rate": 1.5625e-05,
1748
+ "loss": 0.7684,
1749
+ "step": 1380
1750
+ },
1751
+ {
1752
+ "epoch": 86.00729166666666,
1753
+ "grad_norm": 8.940707206726074,
1754
+ "learning_rate": 1.533564814814815e-05,
1755
+ "loss": 0.7904,
1756
+ "step": 1390
1757
+ },
1758
+ {
1759
+ "epoch": 86.00833333333334,
1760
+ "eval_accuracy": 0.5376420454545454,
1761
+ "eval_loss": 1.926943302154541,
1762
+ "eval_runtime": 126.2416,
1763
+ "eval_samples_per_second": 11.153,
1764
+ "eval_steps_per_second": 0.174,
1765
+ "step": 1392
1766
+ },
1767
+ {
1768
+ "epoch": 87.00416666666666,
1769
+ "grad_norm": 6.900567054748535,
1770
+ "learning_rate": 1.5046296296296297e-05,
1771
+ "loss": 0.7942,
1772
+ "step": 1400
1773
+ },
1774
+ {
1775
+ "epoch": 87.00833333333334,
1776
+ "eval_accuracy": 0.5291193181818182,
1777
+ "eval_loss": 1.9505839347839355,
1778
+ "eval_runtime": 129.4495,
1779
+ "eval_samples_per_second": 10.877,
1780
+ "eval_steps_per_second": 0.17,
1781
+ "step": 1408
1782
+ },
1783
+ {
1784
+ "epoch": 88.00104166666667,
1785
+ "grad_norm": 6.561887741088867,
1786
+ "learning_rate": 1.4756944444444445e-05,
1787
+ "loss": 0.7708,
1788
+ "step": 1410
1789
+ },
1790
+ {
1791
+ "epoch": 88.00625,
1792
+ "grad_norm": 6.302940368652344,
1793
+ "learning_rate": 1.4467592592592593e-05,
1794
+ "loss": 0.8798,
1795
+ "step": 1420
1796
+ },
1797
+ {
1798
+ "epoch": 88.00833333333334,
1799
+ "eval_accuracy": 0.5404829545454546,
1800
+ "eval_loss": 1.9184972047805786,
1801
+ "eval_runtime": 136.5919,
1802
+ "eval_samples_per_second": 10.308,
1803
+ "eval_steps_per_second": 0.161,
1804
+ "step": 1424
1805
+ },
1806
+ {
1807
+ "epoch": 89.003125,
1808
+ "grad_norm": 6.015424728393555,
1809
+ "learning_rate": 1.4178240740740741e-05,
1810
+ "loss": 0.7788,
1811
+ "step": 1430
1812
+ },
1813
+ {
1814
+ "epoch": 89.00833333333334,
1815
+ "grad_norm": 9.586054801940918,
1816
+ "learning_rate": 1.388888888888889e-05,
1817
+ "loss": 0.7678,
1818
+ "step": 1440
1819
+ },
1820
+ {
1821
+ "epoch": 89.00833333333334,
1822
+ "eval_accuracy": 0.5326704545454546,
1823
+ "eval_loss": 1.9361698627471924,
1824
+ "eval_runtime": 128.5409,
1825
+ "eval_samples_per_second": 10.954,
1826
+ "eval_steps_per_second": 0.171,
1827
+ "step": 1440
1828
+ },
1829
+ {
1830
+ "epoch": 90.00520833333333,
1831
+ "grad_norm": 6.406522750854492,
1832
+ "learning_rate": 1.3599537037037038e-05,
1833
+ "loss": 0.7589,
1834
+ "step": 1450
1835
+ },
1836
+ {
1837
+ "epoch": 90.00833333333334,
1838
+ "eval_accuracy": 0.5276988636363636,
1839
+ "eval_loss": 1.9496175050735474,
1840
+ "eval_runtime": 128.3627,
1841
+ "eval_samples_per_second": 10.969,
1842
+ "eval_steps_per_second": 0.171,
1843
+ "step": 1456
1844
+ },
1845
+ {
1846
+ "epoch": 91.00208333333333,
1847
+ "grad_norm": 5.885864734649658,
1848
+ "learning_rate": 1.3310185185185186e-05,
1849
+ "loss": 0.7746,
1850
+ "step": 1460
1851
+ },
1852
+ {
1853
+ "epoch": 91.00729166666666,
1854
+ "grad_norm": 6.763460159301758,
1855
+ "learning_rate": 1.3020833333333334e-05,
1856
+ "loss": 0.6679,
1857
+ "step": 1470
1858
+ },
1859
+ {
1860
+ "epoch": 91.00833333333334,
1861
+ "eval_accuracy": 0.5298295454545454,
1862
+ "eval_loss": 1.9507235288619995,
1863
+ "eval_runtime": 133.1729,
1864
+ "eval_samples_per_second": 10.573,
1865
+ "eval_steps_per_second": 0.165,
1866
+ "step": 1472
1867
+ },
1868
+ {
1869
+ "epoch": 92.00416666666666,
1870
+ "grad_norm": 8.202223777770996,
1871
+ "learning_rate": 1.2731481481481482e-05,
1872
+ "loss": 0.8042,
1873
+ "step": 1480
1874
+ },
1875
+ {
1876
+ "epoch": 92.00833333333334,
1877
+ "eval_accuracy": 0.5369318181818182,
1878
+ "eval_loss": 1.950987696647644,
1879
+ "eval_runtime": 134.0073,
1880
+ "eval_samples_per_second": 10.507,
1881
+ "eval_steps_per_second": 0.164,
1882
+ "step": 1488
1883
+ },
1884
+ {
1885
+ "epoch": 93.00104166666667,
1886
+ "grad_norm": 6.163456439971924,
1887
+ "learning_rate": 1.244212962962963e-05,
1888
+ "loss": 0.8758,
1889
+ "step": 1490
1890
+ },
1891
+ {
1892
+ "epoch": 93.00625,
1893
+ "grad_norm": 8.015287399291992,
1894
+ "learning_rate": 1.2152777777777779e-05,
1895
+ "loss": 0.7722,
1896
+ "step": 1500
1897
+ },
1898
+ {
1899
+ "epoch": 93.00833333333334,
1900
+ "eval_accuracy": 0.5333806818181818,
1901
+ "eval_loss": 1.9502681493759155,
1902
+ "eval_runtime": 125.0273,
1903
+ "eval_samples_per_second": 11.262,
1904
+ "eval_steps_per_second": 0.176,
1905
+ "step": 1504
1906
+ },
1907
+ {
1908
+ "epoch": 94.003125,
1909
+ "grad_norm": 5.918884754180908,
1910
+ "learning_rate": 1.1863425925925927e-05,
1911
+ "loss": 0.7579,
1912
+ "step": 1510
1913
+ },
1914
+ {
1915
+ "epoch": 94.00833333333334,
1916
+ "grad_norm": 18.762502670288086,
1917
+ "learning_rate": 1.1574074074074075e-05,
1918
+ "loss": 0.6831,
1919
+ "step": 1520
1920
+ },
1921
+ {
1922
+ "epoch": 94.00833333333334,
1923
+ "eval_accuracy": 0.5348011363636364,
1924
+ "eval_loss": 1.9531010389328003,
1925
+ "eval_runtime": 120.9603,
1926
+ "eval_samples_per_second": 11.64,
1927
+ "eval_steps_per_second": 0.182,
1928
+ "step": 1520
1929
+ },
1930
+ {
1931
+ "epoch": 95.00520833333333,
1932
+ "grad_norm": 7.426961421966553,
1933
+ "learning_rate": 1.1284722222222223e-05,
1934
+ "loss": 0.766,
1935
+ "step": 1530
1936
+ },
1937
+ {
1938
+ "epoch": 95.00833333333334,
1939
+ "eval_accuracy": 0.5383522727272727,
1940
+ "eval_loss": 1.9345489740371704,
1941
+ "eval_runtime": 129.3794,
1942
+ "eval_samples_per_second": 10.883,
1943
+ "eval_steps_per_second": 0.17,
1944
+ "step": 1536
1945
+ },
1946
+ {
1947
+ "epoch": 96.00208333333333,
1948
+ "grad_norm": 5.175162315368652,
1949
+ "learning_rate": 1.0995370370370372e-05,
1950
+ "loss": 0.7168,
1951
+ "step": 1540
1952
+ },
1953
+ {
1954
+ "epoch": 96.00729166666666,
1955
+ "grad_norm": 7.063689708709717,
1956
+ "learning_rate": 1.070601851851852e-05,
1957
+ "loss": 0.8099,
1958
+ "step": 1550
1959
+ },
1960
+ {
1961
+ "epoch": 96.00833333333334,
1962
+ "eval_accuracy": 0.5376420454545454,
1963
+ "eval_loss": 1.9349414110183716,
1964
+ "eval_runtime": 123.5725,
1965
+ "eval_samples_per_second": 11.394,
1966
+ "eval_steps_per_second": 0.178,
1967
+ "step": 1552
1968
+ },
1969
+ {
1970
+ "epoch": 97.00416666666666,
1971
+ "grad_norm": 6.602786064147949,
1972
+ "learning_rate": 1.0416666666666668e-05,
1973
+ "loss": 0.7513,
1974
+ "step": 1560
1975
+ },
1976
+ {
1977
+ "epoch": 97.00833333333334,
1978
+ "eval_accuracy": 0.5461647727272727,
1979
+ "eval_loss": 1.9238044023513794,
1980
+ "eval_runtime": 139.032,
1981
+ "eval_samples_per_second": 10.127,
1982
+ "eval_steps_per_second": 0.158,
1983
+ "step": 1568
1984
+ },
1985
+ {
1986
+ "epoch": 98.00104166666667,
1987
+ "grad_norm": 5.149336338043213,
1988
+ "learning_rate": 1.0127314814814816e-05,
1989
+ "loss": 0.8236,
1990
+ "step": 1570
1991
+ },
1992
+ {
1993
+ "epoch": 98.00625,
1994
+ "grad_norm": 8.144251823425293,
1995
+ "learning_rate": 9.837962962962964e-06,
1996
+ "loss": 0.6561,
1997
+ "step": 1580
1998
+ },
1999
+ {
2000
+ "epoch": 98.00833333333334,
2001
+ "eval_accuracy": 0.5426136363636364,
2002
+ "eval_loss": 1.9337557554244995,
2003
+ "eval_runtime": 133.1537,
2004
+ "eval_samples_per_second": 10.574,
2005
+ "eval_steps_per_second": 0.165,
2006
+ "step": 1584
2007
+ },
2008
+ {
2009
+ "epoch": 99.003125,
2010
+ "grad_norm": 5.338318347930908,
2011
+ "learning_rate": 9.548611111111111e-06,
2012
+ "loss": 0.778,
2013
+ "step": 1590
2014
+ },
2015
+ {
2016
+ "epoch": 99.00833333333334,
2017
+ "grad_norm": 15.469060897827148,
2018
+ "learning_rate": 9.259259259259259e-06,
2019
+ "loss": 0.7423,
2020
+ "step": 1600
2021
+ },
2022
+ {
2023
+ "epoch": 99.00833333333334,
2024
+ "eval_accuracy": 0.5539772727272727,
2025
+ "eval_loss": 1.901895523071289,
2026
+ "eval_runtime": 131.3276,
2027
+ "eval_samples_per_second": 10.721,
2028
+ "eval_steps_per_second": 0.168,
2029
+ "step": 1600
2030
+ },
2031
+ {
2032
+ "epoch": 100.00520833333333,
2033
+ "grad_norm": 4.922194957733154,
2034
+ "learning_rate": 8.969907407407407e-06,
2035
+ "loss": 0.7739,
2036
+ "step": 1610
2037
+ },
2038
+ {
2039
+ "epoch": 100.00833333333334,
2040
+ "eval_accuracy": 0.5504261363636364,
2041
+ "eval_loss": 1.916474461555481,
2042
+ "eval_runtime": 124.6426,
2043
+ "eval_samples_per_second": 11.296,
2044
+ "eval_steps_per_second": 0.177,
2045
+ "step": 1616
2046
+ },
2047
+ {
2048
+ "epoch": 101.00208333333333,
2049
+ "grad_norm": 7.351630210876465,
2050
+ "learning_rate": 8.680555555555556e-06,
2051
+ "loss": 0.6743,
2052
+ "step": 1620
2053
+ },
2054
+ {
2055
+ "epoch": 101.00729166666666,
2056
+ "grad_norm": 5.356750965118408,
2057
+ "learning_rate": 8.391203703703704e-06,
2058
+ "loss": 0.6562,
2059
+ "step": 1630
2060
+ },
2061
+ {
2062
+ "epoch": 101.00833333333334,
2063
+ "eval_accuracy": 0.5433238636363636,
2064
+ "eval_loss": 1.9271347522735596,
2065
+ "eval_runtime": 121.1594,
2066
+ "eval_samples_per_second": 11.621,
2067
+ "eval_steps_per_second": 0.182,
2068
+ "step": 1632
2069
+ },
2070
+ {
2071
+ "epoch": 102.00416666666666,
2072
+ "grad_norm": 6.858729839324951,
2073
+ "learning_rate": 8.101851851851852e-06,
2074
+ "loss": 0.7182,
2075
+ "step": 1640
2076
+ },
2077
+ {
2078
+ "epoch": 102.00833333333334,
2079
+ "eval_accuracy": 0.5440340909090909,
2080
+ "eval_loss": 1.9096194505691528,
2081
+ "eval_runtime": 133.4895,
2082
+ "eval_samples_per_second": 10.548,
2083
+ "eval_steps_per_second": 0.165,
2084
+ "step": 1648
2085
+ },
2086
+ {
2087
+ "epoch": 103.00104166666667,
2088
+ "grad_norm": 6.165463924407959,
2089
+ "learning_rate": 7.8125e-06,
2090
+ "loss": 0.75,
2091
+ "step": 1650
2092
+ },
2093
+ {
2094
+ "epoch": 103.00625,
2095
+ "grad_norm": 7.791365623474121,
2096
+ "learning_rate": 7.523148148148148e-06,
2097
+ "loss": 0.6898,
2098
+ "step": 1660
2099
+ },
2100
+ {
2101
+ "epoch": 103.00833333333334,
2102
+ "eval_accuracy": 0.5482954545454546,
2103
+ "eval_loss": 1.9213480949401855,
2104
+ "eval_runtime": 121.4191,
2105
+ "eval_samples_per_second": 11.596,
2106
+ "eval_steps_per_second": 0.181,
2107
+ "step": 1664
2108
+ },
2109
+ {
2110
+ "epoch": 104.003125,
2111
+ "grad_norm": 7.106443405151367,
2112
+ "learning_rate": 7.2337962962962966e-06,
2113
+ "loss": 0.8279,
2114
+ "step": 1670
2115
+ },
2116
+ {
2117
+ "epoch": 104.00833333333334,
2118
+ "grad_norm": 11.49569320678711,
2119
+ "learning_rate": 6.944444444444445e-06,
2120
+ "loss": 0.6541,
2121
+ "step": 1680
2122
+ },
2123
+ {
2124
+ "epoch": 104.00833333333334,
2125
+ "eval_accuracy": 0.5433238636363636,
2126
+ "eval_loss": 1.926275372505188,
2127
+ "eval_runtime": 128.2374,
2128
+ "eval_samples_per_second": 10.98,
2129
+ "eval_steps_per_second": 0.172,
2130
+ "step": 1680
2131
+ },
2132
+ {
2133
+ "epoch": 105.00520833333333,
2134
+ "grad_norm": 5.9457316398620605,
2135
+ "learning_rate": 6.655092592592593e-06,
2136
+ "loss": 0.7131,
2137
+ "step": 1690
2138
+ },
2139
+ {
2140
+ "epoch": 105.00833333333334,
2141
+ "eval_accuracy": 0.546875,
2142
+ "eval_loss": 1.9148098230361938,
2143
+ "eval_runtime": 128.976,
2144
+ "eval_samples_per_second": 10.917,
2145
+ "eval_steps_per_second": 0.171,
2146
+ "step": 1696
2147
+ },
2148
+ {
2149
+ "epoch": 106.00208333333333,
2150
+ "grad_norm": 7.0640716552734375,
2151
+ "learning_rate": 6.365740740740741e-06,
2152
+ "loss": 0.7,
2153
+ "step": 1700
2154
+ },
2155
+ {
2156
+ "epoch": 106.00729166666666,
2157
+ "grad_norm": 5.8036699295043945,
2158
+ "learning_rate": 6.076388888888889e-06,
2159
+ "loss": 0.7076,
2160
+ "step": 1710
2161
+ },
2162
+ {
2163
+ "epoch": 106.00833333333334,
2164
+ "eval_accuracy": 0.5454545454545454,
2165
+ "eval_loss": 1.9192754030227661,
2166
+ "eval_runtime": 128.1384,
2167
+ "eval_samples_per_second": 10.988,
2168
+ "eval_steps_per_second": 0.172,
2169
+ "step": 1712
2170
+ },
2171
+ {
2172
+ "epoch": 107.00416666666666,
2173
+ "grad_norm": 6.425831317901611,
2174
+ "learning_rate": 5.787037037037038e-06,
2175
+ "loss": 0.7822,
2176
+ "step": 1720
2177
+ },
2178
+ {
2179
+ "epoch": 107.00833333333334,
2180
+ "eval_accuracy": 0.5440340909090909,
2181
+ "eval_loss": 1.916624903678894,
2182
+ "eval_runtime": 128.0714,
2183
+ "eval_samples_per_second": 10.994,
2184
+ "eval_steps_per_second": 0.172,
2185
+ "step": 1728
2186
+ },
2187
+ {
2188
+ "epoch": 108.00104166666667,
2189
+ "grad_norm": 5.592070579528809,
2190
+ "learning_rate": 5.497685185185186e-06,
2191
+ "loss": 0.7629,
2192
+ "step": 1730
2193
+ },
2194
+ {
2195
+ "epoch": 108.00625,
2196
+ "grad_norm": 6.38240909576416,
2197
+ "learning_rate": 5.208333333333334e-06,
2198
+ "loss": 0.6955,
2199
+ "step": 1740
2200
+ },
2201
+ {
2202
+ "epoch": 108.00833333333334,
2203
+ "eval_accuracy": 0.5490056818181818,
2204
+ "eval_loss": 1.9166675806045532,
2205
+ "eval_runtime": 133.0053,
2206
+ "eval_samples_per_second": 10.586,
2207
+ "eval_steps_per_second": 0.165,
2208
+ "step": 1744
2209
+ },
2210
+ {
2211
+ "epoch": 109.003125,
2212
+ "grad_norm": 4.68496036529541,
2213
+ "learning_rate": 4.918981481481482e-06,
2214
+ "loss": 0.8267,
2215
+ "step": 1750
2216
+ },
2217
+ {
2218
+ "epoch": 109.00833333333334,
2219
+ "grad_norm": 22.761470794677734,
2220
+ "learning_rate": 4.6296296296296296e-06,
2221
+ "loss": 0.6939,
2222
+ "step": 1760
2223
+ },
2224
+ {
2225
+ "epoch": 109.00833333333334,
2226
+ "eval_accuracy": 0.5426136363636364,
2227
+ "eval_loss": 1.9129440784454346,
2228
+ "eval_runtime": 123.3598,
2229
+ "eval_samples_per_second": 11.414,
2230
+ "eval_steps_per_second": 0.178,
2231
+ "step": 1760
2232
+ },
2233
+ {
2234
+ "epoch": 110.00520833333333,
2235
+ "grad_norm": 6.633167266845703,
2236
+ "learning_rate": 4.340277777777778e-06,
2237
+ "loss": 0.7149,
2238
+ "step": 1770
2239
+ },
2240
+ {
2241
+ "epoch": 110.00833333333334,
2242
+ "eval_accuracy": 0.5355113636363636,
2243
+ "eval_loss": 1.9237289428710938,
2244
+ "eval_runtime": 133.1436,
2245
+ "eval_samples_per_second": 10.575,
2246
+ "eval_steps_per_second": 0.165,
2247
+ "step": 1776
2248
+ },
2249
+ {
2250
+ "epoch": 111.00208333333333,
2251
+ "grad_norm": 6.027252674102783,
2252
+ "learning_rate": 4.050925925925926e-06,
2253
+ "loss": 0.7654,
2254
+ "step": 1780
2255
+ },
2256
+ {
2257
+ "epoch": 111.00729166666666,
2258
+ "grad_norm": 7.40338134765625,
2259
+ "learning_rate": 3.761574074074074e-06,
2260
+ "loss": 0.7341,
2261
+ "step": 1790
2262
+ },
2263
+ {
2264
+ "epoch": 111.00833333333334,
2265
+ "eval_accuracy": 0.5433238636363636,
2266
+ "eval_loss": 1.9047017097473145,
2267
+ "eval_runtime": 133.6961,
2268
+ "eval_samples_per_second": 10.531,
2269
+ "eval_steps_per_second": 0.165,
2270
+ "step": 1792
2271
+ },
2272
+ {
2273
+ "epoch": 112.00416666666666,
2274
+ "grad_norm": 6.874715328216553,
2275
+ "learning_rate": 3.4722222222222224e-06,
2276
+ "loss": 0.7101,
2277
+ "step": 1800
2278
+ },
2279
+ {
2280
+ "epoch": 112.00833333333334,
2281
+ "eval_accuracy": 0.5433238636363636,
2282
+ "eval_loss": 1.9010353088378906,
2283
+ "eval_runtime": 130.9035,
2284
+ "eval_samples_per_second": 10.756,
2285
+ "eval_steps_per_second": 0.168,
2286
+ "step": 1808
2287
+ },
2288
+ {
2289
+ "epoch": 113.00104166666667,
2290
+ "grad_norm": 6.984443664550781,
2291
+ "learning_rate": 3.1828703703703706e-06,
2292
+ "loss": 0.6031,
2293
+ "step": 1810
2294
+ },
2295
+ {
2296
+ "epoch": 113.00625,
2297
+ "grad_norm": 7.135616302490234,
2298
+ "learning_rate": 2.893518518518519e-06,
2299
+ "loss": 0.764,
2300
+ "step": 1820
2301
+ },
2302
+ {
2303
+ "epoch": 113.00833333333334,
2304
+ "eval_accuracy": 0.5454545454545454,
2305
+ "eval_loss": 1.9023563861846924,
2306
+ "eval_runtime": 128.9627,
2307
+ "eval_samples_per_second": 10.918,
2308
+ "eval_steps_per_second": 0.171,
2309
+ "step": 1824
2310
+ },
2311
+ {
2312
+ "epoch": 114.003125,
2313
+ "grad_norm": 5.495314121246338,
2314
+ "learning_rate": 2.604166666666667e-06,
2315
+ "loss": 0.7183,
2316
+ "step": 1830
2317
+ },
2318
+ {
2319
+ "epoch": 114.00833333333334,
2320
+ "grad_norm": 25.88104248046875,
2321
+ "learning_rate": 2.3148148148148148e-06,
2322
+ "loss": 0.667,
2323
+ "step": 1840
2324
+ },
2325
+ {
2326
+ "epoch": 114.00833333333334,
2327
+ "eval_accuracy": 0.5475852272727273,
2328
+ "eval_loss": 1.9040805101394653,
2329
+ "eval_runtime": 132.319,
2330
+ "eval_samples_per_second": 10.641,
2331
+ "eval_steps_per_second": 0.166,
2332
+ "step": 1840
2333
+ },
2334
+ {
2335
+ "epoch": 115.00520833333333,
2336
+ "grad_norm": 7.5589280128479,
2337
+ "learning_rate": 2.025462962962963e-06,
2338
+ "loss": 0.7465,
2339
+ "step": 1850
2340
+ },
2341
+ {
2342
+ "epoch": 115.00833333333334,
2343
+ "eval_accuracy": 0.5482954545454546,
2344
+ "eval_loss": 1.9005664587020874,
2345
+ "eval_runtime": 129.3051,
2346
+ "eval_samples_per_second": 10.889,
2347
+ "eval_steps_per_second": 0.17,
2348
+ "step": 1856
2349
+ },
2350
+ {
2351
+ "epoch": 116.00208333333333,
2352
+ "grad_norm": 8.857230186462402,
2353
+ "learning_rate": 1.7361111111111112e-06,
2354
+ "loss": 0.5929,
2355
+ "step": 1860
2356
+ },
2357
+ {
2358
+ "epoch": 116.00729166666666,
2359
+ "grad_norm": 5.749704837799072,
2360
+ "learning_rate": 1.4467592592592594e-06,
2361
+ "loss": 0.6935,
2362
+ "step": 1870
2363
+ },
2364
+ {
2365
+ "epoch": 116.00833333333334,
2366
+ "eval_accuracy": 0.5461647727272727,
2367
+ "eval_loss": 1.901587724685669,
2368
+ "eval_runtime": 135.6846,
2369
+ "eval_samples_per_second": 10.377,
2370
+ "eval_steps_per_second": 0.162,
2371
+ "step": 1872
2372
+ },
2373
+ {
2374
+ "epoch": 117.00416666666666,
2375
+ "grad_norm": 6.5324883460998535,
2376
+ "learning_rate": 1.1574074074074074e-06,
2377
+ "loss": 0.7306,
2378
+ "step": 1880
2379
+ },
2380
+ {
2381
+ "epoch": 117.00833333333334,
2382
+ "eval_accuracy": 0.5482954545454546,
2383
+ "eval_loss": 1.9008712768554688,
2384
+ "eval_runtime": 123.3184,
2385
+ "eval_samples_per_second": 11.418,
2386
+ "eval_steps_per_second": 0.178,
2387
+ "step": 1888
2388
+ },
2389
+ {
2390
+ "epoch": 118.00104166666667,
2391
+ "grad_norm": 6.570457458496094,
2392
+ "learning_rate": 8.680555555555556e-07,
2393
+ "loss": 0.6663,
2394
+ "step": 1890
2395
+ },
2396
+ {
2397
+ "epoch": 118.00625,
2398
+ "grad_norm": 4.956265449523926,
2399
+ "learning_rate": 5.787037037037037e-07,
2400
+ "loss": 0.6578,
2401
+ "step": 1900
2402
+ },
2403
+ {
2404
+ "epoch": 118.00833333333334,
2405
+ "eval_accuracy": 0.5482954545454546,
2406
+ "eval_loss": 1.9007748365402222,
2407
+ "eval_runtime": 136.6271,
2408
+ "eval_samples_per_second": 10.305,
2409
+ "eval_steps_per_second": 0.161,
2410
+ "step": 1904
2411
+ },
2412
+ {
2413
+ "epoch": 119.003125,
2414
+ "grad_norm": 7.002548694610596,
2415
+ "learning_rate": 2.8935185185185185e-07,
2416
+ "loss": 0.6235,
2417
+ "step": 1910
2418
+ },
2419
+ {
2420
+ "epoch": 119.00833333333334,
2421
+ "grad_norm": 2.7440524101257324,
2422
+ "learning_rate": 0.0,
2423
+ "loss": 0.6427,
2424
+ "step": 1920
2425
+ },
2426
+ {
2427
+ "epoch": 119.00833333333334,
2428
+ "eval_accuracy": 0.5504261363636364,
2429
+ "eval_loss": 1.9013855457305908,
2430
+ "eval_runtime": 128.5584,
2431
+ "eval_samples_per_second": 10.952,
2432
+ "eval_steps_per_second": 0.171,
2433
+ "step": 1920
2434
+ },
2435
+ {
2436
+ "epoch": 119.00833333333334,
2437
+ "step": 1920,
2438
+ "total_flos": 1.4452753827235627e+20,
2439
+ "train_loss": 1.5160956154266994,
2440
+ "train_runtime": 26874.3406,
2441
+ "train_samples_per_second": 4.572,
2442
+ "train_steps_per_second": 0.071
2443
+ },
2444
+ {
2445
+ "epoch": 119.00833333333334,
2446
+ "eval_accuracy": 0.5539772727272727,
2447
+ "eval_loss": 1.901895523071289,
2448
+ "eval_runtime": 137.4484,
2449
+ "eval_samples_per_second": 10.244,
2450
+ "eval_steps_per_second": 0.16,
2451
+ "step": 1920
2452
+ },
2453
+ {
2454
+ "epoch": 119.00833333333334,
2455
+ "eval_accuracy": 0.5539772727272727,
2456
+ "eval_loss": 1.9018956422805786,
2457
+ "eval_runtime": 133.7632,
2458
+ "eval_samples_per_second": 10.526,
2459
+ "eval_steps_per_second": 0.164,
2460
+ "step": 1920
2461
  }
2462
  ],
2463
  "logging_steps": 10,
2464
+ "max_steps": 1920,
2465
  "num_input_tokens_seen": 0,
2466
  "num_train_epochs": 9223372036854775807,
2467
  "save_steps": 500,
 
2477
  "attributes": {}
2478
  }
2479
  },
2480
+ "total_flos": 1.4452753827235627e+20,
2481
  "train_batch_size": 64,
2482
  "trial_name": null,
2483
  "trial_params": null