bryanzhou008 commited on
Commit
d740899
1 Parent(s): 8d525cd

End of training

Browse files
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.8638888888888889
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +33,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 1.1319
37
- - Accuracy: 0.8639
38
 
39
  ## Model description
40
 
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.9483333333333334
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 1.0056
37
+ - Accuracy: 0.9483
38
 
39
  ## Model description
40
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 40.0,
3
- "eval_accuracy": 0.9066666666666666,
4
- "eval_loss": 1.6493659019470215,
5
- "eval_runtime": 4.846,
6
- "eval_samples_per_second": 247.626,
7
- "eval_steps_per_second": 2.064,
8
- "total_flos": 3.720348931719168e+18,
9
- "train_loss": 2.217303295135498,
10
- "train_runtime": 465.0811,
11
- "train_samples_per_second": 129.01,
12
- "train_steps_per_second": 0.215
13
  }
 
1
  {
2
+ "epoch": 80.0,
3
+ "eval_accuracy": 0.9483333333333334,
4
+ "eval_loss": 1.0056288242340088,
5
+ "eval_runtime": 4.803,
6
+ "eval_samples_per_second": 249.845,
7
+ "eval_steps_per_second": 2.082,
8
+ "total_flos": 7.440697863438336e+18,
9
+ "train_loss": 1.5909362745285034,
10
+ "train_runtime": 1036.0381,
11
+ "train_samples_per_second": 115.826,
12
+ "train_steps_per_second": 0.193
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 40.0,
3
- "eval_accuracy": 0.8361111111111111,
4
- "eval_loss": 1.7763370275497437,
5
- "eval_runtime": 3.2333,
6
- "eval_samples_per_second": 222.684,
7
- "eval_steps_per_second": 1.856
8
  }
 
1
  {
2
+ "epoch": 80.0,
3
+ "eval_accuracy": 0.8638888888888889,
4
+ "eval_loss": 1.2210274934768677,
5
+ "eval_runtime": 3.2287,
6
+ "eval_samples_per_second": 223.003,
7
+ "eval_steps_per_second": 1.858
8
  }
train_eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 40.0,
3
- "eval_accuracy": 0.9066666666666666,
4
- "eval_loss": 1.6493659019470215,
5
- "eval_runtime": 4.846,
6
- "eval_samples_per_second": 247.626,
7
- "eval_steps_per_second": 2.064
8
  }
 
1
  {
2
+ "epoch": 80.0,
3
+ "eval_accuracy": 0.9483333333333334,
4
+ "eval_loss": 1.0056288242340088,
5
+ "eval_runtime": 4.803,
6
+ "eval_samples_per_second": 249.845,
7
+ "eval_steps_per_second": 2.082
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 40.0,
3
- "total_flos": 3.720348931719168e+18,
4
- "train_loss": 2.217303295135498,
5
- "train_runtime": 465.0811,
6
- "train_samples_per_second": 129.01,
7
- "train_steps_per_second": 0.215
8
  }
 
1
  {
2
+ "epoch": 80.0,
3
+ "total_flos": 7.440697863438336e+18,
4
+ "train_loss": 1.5909362745285034,
5
+ "train_runtime": 1036.0381,
6
+ "train_samples_per_second": 115.826,
7
+ "train_steps_per_second": 0.193
8
  }
trainer_state.json CHANGED
@@ -1,457 +1,887 @@
1
  {
2
- "best_metric": 0.8361111111111111,
3
- "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-inaturalist/checkpoint-100",
4
- "epoch": 40.0,
5
  "eval_steps": 500,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.8,
13
- "eval_accuracy": 0.05694444444444444,
14
- "eval_loss": 3.172593116760254,
15
- "eval_runtime": 3.5125,
16
- "eval_samples_per_second": 204.98,
17
- "eval_steps_per_second": 1.708,
18
  "step": 2
19
  },
20
  {
21
  "epoch": 2.0,
22
- "eval_accuracy": 0.11666666666666667,
23
- "eval_loss": 3.1497302055358887,
24
- "eval_runtime": 3.1347,
25
- "eval_samples_per_second": 229.687,
26
- "eval_steps_per_second": 1.914,
27
  "step": 5
28
  },
29
  {
30
  "epoch": 2.8,
31
- "eval_accuracy": 0.18333333333333332,
32
- "eval_loss": 3.121361017227173,
33
- "eval_runtime": 3.1053,
34
- "eval_samples_per_second": 231.865,
35
- "eval_steps_per_second": 1.932,
36
  "step": 7
37
  },
38
  {
39
  "epoch": 4.0,
40
- "grad_norm": 0.7334493398666382,
41
- "learning_rate": 5e-05,
42
- "loss": 3.1408,
43
  "step": 10
44
  },
45
  {
46
  "epoch": 4.0,
47
- "eval_accuracy": 0.31527777777777777,
48
- "eval_loss": 3.0588860511779785,
49
- "eval_runtime": 3.1331,
50
- "eval_samples_per_second": 229.807,
51
- "eval_steps_per_second": 1.915,
52
  "step": 10
53
  },
54
  {
55
  "epoch": 4.8,
56
- "eval_accuracy": 0.4236111111111111,
57
- "eval_loss": 3.0067594051361084,
58
- "eval_runtime": 3.0962,
59
- "eval_samples_per_second": 232.544,
60
- "eval_steps_per_second": 1.938,
61
  "step": 12
62
  },
63
  {
64
  "epoch": 6.0,
65
- "eval_accuracy": 0.5347222222222222,
66
- "eval_loss": 2.9316446781158447,
67
- "eval_runtime": 3.0379,
68
- "eval_samples_per_second": 237.005,
69
- "eval_steps_per_second": 1.975,
70
  "step": 15
71
  },
72
  {
73
  "epoch": 6.8,
74
- "eval_accuracy": 0.5763888888888888,
75
- "eval_loss": 2.8821420669555664,
76
- "eval_runtime": 3.238,
77
- "eval_samples_per_second": 222.356,
78
- "eval_steps_per_second": 1.853,
79
  "step": 17
80
  },
81
  {
82
  "epoch": 8.0,
83
- "grad_norm": 0.798339307308197,
84
- "learning_rate": 4.4444444444444447e-05,
85
- "loss": 2.921,
86
  "step": 20
87
  },
88
  {
89
  "epoch": 8.0,
90
- "eval_accuracy": 0.6180555555555556,
91
- "eval_loss": 2.8060097694396973,
92
- "eval_runtime": 3.1439,
93
- "eval_samples_per_second": 229.013,
94
- "eval_steps_per_second": 1.908,
95
  "step": 20
96
  },
97
  {
98
  "epoch": 8.8,
99
- "eval_accuracy": 0.6458333333333334,
100
- "eval_loss": 2.7532873153686523,
101
- "eval_runtime": 3.0396,
102
- "eval_samples_per_second": 236.87,
103
- "eval_steps_per_second": 1.974,
104
  "step": 22
105
  },
106
  {
107
  "epoch": 10.0,
108
- "eval_accuracy": 0.6875,
109
- "eval_loss": 2.6782259941101074,
110
- "eval_runtime": 3.1324,
111
- "eval_samples_per_second": 229.852,
112
- "eval_steps_per_second": 1.915,
113
  "step": 25
114
  },
115
  {
116
  "epoch": 10.8,
117
- "eval_accuracy": 0.7069444444444445,
118
- "eval_loss": 2.6306703090667725,
119
- "eval_runtime": 3.0536,
120
- "eval_samples_per_second": 235.785,
121
- "eval_steps_per_second": 1.965,
122
  "step": 27
123
  },
124
  {
125
  "epoch": 12.0,
126
- "grad_norm": 0.838556706905365,
127
- "learning_rate": 3.888888888888889e-05,
128
- "loss": 2.6444,
129
  "step": 30
130
  },
131
  {
132
  "epoch": 12.0,
133
- "eval_accuracy": 0.7472222222222222,
134
- "eval_loss": 2.5581698417663574,
135
- "eval_runtime": 3.3694,
136
- "eval_samples_per_second": 213.685,
137
- "eval_steps_per_second": 1.781,
138
  "step": 30
139
  },
140
  {
141
  "epoch": 12.8,
142
- "eval_accuracy": 0.7583333333333333,
143
- "eval_loss": 2.510007381439209,
144
- "eval_runtime": 3.0368,
145
- "eval_samples_per_second": 237.091,
146
- "eval_steps_per_second": 1.976,
147
  "step": 32
148
  },
149
  {
150
  "epoch": 14.0,
151
- "eval_accuracy": 0.7666666666666667,
152
- "eval_loss": 2.4404165744781494,
153
- "eval_runtime": 3.1041,
154
- "eval_samples_per_second": 231.955,
155
- "eval_steps_per_second": 1.933,
156
  "step": 35
157
  },
158
  {
159
  "epoch": 14.8,
160
- "eval_accuracy": 0.7708333333333334,
161
- "eval_loss": 2.3961093425750732,
162
- "eval_runtime": 2.9999,
163
- "eval_samples_per_second": 240.006,
164
- "eval_steps_per_second": 2.0,
165
  "step": 37
166
  },
167
  {
168
  "epoch": 16.0,
169
- "grad_norm": 0.858555018901825,
170
- "learning_rate": 3.3333333333333335e-05,
171
- "loss": 2.3812,
172
  "step": 40
173
  },
174
  {
175
  "epoch": 16.0,
176
- "eval_accuracy": 0.7680555555555556,
177
- "eval_loss": 2.3315865993499756,
178
- "eval_runtime": 3.1014,
179
- "eval_samples_per_second": 232.15,
180
- "eval_steps_per_second": 1.935,
181
  "step": 40
182
  },
183
  {
184
  "epoch": 16.8,
185
- "eval_accuracy": 0.7763888888888889,
186
- "eval_loss": 2.290663957595825,
187
- "eval_runtime": 3.0562,
188
- "eval_samples_per_second": 235.585,
189
- "eval_steps_per_second": 1.963,
190
  "step": 42
191
  },
192
  {
193
  "epoch": 18.0,
194
- "eval_accuracy": 0.7791666666666667,
195
- "eval_loss": 2.2317349910736084,
196
- "eval_runtime": 3.075,
197
- "eval_samples_per_second": 234.149,
198
- "eval_steps_per_second": 1.951,
199
  "step": 45
200
  },
201
  {
202
  "epoch": 18.8,
203
- "eval_accuracy": 0.7888888888888889,
204
- "eval_loss": 2.1949777603149414,
205
- "eval_runtime": 3.2371,
206
- "eval_samples_per_second": 222.42,
207
- "eval_steps_per_second": 1.854,
208
  "step": 47
209
  },
210
  {
211
  "epoch": 20.0,
212
- "grad_norm": 0.8867940306663513,
213
- "learning_rate": 2.777777777777778e-05,
214
- "loss": 2.1612,
215
  "step": 50
216
  },
217
  {
218
  "epoch": 20.0,
219
- "eval_accuracy": 0.7958333333333333,
220
- "eval_loss": 2.1451728343963623,
221
- "eval_runtime": 3.1048,
222
- "eval_samples_per_second": 231.902,
223
- "eval_steps_per_second": 1.933,
224
  "step": 50
225
  },
226
  {
227
  "epoch": 20.8,
228
- "eval_accuracy": 0.7930555555555555,
229
- "eval_loss": 2.1138837337493896,
230
- "eval_runtime": 3.0589,
231
- "eval_samples_per_second": 235.381,
232
- "eval_steps_per_second": 1.962,
233
  "step": 52
234
  },
235
  {
236
  "epoch": 22.0,
237
  "eval_accuracy": 0.7930555555555555,
238
- "eval_loss": 2.0691888332366943,
239
- "eval_runtime": 3.1824,
240
- "eval_samples_per_second": 226.244,
241
- "eval_steps_per_second": 1.885,
242
  "step": 55
243
  },
244
  {
245
  "epoch": 22.8,
246
- "eval_accuracy": 0.7986111111111112,
247
- "eval_loss": 2.041097402572632,
248
- "eval_runtime": 3.0557,
249
- "eval_samples_per_second": 235.626,
250
- "eval_steps_per_second": 1.964,
251
  "step": 57
252
  },
253
  {
254
  "epoch": 24.0,
255
- "grad_norm": 0.9011107683181763,
256
- "learning_rate": 2.2222222222222223e-05,
257
- "loss": 1.9801,
258
  "step": 60
259
  },
260
  {
261
  "epoch": 24.0,
262
- "eval_accuracy": 0.7986111111111112,
263
- "eval_loss": 2.003042221069336,
264
- "eval_runtime": 3.2184,
265
- "eval_samples_per_second": 223.713,
266
- "eval_steps_per_second": 1.864,
267
  "step": 60
268
  },
269
  {
270
  "epoch": 24.8,
271
- "eval_accuracy": 0.8083333333333333,
272
- "eval_loss": 1.97919499874115,
273
- "eval_runtime": 3.1013,
274
- "eval_samples_per_second": 232.157,
275
- "eval_steps_per_second": 1.935,
276
  "step": 62
277
  },
278
  {
279
  "epoch": 26.0,
280
- "eval_accuracy": 0.8097222222222222,
281
- "eval_loss": 1.9466288089752197,
282
- "eval_runtime": 3.1382,
283
- "eval_samples_per_second": 229.434,
284
- "eval_steps_per_second": 1.912,
285
  "step": 65
286
  },
287
  {
288
  "epoch": 26.8,
289
- "eval_accuracy": 0.8083333333333333,
290
- "eval_loss": 1.9272509813308716,
291
- "eval_runtime": 3.2793,
292
- "eval_samples_per_second": 219.559,
293
- "eval_steps_per_second": 1.83,
294
  "step": 67
295
  },
296
  {
297
  "epoch": 28.0,
298
- "grad_norm": 0.8933854699134827,
299
- "learning_rate": 1.6666666666666667e-05,
300
- "loss": 1.8437,
301
  "step": 70
302
  },
303
  {
304
  "epoch": 28.0,
305
- "eval_accuracy": 0.8152777777777778,
306
- "eval_loss": 1.90021550655365,
307
- "eval_runtime": 3.1022,
308
- "eval_samples_per_second": 232.097,
309
- "eval_steps_per_second": 1.934,
310
  "step": 70
311
  },
312
  {
313
  "epoch": 28.8,
314
- "eval_accuracy": 0.8180555555555555,
315
- "eval_loss": 1.8828704357147217,
316
- "eval_runtime": 3.0817,
317
- "eval_samples_per_second": 233.636,
318
- "eval_steps_per_second": 1.947,
319
  "step": 72
320
  },
321
  {
322
  "epoch": 30.0,
323
- "eval_accuracy": 0.825,
324
- "eval_loss": 1.860001802444458,
325
- "eval_runtime": 3.3637,
326
- "eval_samples_per_second": 214.053,
327
- "eval_steps_per_second": 1.784,
328
  "step": 75
329
  },
330
  {
331
  "epoch": 30.8,
332
- "eval_accuracy": 0.8291666666666667,
333
- "eval_loss": 1.8465533256530762,
334
- "eval_runtime": 3.0565,
335
- "eval_samples_per_second": 235.565,
336
- "eval_steps_per_second": 1.963,
337
  "step": 77
338
  },
339
  {
340
  "epoch": 32.0,
341
- "grad_norm": 0.9217949509620667,
342
- "learning_rate": 1.1111111111111112e-05,
343
- "loss": 1.7518,
344
  "step": 80
345
  },
346
  {
347
  "epoch": 32.0,
348
- "eval_accuracy": 0.8277777777777777,
349
- "eval_loss": 1.827829360961914,
350
- "eval_runtime": 3.1649,
351
- "eval_samples_per_second": 227.494,
352
- "eval_steps_per_second": 1.896,
353
  "step": 80
354
  },
355
  {
356
  "epoch": 32.8,
357
- "eval_accuracy": 0.8319444444444445,
358
- "eval_loss": 1.8170872926712036,
359
- "eval_runtime": 3.4533,
360
- "eval_samples_per_second": 208.494,
361
- "eval_steps_per_second": 1.737,
362
  "step": 82
363
  },
364
  {
365
  "epoch": 34.0,
366
- "eval_accuracy": 0.8333333333333334,
367
- "eval_loss": 1.8042806386947632,
368
- "eval_runtime": 3.7195,
369
- "eval_samples_per_second": 193.576,
370
- "eval_steps_per_second": 1.613,
371
  "step": 85
372
  },
373
  {
374
  "epoch": 34.8,
375
  "eval_accuracy": 0.8347222222222223,
376
- "eval_loss": 1.7976477146148682,
377
- "eval_runtime": 3.0512,
378
- "eval_samples_per_second": 235.975,
379
- "eval_steps_per_second": 1.966,
380
  "step": 87
381
  },
382
  {
383
  "epoch": 36.0,
384
- "grad_norm": 0.8925730586051941,
385
- "learning_rate": 5.555555555555556e-06,
386
- "loss": 1.6966,
387
  "step": 90
388
  },
389
  {
390
  "epoch": 36.0,
391
- "eval_accuracy": 0.8319444444444445,
392
- "eval_loss": 1.7895526885986328,
393
- "eval_runtime": 3.0775,
394
- "eval_samples_per_second": 233.957,
395
- "eval_steps_per_second": 1.95,
396
  "step": 90
397
  },
398
  {
399
  "epoch": 36.8,
400
- "eval_accuracy": 0.8347222222222223,
401
- "eval_loss": 1.7851399183273315,
402
- "eval_runtime": 3.1908,
403
- "eval_samples_per_second": 225.647,
404
- "eval_steps_per_second": 1.88,
405
  "step": 92
406
  },
407
  {
408
  "epoch": 38.0,
409
- "eval_accuracy": 0.8347222222222223,
410
- "eval_loss": 1.7800432443618774,
411
- "eval_runtime": 3.2053,
412
- "eval_samples_per_second": 224.627,
413
- "eval_steps_per_second": 1.872,
414
  "step": 95
415
  },
416
  {
417
  "epoch": 38.8,
418
- "eval_accuracy": 0.8347222222222223,
419
- "eval_loss": 1.7778292894363403,
420
- "eval_runtime": 3.0496,
421
- "eval_samples_per_second": 236.093,
422
- "eval_steps_per_second": 1.967,
423
  "step": 97
424
  },
425
  {
426
  "epoch": 40.0,
427
- "grad_norm": 0.8934192061424255,
428
- "learning_rate": 0.0,
429
- "loss": 1.6522,
430
  "step": 100
431
  },
432
  {
433
  "epoch": 40.0,
434
- "eval_accuracy": 0.8361111111111111,
435
- "eval_loss": 1.7763370275497437,
436
- "eval_runtime": 3.345,
437
- "eval_samples_per_second": 215.246,
438
- "eval_steps_per_second": 1.794,
439
  "step": 100
440
  },
441
  {
442
- "epoch": 40.0,
443
- "step": 100,
444
- "total_flos": 3.720348931719168e+18,
445
- "train_loss": 2.217303295135498,
446
- "train_runtime": 465.0811,
447
- "train_samples_per_second": 129.01,
448
- "train_steps_per_second": 0.215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  }
450
  ],
451
  "logging_steps": 10,
452
- "max_steps": 100,
453
  "num_input_tokens_seen": 0,
454
- "num_train_epochs": 50,
455
  "save_steps": 500,
456
  "stateful_callbacks": {
457
  "TrainerControl": {
@@ -465,7 +895,7 @@
465
  "attributes": {}
466
  }
467
  },
468
- "total_flos": 3.720348931719168e+18,
469
  "train_batch_size": 128,
470
  "trial_name": null,
471
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8638888888888889,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-inaturalist/checkpoint-140",
4
+ "epoch": 80.0,
5
  "eval_steps": 500,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.8,
13
+ "eval_accuracy": 0.03333333333333333,
14
+ "eval_loss": 3.200693368911743,
15
+ "eval_runtime": 3.9107,
16
+ "eval_samples_per_second": 184.111,
17
+ "eval_steps_per_second": 1.534,
18
  "step": 2
19
  },
20
  {
21
  "epoch": 2.0,
22
+ "eval_accuracy": 0.044444444444444446,
23
+ "eval_loss": 3.188920021057129,
24
+ "eval_runtime": 3.5624,
25
+ "eval_samples_per_second": 202.11,
26
+ "eval_steps_per_second": 1.684,
27
  "step": 5
28
  },
29
  {
30
  "epoch": 2.8,
31
+ "eval_accuracy": 0.06388888888888888,
32
+ "eval_loss": 3.1746575832366943,
33
+ "eval_runtime": 3.358,
34
+ "eval_samples_per_second": 214.412,
35
+ "eval_steps_per_second": 1.787,
36
  "step": 7
37
  },
38
  {
39
  "epoch": 4.0,
40
+ "grad_norm": 0.6863054633140564,
41
+ "learning_rate": 2.5e-05,
42
+ "loss": 3.1888,
43
  "step": 10
44
  },
45
  {
46
  "epoch": 4.0,
47
+ "eval_accuracy": 0.10972222222222222,
48
+ "eval_loss": 3.144199848175049,
49
+ "eval_runtime": 3.3224,
50
+ "eval_samples_per_second": 216.708,
51
+ "eval_steps_per_second": 1.806,
52
  "step": 10
53
  },
54
  {
55
  "epoch": 4.8,
56
+ "eval_accuracy": 0.14583333333333334,
57
+ "eval_loss": 3.1183247566223145,
58
+ "eval_runtime": 3.3659,
59
+ "eval_samples_per_second": 213.913,
60
+ "eval_steps_per_second": 1.783,
61
  "step": 12
62
  },
63
  {
64
  "epoch": 6.0,
65
+ "eval_accuracy": 0.21944444444444444,
66
+ "eval_loss": 3.071033477783203,
67
+ "eval_runtime": 3.1013,
68
+ "eval_samples_per_second": 232.158,
69
+ "eval_steps_per_second": 1.935,
70
  "step": 15
71
  },
72
  {
73
  "epoch": 6.8,
74
+ "eval_accuracy": 0.30416666666666664,
75
+ "eval_loss": 3.0330796241760254,
76
+ "eval_runtime": 3.0337,
77
+ "eval_samples_per_second": 237.333,
78
+ "eval_steps_per_second": 1.978,
79
  "step": 17
80
  },
81
  {
82
  "epoch": 8.0,
83
+ "grad_norm": 0.7356524467468262,
84
+ "learning_rate": 5e-05,
85
+ "loss": 3.0673,
86
  "step": 20
87
  },
88
  {
89
  "epoch": 8.0,
90
+ "eval_accuracy": 0.4388888888888889,
91
+ "eval_loss": 2.96268367767334,
92
+ "eval_runtime": 3.2316,
93
+ "eval_samples_per_second": 222.8,
94
+ "eval_steps_per_second": 1.857,
95
  "step": 20
96
  },
97
  {
98
  "epoch": 8.8,
99
+ "eval_accuracy": 0.49444444444444446,
100
+ "eval_loss": 2.9109385013580322,
101
+ "eval_runtime": 3.1222,
102
+ "eval_samples_per_second": 230.605,
103
+ "eval_steps_per_second": 1.922,
104
  "step": 22
105
  },
106
  {
107
  "epoch": 10.0,
108
+ "eval_accuracy": 0.5763888888888888,
109
+ "eval_loss": 2.8359875679016113,
110
+ "eval_runtime": 3.0808,
111
+ "eval_samples_per_second": 233.708,
112
+ "eval_steps_per_second": 1.948,
113
  "step": 25
114
  },
115
  {
116
  "epoch": 10.8,
117
+ "eval_accuracy": 0.6055555555555555,
118
+ "eval_loss": 2.780921459197998,
119
+ "eval_runtime": 3.0798,
120
+ "eval_samples_per_second": 233.781,
121
+ "eval_steps_per_second": 1.948,
122
  "step": 27
123
  },
124
  {
125
  "epoch": 12.0,
126
+ "grad_norm": 0.76748126745224,
127
+ "learning_rate": 4.722222222222222e-05,
128
+ "loss": 2.8151,
129
  "step": 30
130
  },
131
  {
132
  "epoch": 12.0,
133
+ "eval_accuracy": 0.6541666666666667,
134
+ "eval_loss": 2.695770025253296,
135
+ "eval_runtime": 3.3541,
136
+ "eval_samples_per_second": 214.662,
137
+ "eval_steps_per_second": 1.789,
138
  "step": 30
139
  },
140
  {
141
  "epoch": 12.8,
142
+ "eval_accuracy": 0.6763888888888889,
143
+ "eval_loss": 2.640125036239624,
144
+ "eval_runtime": 3.1085,
145
+ "eval_samples_per_second": 231.622,
146
+ "eval_steps_per_second": 1.93,
147
  "step": 32
148
  },
149
  {
150
  "epoch": 14.0,
151
+ "eval_accuracy": 0.6944444444444444,
152
+ "eval_loss": 2.558335304260254,
153
+ "eval_runtime": 3.0528,
154
+ "eval_samples_per_second": 235.852,
155
+ "eval_steps_per_second": 1.965,
156
  "step": 35
157
  },
158
  {
159
  "epoch": 14.8,
160
+ "eval_accuracy": 0.7083333333333334,
161
+ "eval_loss": 2.503415822982788,
162
+ "eval_runtime": 3.1925,
163
+ "eval_samples_per_second": 225.525,
164
+ "eval_steps_per_second": 1.879,
165
  "step": 37
166
  },
167
  {
168
  "epoch": 16.0,
169
+ "grad_norm": 0.8363860249519348,
170
+ "learning_rate": 4.4444444444444447e-05,
171
+ "loss": 2.5143,
172
  "step": 40
173
  },
174
  {
175
  "epoch": 16.0,
176
+ "eval_accuracy": 0.7347222222222223,
177
+ "eval_loss": 2.4201643466949463,
178
+ "eval_runtime": 3.0856,
179
+ "eval_samples_per_second": 233.339,
180
+ "eval_steps_per_second": 1.944,
181
  "step": 40
182
  },
183
  {
184
  "epoch": 16.8,
185
+ "eval_accuracy": 0.7375,
186
+ "eval_loss": 2.3662188053131104,
187
+ "eval_runtime": 3.0872,
188
+ "eval_samples_per_second": 233.218,
189
+ "eval_steps_per_second": 1.943,
190
  "step": 42
191
  },
192
  {
193
  "epoch": 18.0,
194
+ "eval_accuracy": 0.7444444444444445,
195
+ "eval_loss": 2.2883973121643066,
196
+ "eval_runtime": 3.1262,
197
+ "eval_samples_per_second": 230.314,
198
+ "eval_steps_per_second": 1.919,
199
  "step": 45
200
  },
201
  {
202
  "epoch": 18.8,
203
+ "eval_accuracy": 0.7569444444444444,
204
+ "eval_loss": 2.237414598464966,
205
+ "eval_runtime": 3.0834,
206
+ "eval_samples_per_second": 233.51,
207
+ "eval_steps_per_second": 1.946,
208
  "step": 47
209
  },
210
  {
211
  "epoch": 20.0,
212
+ "grad_norm": 0.8517465591430664,
213
+ "learning_rate": 4.166666666666667e-05,
214
+ "loss": 2.2236,
215
  "step": 50
216
  },
217
  {
218
  "epoch": 20.0,
219
+ "eval_accuracy": 0.7777777777777778,
220
+ "eval_loss": 2.1632001399993896,
221
+ "eval_runtime": 3.0769,
222
+ "eval_samples_per_second": 234.001,
223
+ "eval_steps_per_second": 1.95,
224
  "step": 50
225
  },
226
  {
227
  "epoch": 20.8,
228
+ "eval_accuracy": 0.7833333333333333,
229
+ "eval_loss": 2.1174519062042236,
230
+ "eval_runtime": 3.0592,
231
+ "eval_samples_per_second": 235.356,
232
+ "eval_steps_per_second": 1.961,
233
  "step": 52
234
  },
235
  {
236
  "epoch": 22.0,
237
  "eval_accuracy": 0.7930555555555555,
238
+ "eval_loss": 2.052760124206543,
239
+ "eval_runtime": 3.21,
240
+ "eval_samples_per_second": 224.301,
241
+ "eval_steps_per_second": 1.869,
242
  "step": 55
243
  },
244
  {
245
  "epoch": 22.8,
246
+ "eval_accuracy": 0.7958333333333333,
247
+ "eval_loss": 2.009880304336548,
248
+ "eval_runtime": 3.0534,
249
+ "eval_samples_per_second": 235.801,
250
+ "eval_steps_per_second": 1.965,
251
  "step": 57
252
  },
253
  {
254
  "epoch": 24.0,
255
+ "grad_norm": 0.8587987422943115,
256
+ "learning_rate": 3.888888888888889e-05,
257
+ "loss": 1.9677,
258
  "step": 60
259
  },
260
  {
261
  "epoch": 24.0,
262
+ "eval_accuracy": 0.8013888888888889,
263
+ "eval_loss": 1.9488461017608643,
264
+ "eval_runtime": 3.1017,
265
+ "eval_samples_per_second": 232.129,
266
+ "eval_steps_per_second": 1.934,
267
  "step": 60
268
  },
269
  {
270
  "epoch": 24.8,
271
+ "eval_accuracy": 0.8097222222222222,
272
+ "eval_loss": 1.9112929105758667,
273
+ "eval_runtime": 3.0366,
274
+ "eval_samples_per_second": 237.104,
275
+ "eval_steps_per_second": 1.976,
276
  "step": 62
277
  },
278
  {
279
  "epoch": 26.0,
280
+ "eval_accuracy": 0.8138888888888889,
281
+ "eval_loss": 1.8581663370132446,
282
+ "eval_runtime": 3.0592,
283
+ "eval_samples_per_second": 235.359,
284
+ "eval_steps_per_second": 1.961,
285
  "step": 65
286
  },
287
  {
288
  "epoch": 26.8,
289
+ "eval_accuracy": 0.8138888888888889,
290
+ "eval_loss": 1.8241873979568481,
291
+ "eval_runtime": 3.2249,
292
+ "eval_samples_per_second": 223.26,
293
+ "eval_steps_per_second": 1.861,
294
  "step": 67
295
  },
296
  {
297
  "epoch": 28.0,
298
+ "grad_norm": 0.8794375061988831,
299
+ "learning_rate": 3.611111111111111e-05,
300
+ "loss": 1.7467,
301
  "step": 70
302
  },
303
  {
304
  "epoch": 28.0,
305
+ "eval_accuracy": 0.8111111111111111,
306
+ "eval_loss": 1.7740373611450195,
307
+ "eval_runtime": 3.1535,
308
+ "eval_samples_per_second": 228.319,
309
+ "eval_steps_per_second": 1.903,
310
  "step": 70
311
  },
312
  {
313
  "epoch": 28.8,
314
+ "eval_accuracy": 0.8055555555555556,
315
+ "eval_loss": 1.7457906007766724,
316
+ "eval_runtime": 3.0886,
317
+ "eval_samples_per_second": 233.114,
318
+ "eval_steps_per_second": 1.943,
319
  "step": 72
320
  },
321
  {
322
  "epoch": 30.0,
323
+ "eval_accuracy": 0.8180555555555555,
324
+ "eval_loss": 1.7013169527053833,
325
+ "eval_runtime": 3.0478,
326
+ "eval_samples_per_second": 236.234,
327
+ "eval_steps_per_second": 1.969,
328
  "step": 75
329
  },
330
  {
331
  "epoch": 30.8,
332
+ "eval_accuracy": 0.8194444444444444,
333
+ "eval_loss": 1.6714116334915161,
334
+ "eval_runtime": 3.1668,
335
+ "eval_samples_per_second": 227.361,
336
+ "eval_steps_per_second": 1.895,
337
  "step": 77
338
  },
339
  {
340
  "epoch": 32.0,
341
+ "grad_norm": 0.9110932946205139,
342
+ "learning_rate": 3.3333333333333335e-05,
343
+ "loss": 1.5765,
344
  "step": 80
345
  },
346
  {
347
  "epoch": 32.0,
348
+ "eval_accuracy": 0.8263888888888888,
349
+ "eval_loss": 1.631589412689209,
350
+ "eval_runtime": 3.3174,
351
+ "eval_samples_per_second": 217.038,
352
+ "eval_steps_per_second": 1.809,
353
  "step": 80
354
  },
355
  {
356
  "epoch": 32.8,
357
+ "eval_accuracy": 0.8236111111111111,
358
+ "eval_loss": 1.6082607507705688,
359
+ "eval_runtime": 3.107,
360
+ "eval_samples_per_second": 231.734,
361
+ "eval_steps_per_second": 1.931,
362
  "step": 82
363
  },
364
  {
365
  "epoch": 34.0,
366
+ "eval_accuracy": 0.8291666666666667,
367
+ "eval_loss": 1.5738186836242676,
368
+ "eval_runtime": 3.1316,
369
+ "eval_samples_per_second": 229.912,
370
+ "eval_steps_per_second": 1.916,
371
  "step": 85
372
  },
373
  {
374
  "epoch": 34.8,
375
  "eval_accuracy": 0.8347222222222223,
376
+ "eval_loss": 1.553105354309082,
377
+ "eval_runtime": 3.2321,
378
+ "eval_samples_per_second": 222.767,
379
+ "eval_steps_per_second": 1.856,
380
  "step": 87
381
  },
382
  {
383
  "epoch": 36.0,
384
+ "grad_norm": 0.9191176891326904,
385
+ "learning_rate": 3.055555555555556e-05,
386
+ "loss": 1.4431,
387
  "step": 90
388
  },
389
  {
390
  "epoch": 36.0,
391
+ "eval_accuracy": 0.8430555555555556,
392
+ "eval_loss": 1.5228244066238403,
393
+ "eval_runtime": 3.0649,
394
+ "eval_samples_per_second": 234.921,
395
+ "eval_steps_per_second": 1.958,
396
  "step": 90
397
  },
398
  {
399
  "epoch": 36.8,
400
+ "eval_accuracy": 0.8444444444444444,
401
+ "eval_loss": 1.5046004056930542,
402
+ "eval_runtime": 3.2047,
403
+ "eval_samples_per_second": 224.669,
404
+ "eval_steps_per_second": 1.872,
405
  "step": 92
406
  },
407
  {
408
  "epoch": 38.0,
409
+ "eval_accuracy": 0.8472222222222222,
410
+ "eval_loss": 1.4780092239379883,
411
+ "eval_runtime": 3.0681,
412
+ "eval_samples_per_second": 234.673,
413
+ "eval_steps_per_second": 1.956,
414
  "step": 95
415
  },
416
  {
417
  "epoch": 38.8,
418
+ "eval_accuracy": 0.8458333333333333,
419
+ "eval_loss": 1.4607552289962769,
420
+ "eval_runtime": 3.3287,
421
+ "eval_samples_per_second": 216.303,
422
+ "eval_steps_per_second": 1.803,
423
  "step": 97
424
  },
425
  {
426
  "epoch": 40.0,
427
+ "grad_norm": 0.896637499332428,
428
+ "learning_rate": 2.777777777777778e-05,
429
+ "loss": 1.3049,
430
  "step": 100
431
  },
432
  {
433
  "epoch": 40.0,
434
+ "eval_accuracy": 0.8458333333333333,
435
+ "eval_loss": 1.435728907585144,
436
+ "eval_runtime": 3.1142,
437
+ "eval_samples_per_second": 231.201,
438
+ "eval_steps_per_second": 1.927,
439
  "step": 100
440
  },
441
  {
442
+ "epoch": 40.8,
443
+ "eval_accuracy": 0.85,
444
+ "eval_loss": 1.4187902212142944,
445
+ "eval_runtime": 3.144,
446
+ "eval_samples_per_second": 229.01,
447
+ "eval_steps_per_second": 1.908,
448
+ "step": 102
449
+ },
450
+ {
451
+ "epoch": 42.0,
452
+ "eval_accuracy": 0.8527777777777777,
453
+ "eval_loss": 1.3949499130249023,
454
+ "eval_runtime": 3.0808,
455
+ "eval_samples_per_second": 233.709,
456
+ "eval_steps_per_second": 1.948,
457
+ "step": 105
458
+ },
459
+ {
460
+ "epoch": 42.8,
461
+ "eval_accuracy": 0.8527777777777777,
462
+ "eval_loss": 1.3807573318481445,
463
+ "eval_runtime": 3.1783,
464
+ "eval_samples_per_second": 226.537,
465
+ "eval_steps_per_second": 1.888,
466
+ "step": 107
467
+ },
468
+ {
469
+ "epoch": 44.0,
470
+ "grad_norm": 0.8933643102645874,
471
+ "learning_rate": 2.5e-05,
472
+ "loss": 1.2312,
473
+ "step": 110
474
+ },
475
+ {
476
+ "epoch": 44.0,
477
+ "eval_accuracy": 0.8458333333333333,
478
+ "eval_loss": 1.3636168241500854,
479
+ "eval_runtime": 3.356,
480
+ "eval_samples_per_second": 214.543,
481
+ "eval_steps_per_second": 1.788,
482
+ "step": 110
483
+ },
484
+ {
485
+ "epoch": 44.8,
486
+ "eval_accuracy": 0.8486111111111111,
487
+ "eval_loss": 1.3513400554656982,
488
+ "eval_runtime": 3.2597,
489
+ "eval_samples_per_second": 220.878,
490
+ "eval_steps_per_second": 1.841,
491
+ "step": 112
492
+ },
493
+ {
494
+ "epoch": 46.0,
495
+ "eval_accuracy": 0.8527777777777777,
496
+ "eval_loss": 1.3329037427902222,
497
+ "eval_runtime": 3.0929,
498
+ "eval_samples_per_second": 232.788,
499
+ "eval_steps_per_second": 1.94,
500
+ "step": 115
501
+ },
502
+ {
503
+ "epoch": 46.8,
504
+ "eval_accuracy": 0.8527777777777777,
505
+ "eval_loss": 1.319313883781433,
506
+ "eval_runtime": 3.1007,
507
+ "eval_samples_per_second": 232.203,
508
+ "eval_steps_per_second": 1.935,
509
+ "step": 117
510
+ },
511
+ {
512
+ "epoch": 48.0,
513
+ "grad_norm": 0.9017526507377625,
514
+ "learning_rate": 2.2222222222222223e-05,
515
+ "loss": 1.1368,
516
+ "step": 120
517
+ },
518
+ {
519
+ "epoch": 48.0,
520
+ "eval_accuracy": 0.8527777777777777,
521
+ "eval_loss": 1.3025320768356323,
522
+ "eval_runtime": 3.1234,
523
+ "eval_samples_per_second": 230.52,
524
+ "eval_steps_per_second": 1.921,
525
+ "step": 120
526
+ },
527
+ {
528
+ "epoch": 48.8,
529
+ "eval_accuracy": 0.8541666666666666,
530
+ "eval_loss": 1.2945308685302734,
531
+ "eval_runtime": 3.0692,
532
+ "eval_samples_per_second": 234.59,
533
+ "eval_steps_per_second": 1.955,
534
+ "step": 122
535
+ },
536
+ {
537
+ "epoch": 50.0,
538
+ "eval_accuracy": 0.8527777777777777,
539
+ "eval_loss": 1.2819503545761108,
540
+ "eval_runtime": 3.1162,
541
+ "eval_samples_per_second": 231.052,
542
+ "eval_steps_per_second": 1.925,
543
+ "step": 125
544
+ },
545
+ {
546
+ "epoch": 50.8,
547
+ "eval_accuracy": 0.8569444444444444,
548
+ "eval_loss": 1.2704639434814453,
549
+ "eval_runtime": 3.2771,
550
+ "eval_samples_per_second": 219.706,
551
+ "eval_steps_per_second": 1.831,
552
+ "step": 127
553
+ },
554
+ {
555
+ "epoch": 52.0,
556
+ "grad_norm": 0.8930371999740601,
557
+ "learning_rate": 1.9444444444444445e-05,
558
+ "loss": 1.0821,
559
+ "step": 130
560
+ },
561
+ {
562
+ "epoch": 52.0,
563
+ "eval_accuracy": 0.8583333333333333,
564
+ "eval_loss": 1.2615665197372437,
565
+ "eval_runtime": 3.1151,
566
+ "eval_samples_per_second": 231.134,
567
+ "eval_steps_per_second": 1.926,
568
+ "step": 130
569
+ },
570
+ {
571
+ "epoch": 52.8,
572
+ "eval_accuracy": 0.8555555555555555,
573
+ "eval_loss": 1.2545220851898193,
574
+ "eval_runtime": 3.0635,
575
+ "eval_samples_per_second": 235.024,
576
+ "eval_steps_per_second": 1.959,
577
+ "step": 132
578
+ },
579
+ {
580
+ "epoch": 54.0,
581
+ "eval_accuracy": 0.8541666666666666,
582
+ "eval_loss": 1.2422840595245361,
583
+ "eval_runtime": 3.0988,
584
+ "eval_samples_per_second": 232.348,
585
+ "eval_steps_per_second": 1.936,
586
+ "step": 135
587
+ },
588
+ {
589
+ "epoch": 54.8,
590
+ "eval_accuracy": 0.8597222222222223,
591
+ "eval_loss": 1.233168125152588,
592
+ "eval_runtime": 3.1982,
593
+ "eval_samples_per_second": 225.127,
594
+ "eval_steps_per_second": 1.876,
595
+ "step": 137
596
+ },
597
+ {
598
+ "epoch": 56.0,
599
+ "grad_norm": 0.8568278551101685,
600
+ "learning_rate": 1.6666666666666667e-05,
601
+ "loss": 1.0232,
602
+ "step": 140
603
+ },
604
+ {
605
+ "epoch": 56.0,
606
+ "eval_accuracy": 0.8638888888888889,
607
+ "eval_loss": 1.2210274934768677,
608
+ "eval_runtime": 3.355,
609
+ "eval_samples_per_second": 214.604,
610
+ "eval_steps_per_second": 1.788,
611
+ "step": 140
612
+ },
613
+ {
614
+ "epoch": 56.8,
615
+ "eval_accuracy": 0.8625,
616
+ "eval_loss": 1.2160966396331787,
617
+ "eval_runtime": 3.127,
618
+ "eval_samples_per_second": 230.252,
619
+ "eval_steps_per_second": 1.919,
620
+ "step": 142
621
+ },
622
+ {
623
+ "epoch": 58.0,
624
+ "eval_accuracy": 0.8569444444444444,
625
+ "eval_loss": 1.209418535232544,
626
+ "eval_runtime": 3.1685,
627
+ "eval_samples_per_second": 227.239,
628
+ "eval_steps_per_second": 1.894,
629
+ "step": 145
630
+ },
631
+ {
632
+ "epoch": 58.8,
633
+ "eval_accuracy": 0.8541666666666666,
634
+ "eval_loss": 1.205717921257019,
635
+ "eval_runtime": 3.1918,
636
+ "eval_samples_per_second": 225.576,
637
+ "eval_steps_per_second": 1.88,
638
+ "step": 147
639
+ },
640
+ {
641
+ "epoch": 60.0,
642
+ "grad_norm": 0.8556333780288696,
643
+ "learning_rate": 1.388888888888889e-05,
644
+ "loss": 0.9814,
645
+ "step": 150
646
+ },
647
+ {
648
+ "epoch": 60.0,
649
+ "eval_accuracy": 0.85,
650
+ "eval_loss": 1.1972941160202026,
651
+ "eval_runtime": 3.1515,
652
+ "eval_samples_per_second": 228.465,
653
+ "eval_steps_per_second": 1.904,
654
+ "step": 150
655
+ },
656
+ {
657
+ "epoch": 60.8,
658
+ "eval_accuracy": 0.8486111111111111,
659
+ "eval_loss": 1.1918764114379883,
660
+ "eval_runtime": 3.349,
661
+ "eval_samples_per_second": 214.987,
662
+ "eval_steps_per_second": 1.792,
663
+ "step": 152
664
+ },
665
+ {
666
+ "epoch": 62.0,
667
+ "eval_accuracy": 0.8625,
668
+ "eval_loss": 1.1825212240219116,
669
+ "eval_runtime": 3.1401,
670
+ "eval_samples_per_second": 229.295,
671
+ "eval_steps_per_second": 1.911,
672
+ "step": 155
673
+ },
674
+ {
675
+ "epoch": 62.8,
676
+ "eval_accuracy": 0.8597222222222223,
677
+ "eval_loss": 1.179900050163269,
678
+ "eval_runtime": 3.239,
679
+ "eval_samples_per_second": 222.293,
680
+ "eval_steps_per_second": 1.852,
681
+ "step": 157
682
+ },
683
+ {
684
+ "epoch": 64.0,
685
+ "grad_norm": 0.827712893486023,
686
+ "learning_rate": 1.1111111111111112e-05,
687
+ "loss": 0.9415,
688
+ "step": 160
689
+ },
690
+ {
691
+ "epoch": 64.0,
692
+ "eval_accuracy": 0.8597222222222223,
693
+ "eval_loss": 1.1716293096542358,
694
+ "eval_runtime": 3.1517,
695
+ "eval_samples_per_second": 228.447,
696
+ "eval_steps_per_second": 1.904,
697
+ "step": 160
698
+ },
699
+ {
700
+ "epoch": 64.8,
701
+ "eval_accuracy": 0.8625,
702
+ "eval_loss": 1.166538119316101,
703
+ "eval_runtime": 3.22,
704
+ "eval_samples_per_second": 223.601,
705
+ "eval_steps_per_second": 1.863,
706
+ "step": 162
707
+ },
708
+ {
709
+ "epoch": 66.0,
710
+ "eval_accuracy": 0.8638888888888889,
711
+ "eval_loss": 1.1611206531524658,
712
+ "eval_runtime": 3.323,
713
+ "eval_samples_per_second": 216.67,
714
+ "eval_steps_per_second": 1.806,
715
+ "step": 165
716
+ },
717
+ {
718
+ "epoch": 66.8,
719
+ "eval_accuracy": 0.8625,
720
+ "eval_loss": 1.1600357294082642,
721
+ "eval_runtime": 3.2253,
722
+ "eval_samples_per_second": 223.233,
723
+ "eval_steps_per_second": 1.86,
724
+ "step": 167
725
+ },
726
+ {
727
+ "epoch": 68.0,
728
+ "grad_norm": 0.9616327285766602,
729
+ "learning_rate": 8.333333333333334e-06,
730
+ "loss": 0.9135,
731
+ "step": 170
732
+ },
733
+ {
734
+ "epoch": 68.0,
735
+ "eval_accuracy": 0.8638888888888889,
736
+ "eval_loss": 1.1577341556549072,
737
+ "eval_runtime": 3.1095,
738
+ "eval_samples_per_second": 231.548,
739
+ "eval_steps_per_second": 1.93,
740
+ "step": 170
741
+ },
742
+ {
743
+ "epoch": 68.8,
744
+ "eval_accuracy": 0.8638888888888889,
745
+ "eval_loss": 1.1546900272369385,
746
+ "eval_runtime": 3.2289,
747
+ "eval_samples_per_second": 222.988,
748
+ "eval_steps_per_second": 1.858,
749
+ "step": 172
750
+ },
751
+ {
752
+ "epoch": 70.0,
753
+ "eval_accuracy": 0.8638888888888889,
754
+ "eval_loss": 1.1493217945098877,
755
+ "eval_runtime": 3.103,
756
+ "eval_samples_per_second": 232.033,
757
+ "eval_steps_per_second": 1.934,
758
+ "step": 175
759
+ },
760
+ {
761
+ "epoch": 70.8,
762
+ "eval_accuracy": 0.8611111111111112,
763
+ "eval_loss": 1.1463639736175537,
764
+ "eval_runtime": 3.3191,
765
+ "eval_samples_per_second": 216.926,
766
+ "eval_steps_per_second": 1.808,
767
+ "step": 177
768
+ },
769
+ {
770
+ "epoch": 72.0,
771
+ "grad_norm": 0.8919360637664795,
772
+ "learning_rate": 5.555555555555556e-06,
773
+ "loss": 0.8946,
774
+ "step": 180
775
+ },
776
+ {
777
+ "epoch": 72.0,
778
+ "eval_accuracy": 0.8555555555555555,
779
+ "eval_loss": 1.1423206329345703,
780
+ "eval_runtime": 3.0106,
781
+ "eval_samples_per_second": 239.152,
782
+ "eval_steps_per_second": 1.993,
783
+ "step": 180
784
+ },
785
+ {
786
+ "epoch": 72.8,
787
+ "eval_accuracy": 0.8611111111111112,
788
+ "eval_loss": 1.1402053833007812,
789
+ "eval_runtime": 3.4453,
790
+ "eval_samples_per_second": 208.981,
791
+ "eval_steps_per_second": 1.742,
792
+ "step": 182
793
+ },
794
+ {
795
+ "epoch": 74.0,
796
+ "eval_accuracy": 0.8583333333333333,
797
+ "eval_loss": 1.1375410556793213,
798
+ "eval_runtime": 3.1143,
799
+ "eval_samples_per_second": 231.188,
800
+ "eval_steps_per_second": 1.927,
801
+ "step": 185
802
+ },
803
+ {
804
+ "epoch": 74.8,
805
+ "eval_accuracy": 0.8597222222222223,
806
+ "eval_loss": 1.1360384225845337,
807
+ "eval_runtime": 3.2332,
808
+ "eval_samples_per_second": 222.688,
809
+ "eval_steps_per_second": 1.856,
810
+ "step": 187
811
+ },
812
+ {
813
+ "epoch": 76.0,
814
+ "grad_norm": 0.9335024356842041,
815
+ "learning_rate": 2.777777777777778e-06,
816
+ "loss": 0.8866,
817
+ "step": 190
818
+ },
819
+ {
820
+ "epoch": 76.0,
821
+ "eval_accuracy": 0.8625,
822
+ "eval_loss": 1.134353756904602,
823
+ "eval_runtime": 3.1331,
824
+ "eval_samples_per_second": 229.808,
825
+ "eval_steps_per_second": 1.915,
826
+ "step": 190
827
+ },
828
+ {
829
+ "epoch": 76.8,
830
+ "eval_accuracy": 0.8638888888888889,
831
+ "eval_loss": 1.1333543062210083,
832
+ "eval_runtime": 3.2199,
833
+ "eval_samples_per_second": 223.611,
834
+ "eval_steps_per_second": 1.863,
835
+ "step": 192
836
+ },
837
+ {
838
+ "epoch": 78.0,
839
+ "eval_accuracy": 0.8638888888888889,
840
+ "eval_loss": 1.1323890686035156,
841
+ "eval_runtime": 3.0815,
842
+ "eval_samples_per_second": 233.653,
843
+ "eval_steps_per_second": 1.947,
844
+ "step": 195
845
+ },
846
+ {
847
+ "epoch": 78.8,
848
+ "eval_accuracy": 0.8638888888888889,
849
+ "eval_loss": 1.1320444345474243,
850
+ "eval_runtime": 3.2638,
851
+ "eval_samples_per_second": 220.604,
852
+ "eval_steps_per_second": 1.838,
853
+ "step": 197
854
+ },
855
+ {
856
+ "epoch": 80.0,
857
+ "grad_norm": 0.8473469614982605,
858
+ "learning_rate": 0.0,
859
+ "loss": 0.8798,
860
+ "step": 200
861
+ },
862
+ {
863
+ "epoch": 80.0,
864
+ "eval_accuracy": 0.8638888888888889,
865
+ "eval_loss": 1.1318904161453247,
866
+ "eval_runtime": 3.2096,
867
+ "eval_samples_per_second": 224.325,
868
+ "eval_steps_per_second": 1.869,
869
+ "step": 200
870
+ },
871
+ {
872
+ "epoch": 80.0,
873
+ "step": 200,
874
+ "total_flos": 7.440697863438336e+18,
875
+ "train_loss": 1.5909362745285034,
876
+ "train_runtime": 1036.0381,
877
+ "train_samples_per_second": 115.826,
878
+ "train_steps_per_second": 0.193
879
  }
880
  ],
881
  "logging_steps": 10,
882
+ "max_steps": 200,
883
  "num_input_tokens_seen": 0,
884
+ "num_train_epochs": 100,
885
  "save_steps": 500,
886
  "stateful_callbacks": {
887
  "TrainerControl": {
 
895
  "attributes": {}
896
  }
897
  },
898
+ "total_flos": 7.440697863438336e+18,
899
  "train_batch_size": 128,
900
  "trial_name": null,
901
  "trial_params": null
val_eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 40.0,
3
- "eval_accuracy": 0.8361111111111111,
4
- "eval_loss": 1.7763370275497437,
5
- "eval_runtime": 3.2414,
6
- "eval_samples_per_second": 222.127,
7
- "eval_steps_per_second": 1.851
8
  }
 
1
  {
2
+ "epoch": 80.0,
3
+ "eval_accuracy": 0.8638888888888889,
4
+ "eval_loss": 1.2210274934768677,
5
+ "eval_runtime": 3.2732,
6
+ "eval_samples_per_second": 219.966,
7
+ "eval_steps_per_second": 1.833
8
  }