Encore02 commited on
Commit
11dad25
1 Parent(s): f8cac2a

🍻 cheers

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
 
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
@@ -23,7 +24,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.9136690647482014
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +34,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.3569
37
- - Accuracy: 0.9137
38
 
39
  ## Model description
40
 
 
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
6
+ - image-classification
7
  - generated_from_trainer
8
  datasets:
9
  - imagefolder
 
24
  metrics:
25
  - name: Accuracy
26
  type: accuracy
27
+ value: 0.920863309352518
28
  ---
29
 
30
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
34
 
35
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
36
  It achieves the following results on the evaluation set:
37
+ - Loss: 0.3301
38
+ - Accuracy: 0.9209
39
 
40
  ## Model description
41
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.9172661870503597,
4
- "eval_loss": 0.2732398509979248,
5
- "eval_runtime": 2.9789,
6
- "eval_samples_per_second": 93.324,
7
- "eval_steps_per_second": 11.749,
8
  "total_flos": 1.9334597982400512e+18,
9
- "train_loss": 0.21612570865485722,
10
- "train_runtime": 723.2579,
11
- "train_samples_per_second": 34.497,
12
- "train_steps_per_second": 2.157
13
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.920863309352518,
4
+ "eval_loss": 0.330095499753952,
5
+ "eval_runtime": 2.7656,
6
+ "eval_samples_per_second": 100.521,
7
+ "eval_steps_per_second": 12.656,
8
  "total_flos": 1.9334597982400512e+18,
9
+ "train_loss": 0.20911514231314263,
10
+ "train_runtime": 603.1201,
11
+ "train_samples_per_second": 41.368,
12
+ "train_steps_per_second": 2.587
13
  }
data/events.out.tfevents.1730711565.0894b137f84e.248.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b818a1630ed93ed7a3d1ba8b06671df4b96179958951f73df53dfb2c07f694c
3
+ size 411
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.9172661870503597,
4
- "eval_loss": 0.2732398509979248,
5
- "eval_runtime": 2.9789,
6
- "eval_samples_per_second": 93.324,
7
- "eval_steps_per_second": 11.749
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.920863309352518,
4
+ "eval_loss": 0.330095499753952,
5
+ "eval_runtime": 2.7656,
6
+ "eval_samples_per_second": 100.521,
7
+ "eval_steps_per_second": 12.656
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 1.9334597982400512e+18,
4
- "train_loss": 0.21612570865485722,
5
- "train_runtime": 723.2579,
6
- "train_samples_per_second": 34.497,
7
- "train_steps_per_second": 2.157
8
  }
 
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 1.9334597982400512e+18,
4
+ "train_loss": 0.20911514231314263,
5
+ "train_runtime": 603.1201,
6
+ "train_samples_per_second": 41.368,
7
+ "train_steps_per_second": 2.587
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.2732398509979248,
3
- "best_model_checkpoint": "vit-weldclassifyv4/checkpoint-500",
4
  "epoch": 10.0,
5
  "eval_steps": 100,
6
  "global_step": 1560,
@@ -10,1239 +10,1239 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0641025641025641,
13
- "grad_norm": 1.2882777452468872,
14
  "learning_rate": 0.00019871794871794874,
15
- "loss": 1.2572,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1282051282051282,
20
- "grad_norm": 1.0699219703674316,
21
  "learning_rate": 0.00019743589743589744,
22
- "loss": 1.1322,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.19230769230769232,
27
- "grad_norm": 1.5904221534729004,
28
  "learning_rate": 0.00019615384615384615,
29
- "loss": 1.1501,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2564102564102564,
34
- "grad_norm": 1.3816825151443481,
35
  "learning_rate": 0.00019487179487179487,
36
- "loss": 1.0852,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.32051282051282054,
41
- "grad_norm": 2.104780435562134,
42
  "learning_rate": 0.0001935897435897436,
43
- "loss": 1.1374,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.38461538461538464,
48
- "grad_norm": 4.023139476776123,
49
  "learning_rate": 0.00019230769230769233,
50
- "loss": 0.9742,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.44871794871794873,
55
- "grad_norm": 2.541919469833374,
56
  "learning_rate": 0.00019102564102564104,
57
- "loss": 0.8644,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.5128205128205128,
62
- "grad_norm": 1.7673051357269287,
63
  "learning_rate": 0.00018974358974358974,
64
- "loss": 0.928,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.5769230769230769,
69
- "grad_norm": 2.5402064323425293,
70
  "learning_rate": 0.00018846153846153847,
71
- "loss": 0.8873,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.6410256410256411,
76
- "grad_norm": 2.0333411693573,
77
  "learning_rate": 0.0001871794871794872,
78
- "loss": 0.8791,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.6410256410256411,
83
- "eval_accuracy": 0.7014388489208633,
84
- "eval_loss": 0.7506538033485413,
85
- "eval_runtime": 5.5468,
86
- "eval_samples_per_second": 50.119,
87
- "eval_steps_per_second": 6.31,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.7051282051282052,
92
- "grad_norm": 3.6588566303253174,
93
  "learning_rate": 0.0001858974358974359,
94
- "loss": 0.8667,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.7692307692307693,
99
- "grad_norm": 4.510610580444336,
100
  "learning_rate": 0.00018461538461538463,
101
- "loss": 0.7618,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.8333333333333334,
106
- "grad_norm": 3.297905445098877,
107
  "learning_rate": 0.00018333333333333334,
108
- "loss": 0.814,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.8974358974358975,
113
- "grad_norm": 2.0982401371002197,
114
  "learning_rate": 0.00018205128205128207,
115
- "loss": 1.0144,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.9615384615384616,
120
- "grad_norm": 2.0412075519561768,
121
  "learning_rate": 0.00018076923076923077,
122
- "loss": 0.6803,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.0256410256410255,
127
- "grad_norm": 1.8364259004592896,
128
  "learning_rate": 0.0001794871794871795,
129
- "loss": 0.6764,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.0897435897435896,
134
- "grad_norm": 2.415219306945801,
135
  "learning_rate": 0.00017820512820512823,
136
- "loss": 0.6038,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.1538461538461537,
141
- "grad_norm": 2.8083627223968506,
142
  "learning_rate": 0.00017692307692307693,
143
- "loss": 0.609,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.217948717948718,
148
- "grad_norm": 2.6611833572387695,
149
  "learning_rate": 0.00017564102564102566,
150
- "loss": 0.4588,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.282051282051282,
155
- "grad_norm": 3.2890446186065674,
156
  "learning_rate": 0.00017435897435897436,
157
- "loss": 0.7436,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.282051282051282,
162
- "eval_accuracy": 0.7697841726618705,
163
- "eval_loss": 0.5399633049964905,
164
- "eval_runtime": 6.1673,
165
- "eval_samples_per_second": 45.076,
166
- "eval_steps_per_second": 5.675,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 1.3461538461538463,
171
- "grad_norm": 1.8595383167266846,
172
  "learning_rate": 0.0001730769230769231,
173
- "loss": 0.5508,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 1.4102564102564101,
178
- "grad_norm": 2.8622944355010986,
179
  "learning_rate": 0.0001717948717948718,
180
- "loss": 0.5182,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 1.4743589743589745,
185
- "grad_norm": 1.320397138595581,
186
  "learning_rate": 0.00017051282051282053,
187
- "loss": 0.5577,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 1.5384615384615383,
192
- "grad_norm": 2.9527108669281006,
193
  "learning_rate": 0.00016923076923076923,
194
- "loss": 0.5081,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.6025641025641026,
199
- "grad_norm": 6.604541301727295,
200
  "learning_rate": 0.00016794871794871796,
201
- "loss": 0.7143,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.6666666666666665,
206
- "grad_norm": 3.27010440826416,
207
  "learning_rate": 0.0001666666666666667,
208
- "loss": 0.5838,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.7307692307692308,
213
- "grad_norm": 2.723151922225952,
214
  "learning_rate": 0.0001653846153846154,
215
- "loss": 0.4859,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.7948717948717947,
220
- "grad_norm": 2.4719648361206055,
221
  "learning_rate": 0.0001641025641025641,
222
- "loss": 0.4969,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.858974358974359,
227
- "grad_norm": 1.9341636896133423,
228
  "learning_rate": 0.00016282051282051282,
229
- "loss": 0.4678,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.9230769230769231,
234
- "grad_norm": 2.635348320007324,
235
  "learning_rate": 0.00016153846153846155,
236
- "loss": 0.4783,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.9230769230769231,
241
- "eval_accuracy": 0.8345323741007195,
242
- "eval_loss": 0.48316845297813416,
243
- "eval_runtime": 6.0017,
244
- "eval_samples_per_second": 46.32,
245
- "eval_steps_per_second": 5.832,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 1.9871794871794872,
250
- "grad_norm": 3.740190267562866,
251
  "learning_rate": 0.00016025641025641028,
252
- "loss": 0.8054,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 2.051282051282051,
257
- "grad_norm": 3.149517059326172,
258
  "learning_rate": 0.00015897435897435896,
259
- "loss": 0.4985,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 2.1153846153846154,
264
- "grad_norm": 2.627647638320923,
265
  "learning_rate": 0.0001576923076923077,
266
- "loss": 0.3483,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 2.1794871794871793,
271
- "grad_norm": 4.553584575653076,
272
  "learning_rate": 0.00015641025641025642,
273
- "loss": 0.3432,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 2.2435897435897436,
278
- "grad_norm": 1.0300644636154175,
279
  "learning_rate": 0.00015512820512820515,
280
- "loss": 0.2012,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 2.3076923076923075,
285
- "grad_norm": 3.4395174980163574,
286
  "learning_rate": 0.00015384615384615385,
287
- "loss": 0.276,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 2.371794871794872,
292
- "grad_norm": 5.862714767456055,
293
  "learning_rate": 0.00015256410256410255,
294
- "loss": 0.2513,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 2.435897435897436,
299
- "grad_norm": 3.349158763885498,
300
  "learning_rate": 0.00015128205128205128,
301
- "loss": 0.3182,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 2.5,
306
- "grad_norm": 4.547815799713135,
307
  "learning_rate": 0.00015000000000000001,
308
- "loss": 0.3878,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 2.564102564102564,
313
- "grad_norm": 12.249879837036133,
314
  "learning_rate": 0.00014871794871794872,
315
- "loss": 0.3055,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 2.564102564102564,
320
- "eval_accuracy": 0.9100719424460432,
321
- "eval_loss": 0.2733714282512665,
322
- "eval_runtime": 3.0904,
323
- "eval_samples_per_second": 89.955,
324
- "eval_steps_per_second": 11.325,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 2.628205128205128,
329
- "grad_norm": 3.7284679412841797,
330
  "learning_rate": 0.00014743589743589745,
331
- "loss": 0.2669,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 2.6923076923076925,
336
- "grad_norm": 2.6539294719696045,
337
  "learning_rate": 0.00014615384615384615,
338
- "loss": 0.2455,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 2.7564102564102564,
343
- "grad_norm": 0.7099685668945312,
344
  "learning_rate": 0.00014487179487179488,
345
- "loss": 0.3793,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 2.8205128205128203,
350
- "grad_norm": 3.6865317821502686,
351
  "learning_rate": 0.0001435897435897436,
352
- "loss": 0.2432,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 2.8846153846153846,
357
- "grad_norm": 5.858238697052002,
358
  "learning_rate": 0.0001423076923076923,
359
- "loss": 0.2655,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 2.948717948717949,
364
- "grad_norm": 0.48953068256378174,
365
  "learning_rate": 0.00014102564102564104,
366
- "loss": 0.2394,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 3.0128205128205128,
371
- "grad_norm": 0.3400702476501465,
372
  "learning_rate": 0.00013974358974358974,
373
- "loss": 0.2037,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 3.076923076923077,
378
- "grad_norm": 6.050922393798828,
379
  "learning_rate": 0.00013846153846153847,
380
- "loss": 0.1398,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 3.141025641025641,
385
- "grad_norm": 3.962301731109619,
386
  "learning_rate": 0.00013717948717948718,
387
- "loss": 0.2337,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 3.2051282051282053,
392
- "grad_norm": 0.4021244943141937,
393
  "learning_rate": 0.0001358974358974359,
394
- "loss": 0.2407,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 3.2051282051282053,
399
- "eval_accuracy": 0.9172661870503597,
400
- "eval_loss": 0.2732398509979248,
401
- "eval_runtime": 3.5213,
402
- "eval_samples_per_second": 78.948,
403
- "eval_steps_per_second": 9.94,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 3.269230769230769,
408
- "grad_norm": 0.1918177306652069,
409
  "learning_rate": 0.00013461538461538464,
410
- "loss": 0.0846,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 3.3333333333333335,
415
- "grad_norm": 0.13242504000663757,
416
  "learning_rate": 0.00013333333333333334,
417
- "loss": 0.1978,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 3.3974358974358974,
422
- "grad_norm": 6.946755409240723,
423
  "learning_rate": 0.00013205128205128204,
424
- "loss": 0.228,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 3.4615384615384617,
429
- "grad_norm": 7.6276421546936035,
430
  "learning_rate": 0.00013076923076923077,
431
- "loss": 0.18,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 3.5256410256410255,
436
- "grad_norm": 0.35523688793182373,
437
  "learning_rate": 0.0001294871794871795,
438
- "loss": 0.2565,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 3.58974358974359,
443
- "grad_norm": 4.415125370025635,
444
  "learning_rate": 0.00012820512820512823,
445
- "loss": 0.2047,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 3.6538461538461537,
450
- "grad_norm": 0.24991707503795624,
451
  "learning_rate": 0.00012692307692307693,
452
- "loss": 0.0867,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 3.717948717948718,
457
- "grad_norm": 3.979609489440918,
458
  "learning_rate": 0.00012564102564102564,
459
- "loss": 0.2181,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 3.782051282051282,
464
- "grad_norm": 2.4911768436431885,
465
  "learning_rate": 0.00012435897435897437,
466
- "loss": 0.2627,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 3.8461538461538463,
471
- "grad_norm": 0.17239753901958466,
472
  "learning_rate": 0.0001230769230769231,
473
- "loss": 0.1367,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 3.8461538461538463,
478
- "eval_accuracy": 0.8525179856115108,
479
- "eval_loss": 0.40619969367980957,
480
- "eval_runtime": 3.7715,
481
- "eval_samples_per_second": 73.71,
482
- "eval_steps_per_second": 9.28,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 3.91025641025641,
487
- "grad_norm": 5.575563907623291,
488
  "learning_rate": 0.00012179487179487179,
489
- "loss": 0.1483,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 3.9743589743589745,
494
- "grad_norm": 6.226486682891846,
495
  "learning_rate": 0.00012051282051282052,
496
- "loss": 0.1488,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 4.038461538461538,
501
- "grad_norm": 4.32499885559082,
502
  "learning_rate": 0.00011923076923076923,
503
- "loss": 0.1551,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 4.102564102564102,
508
- "grad_norm": 1.007263422012329,
509
  "learning_rate": 0.00011794871794871796,
510
- "loss": 0.0952,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 4.166666666666667,
515
- "grad_norm": 0.3149818480014801,
516
  "learning_rate": 0.00011666666666666668,
517
- "loss": 0.0885,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 4.230769230769231,
522
- "grad_norm": 0.10396721214056015,
523
  "learning_rate": 0.00011538461538461538,
524
- "loss": 0.0646,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 4.294871794871795,
529
- "grad_norm": 1.3003551959991455,
530
  "learning_rate": 0.0001141025641025641,
531
- "loss": 0.1037,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 4.358974358974359,
536
- "grad_norm": 0.16181178390979767,
537
  "learning_rate": 0.00011282051282051283,
538
- "loss": 0.0971,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 4.423076923076923,
543
- "grad_norm": 0.11648831516504288,
544
  "learning_rate": 0.00011153846153846154,
545
- "loss": 0.1578,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 4.487179487179487,
550
- "grad_norm": 7.352902889251709,
551
  "learning_rate": 0.00011025641025641027,
552
- "loss": 0.0943,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 4.487179487179487,
557
- "eval_accuracy": 0.9136690647482014,
558
- "eval_loss": 0.3187541365623474,
559
- "eval_runtime": 3.5063,
560
- "eval_samples_per_second": 79.287,
561
- "eval_steps_per_second": 9.982,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 4.551282051282051,
566
- "grad_norm": 0.08271457999944687,
567
  "learning_rate": 0.00010897435897435896,
568
- "loss": 0.1492,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 4.615384615384615,
573
- "grad_norm": 3.449918270111084,
574
  "learning_rate": 0.0001076923076923077,
575
- "loss": 0.045,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 4.67948717948718,
580
- "grad_norm": 13.875882148742676,
581
  "learning_rate": 0.00010641025641025641,
582
- "loss": 0.0635,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 4.743589743589744,
587
- "grad_norm": 0.1391572505235672,
588
  "learning_rate": 0.00010512820512820514,
589
- "loss": 0.0433,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 4.8076923076923075,
594
- "grad_norm": 0.06808628886938095,
595
  "learning_rate": 0.00010384615384615386,
596
- "loss": 0.0359,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 4.871794871794872,
601
- "grad_norm": 0.17365112900733948,
602
  "learning_rate": 0.00010256410256410256,
603
- "loss": 0.0235,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 4.935897435897436,
608
- "grad_norm": 0.15800704061985016,
609
  "learning_rate": 0.00010128205128205129,
610
- "loss": 0.0686,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 5.0,
615
- "grad_norm": 0.584504246711731,
616
  "learning_rate": 0.0001,
617
- "loss": 0.1028,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 5.064102564102564,
622
- "grad_norm": 5.587973594665527,
623
  "learning_rate": 9.871794871794872e-05,
624
- "loss": 0.0321,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 5.128205128205128,
629
- "grad_norm": 11.099601745605469,
630
  "learning_rate": 9.743589743589744e-05,
631
- "loss": 0.0938,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 5.128205128205128,
636
- "eval_accuracy": 0.9172661870503597,
637
- "eval_loss": 0.32109296321868896,
638
- "eval_runtime": 2.5127,
639
- "eval_samples_per_second": 110.638,
640
- "eval_steps_per_second": 13.929,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 5.1923076923076925,
645
- "grad_norm": 5.029045104980469,
646
  "learning_rate": 9.615384615384617e-05,
647
- "loss": 0.0282,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 5.256410256410256,
652
- "grad_norm": 0.06672952324151993,
653
  "learning_rate": 9.487179487179487e-05,
654
- "loss": 0.0204,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 5.32051282051282,
659
- "grad_norm": 0.3987838923931122,
660
  "learning_rate": 9.35897435897436e-05,
661
- "loss": 0.0289,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 5.384615384615385,
666
- "grad_norm": 0.05064750835299492,
667
  "learning_rate": 9.230769230769232e-05,
668
- "loss": 0.0088,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 5.448717948717949,
673
- "grad_norm": 9.289505004882812,
674
  "learning_rate": 9.102564102564103e-05,
675
- "loss": 0.0158,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 5.512820512820513,
680
- "grad_norm": 0.046484652906656265,
681
  "learning_rate": 8.974358974358975e-05,
682
- "loss": 0.0354,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 5.576923076923077,
687
- "grad_norm": 2.9247965812683105,
688
  "learning_rate": 8.846153846153847e-05,
689
- "loss": 0.0583,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 5.641025641025641,
694
- "grad_norm": 18.817678451538086,
695
  "learning_rate": 8.717948717948718e-05,
696
- "loss": 0.2382,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 5.705128205128205,
701
- "grad_norm": 0.03485775738954544,
702
  "learning_rate": 8.58974358974359e-05,
703
- "loss": 0.03,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 5.769230769230769,
708
- "grad_norm": 0.03652678430080414,
709
  "learning_rate": 8.461538461538461e-05,
710
- "loss": 0.0352,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 5.769230769230769,
715
- "eval_accuracy": 0.9280575539568345,
716
- "eval_loss": 0.30018866062164307,
717
- "eval_runtime": 2.4539,
718
- "eval_samples_per_second": 113.29,
719
- "eval_steps_per_second": 14.263,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 5.833333333333333,
724
- "grad_norm": 0.0848119929432869,
725
  "learning_rate": 8.333333333333334e-05,
726
- "loss": 0.0352,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 5.897435897435898,
731
- "grad_norm": 0.04110792279243469,
732
  "learning_rate": 8.205128205128205e-05,
733
- "loss": 0.009,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 5.961538461538462,
738
- "grad_norm": 0.0703083947300911,
739
  "learning_rate": 8.076923076923078e-05,
740
- "loss": 0.0361,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 6.0256410256410255,
745
- "grad_norm": 0.0352785587310791,
746
  "learning_rate": 7.948717948717948e-05,
747
- "loss": 0.0073,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 6.089743589743589,
752
- "grad_norm": 0.03985786810517311,
753
  "learning_rate": 7.820512820512821e-05,
754
- "loss": 0.0062,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 6.153846153846154,
759
- "grad_norm": 1.1201144456863403,
760
  "learning_rate": 7.692307692307693e-05,
761
- "loss": 0.0108,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 6.217948717948718,
766
- "grad_norm": 0.028453074395656586,
767
  "learning_rate": 7.564102564102564e-05,
768
- "loss": 0.0067,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 6.282051282051282,
773
- "grad_norm": 0.11250611394643784,
774
  "learning_rate": 7.435897435897436e-05,
775
- "loss": 0.0061,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 6.346153846153846,
780
- "grad_norm": 0.09943367540836334,
781
  "learning_rate": 7.307692307692307e-05,
782
- "loss": 0.0234,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 6.410256410256411,
787
- "grad_norm": 0.031824991106987,
788
  "learning_rate": 7.17948717948718e-05,
789
- "loss": 0.0054,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 6.410256410256411,
794
- "eval_accuracy": 0.9244604316546763,
795
- "eval_loss": 0.3863399028778076,
796
- "eval_runtime": 2.6052,
797
- "eval_samples_per_second": 106.709,
798
- "eval_steps_per_second": 13.435,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 6.4743589743589745,
803
- "grad_norm": 0.0259853545576334,
804
  "learning_rate": 7.051282051282052e-05,
805
- "loss": 0.0134,
806
  "step": 1010
807
  },
808
  {
809
  "epoch": 6.538461538461538,
810
- "grad_norm": 0.07778998464345932,
811
  "learning_rate": 6.923076923076924e-05,
812
- "loss": 0.0055,
813
  "step": 1020
814
  },
815
  {
816
  "epoch": 6.602564102564102,
817
- "grad_norm": 0.023598596453666687,
818
  "learning_rate": 6.794871794871795e-05,
819
- "loss": 0.009,
820
  "step": 1030
821
  },
822
  {
823
  "epoch": 6.666666666666667,
824
- "grad_norm": 0.02402086742222309,
825
  "learning_rate": 6.666666666666667e-05,
826
- "loss": 0.0294,
827
  "step": 1040
828
  },
829
  {
830
  "epoch": 6.730769230769231,
831
- "grad_norm": 0.02628781832754612,
832
  "learning_rate": 6.538461538461539e-05,
833
- "loss": 0.0055,
834
  "step": 1050
835
  },
836
  {
837
  "epoch": 6.794871794871795,
838
- "grad_norm": 0.021159937605261803,
839
  "learning_rate": 6.410256410256412e-05,
840
- "loss": 0.0051,
841
  "step": 1060
842
  },
843
  {
844
  "epoch": 6.858974358974359,
845
- "grad_norm": 0.027991948649287224,
846
  "learning_rate": 6.282051282051282e-05,
847
- "loss": 0.0044,
848
  "step": 1070
849
  },
850
  {
851
  "epoch": 6.923076923076923,
852
- "grad_norm": 0.021519368514418602,
853
  "learning_rate": 6.153846153846155e-05,
854
- "loss": 0.0481,
855
  "step": 1080
856
  },
857
  {
858
  "epoch": 6.987179487179487,
859
- "grad_norm": 0.02785063348710537,
860
  "learning_rate": 6.025641025641026e-05,
861
- "loss": 0.0048,
862
  "step": 1090
863
  },
864
  {
865
  "epoch": 7.051282051282051,
866
- "grad_norm": 0.029052695259451866,
867
  "learning_rate": 5.897435897435898e-05,
868
- "loss": 0.0397,
869
  "step": 1100
870
  },
871
  {
872
  "epoch": 7.051282051282051,
873
- "eval_accuracy": 0.9316546762589928,
874
- "eval_loss": 0.37904202938079834,
875
- "eval_runtime": 3.1468,
876
- "eval_samples_per_second": 88.343,
877
- "eval_steps_per_second": 11.122,
878
  "step": 1100
879
  },
880
  {
881
  "epoch": 7.115384615384615,
882
- "grad_norm": 0.02662370726466179,
883
  "learning_rate": 5.769230769230769e-05,
884
- "loss": 0.0052,
885
  "step": 1110
886
  },
887
  {
888
  "epoch": 7.17948717948718,
889
- "grad_norm": 0.0247439406812191,
890
  "learning_rate": 5.6410256410256414e-05,
891
- "loss": 0.0051,
892
  "step": 1120
893
  },
894
  {
895
  "epoch": 7.243589743589744,
896
- "grad_norm": 0.025030823424458504,
897
  "learning_rate": 5.512820512820514e-05,
898
- "loss": 0.0044,
899
  "step": 1130
900
  },
901
  {
902
  "epoch": 7.3076923076923075,
903
- "grad_norm": 0.030788838863372803,
904
  "learning_rate": 5.384615384615385e-05,
905
- "loss": 0.0044,
906
  "step": 1140
907
  },
908
  {
909
  "epoch": 7.371794871794872,
910
- "grad_norm": 0.07316362112760544,
911
  "learning_rate": 5.256410256410257e-05,
912
- "loss": 0.0043,
913
  "step": 1150
914
  },
915
  {
916
  "epoch": 7.435897435897436,
917
- "grad_norm": 0.024025099352002144,
918
  "learning_rate": 5.128205128205128e-05,
919
- "loss": 0.0041,
920
  "step": 1160
921
  },
922
  {
923
  "epoch": 7.5,
924
- "grad_norm": 0.019680393859744072,
925
  "learning_rate": 5e-05,
926
- "loss": 0.0044,
927
  "step": 1170
928
  },
929
  {
930
  "epoch": 7.564102564102564,
931
- "grad_norm": 0.02100115269422531,
932
  "learning_rate": 4.871794871794872e-05,
933
- "loss": 0.004,
934
  "step": 1180
935
  },
936
  {
937
  "epoch": 7.628205128205128,
938
- "grad_norm": 0.01914617232978344,
939
  "learning_rate": 4.7435897435897435e-05,
940
  "loss": 0.0037,
941
  "step": 1190
942
  },
943
  {
944
  "epoch": 7.6923076923076925,
945
- "grad_norm": 0.018958982080221176,
946
  "learning_rate": 4.615384615384616e-05,
947
- "loss": 0.0038,
948
  "step": 1200
949
  },
950
  {
951
  "epoch": 7.6923076923076925,
952
- "eval_accuracy": 0.9388489208633094,
953
- "eval_loss": 0.29680585861206055,
954
- "eval_runtime": 3.2229,
955
- "eval_samples_per_second": 86.259,
956
- "eval_steps_per_second": 10.86,
957
  "step": 1200
958
  },
959
  {
960
  "epoch": 7.756410256410256,
961
- "grad_norm": 0.01917438395321369,
962
  "learning_rate": 4.4871794871794874e-05,
963
- "loss": 0.0038,
964
  "step": 1210
965
  },
966
  {
967
  "epoch": 7.82051282051282,
968
- "grad_norm": 0.01674094796180725,
969
  "learning_rate": 4.358974358974359e-05,
970
- "loss": 0.004,
971
  "step": 1220
972
  },
973
  {
974
  "epoch": 7.884615384615385,
975
- "grad_norm": 0.01729915663599968,
976
  "learning_rate": 4.230769230769231e-05,
977
- "loss": 0.0037,
978
  "step": 1230
979
  },
980
  {
981
  "epoch": 7.948717948717949,
982
- "grad_norm": 0.02169210836291313,
983
  "learning_rate": 4.1025641025641023e-05,
984
- "loss": 0.0036,
985
  "step": 1240
986
  },
987
  {
988
  "epoch": 8.012820512820513,
989
- "grad_norm": 0.017752377316355705,
990
  "learning_rate": 3.974358974358974e-05,
991
- "loss": 0.0036,
992
  "step": 1250
993
  },
994
  {
995
  "epoch": 8.076923076923077,
996
- "grad_norm": 0.017430851235985756,
997
  "learning_rate": 3.846153846153846e-05,
998
- "loss": 0.0035,
999
  "step": 1260
1000
  },
1001
  {
1002
  "epoch": 8.14102564102564,
1003
- "grad_norm": 0.017766138538718224,
1004
  "learning_rate": 3.717948717948718e-05,
1005
- "loss": 0.0036,
1006
  "step": 1270
1007
  },
1008
  {
1009
  "epoch": 8.205128205128204,
1010
- "grad_norm": 0.016512656584382057,
1011
  "learning_rate": 3.58974358974359e-05,
1012
- "loss": 0.0034,
1013
  "step": 1280
1014
  },
1015
  {
1016
  "epoch": 8.26923076923077,
1017
- "grad_norm": 0.018264977261424065,
1018
  "learning_rate": 3.461538461538462e-05,
1019
- "loss": 0.0034,
1020
  "step": 1290
1021
  },
1022
  {
1023
  "epoch": 8.333333333333334,
1024
- "grad_norm": 0.017350930720567703,
1025
  "learning_rate": 3.3333333333333335e-05,
1026
- "loss": 0.0035,
1027
  "step": 1300
1028
  },
1029
  {
1030
  "epoch": 8.333333333333334,
1031
- "eval_accuracy": 0.935251798561151,
1032
- "eval_loss": 0.2936682403087616,
1033
- "eval_runtime": 2.3785,
1034
- "eval_samples_per_second": 116.881,
1035
- "eval_steps_per_second": 14.715,
1036
  "step": 1300
1037
  },
1038
  {
1039
  "epoch": 8.397435897435898,
1040
- "grad_norm": 0.017029928043484688,
1041
  "learning_rate": 3.205128205128206e-05,
1042
- "loss": 0.0032,
1043
  "step": 1310
1044
  },
1045
  {
1046
  "epoch": 8.461538461538462,
1047
- "grad_norm": 0.015666915103793144,
1048
  "learning_rate": 3.0769230769230774e-05,
1049
- "loss": 0.0032,
1050
  "step": 1320
1051
  },
1052
  {
1053
  "epoch": 8.525641025641026,
1054
- "grad_norm": 0.01642964966595173,
1055
  "learning_rate": 2.948717948717949e-05,
1056
- "loss": 0.0034,
1057
  "step": 1330
1058
  },
1059
  {
1060
  "epoch": 8.58974358974359,
1061
- "grad_norm": 0.016315065324306488,
1062
  "learning_rate": 2.8205128205128207e-05,
1063
- "loss": 0.0033,
1064
  "step": 1340
1065
  },
1066
  {
1067
  "epoch": 8.653846153846153,
1068
- "grad_norm": 0.014955148100852966,
1069
  "learning_rate": 2.6923076923076923e-05,
1070
- "loss": 0.0032,
1071
  "step": 1350
1072
  },
1073
  {
1074
  "epoch": 8.717948717948717,
1075
- "grad_norm": 0.016276659443974495,
1076
  "learning_rate": 2.564102564102564e-05,
1077
- "loss": 0.0031,
1078
  "step": 1360
1079
  },
1080
  {
1081
  "epoch": 8.782051282051283,
1082
- "grad_norm": 0.015878435224294662,
1083
  "learning_rate": 2.435897435897436e-05,
1084
- "loss": 0.0034,
1085
  "step": 1370
1086
  },
1087
  {
1088
  "epoch": 8.846153846153847,
1089
- "grad_norm": 0.015775034204125404,
1090
  "learning_rate": 2.307692307692308e-05,
1091
- "loss": 0.0029,
1092
  "step": 1380
1093
  },
1094
  {
1095
  "epoch": 8.91025641025641,
1096
- "grad_norm": 0.025302419438958168,
1097
  "learning_rate": 2.1794871794871795e-05,
1098
- "loss": 0.0031,
1099
  "step": 1390
1100
  },
1101
  {
1102
  "epoch": 8.974358974358974,
1103
- "grad_norm": 0.01465103030204773,
1104
  "learning_rate": 2.0512820512820512e-05,
1105
  "loss": 0.003,
1106
  "step": 1400
1107
  },
1108
  {
1109
  "epoch": 8.974358974358974,
1110
- "eval_accuracy": 0.9388489208633094,
1111
- "eval_loss": 0.3025781810283661,
1112
- "eval_runtime": 2.9166,
1113
- "eval_samples_per_second": 95.318,
1114
- "eval_steps_per_second": 12.0,
1115
  "step": 1400
1116
  },
1117
  {
1118
  "epoch": 9.038461538461538,
1119
- "grad_norm": 0.016850067302584648,
1120
  "learning_rate": 1.923076923076923e-05,
1121
- "loss": 0.0032,
1122
  "step": 1410
1123
  },
1124
  {
1125
  "epoch": 9.102564102564102,
1126
- "grad_norm": 0.017839446663856506,
1127
  "learning_rate": 1.794871794871795e-05,
1128
- "loss": 0.0033,
1129
  "step": 1420
1130
  },
1131
  {
1132
  "epoch": 9.166666666666666,
1133
- "grad_norm": 0.019131546840071678,
1134
  "learning_rate": 1.6666666666666667e-05,
1135
- "loss": 0.0032,
1136
  "step": 1430
1137
  },
1138
  {
1139
  "epoch": 9.23076923076923,
1140
- "grad_norm": 0.014986937865614891,
1141
  "learning_rate": 1.5384615384615387e-05,
1142
- "loss": 0.003,
1143
  "step": 1440
1144
  },
1145
  {
1146
  "epoch": 9.294871794871796,
1147
- "grad_norm": 0.014039918780326843,
1148
  "learning_rate": 1.4102564102564104e-05,
1149
- "loss": 0.0029,
1150
  "step": 1450
1151
  },
1152
  {
1153
  "epoch": 9.35897435897436,
1154
- "grad_norm": 0.015353621914982796,
1155
  "learning_rate": 1.282051282051282e-05,
1156
- "loss": 0.0029,
1157
  "step": 1460
1158
  },
1159
  {
1160
  "epoch": 9.423076923076923,
1161
- "grad_norm": 0.01887168549001217,
1162
  "learning_rate": 1.153846153846154e-05,
1163
- "loss": 0.0031,
1164
  "step": 1470
1165
  },
1166
  {
1167
  "epoch": 9.487179487179487,
1168
- "grad_norm": 0.01612565666437149,
1169
  "learning_rate": 1.0256410256410256e-05,
1170
- "loss": 0.003,
1171
  "step": 1480
1172
  },
1173
  {
1174
  "epoch": 9.551282051282051,
1175
- "grad_norm": 0.014622623100876808,
1176
  "learning_rate": 8.974358974358976e-06,
1177
- "loss": 0.003,
1178
  "step": 1490
1179
  },
1180
  {
1181
  "epoch": 9.615384615384615,
1182
- "grad_norm": 0.015224343165755272,
1183
  "learning_rate": 7.692307692307694e-06,
1184
- "loss": 0.0031,
1185
  "step": 1500
1186
  },
1187
  {
1188
  "epoch": 9.615384615384615,
1189
- "eval_accuracy": 0.9388489208633094,
1190
- "eval_loss": 0.3090037703514099,
1191
- "eval_runtime": 2.3915,
1192
- "eval_samples_per_second": 116.245,
1193
- "eval_steps_per_second": 14.635,
1194
  "step": 1500
1195
  },
1196
  {
1197
  "epoch": 9.679487179487179,
1198
- "grad_norm": 0.013898891396820545,
1199
  "learning_rate": 6.41025641025641e-06,
1200
- "loss": 0.0029,
1201
  "step": 1510
1202
  },
1203
  {
1204
  "epoch": 9.743589743589745,
1205
- "grad_norm": 0.014191491529345512,
1206
  "learning_rate": 5.128205128205128e-06,
1207
- "loss": 0.003,
1208
  "step": 1520
1209
  },
1210
  {
1211
  "epoch": 9.807692307692308,
1212
- "grad_norm": 0.01462490577250719,
1213
  "learning_rate": 3.846153846153847e-06,
1214
  "loss": 0.0029,
1215
  "step": 1530
1216
  },
1217
  {
1218
  "epoch": 9.871794871794872,
1219
- "grad_norm": 0.019841287285089493,
1220
  "learning_rate": 2.564102564102564e-06,
1221
- "loss": 0.0029,
1222
  "step": 1540
1223
  },
1224
  {
1225
  "epoch": 9.935897435897436,
1226
- "grad_norm": 0.015159196220338345,
1227
  "learning_rate": 1.282051282051282e-06,
1228
- "loss": 0.003,
1229
  "step": 1550
1230
  },
1231
  {
1232
  "epoch": 10.0,
1233
- "grad_norm": 0.014964770525693893,
1234
  "learning_rate": 0.0,
1235
- "loss": 0.0031,
1236
  "step": 1560
1237
  },
1238
  {
1239
  "epoch": 10.0,
1240
  "step": 1560,
1241
  "total_flos": 1.9334597982400512e+18,
1242
- "train_loss": 0.21612570865485722,
1243
- "train_runtime": 723.2579,
1244
- "train_samples_per_second": 34.497,
1245
- "train_steps_per_second": 2.157
1246
  }
1247
  ],
1248
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.330095499753952,
3
+ "best_model_checkpoint": "vit-weldclassifyv4/checkpoint-1000",
4
  "epoch": 10.0,
5
  "eval_steps": 100,
6
  "global_step": 1560,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0641025641025641,
13
+ "grad_norm": 1.5280327796936035,
14
  "learning_rate": 0.00019871794871794874,
15
+ "loss": 1.263,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.1282051282051282,
20
+ "grad_norm": 1.8514498472213745,
21
  "learning_rate": 0.00019743589743589744,
22
+ "loss": 1.1159,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.19230769230769232,
27
+ "grad_norm": 1.2513030767440796,
28
  "learning_rate": 0.00019615384615384615,
29
+ "loss": 1.164,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2564102564102564,
34
+ "grad_norm": 2.280437707901001,
35
  "learning_rate": 0.00019487179487179487,
36
+ "loss": 0.9528,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.32051282051282054,
41
+ "grad_norm": 1.246656060218811,
42
  "learning_rate": 0.0001935897435897436,
43
+ "loss": 1.0313,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.38461538461538464,
48
+ "grad_norm": 1.9358172416687012,
49
  "learning_rate": 0.00019230769230769233,
50
+ "loss": 1.038,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.44871794871794873,
55
+ "grad_norm": 2.415847063064575,
56
  "learning_rate": 0.00019102564102564104,
57
+ "loss": 0.9088,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.5128205128205128,
62
+ "grad_norm": 2.1359400749206543,
63
  "learning_rate": 0.00018974358974358974,
64
+ "loss": 0.8265,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.5769230769230769,
69
+ "grad_norm": 2.2672863006591797,
70
  "learning_rate": 0.00018846153846153847,
71
+ "loss": 0.8904,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.6410256410256411,
76
+ "grad_norm": 2.525250196456909,
77
  "learning_rate": 0.0001871794871794872,
78
+ "loss": 0.8207,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.6410256410256411,
83
+ "eval_accuracy": 0.564748201438849,
84
+ "eval_loss": 1.0335605144500732,
85
+ "eval_runtime": 2.4754,
86
+ "eval_samples_per_second": 112.303,
87
+ "eval_steps_per_second": 14.139,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.7051282051282052,
92
+ "grad_norm": 1.6615636348724365,
93
  "learning_rate": 0.0001858974358974359,
94
+ "loss": 0.9254,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.7692307692307693,
99
+ "grad_norm": 3.1376407146453857,
100
  "learning_rate": 0.00018461538461538463,
101
+ "loss": 0.8125,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.8333333333333334,
106
+ "grad_norm": 1.8494981527328491,
107
  "learning_rate": 0.00018333333333333334,
108
+ "loss": 0.6967,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.8974358974358975,
113
+ "grad_norm": 2.7374682426452637,
114
  "learning_rate": 0.00018205128205128207,
115
+ "loss": 0.7031,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.9615384615384616,
120
+ "grad_norm": 2.8239476680755615,
121
  "learning_rate": 0.00018076923076923077,
122
+ "loss": 0.7871,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.0256410256410255,
127
+ "grad_norm": 2.239936590194702,
128
  "learning_rate": 0.0001794871794871795,
129
+ "loss": 0.8064,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.0897435897435896,
134
+ "grad_norm": 2.60086727142334,
135
  "learning_rate": 0.00017820512820512823,
136
+ "loss": 0.5349,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.1538461538461537,
141
+ "grad_norm": 3.485903024673462,
142
  "learning_rate": 0.00017692307692307693,
143
+ "loss": 0.5896,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.217948717948718,
148
+ "grad_norm": 3.2817769050598145,
149
  "learning_rate": 0.00017564102564102566,
150
+ "loss": 0.729,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.282051282051282,
155
+ "grad_norm": 4.110422611236572,
156
  "learning_rate": 0.00017435897435897436,
157
+ "loss": 0.6506,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.282051282051282,
162
+ "eval_accuracy": 0.579136690647482,
163
+ "eval_loss": 1.1981723308563232,
164
+ "eval_runtime": 2.8799,
165
+ "eval_samples_per_second": 96.532,
166
+ "eval_steps_per_second": 12.153,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 1.3461538461538463,
171
+ "grad_norm": 2.515904664993286,
172
  "learning_rate": 0.0001730769230769231,
173
+ "loss": 0.8231,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 1.4102564102564101,
178
+ "grad_norm": 2.2017431259155273,
179
  "learning_rate": 0.0001717948717948718,
180
+ "loss": 0.6216,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 1.4743589743589745,
185
+ "grad_norm": 2.253706693649292,
186
  "learning_rate": 0.00017051282051282053,
187
+ "loss": 0.5126,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 1.5384615384615383,
192
+ "grad_norm": 3.4106080532073975,
193
  "learning_rate": 0.00016923076923076923,
194
+ "loss": 0.471,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.6025641025641026,
199
+ "grad_norm": 1.884901523590088,
200
  "learning_rate": 0.00016794871794871796,
201
+ "loss": 0.4521,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.6666666666666665,
206
+ "grad_norm": 3.056332588195801,
207
  "learning_rate": 0.0001666666666666667,
208
+ "loss": 0.3808,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.7307692307692308,
213
+ "grad_norm": 3.2792108058929443,
214
  "learning_rate": 0.0001653846153846154,
215
+ "loss": 0.3568,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.7948717948717947,
220
+ "grad_norm": 3.657397985458374,
221
  "learning_rate": 0.0001641025641025641,
222
+ "loss": 0.4942,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.858974358974359,
227
+ "grad_norm": 3.565870523452759,
228
  "learning_rate": 0.00016282051282051282,
229
+ "loss": 0.5117,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.9230769230769231,
234
+ "grad_norm": 1.8648629188537598,
235
  "learning_rate": 0.00016153846153846155,
236
+ "loss": 0.5324,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.9230769230769231,
241
+ "eval_accuracy": 0.7769784172661871,
242
+ "eval_loss": 0.605965256690979,
243
+ "eval_runtime": 2.4146,
244
+ "eval_samples_per_second": 115.134,
245
+ "eval_steps_per_second": 14.495,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 1.9871794871794872,
250
+ "grad_norm": 5.665286540985107,
251
  "learning_rate": 0.00016025641025641028,
252
+ "loss": 0.5566,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 2.051282051282051,
257
+ "grad_norm": 0.6339452266693115,
258
  "learning_rate": 0.00015897435897435896,
259
+ "loss": 0.3058,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 2.1153846153846154,
264
+ "grad_norm": 2.2966978549957275,
265
  "learning_rate": 0.0001576923076923077,
266
+ "loss": 0.2831,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 2.1794871794871793,
271
+ "grad_norm": 2.510307550430298,
272
  "learning_rate": 0.00015641025641025642,
273
+ "loss": 0.2055,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 2.2435897435897436,
278
+ "grad_norm": 1.1059399843215942,
279
  "learning_rate": 0.00015512820512820515,
280
+ "loss": 0.2886,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 2.3076923076923075,
285
+ "grad_norm": 6.055357933044434,
286
  "learning_rate": 0.00015384615384615385,
287
+ "loss": 0.3756,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 2.371794871794872,
292
+ "grad_norm": 0.5536957383155823,
293
  "learning_rate": 0.00015256410256410255,
294
+ "loss": 0.412,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 2.435897435897436,
299
+ "grad_norm": 5.276978969573975,
300
  "learning_rate": 0.00015128205128205128,
301
+ "loss": 0.1798,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 2.5,
306
+ "grad_norm": 2.25166916847229,
307
  "learning_rate": 0.00015000000000000001,
308
+ "loss": 0.1813,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 2.564102564102564,
313
+ "grad_norm": 4.955526351928711,
314
  "learning_rate": 0.00014871794871794872,
315
+ "loss": 0.2486,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 2.564102564102564,
320
+ "eval_accuracy": 0.7517985611510791,
321
+ "eval_loss": 0.729444682598114,
322
+ "eval_runtime": 4.0175,
323
+ "eval_samples_per_second": 69.198,
324
+ "eval_steps_per_second": 8.712,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 2.628205128205128,
329
+ "grad_norm": 5.6987690925598145,
330
  "learning_rate": 0.00014743589743589745,
331
+ "loss": 0.5142,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 2.6923076923076925,
336
+ "grad_norm": 0.589967668056488,
337
  "learning_rate": 0.00014615384615384615,
338
+ "loss": 0.2685,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 2.7564102564102564,
343
+ "grad_norm": 2.2702548503875732,
344
  "learning_rate": 0.00014487179487179488,
345
+ "loss": 0.3104,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 2.8205128205128203,
350
+ "grad_norm": 4.440503120422363,
351
  "learning_rate": 0.0001435897435897436,
352
+ "loss": 0.2192,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 2.8846153846153846,
357
+ "grad_norm": 1.690927267074585,
358
  "learning_rate": 0.0001423076923076923,
359
+ "loss": 0.2557,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 2.948717948717949,
364
+ "grad_norm": 9.020477294921875,
365
  "learning_rate": 0.00014102564102564104,
366
+ "loss": 0.3725,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 3.0128205128205128,
371
+ "grad_norm": 1.131715178489685,
372
  "learning_rate": 0.00013974358974358974,
373
+ "loss": 0.3586,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 3.076923076923077,
378
+ "grad_norm": 2.3979876041412354,
379
  "learning_rate": 0.00013846153846153847,
380
+ "loss": 0.1712,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 3.141025641025641,
385
+ "grad_norm": 1.2889968156814575,
386
  "learning_rate": 0.00013717948717948718,
387
+ "loss": 0.1988,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 3.2051282051282053,
392
+ "grad_norm": 2.893319606781006,
393
  "learning_rate": 0.0001358974358974359,
394
+ "loss": 0.1366,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 3.2051282051282053,
399
+ "eval_accuracy": 0.841726618705036,
400
+ "eval_loss": 0.4832339882850647,
401
+ "eval_runtime": 2.588,
402
+ "eval_samples_per_second": 107.42,
403
+ "eval_steps_per_second": 13.524,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 3.269230769230769,
408
+ "grad_norm": 3.6555581092834473,
409
  "learning_rate": 0.00013461538461538464,
410
+ "loss": 0.1222,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 3.3333333333333335,
415
+ "grad_norm": 0.1904444396495819,
416
  "learning_rate": 0.00013333333333333334,
417
+ "loss": 0.1654,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 3.3974358974358974,
422
+ "grad_norm": 4.902673244476318,
423
  "learning_rate": 0.00013205128205128204,
424
+ "loss": 0.198,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 3.4615384615384617,
429
+ "grad_norm": 0.30183860659599304,
430
  "learning_rate": 0.00013076923076923077,
431
+ "loss": 0.2074,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 3.5256410256410255,
436
+ "grad_norm": 4.17673397064209,
437
  "learning_rate": 0.0001294871794871795,
438
+ "loss": 0.1021,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 3.58974358974359,
443
+ "grad_norm": 1.6145508289337158,
444
  "learning_rate": 0.00012820512820512823,
445
+ "loss": 0.1074,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 3.6538461538461537,
450
+ "grad_norm": 4.717573165893555,
451
  "learning_rate": 0.00012692307692307693,
452
+ "loss": 0.1201,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 3.717948717948718,
457
+ "grad_norm": 1.2709864377975464,
458
  "learning_rate": 0.00012564102564102564,
459
+ "loss": 0.0544,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 3.782051282051282,
464
+ "grad_norm": 3.7621912956237793,
465
  "learning_rate": 0.00012435897435897437,
466
+ "loss": 0.2016,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 3.8461538461538463,
471
+ "grad_norm": 12.426462173461914,
472
  "learning_rate": 0.0001230769230769231,
473
+ "loss": 0.3124,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 3.8461538461538463,
478
+ "eval_accuracy": 0.762589928057554,
479
+ "eval_loss": 0.8676345348358154,
480
+ "eval_runtime": 3.0816,
481
+ "eval_samples_per_second": 90.213,
482
+ "eval_steps_per_second": 11.358,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 3.91025641025641,
487
+ "grad_norm": 10.93652057647705,
488
  "learning_rate": 0.00012179487179487179,
489
+ "loss": 0.2992,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 3.9743589743589745,
494
+ "grad_norm": 0.6971213221549988,
495
  "learning_rate": 0.00012051282051282052,
496
+ "loss": 0.1864,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 4.038461538461538,
501
+ "grad_norm": 6.531364917755127,
502
  "learning_rate": 0.00011923076923076923,
503
+ "loss": 0.1929,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 4.102564102564102,
508
+ "grad_norm": 0.8437137007713318,
509
  "learning_rate": 0.00011794871794871796,
510
+ "loss": 0.0816,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 4.166666666666667,
515
+ "grad_norm": 9.23108196258545,
516
  "learning_rate": 0.00011666666666666668,
517
+ "loss": 0.0654,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 4.230769230769231,
522
+ "grad_norm": 1.1913517713546753,
523
  "learning_rate": 0.00011538461538461538,
524
+ "loss": 0.0974,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 4.294871794871795,
529
+ "grad_norm": 8.05540657043457,
530
  "learning_rate": 0.0001141025641025641,
531
+ "loss": 0.0466,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 4.358974358974359,
536
+ "grad_norm": 0.1012343019247055,
537
  "learning_rate": 0.00011282051282051283,
538
+ "loss": 0.0641,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 4.423076923076923,
543
+ "grad_norm": 7.817044734954834,
544
  "learning_rate": 0.00011153846153846154,
545
+ "loss": 0.1442,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 4.487179487179487,
550
+ "grad_norm": 6.788941860198975,
551
  "learning_rate": 0.00011025641025641027,
552
+ "loss": 0.0296,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 4.487179487179487,
557
+ "eval_accuracy": 0.8884892086330936,
558
+ "eval_loss": 0.4233308434486389,
559
+ "eval_runtime": 2.523,
560
+ "eval_samples_per_second": 110.184,
561
+ "eval_steps_per_second": 13.872,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 4.551282051282051,
566
+ "grad_norm": 4.860511302947998,
567
  "learning_rate": 0.00010897435897435896,
568
+ "loss": 0.0231,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 4.615384615384615,
573
+ "grad_norm": 0.9598804116249084,
574
  "learning_rate": 0.0001076923076923077,
575
+ "loss": 0.0207,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 4.67948717948718,
580
+ "grad_norm": 12.745481491088867,
581
  "learning_rate": 0.00010641025641025641,
582
+ "loss": 0.1516,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 4.743589743589744,
587
+ "grad_norm": 7.983795166015625,
588
  "learning_rate": 0.00010512820512820514,
589
+ "loss": 0.1638,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 4.8076923076923075,
594
+ "grad_norm": 3.051384449005127,
595
  "learning_rate": 0.00010384615384615386,
596
+ "loss": 0.0477,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 4.871794871794872,
601
+ "grad_norm": 0.10625698417425156,
602
  "learning_rate": 0.00010256410256410256,
603
+ "loss": 0.0719,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 4.935897435897436,
608
+ "grad_norm": 0.04624614119529724,
609
  "learning_rate": 0.00010128205128205129,
610
+ "loss": 0.1069,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 5.0,
615
+ "grad_norm": 0.08277003467082977,
616
  "learning_rate": 0.0001,
617
+ "loss": 0.0152,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 5.064102564102564,
622
+ "grad_norm": 0.09980784356594086,
623
  "learning_rate": 9.871794871794872e-05,
624
+ "loss": 0.0719,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 5.128205128205128,
629
+ "grad_norm": 0.09162779897451401,
630
  "learning_rate": 9.743589743589744e-05,
631
+ "loss": 0.0723,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 5.128205128205128,
636
+ "eval_accuracy": 0.8848920863309353,
637
+ "eval_loss": 0.4469863176345825,
638
+ "eval_runtime": 2.7699,
639
+ "eval_samples_per_second": 100.363,
640
+ "eval_steps_per_second": 12.636,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 5.1923076923076925,
645
+ "grad_norm": 0.05171818658709526,
646
  "learning_rate": 9.615384615384617e-05,
647
+ "loss": 0.0192,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 5.256410256410256,
652
+ "grad_norm": 0.05209165811538696,
653
  "learning_rate": 9.487179487179487e-05,
654
+ "loss": 0.0394,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 5.32051282051282,
659
+ "grad_norm": 0.960054874420166,
660
  "learning_rate": 9.35897435897436e-05,
661
+ "loss": 0.0129,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 5.384615384615385,
666
+ "grad_norm": 0.09233374148607254,
667
  "learning_rate": 9.230769230769232e-05,
668
+ "loss": 0.0138,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 5.448717948717949,
673
+ "grad_norm": 0.09635169804096222,
674
  "learning_rate": 9.102564102564103e-05,
675
+ "loss": 0.0096,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 5.512820512820513,
680
+ "grad_norm": 1.3777004480361938,
681
  "learning_rate": 8.974358974358975e-05,
682
+ "loss": 0.0412,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 5.576923076923077,
687
+ "grad_norm": 0.03339802846312523,
688
  "learning_rate": 8.846153846153847e-05,
689
+ "loss": 0.0424,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 5.641025641025641,
694
+ "grad_norm": 0.032307617366313934,
695
  "learning_rate": 8.717948717948718e-05,
696
+ "loss": 0.0161,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 5.705128205128205,
701
+ "grad_norm": 0.03049471788108349,
702
  "learning_rate": 8.58974358974359e-05,
703
+ "loss": 0.0388,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 5.769230769230769,
708
+ "grad_norm": 0.05182625725865364,
709
  "learning_rate": 8.461538461538461e-05,
710
+ "loss": 0.0342,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 5.769230769230769,
715
+ "eval_accuracy": 0.9172661870503597,
716
+ "eval_loss": 0.3406282067298889,
717
+ "eval_runtime": 2.3863,
718
+ "eval_samples_per_second": 116.5,
719
+ "eval_steps_per_second": 14.667,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 5.833333333333333,
724
+ "grad_norm": 0.2365674525499344,
725
  "learning_rate": 8.333333333333334e-05,
726
+ "loss": 0.0837,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 5.897435897435898,
731
+ "grad_norm": 0.031284429132938385,
732
  "learning_rate": 8.205128205128205e-05,
733
+ "loss": 0.0691,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 5.961538461538462,
738
+ "grad_norm": 10.787687301635742,
739
  "learning_rate": 8.076923076923078e-05,
740
+ "loss": 0.0524,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 6.0256410256410255,
745
+ "grad_norm": 0.027590090408921242,
746
  "learning_rate": 7.948717948717948e-05,
747
+ "loss": 0.0086,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 6.089743589743589,
752
+ "grad_norm": 0.04675084725022316,
753
  "learning_rate": 7.820512820512821e-05,
754
+ "loss": 0.0066,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 6.153846153846154,
759
+ "grad_norm": 0.032889507710933685,
760
  "learning_rate": 7.692307692307693e-05,
761
+ "loss": 0.0432,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 6.217948717948718,
766
+ "grad_norm": 0.1580750048160553,
767
  "learning_rate": 7.564102564102564e-05,
768
+ "loss": 0.006,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 6.282051282051282,
773
+ "grad_norm": 0.024286190047860146,
774
  "learning_rate": 7.435897435897436e-05,
775
+ "loss": 0.0123,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 6.346153846153846,
780
+ "grad_norm": 0.02685542032122612,
781
  "learning_rate": 7.307692307692307e-05,
782
+ "loss": 0.0059,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 6.410256410256411,
787
+ "grad_norm": 0.9080101251602173,
788
  "learning_rate": 7.17948717948718e-05,
789
+ "loss": 0.0055,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 6.410256410256411,
794
+ "eval_accuracy": 0.920863309352518,
795
+ "eval_loss": 0.330095499753952,
796
+ "eval_runtime": 3.1626,
797
+ "eval_samples_per_second": 87.904,
798
+ "eval_steps_per_second": 11.067,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 6.4743589743589745,
803
+ "grad_norm": 0.156268909573555,
804
  "learning_rate": 7.051282051282052e-05,
805
+ "loss": 0.0051,
806
  "step": 1010
807
  },
808
  {
809
  "epoch": 6.538461538461538,
810
+ "grad_norm": 0.025522593408823013,
811
  "learning_rate": 6.923076923076924e-05,
812
+ "loss": 0.0175,
813
  "step": 1020
814
  },
815
  {
816
  "epoch": 6.602564102564102,
817
+ "grad_norm": 0.025892965495586395,
818
  "learning_rate": 6.794871794871795e-05,
819
+ "loss": 0.0133,
820
  "step": 1030
821
  },
822
  {
823
  "epoch": 6.666666666666667,
824
+ "grad_norm": 0.02324897050857544,
825
  "learning_rate": 6.666666666666667e-05,
826
+ "loss": 0.0051,
827
  "step": 1040
828
  },
829
  {
830
  "epoch": 6.730769230769231,
831
+ "grad_norm": 0.20136423408985138,
832
  "learning_rate": 6.538461538461539e-05,
833
+ "loss": 0.0318,
834
  "step": 1050
835
  },
836
  {
837
  "epoch": 6.794871794871795,
838
+ "grad_norm": 0.11247438937425613,
839
  "learning_rate": 6.410256410256412e-05,
840
+ "loss": 0.0331,
841
  "step": 1060
842
  },
843
  {
844
  "epoch": 6.858974358974359,
845
+ "grad_norm": 0.10950164496898651,
846
  "learning_rate": 6.282051282051282e-05,
847
+ "loss": 0.0055,
848
  "step": 1070
849
  },
850
  {
851
  "epoch": 6.923076923076923,
852
+ "grad_norm": 1.537802815437317,
853
  "learning_rate": 6.153846153846155e-05,
854
+ "loss": 0.0055,
855
  "step": 1080
856
  },
857
  {
858
  "epoch": 6.987179487179487,
859
+ "grad_norm": 0.023923929780721664,
860
  "learning_rate": 6.025641025641026e-05,
861
+ "loss": 0.0044,
862
  "step": 1090
863
  },
864
  {
865
  "epoch": 7.051282051282051,
866
+ "grad_norm": 0.02083686552941799,
867
  "learning_rate": 5.897435897435898e-05,
868
+ "loss": 0.0048,
869
  "step": 1100
870
  },
871
  {
872
  "epoch": 7.051282051282051,
873
+ "eval_accuracy": 0.9172661870503597,
874
+ "eval_loss": 0.3471122980117798,
875
+ "eval_runtime": 2.4758,
876
+ "eval_samples_per_second": 112.287,
877
+ "eval_steps_per_second": 14.137,
878
  "step": 1100
879
  },
880
  {
881
  "epoch": 7.115384615384615,
882
+ "grad_norm": 0.020538046956062317,
883
  "learning_rate": 5.769230769230769e-05,
884
+ "loss": 0.0042,
885
  "step": 1110
886
  },
887
  {
888
  "epoch": 7.17948717948718,
889
+ "grad_norm": 0.01733437366783619,
890
  "learning_rate": 5.6410256410256414e-05,
891
+ "loss": 0.0039,
892
  "step": 1120
893
  },
894
  {
895
  "epoch": 7.243589743589744,
896
+ "grad_norm": 0.01968984678387642,
897
  "learning_rate": 5.512820512820514e-05,
898
+ "loss": 0.0038,
899
  "step": 1130
900
  },
901
  {
902
  "epoch": 7.3076923076923075,
903
+ "grad_norm": 0.019213447347283363,
904
  "learning_rate": 5.384615384615385e-05,
905
+ "loss": 0.0036,
906
  "step": 1140
907
  },
908
  {
909
  "epoch": 7.371794871794872,
910
+ "grad_norm": 0.017935629934072495,
911
  "learning_rate": 5.256410256410257e-05,
912
+ "loss": 0.004,
913
  "step": 1150
914
  },
915
  {
916
  "epoch": 7.435897435897436,
917
+ "grad_norm": 0.01726532354950905,
918
  "learning_rate": 5.128205128205128e-05,
919
+ "loss": 0.0038,
920
  "step": 1160
921
  },
922
  {
923
  "epoch": 7.5,
924
+ "grad_norm": 0.01753012090921402,
925
  "learning_rate": 5e-05,
926
+ "loss": 0.0038,
927
  "step": 1170
928
  },
929
  {
930
  "epoch": 7.564102564102564,
931
+ "grad_norm": 0.018105851486325264,
932
  "learning_rate": 4.871794871794872e-05,
933
+ "loss": 0.0036,
934
  "step": 1180
935
  },
936
  {
937
  "epoch": 7.628205128205128,
938
+ "grad_norm": 0.019911447539925575,
939
  "learning_rate": 4.7435897435897435e-05,
940
  "loss": 0.0037,
941
  "step": 1190
942
  },
943
  {
944
  "epoch": 7.6923076923076925,
945
+ "grad_norm": 0.023634430021047592,
946
  "learning_rate": 4.615384615384616e-05,
947
+ "loss": 0.0036,
948
  "step": 1200
949
  },
950
  {
951
  "epoch": 7.6923076923076925,
952
+ "eval_accuracy": 0.9136690647482014,
953
+ "eval_loss": 0.33460894227027893,
954
+ "eval_runtime": 2.4621,
955
+ "eval_samples_per_second": 112.91,
956
+ "eval_steps_per_second": 14.215,
957
  "step": 1200
958
  },
959
  {
960
  "epoch": 7.756410256410256,
961
+ "grad_norm": 0.017936883494257927,
962
  "learning_rate": 4.4871794871794874e-05,
963
+ "loss": 0.0034,
964
  "step": 1210
965
  },
966
  {
967
  "epoch": 7.82051282051282,
968
+ "grad_norm": 0.01885095238685608,
969
  "learning_rate": 4.358974358974359e-05,
970
+ "loss": 0.0034,
971
  "step": 1220
972
  },
973
  {
974
  "epoch": 7.884615384615385,
975
+ "grad_norm": 0.017711780965328217,
976
  "learning_rate": 4.230769230769231e-05,
977
+ "loss": 0.0033,
978
  "step": 1230
979
  },
980
  {
981
  "epoch": 7.948717948717949,
982
+ "grad_norm": 0.014750463888049126,
983
  "learning_rate": 4.1025641025641023e-05,
984
+ "loss": 0.0034,
985
  "step": 1240
986
  },
987
  {
988
  "epoch": 8.012820512820513,
989
+ "grad_norm": 0.014598443172872066,
990
  "learning_rate": 3.974358974358974e-05,
991
+ "loss": 0.0031,
992
  "step": 1250
993
  },
994
  {
995
  "epoch": 8.076923076923077,
996
+ "grad_norm": 0.01595359854400158,
997
  "learning_rate": 3.846153846153846e-05,
998
+ "loss": 0.0032,
999
  "step": 1260
1000
  },
1001
  {
1002
  "epoch": 8.14102564102564,
1003
+ "grad_norm": 0.01710698939859867,
1004
  "learning_rate": 3.717948717948718e-05,
1005
+ "loss": 0.0032,
1006
  "step": 1270
1007
  },
1008
  {
1009
  "epoch": 8.205128205128204,
1010
+ "grad_norm": 0.015550950542092323,
1011
  "learning_rate": 3.58974358974359e-05,
1012
+ "loss": 0.0031,
1013
  "step": 1280
1014
  },
1015
  {
1016
  "epoch": 8.26923076923077,
1017
+ "grad_norm": 0.015512553043663502,
1018
  "learning_rate": 3.461538461538462e-05,
1019
+ "loss": 0.0031,
1020
  "step": 1290
1021
  },
1022
  {
1023
  "epoch": 8.333333333333334,
1024
+ "grad_norm": 0.01687728427350521,
1025
  "learning_rate": 3.3333333333333335e-05,
1026
+ "loss": 0.003,
1027
  "step": 1300
1028
  },
1029
  {
1030
  "epoch": 8.333333333333334,
1031
+ "eval_accuracy": 0.9136690647482014,
1032
+ "eval_loss": 0.34976544976234436,
1033
+ "eval_runtime": 2.3973,
1034
+ "eval_samples_per_second": 115.962,
1035
+ "eval_steps_per_second": 14.599,
1036
  "step": 1300
1037
  },
1038
  {
1039
  "epoch": 8.397435897435898,
1040
+ "grad_norm": 0.017616627737879753,
1041
  "learning_rate": 3.205128205128206e-05,
1042
+ "loss": 0.0031,
1043
  "step": 1310
1044
  },
1045
  {
1046
  "epoch": 8.461538461538462,
1047
+ "grad_norm": 0.017452212050557137,
1048
  "learning_rate": 3.0769230769230774e-05,
1049
+ "loss": 0.0031,
1050
  "step": 1320
1051
  },
1052
  {
1053
  "epoch": 8.525641025641026,
1054
+ "grad_norm": 0.017976053059101105,
1055
  "learning_rate": 2.948717948717949e-05,
1056
+ "loss": 0.0032,
1057
  "step": 1330
1058
  },
1059
  {
1060
  "epoch": 8.58974358974359,
1061
+ "grad_norm": 0.014091568998992443,
1062
  "learning_rate": 2.8205128205128207e-05,
1063
+ "loss": 0.0032,
1064
  "step": 1340
1065
  },
1066
  {
1067
  "epoch": 8.653846153846153,
1068
+ "grad_norm": 0.015703728422522545,
1069
  "learning_rate": 2.6923076923076923e-05,
1070
+ "loss": 0.003,
1071
  "step": 1350
1072
  },
1073
  {
1074
  "epoch": 8.717948717948717,
1075
+ "grad_norm": 0.01781061850488186,
1076
  "learning_rate": 2.564102564102564e-05,
1077
+ "loss": 0.003,
1078
  "step": 1360
1079
  },
1080
  {
1081
  "epoch": 8.782051282051283,
1082
+ "grad_norm": 0.01647392474114895,
1083
  "learning_rate": 2.435897435897436e-05,
1084
+ "loss": 0.0031,
1085
  "step": 1370
1086
  },
1087
  {
1088
  "epoch": 8.846153846153847,
1089
+ "grad_norm": 0.013498615473508835,
1090
  "learning_rate": 2.307692307692308e-05,
1091
+ "loss": 0.0028,
1092
  "step": 1380
1093
  },
1094
  {
1095
  "epoch": 8.91025641025641,
1096
+ "grad_norm": 0.016175739467144012,
1097
  "learning_rate": 2.1794871794871795e-05,
1098
+ "loss": 0.003,
1099
  "step": 1390
1100
  },
1101
  {
1102
  "epoch": 8.974358974358974,
1103
+ "grad_norm": 0.015950603410601616,
1104
  "learning_rate": 2.0512820512820512e-05,
1105
  "loss": 0.003,
1106
  "step": 1400
1107
  },
1108
  {
1109
  "epoch": 8.974358974358974,
1110
+ "eval_accuracy": 0.9100719424460432,
1111
+ "eval_loss": 0.354926198720932,
1112
+ "eval_runtime": 2.4324,
1113
+ "eval_samples_per_second": 114.29,
1114
+ "eval_steps_per_second": 14.389,
1115
  "step": 1400
1116
  },
1117
  {
1118
  "epoch": 9.038461538461538,
1119
+ "grad_norm": 0.014547958970069885,
1120
  "learning_rate": 1.923076923076923e-05,
1121
+ "loss": 0.0029,
1122
  "step": 1410
1123
  },
1124
  {
1125
  "epoch": 9.102564102564102,
1126
+ "grad_norm": 0.012941875495016575,
1127
  "learning_rate": 1.794871794871795e-05,
1128
+ "loss": 0.0028,
1129
  "step": 1420
1130
  },
1131
  {
1132
  "epoch": 9.166666666666666,
1133
+ "grad_norm": 0.016635097563266754,
1134
  "learning_rate": 1.6666666666666667e-05,
1135
+ "loss": 0.003,
1136
  "step": 1430
1137
  },
1138
  {
1139
  "epoch": 9.23076923076923,
1140
+ "grad_norm": 0.018657604232430458,
1141
  "learning_rate": 1.5384615384615387e-05,
1142
+ "loss": 0.0029,
1143
  "step": 1440
1144
  },
1145
  {
1146
  "epoch": 9.294871794871796,
1147
+ "grad_norm": 0.015006115660071373,
1148
  "learning_rate": 1.4102564102564104e-05,
1149
+ "loss": 0.0031,
1150
  "step": 1450
1151
  },
1152
  {
1153
  "epoch": 9.35897435897436,
1154
+ "grad_norm": 0.01575641520321369,
1155
  "learning_rate": 1.282051282051282e-05,
1156
+ "loss": 0.0027,
1157
  "step": 1460
1158
  },
1159
  {
1160
  "epoch": 9.423076923076923,
1161
+ "grad_norm": 0.013228046707808971,
1162
  "learning_rate": 1.153846153846154e-05,
1163
+ "loss": 0.0027,
1164
  "step": 1470
1165
  },
1166
  {
1167
  "epoch": 9.487179487179487,
1168
+ "grad_norm": 0.013002932071685791,
1169
  "learning_rate": 1.0256410256410256e-05,
1170
+ "loss": 0.0027,
1171
  "step": 1480
1172
  },
1173
  {
1174
  "epoch": 9.551282051282051,
1175
+ "grad_norm": 0.014644928276538849,
1176
  "learning_rate": 8.974358974358976e-06,
1177
+ "loss": 0.0029,
1178
  "step": 1490
1179
  },
1180
  {
1181
  "epoch": 9.615384615384615,
1182
+ "grad_norm": 0.01448275987058878,
1183
  "learning_rate": 7.692307692307694e-06,
1184
+ "loss": 0.0027,
1185
  "step": 1500
1186
  },
1187
  {
1188
  "epoch": 9.615384615384615,
1189
+ "eval_accuracy": 0.9136690647482014,
1190
+ "eval_loss": 0.3569168150424957,
1191
+ "eval_runtime": 2.7846,
1192
+ "eval_samples_per_second": 99.835,
1193
+ "eval_steps_per_second": 12.569,
1194
  "step": 1500
1195
  },
1196
  {
1197
  "epoch": 9.679487179487179,
1198
+ "grad_norm": 0.012788016349077225,
1199
  "learning_rate": 6.41025641025641e-06,
1200
+ "loss": 0.0028,
1201
  "step": 1510
1202
  },
1203
  {
1204
  "epoch": 9.743589743589745,
1205
+ "grad_norm": 0.014576306566596031,
1206
  "learning_rate": 5.128205128205128e-06,
1207
+ "loss": 0.0026,
1208
  "step": 1520
1209
  },
1210
  {
1211
  "epoch": 9.807692307692308,
1212
+ "grad_norm": 0.014699741266667843,
1213
  "learning_rate": 3.846153846153847e-06,
1214
  "loss": 0.0029,
1215
  "step": 1530
1216
  },
1217
  {
1218
  "epoch": 9.871794871794872,
1219
+ "grad_norm": 0.015379097312688828,
1220
  "learning_rate": 2.564102564102564e-06,
1221
+ "loss": 0.0026,
1222
  "step": 1540
1223
  },
1224
  {
1225
  "epoch": 9.935897435897436,
1226
+ "grad_norm": 0.012021095491945744,
1227
  "learning_rate": 1.282051282051282e-06,
1228
+ "loss": 0.0027,
1229
  "step": 1550
1230
  },
1231
  {
1232
  "epoch": 10.0,
1233
+ "grad_norm": 0.014515766873955727,
1234
  "learning_rate": 0.0,
1235
+ "loss": 0.0028,
1236
  "step": 1560
1237
  },
1238
  {
1239
  "epoch": 10.0,
1240
  "step": 1560,
1241
  "total_flos": 1.9334597982400512e+18,
1242
+ "train_loss": 0.20911514231314263,
1243
+ "train_runtime": 603.1201,
1244
+ "train_samples_per_second": 41.368,
1245
+ "train_steps_per_second": 2.587
1246
  }
1247
  ],
1248
  "logging_steps": 10,