Encore02 commited on
Commit
f060fd9
1 Parent(s): 1a3c6e9

🍻 cheers

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
 
6
  - generated_from_trainer
7
  datasets:
8
  - imagefolder
@@ -23,7 +24,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.7830188679245284
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +34,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 1.1463
37
- - Accuracy: 0.7830
38
 
39
  ## Model description
40
 
 
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
6
+ - image-classification
7
  - generated_from_trainer
8
  datasets:
9
  - imagefolder
 
24
  metrics:
25
  - name: Accuracy
26
  type: accuracy
27
+ value: 0.6745283018867925
28
  ---
29
 
30
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
34
 
35
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
36
  It achieves the following results on the evaluation set:
37
+ - Loss: 0.7452
38
+ - Accuracy: 0.6745
39
 
40
  ## Model description
41
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 13.0,
3
- "eval_accuracy": 0.7793594306049823,
4
- "eval_loss": 0.6139788031578064,
5
- "eval_runtime": 4.3698,
6
- "eval_samples_per_second": 64.304,
7
- "eval_steps_per_second": 8.238,
8
- "total_flos": 2.5427127414770565e+18,
9
- "train_loss": 0.21512354467338007,
10
- "train_runtime": 1043.0352,
11
- "train_samples_per_second": 31.458,
12
- "train_steps_per_second": 1.969
13
  }
 
1
  {
2
  "epoch": 13.0,
3
+ "eval_accuracy": 0.6745283018867925,
4
+ "eval_loss": 0.7452366948127747,
5
+ "eval_runtime": 2.5059,
6
+ "eval_samples_per_second": 84.599,
7
+ "eval_steps_per_second": 10.774,
8
+ "total_flos": 1.9140864535683072e+18,
9
+ "train_loss": 0.35688263059153663,
10
+ "train_runtime": 955.328,
11
+ "train_samples_per_second": 25.855,
12
+ "train_steps_per_second": 1.619
13
  }
data/events.out.tfevents.1730611272.07f6fc948a6b.436.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b63372cd34724660f122424e445f8909de85a62b82dc5b17c298bf39f592829d
3
+ size 411
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 13.0,
3
- "eval_accuracy": 0.7793594306049823,
4
- "eval_loss": 0.6139788031578064,
5
- "eval_runtime": 4.3698,
6
- "eval_samples_per_second": 64.304,
7
- "eval_steps_per_second": 8.238
8
  }
 
1
  {
2
  "epoch": 13.0,
3
+ "eval_accuracy": 0.6745283018867925,
4
+ "eval_loss": 0.7452366948127747,
5
+ "eval_runtime": 2.5059,
6
+ "eval_samples_per_second": 84.599,
7
+ "eval_steps_per_second": 10.774
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 13.0,
3
- "total_flos": 2.5427127414770565e+18,
4
- "train_loss": 0.21512354467338007,
5
- "train_runtime": 1043.0352,
6
- "train_samples_per_second": 31.458,
7
- "train_steps_per_second": 1.969
8
  }
 
1
  {
2
  "epoch": 13.0,
3
+ "total_flos": 1.9140864535683072e+18,
4
+ "train_loss": 0.35688263059153663,
5
+ "train_runtime": 955.328,
6
+ "train_samples_per_second": 25.855,
7
+ "train_steps_per_second": 1.619
8
  }
trainer_state.json CHANGED
@@ -1,1640 +1,1238 @@
1
  {
2
- "best_metric": 0.6139788031578064,
3
- "best_model_checkpoint": "vit-weldclassifyv4/checkpoint-500",
4
  "epoch": 13.0,
5
  "eval_steps": 100,
6
- "global_step": 2054,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.06329113924050633,
13
- "grad_norm": 1.8231099843978882,
14
- "learning_rate": 0.00019902629016553067,
15
- "loss": 1.238,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.12658227848101267,
20
- "grad_norm": 0.7930148243904114,
21
- "learning_rate": 0.00019805258033106136,
22
- "loss": 1.2652,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.189873417721519,
27
- "grad_norm": 2.1662611961364746,
28
- "learning_rate": 0.00019707887049659202,
29
- "loss": 1.1927,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.25316455696202533,
34
- "grad_norm": 0.9586331248283386,
35
- "learning_rate": 0.0001961051606621227,
36
- "loss": 1.2159,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.31645569620253167,
41
- "grad_norm": 1.0158694982528687,
42
- "learning_rate": 0.00019513145082765337,
43
- "loss": 1.1697,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.379746835443038,
48
- "grad_norm": 2.5109598636627197,
49
- "learning_rate": 0.00019415774099318405,
50
- "loss": 1.1731,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.4430379746835443,
55
- "grad_norm": 1.9428660869598389,
56
- "learning_rate": 0.0001931840311587147,
57
- "loss": 1.0786,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.5063291139240507,
62
- "grad_norm": 1.6625088453292847,
63
- "learning_rate": 0.0001922103213242454,
64
- "loss": 1.0579,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.569620253164557,
69
- "grad_norm": 2.1774094104766846,
70
- "learning_rate": 0.00019123661148977606,
71
- "loss": 1.008,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.6329113924050633,
76
- "grad_norm": 2.2633728981018066,
77
- "learning_rate": 0.00019026290165530675,
78
- "loss": 0.9281,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.6329113924050633,
83
- "eval_accuracy": 0.5907473309608541,
84
- "eval_loss": 0.9793357849121094,
85
- "eval_runtime": 2.9407,
86
- "eval_samples_per_second": 95.555,
87
- "eval_steps_per_second": 12.242,
88
  "step": 100
89
  },
90
  {
91
- "epoch": 0.6962025316455697,
92
- "grad_norm": 1.4851062297821045,
93
- "learning_rate": 0.0001892891918208374,
94
- "loss": 0.8375,
95
  "step": 110
96
  },
97
  {
98
- "epoch": 0.759493670886076,
99
- "grad_norm": 3.343125820159912,
100
- "learning_rate": 0.00018831548198636807,
101
- "loss": 0.9856,
102
  "step": 120
103
  },
104
  {
105
- "epoch": 0.8227848101265823,
106
- "grad_norm": 1.3608458042144775,
107
- "learning_rate": 0.00018734177215189873,
108
- "loss": 0.9197,
109
  "step": 130
110
  },
111
  {
112
- "epoch": 0.8860759493670886,
113
- "grad_norm": 2.313962459564209,
114
- "learning_rate": 0.00018636806231742942,
115
- "loss": 0.8262,
116
  "step": 140
117
  },
118
  {
119
- "epoch": 0.9493670886075949,
120
- "grad_norm": 1.8323599100112915,
121
- "learning_rate": 0.00018539435248296008,
122
- "loss": 1.006,
123
  "step": 150
124
  },
125
  {
126
- "epoch": 1.0126582278481013,
127
- "grad_norm": 3.7629854679107666,
128
- "learning_rate": 0.00018442064264849074,
129
- "loss": 1.0289,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 1.0759493670886076,
134
- "grad_norm": 1.4926854372024536,
135
- "learning_rate": 0.00018344693281402142,
136
- "loss": 0.7291,
137
  "step": 170
138
  },
139
  {
140
- "epoch": 1.139240506329114,
141
- "grad_norm": 3.3833086490631104,
142
- "learning_rate": 0.00018247322297955209,
143
- "loss": 0.8654,
144
  "step": 180
145
  },
146
  {
147
- "epoch": 1.2025316455696202,
148
- "grad_norm": 3.208197593688965,
149
- "learning_rate": 0.00018149951314508277,
150
- "loss": 0.685,
151
  "step": 190
152
  },
153
  {
154
- "epoch": 1.2658227848101267,
155
- "grad_norm": 2.0218262672424316,
156
- "learning_rate": 0.00018052580331061343,
157
- "loss": 0.6894,
158
  "step": 200
159
  },
160
  {
161
- "epoch": 1.2658227848101267,
162
- "eval_accuracy": 0.6868327402135231,
163
- "eval_loss": 0.7116755843162537,
164
- "eval_runtime": 5.624,
165
- "eval_samples_per_second": 49.964,
166
- "eval_steps_per_second": 6.401,
167
  "step": 200
168
  },
169
  {
170
- "epoch": 1.3291139240506329,
171
- "grad_norm": 1.8904668092727661,
172
- "learning_rate": 0.00017955209347614412,
173
- "loss": 0.648,
174
  "step": 210
175
  },
176
  {
177
- "epoch": 1.3924050632911391,
178
- "grad_norm": 2.065814971923828,
179
- "learning_rate": 0.00017857838364167478,
180
- "loss": 0.6942,
181
  "step": 220
182
  },
183
  {
184
- "epoch": 1.4556962025316456,
185
- "grad_norm": 1.779307246208191,
186
- "learning_rate": 0.00017760467380720547,
187
- "loss": 0.6617,
188
  "step": 230
189
  },
190
  {
191
- "epoch": 1.518987341772152,
192
- "grad_norm": 4.920161724090576,
193
- "learning_rate": 0.00017663096397273613,
194
- "loss": 0.7961,
195
  "step": 240
196
  },
197
  {
198
- "epoch": 1.5822784810126582,
199
- "grad_norm": 2.617550849914551,
200
- "learning_rate": 0.00017565725413826682,
201
- "loss": 0.623,
202
  "step": 250
203
  },
204
  {
205
- "epoch": 1.6455696202531644,
206
- "grad_norm": 2.2660183906555176,
207
- "learning_rate": 0.00017468354430379748,
208
- "loss": 0.706,
209
  "step": 260
210
  },
211
  {
212
- "epoch": 1.7088607594936709,
213
- "grad_norm": 3.3759207725524902,
214
- "learning_rate": 0.00017370983446932816,
215
- "loss": 0.6196,
216
  "step": 270
217
  },
218
  {
219
- "epoch": 1.7721518987341773,
220
- "grad_norm": 2.01643967628479,
221
- "learning_rate": 0.00017273612463485882,
222
- "loss": 0.6502,
223
  "step": 280
224
  },
225
  {
226
- "epoch": 1.8354430379746836,
227
- "grad_norm": 3.487635374069214,
228
- "learning_rate": 0.00017176241480038948,
229
- "loss": 0.7236,
230
  "step": 290
231
  },
232
  {
233
- "epoch": 1.8987341772151898,
234
- "grad_norm": 2.225860595703125,
235
- "learning_rate": 0.00017078870496592017,
236
- "loss": 0.6074,
237
  "step": 300
238
  },
239
  {
240
- "epoch": 1.8987341772151898,
241
- "eval_accuracy": 0.693950177935943,
242
- "eval_loss": 0.7030865550041199,
243
- "eval_runtime": 2.9563,
244
- "eval_samples_per_second": 95.052,
245
- "eval_steps_per_second": 12.178,
246
  "step": 300
247
  },
248
  {
249
- "epoch": 1.9620253164556962,
250
- "grad_norm": 2.5282366275787354,
251
- "learning_rate": 0.00016981499513145083,
252
- "loss": 0.6951,
253
  "step": 310
254
  },
255
  {
256
- "epoch": 2.0253164556962027,
257
- "grad_norm": 2.416698932647705,
258
- "learning_rate": 0.00016884128529698152,
259
- "loss": 0.4789,
260
  "step": 320
261
  },
262
  {
263
- "epoch": 2.088607594936709,
264
- "grad_norm": 3.2094521522521973,
265
- "learning_rate": 0.00016786757546251218,
266
- "loss": 0.4443,
267
  "step": 330
268
  },
269
  {
270
- "epoch": 2.151898734177215,
271
- "grad_norm": 3.365262031555176,
272
- "learning_rate": 0.00016689386562804287,
273
- "loss": 0.4303,
274
  "step": 340
275
  },
276
  {
277
- "epoch": 2.2151898734177213,
278
- "grad_norm": 1.8025457859039307,
279
- "learning_rate": 0.00016592015579357353,
280
- "loss": 0.4212,
281
  "step": 350
282
  },
283
  {
284
- "epoch": 2.278481012658228,
285
- "grad_norm": 2.566657304763794,
286
- "learning_rate": 0.0001649464459591042,
287
- "loss": 0.491,
288
  "step": 360
289
  },
290
  {
291
- "epoch": 2.3417721518987342,
292
- "grad_norm": 1.4255659580230713,
293
- "learning_rate": 0.00016397273612463488,
294
- "loss": 0.4005,
295
  "step": 370
296
  },
297
  {
298
- "epoch": 2.4050632911392404,
299
- "grad_norm": 1.4234439134597778,
300
- "learning_rate": 0.00016299902629016554,
301
- "loss": 0.3792,
302
  "step": 380
303
  },
304
  {
305
- "epoch": 2.4683544303797467,
306
- "grad_norm": 3.924708604812622,
307
- "learning_rate": 0.0001620253164556962,
308
- "loss": 0.4056,
309
  "step": 390
310
  },
311
  {
312
- "epoch": 2.5316455696202533,
313
- "grad_norm": 3.877371072769165,
314
- "learning_rate": 0.00016105160662122688,
315
- "loss": 0.5389,
316
  "step": 400
317
  },
318
  {
319
- "epoch": 2.5316455696202533,
320
- "eval_accuracy": 0.7330960854092526,
321
- "eval_loss": 0.6998230218887329,
322
- "eval_runtime": 3.6859,
323
- "eval_samples_per_second": 76.237,
324
- "eval_steps_per_second": 9.767,
325
  "step": 400
326
  },
327
  {
328
- "epoch": 2.5949367088607596,
329
- "grad_norm": 4.078298091888428,
330
- "learning_rate": 0.00016007789678675754,
331
- "loss": 0.5042,
332
  "step": 410
333
  },
334
  {
335
- "epoch": 2.6582278481012658,
336
- "grad_norm": 2.0867278575897217,
337
- "learning_rate": 0.00015910418695228823,
338
- "loss": 0.674,
339
  "step": 420
340
  },
341
  {
342
- "epoch": 2.721518987341772,
343
- "grad_norm": 4.390700340270996,
344
- "learning_rate": 0.0001581304771178189,
345
- "loss": 0.5261,
346
  "step": 430
347
  },
348
  {
349
- "epoch": 2.7848101265822782,
350
- "grad_norm": 3.191239833831787,
351
- "learning_rate": 0.00015715676728334955,
352
- "loss": 0.4011,
353
  "step": 440
354
  },
355
  {
356
- "epoch": 2.848101265822785,
357
- "grad_norm": 0.9980124235153198,
358
- "learning_rate": 0.00015618305744888024,
359
- "loss": 0.3694,
360
  "step": 450
361
  },
362
  {
363
- "epoch": 2.911392405063291,
364
- "grad_norm": 2.584527015686035,
365
- "learning_rate": 0.0001552093476144109,
366
- "loss": 0.4619,
367
  "step": 460
368
  },
369
  {
370
- "epoch": 2.9746835443037973,
371
- "grad_norm": 2.5247104167938232,
372
- "learning_rate": 0.0001542356377799416,
373
- "loss": 0.3618,
374
  "step": 470
375
  },
376
  {
377
- "epoch": 3.037974683544304,
378
- "grad_norm": 3.4078636169433594,
379
- "learning_rate": 0.00015326192794547225,
380
- "loss": 0.3053,
381
  "step": 480
382
  },
383
  {
384
- "epoch": 3.1012658227848102,
385
- "grad_norm": 0.3695339858531952,
386
- "learning_rate": 0.00015228821811100294,
387
- "loss": 0.1686,
388
  "step": 490
389
  },
390
  {
391
- "epoch": 3.1645569620253164,
392
- "grad_norm": 2.54807186126709,
393
- "learning_rate": 0.0001513145082765336,
394
- "loss": 0.2922,
395
  "step": 500
396
  },
397
  {
398
- "epoch": 3.1645569620253164,
399
- "eval_accuracy": 0.7793594306049823,
400
- "eval_loss": 0.6139788031578064,
401
- "eval_runtime": 3.2606,
402
- "eval_samples_per_second": 86.179,
403
- "eval_steps_per_second": 11.041,
404
  "step": 500
405
  },
406
  {
407
- "epoch": 3.2278481012658227,
408
- "grad_norm": 2.1469414234161377,
409
- "learning_rate": 0.00015034079844206428,
410
- "loss": 0.1459,
411
  "step": 510
412
  },
413
  {
414
- "epoch": 3.291139240506329,
415
- "grad_norm": 2.7631657123565674,
416
- "learning_rate": 0.00014936708860759494,
417
- "loss": 0.2772,
418
  "step": 520
419
  },
420
  {
421
- "epoch": 3.3544303797468356,
422
- "grad_norm": 1.3479973077774048,
423
- "learning_rate": 0.00014839337877312563,
424
- "loss": 0.2665,
425
  "step": 530
426
  },
427
  {
428
- "epoch": 3.4177215189873418,
429
- "grad_norm": 0.4051523506641388,
430
- "learning_rate": 0.0001474196689386563,
431
- "loss": 0.2579,
432
  "step": 540
433
  },
434
  {
435
- "epoch": 3.481012658227848,
436
- "grad_norm": 3.5712709426879883,
437
- "learning_rate": 0.00014644595910418698,
438
- "loss": 0.2286,
439
  "step": 550
440
  },
441
  {
442
- "epoch": 3.5443037974683547,
443
- "grad_norm": 7.3584418296813965,
444
- "learning_rate": 0.00014547224926971764,
445
- "loss": 0.1577,
446
  "step": 560
447
  },
448
  {
449
- "epoch": 3.607594936708861,
450
- "grad_norm": 3.242455005645752,
451
- "learning_rate": 0.0001444985394352483,
452
- "loss": 0.2248,
453
  "step": 570
454
  },
455
  {
456
- "epoch": 3.670886075949367,
457
- "grad_norm": 2.749756336212158,
458
- "learning_rate": 0.000143524829600779,
459
- "loss": 0.242,
460
  "step": 580
461
  },
462
  {
463
- "epoch": 3.7341772151898733,
464
- "grad_norm": 5.646878719329834,
465
- "learning_rate": 0.00014255111976630965,
466
- "loss": 0.2622,
467
  "step": 590
468
  },
469
  {
470
- "epoch": 3.7974683544303796,
471
- "grad_norm": 1.418591856956482,
472
- "learning_rate": 0.00014157740993184033,
473
- "loss": 0.2661,
474
  "step": 600
475
  },
476
  {
477
- "epoch": 3.7974683544303796,
478
- "eval_accuracy": 0.7117437722419929,
479
- "eval_loss": 0.8140489459037781,
480
- "eval_runtime": 3.8296,
481
- "eval_samples_per_second": 73.376,
482
- "eval_steps_per_second": 9.4,
483
  "step": 600
484
  },
485
  {
486
- "epoch": 3.8607594936708862,
487
- "grad_norm": 3.7535617351531982,
488
- "learning_rate": 0.000140603700097371,
489
- "loss": 0.3347,
490
  "step": 610
491
  },
492
  {
493
- "epoch": 3.9240506329113924,
494
- "grad_norm": 3.35208797454834,
495
- "learning_rate": 0.00013962999026290165,
496
- "loss": 0.2483,
497
  "step": 620
498
  },
499
  {
500
- "epoch": 3.9873417721518987,
501
- "grad_norm": 5.050980567932129,
502
- "learning_rate": 0.00013865628042843232,
503
- "loss": 0.3502,
504
  "step": 630
505
  },
506
  {
507
- "epoch": 4.050632911392405,
508
- "grad_norm": 3.208113193511963,
509
- "learning_rate": 0.000137682570593963,
510
- "loss": 0.1731,
511
  "step": 640
512
  },
513
  {
514
- "epoch": 4.113924050632911,
515
- "grad_norm": 1.6071431636810303,
516
- "learning_rate": 0.00013670886075949366,
517
- "loss": 0.2249,
518
  "step": 650
519
  },
520
  {
521
- "epoch": 4.177215189873418,
522
- "grad_norm": 4.170581817626953,
523
- "learning_rate": 0.00013573515092502435,
524
- "loss": 0.2121,
525
  "step": 660
526
  },
527
  {
528
- "epoch": 4.2405063291139244,
529
- "grad_norm": 2.5262327194213867,
530
- "learning_rate": 0.000134761441090555,
531
- "loss": 0.1614,
532
  "step": 670
533
  },
534
  {
535
- "epoch": 4.30379746835443,
536
- "grad_norm": 3.41585373878479,
537
- "learning_rate": 0.0001337877312560857,
538
- "loss": 0.1355,
539
  "step": 680
540
  },
541
  {
542
- "epoch": 4.367088607594937,
543
- "grad_norm": 0.5682180523872375,
544
- "learning_rate": 0.00013281402142161636,
545
- "loss": 0.1756,
546
  "step": 690
547
  },
548
  {
549
- "epoch": 4.430379746835443,
550
- "grad_norm": 7.918303966522217,
551
- "learning_rate": 0.00013184031158714705,
552
- "loss": 0.1547,
553
  "step": 700
554
  },
555
  {
556
- "epoch": 4.430379746835443,
557
- "eval_accuracy": 0.7188612099644128,
558
- "eval_loss": 0.858184278011322,
559
- "eval_runtime": 2.8445,
560
- "eval_samples_per_second": 98.788,
561
- "eval_steps_per_second": 12.656,
562
  "step": 700
563
  },
564
  {
565
- "epoch": 4.493670886075949,
566
- "grad_norm": 0.23097099363803864,
567
- "learning_rate": 0.0001308666017526777,
568
- "loss": 0.1554,
569
  "step": 710
570
  },
571
  {
572
- "epoch": 4.556962025316456,
573
- "grad_norm": 0.948228657245636,
574
- "learning_rate": 0.00012989289191820837,
575
- "loss": 0.1475,
576
  "step": 720
577
  },
578
  {
579
- "epoch": 4.620253164556962,
580
- "grad_norm": 4.432514667510986,
581
- "learning_rate": 0.00012891918208373905,
582
- "loss": 0.248,
583
  "step": 730
584
  },
585
  {
586
- "epoch": 4.6835443037974684,
587
- "grad_norm": 1.4464466571807861,
588
- "learning_rate": 0.00012794547224926971,
589
- "loss": 0.1209,
590
  "step": 740
591
  },
592
  {
593
- "epoch": 4.746835443037975,
594
- "grad_norm": 1.1032408475875854,
595
- "learning_rate": 0.0001269717624148004,
596
- "loss": 0.1258,
597
  "step": 750
598
  },
599
  {
600
- "epoch": 4.810126582278481,
601
- "grad_norm": 4.464413166046143,
602
- "learning_rate": 0.00012599805258033106,
603
- "loss": 0.1769,
604
  "step": 760
605
  },
606
  {
607
- "epoch": 4.8734177215189876,
608
- "grad_norm": 4.2749738693237305,
609
- "learning_rate": 0.00012502434274586175,
610
- "loss": 0.1222,
611
  "step": 770
612
  },
613
  {
614
- "epoch": 4.936708860759493,
615
- "grad_norm": 5.434940814971924,
616
- "learning_rate": 0.0001240506329113924,
617
- "loss": 0.1473,
618
  "step": 780
619
  },
620
  {
621
- "epoch": 5.0,
622
- "grad_norm": 0.17561954259872437,
623
- "learning_rate": 0.0001230769230769231,
624
- "loss": 0.1127,
625
  "step": 790
626
  },
627
  {
628
- "epoch": 5.063291139240507,
629
- "grad_norm": 4.6776442527771,
630
- "learning_rate": 0.00012210321324245376,
631
- "loss": 0.1047,
632
  "step": 800
633
  },
634
  {
635
- "epoch": 5.063291139240507,
636
- "eval_accuracy": 0.800711743772242,
637
- "eval_loss": 0.7366448640823364,
638
- "eval_runtime": 2.9584,
639
- "eval_samples_per_second": 94.983,
640
- "eval_steps_per_second": 12.169,
641
  "step": 800
642
  },
643
  {
644
- "epoch": 5.1265822784810124,
645
- "grad_norm": 0.8573622703552246,
646
- "learning_rate": 0.00012112950340798443,
647
- "loss": 0.0739,
648
  "step": 810
649
  },
650
  {
651
- "epoch": 5.189873417721519,
652
- "grad_norm": 0.6034272909164429,
653
- "learning_rate": 0.00012015579357351509,
654
- "loss": 0.06,
655
  "step": 820
656
  },
657
  {
658
- "epoch": 5.253164556962025,
659
- "grad_norm": 0.07069756090641022,
660
- "learning_rate": 0.00011918208373904578,
661
- "loss": 0.0496,
662
  "step": 830
663
  },
664
  {
665
- "epoch": 5.3164556962025316,
666
- "grad_norm": 0.3019968867301941,
667
- "learning_rate": 0.00011820837390457644,
668
- "loss": 0.0825,
669
  "step": 840
670
  },
671
  {
672
- "epoch": 5.379746835443038,
673
- "grad_norm": 0.38130983710289,
674
- "learning_rate": 0.0001172346640701071,
675
- "loss": 0.0706,
676
  "step": 850
677
  },
678
  {
679
- "epoch": 5.443037974683544,
680
- "grad_norm": 0.086518295109272,
681
- "learning_rate": 0.00011626095423563779,
682
- "loss": 0.0317,
683
  "step": 860
684
  },
685
  {
686
- "epoch": 5.506329113924051,
687
- "grad_norm": 3.27998948097229,
688
- "learning_rate": 0.00011528724440116845,
689
- "loss": 0.0563,
690
  "step": 870
691
  },
692
  {
693
- "epoch": 5.569620253164557,
694
- "grad_norm": 0.3357122242450714,
695
- "learning_rate": 0.00011431353456669914,
696
- "loss": 0.0963,
697
  "step": 880
698
  },
699
  {
700
- "epoch": 5.632911392405063,
701
- "grad_norm": 4.3751373291015625,
702
- "learning_rate": 0.0001133398247322298,
703
- "loss": 0.0903,
704
  "step": 890
705
  },
706
  {
707
- "epoch": 5.69620253164557,
708
- "grad_norm": 4.348201751708984,
709
- "learning_rate": 0.00011236611489776048,
710
- "loss": 0.0672,
711
  "step": 900
712
  },
713
  {
714
- "epoch": 5.69620253164557,
715
- "eval_accuracy": 0.7366548042704626,
716
- "eval_loss": 1.0770440101623535,
717
- "eval_runtime": 3.7673,
718
- "eval_samples_per_second": 74.589,
719
- "eval_steps_per_second": 9.556,
720
  "step": 900
721
  },
722
  {
723
- "epoch": 5.759493670886076,
724
- "grad_norm": 0.5488149523735046,
725
- "learning_rate": 0.00011139240506329114,
726
- "loss": 0.1904,
727
  "step": 910
728
  },
729
  {
730
- "epoch": 5.822784810126582,
731
- "grad_norm": 0.07205292582511902,
732
- "learning_rate": 0.00011041869522882182,
733
- "loss": 0.0929,
734
  "step": 920
735
  },
736
  {
737
- "epoch": 5.886075949367089,
738
- "grad_norm": 4.9658002853393555,
739
- "learning_rate": 0.00010944498539435248,
740
- "loss": 0.1066,
741
  "step": 930
742
  },
743
  {
744
- "epoch": 5.949367088607595,
745
- "grad_norm": 9.347110748291016,
746
- "learning_rate": 0.00010847127555988317,
747
- "loss": 0.081,
748
  "step": 940
749
  },
750
  {
751
- "epoch": 6.012658227848101,
752
- "grad_norm": 0.12258296459913254,
753
- "learning_rate": 0.00010749756572541383,
754
- "loss": 0.0277,
755
  "step": 950
756
  },
757
  {
758
- "epoch": 6.075949367088608,
759
- "grad_norm": 0.06232326850295067,
760
- "learning_rate": 0.00010652385589094451,
761
- "loss": 0.0103,
762
  "step": 960
763
  },
764
  {
765
- "epoch": 6.139240506329114,
766
- "grad_norm": 0.07464442402124405,
767
- "learning_rate": 0.00010555014605647517,
768
- "loss": 0.0557,
769
  "step": 970
770
  },
771
  {
772
- "epoch": 6.2025316455696204,
773
- "grad_norm": 9.516839027404785,
774
- "learning_rate": 0.00010457643622200586,
775
- "loss": 0.0998,
776
  "step": 980
777
  },
778
  {
779
- "epoch": 6.265822784810126,
780
- "grad_norm": 4.102126598358154,
781
- "learning_rate": 0.00010360272638753652,
782
- "loss": 0.102,
783
  "step": 990
784
  },
785
  {
786
- "epoch": 6.329113924050633,
787
- "grad_norm": 12.039372444152832,
788
- "learning_rate": 0.00010262901655306718,
789
- "loss": 0.0316,
790
  "step": 1000
791
  },
792
  {
793
- "epoch": 6.329113924050633,
794
- "eval_accuracy": 0.8078291814946619,
795
- "eval_loss": 0.7480723857879639,
796
- "eval_runtime": 3.2804,
797
- "eval_samples_per_second": 85.659,
798
- "eval_steps_per_second": 10.974,
799
  "step": 1000
800
  },
801
  {
802
- "epoch": 6.3924050632911396,
803
- "grad_norm": 0.04922656714916229,
804
- "learning_rate": 0.00010165530671859787,
805
- "loss": 0.0443,
806
  "step": 1010
807
  },
808
  {
809
- "epoch": 6.455696202531645,
810
- "grad_norm": 0.2317192703485489,
811
- "learning_rate": 0.00010068159688412853,
812
- "loss": 0.02,
813
  "step": 1020
814
  },
815
  {
816
- "epoch": 6.518987341772152,
817
- "grad_norm": 0.05073447898030281,
818
- "learning_rate": 9.970788704965922e-05,
819
- "loss": 0.0171,
820
  "step": 1030
821
  },
822
  {
823
- "epoch": 6.582278481012658,
824
- "grad_norm": 5.737843036651611,
825
- "learning_rate": 9.873417721518988e-05,
826
- "loss": 0.1484,
827
  "step": 1040
828
  },
829
  {
830
- "epoch": 6.6455696202531644,
831
- "grad_norm": 3.3463170528411865,
832
- "learning_rate": 9.776046738072055e-05,
833
- "loss": 0.0879,
834
  "step": 1050
835
  },
836
  {
837
- "epoch": 6.708860759493671,
838
- "grad_norm": 0.06163879111409187,
839
- "learning_rate": 9.678675754625121e-05,
840
- "loss": 0.0119,
841
  "step": 1060
842
  },
843
  {
844
- "epoch": 6.772151898734177,
845
- "grad_norm": 0.6712504625320435,
846
- "learning_rate": 9.581304771178188e-05,
847
- "loss": 0.1082,
848
  "step": 1070
849
  },
850
  {
851
- "epoch": 6.8354430379746836,
852
- "grad_norm": 3.0736191272735596,
853
- "learning_rate": 9.483933787731256e-05,
854
- "loss": 0.1088,
855
  "step": 1080
856
  },
857
  {
858
- "epoch": 6.89873417721519,
859
- "grad_norm": 0.5992501974105835,
860
- "learning_rate": 9.386562804284323e-05,
861
- "loss": 0.1101,
862
  "step": 1090
863
  },
864
  {
865
- "epoch": 6.962025316455696,
866
- "grad_norm": 0.10351633280515671,
867
- "learning_rate": 9.28919182083739e-05,
868
- "loss": 0.0367,
869
  "step": 1100
870
  },
871
  {
872
- "epoch": 6.962025316455696,
873
- "eval_accuracy": 0.797153024911032,
874
- "eval_loss": 0.8765729665756226,
875
- "eval_runtime": 2.8804,
876
- "eval_samples_per_second": 97.557,
877
- "eval_steps_per_second": 12.498,
878
  "step": 1100
879
  },
880
  {
881
- "epoch": 7.025316455696203,
882
- "grad_norm": 0.4222586154937744,
883
- "learning_rate": 9.191820837390458e-05,
884
- "loss": 0.0135,
885
  "step": 1110
886
  },
887
  {
888
- "epoch": 7.0886075949367084,
889
- "grad_norm": 6.298471927642822,
890
- "learning_rate": 9.094449853943525e-05,
891
- "loss": 0.0173,
892
  "step": 1120
893
  },
894
  {
895
- "epoch": 7.151898734177215,
896
- "grad_norm": 0.04971346631646156,
897
- "learning_rate": 8.997078870496593e-05,
898
- "loss": 0.0445,
899
  "step": 1130
900
  },
901
  {
902
- "epoch": 7.215189873417722,
903
- "grad_norm": 0.0970793217420578,
904
- "learning_rate": 8.89970788704966e-05,
905
- "loss": 0.0434,
906
  "step": 1140
907
  },
908
  {
909
- "epoch": 7.2784810126582276,
910
- "grad_norm": 0.23834434151649475,
911
- "learning_rate": 8.802336903602728e-05,
912
- "loss": 0.0466,
913
  "step": 1150
914
  },
915
  {
916
- "epoch": 7.341772151898734,
917
- "grad_norm": 0.13099707663059235,
918
- "learning_rate": 8.704965920155794e-05,
919
- "loss": 0.0173,
920
  "step": 1160
921
  },
922
  {
923
- "epoch": 7.405063291139241,
924
- "grad_norm": 0.02944963611662388,
925
- "learning_rate": 8.607594936708861e-05,
926
- "loss": 0.0286,
927
  "step": 1170
928
  },
929
  {
930
- "epoch": 7.468354430379747,
931
- "grad_norm": 0.022327905520796776,
932
- "learning_rate": 8.510223953261928e-05,
933
- "loss": 0.0369,
934
  "step": 1180
935
  },
936
  {
937
- "epoch": 7.531645569620253,
938
- "grad_norm": 0.05552661046385765,
939
- "learning_rate": 8.412852969814996e-05,
940
- "loss": 0.0139,
941
  "step": 1190
942
  },
943
  {
944
- "epoch": 7.594936708860759,
945
- "grad_norm": 0.02235097438097,
946
- "learning_rate": 8.315481986368062e-05,
947
- "loss": 0.0185,
948
  "step": 1200
949
  },
950
  {
951
- "epoch": 7.594936708860759,
952
- "eval_accuracy": 0.8078291814946619,
953
- "eval_loss": 0.9476400017738342,
954
- "eval_runtime": 2.9174,
955
- "eval_samples_per_second": 96.318,
956
- "eval_steps_per_second": 12.34,
957
  "step": 1200
958
  },
959
  {
960
- "epoch": 7.658227848101266,
961
- "grad_norm": 0.022338826209306717,
962
- "learning_rate": 8.218111002921129e-05,
963
- "loss": 0.0069,
964
  "step": 1210
965
  },
966
  {
967
- "epoch": 7.7215189873417724,
968
- "grad_norm": 8.107586860656738,
969
- "learning_rate": 8.120740019474197e-05,
970
- "loss": 0.0201,
971
  "step": 1220
972
  },
973
  {
974
- "epoch": 7.784810126582278,
975
- "grad_norm": 0.02159648947417736,
976
- "learning_rate": 8.023369036027264e-05,
977
- "loss": 0.0054,
978
  "step": 1230
979
  },
980
  {
981
- "epoch": 7.848101265822785,
982
- "grad_norm": 0.026178089901804924,
983
- "learning_rate": 7.925998052580331e-05,
984
- "loss": 0.0074,
985
  "step": 1240
986
  },
987
  {
988
- "epoch": 7.911392405063291,
989
- "grad_norm": 0.027682237327098846,
990
- "learning_rate": 7.828627069133399e-05,
991
- "loss": 0.0081,
992
  "step": 1250
993
  },
994
  {
995
- "epoch": 7.974683544303797,
996
- "grad_norm": 0.022958675399422646,
997
- "learning_rate": 7.731256085686466e-05,
998
- "loss": 0.0169,
999
  "step": 1260
1000
  },
1001
  {
1002
- "epoch": 8.037974683544304,
1003
- "grad_norm": 0.020896993577480316,
1004
- "learning_rate": 7.633885102239534e-05,
1005
- "loss": 0.0048,
1006
  "step": 1270
1007
  },
1008
  {
1009
- "epoch": 8.10126582278481,
1010
- "grad_norm": 0.02698768861591816,
1011
- "learning_rate": 7.536514118792601e-05,
1012
- "loss": 0.0076,
1013
  "step": 1280
1014
  },
1015
  {
1016
- "epoch": 8.164556962025316,
1017
- "grad_norm": 0.022491389885544777,
1018
- "learning_rate": 7.439143135345667e-05,
1019
- "loss": 0.019,
1020
  "step": 1290
1021
  },
1022
  {
1023
- "epoch": 8.227848101265822,
1024
- "grad_norm": 0.02103598043322563,
1025
- "learning_rate": 7.341772151898734e-05,
1026
- "loss": 0.0254,
1027
  "step": 1300
1028
  },
1029
  {
1030
- "epoch": 8.227848101265822,
1031
- "eval_accuracy": 0.7935943060498221,
1032
- "eval_loss": 1.0394294261932373,
1033
- "eval_runtime": 2.9667,
1034
- "eval_samples_per_second": 94.718,
1035
- "eval_steps_per_second": 12.135,
1036
  "step": 1300
1037
  },
1038
  {
1039
- "epoch": 8.291139240506329,
1040
- "grad_norm": 0.0190192349255085,
1041
- "learning_rate": 7.244401168451802e-05,
1042
- "loss": 0.0046,
1043
  "step": 1310
1044
  },
1045
  {
1046
- "epoch": 8.354430379746836,
1047
- "grad_norm": 8.76252555847168,
1048
- "learning_rate": 7.147030185004869e-05,
1049
- "loss": 0.019,
1050
  "step": 1320
1051
  },
1052
  {
1053
- "epoch": 8.417721518987342,
1054
- "grad_norm": 0.019229834899306297,
1055
- "learning_rate": 7.049659201557937e-05,
1056
- "loss": 0.0037,
1057
  "step": 1330
1058
  },
1059
  {
1060
- "epoch": 8.481012658227849,
1061
- "grad_norm": 0.018845034763216972,
1062
- "learning_rate": 6.952288218111003e-05,
1063
- "loss": 0.0038,
1064
  "step": 1340
1065
  },
1066
  {
1067
- "epoch": 8.544303797468354,
1068
- "grad_norm": 0.016525857150554657,
1069
- "learning_rate": 6.85491723466407e-05,
1070
- "loss": 0.0036,
1071
  "step": 1350
1072
  },
1073
  {
1074
- "epoch": 8.60759493670886,
1075
- "grad_norm": 0.02218289114534855,
1076
- "learning_rate": 6.757546251217137e-05,
1077
- "loss": 0.0035,
1078
  "step": 1360
1079
  },
1080
  {
1081
- "epoch": 8.670886075949367,
1082
- "grad_norm": 0.016638299450278282,
1083
- "learning_rate": 6.660175267770205e-05,
1084
- "loss": 0.0034,
1085
  "step": 1370
1086
  },
1087
  {
1088
- "epoch": 8.734177215189874,
1089
- "grad_norm": 0.01862148381769657,
1090
- "learning_rate": 6.562804284323272e-05,
1091
- "loss": 0.0034,
1092
  "step": 1380
1093
  },
1094
  {
1095
- "epoch": 8.79746835443038,
1096
- "grad_norm": 0.01760050840675831,
1097
- "learning_rate": 6.46543330087634e-05,
1098
- "loss": 0.0037,
1099
  "step": 1390
1100
  },
1101
  {
1102
- "epoch": 8.860759493670885,
1103
- "grad_norm": 0.015400604344904423,
1104
- "learning_rate": 6.368062317429407e-05,
1105
- "loss": 0.0035,
1106
  "step": 1400
1107
  },
1108
  {
1109
- "epoch": 8.860759493670885,
1110
- "eval_accuracy": 0.8256227758007118,
1111
- "eval_loss": 0.9603848457336426,
1112
- "eval_runtime": 2.9345,
1113
- "eval_samples_per_second": 95.756,
1114
- "eval_steps_per_second": 12.268,
1115
  "step": 1400
1116
  },
1117
  {
1118
- "epoch": 8.924050632911392,
1119
- "grad_norm": 0.015519515611231327,
1120
- "learning_rate": 6.270691333982473e-05,
1121
- "loss": 0.0032,
1122
  "step": 1410
1123
  },
1124
  {
1125
- "epoch": 8.987341772151899,
1126
- "grad_norm": 0.019378239288926125,
1127
- "learning_rate": 6.17332035053554e-05,
1128
- "loss": 0.0286,
1129
  "step": 1420
1130
  },
1131
  {
1132
- "epoch": 9.050632911392405,
1133
- "grad_norm": 0.017819711938500404,
1134
- "learning_rate": 6.0759493670886084e-05,
1135
- "loss": 0.0032,
1136
  "step": 1430
1137
  },
1138
  {
1139
- "epoch": 9.113924050632912,
1140
- "grad_norm": 0.0265274066478014,
1141
- "learning_rate": 5.978578383641675e-05,
1142
- "loss": 0.0033,
1143
  "step": 1440
1144
  },
1145
  {
1146
- "epoch": 9.177215189873417,
1147
- "grad_norm": 0.018209749832749367,
1148
- "learning_rate": 5.8812074001947425e-05,
1149
- "loss": 0.003,
1150
  "step": 1450
1151
  },
1152
  {
1153
- "epoch": 9.240506329113924,
1154
- "grad_norm": 0.19067148864269257,
1155
- "learning_rate": 5.78383641674781e-05,
1156
- "loss": 0.0037,
1157
  "step": 1460
1158
  },
1159
  {
1160
- "epoch": 9.30379746835443,
1161
- "grad_norm": 0.014196806587278843,
1162
- "learning_rate": 5.686465433300877e-05,
1163
- "loss": 0.0174,
1164
  "step": 1470
1165
  },
1166
  {
1167
- "epoch": 9.367088607594937,
1168
- "grad_norm": 0.01469396986067295,
1169
- "learning_rate": 5.589094449853943e-05,
1170
- "loss": 0.0032,
1171
  "step": 1480
1172
  },
1173
  {
1174
- "epoch": 9.430379746835444,
1175
- "grad_norm": 0.014591199345886707,
1176
- "learning_rate": 5.491723466407011e-05,
1177
- "loss": 0.0211,
1178
  "step": 1490
1179
  },
1180
  {
1181
- "epoch": 9.49367088607595,
1182
- "grad_norm": 0.01651841588318348,
1183
- "learning_rate": 5.394352482960078e-05,
1184
- "loss": 0.0028,
1185
  "step": 1500
1186
  },
1187
  {
1188
- "epoch": 9.49367088607595,
1189
- "eval_accuracy": 0.8149466192170819,
1190
- "eval_loss": 1.0136470794677734,
1191
- "eval_runtime": 4.4372,
1192
- "eval_samples_per_second": 63.328,
1193
- "eval_steps_per_second": 8.113,
1194
  "step": 1500
1195
  },
1196
  {
1197
- "epoch": 9.556962025316455,
1198
- "grad_norm": 0.013669743202626705,
1199
- "learning_rate": 5.296981499513145e-05,
1200
- "loss": 0.0027,
1201
  "step": 1510
1202
  },
1203
  {
1204
- "epoch": 9.620253164556962,
1205
- "grad_norm": 0.013864479027688503,
1206
- "learning_rate": 5.199610516066212e-05,
1207
- "loss": 0.0028,
1208
  "step": 1520
1209
  },
1210
  {
1211
- "epoch": 9.683544303797468,
1212
- "grad_norm": 0.013118638657033443,
1213
- "learning_rate": 5.1022395326192795e-05,
1214
- "loss": 0.0038,
1215
  "step": 1530
1216
  },
1217
  {
1218
- "epoch": 9.746835443037975,
1219
- "grad_norm": 0.015414089895784855,
1220
- "learning_rate": 5.004868549172347e-05,
1221
- "loss": 0.0028,
1222
  "step": 1540
1223
  },
1224
- {
1225
- "epoch": 9.810126582278482,
1226
- "grad_norm": 0.013694948516786098,
1227
- "learning_rate": 4.907497565725414e-05,
1228
- "loss": 0.0061,
1229
- "step": 1550
1230
- },
1231
- {
1232
- "epoch": 9.873417721518987,
1233
- "grad_norm": 0.013068972155451775,
1234
- "learning_rate": 4.810126582278481e-05,
1235
- "loss": 0.0028,
1236
- "step": 1560
1237
- },
1238
- {
1239
- "epoch": 9.936708860759493,
1240
- "grad_norm": 0.013739065267145634,
1241
- "learning_rate": 4.7127555988315484e-05,
1242
- "loss": 0.0027,
1243
- "step": 1570
1244
- },
1245
- {
1246
- "epoch": 10.0,
1247
- "grad_norm": 0.014148131012916565,
1248
- "learning_rate": 4.615384615384616e-05,
1249
- "loss": 0.0027,
1250
- "step": 1580
1251
- },
1252
- {
1253
- "epoch": 10.063291139240507,
1254
- "grad_norm": 0.014314244501292706,
1255
- "learning_rate": 4.5180136319376825e-05,
1256
- "loss": 0.0026,
1257
- "step": 1590
1258
- },
1259
- {
1260
- "epoch": 10.126582278481013,
1261
- "grad_norm": 0.012181580066680908,
1262
- "learning_rate": 4.42064264849075e-05,
1263
- "loss": 0.0026,
1264
- "step": 1600
1265
- },
1266
- {
1267
- "epoch": 10.126582278481013,
1268
- "eval_accuracy": 0.8220640569395018,
1269
- "eval_loss": 1.00938880443573,
1270
- "eval_runtime": 3.7407,
1271
- "eval_samples_per_second": 75.12,
1272
- "eval_steps_per_second": 9.624,
1273
- "step": 1600
1274
- },
1275
- {
1276
- "epoch": 10.189873417721518,
1277
- "grad_norm": 0.014514009468257427,
1278
- "learning_rate": 4.323271665043817e-05,
1279
- "loss": 0.0025,
1280
- "step": 1610
1281
- },
1282
- {
1283
- "epoch": 10.253164556962025,
1284
- "grad_norm": 0.013074631802737713,
1285
- "learning_rate": 4.225900681596884e-05,
1286
- "loss": 0.0026,
1287
- "step": 1620
1288
- },
1289
- {
1290
- "epoch": 10.316455696202532,
1291
- "grad_norm": 0.012750120833516121,
1292
- "learning_rate": 4.1285296981499514e-05,
1293
- "loss": 0.0026,
1294
- "step": 1630
1295
- },
1296
- {
1297
- "epoch": 10.379746835443038,
1298
- "grad_norm": 0.011241636238992214,
1299
- "learning_rate": 4.031158714703019e-05,
1300
- "loss": 0.0023,
1301
- "step": 1640
1302
- },
1303
- {
1304
- "epoch": 10.443037974683545,
1305
- "grad_norm": 0.011269092559814453,
1306
- "learning_rate": 3.933787731256086e-05,
1307
- "loss": 0.0023,
1308
- "step": 1650
1309
- },
1310
- {
1311
- "epoch": 10.50632911392405,
1312
- "grad_norm": 0.011306311935186386,
1313
- "learning_rate": 3.836416747809153e-05,
1314
- "loss": 0.0026,
1315
- "step": 1660
1316
- },
1317
- {
1318
- "epoch": 10.569620253164556,
1319
- "grad_norm": 0.012098881416022778,
1320
- "learning_rate": 3.73904576436222e-05,
1321
- "loss": 0.0025,
1322
- "step": 1670
1323
- },
1324
- {
1325
- "epoch": 10.632911392405063,
1326
- "grad_norm": 0.013847197405993938,
1327
- "learning_rate": 3.641674780915288e-05,
1328
- "loss": 0.0025,
1329
- "step": 1680
1330
- },
1331
- {
1332
- "epoch": 10.69620253164557,
1333
- "grad_norm": 0.011289476417005062,
1334
- "learning_rate": 3.5443037974683544e-05,
1335
- "loss": 0.0024,
1336
- "step": 1690
1337
- },
1338
- {
1339
- "epoch": 10.759493670886076,
1340
- "grad_norm": 0.012470747344195843,
1341
- "learning_rate": 3.446932814021422e-05,
1342
- "loss": 0.0024,
1343
- "step": 1700
1344
- },
1345
- {
1346
- "epoch": 10.759493670886076,
1347
- "eval_accuracy": 0.8291814946619217,
1348
- "eval_loss": 1.0214886665344238,
1349
- "eval_runtime": 4.4027,
1350
- "eval_samples_per_second": 63.824,
1351
- "eval_steps_per_second": 8.177,
1352
- "step": 1700
1353
- },
1354
- {
1355
- "epoch": 10.822784810126583,
1356
- "grad_norm": 0.011075363494455814,
1357
- "learning_rate": 3.349561830574489e-05,
1358
- "loss": 0.0023,
1359
- "step": 1710
1360
- },
1361
- {
1362
- "epoch": 10.886075949367088,
1363
- "grad_norm": 0.0110908392816782,
1364
- "learning_rate": 3.2521908471275565e-05,
1365
- "loss": 0.0024,
1366
- "step": 1720
1367
- },
1368
- {
1369
- "epoch": 10.949367088607595,
1370
- "grad_norm": 0.01047087088227272,
1371
- "learning_rate": 3.154819863680623e-05,
1372
- "loss": 0.0024,
1373
- "step": 1730
1374
- },
1375
- {
1376
- "epoch": 11.012658227848101,
1377
- "grad_norm": 0.010718494653701782,
1378
- "learning_rate": 3.0574488802336906e-05,
1379
- "loss": 0.0021,
1380
- "step": 1740
1381
- },
1382
- {
1383
- "epoch": 11.075949367088608,
1384
- "grad_norm": 0.011001325212419033,
1385
- "learning_rate": 2.9600778967867577e-05,
1386
- "loss": 0.0024,
1387
- "step": 1750
1388
- },
1389
- {
1390
- "epoch": 11.139240506329115,
1391
- "grad_norm": 0.012532955966889858,
1392
- "learning_rate": 2.8627069133398247e-05,
1393
- "loss": 0.0024,
1394
- "step": 1760
1395
- },
1396
- {
1397
- "epoch": 11.20253164556962,
1398
- "grad_norm": 0.011483966372907162,
1399
- "learning_rate": 2.765335929892892e-05,
1400
- "loss": 0.0022,
1401
- "step": 1770
1402
- },
1403
- {
1404
- "epoch": 11.265822784810126,
1405
- "grad_norm": 0.01143474318087101,
1406
- "learning_rate": 2.6679649464459595e-05,
1407
- "loss": 0.0022,
1408
- "step": 1780
1409
- },
1410
- {
1411
- "epoch": 11.329113924050633,
1412
- "grad_norm": 0.01154111884534359,
1413
- "learning_rate": 2.5705939629990266e-05,
1414
- "loss": 0.0023,
1415
- "step": 1790
1416
- },
1417
- {
1418
- "epoch": 11.39240506329114,
1419
- "grad_norm": 0.011763243936002254,
1420
- "learning_rate": 2.4732229795520936e-05,
1421
- "loss": 0.0024,
1422
- "step": 1800
1423
- },
1424
- {
1425
- "epoch": 11.39240506329114,
1426
- "eval_accuracy": 0.8291814946619217,
1427
- "eval_loss": 1.0316418409347534,
1428
- "eval_runtime": 4.1371,
1429
- "eval_samples_per_second": 67.921,
1430
- "eval_steps_per_second": 8.702,
1431
- "step": 1800
1432
- },
1433
- {
1434
- "epoch": 11.455696202531646,
1435
- "grad_norm": 0.010971922427415848,
1436
- "learning_rate": 2.375851996105161e-05,
1437
- "loss": 0.0021,
1438
- "step": 1810
1439
- },
1440
- {
1441
- "epoch": 11.518987341772151,
1442
- "grad_norm": 0.011685609817504883,
1443
- "learning_rate": 2.278481012658228e-05,
1444
- "loss": 0.0021,
1445
- "step": 1820
1446
- },
1447
- {
1448
- "epoch": 11.582278481012658,
1449
- "grad_norm": 0.010025019757449627,
1450
- "learning_rate": 2.181110029211295e-05,
1451
- "loss": 0.0021,
1452
- "step": 1830
1453
- },
1454
- {
1455
- "epoch": 11.645569620253164,
1456
- "grad_norm": 0.011038082651793957,
1457
- "learning_rate": 2.0837390457643625e-05,
1458
- "loss": 0.0022,
1459
- "step": 1840
1460
- },
1461
- {
1462
- "epoch": 11.708860759493671,
1463
- "grad_norm": 0.011144652031362057,
1464
- "learning_rate": 1.9863680623174295e-05,
1465
- "loss": 0.0023,
1466
- "step": 1850
1467
- },
1468
- {
1469
- "epoch": 11.772151898734178,
1470
- "grad_norm": 0.010572181083261967,
1471
- "learning_rate": 1.8889970788704966e-05,
1472
- "loss": 0.002,
1473
- "step": 1860
1474
- },
1475
- {
1476
- "epoch": 11.835443037974684,
1477
- "grad_norm": 0.012614467181265354,
1478
- "learning_rate": 1.791626095423564e-05,
1479
- "loss": 0.0021,
1480
- "step": 1870
1481
- },
1482
- {
1483
- "epoch": 11.89873417721519,
1484
- "grad_norm": 0.011049304157495499,
1485
- "learning_rate": 1.694255111976631e-05,
1486
- "loss": 0.002,
1487
- "step": 1880
1488
- },
1489
- {
1490
- "epoch": 11.962025316455696,
1491
- "grad_norm": 0.011247304268181324,
1492
- "learning_rate": 1.596884128529698e-05,
1493
- "loss": 0.0021,
1494
- "step": 1890
1495
- },
1496
- {
1497
- "epoch": 12.025316455696203,
1498
- "grad_norm": 0.009822424501180649,
1499
- "learning_rate": 1.4995131450827655e-05,
1500
- "loss": 0.002,
1501
- "step": 1900
1502
- },
1503
- {
1504
- "epoch": 12.025316455696203,
1505
- "eval_accuracy": 0.8291814946619217,
1506
- "eval_loss": 1.0391350984573364,
1507
- "eval_runtime": 4.3594,
1508
- "eval_samples_per_second": 64.458,
1509
- "eval_steps_per_second": 8.258,
1510
- "step": 1900
1511
- },
1512
- {
1513
- "epoch": 12.08860759493671,
1514
- "grad_norm": 0.009912836365401745,
1515
- "learning_rate": 1.4021421616358327e-05,
1516
- "loss": 0.0021,
1517
- "step": 1910
1518
- },
1519
- {
1520
- "epoch": 12.151898734177216,
1521
- "grad_norm": 0.014617972075939178,
1522
- "learning_rate": 1.3047711781888997e-05,
1523
- "loss": 0.002,
1524
- "step": 1920
1525
- },
1526
- {
1527
- "epoch": 12.215189873417721,
1528
- "grad_norm": 0.0097211804240942,
1529
- "learning_rate": 1.207400194741967e-05,
1530
- "loss": 0.0021,
1531
- "step": 1930
1532
- },
1533
- {
1534
- "epoch": 12.278481012658228,
1535
- "grad_norm": 0.012906952761113644,
1536
- "learning_rate": 1.1100292112950342e-05,
1537
- "loss": 0.0023,
1538
- "step": 1940
1539
- },
1540
- {
1541
- "epoch": 12.341772151898734,
1542
- "grad_norm": 0.010059771127998829,
1543
- "learning_rate": 1.0126582278481012e-05,
1544
- "loss": 0.002,
1545
- "step": 1950
1546
- },
1547
- {
1548
- "epoch": 12.405063291139241,
1549
- "grad_norm": 0.010056668892502785,
1550
- "learning_rate": 9.152872444011686e-06,
1551
- "loss": 0.0022,
1552
- "step": 1960
1553
- },
1554
- {
1555
- "epoch": 12.468354430379748,
1556
- "grad_norm": 0.012864183634519577,
1557
- "learning_rate": 8.179162609542357e-06,
1558
- "loss": 0.0021,
1559
- "step": 1970
1560
- },
1561
- {
1562
- "epoch": 12.531645569620252,
1563
- "grad_norm": 0.011309951543807983,
1564
- "learning_rate": 7.205452775073028e-06,
1565
- "loss": 0.0021,
1566
- "step": 1980
1567
- },
1568
- {
1569
- "epoch": 12.594936708860759,
1570
- "grad_norm": 0.009780649095773697,
1571
- "learning_rate": 6.231742940603701e-06,
1572
- "loss": 0.002,
1573
- "step": 1990
1574
- },
1575
- {
1576
- "epoch": 12.658227848101266,
1577
- "grad_norm": 0.011779570952057838,
1578
- "learning_rate": 5.258033106134372e-06,
1579
- "loss": 0.0021,
1580
- "step": 2000
1581
- },
1582
- {
1583
- "epoch": 12.658227848101266,
1584
- "eval_accuracy": 0.8291814946619217,
1585
- "eval_loss": 1.0429939031600952,
1586
- "eval_runtime": 2.9477,
1587
- "eval_samples_per_second": 95.327,
1588
- "eval_steps_per_second": 12.213,
1589
- "step": 2000
1590
- },
1591
- {
1592
- "epoch": 12.721518987341772,
1593
- "grad_norm": 0.010478339157998562,
1594
- "learning_rate": 4.284323271665044e-06,
1595
- "loss": 0.0019,
1596
- "step": 2010
1597
- },
1598
- {
1599
- "epoch": 12.784810126582279,
1600
- "grad_norm": 0.009612773545086384,
1601
- "learning_rate": 3.3106134371957155e-06,
1602
- "loss": 0.0021,
1603
- "step": 2020
1604
- },
1605
- {
1606
- "epoch": 12.848101265822784,
1607
- "grad_norm": 0.010172748006880283,
1608
- "learning_rate": 2.3369036027263877e-06,
1609
- "loss": 0.0021,
1610
- "step": 2030
1611
- },
1612
- {
1613
- "epoch": 12.91139240506329,
1614
- "grad_norm": 0.009747604839503765,
1615
- "learning_rate": 1.3631937682570594e-06,
1616
- "loss": 0.0019,
1617
- "step": 2040
1618
- },
1619
- {
1620
- "epoch": 12.974683544303797,
1621
- "grad_norm": 0.010267944075167179,
1622
- "learning_rate": 3.894839337877313e-07,
1623
- "loss": 0.0021,
1624
- "step": 2050
1625
- },
1626
  {
1627
  "epoch": 13.0,
1628
- "step": 2054,
1629
- "total_flos": 2.5427127414770565e+18,
1630
- "train_loss": 0.21512354467338007,
1631
- "train_runtime": 1043.0352,
1632
- "train_samples_per_second": 31.458,
1633
- "train_steps_per_second": 1.969
1634
  }
1635
  ],
1636
  "logging_steps": 10,
1637
- "max_steps": 2054,
1638
  "num_input_tokens_seen": 0,
1639
  "num_train_epochs": 13,
1640
  "save_steps": 100,
@@ -1650,7 +1248,7 @@
1650
  "attributes": {}
1651
  }
1652
  },
1653
- "total_flos": 2.5427127414770565e+18,
1654
  "train_batch_size": 16,
1655
  "trial_name": null,
1656
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7452366948127747,
3
+ "best_model_checkpoint": "vit-weldclassifyv4/checkpoint-400",
4
  "epoch": 13.0,
5
  "eval_steps": 100,
6
+ "global_step": 1547,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.08403361344537816,
13
+ "grad_norm": 1.7778676748275757,
14
+ "learning_rate": 0.00019870717517776342,
15
+ "loss": 1.2807,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.16806722689075632,
20
+ "grad_norm": 1.3721851110458374,
21
+ "learning_rate": 0.00019741435035552685,
22
+ "loss": 1.187,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.25210084033613445,
27
+ "grad_norm": 0.4043492376804352,
28
+ "learning_rate": 0.00019612152553329023,
29
+ "loss": 1.2471,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.33613445378151263,
34
+ "grad_norm": 1.9244325160980225,
35
+ "learning_rate": 0.00019482870071105366,
36
+ "loss": 1.2329,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.42016806722689076,
41
+ "grad_norm": 0.6638385653495789,
42
+ "learning_rate": 0.00019353587588881707,
43
+ "loss": 1.1524,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.5042016806722689,
48
+ "grad_norm": 0.3663930594921112,
49
+ "learning_rate": 0.0001922430510665805,
50
+ "loss": 1.1548,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.5882352941176471,
55
+ "grad_norm": 0.6382243633270264,
56
+ "learning_rate": 0.0001909502262443439,
57
+ "loss": 1.1993,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.6722689075630253,
62
+ "grad_norm": 1.2848349809646606,
63
+ "learning_rate": 0.0001896574014221073,
64
+ "loss": 1.1822,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.7563025210084033,
69
+ "grad_norm": 1.2714462280273438,
70
+ "learning_rate": 0.00018836457659987072,
71
+ "loss": 1.1902,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.8403361344537815,
76
+ "grad_norm": 1.4769024848937988,
77
+ "learning_rate": 0.00018707175177763415,
78
+ "loss": 1.2099,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.8403361344537815,
83
+ "eval_accuracy": 0.5047169811320755,
84
+ "eval_loss": 1.1625308990478516,
85
+ "eval_runtime": 2.2575,
86
+ "eval_samples_per_second": 93.909,
87
+ "eval_steps_per_second": 11.96,
88
  "step": 100
89
  },
90
  {
91
+ "epoch": 0.9243697478991597,
92
+ "grad_norm": 1.2741708755493164,
93
+ "learning_rate": 0.00018577892695539755,
94
+ "loss": 1.2203,
95
  "step": 110
96
  },
97
  {
98
+ "epoch": 1.0084033613445378,
99
+ "grad_norm": 1.2036206722259521,
100
+ "learning_rate": 0.000184486102133161,
101
+ "loss": 1.1936,
102
  "step": 120
103
  },
104
  {
105
+ "epoch": 1.092436974789916,
106
+ "grad_norm": 1.2514188289642334,
107
+ "learning_rate": 0.00018319327731092437,
108
+ "loss": 1.1515,
109
  "step": 130
110
  },
111
  {
112
+ "epoch": 1.1764705882352942,
113
+ "grad_norm": 0.7073956727981567,
114
+ "learning_rate": 0.0001819004524886878,
115
+ "loss": 1.1483,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 1.2605042016806722,
120
+ "grad_norm": 0.7465972900390625,
121
+ "learning_rate": 0.0001806076276664512,
122
+ "loss": 1.1962,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 1.3445378151260505,
127
+ "grad_norm": 0.6339373588562012,
128
+ "learning_rate": 0.00017931480284421464,
129
+ "loss": 1.1156,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 1.4285714285714286,
134
+ "grad_norm": 1.036371111869812,
135
+ "learning_rate": 0.00017802197802197802,
136
+ "loss": 1.1304,
137
  "step": 170
138
  },
139
  {
140
+ "epoch": 1.5126050420168067,
141
+ "grad_norm": 1.3491630554199219,
142
+ "learning_rate": 0.00017672915319974145,
143
+ "loss": 1.1783,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 1.596638655462185,
148
+ "grad_norm": 1.3167691230773926,
149
+ "learning_rate": 0.00017543632837750485,
150
+ "loss": 1.1617,
151
  "step": 190
152
  },
153
  {
154
+ "epoch": 1.680672268907563,
155
+ "grad_norm": 1.0561383962631226,
156
+ "learning_rate": 0.00017414350355526826,
157
+ "loss": 1.1066,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 1.680672268907563,
162
+ "eval_accuracy": 0.49528301886792453,
163
+ "eval_loss": 1.0892218351364136,
164
+ "eval_runtime": 2.1715,
165
+ "eval_samples_per_second": 97.628,
166
+ "eval_steps_per_second": 12.434,
167
  "step": 200
168
  },
169
  {
170
+ "epoch": 1.7647058823529411,
171
+ "grad_norm": 1.2338758707046509,
172
+ "learning_rate": 0.0001728506787330317,
173
+ "loss": 1.0634,
174
  "step": 210
175
  },
176
  {
177
+ "epoch": 1.8487394957983194,
178
+ "grad_norm": 1.416668176651001,
179
+ "learning_rate": 0.0001715578539107951,
180
+ "loss": 1.1408,
181
  "step": 220
182
  },
183
  {
184
+ "epoch": 1.9327731092436975,
185
+ "grad_norm": 1.9194142818450928,
186
+ "learning_rate": 0.0001702650290885585,
187
+ "loss": 1.2019,
188
  "step": 230
189
  },
190
  {
191
+ "epoch": 2.0168067226890756,
192
+ "grad_norm": 1.1238566637039185,
193
+ "learning_rate": 0.0001689722042663219,
194
+ "loss": 1.0287,
195
  "step": 240
196
  },
197
  {
198
+ "epoch": 2.100840336134454,
199
+ "grad_norm": 3.9827613830566406,
200
+ "learning_rate": 0.00016767937944408534,
201
+ "loss": 0.9904,
202
  "step": 250
203
  },
204
  {
205
+ "epoch": 2.184873949579832,
206
+ "grad_norm": 1.7081505060195923,
207
+ "learning_rate": 0.00016638655462184875,
208
+ "loss": 1.0905,
209
  "step": 260
210
  },
211
  {
212
+ "epoch": 2.26890756302521,
213
+ "grad_norm": 2.686239719390869,
214
+ "learning_rate": 0.00016509372979961215,
215
+ "loss": 0.9177,
216
  "step": 270
217
  },
218
  {
219
+ "epoch": 2.3529411764705883,
220
+ "grad_norm": 1.3638893365859985,
221
+ "learning_rate": 0.00016380090497737556,
222
+ "loss": 0.9348,
223
  "step": 280
224
  },
225
  {
226
+ "epoch": 2.4369747899159666,
227
+ "grad_norm": 2.050823450088501,
228
+ "learning_rate": 0.000162508080155139,
229
+ "loss": 0.8364,
230
  "step": 290
231
  },
232
  {
233
+ "epoch": 2.5210084033613445,
234
+ "grad_norm": 1.718785047531128,
235
+ "learning_rate": 0.0001612152553329024,
236
+ "loss": 1.0298,
237
  "step": 300
238
  },
239
  {
240
+ "epoch": 2.5210084033613445,
241
+ "eval_accuracy": 0.589622641509434,
242
+ "eval_loss": 0.893924355506897,
243
+ "eval_runtime": 2.9715,
244
+ "eval_samples_per_second": 71.345,
245
+ "eval_steps_per_second": 9.086,
246
  "step": 300
247
  },
248
  {
249
+ "epoch": 2.6050420168067228,
250
+ "grad_norm": 2.593571424484253,
251
+ "learning_rate": 0.00015992243051066583,
252
+ "loss": 0.9351,
253
  "step": 310
254
  },
255
  {
256
+ "epoch": 2.689075630252101,
257
+ "grad_norm": 1.2314530611038208,
258
+ "learning_rate": 0.0001586296056884292,
259
+ "loss": 0.841,
260
  "step": 320
261
  },
262
  {
263
+ "epoch": 2.773109243697479,
264
+ "grad_norm": 2.589643716812134,
265
+ "learning_rate": 0.00015733678086619264,
266
+ "loss": 0.7806,
267
  "step": 330
268
  },
269
  {
270
+ "epoch": 2.857142857142857,
271
+ "grad_norm": 1.8523632287979126,
272
+ "learning_rate": 0.00015604395604395605,
273
+ "loss": 0.8915,
274
  "step": 340
275
  },
276
  {
277
+ "epoch": 2.9411764705882355,
278
+ "grad_norm": 2.1867382526397705,
279
+ "learning_rate": 0.00015475113122171948,
280
+ "loss": 0.8422,
281
  "step": 350
282
  },
283
  {
284
+ "epoch": 3.0252100840336134,
285
+ "grad_norm": 2.2038803100585938,
286
+ "learning_rate": 0.0001534583063994829,
287
+ "loss": 0.7271,
288
  "step": 360
289
  },
290
  {
291
+ "epoch": 3.1092436974789917,
292
+ "grad_norm": 2.8037712574005127,
293
+ "learning_rate": 0.0001521654815772463,
294
+ "loss": 0.6003,
295
  "step": 370
296
  },
297
  {
298
+ "epoch": 3.19327731092437,
299
+ "grad_norm": 1.8391917943954468,
300
+ "learning_rate": 0.0001508726567550097,
301
+ "loss": 0.6642,
302
  "step": 380
303
  },
304
  {
305
+ "epoch": 3.277310924369748,
306
+ "grad_norm": 4.166950225830078,
307
+ "learning_rate": 0.00014957983193277313,
308
+ "loss": 0.6483,
309
  "step": 390
310
  },
311
  {
312
+ "epoch": 3.361344537815126,
313
+ "grad_norm": 2.6970036029815674,
314
+ "learning_rate": 0.00014828700711053654,
315
+ "loss": 0.5798,
316
  "step": 400
317
  },
318
  {
319
+ "epoch": 3.361344537815126,
320
+ "eval_accuracy": 0.6745283018867925,
321
+ "eval_loss": 0.7452366948127747,
322
+ "eval_runtime": 2.3359,
323
+ "eval_samples_per_second": 90.759,
324
+ "eval_steps_per_second": 11.559,
325
  "step": 400
326
  },
327
  {
328
+ "epoch": 3.4453781512605044,
329
+ "grad_norm": 2.7985074520111084,
330
+ "learning_rate": 0.00014699418228829994,
331
+ "loss": 0.5447,
332
  "step": 410
333
  },
334
  {
335
+ "epoch": 3.5294117647058822,
336
+ "grad_norm": 5.049683094024658,
337
+ "learning_rate": 0.00014570135746606335,
338
+ "loss": 0.6424,
339
  "step": 420
340
  },
341
  {
342
+ "epoch": 3.6134453781512605,
343
+ "grad_norm": 2.288046360015869,
344
+ "learning_rate": 0.00014440853264382675,
345
+ "loss": 0.5786,
346
  "step": 430
347
  },
348
  {
349
+ "epoch": 3.697478991596639,
350
+ "grad_norm": 2.6284878253936768,
351
+ "learning_rate": 0.0001431157078215902,
352
+ "loss": 0.4704,
353
  "step": 440
354
  },
355
  {
356
+ "epoch": 3.7815126050420167,
357
+ "grad_norm": 3.1169135570526123,
358
+ "learning_rate": 0.0001418228829993536,
359
+ "loss": 0.5602,
360
  "step": 450
361
  },
362
  {
363
+ "epoch": 3.865546218487395,
364
+ "grad_norm": 4.109696388244629,
365
+ "learning_rate": 0.000140530058177117,
366
+ "loss": 0.5028,
367
  "step": 460
368
  },
369
  {
370
+ "epoch": 3.9495798319327733,
371
+ "grad_norm": 2.4809916019439697,
372
+ "learning_rate": 0.0001392372333548804,
373
+ "loss": 0.5331,
374
  "step": 470
375
  },
376
  {
377
+ "epoch": 4.033613445378151,
378
+ "grad_norm": 5.215726375579834,
379
+ "learning_rate": 0.00013794440853264384,
380
+ "loss": 0.6104,
381
  "step": 480
382
  },
383
  {
384
+ "epoch": 4.117647058823529,
385
+ "grad_norm": 2.3470144271850586,
386
+ "learning_rate": 0.00013665158371040724,
387
+ "loss": 0.3547,
388
  "step": 490
389
  },
390
  {
391
+ "epoch": 4.201680672268908,
392
+ "grad_norm": 2.578737258911133,
393
+ "learning_rate": 0.00013535875888817068,
394
+ "loss": 0.4879,
395
  "step": 500
396
  },
397
  {
398
+ "epoch": 4.201680672268908,
399
+ "eval_accuracy": 0.6556603773584906,
400
+ "eval_loss": 0.8673213720321655,
401
+ "eval_runtime": 2.9615,
402
+ "eval_samples_per_second": 71.585,
403
+ "eval_steps_per_second": 9.117,
404
  "step": 500
405
  },
406
  {
407
+ "epoch": 4.285714285714286,
408
+ "grad_norm": 8.1809663772583,
409
+ "learning_rate": 0.00013406593406593405,
410
+ "loss": 0.3164,
411
  "step": 510
412
  },
413
  {
414
+ "epoch": 4.369747899159664,
415
+ "grad_norm": 5.916128158569336,
416
+ "learning_rate": 0.0001327731092436975,
417
+ "loss": 0.3425,
418
  "step": 520
419
  },
420
  {
421
+ "epoch": 4.453781512605042,
422
+ "grad_norm": 1.5418981313705444,
423
+ "learning_rate": 0.0001314802844214609,
424
+ "loss": 0.3219,
425
  "step": 530
426
  },
427
  {
428
+ "epoch": 4.53781512605042,
429
+ "grad_norm": 0.39303484559059143,
430
+ "learning_rate": 0.00013018745959922433,
431
+ "loss": 0.372,
432
  "step": 540
433
  },
434
  {
435
+ "epoch": 4.621848739495798,
436
+ "grad_norm": 2.0171704292297363,
437
+ "learning_rate": 0.00012889463477698773,
438
+ "loss": 0.3941,
439
  "step": 550
440
  },
441
  {
442
+ "epoch": 4.705882352941177,
443
+ "grad_norm": 5.667063236236572,
444
+ "learning_rate": 0.00012760180995475114,
445
+ "loss": 0.3784,
446
  "step": 560
447
  },
448
  {
449
+ "epoch": 4.7899159663865545,
450
+ "grad_norm": 2.0401604175567627,
451
+ "learning_rate": 0.00012630898513251454,
452
+ "loss": 0.3231,
453
  "step": 570
454
  },
455
  {
456
+ "epoch": 4.873949579831933,
457
+ "grad_norm": 2.5163936614990234,
458
+ "learning_rate": 0.00012501616031027798,
459
+ "loss": 0.3287,
460
  "step": 580
461
  },
462
  {
463
+ "epoch": 4.957983193277311,
464
+ "grad_norm": 2.357574939727783,
465
+ "learning_rate": 0.00012372333548804138,
466
+ "loss": 0.2309,
467
  "step": 590
468
  },
469
  {
470
+ "epoch": 5.042016806722689,
471
+ "grad_norm": 2.399186611175537,
472
+ "learning_rate": 0.0001224305106658048,
473
+ "loss": 0.197,
474
  "step": 600
475
  },
476
  {
477
+ "epoch": 5.042016806722689,
478
+ "eval_accuracy": 0.6556603773584906,
479
+ "eval_loss": 1.0144904851913452,
480
+ "eval_runtime": 2.2097,
481
+ "eval_samples_per_second": 95.94,
482
+ "eval_steps_per_second": 12.219,
483
  "step": 600
484
  },
485
  {
486
+ "epoch": 5.126050420168067,
487
+ "grad_norm": 4.648927688598633,
488
+ "learning_rate": 0.00012113768584356819,
489
+ "loss": 0.2243,
490
  "step": 610
491
  },
492
  {
493
+ "epoch": 5.2100840336134455,
494
+ "grad_norm": 5.755702972412109,
495
+ "learning_rate": 0.00011984486102133161,
496
+ "loss": 0.1775,
497
  "step": 620
498
  },
499
  {
500
+ "epoch": 5.294117647058823,
501
+ "grad_norm": 5.102352619171143,
502
+ "learning_rate": 0.00011855203619909503,
503
+ "loss": 0.1982,
504
  "step": 630
505
  },
506
  {
507
+ "epoch": 5.378151260504202,
508
+ "grad_norm": 2.311920404434204,
509
+ "learning_rate": 0.00011725921137685845,
510
+ "loss": 0.2125,
511
  "step": 640
512
  },
513
  {
514
+ "epoch": 5.46218487394958,
515
+ "grad_norm": 5.563356876373291,
516
+ "learning_rate": 0.00011596638655462187,
517
+ "loss": 0.2259,
518
  "step": 650
519
  },
520
  {
521
+ "epoch": 5.546218487394958,
522
+ "grad_norm": 5.233443260192871,
523
+ "learning_rate": 0.00011467356173238526,
524
+ "loss": 0.263,
525
  "step": 660
526
  },
527
  {
528
+ "epoch": 5.630252100840336,
529
+ "grad_norm": 2.19209361076355,
530
+ "learning_rate": 0.00011338073691014868,
531
+ "loss": 0.2627,
532
  "step": 670
533
  },
534
  {
535
+ "epoch": 5.714285714285714,
536
+ "grad_norm": 5.696531772613525,
537
+ "learning_rate": 0.0001120879120879121,
538
+ "loss": 0.2588,
539
  "step": 680
540
  },
541
  {
542
+ "epoch": 5.798319327731092,
543
+ "grad_norm": 4.516761302947998,
544
+ "learning_rate": 0.0001107950872656755,
545
+ "loss": 0.2148,
546
  "step": 690
547
  },
548
  {
549
+ "epoch": 5.882352941176471,
550
+ "grad_norm": 0.21505996584892273,
551
+ "learning_rate": 0.00010950226244343893,
552
+ "loss": 0.1368,
553
  "step": 700
554
  },
555
  {
556
+ "epoch": 5.882352941176471,
557
+ "eval_accuracy": 0.7311320754716981,
558
+ "eval_loss": 0.8305109739303589,
559
+ "eval_runtime": 2.2559,
560
+ "eval_samples_per_second": 93.974,
561
+ "eval_steps_per_second": 11.968,
562
  "step": 700
563
  },
564
  {
565
+ "epoch": 5.966386554621849,
566
+ "grad_norm": 4.3951263427734375,
567
+ "learning_rate": 0.00010820943762120233,
568
+ "loss": 0.2699,
569
  "step": 710
570
  },
571
  {
572
+ "epoch": 6.050420168067227,
573
+ "grad_norm": 0.8779445290565491,
574
+ "learning_rate": 0.00010691661279896574,
575
+ "loss": 0.1172,
576
  "step": 720
577
  },
578
  {
579
+ "epoch": 6.1344537815126055,
580
+ "grad_norm": 4.695611476898193,
581
+ "learning_rate": 0.00010562378797672916,
582
+ "loss": 0.13,
583
  "step": 730
584
  },
585
  {
586
+ "epoch": 6.218487394957983,
587
+ "grad_norm": 7.564522743225098,
588
+ "learning_rate": 0.00010433096315449258,
589
+ "loss": 0.1392,
590
  "step": 740
591
  },
592
  {
593
+ "epoch": 6.302521008403361,
594
+ "grad_norm": 0.17681638896465302,
595
+ "learning_rate": 0.00010303813833225597,
596
+ "loss": 0.0828,
597
  "step": 750
598
  },
599
  {
600
+ "epoch": 6.38655462184874,
601
+ "grad_norm": 2.515813112258911,
602
+ "learning_rate": 0.00010174531351001939,
603
+ "loss": 0.1119,
604
  "step": 760
605
  },
606
  {
607
+ "epoch": 6.470588235294118,
608
+ "grad_norm": 0.3115313649177551,
609
+ "learning_rate": 0.0001004524886877828,
610
+ "loss": 0.0667,
611
  "step": 770
612
  },
613
  {
614
+ "epoch": 6.554621848739496,
615
+ "grad_norm": 0.9338003396987915,
616
+ "learning_rate": 9.915966386554623e-05,
617
+ "loss": 0.0779,
618
  "step": 780
619
  },
620
  {
621
+ "epoch": 6.6386554621848735,
622
+ "grad_norm": 5.663729190826416,
623
+ "learning_rate": 9.786683904330963e-05,
624
+ "loss": 0.0949,
625
  "step": 790
626
  },
627
  {
628
+ "epoch": 6.722689075630252,
629
+ "grad_norm": 1.159752368927002,
630
+ "learning_rate": 9.657401422107305e-05,
631
+ "loss": 0.0841,
632
  "step": 800
633
  },
634
  {
635
+ "epoch": 6.722689075630252,
636
+ "eval_accuracy": 0.7735849056603774,
637
+ "eval_loss": 0.8974043130874634,
638
+ "eval_runtime": 2.2126,
639
+ "eval_samples_per_second": 95.816,
640
+ "eval_steps_per_second": 12.203,
641
  "step": 800
642
  },
643
  {
644
+ "epoch": 6.80672268907563,
645
+ "grad_norm": 0.8134496808052063,
646
+ "learning_rate": 9.528118939883646e-05,
647
+ "loss": 0.1272,
648
  "step": 810
649
  },
650
  {
651
+ "epoch": 6.890756302521009,
652
+ "grad_norm": 0.09464468061923981,
653
+ "learning_rate": 9.398836457659988e-05,
654
+ "loss": 0.1339,
655
  "step": 820
656
  },
657
  {
658
+ "epoch": 6.974789915966387,
659
+ "grad_norm": 0.08403979986906052,
660
+ "learning_rate": 9.26955397543633e-05,
661
+ "loss": 0.0779,
662
  "step": 830
663
  },
664
  {
665
+ "epoch": 7.0588235294117645,
666
+ "grad_norm": 0.11395015567541122,
667
+ "learning_rate": 9.14027149321267e-05,
668
+ "loss": 0.0495,
669
  "step": 840
670
  },
671
  {
672
+ "epoch": 7.142857142857143,
673
+ "grad_norm": 5.00321102142334,
674
+ "learning_rate": 9.010989010989012e-05,
675
+ "loss": 0.2217,
676
  "step": 850
677
  },
678
  {
679
+ "epoch": 7.226890756302521,
680
+ "grad_norm": 5.354154109954834,
681
+ "learning_rate": 8.881706528765353e-05,
682
+ "loss": 0.0713,
683
  "step": 860
684
  },
685
  {
686
+ "epoch": 7.310924369747899,
687
+ "grad_norm": 0.07731425017118454,
688
+ "learning_rate": 8.752424046541694e-05,
689
+ "loss": 0.0482,
690
  "step": 870
691
  },
692
  {
693
+ "epoch": 7.394957983193278,
694
+ "grad_norm": 1.70600163936615,
695
+ "learning_rate": 8.623141564318036e-05,
696
+ "loss": 0.0368,
697
  "step": 880
698
  },
699
  {
700
+ "epoch": 7.4789915966386555,
701
+ "grad_norm": 0.09904234856367111,
702
+ "learning_rate": 8.493859082094377e-05,
703
+ "loss": 0.0389,
704
  "step": 890
705
  },
706
  {
707
+ "epoch": 7.563025210084033,
708
+ "grad_norm": 5.335230350494385,
709
+ "learning_rate": 8.364576599870718e-05,
710
+ "loss": 0.0942,
711
  "step": 900
712
  },
713
  {
714
+ "epoch": 7.563025210084033,
715
+ "eval_accuracy": 0.7216981132075472,
716
+ "eval_loss": 1.1261749267578125,
717
+ "eval_runtime": 2.2006,
718
+ "eval_samples_per_second": 96.335,
719
+ "eval_steps_per_second": 12.269,
720
  "step": 900
721
  },
722
  {
723
+ "epoch": 7.647058823529412,
724
+ "grad_norm": 5.030584812164307,
725
+ "learning_rate": 8.23529411764706e-05,
726
+ "loss": 0.0278,
727
  "step": 910
728
  },
729
  {
730
+ "epoch": 7.73109243697479,
731
+ "grad_norm": 0.12369989603757858,
732
+ "learning_rate": 8.1060116354234e-05,
733
+ "loss": 0.1137,
734
  "step": 920
735
  },
736
  {
737
+ "epoch": 7.815126050420168,
738
+ "grad_norm": 7.5863189697265625,
739
+ "learning_rate": 7.976729153199742e-05,
740
+ "loss": 0.0904,
741
  "step": 930
742
  },
743
  {
744
+ "epoch": 7.899159663865547,
745
+ "grad_norm": 0.2067825049161911,
746
+ "learning_rate": 7.847446670976083e-05,
747
+ "loss": 0.0397,
748
  "step": 940
749
  },
750
  {
751
+ "epoch": 7.983193277310924,
752
+ "grad_norm": 0.056721098721027374,
753
+ "learning_rate": 7.718164188752424e-05,
754
+ "loss": 0.0679,
755
  "step": 950
756
  },
757
  {
758
+ "epoch": 8.067226890756302,
759
+ "grad_norm": 0.05310463905334473,
760
+ "learning_rate": 7.588881706528765e-05,
761
+ "loss": 0.0329,
762
  "step": 960
763
  },
764
  {
765
+ "epoch": 8.15126050420168,
766
+ "grad_norm": 7.898382663726807,
767
+ "learning_rate": 7.459599224305107e-05,
768
+ "loss": 0.0183,
769
  "step": 970
770
  },
771
  {
772
+ "epoch": 8.235294117647058,
773
+ "grad_norm": 2.061277151107788,
774
+ "learning_rate": 7.330316742081448e-05,
775
+ "loss": 0.0311,
776
  "step": 980
777
  },
778
  {
779
+ "epoch": 8.319327731092438,
780
+ "grad_norm": 0.06646686792373657,
781
+ "learning_rate": 7.20103425985779e-05,
782
+ "loss": 0.0334,
783
  "step": 990
784
  },
785
  {
786
+ "epoch": 8.403361344537815,
787
+ "grad_norm": 0.07112545520067215,
788
+ "learning_rate": 7.071751777634131e-05,
789
+ "loss": 0.0296,
790
  "step": 1000
791
  },
792
  {
793
+ "epoch": 8.403361344537815,
794
+ "eval_accuracy": 0.7122641509433962,
795
+ "eval_loss": 1.2889635562896729,
796
+ "eval_runtime": 2.2011,
797
+ "eval_samples_per_second": 96.314,
798
+ "eval_steps_per_second": 12.266,
799
  "step": 1000
800
  },
801
  {
802
+ "epoch": 8.487394957983193,
803
+ "grad_norm": 0.07936228811740875,
804
+ "learning_rate": 6.942469295410472e-05,
805
+ "loss": 0.0256,
806
  "step": 1010
807
  },
808
  {
809
+ "epoch": 8.571428571428571,
810
+ "grad_norm": 5.849864959716797,
811
+ "learning_rate": 6.813186813186814e-05,
812
+ "loss": 0.0346,
813
  "step": 1020
814
  },
815
  {
816
+ "epoch": 8.655462184873949,
817
+ "grad_norm": 0.05158023163676262,
818
+ "learning_rate": 6.683904330963154e-05,
819
+ "loss": 0.0109,
820
  "step": 1030
821
  },
822
  {
823
+ "epoch": 8.739495798319329,
824
+ "grad_norm": 0.05596969276666641,
825
+ "learning_rate": 6.554621848739496e-05,
826
+ "loss": 0.0129,
827
  "step": 1040
828
  },
829
  {
830
+ "epoch": 8.823529411764707,
831
+ "grad_norm": 0.05292417109012604,
832
+ "learning_rate": 6.425339366515838e-05,
833
+ "loss": 0.033,
834
  "step": 1050
835
  },
836
  {
837
+ "epoch": 8.907563025210084,
838
+ "grad_norm": 0.8892333507537842,
839
+ "learning_rate": 6.296056884292179e-05,
840
+ "loss": 0.0199,
841
  "step": 1060
842
  },
843
  {
844
+ "epoch": 8.991596638655462,
845
+ "grad_norm": 1.8524911403656006,
846
+ "learning_rate": 6.166774402068521e-05,
847
+ "loss": 0.0324,
848
  "step": 1070
849
  },
850
  {
851
+ "epoch": 9.07563025210084,
852
+ "grad_norm": 0.040928326547145844,
853
+ "learning_rate": 6.037491919844861e-05,
854
+ "loss": 0.0352,
855
  "step": 1080
856
  },
857
  {
858
+ "epoch": 9.159663865546218,
859
+ "grad_norm": 0.043698906898498535,
860
+ "learning_rate": 5.9082094376212026e-05,
861
+ "loss": 0.009,
862
  "step": 1090
863
  },
864
  {
865
+ "epoch": 9.243697478991596,
866
+ "grad_norm": 0.04034803435206413,
867
+ "learning_rate": 5.778926955397543e-05,
868
+ "loss": 0.0432,
869
  "step": 1100
870
  },
871
  {
872
+ "epoch": 9.243697478991596,
873
+ "eval_accuracy": 0.7405660377358491,
874
+ "eval_loss": 1.2427575588226318,
875
+ "eval_runtime": 2.1879,
876
+ "eval_samples_per_second": 96.895,
877
+ "eval_steps_per_second": 12.34,
878
  "step": 1100
879
  },
880
  {
881
+ "epoch": 9.327731092436975,
882
+ "grad_norm": 0.042758312076330185,
883
+ "learning_rate": 5.649644473173885e-05,
884
+ "loss": 0.0086,
885
  "step": 1110
886
  },
887
  {
888
+ "epoch": 9.411764705882353,
889
+ "grad_norm": 0.05348571389913559,
890
+ "learning_rate": 5.520361990950227e-05,
891
+ "loss": 0.0113,
892
  "step": 1120
893
  },
894
  {
895
+ "epoch": 9.495798319327731,
896
+ "grad_norm": 0.04173032566905022,
897
+ "learning_rate": 5.3910795087265676e-05,
898
+ "loss": 0.0083,
899
  "step": 1130
900
  },
901
  {
902
+ "epoch": 9.579831932773109,
903
+ "grad_norm": 0.03784575313329697,
904
+ "learning_rate": 5.2617970265029096e-05,
905
+ "loss": 0.0086,
906
  "step": 1140
907
  },
908
  {
909
+ "epoch": 9.663865546218487,
910
+ "grad_norm": 0.05332985520362854,
911
+ "learning_rate": 5.13251454427925e-05,
912
+ "loss": 0.0086,
913
  "step": 1150
914
  },
915
  {
916
+ "epoch": 9.747899159663866,
917
+ "grad_norm": 0.03503885120153427,
918
+ "learning_rate": 5.0032320620555914e-05,
919
+ "loss": 0.0078,
920
  "step": 1160
921
  },
922
  {
923
+ "epoch": 9.831932773109244,
924
+ "grad_norm": 0.033440928906202316,
925
+ "learning_rate": 4.8739495798319326e-05,
926
+ "loss": 0.0095,
927
  "step": 1170
928
  },
929
  {
930
+ "epoch": 9.915966386554622,
931
+ "grad_norm": 0.03903155028820038,
932
+ "learning_rate": 4.744667097608274e-05,
933
+ "loss": 0.0071,
934
  "step": 1180
935
  },
936
  {
937
+ "epoch": 10.0,
938
+ "grad_norm": 0.034581057727336884,
939
+ "learning_rate": 4.615384615384616e-05,
940
+ "loss": 0.0347,
941
  "step": 1190
942
  },
943
  {
944
+ "epoch": 10.084033613445378,
945
+ "grad_norm": 4.804828643798828,
946
+ "learning_rate": 4.486102133160957e-05,
947
+ "loss": 0.0353,
948
  "step": 1200
949
  },
950
  {
951
+ "epoch": 10.084033613445378,
952
+ "eval_accuracy": 0.7452830188679245,
953
+ "eval_loss": 1.250637173652649,
954
+ "eval_runtime": 2.1411,
955
+ "eval_samples_per_second": 99.016,
956
+ "eval_steps_per_second": 12.61,
957
  "step": 1200
958
  },
959
  {
960
+ "epoch": 10.168067226890756,
961
+ "grad_norm": 0.03247935697436333,
962
+ "learning_rate": 4.356819650937298e-05,
963
+ "loss": 0.0071,
964
  "step": 1210
965
  },
966
  {
967
+ "epoch": 10.252100840336134,
968
+ "grad_norm": 0.03735749423503876,
969
+ "learning_rate": 4.2275371687136396e-05,
970
+ "loss": 0.007,
971
  "step": 1220
972
  },
973
  {
974
+ "epoch": 10.336134453781513,
975
+ "grad_norm": 0.03190077841281891,
976
+ "learning_rate": 4.098254686489981e-05,
977
+ "loss": 0.0068,
978
  "step": 1230
979
  },
980
  {
981
+ "epoch": 10.420168067226891,
982
+ "grad_norm": 0.03304820880293846,
983
+ "learning_rate": 3.968972204266322e-05,
984
+ "loss": 0.0063,
985
  "step": 1240
986
  },
987
  {
988
+ "epoch": 10.504201680672269,
989
+ "grad_norm": 0.038498662412166595,
990
+ "learning_rate": 3.839689722042663e-05,
991
+ "loss": 0.0069,
992
  "step": 1250
993
  },
994
  {
995
+ "epoch": 10.588235294117647,
996
+ "grad_norm": 0.03530021384358406,
997
+ "learning_rate": 3.7104072398190046e-05,
998
+ "loss": 0.0067,
999
  "step": 1260
1000
  },
1001
  {
1002
+ "epoch": 10.672268907563025,
1003
+ "grad_norm": 0.041745755821466446,
1004
+ "learning_rate": 3.581124757595346e-05,
1005
+ "loss": 0.0063,
1006
  "step": 1270
1007
  },
1008
  {
1009
+ "epoch": 10.756302521008404,
1010
+ "grad_norm": 0.03443057835102081,
1011
+ "learning_rate": 3.451842275371687e-05,
1012
+ "loss": 0.0062,
1013
  "step": 1280
1014
  },
1015
  {
1016
+ "epoch": 10.840336134453782,
1017
+ "grad_norm": 0.029045993462204933,
1018
+ "learning_rate": 3.322559793148028e-05,
1019
+ "loss": 0.0063,
1020
  "step": 1290
1021
  },
1022
  {
1023
+ "epoch": 10.92436974789916,
1024
+ "grad_norm": 0.04033966362476349,
1025
+ "learning_rate": 3.1932773109243696e-05,
1026
+ "loss": 0.0065,
1027
  "step": 1300
1028
  },
1029
  {
1030
+ "epoch": 10.92436974789916,
1031
+ "eval_accuracy": 0.7783018867924528,
1032
+ "eval_loss": 1.1232017278671265,
1033
+ "eval_runtime": 2.9539,
1034
+ "eval_samples_per_second": 71.77,
1035
+ "eval_steps_per_second": 9.141,
1036
  "step": 1300
1037
  },
1038
  {
1039
+ "epoch": 11.008403361344538,
1040
+ "grad_norm": 0.029126280918717384,
1041
+ "learning_rate": 3.0639948287007115e-05,
1042
+ "loss": 0.0063,
1043
  "step": 1310
1044
  },
1045
  {
1046
+ "epoch": 11.092436974789916,
1047
+ "grad_norm": 0.02833595871925354,
1048
+ "learning_rate": 2.9347123464770527e-05,
1049
+ "loss": 0.0063,
1050
  "step": 1320
1051
  },
1052
  {
1053
+ "epoch": 11.176470588235293,
1054
+ "grad_norm": 0.032052479684352875,
1055
+ "learning_rate": 2.805429864253394e-05,
1056
+ "loss": 0.0063,
1057
  "step": 1330
1058
  },
1059
  {
1060
+ "epoch": 11.260504201680673,
1061
+ "grad_norm": 0.030251996591687202,
1062
+ "learning_rate": 2.676147382029735e-05,
1063
+ "loss": 0.006,
1064
  "step": 1340
1065
  },
1066
  {
1067
+ "epoch": 11.344537815126051,
1068
+ "grad_norm": 0.030112557113170624,
1069
+ "learning_rate": 2.546864899806076e-05,
1070
+ "loss": 0.0059,
1071
  "step": 1350
1072
  },
1073
  {
1074
+ "epoch": 11.428571428571429,
1075
+ "grad_norm": 0.027209602296352386,
1076
+ "learning_rate": 2.4175824175824177e-05,
1077
+ "loss": 0.0059,
1078
  "step": 1360
1079
  },
1080
  {
1081
+ "epoch": 11.512605042016807,
1082
+ "grad_norm": 0.027164172381162643,
1083
+ "learning_rate": 2.288299935358759e-05,
1084
+ "loss": 0.0057,
1085
  "step": 1370
1086
  },
1087
  {
1088
+ "epoch": 11.596638655462185,
1089
+ "grad_norm": 0.02858646586537361,
1090
+ "learning_rate": 2.1590174531351002e-05,
1091
+ "loss": 0.0058,
1092
  "step": 1380
1093
  },
1094
  {
1095
+ "epoch": 11.680672268907562,
1096
+ "grad_norm": 0.02894781529903412,
1097
+ "learning_rate": 2.0297349709114415e-05,
1098
+ "loss": 0.0055,
1099
  "step": 1390
1100
  },
1101
  {
1102
+ "epoch": 11.764705882352942,
1103
+ "grad_norm": 0.025563258677721024,
1104
+ "learning_rate": 1.9004524886877827e-05,
1105
+ "loss": 0.0056,
1106
  "step": 1400
1107
  },
1108
  {
1109
+ "epoch": 11.764705882352942,
1110
+ "eval_accuracy": 0.7830188679245284,
1111
+ "eval_loss": 1.1348851919174194,
1112
+ "eval_runtime": 2.1553,
1113
+ "eval_samples_per_second": 98.362,
1114
+ "eval_steps_per_second": 12.527,
1115
  "step": 1400
1116
  },
1117
  {
1118
+ "epoch": 11.84873949579832,
1119
+ "grad_norm": 0.027119316160678864,
1120
+ "learning_rate": 1.7711700064641243e-05,
1121
+ "loss": 0.0056,
1122
  "step": 1410
1123
  },
1124
  {
1125
+ "epoch": 11.932773109243698,
1126
+ "grad_norm": 0.02663271874189377,
1127
+ "learning_rate": 1.6418875242404656e-05,
1128
+ "loss": 0.0055,
1129
  "step": 1420
1130
  },
1131
  {
1132
+ "epoch": 12.016806722689076,
1133
+ "grad_norm": 0.027364488691091537,
1134
+ "learning_rate": 1.5126050420168067e-05,
1135
+ "loss": 0.0054,
1136
  "step": 1430
1137
  },
1138
  {
1139
+ "epoch": 12.100840336134453,
1140
+ "grad_norm": 0.02702498808503151,
1141
+ "learning_rate": 1.3833225597931483e-05,
1142
+ "loss": 0.0057,
1143
  "step": 1440
1144
  },
1145
  {
1146
+ "epoch": 12.184873949579831,
1147
+ "grad_norm": 0.02570091001689434,
1148
+ "learning_rate": 1.2540400775694893e-05,
1149
+ "loss": 0.0054,
1150
  "step": 1450
1151
  },
1152
  {
1153
+ "epoch": 12.268907563025211,
1154
+ "grad_norm": 0.02761007659137249,
1155
+ "learning_rate": 1.1247575953458308e-05,
1156
+ "loss": 0.0055,
1157
  "step": 1460
1158
  },
1159
  {
1160
+ "epoch": 12.352941176470589,
1161
+ "grad_norm": 0.02617548778653145,
1162
+ "learning_rate": 9.95475113122172e-06,
1163
+ "loss": 0.0055,
1164
  "step": 1470
1165
  },
1166
  {
1167
+ "epoch": 12.436974789915967,
1168
+ "grad_norm": 0.02675885520875454,
1169
+ "learning_rate": 8.661926308985133e-06,
1170
+ "loss": 0.0056,
1171
  "step": 1480
1172
  },
1173
  {
1174
+ "epoch": 12.521008403361344,
1175
+ "grad_norm": 0.029071761295199394,
1176
+ "learning_rate": 7.369101486748546e-06,
1177
+ "loss": 0.0052,
1178
  "step": 1490
1179
  },
1180
  {
1181
+ "epoch": 12.605042016806722,
1182
+ "grad_norm": 0.02562028169631958,
1183
+ "learning_rate": 6.076276664511959e-06,
1184
+ "loss": 0.0054,
1185
  "step": 1500
1186
  },
1187
  {
1188
+ "epoch": 12.605042016806722,
1189
+ "eval_accuracy": 0.7830188679245284,
1190
+ "eval_loss": 1.1463406085968018,
1191
+ "eval_runtime": 2.157,
1192
+ "eval_samples_per_second": 98.284,
1193
+ "eval_steps_per_second": 12.517,
1194
  "step": 1500
1195
  },
1196
  {
1197
+ "epoch": 12.6890756302521,
1198
+ "grad_norm": 0.024869520217180252,
1199
+ "learning_rate": 4.783451842275372e-06,
1200
+ "loss": 0.0055,
1201
  "step": 1510
1202
  },
1203
  {
1204
+ "epoch": 12.77310924369748,
1205
+ "grad_norm": 0.02748894691467285,
1206
+ "learning_rate": 3.490627020038785e-06,
1207
+ "loss": 0.0056,
1208
  "step": 1520
1209
  },
1210
  {
1211
+ "epoch": 12.857142857142858,
1212
+ "grad_norm": 0.026005534455180168,
1213
+ "learning_rate": 2.197802197802198e-06,
1214
+ "loss": 0.0056,
1215
  "step": 1530
1216
  },
1217
  {
1218
+ "epoch": 12.941176470588236,
1219
+ "grad_norm": 0.028039414435625076,
1220
+ "learning_rate": 9.04977375565611e-07,
1221
+ "loss": 0.0054,
1222
  "step": 1540
1223
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1224
  {
1225
  "epoch": 13.0,
1226
+ "step": 1547,
1227
+ "total_flos": 1.9140864535683072e+18,
1228
+ "train_loss": 0.35688263059153663,
1229
+ "train_runtime": 955.328,
1230
+ "train_samples_per_second": 25.855,
1231
+ "train_steps_per_second": 1.619
1232
  }
1233
  ],
1234
  "logging_steps": 10,
1235
+ "max_steps": 1547,
1236
  "num_input_tokens_seen": 0,
1237
  "num_train_epochs": 13,
1238
  "save_steps": 100,
 
1248
  "attributes": {}
1249
  }
1250
  },
1251
+ "total_flos": 1.9140864535683072e+18,
1252
  "train_batch_size": 16,
1253
  "trial_name": null,
1254
  "trial_params": null