Mayank1996 commited on
Commit
777a3d1
1 Parent(s): 487cfb9

End of training

Browse files
README.md CHANGED
@@ -18,9 +18,9 @@ This model is a fine-tuned version of [MCG-NJU/videomae-base](https://huggingfac
18
  It achieves the following results on the evaluation set:
19
  - eval_loss: 0.2618
20
  - eval_accuracy: 0.9412
21
- - eval_runtime: 88.9045
22
- - eval_samples_per_second: 0.574
23
- - eval_steps_per_second: 0.146
24
  - epoch: 24.0020
25
  - step: 1450
26
 
 
18
  It achieves the following results on the evaluation set:
19
  - eval_loss: 0.2618
20
  - eval_accuracy: 0.9412
21
+ - eval_runtime: 89.4759
22
+ - eval_samples_per_second: 0.57
23
+ - eval_steps_per_second: 0.145
24
  - epoch: 24.0020
25
  - step: 1450
26
 
all_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "eval_accuracy": 0.9411764705882353,
3
+ "eval_loss": 0.2618250548839569
4
+ }
runs/Sep25_06-15-09_stupa-ai/events.out.tfevents.1727244913.stupa-ai.2576179.4 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a635d2f7c69c778e88b1261273763c6dc50a82832dfbf747e197ada8fdb2661b
3
- size 44194
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5620cba702463d636611f5b00b9c706d377ad7ddab5e2d622825f3e873edd12
3
+ size 44517
test_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "eval_accuracy": 0.9411764705882353,
3
+ "eval_loss": 0.2618250548839569
4
+ }
trainer_state.json ADDED
@@ -0,0 +1,1291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9375,
3
+ "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset_fhbh/checkpoint-638",
4
+ "epoch": 24.0020350877193,
5
+ "eval_steps": 500,
6
+ "global_step": 1450,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0003508771929824561,
13
+ "grad_norm": 5.952354907989502,
14
+ "learning_rate": 1.7543859649122808e-07,
15
+ "loss": 0.745,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.0007017543859649122,
20
+ "grad_norm": 4.616081714630127,
21
+ "learning_rate": 3.5087719298245616e-07,
22
+ "loss": 0.6751,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.0010526315789473684,
27
+ "grad_norm": 14.299074172973633,
28
+ "learning_rate": 5.263157894736843e-07,
29
+ "loss": 0.7311,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.0014035087719298245,
34
+ "grad_norm": 9.126326560974121,
35
+ "learning_rate": 7.017543859649123e-07,
36
+ "loss": 0.6957,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.0017543859649122807,
41
+ "grad_norm": 6.692790985107422,
42
+ "learning_rate": 8.771929824561404e-07,
43
+ "loss": 0.7533,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.0020350877192982456,
48
+ "eval_accuracy": 0.5416666666666666,
49
+ "eval_loss": 0.677791178226471,
50
+ "eval_runtime": 78.366,
51
+ "eval_samples_per_second": 0.613,
52
+ "eval_steps_per_second": 0.153,
53
+ "step": 58
54
+ },
55
+ {
56
+ "epoch": 1.0000701754385966,
57
+ "grad_norm": 6.502946853637695,
58
+ "learning_rate": 1.0526315789473685e-06,
59
+ "loss": 0.7694,
60
+ "step": 60
61
+ },
62
+ {
63
+ "epoch": 1.0004210526315789,
64
+ "grad_norm": 11.516799926757812,
65
+ "learning_rate": 1.2280701754385965e-06,
66
+ "loss": 0.7382,
67
+ "step": 70
68
+ },
69
+ {
70
+ "epoch": 1.0007719298245614,
71
+ "grad_norm": 7.619742393493652,
72
+ "learning_rate": 1.4035087719298246e-06,
73
+ "loss": 0.6912,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 1.001122807017544,
78
+ "grad_norm": 5.542720794677734,
79
+ "learning_rate": 1.5789473684210528e-06,
80
+ "loss": 0.7054,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 1.0014736842105263,
85
+ "grad_norm": 7.172524929046631,
86
+ "learning_rate": 1.7543859649122807e-06,
87
+ "loss": 0.7533,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 1.0018245614035088,
92
+ "grad_norm": 6.668615341186523,
93
+ "learning_rate": 1.929824561403509e-06,
94
+ "loss": 0.7229,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 1.0020350877192983,
99
+ "eval_accuracy": 0.5416666666666666,
100
+ "eval_loss": 0.663836658000946,
101
+ "eval_runtime": 77.9477,
102
+ "eval_samples_per_second": 0.616,
103
+ "eval_steps_per_second": 0.154,
104
+ "step": 116
105
+ },
106
+ {
107
+ "epoch": 2.000140350877193,
108
+ "grad_norm": 4.909543991088867,
109
+ "learning_rate": 2.105263157894737e-06,
110
+ "loss": 0.6922,
111
+ "step": 120
112
+ },
113
+ {
114
+ "epoch": 2.0004912280701754,
115
+ "grad_norm": 9.0471830368042,
116
+ "learning_rate": 2.2807017543859652e-06,
117
+ "loss": 0.6736,
118
+ "step": 130
119
+ },
120
+ {
121
+ "epoch": 2.0008421052631578,
122
+ "grad_norm": 6.69089412689209,
123
+ "learning_rate": 2.456140350877193e-06,
124
+ "loss": 0.6865,
125
+ "step": 140
126
+ },
127
+ {
128
+ "epoch": 2.0011929824561405,
129
+ "grad_norm": 9.476597785949707,
130
+ "learning_rate": 2.631578947368421e-06,
131
+ "loss": 0.6844,
132
+ "step": 150
133
+ },
134
+ {
135
+ "epoch": 2.001543859649123,
136
+ "grad_norm": 7.067219257354736,
137
+ "learning_rate": 2.8070175438596493e-06,
138
+ "loss": 0.6768,
139
+ "step": 160
140
+ },
141
+ {
142
+ "epoch": 2.001894736842105,
143
+ "grad_norm": 5.748457908630371,
144
+ "learning_rate": 2.9824561403508774e-06,
145
+ "loss": 0.6827,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 2.0020350877192983,
150
+ "eval_accuracy": 0.6041666666666666,
151
+ "eval_loss": 0.6515334248542786,
152
+ "eval_runtime": 77.9754,
153
+ "eval_samples_per_second": 0.616,
154
+ "eval_steps_per_second": 0.154,
155
+ "step": 174
156
+ },
157
+ {
158
+ "epoch": 3.0002105263157897,
159
+ "grad_norm": 8.415090560913086,
160
+ "learning_rate": 3.1578947368421056e-06,
161
+ "loss": 0.7035,
162
+ "step": 180
163
+ },
164
+ {
165
+ "epoch": 3.000561403508772,
166
+ "grad_norm": 7.755239963531494,
167
+ "learning_rate": 3.3333333333333333e-06,
168
+ "loss": 0.712,
169
+ "step": 190
170
+ },
171
+ {
172
+ "epoch": 3.0009122807017543,
173
+ "grad_norm": 11.437898635864258,
174
+ "learning_rate": 3.5087719298245615e-06,
175
+ "loss": 0.6409,
176
+ "step": 200
177
+ },
178
+ {
179
+ "epoch": 3.0012631578947366,
180
+ "grad_norm": 6.896209239959717,
181
+ "learning_rate": 3.6842105263157892e-06,
182
+ "loss": 0.6862,
183
+ "step": 210
184
+ },
185
+ {
186
+ "epoch": 3.0016140350877194,
187
+ "grad_norm": 5.764392852783203,
188
+ "learning_rate": 3.859649122807018e-06,
189
+ "loss": 0.6459,
190
+ "step": 220
191
+ },
192
+ {
193
+ "epoch": 3.0019649122807017,
194
+ "grad_norm": 8.806387901306152,
195
+ "learning_rate": 4.035087719298246e-06,
196
+ "loss": 0.7322,
197
+ "step": 230
198
+ },
199
+ {
200
+ "epoch": 3.0020350877192983,
201
+ "eval_accuracy": 0.75,
202
+ "eval_loss": 0.6666872501373291,
203
+ "eval_runtime": 78.325,
204
+ "eval_samples_per_second": 0.613,
205
+ "eval_steps_per_second": 0.153,
206
+ "step": 232
207
+ },
208
+ {
209
+ "epoch": 4.000280701754386,
210
+ "grad_norm": 11.507173538208008,
211
+ "learning_rate": 4.210526315789474e-06,
212
+ "loss": 0.6937,
213
+ "step": 240
214
+ },
215
+ {
216
+ "epoch": 4.0006315789473685,
217
+ "grad_norm": 7.351099491119385,
218
+ "learning_rate": 4.3859649122807014e-06,
219
+ "loss": 0.6292,
220
+ "step": 250
221
+ },
222
+ {
223
+ "epoch": 4.000982456140351,
224
+ "grad_norm": 4.936241149902344,
225
+ "learning_rate": 4.5614035087719304e-06,
226
+ "loss": 0.6041,
227
+ "step": 260
228
+ },
229
+ {
230
+ "epoch": 4.001333333333333,
231
+ "grad_norm": 10.265213012695312,
232
+ "learning_rate": 4.736842105263159e-06,
233
+ "loss": 0.6616,
234
+ "step": 270
235
+ },
236
+ {
237
+ "epoch": 4.0016842105263155,
238
+ "grad_norm": 14.022355079650879,
239
+ "learning_rate": 4.912280701754386e-06,
240
+ "loss": 0.6489,
241
+ "step": 280
242
+ },
243
+ {
244
+ "epoch": 4.002035087719298,
245
+ "grad_norm": 14.538658142089844,
246
+ "learning_rate": 5.087719298245614e-06,
247
+ "loss": 0.6552,
248
+ "step": 290
249
+ },
250
+ {
251
+ "epoch": 4.002035087719298,
252
+ "eval_accuracy": 0.75,
253
+ "eval_loss": 0.6378026604652405,
254
+ "eval_runtime": 78.4115,
255
+ "eval_samples_per_second": 0.612,
256
+ "eval_steps_per_second": 0.153,
257
+ "step": 290
258
+ },
259
+ {
260
+ "epoch": 5.000350877192982,
261
+ "grad_norm": 6.908311367034912,
262
+ "learning_rate": 5.263157894736842e-06,
263
+ "loss": 0.6183,
264
+ "step": 300
265
+ },
266
+ {
267
+ "epoch": 5.000701754385965,
268
+ "grad_norm": 6.211957931518555,
269
+ "learning_rate": 5.43859649122807e-06,
270
+ "loss": 0.5759,
271
+ "step": 310
272
+ },
273
+ {
274
+ "epoch": 5.001052631578947,
275
+ "grad_norm": 4.951029300689697,
276
+ "learning_rate": 5.6140350877192985e-06,
277
+ "loss": 0.6144,
278
+ "step": 320
279
+ },
280
+ {
281
+ "epoch": 5.00140350877193,
282
+ "grad_norm": 8.593265533447266,
283
+ "learning_rate": 5.789473684210527e-06,
284
+ "loss": 0.5619,
285
+ "step": 330
286
+ },
287
+ {
288
+ "epoch": 5.0017543859649125,
289
+ "grad_norm": 19.80694007873535,
290
+ "learning_rate": 5.964912280701755e-06,
291
+ "loss": 0.4691,
292
+ "step": 340
293
+ },
294
+ {
295
+ "epoch": 5.002035087719298,
296
+ "eval_accuracy": 0.75,
297
+ "eval_loss": 0.5537357926368713,
298
+ "eval_runtime": 80.2663,
299
+ "eval_samples_per_second": 0.598,
300
+ "eval_steps_per_second": 0.15,
301
+ "step": 348
302
+ },
303
+ {
304
+ "epoch": 6.000070175438596,
305
+ "grad_norm": 19.27092170715332,
306
+ "learning_rate": 6.140350877192982e-06,
307
+ "loss": 0.5575,
308
+ "step": 350
309
+ },
310
+ {
311
+ "epoch": 6.000421052631579,
312
+ "grad_norm": 14.520448684692383,
313
+ "learning_rate": 6.315789473684211e-06,
314
+ "loss": 0.5209,
315
+ "step": 360
316
+ },
317
+ {
318
+ "epoch": 6.000771929824562,
319
+ "grad_norm": 13.577587127685547,
320
+ "learning_rate": 6.4912280701754385e-06,
321
+ "loss": 0.4873,
322
+ "step": 370
323
+ },
324
+ {
325
+ "epoch": 6.001122807017544,
326
+ "grad_norm": 2.4672834873199463,
327
+ "learning_rate": 6.666666666666667e-06,
328
+ "loss": 0.3996,
329
+ "step": 380
330
+ },
331
+ {
332
+ "epoch": 6.001473684210526,
333
+ "grad_norm": 29.06943702697754,
334
+ "learning_rate": 6.842105263157896e-06,
335
+ "loss": 0.58,
336
+ "step": 390
337
+ },
338
+ {
339
+ "epoch": 6.001824561403509,
340
+ "grad_norm": 10.214743614196777,
341
+ "learning_rate": 7.017543859649123e-06,
342
+ "loss": 0.6845,
343
+ "step": 400
344
+ },
345
+ {
346
+ "epoch": 6.002035087719298,
347
+ "eval_accuracy": 0.7083333333333334,
348
+ "eval_loss": 0.6998243927955627,
349
+ "eval_runtime": 81.5316,
350
+ "eval_samples_per_second": 0.589,
351
+ "eval_steps_per_second": 0.147,
352
+ "step": 406
353
+ },
354
+ {
355
+ "epoch": 7.000140350877193,
356
+ "grad_norm": 72.12657928466797,
357
+ "learning_rate": 7.192982456140351e-06,
358
+ "loss": 0.6733,
359
+ "step": 410
360
+ },
361
+ {
362
+ "epoch": 7.000491228070175,
363
+ "grad_norm": 5.446975231170654,
364
+ "learning_rate": 7.3684210526315784e-06,
365
+ "loss": 0.2873,
366
+ "step": 420
367
+ },
368
+ {
369
+ "epoch": 7.000842105263158,
370
+ "grad_norm": 9.24228286743164,
371
+ "learning_rate": 7.5438596491228074e-06,
372
+ "loss": 0.4578,
373
+ "step": 430
374
+ },
375
+ {
376
+ "epoch": 7.00119298245614,
377
+ "grad_norm": 1.2333711385726929,
378
+ "learning_rate": 7.719298245614036e-06,
379
+ "loss": 0.3516,
380
+ "step": 440
381
+ },
382
+ {
383
+ "epoch": 7.001543859649122,
384
+ "grad_norm": 6.666906833648682,
385
+ "learning_rate": 7.894736842105263e-06,
386
+ "loss": 0.5434,
387
+ "step": 450
388
+ },
389
+ {
390
+ "epoch": 7.001894736842106,
391
+ "grad_norm": 18.284526824951172,
392
+ "learning_rate": 8.070175438596492e-06,
393
+ "loss": 0.6754,
394
+ "step": 460
395
+ },
396
+ {
397
+ "epoch": 7.002035087719298,
398
+ "eval_accuracy": 0.875,
399
+ "eval_loss": 0.36466991901397705,
400
+ "eval_runtime": 80.8792,
401
+ "eval_samples_per_second": 0.593,
402
+ "eval_steps_per_second": 0.148,
403
+ "step": 464
404
+ },
405
+ {
406
+ "epoch": 8.00021052631579,
407
+ "grad_norm": 8.833359718322754,
408
+ "learning_rate": 8.245614035087721e-06,
409
+ "loss": 0.4877,
410
+ "step": 470
411
+ },
412
+ {
413
+ "epoch": 8.000561403508772,
414
+ "grad_norm": 10.950183868408203,
415
+ "learning_rate": 8.421052631578948e-06,
416
+ "loss": 0.3044,
417
+ "step": 480
418
+ },
419
+ {
420
+ "epoch": 8.000912280701755,
421
+ "grad_norm": 2.037674903869629,
422
+ "learning_rate": 8.596491228070176e-06,
423
+ "loss": 0.2232,
424
+ "step": 490
425
+ },
426
+ {
427
+ "epoch": 8.001263157894737,
428
+ "grad_norm": 78.8741455078125,
429
+ "learning_rate": 8.771929824561403e-06,
430
+ "loss": 0.1771,
431
+ "step": 500
432
+ },
433
+ {
434
+ "epoch": 8.00161403508772,
435
+ "grad_norm": 90.6770248413086,
436
+ "learning_rate": 8.947368421052632e-06,
437
+ "loss": 1.1209,
438
+ "step": 510
439
+ },
440
+ {
441
+ "epoch": 8.001964912280702,
442
+ "grad_norm": 39.13031768798828,
443
+ "learning_rate": 9.122807017543861e-06,
444
+ "loss": 0.8425,
445
+ "step": 520
446
+ },
447
+ {
448
+ "epoch": 8.002035087719298,
449
+ "eval_accuracy": 0.5416666666666666,
450
+ "eval_loss": 0.6199241876602173,
451
+ "eval_runtime": 81.9922,
452
+ "eval_samples_per_second": 0.585,
453
+ "eval_steps_per_second": 0.146,
454
+ "step": 522
455
+ },
456
+ {
457
+ "epoch": 9.000280701754386,
458
+ "grad_norm": 17.226152420043945,
459
+ "learning_rate": 9.298245614035088e-06,
460
+ "loss": 0.7695,
461
+ "step": 530
462
+ },
463
+ {
464
+ "epoch": 9.000631578947369,
465
+ "grad_norm": 12.632246971130371,
466
+ "learning_rate": 9.473684210526317e-06,
467
+ "loss": 0.5423,
468
+ "step": 540
469
+ },
470
+ {
471
+ "epoch": 9.00098245614035,
472
+ "grad_norm": 7.4788336753845215,
473
+ "learning_rate": 9.649122807017545e-06,
474
+ "loss": 0.6734,
475
+ "step": 550
476
+ },
477
+ {
478
+ "epoch": 9.001333333333333,
479
+ "grad_norm": 32.823486328125,
480
+ "learning_rate": 9.824561403508772e-06,
481
+ "loss": 0.4033,
482
+ "step": 560
483
+ },
484
+ {
485
+ "epoch": 9.001684210526316,
486
+ "grad_norm": 5.6088480949401855,
487
+ "learning_rate": 1e-05,
488
+ "loss": 0.2009,
489
+ "step": 570
490
+ },
491
+ {
492
+ "epoch": 9.002035087719298,
493
+ "grad_norm": 0.8267044425010681,
494
+ "learning_rate": 1.0175438596491228e-05,
495
+ "loss": 0.2276,
496
+ "step": 580
497
+ },
498
+ {
499
+ "epoch": 9.002035087719298,
500
+ "eval_accuracy": 0.7291666666666666,
501
+ "eval_loss": 0.9983854293823242,
502
+ "eval_runtime": 81.8828,
503
+ "eval_samples_per_second": 0.586,
504
+ "eval_steps_per_second": 0.147,
505
+ "step": 580
506
+ },
507
+ {
508
+ "epoch": 10.000350877192982,
509
+ "grad_norm": 0.6918083429336548,
510
+ "learning_rate": 1.0350877192982457e-05,
511
+ "loss": 0.4027,
512
+ "step": 590
513
+ },
514
+ {
515
+ "epoch": 10.000701754385965,
516
+ "grad_norm": 12.070817947387695,
517
+ "learning_rate": 1.0526315789473684e-05,
518
+ "loss": 0.1868,
519
+ "step": 600
520
+ },
521
+ {
522
+ "epoch": 10.001052631578947,
523
+ "grad_norm": 11.899476051330566,
524
+ "learning_rate": 1.0701754385964913e-05,
525
+ "loss": 0.8328,
526
+ "step": 610
527
+ },
528
+ {
529
+ "epoch": 10.00140350877193,
530
+ "grad_norm": 18.76070213317871,
531
+ "learning_rate": 1.087719298245614e-05,
532
+ "loss": 0.4753,
533
+ "step": 620
534
+ },
535
+ {
536
+ "epoch": 10.001754385964912,
537
+ "grad_norm": 15.813506126403809,
538
+ "learning_rate": 1.1052631578947368e-05,
539
+ "loss": 0.3953,
540
+ "step": 630
541
+ },
542
+ {
543
+ "epoch": 10.002035087719298,
544
+ "eval_accuracy": 0.9375,
545
+ "eval_loss": 0.3595670759677887,
546
+ "eval_runtime": 84.5422,
547
+ "eval_samples_per_second": 0.568,
548
+ "eval_steps_per_second": 0.142,
549
+ "step": 638
550
+ },
551
+ {
552
+ "epoch": 11.000070175438596,
553
+ "grad_norm": 2.381981372833252,
554
+ "learning_rate": 1.1228070175438597e-05,
555
+ "loss": 0.3252,
556
+ "step": 640
557
+ },
558
+ {
559
+ "epoch": 11.000421052631578,
560
+ "grad_norm": 8.495650291442871,
561
+ "learning_rate": 1.1403508771929824e-05,
562
+ "loss": 0.2205,
563
+ "step": 650
564
+ },
565
+ {
566
+ "epoch": 11.00077192982456,
567
+ "grad_norm": 0.5458263754844666,
568
+ "learning_rate": 1.1578947368421053e-05,
569
+ "loss": 0.4623,
570
+ "step": 660
571
+ },
572
+ {
573
+ "epoch": 11.001122807017543,
574
+ "grad_norm": 35.78744888305664,
575
+ "learning_rate": 1.1754385964912282e-05,
576
+ "loss": 0.4652,
577
+ "step": 670
578
+ },
579
+ {
580
+ "epoch": 11.001473684210527,
581
+ "grad_norm": 69.58731842041016,
582
+ "learning_rate": 1.192982456140351e-05,
583
+ "loss": 0.2175,
584
+ "step": 680
585
+ },
586
+ {
587
+ "epoch": 11.00182456140351,
588
+ "grad_norm": 80.09464263916016,
589
+ "learning_rate": 1.2105263157894737e-05,
590
+ "loss": 0.3255,
591
+ "step": 690
592
+ },
593
+ {
594
+ "epoch": 11.002035087719298,
595
+ "eval_accuracy": 0.9166666666666666,
596
+ "eval_loss": 0.39784160256385803,
597
+ "eval_runtime": 82.0895,
598
+ "eval_samples_per_second": 0.585,
599
+ "eval_steps_per_second": 0.146,
600
+ "step": 696
601
+ },
602
+ {
603
+ "epoch": 12.000140350877192,
604
+ "grad_norm": 0.08766458928585052,
605
+ "learning_rate": 1.2280701754385964e-05,
606
+ "loss": 0.0288,
607
+ "step": 700
608
+ },
609
+ {
610
+ "epoch": 12.000491228070176,
611
+ "grad_norm": 10.239900588989258,
612
+ "learning_rate": 1.2456140350877193e-05,
613
+ "loss": 0.2648,
614
+ "step": 710
615
+ },
616
+ {
617
+ "epoch": 12.000842105263159,
618
+ "grad_norm": 5.331236839294434,
619
+ "learning_rate": 1.2631578947368422e-05,
620
+ "loss": 0.3223,
621
+ "step": 720
622
+ },
623
+ {
624
+ "epoch": 12.001192982456141,
625
+ "grad_norm": 0.24060657620429993,
626
+ "learning_rate": 1.2807017543859651e-05,
627
+ "loss": 0.2808,
628
+ "step": 730
629
+ },
630
+ {
631
+ "epoch": 12.001543859649123,
632
+ "grad_norm": 0.31760913133621216,
633
+ "learning_rate": 1.2982456140350877e-05,
634
+ "loss": 0.2207,
635
+ "step": 740
636
+ },
637
+ {
638
+ "epoch": 12.001894736842106,
639
+ "grad_norm": 70.13704681396484,
640
+ "learning_rate": 1.3157894736842106e-05,
641
+ "loss": 0.2524,
642
+ "step": 750
643
+ },
644
+ {
645
+ "epoch": 12.002035087719298,
646
+ "eval_accuracy": 0.9375,
647
+ "eval_loss": 0.3351368010044098,
648
+ "eval_runtime": 80.8837,
649
+ "eval_samples_per_second": 0.593,
650
+ "eval_steps_per_second": 0.148,
651
+ "step": 754
652
+ },
653
+ {
654
+ "epoch": 13.00021052631579,
655
+ "grad_norm": 0.32135623693466187,
656
+ "learning_rate": 1.3333333333333333e-05,
657
+ "loss": 0.2225,
658
+ "step": 760
659
+ },
660
+ {
661
+ "epoch": 13.000561403508772,
662
+ "grad_norm": 21.094276428222656,
663
+ "learning_rate": 1.3508771929824562e-05,
664
+ "loss": 0.5212,
665
+ "step": 770
666
+ },
667
+ {
668
+ "epoch": 13.000912280701755,
669
+ "grad_norm": 0.08428701013326645,
670
+ "learning_rate": 1.3684210526315791e-05,
671
+ "loss": 0.4246,
672
+ "step": 780
673
+ },
674
+ {
675
+ "epoch": 13.001263157894737,
676
+ "grad_norm": 0.18355534970760345,
677
+ "learning_rate": 1.3859649122807017e-05,
678
+ "loss": 0.0793,
679
+ "step": 790
680
+ },
681
+ {
682
+ "epoch": 13.00161403508772,
683
+ "grad_norm": 8.33340072631836,
684
+ "learning_rate": 1.4035087719298246e-05,
685
+ "loss": 0.3384,
686
+ "step": 800
687
+ },
688
+ {
689
+ "epoch": 13.001964912280702,
690
+ "grad_norm": 0.7141004204750061,
691
+ "learning_rate": 1.4210526315789475e-05,
692
+ "loss": 0.5978,
693
+ "step": 810
694
+ },
695
+ {
696
+ "epoch": 13.002035087719298,
697
+ "eval_accuracy": 0.9375,
698
+ "eval_loss": 0.23082482814788818,
699
+ "eval_runtime": 81.747,
700
+ "eval_samples_per_second": 0.587,
701
+ "eval_steps_per_second": 0.147,
702
+ "step": 812
703
+ },
704
+ {
705
+ "epoch": 14.000280701754386,
706
+ "grad_norm": 0.15585492551326752,
707
+ "learning_rate": 1.4385964912280702e-05,
708
+ "loss": 0.122,
709
+ "step": 820
710
+ },
711
+ {
712
+ "epoch": 14.000631578947369,
713
+ "grad_norm": 49.04802322387695,
714
+ "learning_rate": 1.4561403508771931e-05,
715
+ "loss": 0.522,
716
+ "step": 830
717
+ },
718
+ {
719
+ "epoch": 14.00098245614035,
720
+ "grad_norm": 0.3657858967781067,
721
+ "learning_rate": 1.4736842105263157e-05,
722
+ "loss": 0.0476,
723
+ "step": 840
724
+ },
725
+ {
726
+ "epoch": 14.001333333333333,
727
+ "grad_norm": 0.05123307183384895,
728
+ "learning_rate": 1.4912280701754386e-05,
729
+ "loss": 0.2268,
730
+ "step": 850
731
+ },
732
+ {
733
+ "epoch": 14.001684210526316,
734
+ "grad_norm": 0.08785073459148407,
735
+ "learning_rate": 1.5087719298245615e-05,
736
+ "loss": 0.4392,
737
+ "step": 860
738
+ },
739
+ {
740
+ "epoch": 14.002035087719298,
741
+ "grad_norm": 0.33805736899375916,
742
+ "learning_rate": 1.5263157894736842e-05,
743
+ "loss": 0.1542,
744
+ "step": 870
745
+ },
746
+ {
747
+ "epoch": 14.002035087719298,
748
+ "eval_accuracy": 0.8958333333333334,
749
+ "eval_loss": 0.5762323141098022,
750
+ "eval_runtime": 82.832,
751
+ "eval_samples_per_second": 0.579,
752
+ "eval_steps_per_second": 0.145,
753
+ "step": 870
754
+ },
755
+ {
756
+ "epoch": 15.000350877192982,
757
+ "grad_norm": 0.06892251968383789,
758
+ "learning_rate": 1.543859649122807e-05,
759
+ "loss": 0.1377,
760
+ "step": 880
761
+ },
762
+ {
763
+ "epoch": 15.000701754385965,
764
+ "grad_norm": 0.07005161046981812,
765
+ "learning_rate": 1.56140350877193e-05,
766
+ "loss": 0.0053,
767
+ "step": 890
768
+ },
769
+ {
770
+ "epoch": 15.001052631578947,
771
+ "grad_norm": 0.03198734670877457,
772
+ "learning_rate": 1.5789473684210526e-05,
773
+ "loss": 0.5775,
774
+ "step": 900
775
+ },
776
+ {
777
+ "epoch": 15.00140350877193,
778
+ "grad_norm": 171.48255920410156,
779
+ "learning_rate": 1.5964912280701755e-05,
780
+ "loss": 0.3737,
781
+ "step": 910
782
+ },
783
+ {
784
+ "epoch": 15.001754385964912,
785
+ "grad_norm": 0.4068077504634857,
786
+ "learning_rate": 1.6140350877192984e-05,
787
+ "loss": 0.3073,
788
+ "step": 920
789
+ },
790
+ {
791
+ "epoch": 15.002035087719298,
792
+ "eval_accuracy": 0.8958333333333334,
793
+ "eval_loss": 0.33416375517845154,
794
+ "eval_runtime": 83.3591,
795
+ "eval_samples_per_second": 0.576,
796
+ "eval_steps_per_second": 0.144,
797
+ "step": 928
798
+ },
799
+ {
800
+ "epoch": 16.000070175438598,
801
+ "grad_norm": 0.3335668444633484,
802
+ "learning_rate": 1.6315789473684213e-05,
803
+ "loss": 0.7197,
804
+ "step": 930
805
+ },
806
+ {
807
+ "epoch": 16.00042105263158,
808
+ "grad_norm": 1.4757983684539795,
809
+ "learning_rate": 1.6491228070175442e-05,
810
+ "loss": 0.2539,
811
+ "step": 940
812
+ },
813
+ {
814
+ "epoch": 16.000771929824563,
815
+ "grad_norm": 0.17356331646442413,
816
+ "learning_rate": 1.6666666666666667e-05,
817
+ "loss": 0.0063,
818
+ "step": 950
819
+ },
820
+ {
821
+ "epoch": 16.001122807017545,
822
+ "grad_norm": 0.1452503204345703,
823
+ "learning_rate": 1.6842105263157896e-05,
824
+ "loss": 0.5967,
825
+ "step": 960
826
+ },
827
+ {
828
+ "epoch": 16.001473684210527,
829
+ "grad_norm": 0.1030503362417221,
830
+ "learning_rate": 1.7017543859649125e-05,
831
+ "loss": 0.6578,
832
+ "step": 970
833
+ },
834
+ {
835
+ "epoch": 16.00182456140351,
836
+ "grad_norm": 12.400784492492676,
837
+ "learning_rate": 1.719298245614035e-05,
838
+ "loss": 0.5518,
839
+ "step": 980
840
+ },
841
+ {
842
+ "epoch": 16.0020350877193,
843
+ "eval_accuracy": 0.8541666666666666,
844
+ "eval_loss": 0.4223368465900421,
845
+ "eval_runtime": 83.4362,
846
+ "eval_samples_per_second": 0.575,
847
+ "eval_steps_per_second": 0.144,
848
+ "step": 986
849
+ },
850
+ {
851
+ "epoch": 17.000140350877192,
852
+ "grad_norm": 0.28909754753112793,
853
+ "learning_rate": 1.736842105263158e-05,
854
+ "loss": 0.2008,
855
+ "step": 990
856
+ },
857
+ {
858
+ "epoch": 17.000491228070175,
859
+ "grad_norm": 0.21579360961914062,
860
+ "learning_rate": 1.7543859649122806e-05,
861
+ "loss": 0.3298,
862
+ "step": 1000
863
+ },
864
+ {
865
+ "epoch": 17.000842105263157,
866
+ "grad_norm": 0.10615105926990509,
867
+ "learning_rate": 1.7719298245614035e-05,
868
+ "loss": 0.004,
869
+ "step": 1010
870
+ },
871
+ {
872
+ "epoch": 17.00119298245614,
873
+ "grad_norm": 0.046201951801776886,
874
+ "learning_rate": 1.7894736842105264e-05,
875
+ "loss": 0.3526,
876
+ "step": 1020
877
+ },
878
+ {
879
+ "epoch": 17.00154385964912,
880
+ "grad_norm": 0.06010481342673302,
881
+ "learning_rate": 1.8070175438596493e-05,
882
+ "loss": 0.3399,
883
+ "step": 1030
884
+ },
885
+ {
886
+ "epoch": 17.001894736842104,
887
+ "grad_norm": 8.584966659545898,
888
+ "learning_rate": 1.8245614035087722e-05,
889
+ "loss": 0.6157,
890
+ "step": 1040
891
+ },
892
+ {
893
+ "epoch": 17.0020350877193,
894
+ "eval_accuracy": 0.9375,
895
+ "eval_loss": 0.17038817703723907,
896
+ "eval_runtime": 83.7401,
897
+ "eval_samples_per_second": 0.573,
898
+ "eval_steps_per_second": 0.143,
899
+ "step": 1044
900
+ },
901
+ {
902
+ "epoch": 18.00021052631579,
903
+ "grad_norm": 0.774956464767456,
904
+ "learning_rate": 1.8421052631578947e-05,
905
+ "loss": 0.1596,
906
+ "step": 1050
907
+ },
908
+ {
909
+ "epoch": 18.000561403508772,
910
+ "grad_norm": 0.36749064922332764,
911
+ "learning_rate": 1.8596491228070176e-05,
912
+ "loss": 0.2122,
913
+ "step": 1060
914
+ },
915
+ {
916
+ "epoch": 18.000912280701755,
917
+ "grad_norm": 0.06645552814006805,
918
+ "learning_rate": 1.8771929824561405e-05,
919
+ "loss": 0.2568,
920
+ "step": 1070
921
+ },
922
+ {
923
+ "epoch": 18.001263157894737,
924
+ "grad_norm": 0.021599041298031807,
925
+ "learning_rate": 1.8947368421052634e-05,
926
+ "loss": 0.283,
927
+ "step": 1080
928
+ },
929
+ {
930
+ "epoch": 18.00161403508772,
931
+ "grad_norm": 113.25637817382812,
932
+ "learning_rate": 1.9122807017543863e-05,
933
+ "loss": 0.3591,
934
+ "step": 1090
935
+ },
936
+ {
937
+ "epoch": 18.0019649122807,
938
+ "grad_norm": 0.21973834931850433,
939
+ "learning_rate": 1.929824561403509e-05,
940
+ "loss": 0.2544,
941
+ "step": 1100
942
+ },
943
+ {
944
+ "epoch": 18.0020350877193,
945
+ "eval_accuracy": 0.9166666666666666,
946
+ "eval_loss": 0.35440635681152344,
947
+ "eval_runtime": 82.2034,
948
+ "eval_samples_per_second": 0.584,
949
+ "eval_steps_per_second": 0.146,
950
+ "step": 1102
951
+ },
952
+ {
953
+ "epoch": 19.000280701754384,
954
+ "grad_norm": 0.06097158417105675,
955
+ "learning_rate": 1.9473684210526315e-05,
956
+ "loss": 0.3663,
957
+ "step": 1110
958
+ },
959
+ {
960
+ "epoch": 19.000631578947367,
961
+ "grad_norm": 25.72997283935547,
962
+ "learning_rate": 1.9649122807017544e-05,
963
+ "loss": 0.8104,
964
+ "step": 1120
965
+ },
966
+ {
967
+ "epoch": 19.000982456140353,
968
+ "grad_norm": 0.5115303993225098,
969
+ "learning_rate": 1.9824561403508773e-05,
970
+ "loss": 0.2474,
971
+ "step": 1130
972
+ },
973
+ {
974
+ "epoch": 19.001333333333335,
975
+ "grad_norm": 0.27492067217826843,
976
+ "learning_rate": 2e-05,
977
+ "loss": 0.3686,
978
+ "step": 1140
979
+ },
980
+ {
981
+ "epoch": 19.001684210526317,
982
+ "grad_norm": 22.944690704345703,
983
+ "learning_rate": 2.0175438596491227e-05,
984
+ "loss": 0.2315,
985
+ "step": 1150
986
+ },
987
+ {
988
+ "epoch": 19.0020350877193,
989
+ "grad_norm": 0.11991500854492188,
990
+ "learning_rate": 2.0350877192982456e-05,
991
+ "loss": 0.4036,
992
+ "step": 1160
993
+ },
994
+ {
995
+ "epoch": 19.0020350877193,
996
+ "eval_accuracy": 0.9166666666666666,
997
+ "eval_loss": 0.25051262974739075,
998
+ "eval_runtime": 80.7899,
999
+ "eval_samples_per_second": 0.594,
1000
+ "eval_steps_per_second": 0.149,
1001
+ "step": 1160
1002
+ },
1003
+ {
1004
+ "epoch": 20.000350877192982,
1005
+ "grad_norm": 0.44547587633132935,
1006
+ "learning_rate": 2.0526315789473685e-05,
1007
+ "loss": 0.1078,
1008
+ "step": 1170
1009
+ },
1010
+ {
1011
+ "epoch": 20.000701754385965,
1012
+ "grad_norm": 76.07775115966797,
1013
+ "learning_rate": 2.0701754385964914e-05,
1014
+ "loss": 0.4915,
1015
+ "step": 1180
1016
+ },
1017
+ {
1018
+ "epoch": 20.001052631578947,
1019
+ "grad_norm": 0.349282830953598,
1020
+ "learning_rate": 2.0877192982456143e-05,
1021
+ "loss": 0.2929,
1022
+ "step": 1190
1023
+ },
1024
+ {
1025
+ "epoch": 20.00140350877193,
1026
+ "grad_norm": 8.304322242736816,
1027
+ "learning_rate": 2.105263157894737e-05,
1028
+ "loss": 0.219,
1029
+ "step": 1200
1030
+ },
1031
+ {
1032
+ "epoch": 20.00175438596491,
1033
+ "grad_norm": 0.08941491693258286,
1034
+ "learning_rate": 2.1228070175438598e-05,
1035
+ "loss": 0.2382,
1036
+ "step": 1210
1037
+ },
1038
+ {
1039
+ "epoch": 20.0020350877193,
1040
+ "eval_accuracy": 0.9375,
1041
+ "eval_loss": 0.3155660927295685,
1042
+ "eval_runtime": 82.6296,
1043
+ "eval_samples_per_second": 0.581,
1044
+ "eval_steps_per_second": 0.145,
1045
+ "step": 1218
1046
+ },
1047
+ {
1048
+ "epoch": 21.000070175438598,
1049
+ "grad_norm": 6.294134140014648,
1050
+ "learning_rate": 2.1403508771929827e-05,
1051
+ "loss": 0.2611,
1052
+ "step": 1220
1053
+ },
1054
+ {
1055
+ "epoch": 21.00042105263158,
1056
+ "grad_norm": 0.11261521279811859,
1057
+ "learning_rate": 2.1578947368421053e-05,
1058
+ "loss": 0.1969,
1059
+ "step": 1230
1060
+ },
1061
+ {
1062
+ "epoch": 21.000771929824563,
1063
+ "grad_norm": 0.2796896696090698,
1064
+ "learning_rate": 2.175438596491228e-05,
1065
+ "loss": 0.2955,
1066
+ "step": 1240
1067
+ },
1068
+ {
1069
+ "epoch": 21.001122807017545,
1070
+ "grad_norm": 0.07930008322000504,
1071
+ "learning_rate": 2.1929824561403507e-05,
1072
+ "loss": 0.013,
1073
+ "step": 1250
1074
+ },
1075
+ {
1076
+ "epoch": 21.001473684210527,
1077
+ "grad_norm": 5.909428119659424,
1078
+ "learning_rate": 2.2105263157894736e-05,
1079
+ "loss": 0.3568,
1080
+ "step": 1260
1081
+ },
1082
+ {
1083
+ "epoch": 21.00182456140351,
1084
+ "grad_norm": 168.33380126953125,
1085
+ "learning_rate": 2.2280701754385965e-05,
1086
+ "loss": 0.6751,
1087
+ "step": 1270
1088
+ },
1089
+ {
1090
+ "epoch": 21.0020350877193,
1091
+ "eval_accuracy": 0.9375,
1092
+ "eval_loss": 0.259630411863327,
1093
+ "eval_runtime": 82.1271,
1094
+ "eval_samples_per_second": 0.584,
1095
+ "eval_steps_per_second": 0.146,
1096
+ "step": 1276
1097
+ },
1098
+ {
1099
+ "epoch": 22.000140350877192,
1100
+ "grad_norm": 0.22503353655338287,
1101
+ "learning_rate": 2.2456140350877194e-05,
1102
+ "loss": 0.3249,
1103
+ "step": 1280
1104
+ },
1105
+ {
1106
+ "epoch": 22.000491228070175,
1107
+ "grad_norm": 0.2562604248523712,
1108
+ "learning_rate": 2.2631578947368423e-05,
1109
+ "loss": 0.2267,
1110
+ "step": 1290
1111
+ },
1112
+ {
1113
+ "epoch": 22.000842105263157,
1114
+ "grad_norm": 0.6118970513343811,
1115
+ "learning_rate": 2.280701754385965e-05,
1116
+ "loss": 0.7495,
1117
+ "step": 1300
1118
+ },
1119
+ {
1120
+ "epoch": 22.00119298245614,
1121
+ "grad_norm": 0.2397994101047516,
1122
+ "learning_rate": 2.2982456140350878e-05,
1123
+ "loss": 0.0388,
1124
+ "step": 1310
1125
+ },
1126
+ {
1127
+ "epoch": 22.00154385964912,
1128
+ "grad_norm": 0.10384727269411087,
1129
+ "learning_rate": 2.3157894736842107e-05,
1130
+ "loss": 0.3285,
1131
+ "step": 1320
1132
+ },
1133
+ {
1134
+ "epoch": 22.001894736842104,
1135
+ "grad_norm": 0.0419117733836174,
1136
+ "learning_rate": 2.3333333333333336e-05,
1137
+ "loss": 0.2848,
1138
+ "step": 1330
1139
+ },
1140
+ {
1141
+ "epoch": 22.0020350877193,
1142
+ "eval_accuracy": 0.8125,
1143
+ "eval_loss": 0.822706937789917,
1144
+ "eval_runtime": 83.5818,
1145
+ "eval_samples_per_second": 0.574,
1146
+ "eval_steps_per_second": 0.144,
1147
+ "step": 1334
1148
+ },
1149
+ {
1150
+ "epoch": 23.00021052631579,
1151
+ "grad_norm": 121.5499038696289,
1152
+ "learning_rate": 2.3508771929824565e-05,
1153
+ "loss": 0.5364,
1154
+ "step": 1340
1155
+ },
1156
+ {
1157
+ "epoch": 23.000561403508772,
1158
+ "grad_norm": 0.10266309231519699,
1159
+ "learning_rate": 2.368421052631579e-05,
1160
+ "loss": 0.8097,
1161
+ "step": 1350
1162
+ },
1163
+ {
1164
+ "epoch": 23.000912280701755,
1165
+ "grad_norm": 9.736127853393555,
1166
+ "learning_rate": 2.385964912280702e-05,
1167
+ "loss": 0.6052,
1168
+ "step": 1360
1169
+ },
1170
+ {
1171
+ "epoch": 23.001263157894737,
1172
+ "grad_norm": 4.3637471199035645,
1173
+ "learning_rate": 2.4035087719298245e-05,
1174
+ "loss": 0.3504,
1175
+ "step": 1370
1176
+ },
1177
+ {
1178
+ "epoch": 23.00161403508772,
1179
+ "grad_norm": 0.19882246851921082,
1180
+ "learning_rate": 2.4210526315789474e-05,
1181
+ "loss": 0.3784,
1182
+ "step": 1380
1183
+ },
1184
+ {
1185
+ "epoch": 23.0019649122807,
1186
+ "grad_norm": 0.27082210779190063,
1187
+ "learning_rate": 2.4385964912280703e-05,
1188
+ "loss": 0.1225,
1189
+ "step": 1390
1190
+ },
1191
+ {
1192
+ "epoch": 23.0020350877193,
1193
+ "eval_accuracy": 0.9375,
1194
+ "eval_loss": 0.2921377420425415,
1195
+ "eval_runtime": 82.9556,
1196
+ "eval_samples_per_second": 0.579,
1197
+ "eval_steps_per_second": 0.145,
1198
+ "step": 1392
1199
+ },
1200
+ {
1201
+ "epoch": 24.000280701754384,
1202
+ "grad_norm": 0.2171986997127533,
1203
+ "learning_rate": 2.456140350877193e-05,
1204
+ "loss": 0.1094,
1205
+ "step": 1400
1206
+ },
1207
+ {
1208
+ "epoch": 24.000631578947367,
1209
+ "grad_norm": 0.21692253649234772,
1210
+ "learning_rate": 2.4736842105263158e-05,
1211
+ "loss": 0.3332,
1212
+ "step": 1410
1213
+ },
1214
+ {
1215
+ "epoch": 24.000982456140353,
1216
+ "grad_norm": 0.3834693729877472,
1217
+ "learning_rate": 2.4912280701754387e-05,
1218
+ "loss": 0.2847,
1219
+ "step": 1420
1220
+ },
1221
+ {
1222
+ "epoch": 24.001333333333335,
1223
+ "grad_norm": 0.08816500753164291,
1224
+ "learning_rate": 2.5087719298245616e-05,
1225
+ "loss": 0.1147,
1226
+ "step": 1430
1227
+ },
1228
+ {
1229
+ "epoch": 24.001684210526317,
1230
+ "grad_norm": 0.21103212237358093,
1231
+ "learning_rate": 2.5263157894736845e-05,
1232
+ "loss": 0.4283,
1233
+ "step": 1440
1234
+ },
1235
+ {
1236
+ "epoch": 24.0020350877193,
1237
+ "grad_norm": 0.27631059288978577,
1238
+ "learning_rate": 2.5438596491228074e-05,
1239
+ "loss": 0.616,
1240
+ "step": 1450
1241
+ },
1242
+ {
1243
+ "epoch": 24.0020350877193,
1244
+ "eval_accuracy": 0.9375,
1245
+ "eval_loss": 0.2928893566131592,
1246
+ "eval_runtime": 81.6066,
1247
+ "eval_samples_per_second": 0.588,
1248
+ "eval_steps_per_second": 0.147,
1249
+ "step": 1450
1250
+ },
1251
+ {
1252
+ "epoch": 24.0020350877193,
1253
+ "eval_accuracy": 0.9411764705882353,
1254
+ "eval_loss": 0.2618250548839569,
1255
+ "eval_runtime": 88.9045,
1256
+ "eval_samples_per_second": 0.574,
1257
+ "eval_steps_per_second": 0.146,
1258
+ "step": 1450
1259
+ },
1260
+ {
1261
+ "epoch": 24.0020350877193,
1262
+ "eval_accuracy": 0.9411764705882353,
1263
+ "eval_loss": 0.2618250548839569,
1264
+ "eval_runtime": 89.4759,
1265
+ "eval_samples_per_second": 0.57,
1266
+ "eval_steps_per_second": 0.145,
1267
+ "step": 1450
1268
+ }
1269
+ ],
1270
+ "logging_steps": 10,
1271
+ "max_steps": 28500,
1272
+ "num_input_tokens_seen": 0,
1273
+ "num_train_epochs": 9223372036854775807,
1274
+ "save_steps": 500,
1275
+ "stateful_callbacks": {
1276
+ "TrainerControl": {
1277
+ "args": {
1278
+ "should_epoch_stop": false,
1279
+ "should_evaluate": false,
1280
+ "should_log": false,
1281
+ "should_save": true,
1282
+ "should_training_stop": false
1283
+ },
1284
+ "attributes": {}
1285
+ }
1286
+ },
1287
+ "total_flos": 7.164871389462528e+18,
1288
+ "train_batch_size": 4,
1289
+ "trial_name": null,
1290
+ "trial_params": null
1291
+ }