Gizachew committed on
Commit
ced20fd
1 Parent(s): 8a2672c

End of training

Browse files
README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [facebook/hubert-large-ls960-ft](https://huggingface.co/facebook/hubert-large-ls960-ft) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.4469
21
- - Accuracy: 0.9260
22
 
23
  ## Model description
24
 
@@ -38,8 +38,8 @@ More information needed
38
 
39
  The following hyperparameters were used during training:
40
  - learning_rate: 1e-05
41
- - train_batch_size: 4
42
- - eval_batch_size: 4
43
  - seed: 42
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: linear
@@ -50,15 +50,13 @@ The following hyperparameters were used during training:
50
 
51
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
  |:-------------:|:------:|:----:|:---------------:|:--------:|
53
- | 1.0337 | 0.1664 | 500 | 0.9555 | 0.6861 |
54
- | 0.5792 | 0.3328 | 1000 | 0.6325 | 0.8535 |
55
- | 0.3656 | 0.4992 | 1500 | 0.5913 | 0.8789 |
56
- | 0.33 | 0.6656 | 2000 | 0.4296 | 0.9118 |
57
- | 0.2212 | 0.8319 | 2500 | 0.4698 | 0.9155 |
58
- | 0.364 | 0.9983 | 3000 | 0.4107 | 0.9133 |
59
- | 0.2371 | 1.1647 | 3500 | 0.5081 | 0.9118 |
60
- | 0.3086 | 1.3311 | 4000 | 0.4347 | 0.9275 |
61
- | 0.1531 | 1.4975 | 4500 | 0.4469 | 0.9260 |
62
 
63
 
64
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [facebook/hubert-large-ls960-ft](https://huggingface.co/facebook/hubert-large-ls960-ft) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.3169
21
+ - Accuracy: 0.9395
22
 
23
  ## Model description
24
 
 
38
 
39
  The following hyperparameters were used during training:
40
  - learning_rate: 1e-05
41
+ - train_batch_size: 16
42
+ - eval_batch_size: 8
43
  - seed: 42
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: linear
 
50
 
51
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
  |:-------------:|:------:|:----:|:---------------:|:--------:|
53
+ | 0.5275 | 0.6649 | 500 | 0.4740 | 0.8483 |
54
+ | 0.2923 | 1.3298 | 1000 | 0.3727 | 0.8999 |
55
+ | 0.2168 | 1.9947 | 1500 | 0.4175 | 0.9073 |
56
+ | 0.1442 | 2.6596 | 2000 | 0.3471 | 0.9312 |
57
+ | 0.1341 | 3.3245 | 2500 | 0.3641 | 0.9275 |
58
+ | 0.1136 | 3.9894 | 3000 | 0.3286 | 0.9432 |
59
+ | 0.1102 | 4.6543 | 3500 | 0.3169 | 0.9395 |
 
 
60
 
61
 
62
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.497504159733777,
3
- "total_flos": 2.060256870141264e+18,
4
- "train_loss": 0.4611078107621935,
5
- "train_runtime": 3912.8249,
6
  "train_samples": 12018,
7
- "train_samples_per_second": 15.357,
8
- "train_steps_per_second": 3.84
9
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 7.814523952420126e+18,
4
+ "train_loss": 0.29202749221882923,
5
+ "train_runtime": 8842.7154,
6
  "train_samples": 12018,
7
+ "train_samples_per_second": 6.795,
8
+ "train_steps_per_second": 0.425
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e4e2f75ffdee20cd0081352b3a7a2b9bfda1c2a1f2eec4dccb4cc5b3327c44a
3
  size 1266025756
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f556ced5fb4077887ee371d074f7dbebe2d2b95d4386e3a614e1f69724268478
3
  size 1266025756
runs/Jun15_14-11-05_9fe18ef6c7ec/events.out.tfevents.1718460680.9fe18ef6c7ec.34.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:147afdbad27934d690be17e3bbb6539effe150558c95e1c9c0c417bd0bc591c2
3
+ size 16477
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.497504159733777,
3
- "total_flos": 2.060256870141264e+18,
4
- "train_loss": 0.4611078107621935,
5
- "train_runtime": 3912.8249,
6
  "train_samples": 12018,
7
- "train_samples_per_second": 15.357,
8
- "train_steps_per_second": 3.84
9
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 7.814523952420126e+18,
4
+ "train_loss": 0.29202749221882923,
5
+ "train_runtime": 8842.7154,
6
  "train_samples": 12018,
7
+ "train_samples_per_second": 6.795,
8
+ "train_steps_per_second": 0.425
9
  }
trainer_state.json CHANGED
@@ -1,421 +1,347 @@
1
  {
2
- "best_metric": 0.4106574058532715,
3
- "best_model_checkpoint": "/kaggle/working/hubert-agum960-amharic/checkpoint-3000",
4
- "epoch": 1.497504159733777,
5
  "eval_steps": 500,
6
- "global_step": 4500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.033277870216306155,
13
- "grad_norm": 3.5415408611297607,
14
- "learning_rate": 9.933444259567388e-06,
15
- "loss": 1.6,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.06655574043261231,
20
- "grad_norm": 3.9917523860931396,
21
- "learning_rate": 9.866888519134776e-06,
22
- "loss": 1.5237,
23
  "step": 200
24
  },
25
  {
26
- "epoch": 0.09983361064891846,
27
- "grad_norm": NaN,
28
- "learning_rate": 9.80099833610649e-06,
29
- "loss": 1.3107,
30
  "step": 300
31
  },
32
  {
33
- "epoch": 0.13311148086522462,
34
- "grad_norm": 19.04720687866211,
35
- "learning_rate": 9.735108153078204e-06,
36
- "loss": 1.1818,
37
  "step": 400
38
  },
39
  {
40
- "epoch": 0.16638935108153077,
41
- "grad_norm": 4.309573650360107,
42
- "learning_rate": 9.668552412645591e-06,
43
- "loss": 1.0337,
44
  "step": 500
45
  },
46
  {
47
- "epoch": 0.16638935108153077,
48
- "eval_accuracy": 0.6860986351966858,
49
- "eval_loss": 0.9554809927940369,
50
- "eval_runtime": 125.3513,
51
- "eval_samples_per_second": 10.674,
52
- "eval_steps_per_second": 2.672,
53
  "step": 500
54
  },
55
  {
56
- "epoch": 0.19966722129783693,
57
- "grad_norm": 5.9266791343688965,
58
- "learning_rate": 9.601996672212978e-06,
59
- "loss": 0.8586,
60
  "step": 600
61
  },
62
  {
63
- "epoch": 0.23294509151414308,
64
- "grad_norm": 25.942941665649414,
65
- "learning_rate": 9.535440931780367e-06,
66
- "loss": 0.7985,
67
  "step": 700
68
  },
69
  {
70
- "epoch": 0.26622296173044924,
71
- "grad_norm": 23.755361557006836,
72
- "learning_rate": 9.469550748752082e-06,
73
- "loss": 0.6843,
74
  "step": 800
75
  },
76
  {
77
- "epoch": 0.2995008319467554,
78
- "grad_norm": 5.648043155670166,
79
- "learning_rate": 9.402995008319469e-06,
80
- "loss": 0.5446,
81
  "step": 900
82
  },
83
  {
84
- "epoch": 0.33277870216306155,
85
- "grad_norm": 1.2758762836456299,
86
- "learning_rate": 9.337104825291182e-06,
87
- "loss": 0.5792,
88
  "step": 1000
89
  },
90
  {
91
- "epoch": 0.33277870216306155,
92
- "eval_accuracy": 0.853512704372406,
93
- "eval_loss": 0.6325268745422363,
94
- "eval_runtime": 121.8954,
95
- "eval_samples_per_second": 10.977,
96
- "eval_steps_per_second": 2.748,
97
  "step": 1000
98
  },
99
  {
100
- "epoch": 0.36605657237936773,
101
- "grad_norm": 1.7034721374511719,
102
- "learning_rate": 9.27054908485857e-06,
103
- "loss": 0.4254,
104
  "step": 1100
105
  },
106
  {
107
- "epoch": 0.39933444259567386,
108
- "grad_norm": 0.4905766546726227,
109
- "learning_rate": 9.203993344425958e-06,
110
- "loss": 0.5024,
111
  "step": 1200
112
  },
113
  {
114
- "epoch": 0.43261231281198004,
115
- "grad_norm": 1.4882396459579468,
116
- "learning_rate": 9.137437603993346e-06,
117
- "loss": 0.4714,
118
  "step": 1300
119
  },
120
  {
121
- "epoch": 0.46589018302828616,
122
- "grad_norm": 0.21027016639709473,
123
- "learning_rate": 9.070881863560733e-06,
124
- "loss": 0.4184,
125
  "step": 1400
126
  },
127
  {
128
- "epoch": 0.49916805324459235,
129
- "grad_norm": 0.40890273451805115,
130
- "learning_rate": 9.004326123128122e-06,
131
- "loss": 0.3656,
132
  "step": 1500
133
  },
134
  {
135
- "epoch": 0.49916805324459235,
136
- "eval_accuracy": 0.878923773765564,
137
- "eval_loss": 0.5912803411483765,
138
- "eval_runtime": 121.7902,
139
- "eval_samples_per_second": 10.986,
140
- "eval_steps_per_second": 2.751,
141
  "step": 1500
142
  },
143
  {
144
- "epoch": 0.5324459234608985,
145
- "grad_norm": 27.935806274414062,
146
- "learning_rate": 8.937770382695509e-06,
147
- "loss": 0.4205,
148
  "step": 1600
149
  },
150
  {
151
- "epoch": 0.5657237936772047,
152
- "grad_norm": 13.320003509521484,
153
- "learning_rate": 8.871880199667222e-06,
154
- "loss": 0.3586,
155
  "step": 1700
156
  },
157
  {
158
- "epoch": 0.5990016638935108,
159
- "grad_norm": 1.6185659170150757,
160
- "learning_rate": 8.80532445923461e-06,
161
- "loss": 0.3782,
162
  "step": 1800
163
  },
164
  {
165
- "epoch": 0.632279534109817,
166
- "grad_norm": 1.0840280055999756,
167
- "learning_rate": 8.738768718801997e-06,
168
- "loss": 0.3585,
169
  "step": 1900
170
  },
171
  {
172
- "epoch": 0.6655574043261231,
173
- "grad_norm": 0.10571889579296112,
174
- "learning_rate": 8.672212978369386e-06,
175
- "loss": 0.33,
176
  "step": 2000
177
  },
178
  {
179
- "epoch": 0.6655574043261231,
180
- "eval_accuracy": 0.9118086695671082,
181
- "eval_loss": 0.4296289384365082,
182
- "eval_runtime": 121.9384,
183
- "eval_samples_per_second": 10.973,
184
- "eval_steps_per_second": 2.747,
185
  "step": 2000
186
  },
187
  {
188
- "epoch": 0.6988352745424293,
189
- "grad_norm": 12.672430038452148,
190
- "learning_rate": 8.605657237936773e-06,
191
- "loss": 0.4664,
192
  "step": 2100
193
  },
194
  {
195
- "epoch": 0.7321131447587355,
196
- "grad_norm": 0.23700322210788727,
197
- "learning_rate": 8.539101497504162e-06,
198
- "loss": 0.364,
199
  "step": 2200
200
  },
201
  {
202
- "epoch": 0.7653910149750416,
203
- "grad_norm": 0.07070857286453247,
204
- "learning_rate": 8.472545757071549e-06,
205
- "loss": 0.3883,
206
  "step": 2300
207
  },
208
  {
209
- "epoch": 0.7986688851913477,
210
- "grad_norm": 0.022693803533911705,
211
- "learning_rate": 8.405990016638937e-06,
212
- "loss": 0.365,
213
  "step": 2400
214
  },
215
  {
216
- "epoch": 0.831946755407654,
217
- "grad_norm": 46.31725311279297,
218
- "learning_rate": 8.339434276206322e-06,
219
- "loss": 0.2212,
220
  "step": 2500
221
  },
222
  {
223
- "epoch": 0.831946755407654,
224
- "eval_accuracy": 0.9155455827713013,
225
- "eval_loss": 0.4698057472705841,
226
- "eval_runtime": 122.1469,
227
- "eval_samples_per_second": 10.954,
228
- "eval_steps_per_second": 2.743,
229
  "step": 2500
230
  },
231
  {
232
- "epoch": 0.8652246256239601,
233
- "grad_norm": 0.03843013569712639,
234
- "learning_rate": 8.272878535773711e-06,
235
- "loss": 0.2667,
236
  "step": 2600
237
  },
238
  {
239
- "epoch": 0.8985024958402662,
240
- "grad_norm": 0.8355897068977356,
241
- "learning_rate": 8.206322795341098e-06,
242
- "loss": 0.2647,
243
  "step": 2700
244
  },
245
  {
246
- "epoch": 0.9317803660565723,
247
- "grad_norm": 71.40643310546875,
248
- "learning_rate": 8.139767054908487e-06,
249
- "loss": 0.2689,
250
  "step": 2800
251
  },
252
  {
253
- "epoch": 0.9650582362728786,
254
- "grad_norm": 0.15802563726902008,
255
- "learning_rate": 8.073211314475874e-06,
256
- "loss": 0.2999,
257
  "step": 2900
258
  },
259
  {
260
- "epoch": 0.9983361064891847,
261
- "grad_norm": 0.03220776841044426,
262
- "learning_rate": 8.006655574043262e-06,
263
- "loss": 0.364,
264
  "step": 3000
265
  },
266
  {
267
- "epoch": 0.9983361064891847,
268
- "eval_accuracy": 0.9133034348487854,
269
- "eval_loss": 0.4106574058532715,
270
- "eval_runtime": 121.8932,
271
- "eval_samples_per_second": 10.977,
272
- "eval_steps_per_second": 2.748,
273
  "step": 3000
274
  },
275
  {
276
- "epoch": 1.0316139767054908,
277
- "grad_norm": 14.846766471862793,
278
- "learning_rate": 7.940099833610649e-06,
279
- "loss": 0.2743,
280
  "step": 3100
281
  },
282
  {
283
- "epoch": 1.064891846921797,
284
- "grad_norm": 5.4475579261779785,
285
- "learning_rate": 7.873544093178038e-06,
286
- "loss": 0.2405,
287
  "step": 3200
288
  },
289
  {
290
- "epoch": 1.098169717138103,
291
- "grad_norm": 0.051481593400239944,
292
- "learning_rate": 7.806988352745425e-06,
293
- "loss": 0.2283,
294
  "step": 3300
295
  },
296
  {
297
- "epoch": 1.1314475873544092,
298
- "grad_norm": 0.10480177402496338,
299
- "learning_rate": 7.740432612312813e-06,
300
- "loss": 0.2049,
301
  "step": 3400
302
  },
303
  {
304
- "epoch": 1.1647254575707155,
305
- "grad_norm": 0.2395573854446411,
306
- "learning_rate": 7.6738768718802e-06,
307
- "loss": 0.2371,
308
  "step": 3500
309
  },
310
  {
311
- "epoch": 1.1647254575707155,
312
- "eval_accuracy": 0.9118086695671082,
313
- "eval_loss": 0.5081421732902527,
314
- "eval_runtime": 122.1747,
315
- "eval_samples_per_second": 10.952,
316
- "eval_steps_per_second": 2.742,
317
  "step": 3500
318
  },
319
  {
320
- "epoch": 1.1980033277870217,
321
- "grad_norm": 0.08652403950691223,
322
- "learning_rate": 7.607321131447588e-06,
323
- "loss": 0.1947,
324
  "step": 3600
325
  },
326
  {
327
- "epoch": 1.2312811980033278,
328
- "grad_norm": 0.01758364401757717,
329
- "learning_rate": 7.540765391014976e-06,
330
- "loss": 0.2981,
331
  "step": 3700
332
  },
333
  {
334
- "epoch": 1.264559068219634,
335
- "grad_norm": 0.023226574063301086,
336
- "learning_rate": 7.4742096505823635e-06,
337
- "loss": 0.2315,
338
- "step": 3800
339
- },
340
- {
341
- "epoch": 1.29783693843594,
342
- "grad_norm": 2.320756673812866,
343
- "learning_rate": 7.407653910149751e-06,
344
- "loss": 0.1545,
345
- "step": 3900
346
- },
347
- {
348
- "epoch": 1.3311148086522462,
349
- "grad_norm": 0.02829299308359623,
350
- "learning_rate": 7.341098169717139e-06,
351
- "loss": 0.3086,
352
- "step": 4000
353
- },
354
- {
355
- "epoch": 1.3311148086522462,
356
- "eval_accuracy": 0.927503764629364,
357
- "eval_loss": 0.43469786643981934,
358
- "eval_runtime": 122.0442,
359
- "eval_samples_per_second": 10.963,
360
- "eval_steps_per_second": 2.745,
361
- "step": 4000
362
- },
363
- {
364
- "epoch": 1.3643926788685525,
365
- "grad_norm": 0.563818097114563,
366
- "learning_rate": 7.274542429284527e-06,
367
- "loss": 0.203,
368
- "step": 4100
369
- },
370
- {
371
- "epoch": 1.3976705490848587,
372
- "grad_norm": 0.032793425023555756,
373
- "learning_rate": 7.207986688851915e-06,
374
- "loss": 0.2317,
375
- "step": 4200
376
- },
377
- {
378
- "epoch": 1.4309484193011648,
379
- "grad_norm": 13.284598350524902,
380
- "learning_rate": 7.1414309484193024e-06,
381
- "loss": 0.1721,
382
- "step": 4300
383
- },
384
- {
385
- "epoch": 1.464226289517471,
386
- "grad_norm": 7.5517897605896,
387
- "learning_rate": 7.0748752079866885e-06,
388
- "loss": 0.2042,
389
- "step": 4400
390
- },
391
- {
392
- "epoch": 1.497504159733777,
393
- "grad_norm": 0.04595191031694412,
394
- "learning_rate": 7.008319467554076e-06,
395
- "loss": 0.1531,
396
- "step": 4500
397
- },
398
- {
399
- "epoch": 1.497504159733777,
400
- "eval_accuracy": 0.926008939743042,
401
- "eval_loss": 0.4468592703342438,
402
- "eval_runtime": 122.6586,
403
- "eval_samples_per_second": 10.908,
404
- "eval_steps_per_second": 2.731,
405
- "step": 4500
406
- },
407
- {
408
- "epoch": 1.497504159733777,
409
- "step": 4500,
410
- "total_flos": 2.060256870141264e+18,
411
- "train_loss": 0.4611078107621935,
412
- "train_runtime": 3912.8249,
413
- "train_samples_per_second": 15.357,
414
- "train_steps_per_second": 3.84
415
  }
416
  ],
417
  "logging_steps": 100,
418
- "max_steps": 15025,
419
  "num_input_tokens_seen": 0,
420
  "num_train_epochs": 5,
421
  "save_steps": 500,
@@ -435,13 +361,13 @@
435
  "should_evaluate": false,
436
  "should_log": false,
437
  "should_save": true,
438
- "should_training_stop": true
439
  },
440
  "attributes": {}
441
  }
442
  },
443
- "total_flos": 2.060256870141264e+18,
444
- "train_batch_size": 4,
445
  "trial_name": null,
446
  "trial_params": null
447
  }
 
1
  {
2
+ "best_metric": 0.31690406799316406,
3
+ "best_model_checkpoint": "/kaggle/working/hubert-agum960-amharic/checkpoint-3500",
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 3760,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.13297872340425532,
13
+ "grad_norm": 2.615074396133423,
14
+ "learning_rate": 9.73404255319149e-06,
15
+ "loss": 1.5743,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.26595744680851063,
20
+ "grad_norm": 2.4918744564056396,
21
+ "learning_rate": 9.46808510638298e-06,
22
+ "loss": 1.3461,
23
  "step": 200
24
  },
25
  {
26
+ "epoch": 0.39893617021276595,
27
+ "grad_norm": 2.8701541423797607,
28
+ "learning_rate": 9.20212765957447e-06,
29
+ "loss": 1.0072,
30
  "step": 300
31
  },
32
  {
33
+ "epoch": 0.5319148936170213,
34
+ "grad_norm": 12.351414680480957,
35
+ "learning_rate": 8.938829787234043e-06,
36
+ "loss": 0.7362,
37
  "step": 400
38
  },
39
  {
40
+ "epoch": 0.6648936170212766,
41
+ "grad_norm": 8.005541801452637,
42
+ "learning_rate": 8.672872340425533e-06,
43
+ "loss": 0.5275,
44
  "step": 500
45
  },
46
  {
47
+ "epoch": 0.6648936170212766,
48
+ "eval_accuracy": 0.8482810258865356,
49
+ "eval_loss": 0.4739724397659302,
50
+ "eval_runtime": 120.9505,
51
+ "eval_samples_per_second": 11.062,
52
+ "eval_steps_per_second": 1.389,
53
  "step": 500
54
  },
55
  {
56
+ "epoch": 0.7978723404255319,
57
+ "grad_norm": 4.672068119049072,
58
+ "learning_rate": 8.406914893617022e-06,
59
+ "loss": 0.4446,
60
  "step": 600
61
  },
62
  {
63
+ "epoch": 0.9308510638297872,
64
+ "grad_norm": 7.660475254058838,
65
+ "learning_rate": 8.140957446808512e-06,
66
+ "loss": 0.3683,
67
  "step": 700
68
  },
69
  {
70
+ "epoch": 1.0638297872340425,
71
+ "grad_norm": 3.2801108360290527,
72
+ "learning_rate": 7.877659574468086e-06,
73
+ "loss": 0.3025,
74
  "step": 800
75
  },
76
  {
77
+ "epoch": 1.196808510638298,
78
+ "grad_norm": 11.70166015625,
79
+ "learning_rate": 7.61436170212766e-06,
80
+ "loss": 0.2542,
81
  "step": 900
82
  },
83
  {
84
+ "epoch": 1.3297872340425532,
85
+ "grad_norm": 10.051458358764648,
86
+ "learning_rate": 7.348404255319149e-06,
87
+ "loss": 0.2923,
88
  "step": 1000
89
  },
90
  {
91
+ "epoch": 1.3297872340425532,
92
+ "eval_accuracy": 0.8998505473136902,
93
+ "eval_loss": 0.3727114200592041,
94
+ "eval_runtime": 121.4022,
95
+ "eval_samples_per_second": 11.021,
96
+ "eval_steps_per_second": 1.384,
97
  "step": 1000
98
  },
99
  {
100
+ "epoch": 1.4627659574468086,
101
+ "grad_norm": 2.711136817932129,
102
+ "learning_rate": 7.0824468085106394e-06,
103
+ "loss": 0.2151,
104
  "step": 1100
105
  },
106
  {
107
+ "epoch": 1.5957446808510638,
108
+ "grad_norm": 3.505892038345337,
109
+ "learning_rate": 6.816489361702127e-06,
110
+ "loss": 0.2303,
111
  "step": 1200
112
  },
113
  {
114
+ "epoch": 1.728723404255319,
115
+ "grad_norm": 0.5282111763954163,
116
+ "learning_rate": 6.550531914893618e-06,
117
+ "loss": 0.2323,
118
  "step": 1300
119
  },
120
  {
121
+ "epoch": 1.8617021276595744,
122
+ "grad_norm": 6.366661071777344,
123
+ "learning_rate": 6.284574468085107e-06,
124
+ "loss": 0.2029,
125
  "step": 1400
126
  },
127
  {
128
+ "epoch": 1.9946808510638299,
129
+ "grad_norm": 17.721065521240234,
130
+ "learning_rate": 6.018617021276596e-06,
131
+ "loss": 0.2168,
132
  "step": 1500
133
  },
134
  {
135
+ "epoch": 1.9946808510638299,
136
+ "eval_accuracy": 0.9073243737220764,
137
+ "eval_loss": 0.41751202940940857,
138
+ "eval_runtime": 120.4054,
139
+ "eval_samples_per_second": 11.112,
140
+ "eval_steps_per_second": 1.395,
141
  "step": 1500
142
  },
143
  {
144
+ "epoch": 2.127659574468085,
145
+ "grad_norm": 29.569236755371094,
146
+ "learning_rate": 5.752659574468086e-06,
147
+ "loss": 0.181,
148
  "step": 1600
149
  },
150
  {
151
+ "epoch": 2.2606382978723403,
152
+ "grad_norm": 3.7976977825164795,
153
+ "learning_rate": 5.4867021276595745e-06,
154
+ "loss": 0.1902,
155
  "step": 1700
156
  },
157
  {
158
+ "epoch": 2.393617021276596,
159
+ "grad_norm": 5.339010715484619,
160
+ "learning_rate": 5.220744680851064e-06,
161
+ "loss": 0.1711,
162
  "step": 1800
163
  },
164
  {
165
+ "epoch": 2.526595744680851,
166
+ "grad_norm": 20.87778091430664,
167
+ "learning_rate": 4.954787234042554e-06,
168
+ "loss": 0.1889,
169
  "step": 1900
170
  },
171
  {
172
+ "epoch": 2.6595744680851063,
173
+ "grad_norm": 8.609652519226074,
174
+ "learning_rate": 4.6888297872340425e-06,
175
+ "loss": 0.1442,
176
  "step": 2000
177
  },
178
  {
179
+ "epoch": 2.6595744680851063,
180
+ "eval_accuracy": 0.9312406778335571,
181
+ "eval_loss": 0.3470732569694519,
182
+ "eval_runtime": 121.4455,
183
+ "eval_samples_per_second": 11.017,
184
+ "eval_steps_per_second": 1.383,
185
  "step": 2000
186
  },
187
  {
188
+ "epoch": 2.7925531914893615,
189
+ "grad_norm": 10.679058074951172,
190
+ "learning_rate": 4.422872340425532e-06,
191
+ "loss": 0.1706,
192
  "step": 2100
193
  },
194
  {
195
+ "epoch": 2.925531914893617,
196
+ "grad_norm": 0.32079407572746277,
197
+ "learning_rate": 4.156914893617022e-06,
198
+ "loss": 0.1333,
199
  "step": 2200
200
  },
201
  {
202
+ "epoch": 3.0585106382978724,
203
+ "grad_norm": 0.19917072355747223,
204
+ "learning_rate": 3.890957446808511e-06,
205
+ "loss": 0.1414,
206
  "step": 2300
207
  },
208
  {
209
+ "epoch": 3.1914893617021276,
210
+ "grad_norm": 3.178858757019043,
211
+ "learning_rate": 3.625e-06,
212
+ "loss": 0.1217,
213
  "step": 2400
214
  },
215
  {
216
+ "epoch": 3.324468085106383,
217
+ "grad_norm": 4.0875020027160645,
218
+ "learning_rate": 3.3590425531914896e-06,
219
+ "loss": 0.1341,
220
  "step": 2500
221
  },
222
  {
223
+ "epoch": 3.324468085106383,
224
+ "eval_accuracy": 0.927503764629364,
225
+ "eval_loss": 0.3641144931316376,
226
+ "eval_runtime": 120.3344,
227
+ "eval_samples_per_second": 11.119,
228
+ "eval_steps_per_second": 1.396,
229
  "step": 2500
230
  },
231
  {
232
+ "epoch": 3.4574468085106385,
233
+ "grad_norm": 2.2575433254241943,
234
+ "learning_rate": 3.0957446808510637e-06,
235
+ "loss": 0.1179,
236
  "step": 2600
237
  },
238
  {
239
+ "epoch": 3.5904255319148937,
240
+ "grad_norm": 2.2771668434143066,
241
+ "learning_rate": 2.8297872340425537e-06,
242
+ "loss": 0.1247,
243
  "step": 2700
244
  },
245
  {
246
+ "epoch": 3.723404255319149,
247
+ "grad_norm": 0.28244930505752563,
248
+ "learning_rate": 2.563829787234043e-06,
249
+ "loss": 0.1186,
250
  "step": 2800
251
  },
252
  {
253
+ "epoch": 3.8563829787234045,
254
+ "grad_norm": 10.92738151550293,
255
+ "learning_rate": 2.297872340425532e-06,
256
+ "loss": 0.1487,
257
  "step": 2900
258
  },
259
  {
260
+ "epoch": 3.9893617021276597,
261
+ "grad_norm": 3.5330650806427,
262
+ "learning_rate": 2.0319148936170213e-06,
263
+ "loss": 0.1136,
264
  "step": 3000
265
  },
266
  {
267
+ "epoch": 3.9893617021276597,
268
+ "eval_accuracy": 0.9431988000869751,
269
+ "eval_loss": 0.3285652995109558,
270
+ "eval_runtime": 121.4946,
271
+ "eval_samples_per_second": 11.013,
272
+ "eval_steps_per_second": 1.383,
273
  "step": 3000
274
  },
275
  {
276
+ "epoch": 4.122340425531915,
277
+ "grad_norm": 0.24657931923866272,
278
+ "learning_rate": 1.7659574468085109e-06,
279
+ "loss": 0.1157,
280
  "step": 3100
281
  },
282
  {
283
+ "epoch": 4.25531914893617,
284
+ "grad_norm": 4.8149800300598145,
285
+ "learning_rate": 1.5e-06,
286
+ "loss": 0.1127,
287
  "step": 3200
288
  },
289
  {
290
+ "epoch": 4.388297872340425,
291
+ "grad_norm": 12.910594940185547,
292
+ "learning_rate": 1.2340425531914894e-06,
293
+ "loss": 0.1159,
294
  "step": 3300
295
  },
296
  {
297
+ "epoch": 4.5212765957446805,
298
+ "grad_norm": 0.07863473147153854,
299
+ "learning_rate": 9.680851063829788e-07,
300
+ "loss": 0.1177,
301
  "step": 3400
302
  },
303
  {
304
+ "epoch": 4.654255319148936,
305
+ "grad_norm": 8.249691009521484,
306
+ "learning_rate": 7.021276595744682e-07,
307
+ "loss": 0.1102,
308
  "step": 3500
309
  },
310
  {
311
+ "epoch": 4.654255319148936,
312
+ "eval_accuracy": 0.939461886882782,
313
+ "eval_loss": 0.31690406799316406,
314
+ "eval_runtime": 121.4032,
315
+ "eval_samples_per_second": 11.021,
316
+ "eval_steps_per_second": 1.384,
317
  "step": 3500
318
  },
319
  {
320
+ "epoch": 4.787234042553192,
321
+ "grad_norm": 0.02278612181544304,
322
+ "learning_rate": 4.361702127659575e-07,
323
+ "loss": 0.0957,
324
  "step": 3600
325
  },
326
  {
327
+ "epoch": 4.920212765957447,
328
+ "grad_norm": 11.251379013061523,
329
+ "learning_rate": 1.7021276595744683e-07,
330
+ "loss": 0.1042,
331
  "step": 3700
332
  },
333
  {
334
+ "epoch": 5.0,
335
+ "step": 3760,
336
+ "total_flos": 7.814523952420126e+18,
337
+ "train_loss": 0.29202749221882923,
338
+ "train_runtime": 8842.7154,
339
+ "train_samples_per_second": 6.795,
340
+ "train_steps_per_second": 0.425
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  }
342
  ],
343
  "logging_steps": 100,
344
+ "max_steps": 3760,
345
  "num_input_tokens_seen": 0,
346
  "num_train_epochs": 5,
347
  "save_steps": 500,
 
361
  "should_evaluate": false,
362
  "should_log": false,
363
  "should_save": true,
364
+ "should_training_stop": false
365
  },
366
  "attributes": {}
367
  }
368
  },
369
+ "total_flos": 7.814523952420126e+18,
370
+ "train_batch_size": 16,
371
  "trial_name": null,
372
  "trial_params": null
373
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2e39203d964af082866179c6a0ff14c4c5db0bbec38e0af23ba7e07899d988c
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bb4f73efab8b12b0f0b7fd78089aeb5800da77a2e0861cb0fac114d0f5be367
3
  size 5176