hts98 committed on
Commit
01c2017
·
verified ·
1 Parent(s): 123ae0e

End of training

Browse files
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.6954
22
- - F1: 0.9290
23
 
24
  ## Model description
25
 
 
18
 
19
  This model is a fine-tuned version of [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.2168
22
+ - F1: 0.9326
23
 
24
  ## Model description
25
 
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "eval_f1": 0.9326343966094134,
4
+ "eval_loss": 0.21681125462055206,
5
+ "eval_runtime": 24.9667,
6
+ "eval_samples": 8966,
7
+ "eval_samples_per_second": 359.118,
8
+ "eval_steps_per_second": 5.648,
9
+ "total_flos": 5.5714266203904e+16,
10
+ "train_loss": 0.024559360085297206,
11
+ "train_runtime": 8074.1532,
12
+ "train_samples": 21175,
13
+ "train_samples_per_second": 104.903,
14
+ "train_steps_per_second": 1.64
15
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "eval_f1": 0.9326343966094134,
4
+ "eval_loss": 0.21681125462055206,
5
+ "eval_runtime": 24.9667,
6
+ "eval_samples": 8966,
7
+ "eval_samples_per_second": 359.118,
8
+ "eval_steps_per_second": 5.648
9
+ }
predict_results.txt ADDED
The diff for this file is too large to render. See raw diff
 
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "total_flos": 5.5714266203904e+16,
4
+ "train_loss": 0.024559360085297206,
5
+ "train_runtime": 8074.1532,
6
+ "train_samples": 21175,
7
+ "train_samples_per_second": 104.903,
8
+ "train_steps_per_second": 1.64
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9326343966094134,
3
+ "best_model_checkpoint": "/tmp/classification_hos_bert/checkpoint-662",
4
+ "epoch": 40.0,
5
+ "eval_steps": 500,
6
+ "global_step": 13240,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_f1": 0.9258309167967879,
14
+ "eval_loss": 0.21829327940940857,
15
+ "eval_runtime": 25.1238,
16
+ "eval_samples_per_second": 356.873,
17
+ "eval_steps_per_second": 5.612,
18
+ "step": 331
19
+ },
20
+ {
21
+ "epoch": 1.510574018126888,
22
+ "grad_norm": 2.2944066524505615,
23
+ "learning_rate": 2.8867069486404837e-05,
24
+ "loss": 0.239,
25
+ "step": 500
26
+ },
27
+ {
28
+ "epoch": 2.0,
29
+ "eval_f1": 0.9326343966094134,
30
+ "eval_loss": 0.21681125462055206,
31
+ "eval_runtime": 25.1665,
32
+ "eval_samples_per_second": 356.267,
33
+ "eval_steps_per_second": 5.603,
34
+ "step": 662
35
+ },
36
+ {
37
+ "epoch": 3.0,
38
+ "eval_f1": 0.9281730983716261,
39
+ "eval_loss": 0.25273850560188293,
40
+ "eval_runtime": 25.1148,
41
+ "eval_samples_per_second": 357.001,
42
+ "eval_steps_per_second": 5.614,
43
+ "step": 993
44
+ },
45
+ {
46
+ "epoch": 3.0211480362537766,
47
+ "grad_norm": 0.6900779604911804,
48
+ "learning_rate": 2.7734138972809666e-05,
49
+ "loss": 0.1255,
50
+ "step": 1000
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "eval_f1": 0.9288422931072943,
55
+ "eval_loss": 0.28959983587265015,
56
+ "eval_runtime": 25.1245,
57
+ "eval_samples_per_second": 356.863,
58
+ "eval_steps_per_second": 5.612,
59
+ "step": 1324
60
+ },
61
+ {
62
+ "epoch": 4.531722054380665,
63
+ "grad_norm": 1.7124501466751099,
64
+ "learning_rate": 2.66012084592145e-05,
65
+ "loss": 0.0662,
66
+ "step": 1500
67
+ },
68
+ {
69
+ "epoch": 5.0,
70
+ "eval_f1": 0.9266116439884007,
71
+ "eval_loss": 0.33890488743782043,
72
+ "eval_runtime": 25.1434,
73
+ "eval_samples_per_second": 356.595,
74
+ "eval_steps_per_second": 5.608,
75
+ "step": 1655
76
+ },
77
+ {
78
+ "epoch": 6.0,
79
+ "eval_f1": 0.9293999553870176,
80
+ "eval_loss": 0.3792820870876312,
81
+ "eval_runtime": 25.1391,
82
+ "eval_samples_per_second": 356.656,
83
+ "eval_steps_per_second": 5.609,
84
+ "step": 1986
85
+ },
86
+ {
87
+ "epoch": 6.042296072507553,
88
+ "grad_norm": 0.8430729508399963,
89
+ "learning_rate": 2.5468277945619337e-05,
90
+ "loss": 0.0453,
91
+ "step": 2000
92
+ },
93
+ {
94
+ "epoch": 7.0,
95
+ "eval_f1": 0.9251617220611198,
96
+ "eval_loss": 0.41103312373161316,
97
+ "eval_runtime": 25.1418,
98
+ "eval_samples_per_second": 356.617,
99
+ "eval_steps_per_second": 5.608,
100
+ "step": 2317
101
+ },
102
+ {
103
+ "epoch": 7.552870090634441,
104
+ "grad_norm": 0.8941565752029419,
105
+ "learning_rate": 2.433534743202417e-05,
106
+ "loss": 0.0257,
107
+ "step": 2500
108
+ },
109
+ {
110
+ "epoch": 8.0,
111
+ "eval_f1": 0.9204773589114432,
112
+ "eval_loss": 0.4656200110912323,
113
+ "eval_runtime": 25.1662,
114
+ "eval_samples_per_second": 356.271,
115
+ "eval_steps_per_second": 5.603,
116
+ "step": 2648
117
+ },
118
+ {
119
+ "epoch": 9.0,
120
+ "eval_f1": 0.9262770466205665,
121
+ "eval_loss": 0.49531668424606323,
122
+ "eval_runtime": 25.1846,
123
+ "eval_samples_per_second": 356.011,
124
+ "eval_steps_per_second": 5.599,
125
+ "step": 2979
126
+ },
127
+ {
128
+ "epoch": 9.06344410876133,
129
+ "grad_norm": 0.01369735598564148,
130
+ "learning_rate": 2.3202416918429002e-05,
131
+ "loss": 0.0196,
132
+ "step": 3000
133
+ },
134
+ {
135
+ "epoch": 10.0,
136
+ "eval_f1": 0.9265001115324559,
137
+ "eval_loss": 0.5412325263023376,
138
+ "eval_runtime": 25.1393,
139
+ "eval_samples_per_second": 356.653,
140
+ "eval_steps_per_second": 5.609,
141
+ "step": 3310
142
+ },
143
+ {
144
+ "epoch": 10.574018126888218,
145
+ "grad_norm": 1.39247727394104,
146
+ "learning_rate": 2.2069486404833838e-05,
147
+ "loss": 0.0125,
148
+ "step": 3500
149
+ },
150
+ {
151
+ "epoch": 11.0,
152
+ "eval_f1": 0.9244925273254517,
153
+ "eval_loss": 0.5528218150138855,
154
+ "eval_runtime": 25.112,
155
+ "eval_samples_per_second": 357.04,
156
+ "eval_steps_per_second": 5.615,
157
+ "step": 3641
158
+ },
159
+ {
160
+ "epoch": 12.0,
161
+ "eval_f1": 0.9261655141646219,
162
+ "eval_loss": 0.5526648759841919,
163
+ "eval_runtime": 25.1453,
164
+ "eval_samples_per_second": 356.568,
165
+ "eval_steps_per_second": 5.607,
166
+ "step": 3972
167
+ },
168
+ {
169
+ "epoch": 12.084592145015106,
170
+ "grad_norm": 0.3373314440250397,
171
+ "learning_rate": 2.093655589123867e-05,
172
+ "loss": 0.0141,
173
+ "step": 4000
174
+ },
175
+ {
176
+ "epoch": 13.0,
177
+ "eval_f1": 0.9276154360919028,
178
+ "eval_loss": 0.5682665705680847,
179
+ "eval_runtime": 25.1511,
180
+ "eval_samples_per_second": 356.486,
181
+ "eval_steps_per_second": 5.606,
182
+ "step": 4303
183
+ },
184
+ {
185
+ "epoch": 13.595166163141993,
186
+ "grad_norm": 0.07623090595006943,
187
+ "learning_rate": 1.9803625377643507e-05,
188
+ "loss": 0.0097,
189
+ "step": 4500
190
+ },
191
+ {
192
+ "epoch": 14.0,
193
+ "eval_f1": 0.9239348650457283,
194
+ "eval_loss": 0.5835373997688293,
195
+ "eval_runtime": 25.143,
196
+ "eval_samples_per_second": 356.6,
197
+ "eval_steps_per_second": 5.608,
198
+ "step": 4634
199
+ },
200
+ {
201
+ "epoch": 15.0,
202
+ "eval_f1": 0.9279500334597368,
203
+ "eval_loss": 0.5905042886734009,
204
+ "eval_runtime": 25.1447,
205
+ "eval_samples_per_second": 356.576,
206
+ "eval_steps_per_second": 5.608,
207
+ "step": 4965
208
+ },
209
+ {
210
+ "epoch": 15.105740181268882,
211
+ "grad_norm": 0.01646752655506134,
212
+ "learning_rate": 1.867069486404834e-05,
213
+ "loss": 0.0107,
214
+ "step": 5000
215
+ },
216
+ {
217
+ "epoch": 16.0,
218
+ "eval_f1": 0.9298460852107964,
219
+ "eval_loss": 0.5799357295036316,
220
+ "eval_runtime": 25.142,
221
+ "eval_samples_per_second": 356.615,
222
+ "eval_steps_per_second": 5.608,
223
+ "step": 5296
224
+ },
225
+ {
226
+ "epoch": 16.61631419939577,
227
+ "grad_norm": 0.061389509588479996,
228
+ "learning_rate": 1.753776435045317e-05,
229
+ "loss": 0.009,
230
+ "step": 5500
231
+ },
232
+ {
233
+ "epoch": 17.0,
234
+ "eval_f1": 0.9266116439884007,
235
+ "eval_loss": 0.6126909255981445,
236
+ "eval_runtime": 25.1653,
237
+ "eval_samples_per_second": 356.285,
238
+ "eval_steps_per_second": 5.603,
239
+ "step": 5627
240
+ },
241
+ {
242
+ "epoch": 18.0,
243
+ "eval_f1": 0.9283961632835155,
244
+ "eval_loss": 0.591077446937561,
245
+ "eval_runtime": 25.1503,
246
+ "eval_samples_per_second": 356.497,
247
+ "eval_steps_per_second": 5.606,
248
+ "step": 5958
249
+ },
250
+ {
251
+ "epoch": 18.12688821752266,
252
+ "grad_norm": 0.0015490599907934666,
253
+ "learning_rate": 1.6404833836858007e-05,
254
+ "loss": 0.0084,
255
+ "step": 6000
256
+ },
257
+ {
258
+ "epoch": 19.0,
259
+ "eval_f1": 0.930292215034575,
260
+ "eval_loss": 0.5900245308876038,
261
+ "eval_runtime": 25.1515,
262
+ "eval_samples_per_second": 356.479,
263
+ "eval_steps_per_second": 5.606,
264
+ "step": 6289
265
+ },
266
+ {
267
+ "epoch": 19.637462235649547,
268
+ "grad_norm": 0.34078794717788696,
269
+ "learning_rate": 1.527190332326284e-05,
270
+ "loss": 0.008,
271
+ "step": 6500
272
+ },
273
+ {
274
+ "epoch": 20.0,
275
+ "eval_f1": 0.9282846308275708,
276
+ "eval_loss": 0.5922934412956238,
277
+ "eval_runtime": 25.1544,
278
+ "eval_samples_per_second": 356.438,
279
+ "eval_steps_per_second": 5.605,
280
+ "step": 6620
281
+ },
282
+ {
283
+ "epoch": 21.0,
284
+ "eval_f1": 0.9305152799464644,
285
+ "eval_loss": 0.6186188459396362,
286
+ "eval_runtime": 25.1563,
287
+ "eval_samples_per_second": 356.412,
288
+ "eval_steps_per_second": 5.605,
289
+ "step": 6951
290
+ },
291
+ {
292
+ "epoch": 21.148036253776436,
293
+ "grad_norm": 0.16627806425094604,
294
+ "learning_rate": 1.4138972809667674e-05,
295
+ "loss": 0.0068,
296
+ "step": 7000
297
+ },
298
+ {
299
+ "epoch": 22.0,
300
+ "eval_f1": 0.9291768904751282,
301
+ "eval_loss": 0.6076038479804993,
302
+ "eval_runtime": 25.1577,
303
+ "eval_samples_per_second": 356.392,
304
+ "eval_steps_per_second": 5.605,
305
+ "step": 7282
306
+ },
307
+ {
308
+ "epoch": 22.658610271903324,
309
+ "grad_norm": 0.02961309626698494,
310
+ "learning_rate": 1.3006042296072508e-05,
311
+ "loss": 0.0064,
312
+ "step": 7500
313
+ },
314
+ {
315
+ "epoch": 23.0,
316
+ "eval_f1": 0.930292215034575,
317
+ "eval_loss": 0.578154444694519,
318
+ "eval_runtime": 25.1751,
319
+ "eval_samples_per_second": 356.145,
320
+ "eval_steps_per_second": 5.601,
321
+ "step": 7613
322
+ },
323
+ {
324
+ "epoch": 24.0,
325
+ "eval_f1": 0.9319652018737452,
326
+ "eval_loss": 0.607693076133728,
327
+ "eval_runtime": 25.1532,
328
+ "eval_samples_per_second": 356.455,
329
+ "eval_steps_per_second": 5.606,
330
+ "step": 7944
331
+ },
332
+ {
333
+ "epoch": 24.169184290030213,
334
+ "grad_norm": 0.012147185392677784,
335
+ "learning_rate": 1.187311178247734e-05,
336
+ "loss": 0.0048,
337
+ "step": 8000
338
+ },
339
+ {
340
+ "epoch": 25.0,
341
+ "eval_f1": 0.9281730983716261,
342
+ "eval_loss": 0.6445909738540649,
343
+ "eval_runtime": 25.1606,
344
+ "eval_samples_per_second": 356.351,
345
+ "eval_steps_per_second": 5.604,
346
+ "step": 8275
347
+ },
348
+ {
349
+ "epoch": 25.6797583081571,
350
+ "grad_norm": 0.0304458886384964,
351
+ "learning_rate": 1.0740181268882177e-05,
352
+ "loss": 0.0046,
353
+ "step": 8500
354
+ },
355
+ {
356
+ "epoch": 26.0,
357
+ "eval_f1": 0.9315190720499665,
358
+ "eval_loss": 0.6416810154914856,
359
+ "eval_runtime": 25.1644,
360
+ "eval_samples_per_second": 356.298,
361
+ "eval_steps_per_second": 5.603,
362
+ "step": 8606
363
+ },
364
+ {
365
+ "epoch": 27.0,
366
+ "eval_f1": 0.9282846308275708,
367
+ "eval_loss": 0.6655632257461548,
368
+ "eval_runtime": 25.1733,
369
+ "eval_samples_per_second": 356.171,
370
+ "eval_steps_per_second": 5.601,
371
+ "step": 8937
372
+ },
373
+ {
374
+ "epoch": 27.190332326283986,
375
+ "grad_norm": 0.0014442217070609331,
376
+ "learning_rate": 9.60725075528701e-06,
377
+ "loss": 0.0053,
378
+ "step": 9000
379
+ },
380
+ {
381
+ "epoch": 28.0,
382
+ "eval_f1": 0.9288422931072943,
383
+ "eval_loss": 0.6541187763214111,
384
+ "eval_runtime": 25.1766,
385
+ "eval_samples_per_second": 356.124,
386
+ "eval_steps_per_second": 5.6,
387
+ "step": 9268
388
+ },
389
+ {
390
+ "epoch": 28.700906344410875,
391
+ "grad_norm": 0.0008725296938791871,
392
+ "learning_rate": 8.474320241691843e-06,
393
+ "loss": 0.0043,
394
+ "step": 9500
395
+ },
396
+ {
397
+ "epoch": 29.0,
398
+ "eval_f1": 0.9277269685478474,
399
+ "eval_loss": 0.6702625155448914,
400
+ "eval_runtime": 25.1263,
401
+ "eval_samples_per_second": 356.837,
402
+ "eval_steps_per_second": 5.612,
403
+ "step": 9599
404
+ },
405
+ {
406
+ "epoch": 30.0,
407
+ "eval_f1": 0.9251617220611198,
408
+ "eval_loss": 0.6871447563171387,
409
+ "eval_runtime": 25.1371,
410
+ "eval_samples_per_second": 356.684,
411
+ "eval_steps_per_second": 5.609,
412
+ "step": 9930
413
+ },
414
+ {
415
+ "epoch": 30.211480362537763,
416
+ "grad_norm": 0.0005139079876244068,
417
+ "learning_rate": 7.341389728096677e-06,
418
+ "loss": 0.0041,
419
+ "step": 10000
420
+ },
421
+ {
422
+ "epoch": 31.0,
423
+ "eval_f1": 0.9286192281954049,
424
+ "eval_loss": 0.6735148429870605,
425
+ "eval_runtime": 25.1585,
426
+ "eval_samples_per_second": 356.38,
427
+ "eval_steps_per_second": 5.604,
428
+ "step": 10261
429
+ },
430
+ {
431
+ "epoch": 31.72205438066465,
432
+ "grad_norm": 0.002090197755023837,
433
+ "learning_rate": 6.208459214501511e-06,
434
+ "loss": 0.0034,
435
+ "step": 10500
436
+ },
437
+ {
438
+ "epoch": 32.0,
439
+ "eval_f1": 0.9306268124024091,
440
+ "eval_loss": 0.6650559306144714,
441
+ "eval_runtime": 25.1466,
442
+ "eval_samples_per_second": 356.549,
443
+ "eval_steps_per_second": 5.607,
444
+ "step": 10592
445
+ },
446
+ {
447
+ "epoch": 33.0,
448
+ "eval_f1": 0.9305152799464644,
449
+ "eval_loss": 0.6799349188804626,
450
+ "eval_runtime": 25.1547,
451
+ "eval_samples_per_second": 356.435,
452
+ "eval_steps_per_second": 5.605,
453
+ "step": 10923
454
+ },
455
+ {
456
+ "epoch": 33.23262839879154,
457
+ "grad_norm": 0.016955886036157608,
458
+ "learning_rate": 5.075528700906345e-06,
459
+ "loss": 0.0032,
460
+ "step": 11000
461
+ },
462
+ {
463
+ "epoch": 34.0,
464
+ "eval_f1": 0.9297345527548516,
465
+ "eval_loss": 0.6752559542655945,
466
+ "eval_runtime": 25.1385,
467
+ "eval_samples_per_second": 356.664,
468
+ "eval_steps_per_second": 5.609,
469
+ "step": 11254
470
+ },
471
+ {
472
+ "epoch": 34.74320241691843,
473
+ "grad_norm": 0.0011760705383494496,
474
+ "learning_rate": 3.942598187311178e-06,
475
+ "loss": 0.0031,
476
+ "step": 11500
477
+ },
478
+ {
479
+ "epoch": 35.0,
480
+ "eval_f1": 0.9309614097702431,
481
+ "eval_loss": 0.6854746341705322,
482
+ "eval_runtime": 25.1626,
483
+ "eval_samples_per_second": 356.323,
484
+ "eval_steps_per_second": 5.604,
485
+ "step": 11585
486
+ },
487
+ {
488
+ "epoch": 36.0,
489
+ "eval_f1": 0.9306268124024091,
490
+ "eval_loss": 0.6885010600090027,
491
+ "eval_runtime": 25.1518,
492
+ "eval_samples_per_second": 356.475,
493
+ "eval_steps_per_second": 5.606,
494
+ "step": 11916
495
+ },
496
+ {
497
+ "epoch": 36.25377643504532,
498
+ "grad_norm": 0.000429723208071664,
499
+ "learning_rate": 2.809667673716012e-06,
500
+ "loss": 0.003,
501
+ "step": 12000
502
+ },
503
+ {
504
+ "epoch": 37.0,
505
+ "eval_f1": 0.9292884229310729,
506
+ "eval_loss": 0.6960038542747498,
507
+ "eval_runtime": 25.1589,
508
+ "eval_samples_per_second": 356.375,
509
+ "eval_steps_per_second": 5.604,
510
+ "step": 12247
511
+ },
512
+ {
513
+ "epoch": 37.764350453172206,
514
+ "grad_norm": 0.0003884187317453325,
515
+ "learning_rate": 1.6767371601208459e-06,
516
+ "loss": 0.0026,
517
+ "step": 12500
518
+ },
519
+ {
520
+ "epoch": 38.0,
521
+ "eval_f1": 0.9291768904751282,
522
+ "eval_loss": 0.6950347423553467,
523
+ "eval_runtime": 25.1564,
524
+ "eval_samples_per_second": 356.41,
525
+ "eval_steps_per_second": 5.605,
526
+ "step": 12578
527
+ },
528
+ {
529
+ "epoch": 39.0,
530
+ "eval_f1": 0.9297345527548516,
531
+ "eval_loss": 0.6964432597160339,
532
+ "eval_runtime": 25.1685,
533
+ "eval_samples_per_second": 356.238,
534
+ "eval_steps_per_second": 5.602,
535
+ "step": 12909
536
+ },
537
+ {
538
+ "epoch": 39.274924471299094,
539
+ "grad_norm": 0.03609294071793556,
540
+ "learning_rate": 5.438066465256798e-07,
541
+ "loss": 0.0033,
542
+ "step": 13000
543
+ },
544
+ {
545
+ "epoch": 40.0,
546
+ "eval_f1": 0.928953825563239,
547
+ "eval_loss": 0.6954053640365601,
548
+ "eval_runtime": 25.1629,
549
+ "eval_samples_per_second": 356.318,
550
+ "eval_steps_per_second": 5.603,
551
+ "step": 13240
552
+ },
553
+ {
554
+ "epoch": 40.0,
555
+ "step": 13240,
556
+ "total_flos": 5.5714266203904e+16,
557
+ "train_loss": 0.024559360085297206,
558
+ "train_runtime": 8074.1532,
559
+ "train_samples_per_second": 104.903,
560
+ "train_steps_per_second": 1.64
561
+ }
562
+ ],
563
+ "logging_steps": 500,
564
+ "max_steps": 13240,
565
+ "num_input_tokens_seen": 0,
566
+ "num_train_epochs": 40,
567
+ "save_steps": 500,
568
+ "stateful_callbacks": {
569
+ "TrainerControl": {
570
+ "args": {
571
+ "should_epoch_stop": false,
572
+ "should_evaluate": false,
573
+ "should_log": false,
574
+ "should_save": true,
575
+ "should_training_stop": true
576
+ },
577
+ "attributes": {}
578
+ }
579
+ },
580
+ "total_flos": 5.5714266203904e+16,
581
+ "train_batch_size": 64,
582
+ "trial_name": null,
583
+ "trial_params": null
584
+ }