learn3r committed on
Commit
0eb410f
1 Parent(s): 853d50b

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -13
  2. all_results.json +18 -0
  3. eval_results.json +13 -0
  4. train_results.json +8 -0
  5. trainer_state.json +578 -0
README.md CHANGED
@@ -1,26 +1,27 @@
1
  ---
 
2
  tags:
3
  - generated_from_trainer
4
  datasets:
5
- - scrolls
6
  metrics:
7
  - rouge
8
  model-index:
9
  - name: longt5_xl_summ_screen_20
10
  results:
11
  - task:
12
- name: Sequence-to-sequence Language Modeling
13
- type: text2text-generation
14
  dataset:
15
- name: scrolls
16
- type: scrolls
17
  config: summ_screen_fd
18
  split: validation
19
  args: summ_screen_fd
20
  metrics:
21
  - name: Rouge1
22
  type: rouge
23
- value: 32.2717
24
  ---
25
 
26
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -28,14 +29,14 @@ should probably proofread and complete it, then remove this comment. -->
28
 
29
  # longt5_xl_summ_screen_20
30
 
31
- This model was trained from scratch on the scrolls dataset.
32
  It achieves the following results on the evaluation set:
33
- - Loss: 5.0866
34
- - Rouge1: 32.2717
35
- - Rouge2: 7.7004
36
- - Rougel: 18.9107
37
- - Rougelsum: 28.3874
38
- - Gen Len: 124.1893
39
 
40
  ## Model description
41
 
 
1
  ---
2
+ base_model: /exports/eddie/scratch/s1970716/models/summarization/longt5_xl_summ_screen/checkpoint-140
3
  tags:
4
  - generated_from_trainer
5
  datasets:
6
+ - tau/scrolls
7
  metrics:
8
  - rouge
9
  model-index:
10
  - name: longt5_xl_summ_screen_20
11
  results:
12
  - task:
13
+ name: Summarization
14
+ type: summarization
15
  dataset:
16
+ name: tau/scrolls summ_screen_fd
17
+ type: tau/scrolls
18
  config: summ_screen_fd
19
  split: validation
20
  args: summ_screen_fd
21
  metrics:
22
  - name: Rouge1
23
  type: rouge
24
+ value: 28.1708
25
  ---
26
 
27
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
29
 
30
  # longt5_xl_summ_screen_20
31
 
32
+ This model is a fine-tuned version of a local checkpoint (`/exports/eddie/scratch/s1970716/models/summarization/longt5_xl_summ_screen/checkpoint-140`) on the tau/scrolls summ_screen_fd dataset.
33
  It achieves the following results on the evaluation set:
34
+ - Loss: 3.1917
35
+ - Rouge1: 28.1708
36
+ - Rouge2: 6.6895
37
+ - Rougel: 18.1637
38
+ - Rougelsum: 24.3987
39
+ - Gen Len: 96.2041
40
 
41
  ## Model description
42
 
all_results.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.74,
3
+ "eval_gen_len": 96.20414201183432,
4
+ "eval_loss": 3.1917154788970947,
5
+ "eval_rouge1": 28.1708,
6
+ "eval_rouge2": 6.6895,
7
+ "eval_rougeL": 18.1637,
8
+ "eval_rougeLsum": 24.3987,
9
+ "eval_runtime": 1032.7131,
10
+ "eval_samples": 338,
11
+ "eval_samples_per_second": 0.327,
12
+ "eval_steps_per_second": 0.164,
13
+ "train_loss": 0.18376290196818965,
14
+ "train_runtime": 47972.5065,
15
+ "train_samples": 3673,
16
+ "train_samples_per_second": 0.766,
17
+ "train_steps_per_second": 0.003
18
+ }
eval_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.74,
3
+ "eval_gen_len": 96.20414201183432,
4
+ "eval_loss": 3.1917154788970947,
5
+ "eval_rouge1": 28.1708,
6
+ "eval_rouge2": 6.6895,
7
+ "eval_rougeL": 18.1637,
8
+ "eval_rougeLsum": 24.3987,
9
+ "eval_runtime": 1032.7131,
10
+ "eval_samples": 338,
11
+ "eval_samples_per_second": 0.327,
12
+ "eval_steps_per_second": 0.164
13
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.74,
3
+ "train_loss": 0.18376290196818965,
4
+ "train_runtime": 47972.5065,
5
+ "train_samples": 3673,
6
+ "train_samples_per_second": 0.766,
7
+ "train_steps_per_second": 0.003
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 3.1917154788970947,
3
+ "best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/summarization/longt5_xl_summ_screen_20/checkpoint-28",
4
+ "epoch": 9.73913043478261,
5
+ "eval_steps": 500,
6
+ "global_step": 140,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.14,
13
+ "learning_rate": 0.001,
14
+ "loss": 0.411,
15
+ "step": 2
16
+ },
17
+ {
18
+ "epoch": 0.28,
19
+ "learning_rate": 0.001,
20
+ "loss": 0.4642,
21
+ "step": 4
22
+ },
23
+ {
24
+ "epoch": 0.42,
25
+ "learning_rate": 0.001,
26
+ "loss": 0.4965,
27
+ "step": 6
28
+ },
29
+ {
30
+ "epoch": 0.56,
31
+ "learning_rate": 0.001,
32
+ "loss": 0.7455,
33
+ "step": 8
34
+ },
35
+ {
36
+ "epoch": 0.7,
37
+ "learning_rate": 0.001,
38
+ "loss": 0.4501,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.83,
43
+ "learning_rate": 0.001,
44
+ "loss": 0.3804,
45
+ "step": 12
46
+ },
47
+ {
48
+ "epoch": 0.97,
49
+ "learning_rate": 0.001,
50
+ "loss": 0.4063,
51
+ "step": 14
52
+ },
53
+ {
54
+ "epoch": 0.97,
55
+ "eval_gen_len": 71.90828402366864,
56
+ "eval_loss": 3.7384819984436035,
57
+ "eval_rouge1": 27.9171,
58
+ "eval_rouge2": 6.7215,
59
+ "eval_rougeL": 17.9315,
60
+ "eval_rougeLsum": 24.363,
61
+ "eval_runtime": 823.0541,
62
+ "eval_samples_per_second": 0.411,
63
+ "eval_steps_per_second": 0.205,
64
+ "step": 14
65
+ },
66
+ {
67
+ "epoch": 1.11,
68
+ "learning_rate": 0.001,
69
+ "loss": 0.3201,
70
+ "step": 16
71
+ },
72
+ {
73
+ "epoch": 1.25,
74
+ "learning_rate": 0.001,
75
+ "loss": 0.3253,
76
+ "step": 18
77
+ },
78
+ {
79
+ "epoch": 1.39,
80
+ "learning_rate": 0.001,
81
+ "loss": 0.3215,
82
+ "step": 20
83
+ },
84
+ {
85
+ "epoch": 1.53,
86
+ "learning_rate": 0.001,
87
+ "loss": 0.3175,
88
+ "step": 22
89
+ },
90
+ {
91
+ "epoch": 1.67,
92
+ "learning_rate": 0.001,
93
+ "loss": 0.3331,
94
+ "step": 24
95
+ },
96
+ {
97
+ "epoch": 1.81,
98
+ "learning_rate": 0.001,
99
+ "loss": 0.2811,
100
+ "step": 26
101
+ },
102
+ {
103
+ "epoch": 1.95,
104
+ "learning_rate": 0.001,
105
+ "loss": 0.3125,
106
+ "step": 28
107
+ },
108
+ {
109
+ "epoch": 1.95,
110
+ "eval_gen_len": 96.20414201183432,
111
+ "eval_loss": 3.1917154788970947,
112
+ "eval_rouge1": 28.1708,
113
+ "eval_rouge2": 6.6895,
114
+ "eval_rougeL": 18.1637,
115
+ "eval_rougeLsum": 24.3987,
116
+ "eval_runtime": 1069.4844,
117
+ "eval_samples_per_second": 0.316,
118
+ "eval_steps_per_second": 0.158,
119
+ "step": 28
120
+ },
121
+ {
122
+ "epoch": 2.09,
123
+ "learning_rate": 0.001,
124
+ "loss": 0.2621,
125
+ "step": 30
126
+ },
127
+ {
128
+ "epoch": 2.23,
129
+ "learning_rate": 0.001,
130
+ "loss": 0.2194,
131
+ "step": 32
132
+ },
133
+ {
134
+ "epoch": 2.37,
135
+ "learning_rate": 0.001,
136
+ "loss": 0.2386,
137
+ "step": 34
138
+ },
139
+ {
140
+ "epoch": 2.5,
141
+ "learning_rate": 0.001,
142
+ "loss": 0.2264,
143
+ "step": 36
144
+ },
145
+ {
146
+ "epoch": 2.64,
147
+ "learning_rate": 0.001,
148
+ "loss": 0.2002,
149
+ "step": 38
150
+ },
151
+ {
152
+ "epoch": 2.78,
153
+ "learning_rate": 0.001,
154
+ "loss": 0.2477,
155
+ "step": 40
156
+ },
157
+ {
158
+ "epoch": 2.92,
159
+ "learning_rate": 0.001,
160
+ "loss": 0.2177,
161
+ "step": 42
162
+ },
163
+ {
164
+ "epoch": 2.99,
165
+ "eval_gen_len": 198.0473372781065,
166
+ "eval_loss": 3.9997544288635254,
167
+ "eval_rouge1": 29.3167,
168
+ "eval_rouge2": 5.9,
169
+ "eval_rougeL": 17.3608,
170
+ "eval_rougeLsum": 25.6945,
171
+ "eval_runtime": 1900.1301,
172
+ "eval_samples_per_second": 0.178,
173
+ "eval_steps_per_second": 0.089,
174
+ "step": 43
175
+ },
176
+ {
177
+ "epoch": 3.06,
178
+ "learning_rate": 0.001,
179
+ "loss": 0.2069,
180
+ "step": 44
181
+ },
182
+ {
183
+ "epoch": 3.2,
184
+ "learning_rate": 0.001,
185
+ "loss": 0.164,
186
+ "step": 46
187
+ },
188
+ {
189
+ "epoch": 3.34,
190
+ "learning_rate": 0.001,
191
+ "loss": 0.1679,
192
+ "step": 48
193
+ },
194
+ {
195
+ "epoch": 3.48,
196
+ "learning_rate": 0.001,
197
+ "loss": 0.1736,
198
+ "step": 50
199
+ },
200
+ {
201
+ "epoch": 3.62,
202
+ "learning_rate": 0.001,
203
+ "loss": 0.1688,
204
+ "step": 52
205
+ },
206
+ {
207
+ "epoch": 3.76,
208
+ "learning_rate": 0.001,
209
+ "loss": 0.1749,
210
+ "step": 54
211
+ },
212
+ {
213
+ "epoch": 3.9,
214
+ "learning_rate": 0.001,
215
+ "loss": 0.1753,
216
+ "step": 56
217
+ },
218
+ {
219
+ "epoch": 3.97,
220
+ "eval_gen_len": 158.6508875739645,
221
+ "eval_loss": 4.228714466094971,
222
+ "eval_rouge1": 29.0605,
223
+ "eval_rouge2": 6.2534,
224
+ "eval_rougeL": 17.5744,
225
+ "eval_rougeLsum": 25.6415,
226
+ "eval_runtime": 1492.9623,
227
+ "eval_samples_per_second": 0.226,
228
+ "eval_steps_per_second": 0.113,
229
+ "step": 57
230
+ },
231
+ {
232
+ "epoch": 4.03,
233
+ "learning_rate": 0.001,
234
+ "loss": 0.1656,
235
+ "step": 58
236
+ },
237
+ {
238
+ "epoch": 4.17,
239
+ "learning_rate": 0.001,
240
+ "loss": 0.1144,
241
+ "step": 60
242
+ },
243
+ {
244
+ "epoch": 4.31,
245
+ "learning_rate": 0.001,
246
+ "loss": 0.161,
247
+ "step": 62
248
+ },
249
+ {
250
+ "epoch": 4.45,
251
+ "learning_rate": 0.001,
252
+ "loss": 0.2169,
253
+ "step": 64
254
+ },
255
+ {
256
+ "epoch": 4.59,
257
+ "learning_rate": 0.001,
258
+ "loss": 0.1943,
259
+ "step": 66
260
+ },
261
+ {
262
+ "epoch": 4.73,
263
+ "learning_rate": 0.001,
264
+ "loss": 0.1777,
265
+ "step": 68
266
+ },
267
+ {
268
+ "epoch": 4.87,
269
+ "learning_rate": 0.001,
270
+ "loss": 0.2747,
271
+ "step": 70
272
+ },
273
+ {
274
+ "epoch": 4.94,
275
+ "eval_gen_len": 118.44378698224853,
276
+ "eval_loss": 4.102721214294434,
277
+ "eval_rouge1": 31.2245,
278
+ "eval_rouge2": 6.5663,
279
+ "eval_rougeL": 18.1588,
280
+ "eval_rougeLsum": 26.8996,
281
+ "eval_runtime": 1188.6007,
282
+ "eval_samples_per_second": 0.284,
283
+ "eval_steps_per_second": 0.142,
284
+ "step": 71
285
+ },
286
+ {
287
+ "epoch": 5.01,
288
+ "learning_rate": 0.001,
289
+ "loss": 0.1399,
290
+ "step": 72
291
+ },
292
+ {
293
+ "epoch": 5.15,
294
+ "learning_rate": 0.001,
295
+ "loss": 0.0986,
296
+ "step": 74
297
+ },
298
+ {
299
+ "epoch": 5.29,
300
+ "learning_rate": 0.001,
301
+ "loss": 0.1051,
302
+ "step": 76
303
+ },
304
+ {
305
+ "epoch": 5.43,
306
+ "learning_rate": 0.001,
307
+ "loss": 0.1288,
308
+ "step": 78
309
+ },
310
+ {
311
+ "epoch": 5.57,
312
+ "learning_rate": 0.001,
313
+ "loss": 0.1097,
314
+ "step": 80
315
+ },
316
+ {
317
+ "epoch": 5.7,
318
+ "learning_rate": 0.001,
319
+ "loss": 0.1163,
320
+ "step": 82
321
+ },
322
+ {
323
+ "epoch": 5.84,
324
+ "learning_rate": 0.001,
325
+ "loss": 0.1205,
326
+ "step": 84
327
+ },
328
+ {
329
+ "epoch": 5.98,
330
+ "learning_rate": 0.001,
331
+ "loss": 0.1045,
332
+ "step": 86
333
+ },
334
+ {
335
+ "epoch": 5.98,
336
+ "eval_gen_len": 92.98816568047337,
337
+ "eval_loss": 5.058135986328125,
338
+ "eval_rouge1": 30.6056,
339
+ "eval_rouge2": 6.8892,
340
+ "eval_rougeL": 18.4933,
341
+ "eval_rougeLsum": 26.4027,
342
+ "eval_runtime": 984.3965,
343
+ "eval_samples_per_second": 0.343,
344
+ "eval_steps_per_second": 0.172,
345
+ "step": 86
346
+ },
347
+ {
348
+ "epoch": 6.12,
349
+ "learning_rate": 0.001,
350
+ "loss": 0.0767,
351
+ "step": 88
352
+ },
353
+ {
354
+ "epoch": 6.26,
355
+ "learning_rate": 0.001,
356
+ "loss": 0.0678,
357
+ "step": 90
358
+ },
359
+ {
360
+ "epoch": 6.4,
361
+ "learning_rate": 0.001,
362
+ "loss": 0.0759,
363
+ "step": 92
364
+ },
365
+ {
366
+ "epoch": 6.54,
367
+ "learning_rate": 0.001,
368
+ "loss": 0.0714,
369
+ "step": 94
370
+ },
371
+ {
372
+ "epoch": 6.68,
373
+ "learning_rate": 0.001,
374
+ "loss": 0.0822,
375
+ "step": 96
376
+ },
377
+ {
378
+ "epoch": 6.82,
379
+ "learning_rate": 0.001,
380
+ "loss": 0.0843,
381
+ "step": 98
382
+ },
383
+ {
384
+ "epoch": 6.96,
385
+ "learning_rate": 0.001,
386
+ "loss": 0.0875,
387
+ "step": 100
388
+ },
389
+ {
390
+ "epoch": 6.96,
391
+ "eval_gen_len": 160.89644970414201,
392
+ "eval_loss": 4.59414529800415,
393
+ "eval_rouge1": 32.5234,
394
+ "eval_rouge2": 7.3736,
395
+ "eval_rougeL": 18.8958,
396
+ "eval_rougeLsum": 28.4738,
397
+ "eval_runtime": 1504.7392,
398
+ "eval_samples_per_second": 0.225,
399
+ "eval_steps_per_second": 0.112,
400
+ "step": 100
401
+ },
402
+ {
403
+ "epoch": 7.1,
404
+ "learning_rate": 0.001,
405
+ "loss": 0.0724,
406
+ "step": 102
407
+ },
408
+ {
409
+ "epoch": 7.23,
410
+ "learning_rate": 0.001,
411
+ "loss": 0.0638,
412
+ "step": 104
413
+ },
414
+ {
415
+ "epoch": 7.37,
416
+ "learning_rate": 0.001,
417
+ "loss": 0.0649,
418
+ "step": 106
419
+ },
420
+ {
421
+ "epoch": 7.51,
422
+ "learning_rate": 0.001,
423
+ "loss": 0.0743,
424
+ "step": 108
425
+ },
426
+ {
427
+ "epoch": 7.65,
428
+ "learning_rate": 0.001,
429
+ "loss": 0.0754,
430
+ "step": 110
431
+ },
432
+ {
433
+ "epoch": 7.79,
434
+ "learning_rate": 0.001,
435
+ "loss": 0.0865,
436
+ "step": 112
437
+ },
438
+ {
439
+ "epoch": 7.93,
440
+ "learning_rate": 0.001,
441
+ "loss": 0.1572,
442
+ "step": 114
443
+ },
444
+ {
445
+ "epoch": 8.0,
446
+ "eval_gen_len": 121.01775147928994,
447
+ "eval_loss": 4.938564300537109,
448
+ "eval_rouge1": 31.4658,
449
+ "eval_rouge2": 7.2592,
450
+ "eval_rougeL": 18.4796,
451
+ "eval_rougeLsum": 27.6047,
452
+ "eval_runtime": 1185.8984,
453
+ "eval_samples_per_second": 0.285,
454
+ "eval_steps_per_second": 0.143,
455
+ "step": 115
456
+ },
457
+ {
458
+ "epoch": 8.07,
459
+ "learning_rate": 0.001,
460
+ "loss": 0.0862,
461
+ "step": 116
462
+ },
463
+ {
464
+ "epoch": 8.21,
465
+ "learning_rate": 0.001,
466
+ "loss": 0.0607,
467
+ "step": 118
468
+ },
469
+ {
470
+ "epoch": 8.35,
471
+ "learning_rate": 0.001,
472
+ "loss": 0.0692,
473
+ "step": 120
474
+ },
475
+ {
476
+ "epoch": 8.49,
477
+ "learning_rate": 0.001,
478
+ "loss": 0.0916,
479
+ "step": 122
480
+ },
481
+ {
482
+ "epoch": 8.63,
483
+ "learning_rate": 0.001,
484
+ "loss": 0.0847,
485
+ "step": 124
486
+ },
487
+ {
488
+ "epoch": 8.77,
489
+ "learning_rate": 0.001,
490
+ "loss": 0.089,
491
+ "step": 126
492
+ },
493
+ {
494
+ "epoch": 8.9,
495
+ "learning_rate": 0.001,
496
+ "loss": 0.0867,
497
+ "step": 128
498
+ },
499
+ {
500
+ "epoch": 8.97,
501
+ "eval_gen_len": 160.4792899408284,
502
+ "eval_loss": 4.556480884552002,
503
+ "eval_rouge1": 32.0531,
504
+ "eval_rouge2": 7.0692,
505
+ "eval_rougeL": 18.5551,
506
+ "eval_rougeLsum": 27.3373,
507
+ "eval_runtime": 1462.551,
508
+ "eval_samples_per_second": 0.231,
509
+ "eval_steps_per_second": 0.116,
510
+ "step": 129
511
+ },
512
+ {
513
+ "epoch": 9.04,
514
+ "learning_rate": 0.001,
515
+ "loss": 0.1022,
516
+ "step": 130
517
+ },
518
+ {
519
+ "epoch": 9.18,
520
+ "learning_rate": 0.001,
521
+ "loss": 0.067,
522
+ "step": 132
523
+ },
524
+ {
525
+ "epoch": 9.32,
526
+ "learning_rate": 0.001,
527
+ "loss": 0.0746,
528
+ "step": 134
529
+ },
530
+ {
531
+ "epoch": 9.46,
532
+ "learning_rate": 0.001,
533
+ "loss": 0.0888,
534
+ "step": 136
535
+ },
536
+ {
537
+ "epoch": 9.6,
538
+ "learning_rate": 0.001,
539
+ "loss": 0.1111,
540
+ "step": 138
541
+ },
542
+ {
543
+ "epoch": 9.74,
544
+ "learning_rate": 0.001,
545
+ "loss": 0.0748,
546
+ "step": 140
547
+ },
548
+ {
549
+ "epoch": 9.74,
550
+ "eval_gen_len": 124.18934911242603,
551
+ "eval_loss": 5.086633682250977,
552
+ "eval_rouge1": 32.2717,
553
+ "eval_rouge2": 7.7004,
554
+ "eval_rougeL": 18.9107,
555
+ "eval_rougeLsum": 28.3874,
556
+ "eval_runtime": 1232.2532,
557
+ "eval_samples_per_second": 0.274,
558
+ "eval_steps_per_second": 0.137,
559
+ "step": 140
560
+ },
561
+ {
562
+ "epoch": 9.74,
563
+ "step": 140,
564
+ "total_flos": 2.447850236380324e+18,
565
+ "train_loss": 0.18376290196818965,
566
+ "train_runtime": 47972.5065,
567
+ "train_samples_per_second": 0.766,
568
+ "train_steps_per_second": 0.003
569
+ }
570
+ ],
571
+ "logging_steps": 2,
572
+ "max_steps": 140,
573
+ "num_train_epochs": 10,
574
+ "save_steps": 500,
575
+ "total_flos": 2.447850236380324e+18,
576
+ "trial_name": null,
577
+ "trial_params": null
578
+ }