ayjays132 committed on
Commit 0b5398e · verified · 1 Parent(s): 5525ef5

Upload 8 files

Files changed (7)
  1. config.json +54 -32
  2. model.safetensors +1 -1
  3. optimizer.pt +1 -1
  4. rng_state.pth +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +66 -1266
  7. training_args.bin +3 -0
config.json CHANGED
@@ -1,20 +1,16 @@
  {
    "_name_or_path": "ayjays132/CustomGPT2Conversational",
-   "model_type": "gpt2",
    "architectures": [
      "GPT2LMHeadModel"
    ],
-   "tokenizer_config": {
-     "bos_token_id": 50256,
-     "eos_token_id": 50256,
-     "n_positions": 2048,
-     "padding_side": "left",
-     "truncation_side": "right"
-   },
    "config": {
      "activation_function": "gelu_new",
      "attn_pdrop": 0.1,
      "embd_pdrop": 0.1,
      "initializer_range": 0.02,
      "layer_norm_epsilon": 1e-05,
      "n_ctx": 2048,
@@ -23,40 +19,66 @@
      "n_layer": 36,
      "n_positions": 2048,
      "resid_pdrop": 0.1,
-     "vocab_size": 50257,
      "scale_attn_weights": true,
-     "gradient_checkpointing": true,
-     "use_cache": true
-   },
-   "task_specific_params": {
-     "conversational": {
-       "max_length": 1024,
-       "min_length": 20,
-       "temperature": 0.7,
-       "top_k": 40,
-       "top_p": 0.95,
-       "num_beams": 5,
-       "no_repeat_ngram_size": 3,
-       "early_stopping": true,
-       "length_penalty": 2.0,
-       "do_sample": true,
-       "frequency_penalty": 0.5,
-       "presence_penalty": 0.5
-     }
    },
-   "use_cache": true,
-   "torch_dtype": "float32",
-   "transformers_version": "4.37.2",
-   "library_name": "transformers",
    "language": "en",
    "license": "apache-2.0",
    "metrics": [
      "perplexity",
      "accuracy"
    ],
    "pipeline_tag": "conversational",
    "tags": [
      "conversational",
      "state-of-the-art"
-   ]
  }

  {
    "_name_or_path": "ayjays132/CustomGPT2Conversational",
+   "activation_function": "gelu_new",
    "architectures": [
      "GPT2LMHeadModel"
    ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
    "config": {
      "activation_function": "gelu_new",
      "attn_pdrop": 0.1,
      "embd_pdrop": 0.1,
+     "gradient_checkpointing": true,
      "initializer_range": 0.02,
      "layer_norm_epsilon": 1e-05,
      "n_ctx": 2048,
      "n_layer": 36,
      "n_positions": 2048,
      "resid_pdrop": 0.1,
      "scale_attn_weights": true,
+     "use_cache": true,
+     "vocab_size": 50257
    },
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
    "language": "en",
+   "layer_norm_epsilon": 1e-05,
+   "library_name": "transformers",
    "license": "apache-2.0",
    "metrics": [
      "perplexity",
      "accuracy"
    ],
+   "model_type": "gpt2",
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
    "pipeline_tag": "conversational",
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
    "tags": [
      "conversational",
      "state-of-the-art"
+   ],
+   "task_specific_params": {
+     "conversational": {
+       "do_sample": true,
+       "early_stopping": true,
+       "frequency_penalty": 0.5,
+       "length_penalty": 2.0,
+       "max_length": 1024,
+       "min_length": 20,
+       "no_repeat_ngram_size": 3,
+       "num_beams": 5,
+       "presence_penalty": 0.5,
+       "temperature": 0.7,
+       "top_k": 40,
+       "top_p": 0.95
+     }
+   },
+   "tokenizer_config": {
+     "bos_token_id": 50256,
+     "eos_token_id": 50256,
+     "n_positions": 2048,
+     "padding_side": "left",
+     "truncation_side": "right"
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.2",
+   "use_cache": true,
+   "vocab_size": 50257
  }
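After this commit the repo's config.json carries the standard GPT-2 fields at the top level ("model_type": "gpt2", n_layer 12, n_embd 768, n_positions 1024), which is what transformers actually reads, while the old nested "config" block (36 layers, 2048 positions) and the "tokenizer_config" block remain only as extra attributes. The unchanged ~498 MB float32 model.safetensors is consistent with the 124M-parameter GPT-2-small shape now declared at the top level, not with the nested 36-layer values. A minimal sketch for checking which values take effect, assuming the Hub repo serves this config.json as-is:

from transformers import AutoConfig

# Load the updated configuration straight from the Hub.
cfg = AutoConfig.from_pretrained("ayjays132/CustomGPT2Conversational")

# Top-level fields are the effective ones; the nested "config" dict is kept
# only as an untyped extra attribute on the config object.
print(cfg.model_type)                            # gpt2
print(cfg.n_layer, cfg.n_embd, cfg.n_positions)  # 12 768 1024 per this commit
print(cfg.task_specific_params["conversational"]["num_beams"])  # 5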
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:234894e33006b6f8051c56c10cc43ed5434adf92889de9b60a74cccdebe415e2
  size 497774208

  version https://git-lfs.github.com/spec/v1
+ oid sha256:839abdabb9e2db67117691de736daa0762f8cb9711b916429643a38f4010e757
  size 497774208
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:10301dbc12e92af88759b3c2ab855c88c8a32eb7f92be5e4d309769f3dc84d7b
  size 995642298

  version https://git-lfs.github.com/spec/v1
+ oid sha256:c69c7a2ea4b08b80736806da4a78af087235909bc337021f37381bad74760a90
  size 995642298
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e439c091a6dd1bed20a2b779f499076b0ef16fb414bd634c52a62189b7a8caf1
  size 14244

  version https://git-lfs.github.com/spec/v1
+ oid sha256:1dede6f4c6fec6e15aded13fe8fa4e3b188a548f0e43da285ecede4459eea5de
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:51caff193f407d8c2a2975068aff6341b9daf298781c9f39523eb45070f85df4
  size 1064

  version https://git-lfs.github.com/spec/v1
+ oid sha256:b4247e447234772e3730d02b6da20c344c682ce3e67dce3060fc52e6286ef180
  size 1064
trainer_state.json CHANGED
@@ -1,1341 +1,141 @@
  {
    "best_metric": null,
    "best_model_checkpoint": null,
-   "epoch": 11.845789360327375,
    "eval_steps": 500,
-   "global_step": 110000,
    "is_hyper_param_search": false,
    "is_local_process_zero": true,
    "is_world_process_zero": true,
    "log_history": [
11
  {
12
- "epoch": 0.05,
13
- "learning_rate": 4.986538875726901e-05,
14
- "loss": 1.7984,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.11,
19
- "learning_rate": 4.9730777514538016e-05,
20
- "loss": 1.88,
21
  "step": 1000
22
  },
23
  {
24
- "epoch": 0.16,
25
- "learning_rate": 4.959616627180702e-05,
26
- "loss": 1.8651,
27
  "step": 1500
28
  },
29
  {
30
- "epoch": 0.22,
31
- "learning_rate": 4.946155502907603e-05,
32
- "loss": 1.8926,
33
  "step": 2000
34
  },
35
  {
36
- "epoch": 0.27,
37
- "learning_rate": 4.932694378634504e-05,
38
- "loss": 1.9214,
39
  "step": 2500
40
  },
41
  {
42
- "epoch": 0.32,
43
- "learning_rate": 4.9192332543614044e-05,
44
- "loss": 1.9036,
45
  "step": 3000
46
  },
47
  {
48
- "epoch": 0.38,
49
- "learning_rate": 4.905772130088305e-05,
50
- "loss": 1.9155,
51
  "step": 3500
52
  },
53
  {
54
- "epoch": 0.43,
55
- "learning_rate": 4.892311005815206e-05,
56
- "loss": 1.9151,
57
  "step": 4000
58
  },
59
  {
60
- "epoch": 0.48,
61
- "learning_rate": 4.878849881542107e-05,
62
- "loss": 1.9263,
63
  "step": 4500
64
  },
65
  {
66
- "epoch": 0.54,
67
- "learning_rate": 4.865388757269007e-05,
68
- "loss": 1.9654,
69
  "step": 5000
70
  },
71
  {
72
- "epoch": 0.59,
73
- "learning_rate": 4.851927632995908e-05,
74
- "loss": 1.9323,
75
  "step": 5500
76
  },
77
  {
78
- "epoch": 0.65,
79
- "learning_rate": 4.8384665087228085e-05,
80
- "loss": 1.9479,
81
  "step": 6000
82
  },
83
  {
84
- "epoch": 0.7,
85
- "learning_rate": 4.825005384449709e-05,
86
- "loss": 1.9513,
87
  "step": 6500
88
  },
89
  {
90
- "epoch": 0.75,
91
- "learning_rate": 4.8115442601766106e-05,
92
- "loss": 1.9623,
93
  "step": 7000
94
  },
95
  {
96
- "epoch": 0.81,
97
- "learning_rate": 4.798083135903511e-05,
98
- "loss": 1.958,
99
  "step": 7500
100
  },
101
  {
102
- "epoch": 0.86,
103
- "learning_rate": 4.784622011630411e-05,
104
- "loss": 1.9854,
105
  "step": 8000
106
  },
107
  {
108
- "epoch": 0.92,
109
- "learning_rate": 4.771160887357312e-05,
110
- "loss": 1.981,
111
  "step": 8500
112
  },
113
  {
114
- "epoch": 0.97,
115
- "learning_rate": 4.7576997630842127e-05,
116
- "loss": 1.9759,
117
  "step": 9000
118
  },
119
  {
120
- "epoch": 1.02,
121
- "learning_rate": 4.744238638811114e-05,
122
- "loss": 1.872,
123
  "step": 9500
124
  },
125
- {
126
- "epoch": 1.08,
127
- "learning_rate": 4.730777514538015e-05,
128
- "loss": 1.7166,
129
- "step": 10000
130
- },
131
- {
132
- "epoch": 1.13,
133
- "learning_rate": 4.7173163902649154e-05,
134
- "loss": 1.7223,
135
- "step": 10500
136
- },
137
- {
138
- "epoch": 1.18,
139
- "learning_rate": 4.7038552659918154e-05,
140
- "loss": 1.7497,
141
- "step": 11000
142
- },
143
- {
144
- "epoch": 1.24,
145
- "learning_rate": 4.690394141718716e-05,
146
- "loss": 1.7416,
147
- "step": 11500
148
- },
149
- {
150
- "epoch": 1.29,
151
- "learning_rate": 4.6769330174456175e-05,
152
- "loss": 1.7603,
153
- "step": 12000
154
- },
155
- {
156
- "epoch": 1.35,
157
- "learning_rate": 4.663471893172518e-05,
158
- "loss": 1.7939,
159
- "step": 12500
160
- },
161
- {
162
- "epoch": 1.4,
163
- "learning_rate": 4.650010768899419e-05,
164
- "loss": 1.8245,
165
- "step": 13000
166
- },
167
- {
168
- "epoch": 1.45,
169
- "learning_rate": 4.6365496446263195e-05,
170
- "loss": 1.7997,
171
- "step": 13500
172
- },
173
- {
174
- "epoch": 1.51,
175
- "learning_rate": 4.62308852035322e-05,
176
- "loss": 1.8118,
177
- "step": 14000
178
- },
179
- {
180
- "epoch": 1.56,
181
- "learning_rate": 4.609627396080121e-05,
182
- "loss": 1.8217,
183
- "step": 14500
184
- },
185
- {
186
- "epoch": 1.62,
187
- "learning_rate": 4.5961662718070216e-05,
188
- "loss": 1.813,
189
- "step": 15000
190
- },
191
- {
192
- "epoch": 1.67,
193
- "learning_rate": 4.582705147533922e-05,
194
- "loss": 1.8473,
195
- "step": 15500
196
- },
197
- {
198
- "epoch": 1.72,
199
- "learning_rate": 4.569244023260823e-05,
200
- "loss": 1.8383,
201
- "step": 16000
202
- },
203
- {
204
- "epoch": 1.78,
205
- "learning_rate": 4.555782898987724e-05,
206
- "loss": 1.851,
207
- "step": 16500
208
- },
209
- {
210
- "epoch": 1.83,
211
- "learning_rate": 4.5423217747146244e-05,
212
- "loss": 1.869,
213
- "step": 17000
214
- },
215
- {
216
- "epoch": 1.88,
217
- "learning_rate": 4.528860650441525e-05,
218
- "loss": 1.8694,
219
- "step": 17500
220
- },
221
- {
222
- "epoch": 1.94,
223
- "learning_rate": 4.515399526168426e-05,
224
- "loss": 1.8716,
225
- "step": 18000
226
- },
227
- {
228
- "epoch": 1.99,
229
- "learning_rate": 4.5019384018953264e-05,
230
- "loss": 1.8382,
231
- "step": 18500
232
- },
233
- {
234
- "epoch": 2.05,
235
- "learning_rate": 4.488477277622227e-05,
236
- "loss": 1.6506,
237
- "step": 19000
238
- },
239
- {
240
- "epoch": 2.1,
241
- "learning_rate": 4.475016153349128e-05,
242
- "loss": 1.6231,
243
- "step": 19500
244
- },
245
  {
246
  "epoch": 2.15,
247
- "learning_rate": 4.461555029076029e-05,
248
- "loss": 1.6111,
249
- "step": 20000
250
- },
251
- {
252
- "epoch": 2.21,
253
- "learning_rate": 4.448093904802929e-05,
254
- "loss": 1.6543,
255
- "step": 20500
256
- },
257
- {
258
- "epoch": 2.26,
259
- "learning_rate": 4.43463278052983e-05,
260
- "loss": 1.6816,
261
- "step": 21000
262
- },
263
- {
264
- "epoch": 2.32,
265
- "learning_rate": 4.4211716562567306e-05,
266
- "loss": 1.6582,
267
- "step": 21500
268
- },
269
- {
270
- "epoch": 2.37,
271
- "learning_rate": 4.407710531983631e-05,
272
- "loss": 1.6787,
273
- "step": 22000
274
- },
275
- {
276
- "epoch": 2.42,
277
- "learning_rate": 4.394249407710532e-05,
278
- "loss": 1.7136,
279
- "step": 22500
280
- },
281
- {
282
- "epoch": 2.48,
283
- "learning_rate": 4.380788283437433e-05,
284
- "loss": 1.7168,
285
- "step": 23000
286
- },
287
- {
288
- "epoch": 2.53,
289
- "learning_rate": 4.367327159164334e-05,
290
- "loss": 1.6984,
291
- "step": 23500
292
- },
293
- {
294
- "epoch": 2.58,
295
- "learning_rate": 4.353866034891234e-05,
296
- "loss": 1.7052,
297
- "step": 24000
298
- },
299
- {
300
- "epoch": 2.64,
301
- "learning_rate": 4.340404910618135e-05,
302
- "loss": 1.7213,
303
- "step": 24500
304
- },
305
- {
306
- "epoch": 2.69,
307
- "learning_rate": 4.3269437863450354e-05,
308
- "loss": 1.7305,
309
- "step": 25000
310
- },
311
- {
312
- "epoch": 2.75,
313
- "learning_rate": 4.313482662071937e-05,
314
- "loss": 1.7348,
315
- "step": 25500
316
- },
317
- {
318
- "epoch": 2.8,
319
- "learning_rate": 4.3000215377988375e-05,
320
- "loss": 1.7359,
321
- "step": 26000
322
- },
323
- {
324
- "epoch": 2.85,
325
- "learning_rate": 4.286560413525738e-05,
326
- "loss": 1.7278,
327
- "step": 26500
328
- },
329
- {
330
- "epoch": 2.91,
331
- "learning_rate": 4.273099289252638e-05,
332
- "loss": 1.7558,
333
- "step": 27000
334
- },
335
- {
336
- "epoch": 2.96,
337
- "learning_rate": 4.259638164979539e-05,
338
- "loss": 1.7545,
339
- "step": 27500
340
- },
341
- {
342
- "epoch": 3.02,
343
- "learning_rate": 4.24617704070644e-05,
344
- "loss": 1.6658,
345
- "step": 28000
346
- },
347
- {
348
- "epoch": 3.07,
349
- "learning_rate": 4.232715916433341e-05,
350
- "loss": 1.4971,
351
- "step": 28500
352
- },
353
- {
354
- "epoch": 3.12,
355
- "learning_rate": 4.2192547921602416e-05,
356
- "loss": 1.519,
357
- "step": 29000
358
- },
359
- {
360
- "epoch": 3.18,
361
- "learning_rate": 4.205793667887142e-05,
362
- "loss": 1.5378,
363
- "step": 29500
364
- },
365
- {
366
- "epoch": 3.23,
367
- "learning_rate": 4.192332543614042e-05,
368
- "loss": 1.556,
369
- "step": 30000
370
- },
371
- {
372
- "epoch": 3.28,
373
- "learning_rate": 4.1788714193409437e-05,
374
- "loss": 1.5828,
375
- "step": 30500
376
- },
377
- {
378
- "epoch": 3.34,
379
- "learning_rate": 4.1654102950678443e-05,
380
- "loss": 1.5704,
381
- "step": 31000
382
- },
383
- {
384
- "epoch": 3.39,
385
- "learning_rate": 4.151949170794745e-05,
386
- "loss": 1.5954,
387
- "step": 31500
388
- },
389
- {
390
- "epoch": 3.45,
391
- "learning_rate": 4.138488046521646e-05,
392
- "loss": 1.5908,
393
- "step": 32000
394
- },
395
- {
396
- "epoch": 3.5,
397
- "learning_rate": 4.1250269222485464e-05,
398
- "loss": 1.6013,
399
- "step": 32500
400
- },
401
- {
402
- "epoch": 3.55,
403
- "learning_rate": 4.111565797975447e-05,
404
- "loss": 1.6179,
405
- "step": 33000
406
- },
407
- {
408
- "epoch": 3.61,
409
- "learning_rate": 4.098104673702348e-05,
410
- "loss": 1.6316,
411
- "step": 33500
412
- },
413
- {
414
- "epoch": 3.66,
415
- "learning_rate": 4.0846435494292485e-05,
416
- "loss": 1.6168,
417
- "step": 34000
418
- },
419
- {
420
- "epoch": 3.72,
421
- "learning_rate": 4.071182425156149e-05,
422
- "loss": 1.6459,
423
- "step": 34500
424
- },
425
- {
426
- "epoch": 3.77,
427
- "learning_rate": 4.05772130088305e-05,
428
- "loss": 1.6441,
429
- "step": 35000
430
- },
431
- {
432
- "epoch": 3.82,
433
- "learning_rate": 4.0442601766099506e-05,
434
- "loss": 1.6658,
435
- "step": 35500
436
- },
437
- {
438
- "epoch": 3.88,
439
- "learning_rate": 4.030799052336852e-05,
440
- "loss": 1.6688,
441
- "step": 36000
442
- },
443
- {
444
- "epoch": 3.93,
445
- "learning_rate": 4.017337928063752e-05,
446
- "loss": 1.6433,
447
- "step": 36500
448
- },
449
- {
450
- "epoch": 3.98,
451
- "learning_rate": 4.0038768037906526e-05,
452
- "loss": 1.645,
453
- "step": 37000
454
- },
455
- {
456
- "epoch": 4.04,
457
- "learning_rate": 3.990415679517553e-05,
458
- "loss": 1.461,
459
- "step": 37500
460
- },
461
- {
462
- "epoch": 4.09,
463
- "learning_rate": 3.976954555244454e-05,
464
- "loss": 1.4231,
465
- "step": 38000
466
- },
467
- {
468
- "epoch": 4.15,
469
- "learning_rate": 3.9634934309713554e-05,
470
- "loss": 1.4397,
471
- "step": 38500
472
- },
473
- {
474
- "epoch": 4.2,
475
- "learning_rate": 3.950032306698256e-05,
476
- "loss": 1.4422,
477
- "step": 39000
478
- },
479
- {
480
- "epoch": 4.25,
481
- "learning_rate": 3.936571182425156e-05,
482
- "loss": 1.4731,
483
- "step": 39500
484
- },
485
- {
486
- "epoch": 4.31,
487
- "learning_rate": 3.923110058152057e-05,
488
- "loss": 1.4869,
489
- "step": 40000
490
- },
491
- {
492
- "epoch": 4.36,
493
- "learning_rate": 3.9096489338789575e-05,
494
- "loss": 1.4939,
495
- "step": 40500
496
- },
497
- {
498
- "epoch": 4.42,
499
- "learning_rate": 3.896187809605859e-05,
500
- "loss": 1.5142,
501
- "step": 41000
502
- },
503
- {
504
- "epoch": 4.47,
505
- "learning_rate": 3.8827266853327595e-05,
506
- "loss": 1.4907,
507
- "step": 41500
508
- },
509
- {
510
- "epoch": 4.52,
511
- "learning_rate": 3.86926556105966e-05,
512
- "loss": 1.5144,
513
- "step": 42000
514
- },
515
- {
516
- "epoch": 4.58,
517
- "learning_rate": 3.85580443678656e-05,
518
- "loss": 1.5211,
519
- "step": 42500
520
- },
521
- {
522
- "epoch": 4.63,
523
- "learning_rate": 3.842343312513461e-05,
524
- "loss": 1.5204,
525
- "step": 43000
526
- },
527
- {
528
- "epoch": 4.68,
529
- "learning_rate": 3.828882188240362e-05,
530
- "loss": 1.5474,
531
- "step": 43500
532
- },
533
- {
534
- "epoch": 4.74,
535
- "learning_rate": 3.815421063967263e-05,
536
- "loss": 1.5521,
537
- "step": 44000
538
- },
539
- {
540
- "epoch": 4.79,
541
- "learning_rate": 3.8019599396941636e-05,
542
- "loss": 1.5533,
543
- "step": 44500
544
- },
545
- {
546
- "epoch": 4.85,
547
- "learning_rate": 3.788498815421064e-05,
548
- "loss": 1.5394,
549
- "step": 45000
550
- },
551
- {
552
- "epoch": 4.9,
553
- "learning_rate": 3.775037691147965e-05,
554
- "loss": 1.5555,
555
- "step": 45500
556
- },
557
- {
558
- "epoch": 4.95,
559
- "learning_rate": 3.761576566874865e-05,
560
- "loss": 1.5908,
561
- "step": 46000
562
- },
563
- {
564
- "epoch": 5.01,
565
- "learning_rate": 3.7481154426017664e-05,
566
- "loss": 1.5367,
567
- "step": 46500
568
- },
569
- {
570
- "epoch": 5.06,
571
- "learning_rate": 3.734654318328667e-05,
572
- "loss": 1.3431,
573
- "step": 47000
574
- },
575
- {
576
- "epoch": 5.12,
577
- "learning_rate": 3.721193194055568e-05,
578
- "loss": 1.3472,
579
- "step": 47500
580
- },
581
- {
582
- "epoch": 5.17,
583
- "learning_rate": 3.7077320697824685e-05,
584
- "loss": 1.3687,
585
- "step": 48000
586
- },
587
- {
588
- "epoch": 5.22,
589
- "learning_rate": 3.694270945509369e-05,
590
- "loss": 1.3831,
591
- "step": 48500
592
- },
593
- {
594
- "epoch": 5.28,
595
- "learning_rate": 3.68080982123627e-05,
596
- "loss": 1.3774,
597
- "step": 49000
598
- },
599
- {
600
- "epoch": 5.33,
601
- "learning_rate": 3.6673486969631705e-05,
602
- "loss": 1.3933,
603
- "step": 49500
604
- },
605
- {
606
- "epoch": 5.38,
607
- "learning_rate": 3.653887572690071e-05,
608
- "loss": 1.3961,
609
- "step": 50000
610
- },
611
- {
612
- "epoch": 5.44,
613
- "learning_rate": 3.640426448416972e-05,
614
- "loss": 1.4055,
615
- "step": 50500
616
- },
617
- {
618
- "epoch": 5.49,
619
- "learning_rate": 3.6269653241438726e-05,
620
- "loss": 1.4462,
621
- "step": 51000
622
- },
623
- {
624
- "epoch": 5.55,
625
- "learning_rate": 3.613504199870773e-05,
626
- "loss": 1.4235,
627
- "step": 51500
628
- },
629
- {
630
- "epoch": 5.6,
631
- "learning_rate": 3.600043075597674e-05,
632
- "loss": 1.4611,
633
- "step": 52000
634
- },
635
- {
636
- "epoch": 5.65,
637
- "learning_rate": 3.586581951324575e-05,
638
- "loss": 1.4476,
639
- "step": 52500
640
- },
641
- {
642
- "epoch": 5.71,
643
- "learning_rate": 3.5731208270514754e-05,
644
- "loss": 1.4708,
645
- "step": 53000
646
- },
647
- {
648
- "epoch": 5.76,
649
- "learning_rate": 3.559659702778376e-05,
650
- "loss": 1.4818,
651
- "step": 53500
652
- },
653
- {
654
- "epoch": 5.82,
655
- "learning_rate": 3.546198578505277e-05,
656
- "loss": 1.4486,
657
- "step": 54000
658
- },
659
- {
660
- "epoch": 5.87,
661
- "learning_rate": 3.532737454232178e-05,
662
- "loss": 1.466,
663
- "step": 54500
664
- },
665
- {
666
- "epoch": 5.92,
667
- "learning_rate": 3.519276329959078e-05,
668
- "loss": 1.4813,
669
- "step": 55000
670
- },
671
- {
672
- "epoch": 5.98,
673
- "learning_rate": 3.505815205685979e-05,
674
- "loss": 1.4932,
675
- "step": 55500
676
- },
677
- {
678
- "epoch": 6.03,
679
- "learning_rate": 3.4923540814128795e-05,
680
- "loss": 1.3391,
681
- "step": 56000
682
- },
683
- {
684
- "epoch": 6.08,
685
- "learning_rate": 3.47889295713978e-05,
686
- "loss": 1.2801,
687
- "step": 56500
688
- },
689
- {
690
- "epoch": 6.14,
691
- "learning_rate": 3.4654318328666816e-05,
692
- "loss": 1.2803,
693
- "step": 57000
694
- },
695
- {
696
- "epoch": 6.19,
697
- "learning_rate": 3.451970708593582e-05,
698
- "loss": 1.2791,
699
- "step": 57500
700
- },
701
- {
702
- "epoch": 6.25,
703
- "learning_rate": 3.438509584320483e-05,
704
- "loss": 1.297,
705
- "step": 58000
706
- },
707
- {
708
- "epoch": 6.3,
709
- "learning_rate": 3.425048460047383e-05,
710
- "loss": 1.304,
711
- "step": 58500
712
- },
713
- {
714
- "epoch": 6.35,
715
- "learning_rate": 3.4115873357742836e-05,
716
- "loss": 1.3166,
717
- "step": 59000
718
- },
719
- {
720
- "epoch": 6.41,
721
- "learning_rate": 3.398126211501185e-05,
722
- "loss": 1.3318,
723
- "step": 59500
724
- },
725
- {
726
- "epoch": 6.46,
727
- "learning_rate": 3.384665087228086e-05,
728
- "loss": 1.3501,
729
- "step": 60000
730
- },
731
- {
732
- "epoch": 6.52,
733
- "learning_rate": 3.3712039629549864e-05,
734
- "loss": 1.3576,
735
- "step": 60500
736
- },
737
- {
738
- "epoch": 6.57,
739
- "learning_rate": 3.357742838681887e-05,
740
- "loss": 1.3679,
741
- "step": 61000
742
- },
743
- {
744
- "epoch": 6.62,
745
- "learning_rate": 3.344281714408787e-05,
746
- "loss": 1.3905,
747
- "step": 61500
748
- },
749
- {
750
- "epoch": 6.68,
751
- "learning_rate": 3.3308205901356885e-05,
752
- "loss": 1.361,
753
- "step": 62000
754
- },
755
- {
756
- "epoch": 6.73,
757
- "learning_rate": 3.317359465862589e-05,
758
- "loss": 1.3657,
759
- "step": 62500
760
- },
761
- {
762
- "epoch": 6.78,
763
- "learning_rate": 3.30389834158949e-05,
764
- "loss": 1.3768,
765
- "step": 63000
766
- },
767
- {
768
- "epoch": 6.84,
769
- "learning_rate": 3.2904372173163905e-05,
770
- "loss": 1.4069,
771
- "step": 63500
772
- },
773
- {
774
- "epoch": 6.89,
775
- "learning_rate": 3.276976093043291e-05,
776
- "loss": 1.3913,
777
- "step": 64000
778
- },
779
- {
780
- "epoch": 6.95,
781
- "learning_rate": 3.263514968770192e-05,
782
- "loss": 1.3974,
783
- "step": 64500
784
- },
785
- {
786
- "epoch": 7.0,
787
- "learning_rate": 3.2500538444970926e-05,
788
- "loss": 1.4024,
789
- "step": 65000
790
- },
791
- {
792
- "epoch": 7.05,
793
- "learning_rate": 3.236592720223993e-05,
794
- "loss": 1.1818,
795
- "step": 65500
796
- },
797
- {
798
- "epoch": 7.11,
799
- "learning_rate": 3.223131595950894e-05,
800
- "loss": 1.2059,
801
- "step": 66000
802
- },
803
- {
804
- "epoch": 7.16,
805
- "learning_rate": 3.209670471677795e-05,
806
- "loss": 1.2021,
807
- "step": 66500
808
- },
809
- {
810
- "epoch": 7.22,
811
- "learning_rate": 3.1962093474046954e-05,
812
- "loss": 1.2224,
813
- "step": 67000
814
- },
815
- {
816
- "epoch": 7.27,
817
- "learning_rate": 3.182748223131596e-05,
818
- "loss": 1.2637,
819
- "step": 67500
820
- },
821
- {
822
- "epoch": 7.32,
823
- "learning_rate": 3.169287098858497e-05,
824
- "loss": 1.246,
825
- "step": 68000
826
- },
827
- {
828
- "epoch": 7.38,
829
- "learning_rate": 3.1558259745853974e-05,
830
- "loss": 1.2506,
831
- "step": 68500
832
- },
833
- {
834
- "epoch": 7.43,
835
- "learning_rate": 3.142364850312298e-05,
836
- "loss": 1.2749,
837
- "step": 69000
838
- },
839
- {
840
- "epoch": 7.48,
841
- "learning_rate": 3.128903726039199e-05,
842
- "loss": 1.2853,
843
- "step": 69500
844
- },
845
- {
846
- "epoch": 7.54,
847
- "learning_rate": 3.1154426017660995e-05,
848
- "loss": 1.2919,
849
- "step": 70000
850
- },
851
- {
852
- "epoch": 7.59,
853
- "learning_rate": 3.101981477493001e-05,
854
- "loss": 1.307,
855
- "step": 70500
856
- },
857
- {
858
- "epoch": 7.65,
859
- "learning_rate": 3.088520353219901e-05,
860
- "loss": 1.3098,
861
- "step": 71000
862
- },
863
- {
864
- "epoch": 7.7,
865
- "learning_rate": 3.0750592289468016e-05,
866
- "loss": 1.2979,
867
- "step": 71500
868
- },
869
- {
870
- "epoch": 7.75,
871
- "learning_rate": 3.061598104673702e-05,
872
- "loss": 1.3009,
873
- "step": 72000
874
- },
875
- {
876
- "epoch": 7.81,
877
- "learning_rate": 3.0481369804006033e-05,
878
- "loss": 1.2957,
879
- "step": 72500
880
- },
881
- {
882
- "epoch": 7.86,
883
- "learning_rate": 3.034675856127504e-05,
884
- "loss": 1.3204,
885
- "step": 73000
886
- },
887
- {
888
- "epoch": 7.92,
889
- "learning_rate": 3.021214731854405e-05,
890
- "loss": 1.3106,
891
- "step": 73500
892
- },
893
- {
894
- "epoch": 7.97,
895
- "learning_rate": 3.007753607581305e-05,
896
- "loss": 1.3095,
897
- "step": 74000
898
- },
899
- {
900
- "epoch": 8.02,
901
- "learning_rate": 2.994292483308206e-05,
902
- "loss": 1.214,
903
- "step": 74500
904
- },
905
- {
906
- "epoch": 8.08,
907
- "learning_rate": 2.9808313590351067e-05,
908
- "loss": 1.1334,
909
- "step": 75000
910
- },
911
- {
912
- "epoch": 8.13,
913
- "learning_rate": 2.9673702347620074e-05,
914
- "loss": 1.1408,
915
- "step": 75500
916
- },
917
- {
918
- "epoch": 8.18,
919
- "learning_rate": 2.9539091104889084e-05,
920
- "loss": 1.151,
921
- "step": 76000
922
- },
923
- {
924
- "epoch": 8.24,
925
- "learning_rate": 2.940447986215809e-05,
926
- "loss": 1.1552,
927
- "step": 76500
928
- },
929
- {
930
- "epoch": 8.29,
931
- "learning_rate": 2.9269868619427095e-05,
932
- "loss": 1.1694,
933
- "step": 77000
934
- },
935
- {
936
- "epoch": 8.35,
937
- "learning_rate": 2.9135257376696102e-05,
938
- "loss": 1.193,
939
- "step": 77500
940
- },
941
- {
942
- "epoch": 8.4,
943
- "learning_rate": 2.900064613396511e-05,
944
- "loss": 1.1861,
945
- "step": 78000
946
- },
947
- {
948
- "epoch": 8.45,
949
- "learning_rate": 2.8866034891234116e-05,
950
- "loss": 1.1871,
951
- "step": 78500
952
- },
953
- {
954
- "epoch": 8.51,
955
- "learning_rate": 2.8731423648503126e-05,
956
- "loss": 1.2038,
957
- "step": 79000
958
- },
959
- {
960
- "epoch": 8.56,
961
- "learning_rate": 2.8596812405772133e-05,
962
- "loss": 1.2259,
963
- "step": 79500
964
- },
965
- {
966
- "epoch": 8.62,
967
- "learning_rate": 2.846220116304114e-05,
968
- "loss": 1.2174,
969
- "step": 80000
970
- },
971
- {
972
- "epoch": 8.67,
973
- "learning_rate": 2.8327589920310143e-05,
974
- "loss": 1.2217,
975
- "step": 80500
976
- },
977
- {
978
- "epoch": 8.72,
979
- "learning_rate": 2.819297867757915e-05,
980
- "loss": 1.2326,
981
- "step": 81000
982
- },
983
- {
984
- "epoch": 8.78,
985
- "learning_rate": 2.805836743484816e-05,
986
- "loss": 1.2494,
987
- "step": 81500
988
- },
989
- {
990
- "epoch": 8.83,
991
- "learning_rate": 2.7923756192117167e-05,
992
- "loss": 1.2486,
993
- "step": 82000
994
- },
995
- {
996
- "epoch": 8.88,
997
- "learning_rate": 2.7789144949386174e-05,
998
- "loss": 1.2447,
999
- "step": 82500
1000
- },
1001
- {
1002
- "epoch": 8.94,
1003
- "learning_rate": 2.7654533706655184e-05,
1004
- "loss": 1.2425,
1005
- "step": 83000
1006
- },
1007
- {
1008
- "epoch": 8.99,
1009
- "learning_rate": 2.7519922463924185e-05,
1010
- "loss": 1.258,
1011
- "step": 83500
1012
- },
1013
- {
1014
- "epoch": 9.05,
1015
- "learning_rate": 2.7385311221193195e-05,
1016
- "loss": 1.0903,
1017
- "step": 84000
1018
- },
1019
- {
1020
- "epoch": 9.1,
1021
- "learning_rate": 2.7250699978462202e-05,
1022
- "loss": 1.0811,
1023
- "step": 84500
1024
- },
1025
- {
1026
- "epoch": 9.15,
1027
- "learning_rate": 2.711608873573121e-05,
1028
- "loss": 1.0894,
1029
- "step": 85000
1030
- },
1031
- {
1032
- "epoch": 9.21,
1033
- "learning_rate": 2.698147749300022e-05,
1034
- "loss": 1.1093,
1035
- "step": 85500
1036
- },
1037
- {
1038
- "epoch": 9.26,
1039
- "learning_rate": 2.6846866250269226e-05,
1040
- "loss": 1.0984,
1041
- "step": 86000
1042
- },
1043
- {
1044
- "epoch": 9.32,
1045
- "learning_rate": 2.671225500753823e-05,
1046
- "loss": 1.0906,
1047
- "step": 86500
1048
- },
1049
- {
1050
- "epoch": 9.37,
1051
- "learning_rate": 2.6577643764807236e-05,
1052
- "loss": 1.109,
1053
- "step": 87000
1054
- },
1055
- {
1056
- "epoch": 9.42,
1057
- "learning_rate": 2.6443032522076243e-05,
1058
- "loss": 1.1383,
1059
- "step": 87500
1060
- },
1061
- {
1062
- "epoch": 9.48,
1063
- "learning_rate": 2.6308421279345253e-05,
1064
- "loss": 1.146,
1065
- "step": 88000
1066
- },
1067
- {
1068
- "epoch": 9.53,
1069
- "learning_rate": 2.617381003661426e-05,
1070
- "loss": 1.1484,
1071
- "step": 88500
1072
- },
1073
- {
1074
- "epoch": 9.58,
1075
- "learning_rate": 2.6039198793883267e-05,
1076
- "loss": 1.1512,
1077
- "step": 89000
1078
- },
1079
- {
1080
- "epoch": 9.64,
1081
- "learning_rate": 2.5904587551152277e-05,
1082
- "loss": 1.154,
1083
- "step": 89500
1084
- },
1085
- {
1086
- "epoch": 9.69,
1087
- "learning_rate": 2.5769976308421278e-05,
1088
- "loss": 1.1533,
1089
- "step": 90000
1090
- },
1091
- {
1092
- "epoch": 9.75,
1093
- "learning_rate": 2.5635365065690288e-05,
1094
- "loss": 1.183,
1095
- "step": 90500
1096
- },
1097
- {
1098
- "epoch": 9.8,
1099
- "learning_rate": 2.5500753822959295e-05,
1100
- "loss": 1.1628,
1101
- "step": 91000
1102
- },
1103
- {
1104
- "epoch": 9.85,
1105
- "learning_rate": 2.53661425802283e-05,
1106
- "loss": 1.1586,
1107
- "step": 91500
1108
- },
1109
- {
1110
- "epoch": 9.91,
1111
- "learning_rate": 2.5231531337497312e-05,
1112
- "loss": 1.1968,
1113
- "step": 92000
1114
- },
1115
- {
1116
- "epoch": 9.96,
1117
- "learning_rate": 2.509692009476632e-05,
1118
- "loss": 1.1853,
1119
- "step": 92500
1120
- },
1121
- {
1122
- "epoch": 10.02,
1123
- "learning_rate": 2.4962308852035322e-05,
1124
- "loss": 1.1338,
1125
- "step": 93000
1126
- },
1127
- {
1128
- "epoch": 10.07,
1129
- "learning_rate": 2.482769760930433e-05,
1130
- "loss": 1.0121,
1131
- "step": 93500
1132
- },
1133
- {
1134
- "epoch": 10.12,
1135
- "learning_rate": 2.4693086366573336e-05,
1136
- "loss": 1.0293,
1137
- "step": 94000
1138
- },
1139
- {
1140
- "epoch": 10.18,
1141
- "learning_rate": 2.4558475123842346e-05,
1142
- "loss": 1.0306,
1143
- "step": 94500
1144
- },
1145
- {
1146
- "epoch": 10.23,
1147
- "learning_rate": 2.4423863881111353e-05,
1148
- "loss": 1.0396,
1149
- "step": 95000
1150
- },
1151
- {
1152
- "epoch": 10.28,
1153
- "learning_rate": 2.4289252638380357e-05,
1154
- "loss": 1.0501,
1155
- "step": 95500
1156
- },
1157
- {
1158
- "epoch": 10.34,
1159
- "learning_rate": 2.4154641395649367e-05,
1160
- "loss": 1.0499,
1161
- "step": 96000
1162
- },
1163
- {
1164
- "epoch": 10.39,
1165
- "learning_rate": 2.4020030152918374e-05,
1166
- "loss": 1.0542,
1167
- "step": 96500
1168
- },
1169
- {
1170
- "epoch": 10.45,
1171
- "learning_rate": 2.388541891018738e-05,
1172
- "loss": 1.0756,
1173
- "step": 97000
1174
- },
1175
- {
1176
- "epoch": 10.5,
1177
- "learning_rate": 2.3750807667456388e-05,
1178
- "loss": 1.0793,
1179
- "step": 97500
1180
- },
1181
- {
1182
- "epoch": 10.55,
1183
- "learning_rate": 2.3616196424725395e-05,
1184
- "loss": 1.0988,
1185
- "step": 98000
1186
- },
1187
- {
1188
- "epoch": 10.61,
1189
- "learning_rate": 2.34815851819944e-05,
1190
- "loss": 1.0935,
1191
- "step": 98500
1192
- },
1193
- {
1194
- "epoch": 10.66,
1195
- "learning_rate": 2.334697393926341e-05,
1196
- "loss": 1.091,
1197
- "step": 99000
1198
- },
1199
- {
1200
- "epoch": 10.72,
1201
- "learning_rate": 2.3212362696532415e-05,
1202
- "loss": 1.1021,
1203
- "step": 99500
1204
- },
1205
- {
1206
- "epoch": 10.77,
1207
- "learning_rate": 2.3077751453801422e-05,
1208
- "loss": 1.1107,
1209
- "step": 100000
1210
- },
1211
- {
1212
- "epoch": 10.82,
1213
- "learning_rate": 2.294314021107043e-05,
1214
- "loss": 1.1004,
1215
- "step": 100500
1216
- },
1217
- {
1218
- "epoch": 10.88,
1219
- "learning_rate": 2.2808528968339436e-05,
1220
- "loss": 1.1234,
1221
- "step": 101000
1222
- },
1223
- {
1224
- "epoch": 10.93,
1225
- "learning_rate": 2.2673917725608443e-05,
1226
- "loss": 1.12,
1227
- "step": 101500
1228
- },
1229
- {
1230
- "epoch": 10.98,
1231
- "learning_rate": 2.253930648287745e-05,
1232
- "loss": 1.1295,
1233
- "step": 102000
1234
- },
1235
- {
1236
- "epoch": 11.04,
1237
- "learning_rate": 2.240469524014646e-05,
1238
- "loss": 1.0042,
1239
- "step": 102500
1240
- },
1241
- {
1242
- "epoch": 11.09,
1243
- "learning_rate": 2.2270083997415464e-05,
1244
- "loss": 0.9767,
1245
- "step": 103000
1246
- },
1247
- {
1248
- "epoch": 11.15,
1249
- "learning_rate": 2.213547275468447e-05,
1250
- "loss": 0.9776,
1251
- "step": 103500
1252
- },
1253
- {
1254
- "epoch": 11.2,
1255
- "learning_rate": 2.200086151195348e-05,
1256
- "loss": 0.9777,
1257
- "step": 104000
1258
- },
1259
- {
1260
- "epoch": 11.25,
1261
- "learning_rate": 2.1866250269222484e-05,
1262
- "loss": 0.9975,
1263
- "step": 104500
1264
- },
1265
- {
1266
- "epoch": 11.31,
1267
- "learning_rate": 2.1731639026491495e-05,
1268
- "loss": 1.0122,
1269
- "step": 105000
1270
- },
1271
- {
1272
- "epoch": 11.36,
1273
- "learning_rate": 2.15970277837605e-05,
1274
- "loss": 0.991,
1275
- "step": 105500
1276
- },
1277
- {
1278
- "epoch": 11.42,
1279
- "learning_rate": 2.146241654102951e-05,
1280
- "loss": 1.0095,
1281
- "step": 106000
1282
- },
1283
- {
1284
- "epoch": 11.47,
1285
- "learning_rate": 2.1327805298298515e-05,
1286
- "loss": 1.0179,
1287
- "step": 106500
1288
- },
1289
- {
1290
- "epoch": 11.52,
1291
- "learning_rate": 2.1193194055567522e-05,
1292
- "loss": 1.019,
1293
- "step": 107000
1294
- },
1295
- {
1296
- "epoch": 11.58,
1297
- "learning_rate": 2.105858281283653e-05,
1298
- "loss": 1.0495,
1299
- "step": 107500
1300
- },
1301
- {
1302
- "epoch": 11.63,
1303
- "learning_rate": 2.0923971570105536e-05,
1304
- "loss": 1.0266,
1305
- "step": 108000
1306
- },
1307
- {
1308
- "epoch": 11.68,
1309
- "learning_rate": 2.0789360327374543e-05,
1310
- "loss": 1.0422,
1311
- "step": 108500
1312
- },
1313
- {
1314
- "epoch": 11.74,
1315
- "learning_rate": 2.0654749084643553e-05,
1316
- "loss": 1.0383,
1317
- "step": 109000
1318
- },
1319
- {
1320
- "epoch": 11.79,
1321
- "learning_rate": 2.0520137841912557e-05,
1322
- "loss": 1.0624,
1323
- "step": 109500
1324
- },
1325
- {
1326
- "epoch": 11.85,
1327
- "learning_rate": 2.0385526599181564e-05,
1328
- "loss": 1.064,
1329
- "step": 110000
1330
  }
    ],
    "logging_steps": 500,
-   "max_steps": 185720,
    "num_input_tokens_seen": 0,
-   "num_train_epochs": 20,
    "save_steps": 10000,
-   "total_flos": 1.14965619867648e+17,
-   "train_batch_size": 4,
    "trial_name": null,
    "trial_params": null
  }
 
  {
    "best_metric": null,
    "best_model_checkpoint": null,
+   "epoch": 2.1491510853212983,
    "eval_steps": 500,
+   "global_step": 10000,
    "is_hyper_param_search": false,
    "is_local_process_zero": true,
    "is_world_process_zero": true,
    "log_history": [
      {
+       "epoch": 0.11,
+       "learning_rate": 4.820904076223225e-05,
+       "loss": 1.1625,
        "step": 500
      },
      {
+       "epoch": 0.21,
+       "learning_rate": 4.641808152446451e-05,
+       "loss": 1.1918,
        "step": 1000
      },
      {
+       "epoch": 0.32,
+       "learning_rate": 4.462712228669675e-05,
+       "loss": 1.2037,
        "step": 1500
      },
      {
+       "epoch": 0.43,
+       "learning_rate": 4.283616304892901e-05,
+       "loss": 1.2352,
        "step": 2000
      },
      {
+       "epoch": 0.54,
+       "learning_rate": 4.104520381116126e-05,
+       "loss": 1.2375,
        "step": 2500
      },
      {
+       "epoch": 0.64,
+       "learning_rate": 3.9254244573393514e-05,
+       "loss": 1.2305,
        "step": 3000
      },
      {
+       "epoch": 0.75,
+       "learning_rate": 3.746328533562576e-05,
+       "loss": 1.2822,
        "step": 3500
      },
      {
+       "epoch": 0.86,
+       "learning_rate": 3.567232609785802e-05,
+       "loss": 1.2733,
        "step": 4000
      },
      {
+       "epoch": 0.97,
+       "learning_rate": 3.388136686009026e-05,
+       "loss": 1.2598,
        "step": 4500
      },
      {
+       "epoch": 1.07,
+       "learning_rate": 3.209040762232252e-05,
+       "loss": 1.1006,
        "step": 5000
      },
      {
+       "epoch": 1.18,
+       "learning_rate": 3.029944838455477e-05,
+       "loss": 1.0509,
        "step": 5500
      },
      {
+       "epoch": 1.29,
+       "learning_rate": 2.850848914678702e-05,
+       "loss": 1.0549,
        "step": 6000
      },
      {
+       "epoch": 1.4,
+       "learning_rate": 2.6717529909019275e-05,
+       "loss": 1.0917,
        "step": 6500
      },
      {
+       "epoch": 1.5,
+       "learning_rate": 2.4926570671251524e-05,
+       "loss": 1.1086,
        "step": 7000
      },
      {
+       "epoch": 1.61,
+       "learning_rate": 2.3135611433483774e-05,
+       "loss": 1.0815,
        "step": 7500
      },
      {
+       "epoch": 1.72,
+       "learning_rate": 2.1344652195716027e-05,
+       "loss": 1.0995,
        "step": 8000
      },
      {
+       "epoch": 1.83,
+       "learning_rate": 1.9553692957948277e-05,
+       "loss": 1.1132,
        "step": 8500
      },
      {
+       "epoch": 1.93,
+       "learning_rate": 1.776273372018053e-05,
+       "loss": 1.075,
        "step": 9000
      },
      {
+       "epoch": 2.04,
+       "learning_rate": 1.5971774482412783e-05,
+       "loss": 1.054,
        "step": 9500
      },
      {
        "epoch": 2.15,
+       "learning_rate": 1.4180815244645032e-05,
+       "loss": 0.9583,
+       "step": 10000
      }
    ],
    "logging_steps": 500,
+   "max_steps": 13959,
    "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
    "save_steps": 10000,
+   "total_flos": 2.0902317391872e+16,
+   "train_batch_size": 8,
    "trial_name": null,
    "trial_params": null
  }
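The replacement trainer_state.json describes a much shorter run: 20 log entries at 500-step intervals up to global_step 10000, out of max_steps 13959 over 3 epochs with train_batch_size 8 (the previous state ran to step 110000 of a 20-epoch schedule). A small consistency check of those numbers, assuming the file is read locally as plain JSON:

import json

# trainer_state.json ships next to the checkpoint files in this commit.
with open("trainer_state.json") as f:
    state = json.load(f)

steps_per_epoch = state["max_steps"] / state["num_train_epochs"]  # 13959 / 3 = 4653
print(state["global_step"] / steps_per_epoch)  # ~2.149, matching the recorded "epoch"
print(len(state["log_history"]))               # 20 entries, one every 500 steps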
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:846a13b1247bfc212e1559f5a61140830e2e39df0b02326620e4db782b0aaf0c
+ size 4792
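training_args.bin is new in this commit; it is the pickled TrainingArguments object that the Trainer saves alongside a checkpoint, so it can be inspected rather than just stored. A hedged sketch, assuming transformers is importable in the environment (the pickle needs it to resolve) and a torch version that accepts the weights_only argument:

import torch

# Not a plain tensor file: it unpickles a TrainingArguments object, so only
# load it for checkpoints you trust.
args = torch.load("training_args.bin", weights_only=False)
print(args.per_device_train_batch_size, args.num_train_epochs, args.learning_rate)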