lillian039 committed on
Commit
8707c51
1 Parent(s): b42ee50

Model save

Browse files
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: llama3.1
4
+ base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: transduction-10k-seed100-instruct-fft_lr1e-5_epoch2
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # transduction-10k-seed100-instruct-fft_lr1e-5_epoch2
18
+
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on an unknown dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.0826
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 1e-05
41
+ - train_batch_size: 8
42
+ - eval_batch_size: 4
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 8
46
+ - gradient_accumulation_steps: 2
47
+ - total_train_batch_size: 128
48
+ - total_eval_batch_size: 32
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: cosine
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - num_epochs: 2
53
+
54
+ ### Training results
55
+
56
+ | Training Loss | Epoch | Step | Validation Loss |
57
+ |:-------------:|:-----:|:----:|:---------------:|
58
+ | 0.1352 | 1.0 | 76 | 0.0970 |
59
+ | 0.051 | 2.0 | 152 | 0.0826 |
60
+
61
+
62
+ ### Framework versions
63
+
64
+ - Transformers 4.45.0.dev0
65
+ - PyTorch 2.4.0+cu121
66
+ - Datasets 3.0.1
67
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 4.516827336435302e+16,
4
+ "train_loss": 0.09894711428665016,
5
+ "train_runtime": 2051.5561,
6
+ "train_samples": 9698,
7
+ "train_samples_per_second": 9.454,
8
+ "train_steps_per_second": 0.074
9
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 128000,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 128001,
6
+ 128008,
7
+ 128009
8
+ ],
9
+ "temperature": 0.6,
10
+ "top_p": 0.9,
11
+ "transformers_version": "4.45.0.dev0"
12
+ }
runs/Sep29_20-52-53_instance-20240927-081456/events.out.tfevents.1727643237.instance-20240927-081456.554162.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea79e71c7be12b2a2140f5958bf9a942e1083175053bd4b820b79676cf17f92e
3
- size 37822
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4914a078681d620744ae0eb959c5c29eca8cbfe8f052e331f3ec8bd6303b9c3
3
+ size 38447
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 4.516827336435302e+16,
4
+ "train_loss": 0.09894711428665016,
5
+ "train_runtime": 2051.5561,
6
+ "train_samples": 9698,
7
+ "train_samples_per_second": 9.454,
8
+ "train_steps_per_second": 0.074
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 152,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.013157894736842105,
13
+ "grad_norm": 13.235902064816667,
14
+ "learning_rate": 6.25e-07,
15
+ "loss": 0.3431,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.02631578947368421,
20
+ "grad_norm": 13.138228566029483,
21
+ "learning_rate": 1.25e-06,
22
+ "loss": 0.3429,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.039473684210526314,
27
+ "grad_norm": 11.866560617710881,
28
+ "learning_rate": 1.8750000000000003e-06,
29
+ "loss": 0.2846,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.05263157894736842,
34
+ "grad_norm": 11.872638763731226,
35
+ "learning_rate": 2.5e-06,
36
+ "loss": 0.3455,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.06578947368421052,
41
+ "grad_norm": 7.262421698320824,
42
+ "learning_rate": 3.125e-06,
43
+ "loss": 0.2313,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.07894736842105263,
48
+ "grad_norm": 5.162108989306741,
49
+ "learning_rate": 3.7500000000000005e-06,
50
+ "loss": 0.1935,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.09210526315789473,
55
+ "grad_norm": 4.989963293072479,
56
+ "learning_rate": 4.3750000000000005e-06,
57
+ "loss": 0.1607,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.10526315789473684,
62
+ "grad_norm": 3.2196850103315575,
63
+ "learning_rate": 5e-06,
64
+ "loss": 0.1746,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.11842105263157894,
69
+ "grad_norm": 9.267917781353754,
70
+ "learning_rate": 5.625e-06,
71
+ "loss": 0.2141,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.13157894736842105,
76
+ "grad_norm": 2.9550126149670968,
77
+ "learning_rate": 6.25e-06,
78
+ "loss": 0.1466,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.14473684210526316,
83
+ "grad_norm": 3.2305434151019665,
84
+ "learning_rate": 6.875e-06,
85
+ "loss": 0.1559,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.15789473684210525,
90
+ "grad_norm": 2.303728369668543,
91
+ "learning_rate": 7.500000000000001e-06,
92
+ "loss": 0.1459,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.17105263157894737,
97
+ "grad_norm": 2.1650046636974225,
98
+ "learning_rate": 8.125000000000001e-06,
99
+ "loss": 0.1158,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.18421052631578946,
104
+ "grad_norm": 2.43429839604823,
105
+ "learning_rate": 8.750000000000001e-06,
106
+ "loss": 0.1429,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.19736842105263158,
111
+ "grad_norm": 2.256199461941204,
112
+ "learning_rate": 9.375000000000001e-06,
113
+ "loss": 0.1569,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.21052631578947367,
118
+ "grad_norm": 2.540675164259285,
119
+ "learning_rate": 1e-05,
120
+ "loss": 0.1389,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.2236842105263158,
125
+ "grad_norm": 3.0603124517729934,
126
+ "learning_rate": 9.998666040558187e-06,
127
+ "loss": 0.1481,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.23684210526315788,
132
+ "grad_norm": 3.15679704599401,
133
+ "learning_rate": 9.994664874011864e-06,
134
+ "loss": 0.1543,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.25,
139
+ "grad_norm": 1.2073031998301942,
140
+ "learning_rate": 9.987998635318586e-06,
141
+ "loss": 0.0912,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.2631578947368421,
146
+ "grad_norm": 1.5485358071033657,
147
+ "learning_rate": 9.978670881475173e-06,
148
+ "loss": 0.1454,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.27631578947368424,
153
+ "grad_norm": 1.837098464284506,
154
+ "learning_rate": 9.96668658961975e-06,
155
+ "loss": 0.1263,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.2894736842105263,
160
+ "grad_norm": 2.195521777946691,
161
+ "learning_rate": 9.952052154376027e-06,
162
+ "loss": 0.1347,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.3026315789473684,
167
+ "grad_norm": 1.4759607642496113,
168
+ "learning_rate": 9.93477538444123e-06,
169
+ "loss": 0.1036,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.3157894736842105,
174
+ "grad_norm": 3.4617468292311653,
175
+ "learning_rate": 9.91486549841951e-06,
176
+ "loss": 0.1401,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.32894736842105265,
181
+ "grad_norm": 1.8290234660376485,
182
+ "learning_rate": 9.892333119903045e-06,
183
+ "loss": 0.1191,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.34210526315789475,
188
+ "grad_norm": 1.9446124807007885,
189
+ "learning_rate": 9.867190271803466e-06,
190
+ "loss": 0.1156,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.35526315789473684,
195
+ "grad_norm": 1.924481905745065,
196
+ "learning_rate": 9.839450369936615e-06,
197
+ "loss": 0.1203,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.3684210526315789,
202
+ "grad_norm": 0.9472794741039898,
203
+ "learning_rate": 9.809128215864096e-06,
204
+ "loss": 0.084,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.3815789473684211,
209
+ "grad_norm": 1.8614087289828243,
210
+ "learning_rate": 9.776239988995401e-06,
211
+ "loss": 0.1237,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.39473684210526316,
216
+ "grad_norm": 1.8725601311672047,
217
+ "learning_rate": 9.74080323795483e-06,
218
+ "loss": 0.109,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.40789473684210525,
223
+ "grad_norm": 1.6989866208387698,
224
+ "learning_rate": 9.702836871217838e-06,
225
+ "loss": 0.1078,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.42105263157894735,
230
+ "grad_norm": 1.4620493708990816,
231
+ "learning_rate": 9.66236114702178e-06,
232
+ "loss": 0.0993,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.4342105263157895,
237
+ "grad_norm": 1.4349318310245505,
238
+ "learning_rate": 9.619397662556434e-06,
239
+ "loss": 0.1168,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.4473684210526316,
244
+ "grad_norm": 1.2971799391770196,
245
+ "learning_rate": 9.573969342440107e-06,
246
+ "loss": 0.0989,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.4605263157894737,
251
+ "grad_norm": 1.3284875209748295,
252
+ "learning_rate": 9.52610042648741e-06,
253
+ "loss": 0.1052,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.47368421052631576,
258
+ "grad_norm": 1.517294107060111,
259
+ "learning_rate": 9.475816456775313e-06,
260
+ "loss": 0.1095,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.4868421052631579,
265
+ "grad_norm": 1.8550537994393244,
266
+ "learning_rate": 9.423144264014278e-06,
267
+ "loss": 0.0975,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.5,
272
+ "grad_norm": 2.2490574894196733,
273
+ "learning_rate": 9.368111953231849e-06,
274
+ "loss": 0.1171,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.5131578947368421,
279
+ "grad_norm": 1.839240097339726,
280
+ "learning_rate": 9.310748888776254e-06,
281
+ "loss": 0.0916,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.5263157894736842,
286
+ "grad_norm": 1.0701426736883897,
287
+ "learning_rate": 9.251085678648072e-06,
288
+ "loss": 0.1004,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.5394736842105263,
293
+ "grad_norm": 1.3170647678764655,
294
+ "learning_rate": 9.189154158168293e-06,
295
+ "loss": 0.0873,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.5526315789473685,
300
+ "grad_norm": 1.2646530838963848,
301
+ "learning_rate": 9.124987372991512e-06,
302
+ "loss": 0.1062,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.5657894736842105,
307
+ "grad_norm": 2.0376323047774947,
308
+ "learning_rate": 9.058619561473308e-06,
309
+ "loss": 0.1446,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.5789473684210527,
314
+ "grad_norm": 1.4525149631147896,
315
+ "learning_rate": 8.990086136401199e-06,
316
+ "loss": 0.1063,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.5921052631578947,
321
+ "grad_norm": 1.7227367203376753,
322
+ "learning_rate": 8.91942366609897e-06,
323
+ "loss": 0.1288,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.6052631578947368,
328
+ "grad_norm": 1.6374368378542756,
329
+ "learning_rate": 8.846669854914395e-06,
330
+ "loss": 0.1101,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.618421052631579,
335
+ "grad_norm": 1.776661892531926,
336
+ "learning_rate": 8.771863523100821e-06,
337
+ "loss": 0.0863,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.631578947368421,
342
+ "grad_norm": 2.101189002973872,
343
+ "learning_rate": 8.695044586103297e-06,
344
+ "loss": 0.1155,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.6447368421052632,
349
+ "grad_norm": 1.4990143401140024,
350
+ "learning_rate": 8.616254033260351e-06,
351
+ "loss": 0.1233,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.6578947368421053,
356
+ "grad_norm": 1.300205397968391,
357
+ "learning_rate": 8.535533905932739e-06,
358
+ "loss": 0.0722,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.6710526315789473,
363
+ "grad_norm": 1.2976123144962795,
364
+ "learning_rate": 8.452927275070858e-06,
365
+ "loss": 0.0955,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.6842105263157895,
370
+ "grad_norm": 3.1993999412027807,
371
+ "learning_rate": 8.368478218232787e-06,
372
+ "loss": 0.0992,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 0.6973684210526315,
377
+ "grad_norm": 1.511040367087983,
378
+ "learning_rate": 8.282231796065215e-06,
379
+ "loss": 0.1239,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 0.7105263157894737,
384
+ "grad_norm": 1.735863282573467,
385
+ "learning_rate": 8.194234028259806e-06,
386
+ "loss": 0.1114,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 0.7236842105263158,
391
+ "grad_norm": 1.062538506538223,
392
+ "learning_rate": 8.104531868997858e-06,
393
+ "loss": 0.0833,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 0.7368421052631579,
398
+ "grad_norm": 1.4057184361489892,
399
+ "learning_rate": 8.013173181896283e-06,
400
+ "loss": 0.1348,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 0.75,
405
+ "grad_norm": 1.0217071440957173,
406
+ "learning_rate": 7.920206714468383e-06,
407
+ "loss": 0.0869,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 0.7631578947368421,
412
+ "grad_norm": 0.9647935123156808,
413
+ "learning_rate": 7.82568207211296e-06,
414
+ "loss": 0.0774,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 0.7763157894736842,
419
+ "grad_norm": 1.2119765768642377,
420
+ "learning_rate": 7.729649691645673e-06,
421
+ "loss": 0.0867,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 0.7894736842105263,
426
+ "grad_norm": 0.684404887958961,
427
+ "learning_rate": 7.63216081438678e-06,
428
+ "loss": 0.077,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 0.8026315789473685,
433
+ "grad_norm": 1.202460612915356,
434
+ "learning_rate": 7.533267458819597e-06,
435
+ "loss": 0.0823,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 0.8157894736842105,
440
+ "grad_norm": 1.307993207575058,
441
+ "learning_rate": 7.4330223928342814e-06,
442
+ "loss": 0.0902,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 0.8289473684210527,
447
+ "grad_norm": 1.6640855566750032,
448
+ "learning_rate": 7.33147910557174e-06,
449
+ "loss": 0.141,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 0.8421052631578947,
454
+ "grad_norm": 1.3895288949997349,
455
+ "learning_rate": 7.2286917788826926e-06,
456
+ "loss": 0.0992,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 0.8552631578947368,
461
+ "grad_norm": 0.86233230188121,
462
+ "learning_rate": 7.124715258417111e-06,
463
+ "loss": 0.0663,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 0.868421052631579,
468
+ "grad_norm": 1.3734677592094817,
469
+ "learning_rate": 7.019605024359475e-06,
470
+ "loss": 0.0848,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 0.881578947368421,
475
+ "grad_norm": 1.2016915391705267,
476
+ "learning_rate": 6.913417161825449e-06,
477
+ "loss": 0.1137,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 0.8947368421052632,
482
+ "grad_norm": 1.1962233631272159,
483
+ "learning_rate": 6.806208330935766e-06,
484
+ "loss": 0.0866,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 0.9078947368421053,
489
+ "grad_norm": 1.3774507325980982,
490
+ "learning_rate": 6.698035736583307e-06,
491
+ "loss": 0.0975,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 0.9210526315789473,
496
+ "grad_norm": 1.0910714190292217,
497
+ "learning_rate": 6.588957097909509e-06,
498
+ "loss": 0.0804,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 0.9342105263157895,
503
+ "grad_norm": 0.7559368152288192,
504
+ "learning_rate": 6.4790306175063535e-06,
505
+ "loss": 0.0748,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 0.9473684210526315,
510
+ "grad_norm": 1.320278904610911,
511
+ "learning_rate": 6.368314950360416e-06,
512
+ "loss": 0.1092,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 0.9605263157894737,
517
+ "grad_norm": 0.9978117274759357,
518
+ "learning_rate": 6.2568691725555144e-06,
519
+ "loss": 0.0669,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 0.9736842105263158,
524
+ "grad_norm": 1.184512537053634,
525
+ "learning_rate": 6.144752749750671e-06,
526
+ "loss": 0.0791,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 0.9868421052631579,
531
+ "grad_norm": 1.3888364419849502,
532
+ "learning_rate": 6.0320255054501985e-06,
533
+ "loss": 0.0791,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 1.0,
538
+ "grad_norm": 1.0696792385978997,
539
+ "learning_rate": 5.918747589082853e-06,
540
+ "loss": 0.1352,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 1.0,
545
+ "eval_loss": 0.09702859073877335,
546
+ "eval_runtime": 143.4285,
547
+ "eval_samples_per_second": 35.593,
548
+ "eval_steps_per_second": 1.116,
549
+ "step": 76
550
+ },
551
+ {
552
+ "epoch": 1.013157894736842,
553
+ "grad_norm": 1.130795788902584,
554
+ "learning_rate": 5.804979443907065e-06,
555
+ "loss": 0.0565,
556
+ "step": 77
557
+ },
558
+ {
559
+ "epoch": 1.0263157894736843,
560
+ "grad_norm": 0.9742675703939438,
561
+ "learning_rate": 5.690781774759412e-06,
562
+ "loss": 0.0826,
563
+ "step": 78
564
+ },
565
+ {
566
+ "epoch": 1.0394736842105263,
567
+ "grad_norm": 1.4529332894640457,
568
+ "learning_rate": 5.576215515663489e-06,
569
+ "loss": 0.109,
570
+ "step": 79
571
+ },
572
+ {
573
+ "epoch": 1.0526315789473684,
574
+ "grad_norm": 0.8745980190537407,
575
+ "learning_rate": 5.46134179731651e-06,
576
+ "loss": 0.0815,
577
+ "step": 80
578
+ },
579
+ {
580
+ "epoch": 1.0657894736842106,
581
+ "grad_norm": 0.965352409023328,
582
+ "learning_rate": 5.346221914470959e-06,
583
+ "loss": 0.0753,
584
+ "step": 81
585
+ },
586
+ {
587
+ "epoch": 1.0789473684210527,
588
+ "grad_norm": 0.9281912350813449,
589
+ "learning_rate": 5.230917293228699e-06,
590
+ "loss": 0.0604,
591
+ "step": 82
592
+ },
593
+ {
594
+ "epoch": 1.0921052631578947,
595
+ "grad_norm": 0.7117517424840752,
596
+ "learning_rate": 5.115489458265006e-06,
597
+ "loss": 0.0548,
598
+ "step": 83
599
+ },
600
+ {
601
+ "epoch": 1.1052631578947367,
602
+ "grad_norm": 0.8174226361772433,
603
+ "learning_rate": 5e-06,
604
+ "loss": 0.0798,
605
+ "step": 84
606
+ },
607
+ {
608
+ "epoch": 1.118421052631579,
609
+ "grad_norm": 1.091499890906096,
610
+ "learning_rate": 4.8845105417349955e-06,
611
+ "loss": 0.0669,
612
+ "step": 85
613
+ },
614
+ {
615
+ "epoch": 1.131578947368421,
616
+ "grad_norm": 0.8489893677105097,
617
+ "learning_rate": 4.7690827067713035e-06,
618
+ "loss": 0.0799,
619
+ "step": 86
620
+ },
621
+ {
622
+ "epoch": 1.1447368421052633,
623
+ "grad_norm": 1.0103580405572923,
624
+ "learning_rate": 4.653778085529043e-06,
625
+ "loss": 0.0697,
626
+ "step": 87
627
+ },
628
+ {
629
+ "epoch": 1.1578947368421053,
630
+ "grad_norm": 0.8915773944210619,
631
+ "learning_rate": 4.53865820268349e-06,
632
+ "loss": 0.0786,
633
+ "step": 88
634
+ },
635
+ {
636
+ "epoch": 1.1710526315789473,
637
+ "grad_norm": 0.9795278903306394,
638
+ "learning_rate": 4.4237844843365126e-06,
639
+ "loss": 0.0948,
640
+ "step": 89
641
+ },
642
+ {
643
+ "epoch": 1.1842105263157894,
644
+ "grad_norm": 0.6718237305799917,
645
+ "learning_rate": 4.309218225240591e-06,
646
+ "loss": 0.0751,
647
+ "step": 90
648
+ },
649
+ {
650
+ "epoch": 1.1973684210526316,
651
+ "grad_norm": 0.9809953018950844,
652
+ "learning_rate": 4.195020556092935e-06,
653
+ "loss": 0.0717,
654
+ "step": 91
655
+ },
656
+ {
657
+ "epoch": 1.2105263157894737,
658
+ "grad_norm": 1.3739884608587827,
659
+ "learning_rate": 4.081252410917148e-06,
660
+ "loss": 0.0834,
661
+ "step": 92
662
+ },
663
+ {
664
+ "epoch": 1.2236842105263157,
665
+ "grad_norm": 0.7354951934416454,
666
+ "learning_rate": 3.967974494549803e-06,
667
+ "loss": 0.0497,
668
+ "step": 93
669
+ },
670
+ {
671
+ "epoch": 1.236842105263158,
672
+ "grad_norm": 0.9321611181418619,
673
+ "learning_rate": 3.855247250249331e-06,
674
+ "loss": 0.1091,
675
+ "step": 94
676
+ },
677
+ {
678
+ "epoch": 1.25,
679
+ "grad_norm": 0.8153028496714768,
680
+ "learning_rate": 3.743130827444487e-06,
681
+ "loss": 0.0719,
682
+ "step": 95
683
+ },
684
+ {
685
+ "epoch": 1.263157894736842,
686
+ "grad_norm": 1.7763781332034168,
687
+ "learning_rate": 3.6316850496395863e-06,
688
+ "loss": 0.0995,
689
+ "step": 96
690
+ },
691
+ {
692
+ "epoch": 1.2763157894736843,
693
+ "grad_norm": 1.0403921146171855,
694
+ "learning_rate": 3.5209693824936486e-06,
695
+ "loss": 0.1035,
696
+ "step": 97
697
+ },
698
+ {
699
+ "epoch": 1.2894736842105263,
700
+ "grad_norm": 0.9375966336686881,
701
+ "learning_rate": 3.4110429020904924e-06,
702
+ "loss": 0.0854,
703
+ "step": 98
704
+ },
705
+ {
706
+ "epoch": 1.3026315789473684,
707
+ "grad_norm": 0.8044967231814447,
708
+ "learning_rate": 3.301964263416693e-06,
709
+ "loss": 0.0545,
710
+ "step": 99
711
+ },
712
+ {
713
+ "epoch": 1.3157894736842106,
714
+ "grad_norm": 1.057615646119616,
715
+ "learning_rate": 3.1937916690642356e-06,
716
+ "loss": 0.0721,
717
+ "step": 100
718
+ },
719
+ {
720
+ "epoch": 1.3289473684210527,
721
+ "grad_norm": 0.8460218568489392,
722
+ "learning_rate": 3.0865828381745515e-06,
723
+ "loss": 0.0756,
724
+ "step": 101
725
+ },
726
+ {
727
+ "epoch": 1.3421052631578947,
728
+ "grad_norm": 0.7973010176418315,
729
+ "learning_rate": 2.980394975640526e-06,
730
+ "loss": 0.0701,
731
+ "step": 102
732
+ },
733
+ {
734
+ "epoch": 1.3552631578947367,
735
+ "grad_norm": 0.43625526231871226,
736
+ "learning_rate": 2.8752847415828923e-06,
737
+ "loss": 0.0439,
738
+ "step": 103
739
+ },
740
+ {
741
+ "epoch": 1.368421052631579,
742
+ "grad_norm": 0.7846073292481729,
743
+ "learning_rate": 2.771308221117309e-06,
744
+ "loss": 0.068,
745
+ "step": 104
746
+ },
747
+ {
748
+ "epoch": 1.381578947368421,
749
+ "grad_norm": 0.7894886291152677,
750
+ "learning_rate": 2.668520894428259e-06,
751
+ "loss": 0.0521,
752
+ "step": 105
753
+ },
754
+ {
755
+ "epoch": 1.3947368421052633,
756
+ "grad_norm": 0.8898713056015292,
757
+ "learning_rate": 2.5669776071657194e-06,
758
+ "loss": 0.0832,
759
+ "step": 106
760
+ },
761
+ {
762
+ "epoch": 1.4078947368421053,
763
+ "grad_norm": 0.7239448528037634,
764
+ "learning_rate": 2.466732541180404e-06,
765
+ "loss": 0.0651,
766
+ "step": 107
767
+ },
768
+ {
769
+ "epoch": 1.4210526315789473,
770
+ "grad_norm": 1.1924698653865555,
771
+ "learning_rate": 2.3678391856132203e-06,
772
+ "loss": 0.0871,
773
+ "step": 108
774
+ },
775
+ {
776
+ "epoch": 1.4342105263157894,
777
+ "grad_norm": 0.796957419192979,
778
+ "learning_rate": 2.2703503083543288e-06,
779
+ "loss": 0.0632,
780
+ "step": 109
781
+ },
782
+ {
783
+ "epoch": 1.4473684210526316,
784
+ "grad_norm": 0.8317972188965687,
785
+ "learning_rate": 2.174317927887041e-06,
786
+ "loss": 0.0627,
787
+ "step": 110
788
+ },
789
+ {
790
+ "epoch": 1.4605263157894737,
791
+ "grad_norm": 0.6383766307808258,
792
+ "learning_rate": 2.0797932855316183e-06,
793
+ "loss": 0.0685,
794
+ "step": 111
795
+ },
796
+ {
797
+ "epoch": 1.4736842105263157,
798
+ "grad_norm": 0.8167329309674938,
799
+ "learning_rate": 1.9868268181037186e-06,
800
+ "loss": 0.0674,
801
+ "step": 112
802
+ },
803
+ {
804
+ "epoch": 1.486842105263158,
805
+ "grad_norm": 0.7471966235227357,
806
+ "learning_rate": 1.8954681310021434e-06,
807
+ "loss": 0.0729,
808
+ "step": 113
809
+ },
810
+ {
811
+ "epoch": 1.5,
812
+ "grad_norm": 0.6587846660979715,
813
+ "learning_rate": 1.8057659717401948e-06,
814
+ "loss": 0.0481,
815
+ "step": 114
816
+ },
817
+ {
818
+ "epoch": 1.513157894736842,
819
+ "grad_norm": 0.8623712101197185,
820
+ "learning_rate": 1.7177682039347875e-06,
821
+ "loss": 0.0751,
822
+ "step": 115
823
+ },
824
+ {
825
+ "epoch": 1.526315789473684,
826
+ "grad_norm": 0.7600610777742871,
827
+ "learning_rate": 1.6315217817672142e-06,
828
+ "loss": 0.0754,
829
+ "step": 116
830
+ },
831
+ {
832
+ "epoch": 1.5394736842105263,
833
+ "grad_norm": 0.7028388956696653,
834
+ "learning_rate": 1.5470727249291423e-06,
835
+ "loss": 0.0591,
836
+ "step": 117
837
+ },
838
+ {
839
+ "epoch": 1.5526315789473686,
840
+ "grad_norm": 0.9080662531327958,
841
+ "learning_rate": 1.4644660940672628e-06,
842
+ "loss": 0.0665,
843
+ "step": 118
844
+ },
845
+ {
846
+ "epoch": 1.5657894736842106,
847
+ "grad_norm": 0.8002081473664321,
848
+ "learning_rate": 1.383745966739652e-06,
849
+ "loss": 0.0567,
850
+ "step": 119
851
+ },
852
+ {
853
+ "epoch": 1.5789473684210527,
854
+ "grad_norm": 0.8070376846944848,
855
+ "learning_rate": 1.3049554138967052e-06,
856
+ "loss": 0.074,
857
+ "step": 120
858
+ },
859
+ {
860
+ "epoch": 1.5921052631578947,
861
+ "grad_norm": 0.7133857988769062,
862
+ "learning_rate": 1.2281364768991804e-06,
863
+ "loss": 0.082,
864
+ "step": 121
865
+ },
866
+ {
867
+ "epoch": 1.6052631578947367,
868
+ "grad_norm": 0.7063230781754961,
869
+ "learning_rate": 1.1533301450856054e-06,
870
+ "loss": 0.0793,
871
+ "step": 122
872
+ },
873
+ {
874
+ "epoch": 1.618421052631579,
875
+ "grad_norm": 0.5118303753159236,
876
+ "learning_rate": 1.0805763339010329e-06,
877
+ "loss": 0.0606,
878
+ "step": 123
879
+ },
880
+ {
881
+ "epoch": 1.631578947368421,
882
+ "grad_norm": 0.6255028097581671,
883
+ "learning_rate": 1.0099138635988026e-06,
884
+ "loss": 0.0624,
885
+ "step": 124
886
+ },
887
+ {
888
+ "epoch": 1.6447368421052633,
889
+ "grad_norm": 1.2565545443199133,
890
+ "learning_rate": 9.41380438526694e-07,
891
+ "loss": 0.0834,
892
+ "step": 125
893
+ },
894
+ {
895
+ "epoch": 1.6578947368421053,
896
+ "grad_norm": 0.6949398501709794,
897
+ "learning_rate": 8.750126270084891e-07,
898
+ "loss": 0.0554,
899
+ "step": 126
900
+ },
901
+ {
902
+ "epoch": 1.6710526315789473,
903
+ "grad_norm": 0.6169370398639232,
904
+ "learning_rate": 8.108458418317089e-07,
905
+ "loss": 0.0756,
906
+ "step": 127
907
+ },
908
+ {
909
+ "epoch": 1.6842105263157894,
910
+ "grad_norm": 0.5783403647508548,
911
+ "learning_rate": 7.489143213519301e-07,
912
+ "loss": 0.0484,
913
+ "step": 128
914
+ },
915
+ {
916
+ "epoch": 1.6973684210526314,
917
+ "grad_norm": 0.613206478877195,
918
+ "learning_rate": 6.892511112237472e-07,
919
+ "loss": 0.0674,
920
+ "step": 129
921
+ },
922
+ {
923
+ "epoch": 1.7105263157894737,
924
+ "grad_norm": 0.7610207582376373,
925
+ "learning_rate": 6.318880467681527e-07,
926
+ "loss": 0.0802,
927
+ "step": 130
928
+ },
929
+ {
930
+ "epoch": 1.723684210526316,
931
+ "grad_norm": 0.6484932637313764,
932
+ "learning_rate": 5.768557359857241e-07,
933
+ "loss": 0.0799,
934
+ "step": 131
935
+ },
936
+ {
937
+ "epoch": 1.736842105263158,
938
+ "grad_norm": 0.801045963046033,
939
+ "learning_rate": 5.241835432246888e-07,
940
+ "loss": 0.0631,
941
+ "step": 132
942
+ },
943
+ {
944
+ "epoch": 1.75,
945
+ "grad_norm": 0.5006567755599417,
946
+ "learning_rate": 4.738995735125895e-07,
947
+ "loss": 0.0602,
948
+ "step": 133
949
+ },
950
+ {
951
+ "epoch": 1.763157894736842,
952
+ "grad_norm": 0.7321825355306667,
953
+ "learning_rate": 4.2603065755989493e-07,
954
+ "loss": 0.0574,
955
+ "step": 134
956
+ },
957
+ {
958
+ "epoch": 1.776315789473684,
959
+ "grad_norm": 0.5455281345073466,
960
+ "learning_rate": 3.8060233744356634e-07,
961
+ "loss": 0.0585,
962
+ "step": 135
963
+ },
964
+ {
965
+ "epoch": 1.7894736842105263,
966
+ "grad_norm": 0.6941675384816356,
967
+ "learning_rate": 3.3763885297822153e-07,
968
+ "loss": 0.0593,
969
+ "step": 136
970
+ },
971
+ {
972
+ "epoch": 1.8026315789473686,
973
+ "grad_norm": 0.607277109667664,
974
+ "learning_rate": 2.9716312878216194e-07,
975
+ "loss": 0.0596,
976
+ "step": 137
977
+ },
978
+ {
979
+ "epoch": 1.8157894736842106,
980
+ "grad_norm": 0.5526534850437881,
981
+ "learning_rate": 2.5919676204517073e-07,
982
+ "loss": 0.0574,
983
+ "step": 138
984
+ },
985
+ {
986
+ "epoch": 1.8289473684210527,
987
+ "grad_norm": 0.7027887496493381,
988
+ "learning_rate": 2.237600110046001e-07,
989
+ "loss": 0.0883,
990
+ "step": 139
991
+ },
992
+ {
993
+ "epoch": 1.8421052631578947,
994
+ "grad_norm": 0.5658475866263035,
995
+ "learning_rate": 1.908717841359048e-07,
996
+ "loss": 0.0593,
997
+ "step": 140
998
+ },
999
+ {
1000
+ "epoch": 1.8552631578947367,
1001
+ "grad_norm": 0.6440458312241749,
1002
+ "learning_rate": 1.6054963006338742e-07,
1003
+ "loss": 0.0645,
1004
+ "step": 141
1005
+ },
1006
+ {
1007
+ "epoch": 1.868421052631579,
1008
+ "grad_norm": 0.8251777255090132,
1009
+ "learning_rate": 1.328097281965357e-07,
1010
+ "loss": 0.091,
1011
+ "step": 142
1012
+ },
1013
+ {
1014
+ "epoch": 1.881578947368421,
1015
+ "grad_norm": 0.584859727969167,
1016
+ "learning_rate": 1.0766688009695548e-07,
1017
+ "loss": 0.0624,
1018
+ "step": 143
1019
+ },
1020
+ {
1021
+ "epoch": 1.8947368421052633,
1022
+ "grad_norm": 0.6478070234261503,
1023
+ "learning_rate": 8.513450158049109e-08,
1024
+ "loss": 0.0662,
1025
+ "step": 144
1026
+ },
1027
+ {
1028
+ "epoch": 1.9078947368421053,
1029
+ "grad_norm": 0.5481010295457636,
1030
+ "learning_rate": 6.522461555877213e-08,
1031
+ "loss": 0.0717,
1032
+ "step": 145
1033
+ },
1034
+ {
1035
+ "epoch": 1.9210526315789473,
1036
+ "grad_norm": 0.5866004853500736,
1037
+ "learning_rate": 4.794784562397459e-08,
1038
+ "loss": 0.0729,
1039
+ "step": 146
1040
+ },
1041
+ {
1042
+ "epoch": 1.9342105263157894,
1043
+ "grad_norm": 0.7549384445552164,
1044
+ "learning_rate": 3.3313410380250157e-08,
1045
+ "loss": 0.0936,
1046
+ "step": 147
1047
+ },
1048
+ {
1049
+ "epoch": 1.9473684210526314,
1050
+ "grad_norm": 0.8887389895072535,
1051
+ "learning_rate": 2.1329118524827662e-08,
1052
+ "loss": 0.1033,
1053
+ "step": 148
1054
+ },
1055
+ {
1056
+ "epoch": 1.9605263157894737,
1057
+ "grad_norm": 0.6459019124164774,
1058
+ "learning_rate": 1.200136468141544e-08,
1059
+ "loss": 0.0951,
1060
+ "step": 149
1061
+ },
1062
+ {
1063
+ "epoch": 1.973684210526316,
1064
+ "grad_norm": 0.5993524993653697,
1065
+ "learning_rate": 5.3351259881379016e-09,
1066
+ "loss": 0.063,
1067
+ "step": 150
1068
+ },
1069
+ {
1070
+ "epoch": 1.986842105263158,
1071
+ "grad_norm": 0.6183094577100168,
1072
+ "learning_rate": 1.3339594418138036e-09,
1073
+ "loss": 0.0715,
1074
+ "step": 151
1075
+ },
1076
+ {
1077
+ "epoch": 2.0,
1078
+ "grad_norm": 0.47166327201031033,
1079
+ "learning_rate": 0.0,
1080
+ "loss": 0.051,
1081
+ "step": 152
1082
+ },
1083
+ {
1084
+ "epoch": 2.0,
1085
+ "eval_loss": 0.08257210999727249,
1086
+ "eval_runtime": 143.687,
1087
+ "eval_samples_per_second": 35.529,
1088
+ "eval_steps_per_second": 1.114,
1089
+ "step": 152
1090
+ },
1091
+ {
1092
+ "epoch": 2.0,
1093
+ "step": 152,
1094
+ "total_flos": 4.516827336435302e+16,
1095
+ "train_loss": 0.09894711428665016,
1096
+ "train_runtime": 2051.5561,
1097
+ "train_samples_per_second": 9.454,
1098
+ "train_steps_per_second": 0.074
1099
+ }
1100
+ ],
1101
+ "logging_steps": 1,
1102
+ "max_steps": 152,
1103
+ "num_input_tokens_seen": 0,
1104
+ "num_train_epochs": 2,
1105
+ "save_steps": 500,
1106
+ "stateful_callbacks": {
1107
+ "TrainerControl": {
1108
+ "args": {
1109
+ "should_epoch_stop": false,
1110
+ "should_evaluate": false,
1111
+ "should_log": false,
1112
+ "should_save": true,
1113
+ "should_training_stop": true
1114
+ },
1115
+ "attributes": {}
1116
+ }
1117
+ },
1118
+ "total_flos": 4.516827336435302e+16,
1119
+ "train_batch_size": 8,
1120
+ "trial_name": null,
1121
+ "trial_params": null
1122
+ }