PiaoYangHF committed
Commit 3348a74
1 Parent(s): 81ffe17

Upload trainer_state.json with huggingface_hub

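The commit message above says the file was pushed with the huggingface_hub client. As a non-authoritative sketch of how such an upload is typically done (the repo_id below is a placeholder assumption, not taken from this commit):

# Minimal sketch: push a local trainer_state.json to the Hub with huggingface_hub.
# "your-username/your-repo" is a placeholder assumption; substitute the real repo_id.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token saved by `huggingface-cli login` by default
api.upload_file(
    path_or_fileobj="trainer_state.json",   # local file written by the Trainer
    path_in_repo="trainer_state.json",      # destination path inside the repo
    repo_id="your-username/your-repo",      # placeholder, not from this commit
    commit_message="Upload trainer_state.json with huggingface_hub",
)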
Files changed (1)
  1. trainer_state.json +433 -0
trainer_state.json ADDED
@@ -0,0 +1,433 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 50,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.3398852348327637,
+      "learning_rate": 4.999950652140343e-05,
+      "loss": 0.288,
+      "num_input_tokens_seen": 1192,
+      "step": 1
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 3.534205436706543,
+      "learning_rate": 4.9998026105095405e-05,
+      "loss": 0.1627,
+      "num_input_tokens_seen": 2208,
+      "step": 2
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 3.9795141220092773,
+      "learning_rate": 4.999555880952023e-05,
+      "loss": 0.2259,
+      "num_input_tokens_seen": 3320,
+      "step": 3
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.712265729904175,
+      "learning_rate": 4.99921047320825e-05,
+      "loss": 0.3248,
+      "num_input_tokens_seen": 4536,
+      "step": 4
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.0970170497894287,
+      "learning_rate": 4.998766400914329e-05,
+      "loss": 0.1867,
+      "num_input_tokens_seen": 5960,
+      "step": 5
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.16365909576416,
+      "learning_rate": 4.998223681601473e-05,
+      "loss": 0.1558,
+      "num_input_tokens_seen": 7304,
+      "step": 6
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 1.9363147020339966,
+      "learning_rate": 4.9975823366953124e-05,
+      "loss": 0.1863,
+      "num_input_tokens_seen": 8744,
+      "step": 7
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.874936819076538,
+      "learning_rate": 4.996842391515044e-05,
+      "loss": 0.1679,
+      "num_input_tokens_seen": 10120,
+      "step": 8
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 7.298994541168213,
+      "learning_rate": 4.996003875272438e-05,
+      "loss": 0.3627,
+      "num_input_tokens_seen": 10816,
+      "step": 9
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.7786474227905273,
+      "learning_rate": 4.995066821070679e-05,
+      "loss": 0.1466,
+      "num_input_tokens_seen": 11920,
+      "step": 10
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9229803085327148,
+      "learning_rate": 4.994031265903063e-05,
+      "loss": 0.2029,
+      "num_input_tokens_seen": 13216,
+      "step": 11
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.230123519897461,
+      "learning_rate": 4.992897250651535e-05,
+      "loss": 0.1142,
+      "num_input_tokens_seen": 14552,
+      "step": 12
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.0229177474975586,
+      "learning_rate": 4.991664820085074e-05,
+      "loss": 0.2142,
+      "num_input_tokens_seen": 15600,
+      "step": 13
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.4955450296401978,
+      "learning_rate": 4.990334022857932e-05,
+      "loss": 0.149,
+      "num_input_tokens_seen": 16744,
+      "step": 14
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 3.070120334625244,
+      "learning_rate": 4.9889049115077005e-05,
+      "loss": 0.2267,
+      "num_input_tokens_seen": 17424,
+      "step": 15
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.16508150100708,
+      "learning_rate": 4.987377542453251e-05,
+      "loss": 0.1499,
+      "num_input_tokens_seen": 18408,
+      "step": 16
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 3.1738064289093018,
+      "learning_rate": 4.9857519759924974e-05,
+      "loss": 0.4279,
+      "num_input_tokens_seen": 19288,
+      "step": 17
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.7662558555603027,
+      "learning_rate": 4.984028276300021e-05,
+      "loss": 0.2957,
+      "num_input_tokens_seen": 20336,
+      "step": 18
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.4056564569473267,
+      "learning_rate": 4.982206511424534e-05,
+      "loss": 0.1899,
+      "num_input_tokens_seen": 21528,
+      "step": 19
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.4555004835128784,
+      "learning_rate": 4.980286753286195e-05,
+      "loss": 0.1429,
+      "num_input_tokens_seen": 22680,
+      "step": 20
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.6009600162506104,
+      "learning_rate": 4.978269077673767e-05,
+      "loss": 0.2084,
+      "num_input_tokens_seen": 23800,
+      "step": 21
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.1936904191970825,
+      "learning_rate": 4.976153564241628e-05,
+      "loss": 0.0474,
+      "num_input_tokens_seen": 24784,
+      "step": 22
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.241938829421997,
+      "learning_rate": 4.9739402965066276e-05,
+      "loss": 0.1757,
+      "num_input_tokens_seen": 25688,
+      "step": 23
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.735016942024231,
+      "learning_rate": 4.971629361844785e-05,
+      "loss": 0.2083,
+      "num_input_tokens_seen": 27016,
+      "step": 24
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.648120403289795,
+      "learning_rate": 4.9692208514878444e-05,
+      "loss": 0.1653,
+      "num_input_tokens_seen": 28200,
+      "step": 25
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 2.088919162750244,
+      "learning_rate": 4.96671486051967e-05,
+      "loss": 0.2277,
+      "num_input_tokens_seen": 29240,
+      "step": 26
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9720033407211304,
+      "learning_rate": 4.9641114878724956e-05,
+      "loss": 0.2262,
+      "num_input_tokens_seen": 30264,
+      "step": 27
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.137261390686035,
+      "learning_rate": 4.9614108363230135e-05,
+      "loss": 0.2119,
+      "num_input_tokens_seen": 31376,
+      "step": 28
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.7072184085845947,
+      "learning_rate": 4.958613012488324e-05,
+      "loss": 0.186,
+      "num_input_tokens_seen": 32288,
+      "step": 29
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.6640106439590454,
+      "learning_rate": 4.9557181268217227e-05,
+      "loss": 0.1381,
+      "num_input_tokens_seen": 33336,
+      "step": 30
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.743667721748352,
+      "learning_rate": 4.952726293608335e-05,
+      "loss": 0.1521,
+      "num_input_tokens_seen": 35264,
+      "step": 31
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.3096036911010742,
+      "learning_rate": 4.949637630960617e-05,
+      "loss": 0.2232,
+      "num_input_tokens_seen": 36496,
+      "step": 32
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.943765640258789,
+      "learning_rate": 4.9464522608136805e-05,
+      "loss": 0.0252,
+      "num_input_tokens_seen": 37312,
+      "step": 33
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.8293421268463135,
+      "learning_rate": 4.943170308920484e-05,
+      "loss": 0.264,
+      "num_input_tokens_seen": 38376,
+      "step": 34
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 2.6471211910247803,
+      "learning_rate": 4.939791904846869e-05,
+      "loss": 0.1645,
+      "num_input_tokens_seen": 39168,
+      "step": 35
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.6533693075180054,
+      "learning_rate": 4.9363171819664434e-05,
+      "loss": 0.1802,
+      "num_input_tokens_seen": 40440,
+      "step": 36
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.3770309686660767,
+      "learning_rate": 4.9327462774553166e-05,
+      "loss": 0.2206,
+      "num_input_tokens_seen": 41760,
+      "step": 37
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.6882014274597168,
+      "learning_rate": 4.929079332286685e-05,
+      "loss": 0.2886,
+      "num_input_tokens_seen": 42944,
+      "step": 38
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.9799898862838745,
+      "learning_rate": 4.925316491225265e-05,
+      "loss": 0.1897,
+      "num_input_tokens_seen": 44008,
+      "step": 39
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.551802158355713,
+      "learning_rate": 4.9214579028215776e-05,
+      "loss": 0.1935,
+      "num_input_tokens_seen": 45168,
+      "step": 40
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.0445095300674438,
+      "learning_rate": 4.917503719406088e-05,
+      "loss": 0.1071,
+      "num_input_tokens_seen": 46432,
+      "step": 41
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 3.2616090774536133,
+      "learning_rate": 4.913454097083185e-05,
+      "loss": 0.5737,
+      "num_input_tokens_seen": 47360,
+      "step": 42
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.9618259072303772,
+      "learning_rate": 4.909309195725025e-05,
+      "loss": 0.114,
+      "num_input_tokens_seen": 49008,
+      "step": 43
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.5821013450622559,
+      "learning_rate": 4.905069178965215e-05,
+      "loss": 0.2676,
+      "num_input_tokens_seen": 50264,
+      "step": 44
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.1913903951644897,
+      "learning_rate": 4.900734214192358e-05,
+      "loss": 0.1061,
+      "num_input_tokens_seen": 51368,
+      "step": 45
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.8395439386367798,
+      "learning_rate": 4.89630447254344e-05,
+      "loss": 0.2161,
+      "num_input_tokens_seen": 52304,
+      "step": 46
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.736319363117218,
+      "learning_rate": 4.891780128897077e-05,
+      "loss": 0.1163,
+      "num_input_tokens_seen": 53712,
+      "step": 47
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.5373475551605225,
+      "learning_rate": 4.887161361866608e-05,
+      "loss": 0.1609,
+      "num_input_tokens_seen": 54704,
+      "step": 48
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.3580759763717651,
+      "learning_rate": 4.882448353793048e-05,
+      "loss": 0.1673,
+      "num_input_tokens_seen": 55880,
+      "step": 49
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.0536258220672607,
+      "learning_rate": 4.877641290737884e-05,
+      "loss": 0.1955,
+      "num_input_tokens_seen": 56784,
+      "step": 50
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 500,
+  "num_input_tokens_seen": 56784,
+  "num_train_epochs": 10,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2564102936199168.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
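As a usage note, the state above is the JSON the transformers Trainer writes alongside its checkpoints, so the logged loss curve can be recovered with the standard library alone. A minimal sketch, assuming trainer_state.json has been downloaded to the working directory:

# Minimal sketch: read trainer_state.json and print the per-step training metrics.
# Assumes the file has been downloaded to the current working directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for record in state["log_history"]:
    # Each record in this file carries epoch, grad_norm, learning_rate,
    # loss, num_input_tokens_seen, and step.
    print(f"step {record['step']:3d}  loss {record['loss']:.4f}  lr {record['learning_rate']:.3e}")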