ameerazam08 commited on
Commit
1066c4c
·
1 Parent(s): 556c6f4

Donw lstr 512

Browse files
Files changed (6) hide show
  1. optimizer.pt +2 -2
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +2 -2
  4. scaler.pt +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +59 -329
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:175aca2d22d4e0ffe707aa0203de63cde30621cf49a4e7a9ded1eabcd757b3ae
3
- size 2498489737
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5772f0d132c52d6c7666a76eefa6acb743e5062963c285f6e91ff07779b68b1
3
+ size 2498513929
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:808080093cf66db7b341badb099355e4b67f4fcaefe5d6307115899a4c9bfca7
3
  size 1266126445
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a3ba17d5b17bb6071632212add5e7a861df1991153686e2b139bda01650420d
3
  size 1266126445
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5e9db4f8fb9adcedf9b8f43bc1c1355687d2a0381ee8c01e6f555f0a82cb5dc
3
- size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7c914e03b6244226666d251f841b0bbc1d1bfb51226ea62c98deeaaf6176005
3
+ size 14567
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fccda41cf05d0c2582a5ae864ddced240b945973cac170056ad38621f56c053
3
  size 559
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e26ec2eeed5d2393eae8db5740c3d49711ae46ff10dcff4c5d8a4a07cdc5062
3
  size 559
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f178de8fd426d107cb3181bc3de3565897ca73a1fcc762375a87651bd94651ae
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fff2c89694c75e0ec2f7fd167cc6e42ee90aad80a67c0b576b690e62f6ab285
3
  size 623
trainer_state.json CHANGED
@@ -1,376 +1,106 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 10.0,
5
- "global_step": 240,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.42,
12
- "learning_rate": 7.500000000000001e-05,
13
- "loss": 1.2218,
14
  "step": 10
15
  },
16
  {
17
- "epoch": 0.42,
18
- "eval_accuracy": 0.6041666865348816,
19
- "eval_loss": 0.9863560795783997,
20
- "eval_runtime": 10.1253,
21
- "eval_samples_per_second": 4.741,
22
- "eval_steps_per_second": 1.185,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.83,
27
- "learning_rate": 7.083333333333334e-05,
28
- "loss": 0.9652,
29
  "step": 20
30
  },
31
  {
32
- "epoch": 0.83,
33
- "eval_accuracy": 0.5833333134651184,
34
- "eval_loss": 0.8854789733886719,
35
- "eval_runtime": 11.1802,
36
- "eval_samples_per_second": 4.293,
37
- "eval_steps_per_second": 1.073,
38
  "step": 20
39
  },
40
  {
41
- "epoch": 1.25,
42
- "learning_rate": 6.666666666666667e-05,
43
- "loss": 0.9764,
44
  "step": 30
45
  },
46
  {
47
- "epoch": 1.25,
48
- "eval_accuracy": 0.6666666865348816,
49
- "eval_loss": 0.8104388117790222,
50
- "eval_runtime": 10.0746,
51
- "eval_samples_per_second": 4.764,
52
- "eval_steps_per_second": 1.191,
53
  "step": 30
54
  },
55
  {
56
- "epoch": 1.67,
57
- "learning_rate": 6.25e-05,
58
- "loss": 0.5574,
59
  "step": 40
60
  },
61
  {
62
- "epoch": 1.67,
63
- "eval_accuracy": 0.625,
64
- "eval_loss": 1.0489534139633179,
65
- "eval_runtime": 11.7457,
66
- "eval_samples_per_second": 4.087,
67
- "eval_steps_per_second": 1.022,
68
  "step": 40
69
  },
70
  {
71
- "epoch": 2.08,
72
- "learning_rate": 5.833333333333334e-05,
73
- "loss": 0.778,
74
  "step": 50
75
  },
76
  {
77
- "epoch": 2.08,
78
- "eval_accuracy": 0.6458333134651184,
79
- "eval_loss": 0.8387454152107239,
80
- "eval_runtime": 10.2158,
81
- "eval_samples_per_second": 4.699,
82
- "eval_steps_per_second": 1.175,
83
  "step": 50
84
  },
85
  {
86
- "epoch": 2.5,
87
- "learning_rate": 5.4166666666666664e-05,
88
- "loss": 0.4129,
89
  "step": 60
90
  },
91
  {
92
- "epoch": 2.5,
93
- "eval_accuracy": 0.7083333134651184,
94
- "eval_loss": 0.7567564845085144,
95
- "eval_runtime": 11.1124,
96
- "eval_samples_per_second": 4.319,
97
- "eval_steps_per_second": 1.08,
98
  "step": 60
99
- },
100
- {
101
- "epoch": 2.92,
102
- "learning_rate": 5e-05,
103
- "loss": 0.4054,
104
- "step": 70
105
- },
106
- {
107
- "epoch": 2.92,
108
- "eval_accuracy": 0.75,
109
- "eval_loss": 0.7560882568359375,
110
- "eval_runtime": 10.3666,
111
- "eval_samples_per_second": 4.63,
112
- "eval_steps_per_second": 1.158,
113
- "step": 70
114
- },
115
- {
116
- "epoch": 3.33,
117
- "learning_rate": 4.5833333333333334e-05,
118
- "loss": 0.3773,
119
- "step": 80
120
- },
121
- {
122
- "epoch": 3.33,
123
- "eval_accuracy": 0.8125,
124
- "eval_loss": 0.6256787180900574,
125
- "eval_runtime": 11.7432,
126
- "eval_samples_per_second": 4.087,
127
- "eval_steps_per_second": 1.022,
128
- "step": 80
129
- },
130
- {
131
- "epoch": 3.75,
132
- "learning_rate": 4.166666666666667e-05,
133
- "loss": 0.1139,
134
- "step": 90
135
- },
136
- {
137
- "epoch": 3.75,
138
- "eval_accuracy": 0.875,
139
- "eval_loss": 0.44811201095581055,
140
- "eval_runtime": 10.2271,
141
- "eval_samples_per_second": 4.693,
142
- "eval_steps_per_second": 1.173,
143
- "step": 90
144
- },
145
- {
146
- "epoch": 4.17,
147
- "learning_rate": 3.7500000000000003e-05,
148
- "loss": 0.1395,
149
- "step": 100
150
- },
151
- {
152
- "epoch": 4.17,
153
- "eval_accuracy": 0.7708333134651184,
154
- "eval_loss": 0.7507086396217346,
155
- "eval_runtime": 11.1455,
156
- "eval_samples_per_second": 4.307,
157
- "eval_steps_per_second": 1.077,
158
- "step": 100
159
- },
160
- {
161
- "epoch": 4.58,
162
- "learning_rate": 3.3333333333333335e-05,
163
- "loss": 0.0564,
164
- "step": 110
165
- },
166
- {
167
- "epoch": 4.58,
168
- "eval_accuracy": 0.7916666865348816,
169
- "eval_loss": 0.7551252841949463,
170
- "eval_runtime": 10.0567,
171
- "eval_samples_per_second": 4.773,
172
- "eval_steps_per_second": 1.193,
173
- "step": 110
174
- },
175
- {
176
- "epoch": 5.0,
177
- "learning_rate": 2.916666666666667e-05,
178
- "loss": 0.0767,
179
- "step": 120
180
- },
181
- {
182
- "epoch": 5.0,
183
- "eval_accuracy": 0.7916666865348816,
184
- "eval_loss": 0.7378367781639099,
185
- "eval_runtime": 11.5811,
186
- "eval_samples_per_second": 4.145,
187
- "eval_steps_per_second": 1.036,
188
- "step": 120
189
- },
190
- {
191
- "epoch": 5.42,
192
- "learning_rate": 2.5e-05,
193
- "loss": 0.0464,
194
- "step": 130
195
- },
196
- {
197
- "epoch": 5.42,
198
- "eval_accuracy": 0.7291666865348816,
199
- "eval_loss": 1.1143478155136108,
200
- "eval_runtime": 10.0199,
201
- "eval_samples_per_second": 4.79,
202
- "eval_steps_per_second": 1.198,
203
- "step": 130
204
- },
205
- {
206
- "epoch": 5.83,
207
- "learning_rate": 2.0833333333333336e-05,
208
- "loss": 0.0996,
209
- "step": 140
210
- },
211
- {
212
- "epoch": 5.83,
213
- "eval_accuracy": 0.8333333134651184,
214
- "eval_loss": 0.6909031867980957,
215
- "eval_runtime": 11.7023,
216
- "eval_samples_per_second": 4.102,
217
- "eval_steps_per_second": 1.025,
218
- "step": 140
219
- },
220
- {
221
- "epoch": 6.25,
222
- "learning_rate": 1.6666666666666667e-05,
223
- "loss": 0.0166,
224
- "step": 150
225
- },
226
- {
227
- "epoch": 6.25,
228
- "eval_accuracy": 0.8333333134651184,
229
- "eval_loss": 0.6695077419281006,
230
- "eval_runtime": 9.8609,
231
- "eval_samples_per_second": 4.868,
232
- "eval_steps_per_second": 1.217,
233
- "step": 150
234
- },
235
- {
236
- "epoch": 6.67,
237
- "learning_rate": 1.25e-05,
238
- "loss": 0.0547,
239
- "step": 160
240
- },
241
- {
242
- "epoch": 6.67,
243
- "eval_accuracy": 0.75,
244
- "eval_loss": 0.9423481822013855,
245
- "eval_runtime": 11.5884,
246
- "eval_samples_per_second": 4.142,
247
- "eval_steps_per_second": 1.036,
248
- "step": 160
249
- },
250
- {
251
- "epoch": 7.08,
252
- "learning_rate": 8.333333333333334e-06,
253
- "loss": 0.1214,
254
- "step": 170
255
- },
256
- {
257
- "epoch": 7.08,
258
- "eval_accuracy": 0.7916666865348816,
259
- "eval_loss": 0.7280401587486267,
260
- "eval_runtime": 10.0684,
261
- "eval_samples_per_second": 4.767,
262
- "eval_steps_per_second": 1.192,
263
- "step": 170
264
- },
265
- {
266
- "epoch": 7.5,
267
- "learning_rate": 4.166666666666667e-06,
268
- "loss": 0.0096,
269
- "step": 180
270
- },
271
- {
272
- "epoch": 7.5,
273
- "eval_accuracy": 0.7916666865348816,
274
- "eval_loss": 0.6912185549736023,
275
- "eval_runtime": 11.3942,
276
- "eval_samples_per_second": 4.213,
277
- "eval_steps_per_second": 1.053,
278
- "step": 180
279
- },
280
- {
281
- "epoch": 7.92,
282
- "learning_rate": 0.0,
283
- "loss": 0.0611,
284
- "step": 190
285
- },
286
- {
287
- "epoch": 7.92,
288
- "eval_accuracy": 0.7916666865348816,
289
- "eval_loss": 0.6880165934562683,
290
- "eval_runtime": 10.0683,
291
- "eval_samples_per_second": 4.767,
292
- "eval_steps_per_second": 1.192,
293
- "step": 190
294
- },
295
- {
296
- "epoch": 8.33,
297
- "learning_rate": 0.0,
298
- "loss": 0.0254,
299
- "step": 200
300
- },
301
- {
302
- "epoch": 8.33,
303
- "eval_accuracy": 0.7916666865348816,
304
- "eval_loss": 0.6880165934562683,
305
- "eval_runtime": 12.1412,
306
- "eval_samples_per_second": 3.953,
307
- "eval_steps_per_second": 0.988,
308
- "step": 200
309
- },
310
- {
311
- "epoch": 8.75,
312
- "learning_rate": 0.0,
313
- "loss": 0.0073,
314
- "step": 210
315
- },
316
- {
317
- "epoch": 8.75,
318
- "eval_accuracy": 0.7916666865348816,
319
- "eval_loss": 0.6880165934562683,
320
- "eval_runtime": 9.9345,
321
- "eval_samples_per_second": 4.832,
322
- "eval_steps_per_second": 1.208,
323
- "step": 210
324
- },
325
- {
326
- "epoch": 9.17,
327
- "learning_rate": 0.0,
328
- "loss": 0.0153,
329
- "step": 220
330
- },
331
- {
332
- "epoch": 9.17,
333
- "eval_accuracy": 0.7916666865348816,
334
- "eval_loss": 0.6880165934562683,
335
- "eval_runtime": 10.0673,
336
- "eval_samples_per_second": 4.768,
337
- "eval_steps_per_second": 1.192,
338
- "step": 220
339
- },
340
- {
341
- "epoch": 9.58,
342
- "learning_rate": 0.0,
343
- "loss": 0.0093,
344
- "step": 230
345
- },
346
- {
347
- "epoch": 9.58,
348
- "eval_accuracy": 0.7916666865348816,
349
- "eval_loss": 0.6880165934562683,
350
- "eval_runtime": 9.9352,
351
- "eval_samples_per_second": 4.831,
352
- "eval_steps_per_second": 1.208,
353
- "step": 230
354
- },
355
- {
356
- "epoch": 10.0,
357
- "learning_rate": 0.0,
358
- "loss": 0.0575,
359
- "step": 240
360
- },
361
- {
362
- "epoch": 10.0,
363
- "eval_accuracy": 0.7916666865348816,
364
- "eval_loss": 0.6880165934562683,
365
- "eval_runtime": 10.1458,
366
- "eval_samples_per_second": 4.731,
367
- "eval_steps_per_second": 1.183,
368
- "step": 240
369
  }
370
  ],
371
- "max_steps": 240,
372
- "num_train_epochs": 10,
373
- "total_flos": 6.925070959567395e+17,
374
  "trial_name": null,
375
  "trial_params": null
376
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.3908045977011494,
5
+ "global_step": 60,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.23,
12
+ "learning_rate": 9.418604651162792e-05,
13
+ "loss": 1.9238,
14
  "step": 10
15
  },
16
  {
17
+ "epoch": 0.23,
18
+ "eval_accuracy": 0.1818181872367859,
19
+ "eval_loss": 1.9564208984375,
20
+ "eval_runtime": 19.1351,
21
+ "eval_samples_per_second": 4.599,
22
+ "eval_steps_per_second": 1.15,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.46,
27
+ "learning_rate": 8.837209302325582e-05,
28
+ "loss": 1.9589,
29
  "step": 20
30
  },
31
  {
32
+ "epoch": 0.46,
33
+ "eval_accuracy": 0.15909090638160706,
34
+ "eval_loss": 1.9498623609542847,
35
+ "eval_runtime": 21.5957,
36
+ "eval_samples_per_second": 4.075,
37
+ "eval_steps_per_second": 1.019,
38
  "step": 20
39
  },
40
  {
41
+ "epoch": 0.69,
42
+ "learning_rate": 8.255813953488373e-05,
43
+ "loss": 1.9677,
44
  "step": 30
45
  },
46
  {
47
+ "epoch": 0.69,
48
+ "eval_accuracy": 0.17045454680919647,
49
+ "eval_loss": 1.8762873411178589,
50
+ "eval_runtime": 19.1489,
51
+ "eval_samples_per_second": 4.596,
52
+ "eval_steps_per_second": 1.149,
53
  "step": 30
54
  },
55
  {
56
+ "epoch": 0.92,
57
+ "learning_rate": 7.674418604651163e-05,
58
+ "loss": 1.9357,
59
  "step": 40
60
  },
61
  {
62
+ "epoch": 0.92,
63
+ "eval_accuracy": 0.21590909361839294,
64
+ "eval_loss": 1.8796330690383911,
65
+ "eval_runtime": 21.6027,
66
+ "eval_samples_per_second": 4.074,
67
+ "eval_steps_per_second": 1.018,
68
  "step": 40
69
  },
70
  {
71
+ "epoch": 1.16,
72
+ "learning_rate": 7.093023255813955e-05,
73
+ "loss": 1.9529,
74
  "step": 50
75
  },
76
  {
77
+ "epoch": 1.16,
78
+ "eval_accuracy": 0.22727273404598236,
79
+ "eval_loss": 1.841020107269287,
80
+ "eval_runtime": 22.0824,
81
+ "eval_samples_per_second": 3.985,
82
+ "eval_steps_per_second": 0.996,
83
  "step": 50
84
  },
85
  {
86
+ "epoch": 1.39,
87
+ "learning_rate": 6.511627906976745e-05,
88
+ "loss": 1.8197,
89
  "step": 60
90
  },
91
  {
92
+ "epoch": 1.39,
93
+ "eval_accuracy": 0.35227271914482117,
94
+ "eval_loss": 1.6947021484375,
95
+ "eval_runtime": 21.8177,
96
+ "eval_samples_per_second": 4.033,
97
+ "eval_steps_per_second": 1.008,
98
  "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  }
100
  ],
101
+ "max_steps": 172,
102
+ "num_train_epochs": 4,
103
+ "total_flos": 1.7581701250290854e+17,
104
  "trial_name": null,
105
  "trial_params": null
106
  }