alicegoesdown commited on
Commit
5ffbb5a
·
verified ·
1 Parent(s): 12bd456

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1aec38ec987cade1694dd2adb977be799dc886de277db28a7617163fdb79093f
3
  size 100689176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c0dcc370af7068987019d5ad7c55775cd4aa685f984103fd5fb8da8d0263a78
3
  size 100689176
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4cf02ff75d920a72a544ac9eb9e2180ab22d712f5963fbb3016e4a4418a0d6dd
3
  size 201488698
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c6a4f4139c8708380a359a5afcbe53de60e8b0bdaaaef9b89ffbc87e489ebb1
3
  size 201488698
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3fa9d6e7bb503576de38e4bc341860abcc3b9e550d7810ca32a03e7fcdb4512
3
- size 14308
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adee8db53dd522679041ac1e22258a9dccde0894ff7e7335c959c7a488f59fa9
3
+ size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef8d2900de30e6031eb67496f65d84b5428252e5a56573254c12f627baa587a8
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5f37e21fbe85e09b136734aed2deb2ce642b5bd3d64c65196a2c110d8c5ff3a
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,351 +1,125 @@
1
  {
2
- "best_metric": 1.0556260347366333,
3
- "best_model_checkpoint": "./output/checkpoint-450",
4
- "epoch": 0.03256621797655232,
5
  "eval_steps": 150,
6
- "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0007236937328122738,
13
- "grad_norm": 1.5576016902923584,
14
- "learning_rate": 1.25e-05,
15
- "loss": 1.2444,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.0014473874656245477,
20
- "grad_norm": 1.1290874481201172,
21
- "learning_rate": 2.5e-05,
22
- "loss": 1.3366,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.0021710811984368217,
27
- "grad_norm": 1.8639086484909058,
28
- "learning_rate": 3.75e-05,
29
- "loss": 1.256,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.0028947749312490953,
34
- "grad_norm": 1.134912133216858,
35
- "learning_rate": 5e-05,
36
- "loss": 1.1769,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.0036184686640613694,
41
- "grad_norm": 1.3332692384719849,
42
- "learning_rate": 6.25e-05,
43
- "loss": 1.0865,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.004342162396873643,
48
- "grad_norm": 1.3363752365112305,
49
- "learning_rate": 7.5e-05,
50
- "loss": 1.084,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.005065856129685917,
55
- "grad_norm": 1.0881965160369873,
56
- "learning_rate": 8.75e-05,
57
- "loss": 1.0748,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.005789549862498191,
62
- "grad_norm": 1.3386387825012207,
63
- "learning_rate": 0.0001,
64
- "loss": 1.1922,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.006513243595310464,
69
- "grad_norm": 0.9421172142028809,
70
- "learning_rate": 0.00011250000000000001,
71
- "loss": 1.0653,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.007236937328122739,
76
- "grad_norm": 1.0505911111831665,
77
- "learning_rate": 0.000125,
78
- "loss": 1.0001,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.007960631060935012,
83
- "grad_norm": 1.0293700695037842,
84
- "learning_rate": 0.00012499871543489787,
85
- "loss": 1.1826,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.008684324793747287,
90
- "grad_norm": 1.0566595792770386,
91
- "learning_rate": 0.00012499486179239495,
92
- "loss": 1.1505,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.00940801852655956,
97
- "grad_norm": 0.7607803344726562,
98
- "learning_rate": 0.00012498843923089938,
99
- "loss": 1.1219,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.010131712259371834,
104
- "grad_norm": 0.9567335844039917,
105
- "learning_rate": 0.0001249794480144175,
106
- "loss": 1.1675,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.010855405992184108,
111
- "grad_norm": 0.9335429668426514,
112
- "learning_rate": 0.000124967888512543,
113
- "loss": 1.1177,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.010855405992184108,
118
- "eval_loss": 1.084096074104309,
119
- "eval_runtime": 68.3319,
120
- "eval_samples_per_second": 7.317,
121
- "eval_steps_per_second": 7.317,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 0.011579099724996381,
126
- "grad_norm": 1.0022648572921753,
127
- "learning_rate": 0.00012495376120044173,
128
- "loss": 1.0762,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.012302793457808655,
133
- "grad_norm": 1.0098097324371338,
134
- "learning_rate": 0.00012493706665883217,
135
- "loss": 1.0473,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 0.013026487190620929,
140
- "grad_norm": 0.9342713952064514,
141
- "learning_rate": 0.00012491780557396154,
142
- "loss": 1.2133,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 0.013750180923433204,
147
- "grad_norm": 1.2415492534637451,
148
- "learning_rate": 0.00012489597873757756,
149
- "loss": 1.1635,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 0.014473874656245478,
154
- "grad_norm": 0.727070152759552,
155
- "learning_rate": 0.00012487158704689602,
156
- "loss": 1.1106,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 0.015197568389057751,
161
- "grad_norm": 1.039361596107483,
162
- "learning_rate": 0.0001248446315045638,
163
- "loss": 1.1199,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 0.015921262121870023,
168
- "grad_norm": 1.1260257959365845,
169
- "learning_rate": 0.00012481511321861763,
170
- "loss": 1.1924,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 0.0166449558546823,
175
- "grad_norm": 0.9752004742622375,
176
- "learning_rate": 0.00012478303340243864,
177
- "loss": 1.0699,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 0.017368649587494574,
182
- "grad_norm": 1.4011763334274292,
183
- "learning_rate": 0.00012474839337470246,
184
- "loss": 1.106,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 0.018092343320306847,
189
- "grad_norm": 0.9429338574409485,
190
- "learning_rate": 0.0001247111945593249,
191
- "loss": 1.161,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 0.01881603705311912,
196
- "grad_norm": 1.0999586582183838,
197
- "learning_rate": 0.00012467143848540359,
198
- "loss": 1.1911,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 0.019539730785931395,
203
- "grad_norm": 1.1027190685272217,
204
- "learning_rate": 0.000124629126787155,
205
- "loss": 1.2331,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 0.020263424518743668,
210
- "grad_norm": 0.9375354647636414,
211
- "learning_rate": 0.00012458426120384738,
212
- "loss": 1.0836,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 0.020987118251555942,
217
- "grad_norm": 2.151923894882202,
218
- "learning_rate": 0.00012453684357972906,
219
- "loss": 1.0584,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 0.021710811984368215,
224
- "grad_norm": 0.833605945110321,
225
- "learning_rate": 0.00012448687586395289,
226
- "loss": 1.1453,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 0.021710811984368215,
231
- "eval_loss": 1.061118483543396,
232
- "eval_runtime": 67.9747,
233
- "eval_samples_per_second": 7.356,
234
- "eval_steps_per_second": 7.356,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 0.02243450571718049,
239
- "grad_norm": 1.0999945402145386,
240
- "learning_rate": 0.00012443436011049593,
241
- "loss": 1.2021,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 0.023158199449992763,
246
- "grad_norm": 1.1812015771865845,
247
- "learning_rate": 0.0001243792984780751,
248
- "loss": 1.0777,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 0.023881893182805036,
253
- "grad_norm": 1.6861599683761597,
254
- "learning_rate": 0.00012432169323005853,
255
- "loss": 1.1632,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 0.02460558691561731,
260
- "grad_norm": 1.1688460111618042,
261
- "learning_rate": 0.00012426154673437223,
262
- "loss": 1.07,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 0.025329280648429583,
267
- "grad_norm": 0.8406811952590942,
268
- "learning_rate": 0.00012419886146340314,
269
- "loss": 1.0175,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 0.026052974381241857,
274
- "grad_norm": 1.205870270729065,
275
- "learning_rate": 0.0001241336399938972,
276
- "loss": 1.2039,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 0.02677666811405413,
281
- "grad_norm": 0.7990264296531677,
282
- "learning_rate": 0.00012406588500685355,
283
- "loss": 1.0588,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 0.027500361846866408,
288
- "grad_norm": 1.1303527355194092,
289
- "learning_rate": 0.00012399559928741435,
290
- "loss": 1.1073,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 0.02822405557967868,
295
- "grad_norm": 0.7280349731445312,
296
- "learning_rate": 0.00012392278572475023,
297
- "loss": 1.0966,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 0.028947749312490955,
302
- "grad_norm": 1.076653242111206,
303
- "learning_rate": 0.0001238474473119416,
304
- "loss": 1.111,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 0.02967144304530323,
309
- "grad_norm": 0.9253267049789429,
310
- "learning_rate": 0.00012376958714585545,
311
- "loss": 1.1165,
312
- "step": 410
313
- },
314
- {
315
- "epoch": 0.030395136778115502,
316
- "grad_norm": 0.9444619417190552,
317
- "learning_rate": 0.0001236892084270183,
318
- "loss": 1.1612,
319
- "step": 420
320
- },
321
- {
322
- "epoch": 0.031118830510927776,
323
- "grad_norm": 1.2068166732788086,
324
- "learning_rate": 0.00012360631445948448,
325
- "loss": 1.1822,
326
- "step": 430
327
- },
328
- {
329
- "epoch": 0.031842524243740046,
330
- "grad_norm": 0.8767175674438477,
331
- "learning_rate": 0.00012352090865070026,
332
- "loss": 0.9645,
333
- "step": 440
334
- },
335
- {
336
- "epoch": 0.03256621797655232,
337
- "grad_norm": 0.8043785691261292,
338
- "learning_rate": 0.00012343299451136397,
339
- "loss": 1.1397,
340
- "step": 450
341
- },
342
- {
343
- "epoch": 0.03256621797655232,
344
- "eval_loss": 1.0556260347366333,
345
- "eval_runtime": 67.7421,
346
- "eval_samples_per_second": 7.381,
347
- "eval_steps_per_second": 7.381,
348
- "step": 450
349
  }
350
  ],
351
  "logging_steps": 10,
@@ -365,8 +139,8 @@
365
  "attributes": {}
366
  }
367
  },
368
- "total_flos": 9.640123545491866e+16,
369
- "train_batch_size": 8,
370
  "trial_name": null,
371
  "trial_params": null
372
  }
 
1
  {
2
+ "best_metric": 1.0342717170715332,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 0.005427702996092054,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0003618468664061369,
13
+ "grad_norm": 1.2045749425888062,
14
+ "learning_rate": 5.500000000000001e-06,
15
+ "loss": 1.144,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.0007236937328122738,
20
+ "grad_norm": 1.50728178024292,
21
+ "learning_rate": 1.1000000000000001e-05,
22
+ "loss": 1.1809,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.0010855405992184109,
27
+ "grad_norm": 0.9494473934173584,
28
+ "learning_rate": 1.65e-05,
29
+ "loss": 1.0738,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.0014473874656245477,
34
+ "grad_norm": 0.957133948802948,
35
+ "learning_rate": 2.2000000000000003e-05,
36
+ "loss": 0.9573,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.0018092343320306847,
41
+ "grad_norm": 1.7268428802490234,
42
+ "learning_rate": 2.75e-05,
43
+ "loss": 1.0361,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.0021710811984368217,
48
+ "grad_norm": 1.1843866109848022,
49
+ "learning_rate": 3.3e-05,
50
+ "loss": 1.0351,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.0025329280648429585,
55
+ "grad_norm": 1.5817480087280273,
56
+ "learning_rate": 3.85e-05,
57
+ "loss": 1.1654,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.0028947749312490953,
62
+ "grad_norm": 0.8221575617790222,
63
+ "learning_rate": 4.4000000000000006e-05,
64
+ "loss": 1.1031,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.003256621797655232,
69
+ "grad_norm": 1.0210144519805908,
70
+ "learning_rate": 4.9500000000000004e-05,
71
+ "loss": 1.2083,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.0036184686640613694,
76
+ "grad_norm": 1.6523082256317139,
77
+ "learning_rate": 5.5e-05,
78
+ "loss": 1.1551,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.003980315530467506,
83
+ "grad_norm": 1.3959214687347412,
84
+ "learning_rate": 5.4999434791355066e-05,
85
+ "loss": 1.2088,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.004342162396873643,
90
+ "grad_norm": 1.7850854396820068,
91
+ "learning_rate": 5.4997739188653784e-05,
92
+ "loss": 1.0394,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.00470400926327978,
97
+ "grad_norm": 1.707861304283142,
98
+ "learning_rate": 5.4994913261595724e-05,
99
+ "loss": 1.0406,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.005065856129685917,
104
+ "grad_norm": 1.622674584388733,
105
+ "learning_rate": 5.49909571263437e-05,
106
+ "loss": 1.0781,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.005427702996092054,
111
+ "grad_norm": 1.135132908821106,
112
+ "learning_rate": 5.498587094551892e-05,
113
+ "loss": 1.2658,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.005427702996092054,
118
+ "eval_loss": 1.0342717170715332,
119
+ "eval_runtime": 68.3584,
120
+ "eval_samples_per_second": 7.314,
121
+ "eval_steps_per_second": 7.314,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 1.358716220940288e+16,
143
+ "train_batch_size": 4,
144
  "trial_name": null,
145
  "trial_params": null
146
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54d6539c84a7f0730a538274bf05d1d19242ee2f5b7307043f37a16d2ee393e6
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36ffe1bc7f9782a0960f25658b63a6896f6292d5fbd06dd9772d5fa0c7cd4b3b
3
  size 5496