Reynold97 committed
Commit ce6b747 · verified · 1 parent: 485361f

Training in progress, step 50, checkpoint

checkpoint-50/adapter_config.json CHANGED
@@ -10,7 +10,7 @@
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
- "lora_alpha": 32,
+ "lora_alpha": 64,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
@@ -19,7 +19,7 @@
  "score"
  ],
  "peft_type": "LORA",
- "r": 16,
+ "r": 32,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
checkpoint-50/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:68a045468f0e9ab47ef0c1a9da28d023b5322f88b90ffe9027c78c0958eaa0c3
- size 27313024
+ oid sha256:996abc775fa52798168a432665b684d09f9a487bb971709b2be0ff559b4bdc96
+ size 54576048
checkpoint-50/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f0c097f34464ee144ca4b18eec5321b0629e2eb07ddd8237a2644130bbf266fc
- size 54668218
+ oid sha256:b4eb0ea463df519c6e94818d460604ff6999bf25a011fb4edfc8bd8db72a47f9
+ size 109196538
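The adapter weights and optimizer state roughly double in size along with the rank, which is a useful consistency check. A back-of-the-envelope estimate, assuming 2-byte (bf16/fp16) adapter weights; the dtype is not stated in this commit:

```python
# LFS sizes from the pointers above; 2 bytes per parameter is an assumption.
old_adapter, new_adapter = 27_313_024, 54_576_048
print(f"r=16: ~{old_adapter / 2 / 1e6:.1f}M trainable params")  # ~13.7M
print(f"r=32: ~{new_adapter / 2 / 1e6:.1f}M trainable params")  # ~27.3M
```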
checkpoint-50/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6a57775e97d734b1bf77f3d08cc2543cd48b452bdd29105d0c5cff1bb0e96183
+ oid sha256:f9e3f686147a49a9f31c1e5e32f0f7a18a4f215b56c917e895e7df0f5e717c12
  size 14244
checkpoint-50/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f8477ebdd9e98a050efaa98c4ee5fb26ea6ac9516d088fbb54c344ed67ff7e87
+ oid sha256:9583dae960f1692f1c714672f2fe9e52e367d1e6c1734c325e522dd72e1d9343
  size 1064
checkpoint-50/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
  {
- "best_metric": 0.7421436309814453,
  "best_model_checkpoint": "../artifacts/LlaMa3-QLoRA-PatentMatch-v0.1/checkpoint-50",
- "epoch": 0.4716981132075472,
  "eval_steps": 10,
  "global_step": 50,
  "is_hyper_param_search": false,
@@ -9,398 +9,398 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.009433962264150943,
13
- "grad_norm": 6.128956317901611,
14
- "learning_rate": 9.905660377358492e-05,
15
- "loss": 0.461,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.018867924528301886,
20
- "grad_norm": 68.78094482421875,
21
- "learning_rate": 9.811320754716981e-05,
22
- "loss": 0.9442,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 0.02830188679245283,
27
- "grad_norm": 35.7277946472168,
28
- "learning_rate": 9.716981132075472e-05,
29
- "loss": 0.6359,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 0.03773584905660377,
34
- "grad_norm": 56.930320739746094,
35
- "learning_rate": 9.622641509433963e-05,
36
- "loss": 0.5856,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 0.04716981132075472,
41
- "grad_norm": 68.45040130615234,
42
- "learning_rate": 9.528301886792453e-05,
43
- "loss": 0.7444,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 0.05660377358490566,
48
- "grad_norm": 18.832008361816406,
49
- "learning_rate": 9.433962264150944e-05,
50
- "loss": 0.6504,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 0.0660377358490566,
55
- "grad_norm": 11.086563110351562,
56
- "learning_rate": 9.339622641509434e-05,
57
- "loss": 0.5207,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 0.07547169811320754,
62
- "grad_norm": 20.990877151489258,
63
- "learning_rate": 9.245283018867925e-05,
64
- "loss": 0.586,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 0.08490566037735849,
69
- "grad_norm": 14.929924964904785,
70
- "learning_rate": 9.150943396226416e-05,
71
- "loss": 0.4919,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 0.09433962264150944,
76
- "grad_norm": 13.368741989135742,
77
- "learning_rate": 9.056603773584906e-05,
78
- "loss": 0.7216,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 0.09433962264150944,
83
- "eval_loss": 0.7321442365646362,
84
- "eval_runtime": 17.8269,
85
- "eval_samples_per_second": 16.548,
86
- "eval_steps_per_second": 3.31,
87
  "step": 10
88
  },
89
  {
90
- "epoch": 0.10377358490566038,
91
- "grad_norm": 13.765029907226562,
92
- "learning_rate": 8.962264150943397e-05,
93
- "loss": 0.9187,
94
  "step": 11
95
  },
96
  {
97
- "epoch": 0.11320754716981132,
98
- "grad_norm": 23.503965377807617,
99
- "learning_rate": 8.867924528301888e-05,
100
- "loss": 0.742,
101
  "step": 12
102
  },
103
  {
104
- "epoch": 0.12264150943396226,
105
- "grad_norm": 11.606241226196289,
106
- "learning_rate": 8.773584905660378e-05,
107
- "loss": 0.5432,
108
  "step": 13
109
  },
110
  {
111
- "epoch": 0.1320754716981132,
112
- "grad_norm": 37.84874725341797,
113
- "learning_rate": 8.679245283018869e-05,
114
- "loss": 0.6434,
115
  "step": 14
116
  },
117
  {
118
- "epoch": 0.14150943396226415,
119
- "grad_norm": 7.491106033325195,
120
- "learning_rate": 8.584905660377359e-05,
121
- "loss": 0.5317,
122
  "step": 15
123
  },
124
  {
125
- "epoch": 0.1509433962264151,
126
- "grad_norm": 19.157922744750977,
127
- "learning_rate": 8.49056603773585e-05,
128
- "loss": 0.8891,
129
  "step": 16
130
  },
131
  {
132
- "epoch": 0.16037735849056603,
133
- "grad_norm": 29.841453552246094,
134
- "learning_rate": 8.396226415094341e-05,
135
- "loss": 0.5606,
136
  "step": 17
137
  },
138
  {
139
- "epoch": 0.16981132075471697,
140
- "grad_norm": 31.549617767333984,
141
- "learning_rate": 8.30188679245283e-05,
142
- "loss": 0.7426,
143
  "step": 18
144
  },
145
  {
146
- "epoch": 0.1792452830188679,
147
- "grad_norm": 12.463747024536133,
148
- "learning_rate": 8.207547169811322e-05,
149
- "loss": 0.7602,
150
  "step": 19
151
  },
152
  {
153
- "epoch": 0.18867924528301888,
154
- "grad_norm": 37.965084075927734,
155
- "learning_rate": 8.113207547169813e-05,
156
- "loss": 0.6794,
157
  "step": 20
158
  },
159
  {
160
- "epoch": 0.18867924528301888,
161
- "eval_loss": 0.7417612671852112,
162
- "eval_runtime": 18.8468,
163
- "eval_samples_per_second": 15.652,
164
- "eval_steps_per_second": 3.13,
165
  "step": 20
166
  },
167
  {
168
- "epoch": 0.19811320754716982,
169
- "grad_norm": 13.589489936828613,
170
- "learning_rate": 8.018867924528302e-05,
171
- "loss": 0.648,
172
  "step": 21
173
  },
174
  {
175
- "epoch": 0.20754716981132076,
176
- "grad_norm": 39.87663269042969,
177
- "learning_rate": 7.924528301886794e-05,
178
- "loss": 0.7446,
179
  "step": 22
180
  },
181
  {
182
- "epoch": 0.2169811320754717,
183
- "grad_norm": 17.065126419067383,
184
- "learning_rate": 7.830188679245283e-05,
185
- "loss": 0.6289,
186
  "step": 23
187
  },
188
  {
189
- "epoch": 0.22641509433962265,
190
- "grad_norm": 8.414811134338379,
191
- "learning_rate": 7.735849056603774e-05,
192
- "loss": 0.462,
193
  "step": 24
194
  },
195
  {
196
- "epoch": 0.2358490566037736,
197
- "grad_norm": 30.71627426147461,
198
- "learning_rate": 7.641509433962265e-05,
199
- "loss": 0.5479,
200
  "step": 25
201
  },
202
  {
203
- "epoch": 0.24528301886792453,
204
- "grad_norm": 32.945404052734375,
205
- "learning_rate": 7.547169811320755e-05,
206
- "loss": 0.6388,
207
  "step": 26
208
  },
209
  {
210
- "epoch": 0.25471698113207547,
211
- "grad_norm": 8.858525276184082,
212
- "learning_rate": 7.452830188679245e-05,
213
- "loss": 0.6801,
214
  "step": 27
215
  },
216
  {
217
- "epoch": 0.2641509433962264,
218
- "grad_norm": 7.566395282745361,
219
- "learning_rate": 7.358490566037736e-05,
220
- "loss": 0.6122,
221
  "step": 28
222
  },
223
  {
224
- "epoch": 0.27358490566037735,
225
- "grad_norm": 17.68086051940918,
226
- "learning_rate": 7.264150943396226e-05,
227
- "loss": 0.5955,
228
  "step": 29
229
  },
230
  {
231
- "epoch": 0.2830188679245283,
232
- "grad_norm": 13.58435344696045,
233
- "learning_rate": 7.169811320754717e-05,
234
- "loss": 0.6361,
235
  "step": 30
236
  },
237
  {
238
- "epoch": 0.2830188679245283,
239
- "eval_loss": 0.8027433156967163,
240
- "eval_runtime": 19.1354,
241
- "eval_samples_per_second": 15.416,
242
- "eval_steps_per_second": 3.083,
243
  "step": 30
244
  },
245
  {
246
- "epoch": 0.29245283018867924,
247
- "grad_norm": 35.713294982910156,
248
- "learning_rate": 7.075471698113208e-05,
249
- "loss": 0.8423,
250
  "step": 31
251
  },
252
  {
253
- "epoch": 0.3018867924528302,
254
- "grad_norm": 8.273999214172363,
255
- "learning_rate": 6.981132075471698e-05,
256
- "loss": 0.6246,
257
  "step": 32
258
  },
259
  {
260
- "epoch": 0.3113207547169811,
261
- "grad_norm": 24.59564208984375,
262
- "learning_rate": 6.886792452830189e-05,
263
- "loss": 0.6088,
264
  "step": 33
265
  },
266
  {
267
- "epoch": 0.32075471698113206,
268
- "grad_norm": 50.26318359375,
269
- "learning_rate": 6.79245283018868e-05,
270
- "loss": 0.6227,
271
  "step": 34
272
  },
273
  {
274
- "epoch": 0.330188679245283,
275
- "grad_norm": 8.71313190460205,
276
- "learning_rate": 6.69811320754717e-05,
277
- "loss": 0.6782,
278
  "step": 35
279
  },
280
  {
281
- "epoch": 0.33962264150943394,
282
- "grad_norm": 43.067962646484375,
283
- "learning_rate": 6.60377358490566e-05,
284
- "loss": 0.876,
285
  "step": 36
286
  },
287
  {
288
- "epoch": 0.3490566037735849,
289
- "grad_norm": 52.883399963378906,
290
- "learning_rate": 6.50943396226415e-05,
291
- "loss": 0.8668,
292
  "step": 37
293
  },
294
  {
295
- "epoch": 0.3584905660377358,
296
- "grad_norm": 23.537960052490234,
297
- "learning_rate": 6.415094339622641e-05,
298
- "loss": 0.6368,
299
  "step": 38
300
  },
301
  {
302
- "epoch": 0.36792452830188677,
303
- "grad_norm": 18.321443557739258,
304
- "learning_rate": 6.320754716981132e-05,
305
- "loss": 0.6782,
306
  "step": 39
307
  },
308
  {
309
- "epoch": 0.37735849056603776,
310
- "grad_norm": 14.678520202636719,
311
- "learning_rate": 6.226415094339622e-05,
312
- "loss": 0.8059,
313
  "step": 40
314
  },
315
  {
316
- "epoch": 0.37735849056603776,
317
- "eval_loss": 0.7866339683532715,
318
- "eval_runtime": 19.074,
319
- "eval_samples_per_second": 15.466,
320
- "eval_steps_per_second": 3.093,
321
  "step": 40
322
  },
323
  {
324
- "epoch": 0.3867924528301887,
325
- "grad_norm": 53.35795593261719,
326
- "learning_rate": 6.132075471698113e-05,
327
- "loss": 0.8482,
328
  "step": 41
329
  },
330
  {
331
- "epoch": 0.39622641509433965,
332
- "grad_norm": 26.74378776550293,
333
- "learning_rate": 6.037735849056604e-05,
334
- "loss": 0.5963,
335
  "step": 42
336
  },
337
  {
338
- "epoch": 0.4056603773584906,
339
- "grad_norm": 51.7420539855957,
340
- "learning_rate": 5.943396226415094e-05,
341
- "loss": 0.6738,
342
  "step": 43
343
  },
344
  {
345
- "epoch": 0.41509433962264153,
346
- "grad_norm": 7.802835941314697,
347
- "learning_rate": 5.849056603773585e-05,
348
- "loss": 0.7359,
349
  "step": 44
350
  },
351
  {
352
- "epoch": 0.42452830188679247,
353
- "grad_norm": 46.24528121948242,
354
- "learning_rate": 5.7547169811320756e-05,
355
- "loss": 0.6698,
356
  "step": 45
357
  },
358
  {
359
- "epoch": 0.4339622641509434,
360
- "grad_norm": 11.004349708557129,
361
- "learning_rate": 5.660377358490566e-05,
362
- "loss": 0.7172,
363
  "step": 46
364
  },
365
  {
366
- "epoch": 0.44339622641509435,
367
- "grad_norm": 46.21535873413086,
368
- "learning_rate": 5.5660377358490564e-05,
369
- "loss": 0.7805,
370
  "step": 47
371
  },
372
  {
373
- "epoch": 0.4528301886792453,
374
- "grad_norm": 30.457117080688477,
375
- "learning_rate": 5.4716981132075475e-05,
376
- "loss": 0.6887,
377
  "step": 48
378
  },
379
  {
380
- "epoch": 0.46226415094339623,
381
- "grad_norm": 15.550039291381836,
382
- "learning_rate": 5.377358490566038e-05,
383
- "loss": 0.6272,
384
  "step": 49
385
  },
386
  {
387
- "epoch": 0.4716981132075472,
388
- "grad_norm": 22.72784996032715,
389
- "learning_rate": 5.283018867924528e-05,
390
- "loss": 0.6153,
391
  "step": 50
392
  },
393
  {
394
- "epoch": 0.4716981132075472,
395
- "eval_loss": 0.7421436309814453,
396
- "eval_runtime": 19.1648,
397
- "eval_samples_per_second": 15.393,
398
- "eval_steps_per_second": 3.079,
399
  "step": 50
  }
  ],
  "logging_steps": 1,
- "max_steps": 106,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
@@ -416,7 +416,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.905280632127488e+16,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
 
  {
+ "best_metric": 1.322121500968933,
  "best_model_checkpoint": "../artifacts/LlaMa3-QLoRA-PatentMatch-v0.1/checkpoint-50",
+ "epoch": 0.09433962264150944,
  "eval_steps": 10,
  "global_step": 50,
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0018867924528301887,
13
+ "grad_norm": 29.756061553955078,
14
+ "learning_rate": 1e-10,
15
+ "loss": 0.8469,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.0037735849056603774,
20
+ "grad_norm": 95.5372543334961,
21
+ "learning_rate": 2e-10,
22
+ "loss": 1.3855,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 0.005660377358490566,
27
+ "grad_norm": 31.052196502685547,
28
+ "learning_rate": 3e-10,
29
+ "loss": 1.0934,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 0.007547169811320755,
34
+ "grad_norm": 27.79388999938965,
35
+ "learning_rate": 4e-10,
36
+ "loss": 0.5449,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.009433962264150943,
41
+ "grad_norm": 104.76834869384766,
42
+ "learning_rate": 5e-10,
43
+ "loss": 2.7414,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.011320754716981131,
48
+ "grad_norm": 72.9654312133789,
49
+ "learning_rate": 6e-10,
50
+ "loss": 1.8,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.013207547169811321,
55
+ "grad_norm": 118.8756103515625,
56
+ "learning_rate": 7.000000000000001e-10,
57
+ "loss": 2.7953,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.01509433962264151,
62
+ "grad_norm": 38.9265251159668,
63
+ "learning_rate": 8e-10,
64
+ "loss": 0.9516,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.016981132075471698,
69
+ "grad_norm": 36.34098815917969,
70
+ "learning_rate": 9e-10,
71
+ "loss": 1.4047,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 0.018867924528301886,
76
+ "grad_norm": 73.90235900878906,
77
+ "learning_rate": 1e-09,
78
+ "loss": 0.842,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 0.018867924528301886,
83
+ "eval_loss": 1.3371663093566895,
84
+ "eval_runtime": 19.2573,
85
+ "eval_samples_per_second": 15.319,
86
+ "eval_steps_per_second": 1.921,
87
  "step": 10
88
  },
89
  {
90
+ "epoch": 0.020754716981132074,
91
+ "grad_norm": 23.1890926361084,
92
+ "learning_rate": 1.1000000000000001e-09,
93
+ "loss": 0.1208,
94
  "step": 11
95
  },
96
  {
97
+ "epoch": 0.022641509433962263,
98
+ "grad_norm": 22.793596267700195,
99
+ "learning_rate": 1.2e-09,
100
+ "loss": 0.9565,
101
  "step": 12
102
  },
103
  {
104
+ "epoch": 0.024528301886792454,
105
+ "grad_norm": 119.95530700683594,
106
+ "learning_rate": 1.3e-09,
107
+ "loss": 2.2288,
108
  "step": 13
109
  },
110
  {
111
+ "epoch": 0.026415094339622643,
112
+ "grad_norm": 65.44388580322266,
113
+ "learning_rate": 1.4000000000000001e-09,
114
+ "loss": 0.6767,
115
  "step": 14
116
  },
117
  {
118
+ "epoch": 0.02830188679245283,
119
+ "grad_norm": 58.65330123901367,
120
+ "learning_rate": 1.5e-09,
121
+ "loss": 1.082,
122
  "step": 15
123
  },
124
  {
125
+ "epoch": 0.03018867924528302,
126
+ "grad_norm": 32.0055046081543,
127
+ "learning_rate": 1.6e-09,
128
+ "loss": 0.1699,
129
  "step": 16
130
  },
131
  {
132
+ "epoch": 0.03207547169811321,
133
+ "grad_norm": 73.29571533203125,
134
+ "learning_rate": 1.7000000000000001e-09,
135
+ "loss": 0.8457,
136
  "step": 17
137
  },
138
  {
139
+ "epoch": 0.033962264150943396,
140
+ "grad_norm": 148.53343200683594,
141
+ "learning_rate": 1.8e-09,
142
+ "loss": 2.2594,
143
  "step": 18
144
  },
145
  {
146
+ "epoch": 0.035849056603773584,
147
+ "grad_norm": 41.90864181518555,
148
+ "learning_rate": 1.9e-09,
149
+ "loss": 0.8586,
150
  "step": 19
151
  },
152
  {
153
+ "epoch": 0.03773584905660377,
154
+ "grad_norm": 27.577930450439453,
155
+ "learning_rate": 2e-09,
156
+ "loss": 1.0981,
157
  "step": 20
158
  },
159
  {
160
+ "epoch": 0.03773584905660377,
161
+ "eval_loss": 1.3284127712249756,
162
+ "eval_runtime": 19.9562,
163
+ "eval_samples_per_second": 14.782,
164
+ "eval_steps_per_second": 1.854,
165
  "step": 20
166
  },
167
  {
168
+ "epoch": 0.03962264150943396,
169
+ "grad_norm": 42.17684555053711,
170
+ "learning_rate": 2.0999999999999998e-09,
171
+ "loss": 0.8403,
172
  "step": 21
173
  },
174
  {
175
+ "epoch": 0.04150943396226415,
176
+ "grad_norm": 23.889053344726562,
177
+ "learning_rate": 2.2000000000000003e-09,
178
+ "loss": 0.5028,
179
  "step": 22
180
  },
181
  {
182
+ "epoch": 0.04339622641509434,
183
+ "grad_norm": 123.04353332519531,
184
+ "learning_rate": 2.3000000000000003e-09,
185
+ "loss": 2.2626,
186
  "step": 23
187
  },
188
  {
189
+ "epoch": 0.045283018867924525,
190
+ "grad_norm": 28.47271156311035,
191
+ "learning_rate": 2.4e-09,
192
+ "loss": 0.1571,
193
  "step": 24
194
  },
195
  {
196
+ "epoch": 0.04716981132075472,
197
+ "grad_norm": 76.90851593017578,
198
+ "learning_rate": 2.5e-09,
199
+ "loss": 1.1323,
200
  "step": 25
201
  },
202
  {
203
+ "epoch": 0.04905660377358491,
204
+ "grad_norm": 76.9769058227539,
205
+ "learning_rate": 2.6e-09,
206
+ "loss": 1.2021,
207
  "step": 26
208
  },
209
  {
210
+ "epoch": 0.0509433962264151,
211
+ "grad_norm": 65.92312622070312,
212
+ "learning_rate": 2.7e-09,
213
+ "loss": 1.1695,
214
  "step": 27
215
  },
216
  {
217
+ "epoch": 0.052830188679245285,
218
+ "grad_norm": 68.66289520263672,
219
+ "learning_rate": 2.8000000000000003e-09,
220
+ "loss": 1.2923,
221
  "step": 28
222
  },
223
  {
224
+ "epoch": 0.05471698113207547,
225
+ "grad_norm": 60.09720230102539,
226
+ "learning_rate": 2.9e-09,
227
+ "loss": 0.8071,
228
  "step": 29
229
  },
230
  {
231
+ "epoch": 0.05660377358490566,
232
+ "grad_norm": 120.86378479003906,
233
+ "learning_rate": 3e-09,
234
+ "loss": 2.4214,
235
  "step": 30
236
  },
237
  {
238
+ "epoch": 0.05660377358490566,
239
+ "eval_loss": 1.322973370552063,
240
+ "eval_runtime": 20.2855,
241
+ "eval_samples_per_second": 14.542,
242
+ "eval_steps_per_second": 1.824,
243
  "step": 30
244
  },
245
  {
246
+ "epoch": 0.05849056603773585,
247
+ "grad_norm": 51.104766845703125,
248
+ "learning_rate": 3.1e-09,
249
+ "loss": 1.0263,
250
  "step": 31
251
  },
252
  {
253
+ "epoch": 0.06037735849056604,
254
+ "grad_norm": 74.6181640625,
255
+ "learning_rate": 3.2e-09,
256
+ "loss": 1.5383,
257
  "step": 32
258
  },
259
  {
260
+ "epoch": 0.062264150943396226,
261
+ "grad_norm": 30.15936279296875,
262
+ "learning_rate": 3.3e-09,
263
+ "loss": 0.3719,
264
  "step": 33
265
  },
266
  {
267
+ "epoch": 0.06415094339622641,
268
+ "grad_norm": 157.64108276367188,
269
+ "learning_rate": 3.4000000000000003e-09,
270
+ "loss": 3.4692,
271
  "step": 34
272
  },
273
  {
274
+ "epoch": 0.0660377358490566,
275
+ "grad_norm": 43.18593215942383,
276
+ "learning_rate": 3.5e-09,
277
+ "loss": 0.549,
278
  "step": 35
279
  },
280
  {
281
+ "epoch": 0.06792452830188679,
282
+ "grad_norm": 125.1178207397461,
283
+ "learning_rate": 3.6e-09,
284
+ "loss": 1.3121,
285
  "step": 36
286
  },
287
  {
288
+ "epoch": 0.06981132075471698,
289
+ "grad_norm": 111.42374420166016,
290
+ "learning_rate": 3.7e-09,
291
+ "loss": 1.8228,
292
  "step": 37
293
  },
294
  {
295
+ "epoch": 0.07169811320754717,
296
+ "grad_norm": 35.12282943725586,
297
+ "learning_rate": 3.8e-09,
298
+ "loss": 0.5382,
299
  "step": 38
300
  },
301
  {
302
+ "epoch": 0.07358490566037736,
303
+ "grad_norm": 159.79856872558594,
304
+ "learning_rate": 3.9e-09,
305
+ "loss": 2.6343,
306
  "step": 39
307
  },
308
  {
309
+ "epoch": 0.07547169811320754,
310
+ "grad_norm": 20.610780715942383,
311
+ "learning_rate": 4e-09,
312
+ "loss": 0.5307,
313
  "step": 40
314
  },
315
  {
316
+ "epoch": 0.07547169811320754,
317
+ "eval_loss": 1.3228683471679688,
318
+ "eval_runtime": 20.4238,
319
+ "eval_samples_per_second": 14.444,
320
+ "eval_steps_per_second": 1.812,
321
  "step": 40
322
  },
323
  {
324
+ "epoch": 0.07735849056603773,
325
+ "grad_norm": 25.3465633392334,
326
+ "learning_rate": 4.0999999999999995e-09,
327
+ "loss": 0.7819,
328
  "step": 41
329
  },
330
  {
331
+ "epoch": 0.07924528301886792,
332
+ "grad_norm": 76.80998229980469,
333
+ "learning_rate": 4.1999999999999996e-09,
334
+ "loss": 1.4672,
335
  "step": 42
336
  },
337
  {
338
+ "epoch": 0.08113207547169811,
339
+ "grad_norm": 27.21162223815918,
340
+ "learning_rate": 4.3e-09,
341
+ "loss": 1.2953,
342
  "step": 43
343
  },
344
  {
345
+ "epoch": 0.0830188679245283,
346
+ "grad_norm": 105.13359832763672,
347
+ "learning_rate": 4.4000000000000005e-09,
348
+ "loss": 1.9177,
349
  "step": 44
350
  },
351
  {
352
+ "epoch": 0.08490566037735849,
353
+ "grad_norm": 28.627273559570312,
354
+ "learning_rate": 4.500000000000001e-09,
355
+ "loss": 0.2641,
356
  "step": 45
357
  },
358
  {
359
+ "epoch": 0.08679245283018867,
360
+ "grad_norm": 39.927616119384766,
361
+ "learning_rate": 4.600000000000001e-09,
362
+ "loss": 1.7988,
363
  "step": 46
364
  },
365
  {
366
+ "epoch": 0.08867924528301886,
367
+ "grad_norm": 117.57942962646484,
368
+ "learning_rate": 4.7e-09,
369
+ "loss": 1.8186,
370
  "step": 47
371
  },
372
  {
373
+ "epoch": 0.09056603773584905,
374
+ "grad_norm": 65.71804809570312,
375
+ "learning_rate": 4.8e-09,
376
+ "loss": 1.1402,
377
  "step": 48
378
  },
379
  {
380
+ "epoch": 0.09245283018867924,
381
+ "grad_norm": 45.33087921142578,
382
+ "learning_rate": 4.9e-09,
383
+ "loss": 1.6036,
384
  "step": 49
385
  },
386
  {
387
+ "epoch": 0.09433962264150944,
388
+ "grad_norm": 76.15016174316406,
389
+ "learning_rate": 5e-09,
390
+ "loss": 1.1135,
391
  "step": 50
392
  },
393
  {
394
+ "epoch": 0.09433962264150944,
395
+ "eval_loss": 1.322121500968933,
396
+ "eval_runtime": 20.4665,
397
+ "eval_samples_per_second": 14.414,
398
+ "eval_steps_per_second": 1.808,
399
  "step": 50
  }
  ],
  "logging_steps": 1,
+ "max_steps": 530,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,

  "attributes": {}
  }
  },
+ "total_flos": 3589628022374400.0,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
checkpoint-50/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7639eddf8788426b3403c73375c624a0bd04ca994bbc9d09fd8030ef07a2dfc2
+ oid sha256:30601ceab6d241a569cfd829c7955620a983f03985e8370f4a3f11ecd5ec3a95
  size 5112
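training_args.bin changes hash but not size; it holds the serialized TrainingArguments, so loading it is the most direct way to see exactly which hyperparameters differ between the two runs. A minimal sketch, assuming a recent PyTorch and transformers installed so the pickled class resolves:

```python
import torch

# Inspect the TrainingArguments pickled into the checkpoint.
# weights_only=False is required for arbitrary pickled objects; only use it
# on checkpoints you trust.
args = torch.load("checkpoint-50/training_args.bin", weights_only=False)
print(args.learning_rate, args.max_steps, args.per_device_train_batch_size)
```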