aadityap committed on
Commit
c52a984
1 Parent(s): 070c04e

Model save

Browse files
Files changed (4) hide show
  1. README.md +64 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +492 -0
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: barc0/Llama-3.1-ARC-Potpourri-Transduction-8B
3
+ library_name: peft
4
+ license: llama3.1
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: problem39_model_more_aug_30
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # problem39_model_more_aug_30
18
+
19
+ This model is a fine-tuned version of [barc0/Llama-3.1-ARC-Potpourri-Transduction-8B](https://huggingface.co/barc0/Llama-3.1-ARC-Potpourri-Transduction-8B) on an unknown dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.0029
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 5e-05
41
+ - train_batch_size: 2
42
+ - eval_batch_size: 2
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
46
+ - lr_scheduler_type: cosine
47
+ - lr_scheduler_warmup_ratio: 0.1
48
+ - num_epochs: 2
49
+
50
+ ### Training results
51
+
52
+ | Training Loss | Epoch | Step | Validation Loss |
53
+ |:-------------:|:-----:|:----:|:---------------:|
54
+ | 0.0 | 1.0 | 31 | 0.0026 |
55
+ | 0.0001 | 2.0 | 62 | 0.0029 |
56
+
57
+
58
+ ### Framework versions
59
+
60
+ - PEFT 0.13.2
61
+ - Transformers 4.47.0.dev0
62
+ - Pytorch 2.4.0+cu121
63
+ - Datasets 3.1.0
64
+ - Tokenizers 0.20.3
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 1144521031680.0,
4
+ "train_loss": 0.0004356121934755896,
5
+ "train_runtime": 331.8626,
6
+ "train_samples": 62,
7
+ "train_samples_per_second": 0.374,
8
+ "train_steps_per_second": 0.187
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 1144521031680.0,
4
+ "train_loss": 0.0004356121934755896,
5
+ "train_runtime": 331.8626,
6
+ "train_samples": 62,
7
+ "train_samples_per_second": 0.374,
8
+ "train_steps_per_second": 0.187
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 62,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03225806451612903,
13
+ "grad_norm": 0.0003888311526422569,
14
+ "learning_rate": 7.142857142857143e-06,
15
+ "loss": 0.0,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.06451612903225806,
20
+ "grad_norm": 0.006705866500275534,
21
+ "learning_rate": 1.4285714285714285e-05,
22
+ "loss": 0.0018,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.0967741935483871,
27
+ "grad_norm": 0.0004288404580521471,
28
+ "learning_rate": 2.1428571428571428e-05,
29
+ "loss": 0.0001,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.12903225806451613,
34
+ "grad_norm": 0.006721843317752399,
35
+ "learning_rate": 2.857142857142857e-05,
36
+ "loss": 0.0019,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.16129032258064516,
41
+ "grad_norm": 0.009519105488016918,
42
+ "learning_rate": 3.571428571428572e-05,
43
+ "loss": 0.0032,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.1935483870967742,
48
+ "grad_norm": 0.009983139237850786,
49
+ "learning_rate": 4.2857142857142856e-05,
50
+ "loss": 0.003,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.22580645161290322,
55
+ "grad_norm": 0.006711102239168073,
56
+ "learning_rate": 5e-05,
57
+ "loss": 0.0014,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.25806451612903225,
62
+ "grad_norm": 0.0001960930857351409,
63
+ "learning_rate": 4.995922759815339e-05,
64
+ "loss": 0.0,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.2903225806451613,
69
+ "grad_norm": 9.143550595543499e-05,
70
+ "learning_rate": 4.9837043383713753e-05,
71
+ "loss": 0.0,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.3225806451612903,
76
+ "grad_norm": 0.007417075139109489,
77
+ "learning_rate": 4.963384589619233e-05,
78
+ "loss": 0.0017,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.3548387096774194,
83
+ "grad_norm": 0.002566821879916539,
84
+ "learning_rate": 4.935029792355834e-05,
85
+ "loss": 0.0006,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.3870967741935484,
90
+ "grad_norm": 0.007642635295730325,
91
+ "learning_rate": 4.898732434036244e-05,
92
+ "loss": 0.0009,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.41935483870967744,
97
+ "grad_norm": 0.00826117485142426,
98
+ "learning_rate": 4.854610909098812e-05,
99
+ "loss": 0.0011,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.45161290322580644,
104
+ "grad_norm": 0.005359952597223366,
105
+ "learning_rate": 4.802809132787125e-05,
106
+ "loss": 0.0005,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.4838709677419355,
111
+ "grad_norm": 0.002749396030753739,
112
+ "learning_rate": 4.743496071728396e-05,
113
+ "loss": 0.0002,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.5161290322580645,
118
+ "grad_norm": 0.0011744439020923403,
119
+ "learning_rate": 4.6768651927994434e-05,
120
+ "loss": 0.0002,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.5483870967741935,
125
+ "grad_norm": 0.0017986768332600579,
126
+ "learning_rate": 4.6031338320779534e-05,
127
+ "loss": 0.0003,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.5806451612903226,
132
+ "grad_norm": 0.0027352310170441737,
133
+ "learning_rate": 4.522542485937369e-05,
134
+ "loss": 0.0006,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.6129032258064516,
139
+ "grad_norm": 0.00021274929498089895,
140
+ "learning_rate": 4.4353540265977064e-05,
141
+ "loss": 0.0,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.6451612903225806,
146
+ "grad_norm": 0.00015254003122006633,
147
+ "learning_rate": 4.341852844691012e-05,
148
+ "loss": 0.0,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.6774193548387096,
153
+ "grad_norm": 0.00014754669500526948,
154
+ "learning_rate": 4.242343921638234e-05,
155
+ "loss": 0.0,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.7096774193548387,
160
+ "grad_norm": 0.014701207072580798,
161
+ "learning_rate": 4.137151834863213e-05,
162
+ "loss": 0.0053,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.7419354838709677,
167
+ "grad_norm": 0.00040132537069888165,
168
+ "learning_rate": 4.0266196990885955e-05,
169
+ "loss": 0.0001,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.7741935483870968,
174
+ "grad_norm": 0.000310253267324018,
175
+ "learning_rate": 3.911108047166924e-05,
176
+ "loss": 0.0,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.8064516129032258,
181
+ "grad_norm": 0.0032453162218567663,
182
+ "learning_rate": 3.790993654097405e-05,
183
+ "loss": 0.0004,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.8387096774193549,
188
+ "grad_norm": 0.0011259240533704532,
189
+ "learning_rate": 3.6666683080641846e-05,
190
+ "loss": 0.0002,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.8709677419354839,
195
+ "grad_norm": 6.593340410831557e-05,
196
+ "learning_rate": 3.5385375325047166e-05,
197
+ "loss": 0.0,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.9032258064516129,
202
+ "grad_norm": 5.1984982035018296e-05,
203
+ "learning_rate": 3.4070192633766025e-05,
204
+ "loss": 0.0,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.9354838709677419,
209
+ "grad_norm": 0.0012978852470741332,
210
+ "learning_rate": 3.272542485937369e-05,
211
+ "loss": 0.0002,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.967741935483871,
216
+ "grad_norm": 0.0011559129352493,
217
+ "learning_rate": 3.135545835483718e-05,
218
+ "loss": 0.0001,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 1.0,
223
+ "grad_norm": 0.00016768764915493187,
224
+ "learning_rate": 2.996476166614364e-05,
225
+ "loss": 0.0,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 1.0,
230
+ "eval_loss": 0.002580386819317937,
231
+ "eval_runtime": 0.6554,
232
+ "eval_samples_per_second": 1.526,
233
+ "eval_steps_per_second": 1.526,
234
+ "step": 31
235
+ },
236
+ {
237
+ "epoch": 1.032258064516129,
238
+ "grad_norm": 9.698020571956317e-05,
239
+ "learning_rate": 2.8557870956832132e-05,
240
+ "loss": 0.0,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 1.064516129032258,
245
+ "grad_norm": 0.0017065509209019311,
246
+ "learning_rate": 2.7139375211970996e-05,
247
+ "loss": 0.0001,
248
+ "step": 33
249
+ },
250
+ {
251
+ "epoch": 1.096774193548387,
252
+ "grad_norm": 0.008080666632729714,
253
+ "learning_rate": 2.5713901269842404e-05,
254
+ "loss": 0.0022,
255
+ "step": 34
256
+ },
257
+ {
258
+ "epoch": 1.129032258064516,
259
+ "grad_norm": 0.00014016671601073153,
260
+ "learning_rate": 2.42860987301576e-05,
261
+ "loss": 0.0,
262
+ "step": 35
263
+ },
264
+ {
265
+ "epoch": 1.1612903225806452,
266
+ "grad_norm": 0.0002183282791861122,
267
+ "learning_rate": 2.2860624788029013e-05,
268
+ "loss": 0.0,
269
+ "step": 36
270
+ },
271
+ {
272
+ "epoch": 1.1935483870967742,
273
+ "grad_norm": 3.4706191979932056e-05,
274
+ "learning_rate": 2.1442129043167874e-05,
275
+ "loss": 0.0,
276
+ "step": 37
277
+ },
278
+ {
279
+ "epoch": 1.2258064516129032,
280
+ "grad_norm": 6.274199180955771e-05,
281
+ "learning_rate": 2.003523833385637e-05,
282
+ "loss": 0.0,
283
+ "step": 38
284
+ },
285
+ {
286
+ "epoch": 1.2580645161290323,
287
+ "grad_norm": 0.00011240971099287573,
288
+ "learning_rate": 1.8644541645162834e-05,
289
+ "loss": 0.0,
290
+ "step": 39
291
+ },
292
+ {
293
+ "epoch": 1.2903225806451613,
294
+ "grad_norm": 8.379442890594463e-05,
295
+ "learning_rate": 1.7274575140626318e-05,
296
+ "loss": 0.0,
297
+ "step": 40
298
+ },
299
+ {
300
+ "epoch": 1.3225806451612903,
301
+ "grad_norm": 0.0006950311671464802,
302
+ "learning_rate": 1.5929807366233977e-05,
303
+ "loss": 0.0001,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 1.3548387096774195,
308
+ "grad_norm": 6.554897296239256e-05,
309
+ "learning_rate": 1.4614624674952842e-05,
310
+ "loss": 0.0,
311
+ "step": 42
312
+ },
313
+ {
314
+ "epoch": 1.3870967741935485,
315
+ "grad_norm": 8.946111691896196e-05,
316
+ "learning_rate": 1.3333316919358157e-05,
317
+ "loss": 0.0,
318
+ "step": 43
319
+ },
320
+ {
321
+ "epoch": 1.4193548387096775,
322
+ "grad_norm": 0.00021237835272870894,
323
+ "learning_rate": 1.2090063459025955e-05,
324
+ "loss": 0.0,
325
+ "step": 44
326
+ },
327
+ {
328
+ "epoch": 1.4516129032258065,
329
+ "grad_norm": 8.780703503531893e-05,
330
+ "learning_rate": 1.0888919528330777e-05,
331
+ "loss": 0.0,
332
+ "step": 45
333
+ },
334
+ {
335
+ "epoch": 1.4838709677419355,
336
+ "grad_norm": 9.310185931359011e-05,
337
+ "learning_rate": 9.733803009114045e-06,
338
+ "loss": 0.0,
339
+ "step": 46
340
+ },
341
+ {
342
+ "epoch": 1.5161290322580645,
343
+ "grad_norm": 0.0022132919985140046,
344
+ "learning_rate": 8.628481651367876e-06,
345
+ "loss": 0.0002,
346
+ "step": 47
347
+ },
348
+ {
349
+ "epoch": 1.5483870967741935,
350
+ "grad_norm": 0.0001803834028245764,
351
+ "learning_rate": 7.576560783617668e-06,
352
+ "loss": 0.0,
353
+ "step": 48
354
+ },
355
+ {
356
+ "epoch": 1.5806451612903225,
357
+ "grad_norm": 0.0011069882126815845,
358
+ "learning_rate": 6.5814715530898745e-06,
359
+ "loss": 0.0001,
360
+ "step": 49
361
+ },
362
+ {
363
+ "epoch": 1.6129032258064515,
364
+ "grad_norm": 0.00015904430381639932,
365
+ "learning_rate": 5.646459734022938e-06,
366
+ "loss": 0.0,
367
+ "step": 50
368
+ },
369
+ {
370
+ "epoch": 1.6451612903225805,
371
+ "grad_norm": 0.00023196882598516344,
372
+ "learning_rate": 4.7745751406263165e-06,
373
+ "loss": 0.0,
374
+ "step": 51
375
+ },
376
+ {
377
+ "epoch": 1.6774193548387095,
378
+ "grad_norm": 8.494839968665877e-05,
379
+ "learning_rate": 3.968661679220468e-06,
380
+ "loss": 0.0,
381
+ "step": 52
382
+ },
383
+ {
384
+ "epoch": 1.7096774193548387,
385
+ "grad_norm": 7.495013324600562e-05,
386
+ "learning_rate": 3.2313480720055745e-06,
387
+ "loss": 0.0,
388
+ "step": 53
389
+ },
390
+ {
391
+ "epoch": 1.7419354838709677,
392
+ "grad_norm": 0.00011153696481041336,
393
+ "learning_rate": 2.565039282716045e-06,
394
+ "loss": 0.0,
395
+ "step": 54
396
+ },
397
+ {
398
+ "epoch": 1.7741935483870968,
399
+ "grad_norm": 9.790228118890396e-05,
400
+ "learning_rate": 1.97190867212875e-06,
401
+ "loss": 0.0,
402
+ "step": 55
403
+ },
404
+ {
405
+ "epoch": 1.8064516129032258,
406
+ "grad_norm": 0.0005323293049174912,
407
+ "learning_rate": 1.4538909090118846e-06,
408
+ "loss": 0.0,
409
+ "step": 56
410
+ },
411
+ {
412
+ "epoch": 1.838709677419355,
413
+ "grad_norm": 7.75389691903468e-05,
414
+ "learning_rate": 1.0126756596375686e-06,
415
+ "loss": 0.0,
416
+ "step": 57
417
+ },
418
+ {
419
+ "epoch": 1.870967741935484,
420
+ "grad_norm": 9.419564261080586e-05,
421
+ "learning_rate": 6.497020764416633e-07,
422
+ "loss": 0.0,
423
+ "step": 58
424
+ },
425
+ {
426
+ "epoch": 1.903225806451613,
427
+ "grad_norm": 1.6085484123530773e-05,
428
+ "learning_rate": 3.6615410380767544e-07,
429
+ "loss": 0.0,
430
+ "step": 59
431
+ },
432
+ {
433
+ "epoch": 1.935483870967742,
434
+ "grad_norm": 0.0005223311340339994,
435
+ "learning_rate": 1.6295661628624447e-07,
436
+ "loss": 0.0001,
437
+ "step": 60
438
+ },
439
+ {
440
+ "epoch": 1.967741935483871,
441
+ "grad_norm": 9.50629744971249e-05,
442
+ "learning_rate": 4.07724018466088e-08,
443
+ "loss": 0.0,
444
+ "step": 61
445
+ },
446
+ {
447
+ "epoch": 2.0,
448
+ "grad_norm": 0.0005980841644283981,
449
+ "learning_rate": 0.0,
450
+ "loss": 0.0001,
451
+ "step": 62
452
+ },
453
+ {
454
+ "epoch": 2.0,
455
+ "eval_loss": 0.0029119281098246574,
456
+ "eval_runtime": 1.4832,
457
+ "eval_samples_per_second": 0.674,
458
+ "eval_steps_per_second": 0.674,
459
+ "step": 62
460
+ },
461
+ {
462
+ "epoch": 2.0,
463
+ "step": 62,
464
+ "total_flos": 1144521031680.0,
465
+ "train_loss": 0.0004356121934755896,
466
+ "train_runtime": 331.8626,
467
+ "train_samples_per_second": 0.374,
468
+ "train_steps_per_second": 0.187
469
+ }
470
+ ],
471
+ "logging_steps": 1,
472
+ "max_steps": 62,
473
+ "num_input_tokens_seen": 0,
474
+ "num_train_epochs": 2,
475
+ "save_steps": 500,
476
+ "stateful_callbacks": {
477
+ "TrainerControl": {
478
+ "args": {
479
+ "should_epoch_stop": false,
480
+ "should_evaluate": false,
481
+ "should_log": false,
482
+ "should_save": true,
483
+ "should_training_stop": true
484
+ },
485
+ "attributes": {}
486
+ }
487
+ },
488
+ "total_flos": 1144521031680.0,
489
+ "train_batch_size": 2,
490
+ "trial_name": null,
491
+ "trial_params": null
492
+ }