Mel-Iza0 commited on
Commit
af890dc
·
1 Parent(s): 68af67c

Training in progress, epoch 8, checkpoint

Browse files
checkpoint-30478/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: True
17
+ - bnb_4bit_compute_dtype: bfloat16
18
+
19
+ The following `bitsandbytes` quantization config was used during training:
20
+ - quant_method: bitsandbytes
21
+ - load_in_8bit: False
22
+ - load_in_4bit: True
23
+ - llm_int8_threshold: 6.0
24
+ - llm_int8_skip_modules: None
25
+ - llm_int8_enable_fp32_cpu_offload: False
26
+ - llm_int8_has_fp16_weight: False
27
+ - bnb_4bit_quant_type: nf4
28
+ - bnb_4bit_use_double_quant: True
29
+ - bnb_4bit_compute_dtype: bfloat16
30
+ ### Framework versions
31
+
32
+ - PEFT 0.4.0
33
+
34
+ - PEFT 0.4.0
checkpoint-30478/adapter_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 16,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "k_proj",
19
+ "v_proj",
+ "o_proj"
20
+ ],
21
+ "task_type": "CAUSAL_LM"
22
+ }
checkpoint-30478/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86df2be2e0ef3c6a999da11a505a30c7d5b0faaf708bd49bfe58fb8d7a3c7274
3
+ size 13677261
checkpoint-30478/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99bffa27a573160d03bf719b932c9dba46ace3260da3c77318b3c13516e40840
3
+ size 13648432
checkpoint-30478/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8507ff1c584e6617b6e4dc616a94da0df714511d2785effbebf761644dbb60a5
3
+ size 27370181
checkpoint-30478/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e382170e4faa60d9192067415c41df46ab5e823dc4bd321b643810d3ee593485
3
+ size 14575
checkpoint-30478/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4777023b1df9cec050d9a51f0d37b2a53dc39f849b99665432024ee66962913a
3
+ size 627
checkpoint-30478/special_tokens_map.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<unk>",
4
+ "<s>",
5
+ "</s>"
6
+ ],
7
+ "bos_token": "<s>",
8
+ "eos_token": "</s>",
9
+ "pad_token": "</s>",
10
+ "unk_token": "<unk>"
11
+ }
checkpoint-30478/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-30478/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "additional_special_tokens": [
29
+ "<unk>",
30
+ "<s>",
31
+ "</s>"
32
+ ],
33
+ "bos_token": "<s>",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "</s>",
36
+ "legacy": true,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": true
44
+ }
checkpoint-30478/trainer_state.json ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7868753671646118,
3
+ "best_model_checkpoint": "./Zeroshot/01-12-23-mistralai-Mistral-7B-v0.1_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_3/checkpoints/checkpoint-27092",
4
+ "epoch": 8.999852354938728,
5
+ "eval_steps": 500,
6
+ "global_step": 30478,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.15,
13
+ "learning_rate": 5.835794447725931e-05,
14
+ "loss": 1.4468,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.3,
19
+ "learning_rate": 0.00011742468989958655,
20
+ "loss": 0.9754,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.44,
25
+ "learning_rate": 0.00017649143532191377,
26
+ "loss": 0.9429,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.59,
31
+ "learning_rate": 0.000235558180744241,
32
+ "loss": 0.9147,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.74,
37
+ "learning_rate": 0.00029462492616656825,
38
+ "loss": 0.9067,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.89,
43
+ "learning_rate": 0.00035369167158889544,
44
+ "loss": 0.8978,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 1.0,
49
+ "eval_loss": 0.8805813789367676,
50
+ "eval_runtime": 88.2945,
51
+ "eval_samples_per_second": 17.057,
52
+ "eval_steps_per_second": 2.141,
53
+ "step": 3386
54
+ },
55
+ {
56
+ "epoch": 1.03,
57
+ "learning_rate": 0.00039998760393503537,
58
+ "loss": 0.895,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 1.18,
63
+ "learning_rate": 0.0003996072594095129,
64
+ "loss": 0.8687,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 1.33,
69
+ "learning_rate": 0.00039869668890858337,
70
+ "loss": 0.8884,
71
+ "step": 4500
72
+ },
73
+ {
74
+ "epoch": 1.48,
75
+ "learning_rate": 0.00039725831122269285,
76
+ "loss": 0.8715,
77
+ "step": 5000
78
+ },
79
+ {
80
+ "epoch": 1.62,
81
+ "learning_rate": 0.00039529594718087214,
82
+ "loss": 0.8645,
83
+ "step": 5500
84
+ },
85
+ {
86
+ "epoch": 1.77,
87
+ "learning_rate": 0.0003928148095012922,
88
+ "loss": 0.8666,
89
+ "step": 6000
90
+ },
91
+ {
92
+ "epoch": 1.92,
93
+ "learning_rate": 0.0003898214889444803,
94
+ "loss": 0.8719,
95
+ "step": 6500
96
+ },
97
+ {
98
+ "epoch": 2.0,
99
+ "eval_loss": 0.8552550673484802,
100
+ "eval_runtime": 88.3232,
101
+ "eval_samples_per_second": 17.051,
102
+ "eval_steps_per_second": 2.14,
103
+ "step": 6773
104
+ },
105
+ {
106
+ "epoch": 2.07,
107
+ "learning_rate": 0.00038632393680597854,
108
+ "loss": 0.8438,
109
+ "step": 7000
110
+ },
111
+ {
112
+ "epoch": 2.21,
113
+ "learning_rate": 0.0003823314437949511,
114
+ "loss": 0.8308,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 2.36,
119
+ "learning_rate": 0.00037785461535484375,
120
+ "loss": 0.8259,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 2.51,
125
+ "learning_rate": 0.0003729053434916558,
126
+ "loss": 0.8324,
127
+ "step": 8500
128
+ },
129
+ {
130
+ "epoch": 2.66,
131
+ "learning_rate": 0.0003674967751846552,
132
+ "loss": 0.8413,
133
+ "step": 9000
134
+ },
135
+ {
136
+ "epoch": 2.81,
137
+ "learning_rate": 0.0003616554183563445,
138
+ "loss": 0.8322,
139
+ "step": 9500
140
+ },
141
+ {
142
+ "epoch": 2.95,
143
+ "learning_rate": 0.00035537338261496887,
144
+ "loss": 0.8368,
145
+ "step": 10000
146
+ },
147
+ {
148
+ "epoch": 3.0,
149
+ "eval_loss": 0.8443310260772705,
150
+ "eval_runtime": 88.3102,
151
+ "eval_samples_per_second": 17.054,
152
+ "eval_steps_per_second": 2.14,
153
+ "step": 10159
154
+ },
155
+ {
156
+ "epoch": 3.1,
157
+ "learning_rate": 0.0003486786213865893,
158
+ "loss": 0.8088,
159
+ "step": 10500
160
+ },
161
+ {
162
+ "epoch": 3.25,
163
+ "learning_rate": 0.0003415889182744321,
164
+ "loss": 0.8003,
165
+ "step": 11000
166
+ },
167
+ {
168
+ "epoch": 3.4,
169
+ "learning_rate": 0.0003341231059840768,
170
+ "loss": 0.805,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 3.54,
175
+ "learning_rate": 0.0003263010162972709,
176
+ "loss": 0.8061,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 3.69,
181
+ "learning_rate": 0.00031814342739185336,
182
+ "loss": 0.8008,
183
+ "step": 12500
184
+ },
185
+ {
186
+ "epoch": 3.84,
187
+ "learning_rate": 0.000309672008647721,
188
+ "loss": 0.8029,
189
+ "step": 13000
190
+ },
191
+ {
192
+ "epoch": 3.99,
193
+ "learning_rate": 0.00030090926308545536,
194
+ "loss": 0.8056,
195
+ "step": 13500
196
+ },
197
+ {
198
+ "epoch": 4.0,
199
+ "eval_loss": 0.8322489857673645,
200
+ "eval_runtime": 88.3294,
201
+ "eval_samples_per_second": 17.05,
202
+ "eval_steps_per_second": 2.14,
203
+ "step": 13546
204
+ },
205
+ {
206
+ "epoch": 4.13,
207
+ "learning_rate": 0.00029187846759051,
208
+ "loss": 0.7649,
209
+ "step": 14000
210
+ },
211
+ {
212
+ "epoch": 4.28,
213
+ "learning_rate": 0.00028260361108174584,
214
+ "loss": 0.7674,
215
+ "step": 14500
216
+ },
217
+ {
218
+ "epoch": 4.43,
219
+ "learning_rate": 0.0002731093307885585,
220
+ "loss": 0.7635,
221
+ "step": 15000
222
+ },
223
+ {
224
+ "epoch": 4.58,
225
+ "learning_rate": 0.0002634208468058692,
226
+ "loss": 0.7759,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 4.72,
231
+ "learning_rate": 0.000253563895100822,
232
+ "loss": 0.7669,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 4.87,
237
+ "learning_rate": 0.0002435847820221107,
238
+ "loss": 0.7708,
239
+ "step": 16500
240
+ },
241
+ {
242
+ "epoch": 5.0,
243
+ "eval_loss": 0.8134331107139587,
244
+ "eval_runtime": 88.3002,
245
+ "eval_samples_per_second": 17.055,
246
+ "eval_steps_per_second": 2.14,
247
+ "step": 16932
248
+ },
249
+ {
250
+ "epoch": 5.02,
251
+ "learning_rate": 0.00023347002797691627,
252
+ "loss": 0.7681,
253
+ "step": 17000
254
+ },
255
+ {
256
+ "epoch": 5.17,
257
+ "learning_rate": 0.00022326636595170415,
258
+ "loss": 0.7237,
259
+ "step": 17500
260
+ },
261
+ {
262
+ "epoch": 5.32,
263
+ "learning_rate": 0.0002130214749113661,
264
+ "loss": 0.7299,
265
+ "step": 18000
266
+ },
267
+ {
268
+ "epoch": 5.46,
269
+ "learning_rate": 0.0002027215162553563,
270
+ "loss": 0.7307,
271
+ "step": 18500
272
+ },
273
+ {
274
+ "epoch": 5.61,
275
+ "learning_rate": 0.0001924143283101145,
276
+ "loss": 0.7251,
277
+ "step": 19000
278
+ },
279
+ {
280
+ "epoch": 5.76,
281
+ "learning_rate": 0.00018212729053467496,
282
+ "loss": 0.7328,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 5.91,
287
+ "learning_rate": 0.00017188772886224626,
288
+ "loss": 0.731,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 6.0,
293
+ "eval_loss": 0.8020778298377991,
294
+ "eval_runtime": 88.3249,
295
+ "eval_samples_per_second": 17.051,
296
+ "eval_steps_per_second": 2.14,
297
+ "step": 20319
298
+ },
299
+ {
300
+ "epoch": 6.05,
301
+ "learning_rate": 0.00016172284311307314,
302
+ "loss": 0.7118,
303
+ "step": 20500
304
+ },
305
+ {
306
+ "epoch": 6.2,
307
+ "learning_rate": 0.00015167964186544786,
308
+ "loss": 0.691,
309
+ "step": 21000
310
+ },
311
+ {
312
+ "epoch": 6.35,
313
+ "learning_rate": 0.00014174455895749337,
314
+ "loss": 0.6873,
315
+ "step": 21500
316
+ },
317
+ {
318
+ "epoch": 6.5,
319
+ "learning_rate": 0.00013196422266617455,
320
+ "loss": 0.6892,
321
+ "step": 22000
322
+ },
323
+ {
324
+ "epoch": 6.64,
325
+ "learning_rate": 0.00012236461295016225,
326
+ "loss": 0.6864,
327
+ "step": 22500
328
+ },
329
+ {
330
+ "epoch": 6.79,
331
+ "learning_rate": 0.00011298979396555838,
332
+ "loss": 0.6848,
333
+ "step": 23000
334
+ },
335
+ {
336
+ "epoch": 6.94,
337
+ "learning_rate": 0.00010382710238831153,
338
+ "loss": 0.6828,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 7.0,
343
+ "eval_loss": 0.7894787788391113,
344
+ "eval_runtime": 88.3291,
345
+ "eval_samples_per_second": 17.05,
346
+ "eval_steps_per_second": 2.14,
347
+ "step": 23705
348
+ },
349
+ {
350
+ "epoch": 7.09,
351
+ "learning_rate": 9.491987931754444e-05,
352
+ "loss": 0.6629,
353
+ "step": 24000
354
+ },
355
+ {
356
+ "epoch": 7.23,
357
+ "learning_rate": 8.629178542098462e-05,
358
+ "loss": 0.652,
359
+ "step": 24500
360
+ },
361
+ {
362
+ "epoch": 7.38,
363
+ "learning_rate": 7.796573990265166e-05,
364
+ "loss": 0.6479,
365
+ "step": 25000
366
+ },
367
+ {
368
+ "epoch": 7.53,
369
+ "learning_rate": 6.996385962151814e-05,
370
+ "loss": 0.65,
371
+ "step": 25500
372
+ },
373
+ {
374
+ "epoch": 7.68,
375
+ "learning_rate": 6.230740034147598e-05,
376
+ "loss": 0.6455,
377
+ "step": 26000
378
+ },
379
+ {
380
+ "epoch": 7.83,
381
+ "learning_rate": 5.503090360176009e-05,
382
+ "loss": 0.6516,
383
+ "step": 26500
384
+ },
385
+ {
386
+ "epoch": 7.97,
387
+ "learning_rate": 4.8124540451398006e-05,
388
+ "loss": 0.6453,
389
+ "step": 27000
390
+ },
391
+ {
392
+ "epoch": 8.0,
393
+ "eval_loss": 0.7868753671646118,
394
+ "eval_runtime": 88.3103,
395
+ "eval_samples_per_second": 17.054,
396
+ "eval_steps_per_second": 2.14,
397
+ "step": 27092
398
+ },
399
+ {
400
+ "epoch": 8.12,
401
+ "learning_rate": 4.162161109356324e-05,
402
+ "loss": 0.6222,
403
+ "step": 27500
404
+ },
405
+ {
406
+ "epoch": 8.27,
407
+ "learning_rate": 3.553938955963088e-05,
408
+ "loss": 0.6164,
409
+ "step": 28000
410
+ },
411
+ {
412
+ "epoch": 8.42,
413
+ "learning_rate": 2.9894032335367272e-05,
414
+ "loss": 0.6214,
415
+ "step": 28500
416
+ },
417
+ {
418
+ "epoch": 8.56,
419
+ "learning_rate": 2.470053544371056e-05,
420
+ "loss": 0.6215,
421
+ "step": 29000
422
+ },
423
+ {
424
+ "epoch": 8.71,
425
+ "learning_rate": 1.9981677010252242e-05,
426
+ "loss": 0.6181,
427
+ "step": 29500
428
+ },
429
+ {
430
+ "epoch": 8.86,
431
+ "learning_rate": 1.573108289234544e-05,
432
+ "loss": 0.6179,
433
+ "step": 30000
434
+ },
435
+ {
436
+ "epoch": 9.0,
437
+ "eval_loss": 0.7876124382019043,
438
+ "eval_runtime": 88.313,
439
+ "eval_samples_per_second": 17.053,
440
+ "eval_steps_per_second": 2.14,
441
+ "step": 30478
442
+ }
443
+ ],
444
+ "logging_steps": 500,
445
+ "max_steps": 33860,
446
+ "num_train_epochs": 10,
447
+ "save_steps": 500,
448
+ "total_flos": 1.530218263203152e+18,
449
+ "trial_name": null,
450
+ "trial_params": null
451
+ }
checkpoint-30478/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57ba9992e8be82ca13275abd7c4d38c76e2e922e71387553242d20e094340bc1
3
+ size 4347