scottsuk0306 committed on
Commit
7bbb34a
1 Parent(s): 724ba1b

Model save

Browse files
README.md ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: alignment-handbook/zephyr-7b-sft-full
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: zephyr-7b-math-train-test
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # zephyr-7b-math-train-test
20
+
21
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 0.0130
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 1e-05
43
+ - train_batch_size: 16
44
+ - eval_batch_size: 8
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 4
48
+ - total_train_batch_size: 64
49
+ - total_eval_batch_size: 32
50
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
+ - lr_scheduler_type: cosine
52
+ - lr_scheduler_warmup_ratio: 0.1
53
+ - num_epochs: 10
54
+
55
+ ### Training results
56
+
57
+ | Training Loss | Epoch | Step | Validation Loss |
58
+ |:-------------:|:-----:|:----:|:---------------:|
59
+ | 0.746 | 1.0 | 10 | 0.6231 |
60
+ | 0.5258 | 2.0 | 20 | 0.3843 |
61
+ | 0.3183 | 3.0 | 30 | 0.1999 |
62
+ | 0.16 | 4.0 | 40 | 0.0864 |
63
+ | 0.0811 | 5.0 | 50 | 0.0496 |
64
+ | 0.0502 | 6.0 | 60 | 0.0345 |
65
+ | 0.035 | 7.0 | 70 | 0.0254 |
66
+ | 0.0241 | 8.0 | 80 | 0.0185 |
67
+ | 0.0165 | 9.0 | 90 | 0.0142 |
68
+ | 0.0129 | 10.0 | 100 | 0.0130 |
69
+
70
+
71
+ ### Framework versions
72
+
73
+ - Transformers 4.44.2
74
+ - Pytorch 2.4.1+cu124
75
+ - Datasets 2.21.0
76
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 20937965568000.0,
4
+ "train_loss": 0.21048430703580379,
5
+ "train_runtime": 572.8088,
6
+ "train_samples": 4000,
7
+ "train_samples_per_second": 10.737,
8
+ "train_steps_per_second": 0.175
9
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.44.2"
6
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 20937965568000.0,
4
+ "train_loss": 0.21048430703580379,
5
+ "train_runtime": 572.8088,
6
+ "train_samples": 4000,
7
+ "train_samples_per_second": 10.737,
8
+ "train_steps_per_second": 0.175
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1,
13
+ "grad_norm": 18.916099095189473,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 0.9574,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.5,
20
+ "grad_norm": 21.53952313643694,
21
+ "learning_rate": 5e-06,
22
+ "loss": 0.8754,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 1.0,
27
+ "grad_norm": 5.798825182601948,
28
+ "learning_rate": 1e-05,
29
+ "loss": 0.746,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 1.0,
34
+ "eval_loss": 0.6230876445770264,
35
+ "eval_runtime": 5.7302,
36
+ "eval_samples_per_second": 53.052,
37
+ "eval_steps_per_second": 1.745,
38
+ "step": 10
39
+ },
40
+ {
41
+ "epoch": 1.5,
42
+ "grad_norm": 3.2446769999187137,
43
+ "learning_rate": 9.924038765061042e-06,
44
+ "loss": 0.5866,
45
+ "step": 15
46
+ },
47
+ {
48
+ "epoch": 2.0,
49
+ "grad_norm": 2.9248994732336526,
50
+ "learning_rate": 9.698463103929542e-06,
51
+ "loss": 0.5258,
52
+ "step": 20
53
+ },
54
+ {
55
+ "epoch": 2.0,
56
+ "eval_loss": 0.3843041658401489,
57
+ "eval_runtime": 5.7293,
58
+ "eval_samples_per_second": 53.061,
59
+ "eval_steps_per_second": 1.745,
60
+ "step": 20
61
+ },
62
+ {
63
+ "epoch": 2.5,
64
+ "grad_norm": 3.044511051436622,
65
+ "learning_rate": 9.330127018922195e-06,
66
+ "loss": 0.3601,
67
+ "step": 25
68
+ },
69
+ {
70
+ "epoch": 3.0,
71
+ "grad_norm": 2.817325596304153,
72
+ "learning_rate": 8.83022221559489e-06,
73
+ "loss": 0.3183,
74
+ "step": 30
75
+ },
76
+ {
77
+ "epoch": 3.0,
78
+ "eval_loss": 0.1999027132987976,
79
+ "eval_runtime": 5.7226,
80
+ "eval_samples_per_second": 53.123,
81
+ "eval_steps_per_second": 1.747,
82
+ "step": 30
83
+ },
84
+ {
85
+ "epoch": 3.5,
86
+ "grad_norm": 3.4431057415284267,
87
+ "learning_rate": 8.213938048432697e-06,
88
+ "loss": 0.1832,
89
+ "step": 35
90
+ },
91
+ {
92
+ "epoch": 4.0,
93
+ "grad_norm": 2.7772199860436415,
94
+ "learning_rate": 7.500000000000001e-06,
95
+ "loss": 0.16,
96
+ "step": 40
97
+ },
98
+ {
99
+ "epoch": 4.0,
100
+ "eval_loss": 0.0864211916923523,
101
+ "eval_runtime": 5.7261,
102
+ "eval_samples_per_second": 53.09,
103
+ "eval_steps_per_second": 1.746,
104
+ "step": 40
105
+ },
106
+ {
107
+ "epoch": 4.5,
108
+ "grad_norm": 3.315697037143245,
109
+ "learning_rate": 6.710100716628345e-06,
110
+ "loss": 0.0802,
111
+ "step": 45
112
+ },
113
+ {
114
+ "epoch": 5.0,
115
+ "grad_norm": 2.0887623280319887,
116
+ "learning_rate": 5.8682408883346535e-06,
117
+ "loss": 0.0811,
118
+ "step": 50
119
+ },
120
+ {
121
+ "epoch": 5.0,
122
+ "eval_loss": 0.04959693178534508,
123
+ "eval_runtime": 5.7295,
124
+ "eval_samples_per_second": 53.059,
125
+ "eval_steps_per_second": 1.745,
126
+ "step": 50
127
+ },
128
+ {
129
+ "epoch": 5.5,
130
+ "grad_norm": 2.1432132959969725,
131
+ "learning_rate": 5e-06,
132
+ "loss": 0.0479,
133
+ "step": 55
134
+ },
135
+ {
136
+ "epoch": 6.0,
137
+ "grad_norm": 1.8407217223386276,
138
+ "learning_rate": 4.131759111665349e-06,
139
+ "loss": 0.0502,
140
+ "step": 60
141
+ },
142
+ {
143
+ "epoch": 6.0,
144
+ "eval_loss": 0.034496039152145386,
145
+ "eval_runtime": 5.726,
146
+ "eval_samples_per_second": 53.091,
147
+ "eval_steps_per_second": 1.746,
148
+ "step": 60
149
+ },
150
+ {
151
+ "epoch": 6.5,
152
+ "grad_norm": 1.5928033449038717,
153
+ "learning_rate": 3.289899283371657e-06,
154
+ "loss": 0.0339,
155
+ "step": 65
156
+ },
157
+ {
158
+ "epoch": 7.0,
159
+ "grad_norm": 1.359878925254289,
160
+ "learning_rate": 2.5000000000000015e-06,
161
+ "loss": 0.035,
162
+ "step": 70
163
+ },
164
+ {
165
+ "epoch": 7.0,
166
+ "eval_loss": 0.025400497019290924,
167
+ "eval_runtime": 5.7281,
168
+ "eval_samples_per_second": 53.072,
169
+ "eval_steps_per_second": 1.746,
170
+ "step": 70
171
+ },
172
+ {
173
+ "epoch": 7.5,
174
+ "grad_norm": 1.1463040760182688,
175
+ "learning_rate": 1.7860619515673034e-06,
176
+ "loss": 0.0248,
177
+ "step": 75
178
+ },
179
+ {
180
+ "epoch": 8.0,
181
+ "grad_norm": 1.0125936690851214,
182
+ "learning_rate": 1.1697777844051105e-06,
183
+ "loss": 0.0241,
184
+ "step": 80
185
+ },
186
+ {
187
+ "epoch": 8.0,
188
+ "eval_loss": 0.0185158122330904,
189
+ "eval_runtime": 5.7304,
190
+ "eval_samples_per_second": 53.051,
191
+ "eval_steps_per_second": 1.745,
192
+ "step": 80
193
+ },
194
+ {
195
+ "epoch": 8.5,
196
+ "grad_norm": 0.8554989663331637,
197
+ "learning_rate": 6.698729810778065e-07,
198
+ "loss": 0.0177,
199
+ "step": 85
200
+ },
201
+ {
202
+ "epoch": 9.0,
203
+ "grad_norm": 0.6771852125259162,
204
+ "learning_rate": 3.015368960704584e-07,
205
+ "loss": 0.0165,
206
+ "step": 90
207
+ },
208
+ {
209
+ "epoch": 9.0,
210
+ "eval_loss": 0.014229783788323402,
211
+ "eval_runtime": 5.7297,
212
+ "eval_samples_per_second": 53.057,
213
+ "eval_steps_per_second": 1.745,
214
+ "step": 90
215
+ },
216
+ {
217
+ "epoch": 9.5,
218
+ "grad_norm": 0.5405112996842772,
219
+ "learning_rate": 7.59612349389599e-08,
220
+ "loss": 0.0135,
221
+ "step": 95
222
+ },
223
+ {
224
+ "epoch": 10.0,
225
+ "grad_norm": 0.5192308458018222,
226
+ "learning_rate": 0.0,
227
+ "loss": 0.0129,
228
+ "step": 100
229
+ },
230
+ {
231
+ "epoch": 10.0,
232
+ "eval_loss": 0.01297028735280037,
233
+ "eval_runtime": 5.7028,
234
+ "eval_samples_per_second": 53.307,
235
+ "eval_steps_per_second": 1.754,
236
+ "step": 100
237
+ },
238
+ {
239
+ "epoch": 10.0,
240
+ "step": 100,
241
+ "total_flos": 20937965568000.0,
242
+ "train_loss": 0.21048430703580379,
243
+ "train_runtime": 572.8088,
244
+ "train_samples_per_second": 10.737,
245
+ "train_steps_per_second": 0.175
246
+ }
247
+ ],
248
+ "logging_steps": 5,
249
+ "max_steps": 100,
250
+ "num_input_tokens_seen": 0,
251
+ "num_train_epochs": 10,
252
+ "save_steps": 100,
253
+ "stateful_callbacks": {
254
+ "TrainerControl": {
255
+ "args": {
256
+ "should_epoch_stop": false,
257
+ "should_evaluate": false,
258
+ "should_log": false,
259
+ "should_save": true,
260
+ "should_training_stop": true
261
+ },
262
+ "attributes": {}
263
+ }
264
+ },
265
+ "total_flos": 20937965568000.0,
266
+ "train_batch_size": 16,
267
+ "trial_name": null,
268
+ "trial_params": null
269
+ }