ale-bay committed on
Commit
ee8cd5b
1 Parent(s): 0d5bdd6

Model save

Browse files
README.md CHANGED
@@ -1,15 +1,10 @@
1
  ---
2
- base_model: data/gemma-2b
3
  tags:
4
- - alignment-handbook
5
- - trl
6
- - sft
7
- - generated_from_trainer
8
  - trl
9
  - sft
10
  - generated_from_trainer
11
  datasets:
12
- - argilla/dpo-mix-7k
13
  model-index:
14
  - name: zephyr-2b-gemma-dft
15
  results: []
@@ -18,12 +13,10 @@ model-index:
18
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
19
  should probably proofread and complete it, then remove this comment. -->
20
 
21
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://zebra.wandb.io/cto/distillm/runs/kje9xugl)
22
  # zephyr-2b-gemma-dft
23
 
24
- This model is a fine-tuned version of [data/gemma-2b](https://huggingface.co/data/gemma-2b) on the argilla/dpo-mix-7k dataset.
25
- It achieves the following results on the evaluation set:
26
- - Loss: 0.0000
27
 
28
  ## Model description
29
 
@@ -51,13 +44,10 @@ The following hyperparameters were used during training:
51
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
- - num_epochs: 1
55
 
56
  ### Training results
57
 
58
- | Training Loss | Epoch | Step | Validation Loss |
59
- |:-------------:|:------:|:----:|:---------------:|
60
- | 0.0 | 0.9982 | 270 | 0.0000 |
61
 
62
 
63
  ### Framework versions
 
1
  ---
 
2
  tags:
 
 
 
 
3
  - trl
4
  - sft
5
  - generated_from_trainer
6
  datasets:
7
+ - generator
8
  model-index:
9
  - name: zephyr-2b-gemma-dft
10
  results: []
 
13
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
  should probably proofread and complete it, then remove this comment. -->
15
 
16
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](None)
17
  # zephyr-2b-gemma-dft
18
 
19
+ This model was trained from scratch on the generator dataset.
 
 
20
 
21
  ## Model description
22
 
 
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.1
47
+ - num_epochs: 0
48
 
49
  ### Training results
50
 
 
 
 
51
 
52
 
53
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 0.9981515711645101,
3
- "eval_loss": 1.3726342331210617e-05,
4
- "eval_runtime": 50.9693,
5
- "eval_samples": 750,
6
- "eval_samples_per_second": 19.051,
7
- "eval_steps_per_second": 2.394,
8
- "total_flos": 5.260333472022528e+16,
9
- "train_loss": 1.8035727003330572e-05,
10
- "train_runtime": 2689.0507,
11
  "train_samples": 6750,
12
- "train_samples_per_second": 3.217,
13
- "train_steps_per_second": 0.1
14
  }
 
1
  {
2
+ "epoch": 0,
3
+ "total_flos": 0,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 4.808,
 
 
 
 
 
6
  "train_samples": 6750,
7
+ "train_samples_per_second": 0.0,
8
+ "train_steps_per_second": 0.0
9
  }
config.json CHANGED
@@ -24,6 +24,6 @@
24
  "rope_theta": 10000.0,
25
  "torch_dtype": "bfloat16",
26
  "transformers_version": "4.42.4",
27
- "use_cache": true,
28
  "vocab_size": 256000
29
  }
 
24
  "rope_theta": 10000.0,
25
  "torch_dtype": "bfloat16",
26
  "transformers_version": "4.42.4",
27
+ "use_cache": false,
28
  "vocab_size": 256000
29
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:166c2cc2ac8f29542b11f7812cc6cdded64345b787b9b74e50c194790584c068
3
  size 4945242264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98caad07f1e1d17fede7734906d6881d4cb231e1d0cd2ebcf7fc8a6317eee4d6
3
  size 4945242264
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:403d33ecb881a9ae12700a261993eb4f1ad36a361fbb628ffaa31bf3ff3456e1
3
  size 67121608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3342a2f7db857bc9873c699eef1416eb2b320603e5e042a6f7736689b26382cb
3
  size 67121608
runs/Jul23_16-03-42_ale-distillm-8-0-0/events.out.tfevents.1721747038.ale-distillm-8-0-0.67747.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e00a4e18a9b4b510ffada73fab88f7ab13cc0ec51603704a110c340d5f9e7f99
3
+ size 5428
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9981515711645101,
3
- "total_flos": 5.260333472022528e+16,
4
- "train_loss": 1.8035727003330572e-05,
5
- "train_runtime": 2689.0507,
6
  "train_samples": 6750,
7
- "train_samples_per_second": 3.217,
8
- "train_steps_per_second": 0.1
9
  }
 
1
  {
2
+ "epoch": 0,
3
+ "total_flos": 0,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 4.808,
6
  "train_samples": 6750,
7
+ "train_samples_per_second": 0.0,
8
+ "train_steps_per_second": 0.0
9
  }
trainer_state.json CHANGED
@@ -1,413 +1,27 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9981515711645101,
5
  "eval_steps": 500,
6
- "global_step": 270,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.018484288354898338,
13
- "grad_norm": 0.00118255615234375,
14
- "learning_rate": 3.7037037037037037e-06,
15
- "loss": 0.0001,
16
- "step": 5
17
- },
18
- {
19
- "epoch": 0.036968576709796676,
20
- "grad_norm": 0.000579833984375,
21
- "learning_rate": 7.4074074074074075e-06,
22
- "loss": 0.0001,
23
- "step": 10
24
- },
25
- {
26
- "epoch": 0.05545286506469501,
27
- "grad_norm": 0.000614166259765625,
28
- "learning_rate": 1.1111111111111113e-05,
29
- "loss": 0.0,
30
- "step": 15
31
- },
32
- {
33
- "epoch": 0.07393715341959335,
34
- "grad_norm": 0.000850677490234375,
35
- "learning_rate": 1.4814814814814815e-05,
36
- "loss": 0.0,
37
- "step": 20
38
- },
39
- {
40
- "epoch": 0.09242144177449169,
41
- "grad_norm": 0.000446319580078125,
42
- "learning_rate": 1.851851851851852e-05,
43
- "loss": 0.0,
44
- "step": 25
45
- },
46
- {
47
- "epoch": 0.11090573012939002,
48
- "grad_norm": 0.00075531005859375,
49
- "learning_rate": 1.9992479525042305e-05,
50
- "loss": 0.0,
51
- "step": 30
52
- },
53
- {
54
- "epoch": 0.12939001848428835,
55
- "grad_norm": 0.0003108978271484375,
56
- "learning_rate": 1.9946562024066018e-05,
57
- "loss": 0.0,
58
- "step": 35
59
- },
60
- {
61
- "epoch": 0.1478743068391867,
62
- "grad_norm": 0.0003833770751953125,
63
- "learning_rate": 1.9859096633447965e-05,
64
- "loss": 0.0,
65
- "step": 40
66
- },
67
- {
68
- "epoch": 0.16635859519408502,
69
- "grad_norm": 0.000499725341796875,
70
- "learning_rate": 1.973044870579824e-05,
71
- "loss": 0.0,
72
- "step": 45
73
- },
74
- {
75
- "epoch": 0.18484288354898337,
76
- "grad_norm": 0.000118255615234375,
77
- "learning_rate": 1.95611556177388e-05,
78
- "loss": 0.0,
79
- "step": 50
80
- },
81
- {
82
- "epoch": 0.2033271719038817,
83
- "grad_norm": 7.724761962890625e-05,
84
- "learning_rate": 1.93519245252219e-05,
85
- "loss": 0.0,
86
- "step": 55
87
- },
88
- {
89
- "epoch": 0.22181146025878004,
90
- "grad_norm": 0.0002689361572265625,
91
- "learning_rate": 1.9103629409661468e-05,
92
- "loss": 0.0,
93
- "step": 60
94
- },
95
- {
96
- "epoch": 0.24029574861367836,
97
- "grad_norm": 8.821487426757812e-05,
98
- "learning_rate": 1.881730742721608e-05,
99
- "loss": 0.0,
100
- "step": 65
101
- },
102
- {
103
- "epoch": 0.2587800369685767,
104
- "grad_norm": 0.00016307830810546875,
105
- "learning_rate": 1.8494154576472976e-05,
106
- "loss": 0.0,
107
- "step": 70
108
- },
109
- {
110
- "epoch": 0.27726432532347506,
111
- "grad_norm": 9.441375732421875e-05,
112
- "learning_rate": 1.8135520702629677e-05,
113
- "loss": 0.0,
114
- "step": 75
115
- },
116
- {
117
- "epoch": 0.2957486136783734,
118
- "grad_norm": 0.0002689361572265625,
119
- "learning_rate": 1.7742903859041324e-05,
120
- "loss": 0.0,
121
- "step": 80
122
- },
123
- {
124
- "epoch": 0.3142329020332717,
125
- "grad_norm": 5.793571472167969e-05,
126
- "learning_rate": 1.7317944049686125e-05,
127
- "loss": 0.0,
128
- "step": 85
129
- },
130
- {
131
- "epoch": 0.33271719038817005,
132
- "grad_norm": 0.00018596649169921875,
133
- "learning_rate": 1.686241637868734e-05,
134
- "loss": 0.0,
135
- "step": 90
136
- },
137
- {
138
- "epoch": 0.3512014787430684,
139
- "grad_norm": 0.00012111663818359375,
140
- "learning_rate": 1.637822363550706e-05,
141
- "loss": 0.0,
142
- "step": 95
143
- },
144
- {
145
- "epoch": 0.36968576709796674,
146
- "grad_norm": 0.00013446807861328125,
147
- "learning_rate": 1.586738834678418e-05,
148
- "loss": 0.0,
149
- "step": 100
150
- },
151
- {
152
- "epoch": 0.38817005545286504,
153
- "grad_norm": 6.246566772460938e-05,
154
- "learning_rate": 1.5332044328016916e-05,
155
- "loss": 0.0,
156
- "step": 105
157
- },
158
- {
159
- "epoch": 0.4066543438077634,
160
- "grad_norm": 0.00013828277587890625,
161
- "learning_rate": 1.4774427770379492e-05,
162
- "loss": 0.0,
163
- "step": 110
164
- },
165
- {
166
- "epoch": 0.42513863216266173,
167
- "grad_norm": 0.0001544952392578125,
168
- "learning_rate": 1.4196867899904292e-05,
169
- "loss": 0.0,
170
- "step": 115
171
- },
172
- {
173
- "epoch": 0.4436229205175601,
174
- "grad_norm": 0.00011968612670898438,
175
- "learning_rate": 1.3601777248047105e-05,
176
- "loss": 0.0,
177
- "step": 120
178
- },
179
- {
180
- "epoch": 0.46210720887245843,
181
- "grad_norm": 6.532669067382812e-05,
182
- "learning_rate": 1.2991641574276419e-05,
183
- "loss": 0.0,
184
- "step": 125
185
- },
186
- {
187
- "epoch": 0.4805914972273567,
188
- "grad_norm": 9.965896606445312e-05,
189
- "learning_rate": 1.2369009482781191e-05,
190
- "loss": 0.0,
191
- "step": 130
192
- },
193
- {
194
- "epoch": 0.49907578558225507,
195
- "grad_norm": 5.698204040527344e-05,
196
- "learning_rate": 1.1736481776669307e-05,
197
- "loss": 0.0,
198
- "step": 135
199
- },
200
- {
201
- "epoch": 0.5175600739371534,
202
- "grad_norm": 4.124641418457031e-05,
203
- "learning_rate": 1.1096700594125318e-05,
204
- "loss": 0.0,
205
- "step": 140
206
- },
207
- {
208
- "epoch": 0.5360443622920518,
209
- "grad_norm": 9.5367431640625e-05,
210
- "learning_rate": 1.0452338371907065e-05,
211
- "loss": 0.0,
212
- "step": 145
213
- },
214
- {
215
- "epoch": 0.5545286506469501,
216
- "grad_norm": 5.698204040527344e-05,
217
- "learning_rate": 9.806086682281759e-06,
218
- "loss": 0.0,
219
- "step": 150
220
- },
221
- {
222
- "epoch": 0.5730129390018485,
223
- "grad_norm": 9.441375732421875e-05,
224
- "learning_rate": 9.160644990030932e-06,
225
- "loss": 0.0,
226
- "step": 155
227
- },
228
- {
229
- "epoch": 0.5914972273567468,
230
- "grad_norm": 0.00011777877807617188,
231
- "learning_rate": 8.518709376487515e-06,
232
- "loss": 0.0,
233
- "step": 160
234
- },
235
- {
236
- "epoch": 0.609981515711645,
237
- "grad_norm": 0.0001583099365234375,
238
- "learning_rate": 7.882961277705897e-06,
239
- "loss": 0.0,
240
- "step": 165
241
- },
242
- {
243
- "epoch": 0.6284658040665434,
244
- "grad_norm": 0.000141143798828125,
245
- "learning_rate": 7.256056283806987e-06,
246
- "loss": 0.0,
247
- "step": 170
248
- },
249
- {
250
- "epoch": 0.6469500924214417,
251
- "grad_norm": 0.0001697540283203125,
252
- "learning_rate": 6.640613046284581e-06,
253
- "loss": 0.0,
254
- "step": 175
255
- },
256
- {
257
- "epoch": 0.6654343807763401,
258
- "grad_norm": 0.000431060791015625,
259
- "learning_rate": 6.039202339608432e-06,
260
- "loss": 0.0,
261
- "step": 180
262
- },
263
- {
264
- "epoch": 0.6839186691312384,
265
- "grad_norm": 8.106231689453125e-05,
266
- "learning_rate": 5.454336322814995e-06,
267
- "loss": 0.0,
268
- "step": 185
269
- },
270
- {
271
- "epoch": 0.7024029574861368,
272
- "grad_norm": 9.918212890625e-05,
273
- "learning_rate": 4.888458045941269e-06,
274
- "loss": 0.0,
275
- "step": 190
276
- },
277
- {
278
- "epoch": 0.7208872458410351,
279
- "grad_norm": 4.76837158203125e-05,
280
- "learning_rate": 4.343931245134616e-06,
281
- "loss": 0.0,
282
- "step": 195
283
- },
284
- {
285
- "epoch": 0.7393715341959335,
286
- "grad_norm": 0.00012493133544921875,
287
- "learning_rate": 3.823030469065431e-06,
288
- "loss": 0.0,
289
- "step": 200
290
- },
291
- {
292
- "epoch": 0.7578558225508318,
293
- "grad_norm": 5.340576171875e-05,
294
- "learning_rate": 3.3279315778858034e-06,
295
- "loss": 0.0,
296
- "step": 205
297
- },
298
- {
299
- "epoch": 0.7763401109057301,
300
- "grad_norm": 7.05718994140625e-05,
301
- "learning_rate": 2.8607026544210115e-06,
302
- "loss": 0.0,
303
- "step": 210
304
- },
305
- {
306
- "epoch": 0.7948243992606284,
307
- "grad_norm": 0.0002117156982421875,
308
- "learning_rate": 2.423295365558821e-06,
309
- "loss": 0.0,
310
- "step": 215
311
- },
312
- {
313
- "epoch": 0.8133086876155268,
314
- "grad_norm": 4.38690185546875e-05,
315
- "learning_rate": 2.01753680992107e-06,
316
- "loss": 0.0,
317
- "step": 220
318
- },
319
- {
320
- "epoch": 0.8317929759704251,
321
- "grad_norm": 0.00010395050048828125,
322
- "learning_rate": 1.6451218858706374e-06,
323
- "loss": 0.0,
324
- "step": 225
325
- },
326
- {
327
- "epoch": 0.8502772643253235,
328
- "grad_norm": 0.00018978118896484375,
329
- "learning_rate": 1.307606211733522e-06,
330
- "loss": 0.0,
331
- "step": 230
332
- },
333
- {
334
- "epoch": 0.8687615526802218,
335
- "grad_norm": 5.53131103515625e-05,
336
- "learning_rate": 1.0063996278090704e-06,
337
- "loss": 0.0,
338
- "step": 235
339
- },
340
- {
341
- "epoch": 0.8872458410351202,
342
- "grad_norm": 6.389617919921875e-05,
343
- "learning_rate": 7.427603073110967e-07,
344
- "loss": 0.0,
345
- "step": 240
346
- },
347
- {
348
- "epoch": 0.9057301293900185,
349
- "grad_norm": 0.00011205673217773438,
350
- "learning_rate": 5.177895008392353e-07,
351
- "loss": 0.0,
352
- "step": 245
353
- },
354
- {
355
- "epoch": 0.9242144177449169,
356
- "grad_norm": 0.000293731689453125,
357
- "learning_rate": 3.3242693633337986e-07,
358
- "loss": 0.0,
359
- "step": 250
360
- },
361
- {
362
- "epoch": 0.9426987060998152,
363
- "grad_norm": 0.000179290771484375,
364
- "learning_rate": 1.874468937261531e-07,
365
- "loss": 0.0,
366
- "step": 255
367
- },
368
- {
369
- "epoch": 0.9611829944547134,
370
- "grad_norm": 0.002349853515625,
371
- "learning_rate": 8.345497068998897e-08,
372
- "loss": 0.0,
373
- "step": 260
374
- },
375
- {
376
- "epoch": 0.9796672828096118,
377
- "grad_norm": 6.437301635742188e-05,
378
- "learning_rate": 2.088555298867978e-08,
379
- "loss": 0.0,
380
- "step": 265
381
- },
382
- {
383
- "epoch": 0.9981515711645101,
384
- "grad_norm": 9.1552734375e-05,
385
- "learning_rate": 0.0,
386
- "loss": 0.0,
387
- "step": 270
388
- },
389
- {
390
- "epoch": 0.9981515711645101,
391
- "eval_loss": 1.3726342331210617e-05,
392
- "eval_runtime": 52.2001,
393
- "eval_samples_per_second": 18.601,
394
- "eval_steps_per_second": 2.337,
395
- "step": 270
396
- },
397
- {
398
- "epoch": 0.9981515711645101,
399
- "step": 270,
400
- "total_flos": 5.260333472022528e+16,
401
- "train_loss": 1.8035727003330572e-05,
402
- "train_runtime": 2689.0507,
403
- "train_samples_per_second": 3.217,
404
- "train_steps_per_second": 0.1
405
  }
406
  ],
407
  "logging_steps": 5,
408
- "max_steps": 270,
409
  "num_input_tokens_seen": 0,
410
- "num_train_epochs": 1,
411
  "save_steps": 500,
412
  "stateful_callbacks": {
413
  "TrainerControl": {
@@ -421,7 +35,7 @@
421
  "attributes": {}
422
  }
423
  },
424
- "total_flos": 5.260333472022528e+16,
425
  "train_batch_size": 8,
426
  "trial_name": null,
427
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0,
5
  "eval_steps": 500,
6
+ "global_step": 0,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0,
13
+ "step": 0,
14
+ "total_flos": 0,
15
+ "train_loss": 0.0,
16
+ "train_runtime": 4.808,
17
+ "train_samples_per_second": 0.0,
18
+ "train_steps_per_second": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  }
20
  ],
21
  "logging_steps": 5,
22
+ "max_steps": 0,
23
  "num_input_tokens_seen": 0,
24
+ "num_train_epochs": 0,
25
  "save_steps": 500,
26
  "stateful_callbacks": {
27
  "TrainerControl": {
 
35
  "attributes": {}
36
  }
37
  },
38
+ "total_flos": 0,
39
  "train_batch_size": 8,
40
  "trial_name": null,
41
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48db1adcd7121cc0952bac83481593034c426cd3d56b3e7a70068bc17b1bad10
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43a0172f6f69bee07075ae9f14129953a7c9c530502a5beeaa13f998fa945ac5
3
  size 5304