sfulay committed
Commit 41fe581
Parent: aa8e785

Model save

README.md CHANGED
@@ -4,6 +4,7 @@ base_model: alignment-handbook/zephyr-7b-sft-full
  tags:
  - trl
  - dpo
+ - alignment-handbook
  - generated_from_trainer
  model-index:
  - name: zephyr-7b-dpo-full-magpi-high-bleu-3-epochs
@@ -17,15 +18,15 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on an unknown dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.0049
- - Rewards/chosen: -1.6090
- - Rewards/rejected: -45.3616
+ - Loss: 0.0050
+ - Rewards/chosen: -1.4582
+ - Rewards/rejected: -44.8746
  - Rewards/accuracies: 0.9960
- - Rewards/margins: 43.7525
- - Logps/rejected: -5176.9458
- - Logps/chosen: -527.8860
- - Logits/rejected: -3.4269
- - Logits/chosen: -3.5551
+ - Rewards/margins: 43.4164
+ - Logps/rejected: -5128.2480
+ - Logps/chosen: -512.8050
+ - Logits/rejected: -3.4441
+ - Logits/chosen: -3.5504
 
  ## Model description
 
@@ -62,12 +63,12 @@ The following hyperparameters were used during training:
 
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
- | 0.0025 | 0.4739 | 50 | 0.0067 | -1.4593 | -36.8732 | 0.9940 | 35.4140 | -4328.1133 | -512.9101 | -3.0480 | -3.2146 |
- | 0.0035 | 0.9479 | 100 | 0.0053 | -1.9936 | -41.1186 | 0.9940 | 39.1250 | -4752.6543 | -566.3461 | -3.2382 | -3.4106 |
- | 0.0002 | 1.4218 | 150 | 0.0072 | -2.1152 | -46.0999 | 0.9940 | 43.9846 | -5250.7739 | -578.5039 | -3.5133 | -3.5737 |
- | 0.0 | 1.8957 | 200 | 0.0046 | -1.6298 | -44.8878 | 0.9960 | 43.2581 | -5129.5718 | -529.9568 | -3.3932 | -3.4839 |
- | 0.0 | 2.3697 | 250 | 0.0049 | -1.6103 | -45.2487 | 0.9960 | 43.6384 | -5165.6558 | -528.0136 | -3.4300 | -3.5565 |
- | 0.0 | 2.8436 | 300 | 0.0049 | -1.6090 | -45.3616 | 0.9960 | 43.7525 | -5176.9458 | -527.8860 | -3.4269 | -3.5551 |
+ | 0.0025 | 0.4739 | 50 | 0.0067 | -1.4454 | -37.1575 | 0.9940 | 35.7120 | -4356.5356 | -511.5262 | -3.0434 | -3.2142 |
+ | 0.0036 | 0.9479 | 100 | 0.0053 | -2.0303 | -41.2450 | 0.9940 | 39.2146 | -4765.2842 | -570.0164 | -3.2429 | -3.4104 |
+ | 0.0001 | 1.4218 | 150 | 0.0070 | -1.9459 | -45.2030 | 0.9940 | 43.2570 | -5161.0879 | -561.5757 | -3.5068 | -3.5867 |
+ | 0.0 | 1.8957 | 200 | 0.0047 | -1.4539 | -44.2686 | 0.9960 | 42.8147 | -5067.6450 | -512.3704 | -3.4229 | -3.5020 |
+ | 0.0 | 2.3697 | 250 | 0.0050 | -1.4525 | -44.7537 | 0.9960 | 43.3012 | -5116.1577 | -512.2269 | -3.4445 | -3.5510 |
+ | 0.0 | 2.8436 | 300 | 0.0050 | -1.4582 | -44.8746 | 0.9960 | 43.4164 | -5128.2480 | -512.8050 | -3.4441 | -3.5504 |
 
 
  ### Framework versions
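In the card's metric list, Rewards/margins is consistent with Rewards/chosen minus Rewards/rejected, which matches the usual TRL DPO reporting, so the updated headline numbers can be sanity-checked directly from the table. A minimal check against the final evaluation row above (values copied from the card; the per-sequence reward scaling itself is not recoverable from these summaries):

```python
# Sanity check: Rewards/margins == Rewards/chosen - Rewards/rejected
# Values copied from the final evaluation row of the updated card.
chosen, rejected, margins = -1.4582, -44.8746, 43.4164

assert abs((chosen - rejected) - margins) < 1e-4
print(f"margin = {chosen - rejected:.4f}")  # 43.4164
```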
all_results.json CHANGED
@@ -1,9 +1,22 @@
  {
  "epoch": 2.985781990521327,
+ "eval_logits/chosen": -3.554823398590088,
+ "eval_logits/rejected": -3.4271774291992188,
+ "eval_logps/chosen": -527.9577026367188,
+ "eval_logps/rejected": -5177.13427734375,
+ "eval_loss": 0.004887364339083433,
+ "eval_rewards/accuracies": 0.9959677457809448,
+ "eval_rewards/chosen": -1.6097602844238281,
+ "eval_rewards/margins": 43.75369644165039,
+ "eval_rewards/rejected": -45.363460540771484,
+ "eval_runtime": 195.0759,
+ "eval_samples": 3905,
+ "eval_samples_per_second": 20.018,
+ "eval_steps_per_second": 0.318,
  "total_flos": 0.0,
- "train_loss": 0.04085985715484842,
- "train_runtime": 9625.0686,
+ "train_loss": 0.04083177362173292,
+ "train_runtime": 9033.5209,
  "train_samples": 13500,
- "train_samples_per_second": 4.208,
- "train_steps_per_second": 0.033
+ "train_samples_per_second": 4.483,
+ "train_steps_per_second": 0.035
  }
eval_results.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "epoch": 2.985781990521327,
+ "eval_logits/chosen": -3.554823398590088,
+ "eval_logits/rejected": -3.4271774291992188,
+ "eval_logps/chosen": -527.9577026367188,
+ "eval_logps/rejected": -5177.13427734375,
+ "eval_loss": 0.004887364339083433,
+ "eval_rewards/accuracies": 0.9959677457809448,
+ "eval_rewards/chosen": -1.6097602844238281,
+ "eval_rewards/margins": 43.75369644165039,
+ "eval_rewards/rejected": -45.363460540771484,
+ "eval_runtime": 195.0759,
+ "eval_samples": 3905,
+ "eval_samples_per_second": 20.018,
+ "eval_steps_per_second": 0.318
+ }
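The new eval_results.json carries the same eval_* entries that were merged into all_results.json above, and both are plain JSON written next to the checkpoint, so they can be inspected without loading the model. A minimal sketch, assuming the two files sit in the current directory:

```python
import json

# Inspect the evaluation and aggregate training summaries saved with the checkpoint.
with open("eval_results.json") as f:
    eval_results = json.load(f)
with open("all_results.json") as f:
    all_results = json.load(f)

print(f"eval_loss:     {eval_results['eval_loss']:.6f}")
print(f"train_loss:    {all_results['train_loss']:.6f}")
print(f"reward margin: {eval_results['eval_rewards/margins']:.2f}")
```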
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d251c4ef2b4a1a2fbdef1160fc39c3103e5d5886d52070b7b539e89cb8da01f1
+ oid sha256:68975e1291478d1339fd59772dd3bda25df676f16a75793190e1e237d5b07cbf
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9218cc23d068a585d47810af43283eaf22c7531ec408d0a3f18ed1ac8fe9c327
+ oid sha256:cf9410e725afbeb11aa75a576814b569404c46920bef499dccdea40e07762d23
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f84601a6e02f549d17a9f429968fd5d2f9d509ab2440df0d7286b977c477ba52
+ oid sha256:d296637a91238fa45d093d1f90ca28ac5588378f56065bfa495eba3c173a2e75
  size 4540516344
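Only the LFS hashes change for the weight shards; the sizes are identical, and together the three pointers account for roughly 14.48 GB, which is what a model of about 7.24 billion parameters stored in 16-bit weights would occupy. A rough check, assuming 2 bytes per parameter:

```python
# Rough size check for the sharded safetensors weights listed above.
shard_bytes = [4943162336, 4999819336, 4540516344]
total = sum(shard_bytes)

print(f"total size:        {total / 1e9:.2f} GB")        # ~14.48 GB
print(f"params @ 2 bytes:  {total / 2 / 1e9:.2f} billion")  # ~7.24 billion
```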
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
  "epoch": 2.985781990521327,
  "total_flos": 0.0,
- "train_loss": 0.04085985715484842,
- "train_runtime": 9625.0686,
+ "train_loss": 0.04083177362173292,
+ "train_runtime": 9033.5209,
  "train_samples": 13500,
- "train_samples_per_second": 4.208,
- "train_steps_per_second": 0.033
+ "train_samples_per_second": 4.483,
+ "train_steps_per_second": 0.035
  }
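The updated throughput figures are internally consistent with the shorter runtime: 13,500 training samples over 3 epochs in 9,033.5 seconds gives about 4.48 samples/s, and the 315 optimizer steps recorded at the end of trainer_state.json give about 0.035 steps/s. A quick check, assuming 3 epochs as the model name suggests:

```python
# Consistency check for the updated train_results.json values.
train_samples, num_epochs, train_runtime, total_steps = 13500, 3, 9033.5209, 315

print(f"samples/s: {train_samples * num_epochs / train_runtime:.3f}")  # ~4.483
print(f"steps/s:   {total_steps / train_runtime:.3f}")                 # ~0.035
```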
trainer_state.json CHANGED
@@ -10,573 +10,573 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0947867298578199,
13
- "grad_norm": 56.808507663630365,
14
  "learning_rate": 1.5624999999999999e-07,
15
- "logits/chosen": -2.8022360801696777,
16
- "logits/rejected": -2.6993465423583984,
17
- "logps/chosen": -354.06646728515625,
18
- "logps/rejected": -648.7581787109375,
19
- "loss": 0.6845,
20
- "rewards/accuracies": 0.625,
21
- "rewards/chosen": 0.003776966128498316,
22
- "rewards/margins": 0.015949796885252,
23
- "rewards/rejected": -0.012172830291092396,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.1895734597156398,
28
- "grad_norm": 16.92917194646474,
29
  "learning_rate": 3.1249999999999997e-07,
30
- "logits/chosen": -2.845165729522705,
31
- "logits/rejected": -2.729945421218872,
32
- "logps/chosen": -361.77703857421875,
33
- "logps/rejected": -731.990234375,
34
- "loss": 0.449,
35
  "rewards/accuracies": 1.0,
36
- "rewards/chosen": 0.11443690210580826,
37
- "rewards/margins": 0.7462291717529297,
38
- "rewards/rejected": -0.6317921876907349,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.2843601895734597,
43
- "grad_norm": 2.8859667385666943,
44
  "learning_rate": 4.6874999999999996e-07,
45
- "logits/chosen": -2.9244141578674316,
46
- "logits/rejected": -2.7600600719451904,
47
- "logps/chosen": -344.0387268066406,
48
- "logps/rejected": -1062.6480712890625,
49
- "loss": 0.1129,
50
  "rewards/accuracies": 1.0,
51
- "rewards/chosen": 0.4156733453273773,
52
- "rewards/margins": 4.948118686676025,
53
- "rewards/rejected": -4.532444953918457,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 0.3791469194312796,
58
- "grad_norm": 0.7962774018104511,
59
  "learning_rate": 4.990147841143461e-07,
60
- "logits/chosen": -2.992558002471924,
61
- "logits/rejected": -2.855602264404297,
62
- "logps/chosen": -369.517822265625,
63
- "logps/rejected": -2512.892822265625,
64
  "loss": 0.0145,
65
  "rewards/accuracies": 1.0,
66
- "rewards/chosen": -0.17123910784721375,
67
- "rewards/margins": 18.776042938232422,
68
- "rewards/rejected": -18.947282791137695,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 0.47393364928909953,
73
- "grad_norm": 0.08077589023709618,
74
  "learning_rate": 4.950256493879794e-07,
75
- "logits/chosen": -3.1441614627838135,
76
- "logits/rejected": -3.0702505111694336,
77
- "logps/chosen": -446.707763671875,
78
- "logps/rejected": -3806.796875,
79
  "loss": 0.0025,
80
  "rewards/accuracies": 1.0,
81
- "rewards/chosen": -0.8762685656547546,
82
- "rewards/margins": 31.019947052001953,
83
- "rewards/rejected": -31.896215438842773,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 0.47393364928909953,
88
- "eval_logits/chosen": -3.2145538330078125,
89
- "eval_logits/rejected": -3.048049211502075,
90
- "eval_logps/chosen": -512.9100952148438,
91
- "eval_logps/rejected": -4328.11328125,
92
- "eval_loss": 0.006741416174918413,
93
  "eval_rewards/accuracies": 0.9939516186714172,
94
- "eval_rewards/chosen": -1.4592840671539307,
95
- "eval_rewards/margins": 35.413963317871094,
96
- "eval_rewards/rejected": -36.87324523925781,
97
- "eval_runtime": 193.9438,
98
- "eval_samples_per_second": 20.135,
99
- "eval_steps_per_second": 0.32,
100
  "step": 50
101
  },
102
  {
103
  "epoch": 0.5687203791469194,
104
- "grad_norm": 0.08718232747940217,
105
  "learning_rate": 4.88020090697132e-07,
106
- "logits/chosen": -3.2784461975097656,
107
- "logits/rejected": -3.1442923545837402,
108
- "logps/chosen": -567.5687255859375,
109
- "logps/rejected": -4666.0283203125,
110
- "loss": 0.0042,
111
  "rewards/accuracies": 0.9937499761581421,
112
- "rewards/chosen": -1.9609228372573853,
113
- "rewards/margins": 38.35578536987305,
114
- "rewards/rejected": -40.31671142578125,
115
  "step": 60
116
  },
117
  {
118
  "epoch": 0.6635071090047393,
119
- "grad_norm": 1.5224160707137697,
120
  "learning_rate": 4.780843509929904e-07,
121
- "logits/chosen": -3.293713331222534,
122
- "logits/rejected": -3.0942394733428955,
123
- "logps/chosen": -608.1524047851562,
124
- "logps/rejected": -4869.34912109375,
125
  "loss": 0.0021,
126
  "rewards/accuracies": 1.0,
127
- "rewards/chosen": -2.6836161613464355,
128
- "rewards/margins": 39.84333419799805,
129
- "rewards/rejected": -42.52695083618164,
130
  "step": 70
131
  },
132
  {
133
  "epoch": 0.7582938388625592,
134
- "grad_norm": 0.19554010480864858,
135
  "learning_rate": 4.6534074564712217e-07,
136
- "logits/chosen": -3.4218146800994873,
137
- "logits/rejected": -3.29943585395813,
138
- "logps/chosen": -602.691650390625,
139
- "logps/rejected": -5427.1826171875,
140
- "loss": 0.0004,
141
  "rewards/accuracies": 1.0,
142
- "rewards/chosen": -2.5821824073791504,
143
- "rewards/margins": 45.447879791259766,
144
- "rewards/rejected": -48.030067443847656,
145
  "step": 80
146
  },
147
  {
148
  "epoch": 0.8530805687203792,
149
- "grad_norm": 2.1541635120991005,
150
  "learning_rate": 4.4994615667026846e-07,
151
- "logits/chosen": -3.4852542877197266,
152
- "logits/rejected": -3.398355484008789,
153
- "logps/chosen": -631.3218994140625,
154
- "logps/rejected": -5290.6767578125,
155
  "loss": 0.0018,
156
  "rewards/accuracies": 1.0,
157
- "rewards/chosen": -2.7490692138671875,
158
- "rewards/margins": 43.93152618408203,
159
- "rewards/rejected": -46.68059539794922,
160
  "step": 90
161
  },
162
  {
163
  "epoch": 0.9478672985781991,
164
- "grad_norm": 2.8671974784822956,
165
  "learning_rate": 4.320901013934887e-07,
166
- "logits/chosen": -3.422508716583252,
167
- "logits/rejected": -3.361572742462158,
168
- "logps/chosen": -554.526123046875,
169
- "logps/rejected": -4800.9462890625,
170
- "loss": 0.0035,
171
  "rewards/accuracies": 1.0,
172
- "rewards/chosen": -2.072134494781494,
173
- "rewards/margins": 39.7642936706543,
174
- "rewards/rejected": -41.836429595947266,
175
  "step": 100
176
  },
177
  {
178
  "epoch": 0.9478672985781991,
179
- "eval_logits/chosen": -3.410635232925415,
180
- "eval_logits/rejected": -3.2381887435913086,
181
- "eval_logps/chosen": -566.3461303710938,
182
- "eval_logps/rejected": -4752.654296875,
183
- "eval_loss": 0.005255497060716152,
184
  "eval_rewards/accuracies": 0.9939516186714172,
185
- "eval_rewards/chosen": -1.9936442375183105,
186
- "eval_rewards/margins": 39.12500762939453,
187
- "eval_rewards/rejected": -41.118648529052734,
188
- "eval_runtime": 193.1039,
189
- "eval_samples_per_second": 20.222,
190
- "eval_steps_per_second": 0.321,
191
  "step": 100
192
  },
193
  {
194
  "epoch": 1.042654028436019,
195
- "grad_norm": 1.2561873368154433,
196
  "learning_rate": 4.119923993874379e-07,
197
- "logits/chosen": -3.4638073444366455,
198
- "logits/rejected": -3.4071457386016846,
199
- "logps/chosen": -544.2169189453125,
200
- "logps/rejected": -5126.4560546875,
201
  "loss": 0.0006,
202
  "rewards/accuracies": 1.0,
203
- "rewards/chosen": -1.89871084690094,
204
- "rewards/margins": 42.90679168701172,
205
- "rewards/rejected": -44.805503845214844,
206
  "step": 110
207
  },
208
  {
209
  "epoch": 1.1374407582938388,
210
- "grad_norm": 0.9786979825165962,
211
  "learning_rate": 3.899004663415083e-07,
212
- "logits/chosen": -3.452301502227783,
213
- "logits/rejected": -3.3307366371154785,
214
- "logps/chosen": -527.18212890625,
215
- "logps/rejected": -5159.8701171875,
216
  "loss": 0.0005,
217
  "rewards/accuracies": 1.0,
218
- "rewards/chosen": -1.735828161239624,
219
- "rewards/margins": 43.46116256713867,
220
- "rewards/rejected": -45.196990966796875,
221
  "step": 120
222
  },
223
  {
224
  "epoch": 1.2322274881516588,
225
- "grad_norm": 0.04433484422705367,
226
  "learning_rate": 3.6608626821692824e-07,
227
- "logits/chosen": -3.4930477142333984,
228
- "logits/rejected": -3.4833171367645264,
229
- "logps/chosen": -507.93817138671875,
230
- "logps/rejected": -5810.45654296875,
231
  "loss": 0.0012,
232
  "rewards/accuracies": 1.0,
233
- "rewards/chosen": -1.616611123085022,
234
- "rewards/margins": 49.649314880371094,
235
- "rewards/rejected": -51.26592254638672,
236
  "step": 130
237
  },
238
  {
239
  "epoch": 1.3270142180094786,
240
- "grad_norm": 0.0001033743847851317,
241
  "learning_rate": 3.408429731701635e-07,
242
- "logits/chosen": -3.602503538131714,
243
- "logits/rejected": -3.6203441619873047,
244
- "logps/chosen": -699.213623046875,
245
- "logps/rejected": -5553.19384765625,
246
  "loss": 0.0001,
247
  "rewards/accuracies": 1.0,
248
- "rewards/chosen": -3.502641201019287,
249
- "rewards/margins": 45.56864547729492,
250
- "rewards/rejected": -49.07128143310547,
251
  "step": 140
252
  },
253
  {
254
  "epoch": 1.4218009478672986,
255
- "grad_norm": 0.001084875953335538,
256
  "learning_rate": 3.144813424636031e-07,
257
- "logits/chosen": -3.7745907306671143,
258
- "logits/rejected": -3.7177462577819824,
259
- "logps/chosen": -846.7849731445312,
260
- "logps/rejected": -5827.8447265625,
261
- "loss": 0.0002,
262
  "rewards/accuracies": 1.0,
263
- "rewards/chosen": -4.706751823425293,
264
- "rewards/margins": 47.121307373046875,
265
- "rewards/rejected": -51.82805633544922,
266
  "step": 150
267
  },
268
  {
269
  "epoch": 1.4218009478672986,
270
- "eval_logits/chosen": -3.573683977127075,
271
- "eval_logits/rejected": -3.5132687091827393,
272
- "eval_logps/chosen": -578.50390625,
273
- "eval_logps/rejected": -5250.77392578125,
274
- "eval_loss": 0.007219326216727495,
275
  "eval_rewards/accuracies": 0.9939516186714172,
276
- "eval_rewards/chosen": -2.1152215003967285,
277
- "eval_rewards/margins": 43.9846305847168,
278
- "eval_rewards/rejected": -46.099853515625,
279
- "eval_runtime": 192.0822,
280
- "eval_samples_per_second": 20.33,
281
  "eval_steps_per_second": 0.323,
282
  "step": 150
283
  },
284
  {
285
  "epoch": 1.5165876777251186,
286
- "grad_norm": 0.00020050840378569657,
287
  "learning_rate": 2.8732590479375165e-07,
288
- "logits/chosen": -3.562108278274536,
289
- "logits/rejected": -3.6045074462890625,
290
- "logps/chosen": -542.7542724609375,
291
- "logps/rejected": -5262.7451171875,
292
  "loss": 0.0003,
293
  "rewards/accuracies": 1.0,
294
- "rewards/chosen": -1.8957535028457642,
295
- "rewards/margins": 44.86753463745117,
296
- "rewards/rejected": -46.76329040527344,
297
  "step": 160
298
  },
299
  {
300
  "epoch": 1.6113744075829384,
301
- "grad_norm": 0.0014969345497476303,
302
  "learning_rate": 2.597109611334169e-07,
303
- "logits/chosen": -3.5832862854003906,
304
- "logits/rejected": -3.668252944946289,
305
- "logps/chosen": -530.57373046875,
306
- "logps/rejected": -5514.72509765625,
307
- "loss": 0.0002,
308
  "rewards/accuracies": 1.0,
309
- "rewards/chosen": -1.7220637798309326,
310
- "rewards/margins": 47.27531814575195,
311
- "rewards/rejected": -48.99738311767578,
312
  "step": 170
313
  },
314
  {
315
  "epoch": 1.7061611374407581,
316
- "grad_norm": 0.012284347539022291,
317
  "learning_rate": 2.3197646927086694e-07,
318
- "logits/chosen": -3.544964551925659,
319
- "logits/rejected": -3.628335952758789,
320
- "logps/chosen": -555.7106323242188,
321
- "logps/rejected": -5537.78515625,
322
- "loss": 0.0082,
323
  "rewards/accuracies": 1.0,
324
- "rewards/chosen": -1.8591806888580322,
325
- "rewards/margins": 47.509742736816406,
326
- "rewards/rejected": -49.36892318725586,
327
  "step": 180
328
  },
329
  {
330
  "epoch": 1.8009478672985781,
331
- "grad_norm": 0.0019233280418011931,
332
  "learning_rate": 2.0446385870993467e-07,
333
- "logits/chosen": -3.525050640106201,
334
- "logits/rejected": -3.5435047149658203,
335
- "logps/chosen": -543.8511352539062,
336
- "logps/rejected": -5155.20263671875,
337
  "loss": 0.0,
338
  "rewards/accuracies": 1.0,
339
- "rewards/chosen": -1.6305545568466187,
340
- "rewards/margins": 43.96278381347656,
341
- "rewards/rejected": -45.59334182739258,
342
  "step": 190
343
  },
344
  {
345
  "epoch": 1.8957345971563981,
346
- "grad_norm": 0.022433128971849518,
347
  "learning_rate": 1.775118274523545e-07,
348
- "logits/chosen": -3.5229125022888184,
349
- "logits/rejected": -3.511779308319092,
350
- "logps/chosen": -507.39569091796875,
351
- "logps/rejected": -5092.2197265625,
352
  "loss": 0.0,
353
  "rewards/accuracies": 1.0,
354
- "rewards/chosen": -1.549849510192871,
355
- "rewards/margins": 43.13224411010742,
356
- "rewards/rejected": -44.682090759277344,
357
  "step": 200
358
  },
359
  {
360
  "epoch": 1.8957345971563981,
361
- "eval_logits/chosen": -3.4838595390319824,
362
- "eval_logits/rejected": -3.393195152282715,
363
- "eval_logps/chosen": -529.9568481445312,
364
- "eval_logps/rejected": -5129.57177734375,
365
- "eval_loss": 0.004649566486477852,
366
  "eval_rewards/accuracies": 0.9959677457809448,
367
- "eval_rewards/chosen": -1.6297515630722046,
368
- "eval_rewards/margins": 43.258079528808594,
369
- "eval_rewards/rejected": -44.88783264160156,
370
- "eval_runtime": 193.4054,
371
- "eval_samples_per_second": 20.191,
372
- "eval_steps_per_second": 0.321,
373
  "step": 200
374
  },
375
  {
376
  "epoch": 1.9905213270142181,
377
- "grad_norm": 1.2842516744764987,
378
  "learning_rate": 1.514521724066537e-07,
379
- "logits/chosen": -3.542708158493042,
380
- "logits/rejected": -3.5746352672576904,
381
- "logps/chosen": -553.3961181640625,
382
- "logps/rejected": -5135.65087890625,
383
  "loss": 0.0002,
384
  "rewards/accuracies": 1.0,
385
- "rewards/chosen": -1.7059533596038818,
386
- "rewards/margins": 43.47618865966797,
387
- "rewards/rejected": -45.18214797973633,
388
  "step": 210
389
  },
390
  {
391
  "epoch": 2.085308056872038,
392
- "grad_norm": 0.02286366102698996,
393
  "learning_rate": 1.266057047539568e-07,
394
- "logits/chosen": -3.5124752521514893,
395
- "logits/rejected": -3.5435707569122314,
396
- "logps/chosen": -497.9325256347656,
397
- "logps/rejected": -5323.30126953125,
398
  "loss": 0.0,
399
  "rewards/accuracies": 1.0,
400
- "rewards/chosen": -1.6141698360443115,
401
- "rewards/margins": 45.36501693725586,
402
- "rewards/rejected": -46.97918701171875,
403
  "step": 220
404
  },
405
  {
406
  "epoch": 2.1800947867298577,
407
- "grad_norm": 0.000533688233348108,
408
  "learning_rate": 1.032783005551884e-07,
409
- "logits/chosen": -3.5625851154327393,
410
- "logits/rejected": -3.5691921710968018,
411
- "logps/chosen": -492.0570373535156,
412
- "logps/rejected": -4921.8291015625,
413
  "loss": 0.0,
414
  "rewards/accuracies": 1.0,
415
- "rewards/chosen": -1.4992796182632446,
416
- "rewards/margins": 41.6370735168457,
417
- "rewards/rejected": -43.1363525390625,
418
  "step": 230
419
  },
420
  {
421
  "epoch": 2.2748815165876777,
422
- "grad_norm": 0.006381253133428834,
423
  "learning_rate": 8.175713521924976e-08,
424
- "logits/chosen": -3.5869510173797607,
425
- "logits/rejected": -3.523674726486206,
426
- "logps/chosen": -516.2166748046875,
427
- "logps/rejected": -5140.20703125,
428
  "loss": 0.0,
429
  "rewards/accuracies": 1.0,
430
- "rewards/chosen": -1.6389557123184204,
431
- "rewards/margins": 43.762550354003906,
432
- "rewards/rejected": -45.40150833129883,
433
  "step": 240
434
  },
435
  {
436
  "epoch": 2.3696682464454977,
437
- "grad_norm": 0.000733038849174391,
438
  "learning_rate": 6.230714818829733e-08,
439
- "logits/chosen": -3.5448238849639893,
440
- "logits/rejected": -3.516840696334839,
441
- "logps/chosen": -503.037841796875,
442
- "logps/rejected": -5467.2509765625,
443
  "loss": 0.0,
444
  "rewards/accuracies": 1.0,
445
- "rewards/chosen": -1.5125716924667358,
446
- "rewards/margins": 47.35405731201172,
447
- "rewards/rejected": -48.86663055419922,
448
  "step": 250
449
  },
450
  {
451
  "epoch": 2.3696682464454977,
452
- "eval_logits/chosen": -3.556544065475464,
453
- "eval_logits/rejected": -3.430008888244629,
454
- "eval_logps/chosen": -528.0136108398438,
455
- "eval_logps/rejected": -5165.65576171875,
456
- "eval_loss": 0.004941246937960386,
457
  "eval_rewards/accuracies": 0.9959677457809448,
458
- "eval_rewards/chosen": -1.6103183031082153,
459
- "eval_rewards/margins": 43.63835144042969,
460
- "eval_rewards/rejected": -45.24867248535156,
461
- "eval_runtime": 191.8373,
462
- "eval_samples_per_second": 20.356,
463
  "eval_steps_per_second": 0.323,
464
  "step": 250
465
  },
466
  {
467
  "epoch": 2.4644549763033177,
468
- "grad_norm": 0.004934692810403007,
469
  "learning_rate": 4.516778136213037e-08,
470
- "logits/chosen": -3.5642166137695312,
471
- "logits/rejected": -3.541161298751831,
472
- "logps/chosen": -490.9266662597656,
473
- "logps/rejected": -5269.9482421875,
474
  "loss": 0.0,
475
  "rewards/accuracies": 1.0,
476
- "rewards/chosen": -1.523624300956726,
477
- "rewards/margins": 45.255531311035156,
478
- "rewards/rejected": -46.77915573120117,
479
  "step": 260
480
  },
481
  {
482
  "epoch": 2.5592417061611377,
483
- "grad_norm": 0.0159993223919006,
484
  "learning_rate": 3.055003141378948e-08,
485
- "logits/chosen": -3.5474228858947754,
486
- "logits/rejected": -3.5533015727996826,
487
- "logps/chosen": -518.9056396484375,
488
- "logps/rejected": -5904.6611328125,
489
  "loss": 0.0,
490
  "rewards/accuracies": 1.0,
491
- "rewards/chosen": -1.4330569505691528,
492
- "rewards/margins": 51.321136474609375,
493
- "rewards/rejected": -52.75419998168945,
494
  "step": 270
495
  },
496
  {
497
  "epoch": 2.654028436018957,
498
- "grad_norm": 0.008470858031121525,
499
  "learning_rate": 1.8633852284264508e-08,
500
- "logits/chosen": -3.55708646774292,
501
- "logits/rejected": -3.5458106994628906,
502
- "logps/chosen": -536.460205078125,
503
- "logps/rejected": -5575.57177734375,
504
  "loss": 0.0,
505
  "rewards/accuracies": 1.0,
506
- "rewards/chosen": -1.506127119064331,
507
- "rewards/margins": 47.6081657409668,
508
- "rewards/rejected": -49.11429214477539,
509
  "step": 280
510
  },
511
  {
512
  "epoch": 2.748815165876777,
513
- "grad_norm": 5.3150365717798084e-05,
514
  "learning_rate": 9.56593983327919e-09,
515
- "logits/chosen": -3.5954432487487793,
516
- "logits/rejected": -3.5634117126464844,
517
- "logps/chosen": -543.16455078125,
518
- "logps/rejected": -5410.59375,
519
  "loss": 0.0,
520
  "rewards/accuracies": 1.0,
521
- "rewards/chosen": -1.641262412071228,
522
- "rewards/margins": 46.54301071166992,
523
- "rewards/rejected": -48.18427276611328,
524
  "step": 290
525
  },
526
  {
527
  "epoch": 2.843601895734597,
528
- "grad_norm": 0.0011549345423847704,
529
  "learning_rate": 3.4579259185321398e-09,
530
- "logits/chosen": -3.5742130279541016,
531
- "logits/rejected": -3.561877727508545,
532
- "logps/chosen": -529.9727783203125,
533
- "logps/rejected": -5293.2900390625,
534
  "loss": 0.0,
535
  "rewards/accuracies": 1.0,
536
- "rewards/chosen": -1.5660574436187744,
537
- "rewards/margins": 45.59223175048828,
538
- "rewards/rejected": -47.15829849243164,
539
  "step": 300
540
  },
541
  {
542
  "epoch": 2.843601895734597,
543
- "eval_logits/chosen": -3.555126428604126,
544
- "eval_logits/rejected": -3.426910161972046,
545
- "eval_logps/chosen": -527.8860473632812,
546
- "eval_logps/rejected": -5176.94580078125,
547
- "eval_loss": 0.004882320296019316,
548
  "eval_rewards/accuracies": 0.9959677457809448,
549
- "eval_rewards/chosen": -1.6090432405471802,
550
- "eval_rewards/margins": 43.75252914428711,
551
- "eval_rewards/rejected": -45.361572265625,
552
- "eval_runtime": 192.0885,
553
- "eval_samples_per_second": 20.329,
554
- "eval_steps_per_second": 0.323,
555
  "step": 300
556
  },
557
  {
558
  "epoch": 2.938388625592417,
559
- "grad_norm": 0.0003810246486281626,
560
  "learning_rate": 3.850041354441502e-10,
561
- "logits/chosen": -3.588923931121826,
562
- "logits/rejected": -3.5137507915496826,
563
- "logps/chosen": -526.1129150390625,
564
- "logps/rejected": -4852.4716796875,
565
  "loss": 0.0,
566
  "rewards/accuracies": 1.0,
567
- "rewards/chosen": -1.6466729640960693,
568
- "rewards/margins": 41.060447692871094,
569
- "rewards/rejected": -42.70712661743164,
570
  "step": 310
571
  },
572
  {
573
  "epoch": 2.985781990521327,
574
  "step": 315,
575
  "total_flos": 0.0,
576
- "train_loss": 0.04085985715484842,
577
- "train_runtime": 9625.0686,
578
- "train_samples_per_second": 4.208,
579
- "train_steps_per_second": 0.033
580
  }
581
  ],
582
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0947867298578199,
13
+ "grad_norm": 56.951628924108704,
14
  "learning_rate": 1.5624999999999999e-07,
15
+ "logits/chosen": -2.8022689819335938,
16
+ "logits/rejected": -2.699367046356201,
17
+ "logps/chosen": -354.14007568359375,
18
+ "logps/rejected": -648.7852783203125,
19
+ "loss": 0.6846,
20
+ "rewards/accuracies": 0.637499988079071,
21
+ "rewards/chosen": 0.0030409712344408035,
22
+ "rewards/margins": 0.015484926290810108,
23
+ "rewards/rejected": -0.01244395412504673,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.1895734597156398,
28
+ "grad_norm": 16.911922497415656,
29
  "learning_rate": 3.1249999999999997e-07,
30
+ "logits/chosen": -2.8449482917785645,
31
+ "logits/rejected": -2.7297720909118652,
32
+ "logps/chosen": -361.7726135253906,
33
+ "logps/rejected": -731.9713134765625,
34
+ "loss": 0.4488,
35
  "rewards/accuracies": 1.0,
36
+ "rewards/chosen": 0.11448182910680771,
37
+ "rewards/margins": 0.7460837364196777,
38
+ "rewards/rejected": -0.6316019892692566,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.2843601895734597,
43
+ "grad_norm": 2.8879981399804886,
44
  "learning_rate": 4.6874999999999996e-07,
45
+ "logits/chosen": -2.924880027770996,
46
+ "logits/rejected": -2.7608063220977783,
47
+ "logps/chosen": -344.0640869140625,
48
+ "logps/rejected": -1062.529541015625,
49
+ "loss": 0.1128,
50
  "rewards/accuracies": 1.0,
51
+ "rewards/chosen": 0.4154191017150879,
52
+ "rewards/margins": 4.946678638458252,
53
+ "rewards/rejected": -4.531259536743164,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 0.3791469194312796,
58
+ "grad_norm": 0.7967945507055681,
59
  "learning_rate": 4.990147841143461e-07,
60
+ "logits/chosen": -2.9928297996520996,
61
+ "logits/rejected": -2.858860969543457,
62
+ "logps/chosen": -369.7523193359375,
63
+ "logps/rejected": -2523.788818359375,
64
  "loss": 0.0145,
65
  "rewards/accuracies": 1.0,
66
+ "rewards/chosen": -0.17358417809009552,
67
+ "rewards/margins": 18.882659912109375,
68
+ "rewards/rejected": -19.056243896484375,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 0.47393364928909953,
73
+ "grad_norm": 0.08242657747458541,
74
  "learning_rate": 4.950256493879794e-07,
75
+ "logits/chosen": -3.1458115577697754,
76
+ "logits/rejected": -3.068504810333252,
77
+ "logps/chosen": -445.88641357421875,
78
+ "logps/rejected": -3839.385498046875,
79
  "loss": 0.0025,
80
  "rewards/accuracies": 1.0,
81
+ "rewards/chosen": -0.8680551648139954,
82
+ "rewards/margins": 31.354045867919922,
83
+ "rewards/rejected": -32.22209930419922,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 0.47393364928909953,
88
+ "eval_logits/chosen": -3.214230537414551,
89
+ "eval_logits/rejected": -3.0434162616729736,
90
+ "eval_logps/chosen": -511.5262451171875,
91
+ "eval_logps/rejected": -4356.53564453125,
92
+ "eval_loss": 0.006651720497757196,
93
  "eval_rewards/accuracies": 0.9939516186714172,
94
+ "eval_rewards/chosen": -1.4454454183578491,
95
+ "eval_rewards/margins": 35.71202850341797,
96
+ "eval_rewards/rejected": -37.157470703125,
97
+ "eval_runtime": 194.5294,
98
+ "eval_samples_per_second": 20.074,
99
+ "eval_steps_per_second": 0.319,
100
  "step": 50
101
  },
102
  {
103
  "epoch": 0.5687203791469194,
104
+ "grad_norm": 0.08904936739654302,
105
  "learning_rate": 4.88020090697132e-07,
106
+ "logits/chosen": -3.2791202068328857,
107
+ "logits/rejected": -3.141754150390625,
108
+ "logps/chosen": -564.9468383789062,
109
+ "logps/rejected": -4684.3271484375,
110
+ "loss": 0.004,
111
  "rewards/accuracies": 0.9937499761581421,
112
+ "rewards/chosen": -1.9347045421600342,
113
+ "rewards/margins": 38.56499099731445,
114
+ "rewards/rejected": -40.49969482421875,
115
  "step": 60
116
  },
117
  {
118
  "epoch": 0.6635071090047393,
119
+ "grad_norm": 1.511268095124282,
120
  "learning_rate": 4.780843509929904e-07,
121
+ "logits/chosen": -3.2914862632751465,
122
+ "logits/rejected": -3.0883309841156006,
123
+ "logps/chosen": -603.4210205078125,
124
+ "logps/rejected": -4877.28662109375,
125
  "loss": 0.0021,
126
  "rewards/accuracies": 1.0,
127
+ "rewards/chosen": -2.6363024711608887,
128
+ "rewards/margins": 39.97002410888672,
129
+ "rewards/rejected": -42.606327056884766,
130
  "step": 70
131
  },
132
  {
133
  "epoch": 0.7582938388625592,
134
+ "grad_norm": 0.22202350824430725,
135
  "learning_rate": 4.6534074564712217e-07,
136
+ "logits/chosen": -3.417383909225464,
137
+ "logits/rejected": -3.290362596511841,
138
+ "logps/chosen": -600.4118041992188,
139
+ "logps/rejected": -5436.11376953125,
140
+ "loss": 0.0005,
141
  "rewards/accuracies": 1.0,
142
+ "rewards/chosen": -2.5593833923339844,
143
+ "rewards/margins": 45.55999755859375,
144
+ "rewards/rejected": -48.11937713623047,
145
  "step": 80
146
  },
147
  {
148
  "epoch": 0.8530805687203792,
149
+ "grad_norm": 2.0861019684034874,
150
  "learning_rate": 4.4994615667026846e-07,
151
+ "logits/chosen": -3.4805240631103516,
152
+ "logits/rejected": -3.3906772136688232,
153
+ "logps/chosen": -624.0176391601562,
154
+ "logps/rejected": -5296.82275390625,
155
  "loss": 0.0018,
156
  "rewards/accuracies": 1.0,
157
+ "rewards/chosen": -2.676025867462158,
158
+ "rewards/margins": 44.0660285949707,
159
+ "rewards/rejected": -46.7420539855957,
160
  "step": 90
161
  },
162
  {
163
  "epoch": 0.9478672985781991,
164
+ "grad_norm": 2.8965011668216905,
165
  "learning_rate": 4.320901013934887e-07,
166
+ "logits/chosen": -3.4210407733917236,
167
+ "logits/rejected": -3.3643829822540283,
168
+ "logps/chosen": -556.0076904296875,
169
+ "logps/rejected": -4813.1806640625,
170
+ "loss": 0.0036,
171
  "rewards/accuracies": 1.0,
172
+ "rewards/chosen": -2.0869507789611816,
173
+ "rewards/margins": 39.87181854248047,
174
+ "rewards/rejected": -41.95877456665039,
175
  "step": 100
176
  },
177
  {
178
  "epoch": 0.9478672985781991,
179
+ "eval_logits/chosen": -3.4104061126708984,
180
+ "eval_logits/rejected": -3.2429261207580566,
181
+ "eval_logps/chosen": -570.0164184570312,
182
+ "eval_logps/rejected": -4765.2841796875,
183
+ "eval_loss": 0.0052900416776537895,
184
  "eval_rewards/accuracies": 0.9939516186714172,
185
+ "eval_rewards/chosen": -2.0303473472595215,
186
+ "eval_rewards/margins": 39.21460723876953,
187
+ "eval_rewards/rejected": -41.24495315551758,
188
+ "eval_runtime": 192.2337,
189
+ "eval_samples_per_second": 20.314,
190
+ "eval_steps_per_second": 0.323,
191
  "step": 100
192
  },
193
  {
194
  "epoch": 1.042654028436019,
195
+ "grad_norm": 1.2489542878599509,
196
  "learning_rate": 4.119923993874379e-07,
197
+ "logits/chosen": -3.4639148712158203,
198
+ "logits/rejected": -3.4126315116882324,
199
+ "logps/chosen": -549.92138671875,
200
+ "logps/rejected": -5150.29638671875,
201
  "loss": 0.0006,
202
  "rewards/accuracies": 1.0,
203
+ "rewards/chosen": -1.9557552337646484,
204
+ "rewards/margins": 43.08815002441406,
205
+ "rewards/rejected": -45.04390335083008,
206
  "step": 110
207
  },
208
  {
209
  "epoch": 1.1374407582938388,
210
+ "grad_norm": 0.919711694376481,
211
  "learning_rate": 3.899004663415083e-07,
212
+ "logits/chosen": -3.455725908279419,
213
+ "logits/rejected": -3.3397490978240967,
214
+ "logps/chosen": -534.6444702148438,
215
+ "logps/rejected": -5193.822265625,
216
  "loss": 0.0005,
217
  "rewards/accuracies": 1.0,
218
+ "rewards/chosen": -1.8104517459869385,
219
+ "rewards/margins": 43.72606658935547,
220
+ "rewards/rejected": -45.53651809692383,
221
  "step": 120
222
  },
223
  {
224
  "epoch": 1.2322274881516588,
225
+ "grad_norm": 0.03772744312797018,
226
  "learning_rate": 3.6608626821692824e-07,
227
+ "logits/chosen": -3.503054141998291,
228
+ "logits/rejected": -3.4913394451141357,
229
+ "logps/chosen": -509.2953186035156,
230
+ "logps/rejected": -5831.84228515625,
231
  "loss": 0.0012,
232
  "rewards/accuracies": 1.0,
233
+ "rewards/chosen": -1.6301825046539307,
234
+ "rewards/margins": 49.84960174560547,
235
+ "rewards/rejected": -51.47977828979492,
236
  "step": 130
237
  },
238
  {
239
  "epoch": 1.3270142180094786,
240
+ "grad_norm": 0.00011722006953608906,
241
  "learning_rate": 3.408429731701635e-07,
242
+ "logits/chosen": -3.636444091796875,
243
+ "logits/rejected": -3.614245891571045,
244
+ "logps/chosen": -664.00341796875,
245
+ "logps/rejected": -5503.0537109375,
246
  "loss": 0.0001,
247
  "rewards/accuracies": 1.0,
248
+ "rewards/chosen": -3.150538682937622,
249
+ "rewards/margins": 45.41934585571289,
250
+ "rewards/rejected": -48.56988525390625,
251
  "step": 140
252
  },
253
  {
254
  "epoch": 1.4218009478672986,
255
+ "grad_norm": 0.0013414969188062405,
256
  "learning_rate": 3.144813424636031e-07,
257
+ "logits/chosen": -3.788306713104248,
258
+ "logits/rejected": -3.686079740524292,
259
+ "logps/chosen": -791.1682739257812,
260
+ "logps/rejected": -5721.5634765625,
261
+ "loss": 0.0001,
262
  "rewards/accuracies": 1.0,
263
+ "rewards/chosen": -4.1505842208862305,
264
+ "rewards/margins": 46.614662170410156,
265
+ "rewards/rejected": -50.7652473449707,
266
  "step": 150
267
  },
268
  {
269
  "epoch": 1.4218009478672986,
270
+ "eval_logits/chosen": -3.5867350101470947,
271
+ "eval_logits/rejected": -3.5067942142486572,
272
+ "eval_logps/chosen": -561.57568359375,
273
+ "eval_logps/rejected": -5161.087890625,
274
+ "eval_loss": 0.006992733106017113,
275
  "eval_rewards/accuracies": 0.9939516186714172,
276
+ "eval_rewards/chosen": -1.9459394216537476,
277
+ "eval_rewards/margins": 43.25704574584961,
278
+ "eval_rewards/rejected": -45.2029914855957,
279
+ "eval_runtime": 191.7726,
280
+ "eval_samples_per_second": 20.363,
281
  "eval_steps_per_second": 0.323,
282
  "step": 150
283
  },
284
  {
285
  "epoch": 1.5165876777251186,
286
+ "grad_norm": 0.0004138099071654368,
287
  "learning_rate": 2.8732590479375165e-07,
288
+ "logits/chosen": -3.556847333908081,
289
+ "logits/rejected": -3.5835862159729004,
290
+ "logps/chosen": -528.8604736328125,
291
+ "logps/rejected": -5157.8740234375,
292
  "loss": 0.0003,
293
  "rewards/accuracies": 1.0,
294
+ "rewards/chosen": -1.7568155527114868,
295
+ "rewards/margins": 43.957759857177734,
296
+ "rewards/rejected": -45.714576721191406,
297
  "step": 160
298
  },
299
  {
300
  "epoch": 1.6113744075829384,
301
+ "grad_norm": 0.0016286137021698196,
302
  "learning_rate": 2.597109611334169e-07,
303
+ "logits/chosen": -3.579390287399292,
304
+ "logits/rejected": -3.6478075981140137,
305
+ "logps/chosen": -520.5675048828125,
306
+ "logps/rejected": -5432.5673828125,
307
+ "loss": 0.0001,
308
  "rewards/accuracies": 1.0,
309
+ "rewards/chosen": -1.6220014095306396,
310
+ "rewards/margins": 46.55379867553711,
311
+ "rewards/rejected": -48.17579650878906,
312
  "step": 170
313
  },
314
  {
315
  "epoch": 1.7061611374407581,
316
+ "grad_norm": 0.00799320909391895,
317
  "learning_rate": 2.3197646927086694e-07,
318
+ "logits/chosen": -3.5350117683410645,
319
+ "logits/rejected": -3.6110050678253174,
320
+ "logps/chosen": -534.5997314453125,
321
+ "logps/rejected": -5420.73583984375,
322
+ "loss": 0.0075,
323
  "rewards/accuracies": 1.0,
324
+ "rewards/chosen": -1.6480720043182373,
325
+ "rewards/margins": 46.55036163330078,
326
+ "rewards/rejected": -48.19843292236328,
327
  "step": 180
328
  },
329
  {
330
  "epoch": 1.8009478672985781,
331
+ "grad_norm": 0.0014081828819370304,
332
  "learning_rate": 2.0446385870993467e-07,
333
+ "logits/chosen": -3.5267558097839355,
334
+ "logits/rejected": -3.5355076789855957,
335
+ "logps/chosen": -524.6720581054688,
336
+ "logps/rejected": -5069.0888671875,
337
  "loss": 0.0,
338
  "rewards/accuracies": 1.0,
339
+ "rewards/chosen": -1.4387648105621338,
340
+ "rewards/margins": 43.29344177246094,
341
+ "rewards/rejected": -44.73220443725586,
342
  "step": 190
343
  },
344
  {
345
  "epoch": 1.8957345971563981,
346
+ "grad_norm": 0.018000801767423476,
347
  "learning_rate": 1.775118274523545e-07,
348
+ "logits/chosen": -3.5183377265930176,
349
+ "logits/rejected": -3.5119102001190186,
350
+ "logps/chosen": -486.629150390625,
351
+ "logps/rejected": -5021.52490234375,
352
  "loss": 0.0,
353
  "rewards/accuracies": 1.0,
354
+ "rewards/chosen": -1.3421844244003296,
355
+ "rewards/margins": 42.632965087890625,
356
+ "rewards/rejected": -43.97514724731445,
357
  "step": 200
358
  },
359
  {
360
  "epoch": 1.8957345971563981,
361
+ "eval_logits/chosen": -3.502014398574829,
362
+ "eval_logits/rejected": -3.422856092453003,
363
+ "eval_logps/chosen": -512.3704223632812,
364
+ "eval_logps/rejected": -5067.64501953125,
365
+ "eval_loss": 0.004733214620500803,
366
  "eval_rewards/accuracies": 0.9959677457809448,
367
+ "eval_rewards/chosen": -1.4538869857788086,
368
+ "eval_rewards/margins": 42.814674377441406,
369
+ "eval_rewards/rejected": -44.26856231689453,
370
+ "eval_runtime": 194.1121,
371
+ "eval_samples_per_second": 20.117,
372
+ "eval_steps_per_second": 0.319,
373
  "step": 200
374
  },
375
  {
376
  "epoch": 1.9905213270142181,
377
+ "grad_norm": 0.9312964869423628,
378
  "learning_rate": 1.514521724066537e-07,
379
+ "logits/chosen": -3.540240526199341,
380
+ "logits/rejected": -3.5632777214050293,
381
+ "logps/chosen": -531.4307861328125,
382
+ "logps/rejected": -5061.63818359375,
383
  "loss": 0.0002,
384
  "rewards/accuracies": 1.0,
385
+ "rewards/chosen": -1.486299753189087,
386
+ "rewards/margins": 42.955726623535156,
387
+ "rewards/rejected": -44.44202423095703,
388
  "step": 210
389
  },
390
  {
391
  "epoch": 2.085308056872038,
392
+ "grad_norm": 0.029566978048640967,
393
  "learning_rate": 1.266057047539568e-07,
394
+ "logits/chosen": -3.5052971839904785,
395
+ "logits/rejected": -3.5332977771759033,
396
+ "logps/chosen": -477.3848571777344,
397
+ "logps/rejected": -5269.00390625,
398
  "loss": 0.0,
399
  "rewards/accuracies": 1.0,
400
+ "rewards/chosen": -1.4086942672729492,
401
+ "rewards/margins": 45.027523040771484,
402
+ "rewards/rejected": -46.43621826171875,
403
  "step": 220
404
  },
405
  {
406
  "epoch": 2.1800947867298577,
407
+ "grad_norm": 0.0005556188331340245,
408
  "learning_rate": 1.032783005551884e-07,
409
+ "logits/chosen": -3.5509438514709473,
410
+ "logits/rejected": -3.5611331462860107,
411
+ "logps/chosen": -473.364501953125,
412
+ "logps/rejected": -4865.369140625,
413
  "loss": 0.0,
414
  "rewards/accuracies": 1.0,
415
+ "rewards/chosen": -1.312354326248169,
416
+ "rewards/margins": 41.259403228759766,
417
+ "rewards/rejected": -42.57175827026367,
418
  "step": 230
419
  },
420
  {
421
  "epoch": 2.2748815165876777,
422
+ "grad_norm": 0.005629678669869344,
423
  "learning_rate": 8.175713521924976e-08,
424
+ "logits/chosen": -3.5678086280822754,
425
+ "logits/rejected": -3.5121123790740967,
426
+ "logps/chosen": -496.83258056640625,
427
+ "logps/rejected": -5081.9599609375,
428
  "loss": 0.0,
429
  "rewards/accuracies": 1.0,
430
+ "rewards/chosen": -1.4451147317886353,
431
+ "rewards/margins": 43.37391662597656,
432
+ "rewards/rejected": -44.81903839111328,
433
  "step": 240
434
  },
435
  {
436
  "epoch": 2.3696682464454977,
437
+ "grad_norm": 0.0012113886351427462,
438
  "learning_rate": 6.230714818829733e-08,
439
+ "logits/chosen": -3.530911922454834,
440
+ "logits/rejected": -3.5102057456970215,
441
+ "logps/chosen": -484.5502014160156,
442
+ "logps/rejected": -5412.3271484375,
443
  "loss": 0.0,
444
  "rewards/accuracies": 1.0,
445
+ "rewards/chosen": -1.327695608139038,
446
+ "rewards/margins": 46.98969268798828,
447
+ "rewards/rejected": -48.31739044189453,
448
  "step": 250
449
  },
450
  {
451
  "epoch": 2.3696682464454977,
452
+ "eval_logits/chosen": -3.5510308742523193,
453
+ "eval_logits/rejected": -3.444518566131592,
454
+ "eval_logps/chosen": -512.2269287109375,
455
+ "eval_logps/rejected": -5116.15771484375,
456
+ "eval_loss": 0.005008448380976915,
457
  "eval_rewards/accuracies": 0.9959677457809448,
458
+ "eval_rewards/chosen": -1.4524519443511963,
459
+ "eval_rewards/margins": 43.301239013671875,
460
+ "eval_rewards/rejected": -44.753692626953125,
461
+ "eval_runtime": 192.1218,
462
+ "eval_samples_per_second": 20.326,
463
  "eval_steps_per_second": 0.323,
464
  "step": 250
465
  },
466
  {
467
  "epoch": 2.4644549763033177,
468
+ "grad_norm": 0.005272804838769864,
469
  "learning_rate": 4.516778136213037e-08,
470
+ "logits/chosen": -3.5464816093444824,
471
+ "logits/rejected": -3.532754898071289,
472
+ "logps/chosen": -474.98077392578125,
473
+ "logps/rejected": -5214.1748046875,
474
  "loss": 0.0,
475
  "rewards/accuracies": 1.0,
476
+ "rewards/chosen": -1.3641650676727295,
477
+ "rewards/margins": 44.85725021362305,
478
+ "rewards/rejected": -46.22141647338867,
479
  "step": 260
480
  },
481
  {
482
  "epoch": 2.5592417061611377,
483
+ "grad_norm": 0.01760309981671165,
484
  "learning_rate": 3.055003141378948e-08,
485
+ "logits/chosen": -3.5305237770080566,
486
+ "logits/rejected": -3.543522357940674,
487
+ "logps/chosen": -502.1796875,
488
+ "logps/rejected": -5842.8251953125,
489
  "loss": 0.0,
490
  "rewards/accuracies": 1.0,
491
+ "rewards/chosen": -1.2657973766326904,
492
+ "rewards/margins": 50.87003707885742,
493
+ "rewards/rejected": -52.135841369628906,
494
  "step": 270
495
  },
496
  {
497
  "epoch": 2.654028436018957,
498
+ "grad_norm": 0.01345213655983596,
499
  "learning_rate": 1.8633852284264508e-08,
500
+ "logits/chosen": -3.5437607765197754,
501
+ "logits/rejected": -3.537663221359253,
502
+ "logps/chosen": -519.03759765625,
503
+ "logps/rejected": -5507.5615234375,
504
  "loss": 0.0,
505
  "rewards/accuracies": 1.0,
506
+ "rewards/chosen": -1.3319001197814941,
507
+ "rewards/margins": 47.102291107177734,
508
+ "rewards/rejected": -48.4341926574707,
509
  "step": 280
510
  },
511
  {
512
  "epoch": 2.748815165876777,
513
+ "grad_norm": 5.9654408780918595e-05,
514
  "learning_rate": 9.56593983327919e-09,
515
+ "logits/chosen": -3.5722999572753906,
516
+ "logits/rejected": -3.5434532165527344,
517
+ "logps/chosen": -525.2794189453125,
518
+ "logps/rejected": -5359.7451171875,
519
  "loss": 0.0,
520
  "rewards/accuracies": 1.0,
521
+ "rewards/chosen": -1.4624111652374268,
522
+ "rewards/margins": 46.21337890625,
523
+ "rewards/rejected": -47.675785064697266,
524
  "step": 290
525
  },
526
  {
527
  "epoch": 2.843601895734597,
528
+ "grad_norm": 0.0012624104591569302,
529
  "learning_rate": 3.4579259185321398e-09,
530
+ "logits/chosen": -3.5550761222839355,
531
+ "logits/rejected": -3.541923999786377,
532
+ "logps/chosen": -513.0765380859375,
533
+ "logps/rejected": -5235.28759765625,
534
  "loss": 0.0,
535
  "rewards/accuracies": 1.0,
536
+ "rewards/chosen": -1.3970950841903687,
537
+ "rewards/margins": 45.181175231933594,
538
+ "rewards/rejected": -46.578269958496094,
539
  "step": 300
540
  },
541
  {
542
  "epoch": 2.843601895734597,
543
+ "eval_logits/chosen": -3.5504369735717773,
544
+ "eval_logits/rejected": -3.444122552871704,
545
+ "eval_logps/chosen": -512.8049926757812,
546
+ "eval_logps/rejected": -5128.248046875,
547
+ "eval_loss": 0.004975645802915096,
548
  "eval_rewards/accuracies": 0.9959677457809448,
549
+ "eval_rewards/chosen": -1.4582326412200928,
550
+ "eval_rewards/margins": 43.41635513305664,
551
+ "eval_rewards/rejected": -44.87459182739258,
552
+ "eval_runtime": 192.6295,
553
+ "eval_samples_per_second": 20.272,
554
+ "eval_steps_per_second": 0.322,
555
  "step": 300
556
  },
557
  {
558
  "epoch": 2.938388625592417,
559
+ "grad_norm": 0.0005953504074610172,
560
  "learning_rate": 3.850041354441502e-10,
561
+ "logits/chosen": -3.5716750621795654,
562
+ "logits/rejected": -3.5102698802948,
563
+ "logps/chosen": -509.0469665527344,
564
+ "logps/rejected": -4801.1611328125,
565
  "loss": 0.0,
566
  "rewards/accuracies": 1.0,
567
+ "rewards/chosen": -1.4760140180587769,
568
+ "rewards/margins": 40.71800994873047,
569
+ "rewards/rejected": -42.19402313232422,
570
  "step": 310
571
  },
572
  {
573
  "epoch": 2.985781990521327,
574
  "step": 315,
575
  "total_flos": 0.0,
576
+ "train_loss": 0.04083177362173292,
577
+ "train_runtime": 9033.5209,
578
+ "train_samples_per_second": 4.483,
579
+ "train_steps_per_second": 0.035
580
  }
581
  ],
582
  "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c09991d0dd4badd7eb3d51c710e71e2c2a5932e93fe69a025005d92a6da7e339
+ oid sha256:ecf20cbd8067145a8eb8173b159aac02fda214ae80ecd675174fcf61768dabe8
  size 7544
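training_args.bin is a pickled TrainingArguments object rather than readable text, which is why only its LFS hash changes here. It can be inspected with torch when a compatible transformers version is installed; a sketch, noting that recent torch releases require opting out of weights-only loading for pickled objects:

```python
import torch

# Inspect the pickled TrainingArguments behind training_args.bin.
# weights_only=False is needed on recent torch versions because this is a
# full pickled object, not a plain tensor checkpoint.
args = torch.load("training_args.bin", weights_only=False)
print(args.num_train_epochs, args.learning_rate, args.per_device_train_batch_size)
```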