ggbetz commited on
Commit
21a6823
·
verified ·
1 Parent(s): a41735b

Model save

Browse files
README.md CHANGED
@@ -26,7 +26,7 @@ print(output["generated_text"])
26
 
27
  ## Training procedure
28
 
29
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/4qlrj3fp)
30
 
31
 
32
  This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/d55470xf)
30
 
31
 
32
  This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.996219281663516,
3
  "total_flos": 0.0,
4
- "train_loss": 0.5393935610549618,
5
- "train_runtime": 2398.307,
6
- "train_samples": 6347,
7
- "train_samples_per_second": 5.293,
8
- "train_steps_per_second": 0.083
9
  }
 
1
  {
2
+ "epoch": 1.9971305595408895,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.5929731760901966,
5
+ "train_runtime": 2030.2211,
6
+ "train_samples": 5576,
7
+ "train_samples_per_second": 5.493,
8
+ "train_steps_per_second": 0.086
9
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f807a55400211dc38fe9871d39bbdb3a1c49f8b79532d4313e1c899fe429e10
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1b2cfa01d7837330c9b890a79677233bde135efb7b9300dc70ca9c2436cfe2d
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8dd55c15ea1762fc4462e86e4d79d32fb9df2bf76c79e3c0e53edca6e2575d8
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1a9f875c8b82e62970bc18ae27be339c4fa9058b0cdb3f537fa10774bf479e9
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:954125f0f6d5961aff8db190d00a9831a874a385c04dedda7d82359384d7233e
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f23e9719d839b39d06c0e5b2276ea0def1e8e6d6774413af2ae74d34fa9ac0a6
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4c9947d0ccbc72142d87c4ef362037fd555ee53092f6642a644678e9be9d61e
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:643861ba758160b88b3f43351ea8d2c440cf58ff16173e0337f35c74c90cf95b
3
  size 1168138808
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.996219281663516,
3
  "total_flos": 0.0,
4
- "train_loss": 0.5393935610549618,
5
- "train_runtime": 2398.307,
6
- "train_samples": 6347,
7
- "train_samples_per_second": 5.293,
8
- "train_steps_per_second": 0.083
9
  }
 
1
  {
2
+ "epoch": 1.9971305595408895,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.5929731760901966,
5
+ "train_runtime": 2030.2211,
6
+ "train_samples": 5576,
7
+ "train_samples_per_second": 5.493,
8
+ "train_steps_per_second": 0.086
9
  }
trainer_state.json CHANGED
@@ -1,610 +1,535 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.996219281663516,
5
  "eval_steps": 500,
6
- "global_step": 198,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.05040957781978576,
13
- "grad_norm": 108.41279315573027,
14
- "learning_rate": 1e-07,
15
  "logits/chosen": NaN,
16
  "logits/rejected": NaN,
17
- "logps/chosen": -394.29998779296875,
18
- "logps/rejected": -490.4984436035156,
19
- "loss": 0.6973,
20
- "rewards/accuracies": 0.12812499701976776,
21
- "rewards/chosen": -0.0023020743392407894,
22
- "rewards/margins": -0.009827613830566406,
23
- "rewards/rejected": 0.007504081819206476,
24
  "step": 5
25
  },
26
  {
27
- "epoch": 0.10081915563957151,
28
- "grad_norm": 87.2429177419689,
29
- "learning_rate": 2e-07,
30
  "logits/chosen": NaN,
31
  "logits/rejected": NaN,
32
- "logps/chosen": -374.5687561035156,
33
- "logps/rejected": -397.4437561035156,
34
- "loss": 0.6827,
35
- "rewards/accuracies": 0.30937498807907104,
36
- "rewards/chosen": 0.009921550750732422,
37
- "rewards/margins": 0.029467200860381126,
38
- "rewards/rejected": -0.01954820193350315,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.15122873345935728,
43
- "grad_norm": 83.76505879098643,
44
- "learning_rate": 1.946808510638298e-07,
45
  "logits/chosen": NaN,
46
- "logits/rejected": NaN,
47
- "logps/chosen": -361.41485595703125,
48
- "logps/rejected": -441.2671813964844,
49
- "loss": 0.6832,
50
- "rewards/accuracies": 0.3187499940395355,
51
- "rewards/chosen": -0.0191789623349905,
52
- "rewards/margins": 0.04044074937701225,
53
- "rewards/rejected": -0.059579335153102875,
54
  "step": 15
55
  },
56
  {
57
- "epoch": 0.20163831127914303,
58
- "grad_norm": 55.583186387674,
59
- "learning_rate": 1.8936170212765957e-07,
60
  "logits/chosen": NaN,
61
  "logits/rejected": NaN,
62
- "logps/chosen": -373.5375061035156,
63
- "logps/rejected": -464.6875,
64
- "loss": 0.6233,
65
- "rewards/accuracies": 0.4437499940395355,
66
- "rewards/chosen": -0.021530818194150925,
67
- "rewards/margins": 0.23837146162986755,
68
- "rewards/rejected": -0.2597528398036957,
69
  "step": 20
70
  },
71
  {
72
- "epoch": 0.2520478890989288,
73
- "grad_norm": 60.717924564454115,
74
- "learning_rate": 1.8404255319148937e-07,
75
  "logits/chosen": NaN,
76
- "logits/rejected": NaN,
77
- "logps/chosen": -371.62188720703125,
78
- "logps/rejected": -478.8374938964844,
79
- "loss": 0.6328,
80
- "rewards/accuracies": 0.3968749940395355,
81
- "rewards/chosen": -0.04529209062457085,
82
- "rewards/margins": 0.28455600142478943,
83
- "rewards/rejected": -0.3299552798271179,
84
  "step": 25
85
  },
86
  {
87
- "epoch": 0.30245746691871456,
88
- "grad_norm": 58.667280799471335,
89
- "learning_rate": 1.7872340425531914e-07,
90
  "logits/chosen": NaN,
91
  "logits/rejected": NaN,
92
- "logps/chosen": -407.125,
93
- "logps/rejected": -456.94061279296875,
94
- "loss": 0.6037,
95
- "rewards/accuracies": 0.4281249940395355,
96
- "rewards/chosen": -0.06888346374034882,
97
- "rewards/margins": 0.4735303819179535,
98
- "rewards/rejected": -0.5425974726676941,
99
  "step": 30
100
  },
101
  {
102
- "epoch": 0.35286704473850034,
103
- "grad_norm": 52.677451346587404,
104
- "learning_rate": 1.7340425531914892e-07,
105
  "logits/chosen": NaN,
106
  "logits/rejected": NaN,
107
- "logps/chosen": -396.4683532714844,
108
- "logps/rejected": -433.375,
109
- "loss": 0.5971,
110
- "rewards/accuracies": 0.515625,
111
- "rewards/chosen": -0.10200033336877823,
112
- "rewards/margins": 0.5540359616279602,
113
- "rewards/rejected": -0.6558942794799805,
114
  "step": 35
115
  },
116
  {
117
- "epoch": 0.40327662255828606,
118
- "grad_norm": 61.023691806589966,
119
- "learning_rate": 1.6808510638297872e-07,
120
  "logits/chosen": NaN,
121
  "logits/rejected": NaN,
122
- "logps/chosen": -349.65545654296875,
123
- "logps/rejected": -466.68438720703125,
124
- "loss": 0.6065,
125
- "rewards/accuracies": 0.578125,
126
- "rewards/chosen": -0.10066480934619904,
127
- "rewards/margins": 0.6059595346450806,
128
- "rewards/rejected": -0.7062518000602722,
129
  "step": 40
130
  },
131
  {
132
- "epoch": 0.45368620037807184,
133
- "grad_norm": 50.9687221759888,
134
- "learning_rate": 1.627659574468085e-07,
135
  "logits/chosen": NaN,
136
- "logits/rejected": -0.1692344695329666,
137
- "logps/chosen": -325.9609375,
138
- "logps/rejected": -462.97186279296875,
139
- "loss": 0.5598,
140
- "rewards/accuracies": 0.6031249761581421,
141
- "rewards/chosen": -0.09121231734752655,
142
- "rewards/margins": 1.08050537109375,
143
- "rewards/rejected": -1.1715847253799438,
144
  "step": 45
145
  },
146
  {
147
- "epoch": 0.5040957781978576,
148
- "grad_norm": 56.54625986325724,
149
- "learning_rate": 1.574468085106383e-07,
150
  "logits/chosen": NaN,
151
- "logits/rejected": NaN,
152
- "logps/chosen": -342.57501220703125,
153
- "logps/rejected": -478.74139404296875,
154
- "loss": 0.591,
155
- "rewards/accuracies": 0.574999988079071,
156
- "rewards/chosen": -0.10402297973632812,
157
- "rewards/margins": 0.7732677459716797,
158
- "rewards/rejected": -0.8772258758544922,
159
  "step": 50
160
  },
161
  {
162
- "epoch": 0.5545053560176434,
163
- "grad_norm": 52.007398326371984,
164
- "learning_rate": 1.5212765957446807e-07,
165
  "logits/chosen": NaN,
166
  "logits/rejected": NaN,
167
- "logps/chosen": -381.3374938964844,
168
- "logps/rejected": -456.14373779296875,
169
- "loss": 0.5847,
170
- "rewards/accuracies": 0.574999988079071,
171
- "rewards/chosen": -0.08334217220544815,
172
- "rewards/margins": 0.8058792352676392,
173
- "rewards/rejected": -0.889452338218689,
174
  "step": 55
175
  },
176
  {
177
- "epoch": 0.6049149338374291,
178
- "grad_norm": 46.410753552087435,
179
- "learning_rate": 1.4680851063829787e-07,
180
  "logits/chosen": NaN,
181
  "logits/rejected": NaN,
182
- "logps/chosen": -358.54998779296875,
183
- "logps/rejected": -475.54998779296875,
184
- "loss": 0.5703,
185
- "rewards/accuracies": 0.596875011920929,
186
- "rewards/chosen": -0.1341991424560547,
187
- "rewards/margins": 1.1414505243301392,
188
- "rewards/rejected": -1.27621328830719,
189
  "step": 60
190
  },
191
  {
192
- "epoch": 0.6553245116572148,
193
- "grad_norm": 62.601972259800355,
194
- "learning_rate": 1.4148936170212768e-07,
195
  "logits/chosen": NaN,
196
  "logits/rejected": NaN,
197
- "logps/chosen": -358.22265625,
198
- "logps/rejected": -460.4937438964844,
199
- "loss": 0.5561,
200
- "rewards/accuracies": 0.612500011920929,
201
- "rewards/chosen": -0.09829378128051758,
202
- "rewards/margins": 1.154931664466858,
203
- "rewards/rejected": -1.25310218334198,
204
  "step": 65
205
  },
206
  {
207
- "epoch": 0.7057340894770007,
208
- "grad_norm": 42.81446910830512,
209
- "learning_rate": 1.3617021276595742e-07,
210
  "logits/chosen": NaN,
211
  "logits/rejected": NaN,
212
- "logps/chosen": -350.8062438964844,
213
- "logps/rejected": -479.98126220703125,
214
- "loss": 0.5464,
215
- "rewards/accuracies": 0.668749988079071,
216
- "rewards/chosen": -0.08439864963293076,
217
- "rewards/margins": 1.0637038946151733,
218
- "rewards/rejected": -1.146966576576233,
219
  "step": 70
220
  },
221
  {
222
- "epoch": 0.7561436672967864,
223
- "grad_norm": 59.85465445918704,
224
- "learning_rate": 1.3085106382978723e-07,
225
  "logits/chosen": NaN,
226
  "logits/rejected": NaN,
227
- "logps/chosen": -339.3656311035156,
228
- "logps/rejected": -428.65936279296875,
229
- "loss": 0.5872,
230
- "rewards/accuracies": 0.643750011920929,
231
- "rewards/chosen": -0.11314620822668076,
232
- "rewards/margins": 0.7480255365371704,
233
- "rewards/rejected": -0.8610885739326477,
234
  "step": 75
235
  },
236
  {
237
- "epoch": 0.8065532451165721,
238
- "grad_norm": 77.89913196854276,
239
- "learning_rate": 1.25531914893617e-07,
240
  "logits/chosen": NaN,
241
- "logits/rejected": -0.232859805226326,
242
- "logps/chosen": -367.71563720703125,
243
- "logps/rejected": -496.9125061035156,
244
- "loss": 0.5386,
245
- "rewards/accuracies": 0.6968749761581421,
246
- "rewards/chosen": -0.09202079474925995,
247
- "rewards/margins": 1.1276824474334717,
248
- "rewards/rejected": -1.2203514575958252,
249
  "step": 80
250
  },
251
  {
252
- "epoch": 0.856962822936358,
253
- "grad_norm": 51.576853346068596,
254
- "learning_rate": 1.202127659574468e-07,
255
  "logits/chosen": NaN,
256
  "logits/rejected": NaN,
257
- "logps/chosen": -352.01873779296875,
258
- "logps/rejected": -441.85626220703125,
259
- "loss": 0.5489,
260
- "rewards/accuracies": 0.699999988079071,
261
- "rewards/chosen": -0.09004707634449005,
262
- "rewards/margins": 0.9977798461914062,
263
- "rewards/rejected": -1.0878921747207642,
264
  "step": 85
265
  },
266
  {
267
- "epoch": 0.9073724007561437,
268
- "grad_norm": 51.91948881188928,
269
- "learning_rate": 1.148936170212766e-07,
270
  "logits/chosen": NaN,
271
- "logits/rejected": NaN,
272
- "logps/chosen": -391.98748779296875,
273
- "logps/rejected": -501.89373779296875,
274
- "loss": 0.5429,
275
- "rewards/accuracies": 0.668749988079071,
276
- "rewards/chosen": -0.13425922393798828,
277
- "rewards/margins": 1.2032638788223267,
278
- "rewards/rejected": -1.339324951171875,
279
  "step": 90
280
  },
281
  {
282
- "epoch": 0.9577819785759294,
283
- "grad_norm": 54.81194235391722,
284
- "learning_rate": 1.0957446808510638e-07,
285
  "logits/chosen": NaN,
286
  "logits/rejected": NaN,
287
- "logps/chosen": -380.8531188964844,
288
- "logps/rejected": -485.4375,
289
- "loss": 0.5506,
290
- "rewards/accuracies": 0.6875,
291
- "rewards/chosen": -0.123291015625,
292
- "rewards/margins": 1.137838363647461,
293
- "rewards/rejected": -1.2604999542236328,
294
  "step": 95
295
  },
296
  {
297
- "epoch": 1.0081915563957151,
298
- "grad_norm": 51.52692462343263,
299
- "learning_rate": 1.0425531914893617e-07,
300
  "logits/chosen": NaN,
301
  "logits/rejected": NaN,
302
- "logps/chosen": -357.92498779296875,
303
- "logps/rejected": -420.00311279296875,
304
- "loss": 0.5433,
305
- "rewards/accuracies": 0.6875,
306
- "rewards/chosen": -0.118899405002594,
307
- "rewards/margins": 1.2265655994415283,
308
- "rewards/rejected": -1.3462097644805908,
309
  "step": 100
310
  },
311
  {
312
- "epoch": 1.0586011342155008,
313
- "grad_norm": 48.23673648203366,
314
- "learning_rate": 9.893617021276596e-08,
315
  "logits/chosen": NaN,
316
  "logits/rejected": NaN,
317
- "logps/chosen": -420.59375,
318
- "logps/rejected": -551.4156494140625,
319
- "loss": 0.4711,
320
- "rewards/accuracies": 0.715624988079071,
321
- "rewards/chosen": -0.05655860900878906,
322
- "rewards/margins": 1.65283203125,
323
- "rewards/rejected": -1.7101104259490967,
324
  "step": 105
325
  },
326
  {
327
- "epoch": 1.1090107120352868,
328
- "grad_norm": 129.8626362436788,
329
- "learning_rate": 9.361702127659574e-08,
330
  "logits/chosen": NaN,
331
- "logits/rejected": NaN,
332
- "logps/chosen": -375.9306640625,
333
- "logps/rejected": -410.8500061035156,
334
- "loss": 0.5499,
335
- "rewards/accuracies": 0.706250011920929,
336
- "rewards/chosen": -0.10003051906824112,
337
- "rewards/margins": 0.8063720464706421,
338
- "rewards/rejected": -0.906982421875,
339
  "step": 110
340
  },
341
  {
342
- "epoch": 1.1594202898550725,
343
- "grad_norm": 37.96460632662538,
344
- "learning_rate": 8.829787234042553e-08,
345
  "logits/chosen": NaN,
346
  "logits/rejected": NaN,
347
- "logps/chosen": -395.4546813964844,
348
- "logps/rejected": -534.5437622070312,
349
- "loss": 0.4536,
350
- "rewards/accuracies": 0.7875000238418579,
351
- "rewards/chosen": -0.0355035774409771,
352
- "rewards/margins": 1.6931426525115967,
353
- "rewards/rejected": -1.7283508777618408,
354
  "step": 115
355
  },
356
  {
357
- "epoch": 1.2098298676748582,
358
- "grad_norm": 41.75349141526311,
359
- "learning_rate": 8.297872340425531e-08,
360
  "logits/chosen": NaN,
361
- "logits/rejected": -0.24345549941062927,
362
- "logps/chosen": -337.83905029296875,
363
- "logps/rejected": -414.4765625,
364
- "loss": 0.4953,
365
- "rewards/accuracies": 0.796875,
366
- "rewards/chosen": -0.09084253013134003,
367
- "rewards/margins": 1.2055069208145142,
368
- "rewards/rejected": -1.2963898181915283,
369
  "step": 120
370
  },
371
  {
372
- "epoch": 1.260239445494644,
373
- "grad_norm": 47.57568984542951,
374
- "learning_rate": 7.76595744680851e-08,
375
  "logits/chosen": NaN,
376
  "logits/rejected": NaN,
377
- "logps/chosen": -358.9046936035156,
378
- "logps/rejected": -469.49688720703125,
379
- "loss": 0.501,
380
- "rewards/accuracies": 0.75,
381
- "rewards/chosen": -0.09036216884851456,
382
- "rewards/margins": 1.231683373451233,
383
- "rewards/rejected": -1.3224579095840454,
384
  "step": 125
385
  },
386
  {
387
- "epoch": 1.3106490233144297,
388
- "grad_norm": 53.944830915941765,
389
- "learning_rate": 7.23404255319149e-08,
390
  "logits/chosen": NaN,
391
  "logits/rejected": NaN,
392
- "logps/chosen": -356.4375,
393
- "logps/rejected": -489.125,
394
- "loss": 0.4999,
395
  "rewards/accuracies": 0.7593749761581421,
396
- "rewards/chosen": -0.0903778076171875,
397
- "rewards/margins": 1.2274360656738281,
398
- "rewards/rejected": -1.3174560070037842,
399
  "step": 130
400
  },
401
  {
402
- "epoch": 1.3610586011342156,
403
- "grad_norm": 40.45839840494219,
404
- "learning_rate": 6.702127659574469e-08,
405
  "logits/chosen": NaN,
406
  "logits/rejected": NaN,
407
- "logps/chosen": -368.95623779296875,
408
- "logps/rejected": -460.58282470703125,
409
- "loss": 0.4856,
410
- "rewards/accuracies": 0.746874988079071,
411
- "rewards/chosen": -0.033612824976444244,
412
- "rewards/margins": 1.4452941417694092,
413
- "rewards/rejected": -1.479437232017517,
414
  "step": 135
415
  },
416
  {
417
- "epoch": 1.4114681789540013,
418
- "grad_norm": 43.63902778407327,
419
- "learning_rate": 6.170212765957446e-08,
420
  "logits/chosen": NaN,
421
  "logits/rejected": NaN,
422
- "logps/chosen": -393.62030029296875,
423
- "logps/rejected": -473.53436279296875,
424
- "loss": 0.4987,
425
- "rewards/accuracies": 0.734375,
426
- "rewards/chosen": -0.03872375562787056,
427
- "rewards/margins": 1.239990234375,
428
- "rewards/rejected": -1.2792266607284546,
429
  "step": 140
430
  },
431
  {
432
- "epoch": 1.461877756773787,
433
- "grad_norm": 40.93200179183777,
434
- "learning_rate": 5.638297872340425e-08,
435
  "logits/chosen": NaN,
436
- "logits/rejected": -0.28594666719436646,
437
- "logps/chosen": -370.62188720703125,
438
- "logps/rejected": -508.2562561035156,
439
- "loss": 0.4891,
440
- "rewards/accuracies": 0.7250000238418579,
441
- "rewards/chosen": -0.047638703137636185,
442
- "rewards/margins": 1.3118622303009033,
443
- "rewards/rejected": -1.359655737876892,
444
  "step": 145
445
  },
446
  {
447
- "epoch": 1.5122873345935728,
448
- "grad_norm": 58.65309693932161,
449
- "learning_rate": 5.106382978723404e-08,
450
  "logits/chosen": NaN,
451
- "logits/rejected": NaN,
452
- "logps/chosen": -347.8609313964844,
453
- "logps/rejected": -473.4750061035156,
454
- "loss": 0.4814,
455
- "rewards/accuracies": 0.762499988079071,
456
- "rewards/chosen": -0.05366211012005806,
457
- "rewards/margins": 1.2643524408340454,
458
- "rewards/rejected": -1.3187682628631592,
459
  "step": 150
460
  },
461
  {
462
- "epoch": 1.5626969124133585,
463
- "grad_norm": 40.5990127283572,
464
- "learning_rate": 4.5744680851063826e-08,
465
  "logits/chosen": NaN,
466
  "logits/rejected": NaN,
467
- "logps/chosen": -377.1734313964844,
468
- "logps/rejected": -453.3296813964844,
469
- "loss": 0.4998,
470
- "rewards/accuracies": 0.753125011920929,
471
- "rewards/chosen": -0.07384242862462997,
472
- "rewards/margins": 1.2074543237686157,
473
- "rewards/rejected": -1.2816162109375,
474
  "step": 155
475
  },
476
  {
477
- "epoch": 1.6131064902331445,
478
- "grad_norm": 32.42610640703847,
479
- "learning_rate": 4.0425531914893614e-08,
480
  "logits/chosen": NaN,
481
- "logits/rejected": NaN,
482
- "logps/chosen": -330.2578125,
483
- "logps/rejected": -398.8812561035156,
484
- "loss": 0.471,
485
- "rewards/accuracies": 0.7906249761581421,
486
- "rewards/chosen": -0.01856536790728569,
487
- "rewards/margins": 1.3712249994277954,
488
- "rewards/rejected": -1.3899352550506592,
489
  "step": 160
490
  },
491
  {
492
- "epoch": 1.66351606805293,
493
- "grad_norm": 41.01530072372472,
494
- "learning_rate": 3.51063829787234e-08,
495
  "logits/chosen": NaN,
496
- "logits/rejected": NaN,
497
- "logps/chosen": -340.2749938964844,
498
- "logps/rejected": -438.40313720703125,
499
- "loss": 0.4842,
500
- "rewards/accuracies": 0.737500011920929,
501
- "rewards/chosen": -0.0492522232234478,
502
- "rewards/margins": 1.4962584972381592,
503
- "rewards/rejected": -1.5454528331756592,
504
  "step": 165
505
  },
506
  {
507
- "epoch": 1.713925645872716,
508
- "grad_norm": 40.64613349590343,
509
- "learning_rate": 2.9787234042553187e-08,
510
  "logits/chosen": NaN,
511
- "logits/rejected": NaN,
512
- "logps/chosen": -348.5640563964844,
513
- "logps/rejected": -449.84844970703125,
514
- "loss": 0.4915,
515
- "rewards/accuracies": 0.731249988079071,
516
- "rewards/chosen": -0.07779388129711151,
517
- "rewards/margins": 1.278845191001892,
518
- "rewards/rejected": -1.356591820716858,
519
  "step": 170
520
  },
521
  {
522
- "epoch": 1.7643352236925016,
523
- "grad_norm": 41.86507518705149,
524
- "learning_rate": 2.4468085106382976e-08,
525
- "logits/chosen": NaN,
526
- "logits/rejected": NaN,
527
- "logps/chosen": -350.2046813964844,
528
- "logps/rejected": -433.5687561035156,
529
- "loss": 0.4755,
530
- "rewards/accuracies": 0.8031250238418579,
531
- "rewards/chosen": -0.027071380987763405,
532
- "rewards/margins": 1.4462082386016846,
533
- "rewards/rejected": -1.4738037586212158,
534
- "step": 175
535
- },
536
- {
537
- "epoch": 1.8147448015122873,
538
- "grad_norm": 46.67757830253006,
539
- "learning_rate": 1.9148936170212764e-08,
540
- "logits/chosen": NaN,
541
- "logits/rejected": NaN,
542
- "logps/chosen": -398.32501220703125,
543
- "logps/rejected": -456.9437561035156,
544
- "loss": 0.4807,
545
- "rewards/accuracies": 0.746874988079071,
546
- "rewards/chosen": -0.05414886400103569,
547
- "rewards/margins": 1.2570632696151733,
548
- "rewards/rejected": -1.310980200767517,
549
- "step": 180
550
- },
551
- {
552
- "epoch": 1.865154379332073,
553
- "grad_norm": 43.636875496682755,
554
- "learning_rate": 1.3829787234042552e-08,
555
- "logits/chosen": NaN,
556
- "logits/rejected": -0.17527160048484802,
557
- "logps/chosen": -373.87188720703125,
558
- "logps/rejected": -535.796875,
559
- "loss": 0.4376,
560
- "rewards/accuracies": 0.796875,
561
- "rewards/chosen": -0.026004791259765625,
562
- "rewards/margins": 1.6229279041290283,
563
- "rewards/rejected": -1.649743676185608,
564
- "step": 185
565
- },
566
- {
567
- "epoch": 1.9155639571518588,
568
- "grad_norm": 41.753332033000945,
569
- "learning_rate": 8.510638297872339e-09,
570
- "logits/chosen": NaN,
571
- "logits/rejected": NaN,
572
- "logps/chosen": -360.0718688964844,
573
- "logps/rejected": -468.2124938964844,
574
- "loss": 0.4872,
575
- "rewards/accuracies": 0.731249988079071,
576
- "rewards/chosen": -0.11498375236988068,
577
- "rewards/margins": 1.2304840087890625,
578
- "rewards/rejected": -1.345800757408142,
579
- "step": 190
580
- },
581
- {
582
- "epoch": 1.9659735349716447,
583
- "grad_norm": 43.377545693326894,
584
- "learning_rate": 3.1914893617021273e-09,
585
- "logits/chosen": NaN,
586
- "logits/rejected": NaN,
587
- "logps/chosen": -365.515625,
588
- "logps/rejected": -478.56561279296875,
589
- "loss": 0.4805,
590
- "rewards/accuracies": 0.7718750238418579,
591
- "rewards/chosen": -0.04032173007726669,
592
- "rewards/margins": 1.3543853759765625,
593
- "rewards/rejected": -1.39520263671875,
594
- "step": 195
595
- },
596
- {
597
- "epoch": 1.996219281663516,
598
- "step": 198,
599
  "total_flos": 0.0,
600
- "train_loss": 0.5393935610549618,
601
- "train_runtime": 2398.307,
602
- "train_samples_per_second": 5.293,
603
- "train_steps_per_second": 0.083
604
  }
605
  ],
606
  "logging_steps": 5,
607
- "max_steps": 198,
608
  "num_input_tokens_seen": 0,
609
  "num_train_epochs": 2,
610
  "save_steps": 50,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.9971305595408895,
5
  "eval_steps": 500,
6
+ "global_step": 174,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.05738880918220947,
13
+ "grad_norm": 147.0504219778771,
14
+ "learning_rate": 1.1111111111111111e-07,
15
  "logits/chosen": NaN,
16
  "logits/rejected": NaN,
17
+ "logps/chosen": -309.26251220703125,
18
+ "logps/rejected": -410.8433532714844,
19
+ "loss": 0.6899,
20
+ "rewards/accuracies": 0.17499999701976776,
21
+ "rewards/chosen": -0.002740192459896207,
22
+ "rewards/margins": 0.0071624754928052425,
23
+ "rewards/rejected": -0.00990285910665989,
24
  "step": 5
25
  },
26
  {
27
+ "epoch": 0.11477761836441894,
28
+ "grad_norm": 123.58633090508155,
29
+ "learning_rate": 1.9878787878787876e-07,
30
  "logits/chosen": NaN,
31
  "logits/rejected": NaN,
32
+ "logps/chosen": -277.09844970703125,
33
+ "logps/rejected": -374.31719970703125,
34
+ "loss": 0.6926,
35
+ "rewards/accuracies": 0.2718749940395355,
36
+ "rewards/chosen": 0.0009648323175497353,
37
+ "rewards/margins": 0.00359344482421875,
38
+ "rewards/rejected": -0.0026039122603833675,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.17216642754662842,
43
+ "grad_norm": 82.52640749761395,
44
+ "learning_rate": 1.9272727272727272e-07,
45
  "logits/chosen": NaN,
46
+ "logits/rejected": -0.355844110250473,
47
+ "logps/chosen": -250.90859985351562,
48
+ "logps/rejected": -288.7484436035156,
49
+ "loss": 0.6679,
50
+ "rewards/accuracies": 0.3656249940395355,
51
+ "rewards/chosen": -0.018105220049619675,
52
+ "rewards/margins": 0.07349129021167755,
53
+ "rewards/rejected": -0.09159164130687714,
54
  "step": 15
55
  },
56
  {
57
+ "epoch": 0.22955523672883787,
58
+ "grad_norm": 46.811267261200435,
59
+ "learning_rate": 1.8666666666666667e-07,
60
  "logits/chosen": NaN,
61
  "logits/rejected": NaN,
62
+ "logps/chosen": -250.43203735351562,
63
+ "logps/rejected": -299.9468688964844,
64
+ "loss": 0.638,
65
+ "rewards/accuracies": 0.4124999940395355,
66
+ "rewards/chosen": -0.02562398836016655,
67
+ "rewards/margins": 0.21783074736595154,
68
+ "rewards/rejected": -0.24351105093955994,
69
  "step": 20
70
  },
71
  {
72
+ "epoch": 0.28694404591104733,
73
+ "grad_norm": 44.812641031283796,
74
+ "learning_rate": 1.806060606060606e-07,
75
  "logits/chosen": NaN,
76
+ "logits/rejected": -0.32989805936813354,
77
+ "logps/chosen": -264.3968811035156,
78
+ "logps/rejected": -420.49688720703125,
79
+ "loss": 0.6218,
80
+ "rewards/accuracies": 0.4625000059604645,
81
+ "rewards/chosen": -0.05606970936059952,
82
+ "rewards/margins": 0.3526493012905121,
83
+ "rewards/rejected": -0.4085969924926758,
84
  "step": 25
85
  },
86
  {
87
+ "epoch": 0.34433285509325684,
88
+ "grad_norm": 41.09665846804977,
89
+ "learning_rate": 1.7454545454545453e-07,
90
  "logits/chosen": NaN,
91
  "logits/rejected": NaN,
92
+ "logps/chosen": -261.12969970703125,
93
+ "logps/rejected": -333.25665283203125,
94
+ "loss": 0.6433,
95
+ "rewards/accuracies": 0.453125,
96
+ "rewards/chosen": -0.05041093751788139,
97
+ "rewards/margins": 0.3066027760505676,
98
+ "rewards/rejected": -0.3569931983947754,
99
  "step": 30
100
  },
101
  {
102
+ "epoch": 0.4017216642754663,
103
+ "grad_norm": 47.66979279242685,
104
+ "learning_rate": 1.6848484848484848e-07,
105
  "logits/chosen": NaN,
106
  "logits/rejected": NaN,
107
+ "logps/chosen": -252.76406860351562,
108
+ "logps/rejected": -320.32342529296875,
109
+ "loss": 0.611,
110
+ "rewards/accuracies": 0.546875,
111
+ "rewards/chosen": -0.07408180087804794,
112
+ "rewards/margins": 0.735063910484314,
113
+ "rewards/rejected": -0.8097826838493347,
114
  "step": 35
115
  },
116
  {
117
+ "epoch": 0.45911047345767575,
118
+ "grad_norm": 44.453842871184655,
119
+ "learning_rate": 1.624242424242424e-07,
120
  "logits/chosen": NaN,
121
  "logits/rejected": NaN,
122
+ "logps/chosen": -282.52734375,
123
+ "logps/rejected": -314.4585876464844,
124
+ "loss": 0.6154,
125
+ "rewards/accuracies": 0.550000011920929,
126
+ "rewards/chosen": -0.08205080032348633,
127
+ "rewards/margins": 0.6658231616020203,
128
+ "rewards/rejected": -0.7472448348999023,
129
  "step": 40
130
  },
131
  {
132
+ "epoch": 0.5164992826398852,
133
+ "grad_norm": 72.40873860390725,
134
+ "learning_rate": 1.5636363636363637e-07,
135
  "logits/chosen": NaN,
136
+ "logits/rejected": NaN,
137
+ "logps/chosen": -246.61483764648438,
138
+ "logps/rejected": -324.8515625,
139
+ "loss": 0.6541,
140
+ "rewards/accuracies": 0.5562499761581421,
141
+ "rewards/chosen": -0.14140835404396057,
142
+ "rewards/margins": 0.6368468999862671,
143
+ "rewards/rejected": -0.7777351140975952,
144
  "step": 45
145
  },
146
  {
147
+ "epoch": 0.5738880918220947,
148
+ "grad_norm": 51.33061405888651,
149
+ "learning_rate": 1.503030303030303e-07,
150
  "logits/chosen": NaN,
151
+ "logits/rejected": -0.29754638671875,
152
+ "logps/chosen": -251.82656860351562,
153
+ "logps/rejected": -364.16094970703125,
154
+ "loss": 0.6061,
155
+ "rewards/accuracies": 0.5687500238418579,
156
+ "rewards/chosen": -0.097315214574337,
157
+ "rewards/margins": 0.7471939325332642,
158
+ "rewards/rejected": -0.8438205718994141,
159
  "step": 50
160
  },
161
  {
162
+ "epoch": 0.6312769010043041,
163
+ "grad_norm": 47.78546749711004,
164
+ "learning_rate": 1.4424242424242422e-07,
165
  "logits/chosen": NaN,
166
  "logits/rejected": NaN,
167
+ "logps/chosen": -229.6281280517578,
168
+ "logps/rejected": -346.20623779296875,
169
+ "loss": 0.5885,
170
+ "rewards/accuracies": 0.596875011920929,
171
+ "rewards/chosen": -0.08479080349206924,
172
+ "rewards/margins": 0.9593955874443054,
173
+ "rewards/rejected": -1.0437196493148804,
174
  "step": 55
175
  },
176
  {
177
+ "epoch": 0.6886657101865137,
178
+ "grad_norm": 32.88856633193689,
179
+ "learning_rate": 1.3818181818181818e-07,
180
  "logits/chosen": NaN,
181
  "logits/rejected": NaN,
182
+ "logps/chosen": -259.2515563964844,
183
+ "logps/rejected": -317.92498779296875,
184
+ "loss": 0.6109,
185
+ "rewards/accuracies": 0.640625,
186
+ "rewards/chosen": -0.09950466454029083,
187
+ "rewards/margins": 0.8476117849349976,
188
+ "rewards/rejected": -0.9464820623397827,
189
  "step": 60
190
  },
191
  {
192
+ "epoch": 0.7460545193687231,
193
+ "grad_norm": 46.246379021501845,
194
+ "learning_rate": 1.3212121212121213e-07,
195
  "logits/chosen": NaN,
196
  "logits/rejected": NaN,
197
+ "logps/chosen": -258.0960998535156,
198
+ "logps/rejected": -290.5296936035156,
199
+ "loss": 0.617,
200
+ "rewards/accuracies": 0.640625,
201
+ "rewards/chosen": -0.1353795975446701,
202
+ "rewards/margins": 0.7194949984550476,
203
+ "rewards/rejected": -0.8551372289657593,
204
  "step": 65
205
  },
206
  {
207
+ "epoch": 0.8034433285509326,
208
+ "grad_norm": 51.298985955922575,
209
+ "learning_rate": 1.2606060606060603e-07,
210
  "logits/chosen": NaN,
211
  "logits/rejected": NaN,
212
+ "logps/chosen": -290.3218688964844,
213
+ "logps/rejected": -378.1734313964844,
214
+ "loss": 0.5908,
215
+ "rewards/accuracies": 0.65625,
216
+ "rewards/chosen": -0.12958745658397675,
217
+ "rewards/margins": 0.8194991946220398,
218
+ "rewards/rejected": -0.9491798281669617,
219
  "step": 70
220
  },
221
  {
222
+ "epoch": 0.860832137733142,
223
+ "grad_norm": 51.37452979537066,
224
+ "learning_rate": 1.2e-07,
225
  "logits/chosen": NaN,
226
  "logits/rejected": NaN,
227
+ "logps/chosen": -272.2359313964844,
228
+ "logps/rejected": -461.9546813964844,
229
+ "loss": 0.6139,
230
+ "rewards/accuracies": 0.596875011920929,
231
+ "rewards/chosen": -0.08674906194210052,
232
+ "rewards/margins": 0.7904602289199829,
233
+ "rewards/rejected": -0.8770895004272461,
234
  "step": 75
235
  },
236
  {
237
+ "epoch": 0.9182209469153515,
238
+ "grad_norm": 49.12644933327405,
239
+ "learning_rate": 1.1393939393939393e-07,
240
  "logits/chosen": NaN,
241
+ "logits/rejected": NaN,
242
+ "logps/chosen": -271.92657470703125,
243
+ "logps/rejected": -385.4671936035156,
244
+ "loss": 0.5887,
245
+ "rewards/accuracies": 0.6343749761581421,
246
+ "rewards/chosen": -0.12120027840137482,
247
+ "rewards/margins": 0.9173402786254883,
248
+ "rewards/rejected": -1.0387518405914307,
249
  "step": 80
250
  },
251
  {
252
+ "epoch": 0.975609756097561,
253
+ "grad_norm": 78.67875740741145,
254
+ "learning_rate": 1.0787878787878789e-07,
255
  "logits/chosen": NaN,
256
  "logits/rejected": NaN,
257
+ "logps/chosen": -268.3734436035156,
258
+ "logps/rejected": -356.06561279296875,
259
+ "loss": 0.6094,
260
+ "rewards/accuracies": 0.621874988079071,
261
+ "rewards/chosen": -0.08560104668140411,
262
+ "rewards/margins": 0.7130492925643921,
263
+ "rewards/rejected": -0.7986106872558594,
264
  "step": 85
265
  },
266
  {
267
+ "epoch": 1.0329985652797704,
268
+ "grad_norm": 37.99397160982797,
269
+ "learning_rate": 1.018181818181818e-07,
270
  "logits/chosen": NaN,
271
+ "logits/rejected": -0.2726287841796875,
272
+ "logps/chosen": -278.57501220703125,
273
+ "logps/rejected": -375.9390563964844,
274
+ "loss": 0.5887,
275
+ "rewards/accuracies": 0.6499999761581421,
276
+ "rewards/chosen": -0.10773544013500214,
277
+ "rewards/margins": 0.7839363217353821,
278
+ "rewards/rejected": -0.892169177532196,
279
  "step": 90
280
  },
281
  {
282
+ "epoch": 1.0903873744619799,
283
+ "grad_norm": 38.13934723090496,
284
+ "learning_rate": 9.575757575757574e-08,
285
  "logits/chosen": NaN,
286
  "logits/rejected": NaN,
287
+ "logps/chosen": -287.20001220703125,
288
+ "logps/rejected": -351.8617248535156,
289
+ "loss": 0.5602,
290
+ "rewards/accuracies": 0.684374988079071,
291
+ "rewards/chosen": -0.09619579464197159,
292
+ "rewards/margins": 0.9326726794242859,
293
+ "rewards/rejected": -1.0286362171173096,
294
  "step": 95
295
  },
296
  {
297
+ "epoch": 1.1477761836441893,
298
+ "grad_norm": 59.66676077313003,
299
+ "learning_rate": 8.96969696969697e-08,
300
  "logits/chosen": NaN,
301
  "logits/rejected": NaN,
302
+ "logps/chosen": -253.2980499267578,
303
+ "logps/rejected": -323.6937561035156,
304
+ "loss": 0.5715,
305
+ "rewards/accuracies": 0.721875011920929,
306
+ "rewards/chosen": -0.09653882682323456,
307
+ "rewards/margins": 0.7721735239028931,
308
+ "rewards/rejected": -0.8687639236450195,
309
  "step": 100
310
  },
311
  {
312
+ "epoch": 1.2051649928263988,
313
+ "grad_norm": 40.181942906152656,
314
+ "learning_rate": 8.363636363636363e-08,
315
  "logits/chosen": NaN,
316
  "logits/rejected": NaN,
317
+ "logps/chosen": -269.91796875,
318
+ "logps/rejected": -342.6031188964844,
319
+ "loss": 0.5733,
320
+ "rewards/accuracies": 0.6781250238418579,
321
+ "rewards/chosen": -0.1024196594953537,
322
+ "rewards/margins": 0.6281814575195312,
323
+ "rewards/rejected": -0.7303474545478821,
324
  "step": 105
325
  },
326
  {
327
+ "epoch": 1.2625538020086085,
328
+ "grad_norm": 43.130194806696174,
329
+ "learning_rate": 7.757575757575757e-08,
330
  "logits/chosen": NaN,
331
+ "logits/rejected": -0.28910523653030396,
332
+ "logps/chosen": -256.4398498535156,
333
+ "logps/rejected": -315.8851623535156,
334
+ "loss": 0.5609,
335
+ "rewards/accuracies": 0.7406250238418579,
336
+ "rewards/chosen": -0.05760955810546875,
337
+ "rewards/margins": 0.8904060125350952,
338
+ "rewards/rejected": -0.948272705078125,
339
  "step": 110
340
  },
341
  {
342
+ "epoch": 1.3199426111908177,
343
+ "grad_norm": 40.187136674467965,
344
+ "learning_rate": 7.151515151515152e-08,
345
  "logits/chosen": NaN,
346
  "logits/rejected": NaN,
347
+ "logps/chosen": -286.8125,
348
+ "logps/rejected": -305.38751220703125,
349
+ "loss": 0.5859,
350
+ "rewards/accuracies": 0.684374988079071,
351
+ "rewards/chosen": -0.05495605617761612,
352
+ "rewards/margins": 0.7184921503067017,
353
+ "rewards/rejected": -0.7738761901855469,
354
  "step": 115
355
  },
356
  {
357
+ "epoch": 1.3773314203730274,
358
+ "grad_norm": 62.26850922502401,
359
+ "learning_rate": 6.545454545454545e-08,
360
  "logits/chosen": NaN,
361
+ "logits/rejected": NaN,
362
+ "logps/chosen": -281.8828125,
363
+ "logps/rejected": -348.609375,
364
+ "loss": 0.5641,
365
+ "rewards/accuracies": 0.715624988079071,
366
+ "rewards/chosen": -0.0843501091003418,
367
+ "rewards/margins": 0.8600600957870483,
368
+ "rewards/rejected": -0.9451843500137329,
369
  "step": 120
370
  },
371
  {
372
+ "epoch": 1.4347202295552366,
373
+ "grad_norm": 40.00896779584937,
374
+ "learning_rate": 5.93939393939394e-08,
375
  "logits/chosen": NaN,
376
  "logits/rejected": NaN,
377
+ "logps/chosen": -289.64959716796875,
378
+ "logps/rejected": -378.01873779296875,
379
+ "loss": 0.5644,
380
+ "rewards/accuracies": 0.7562500238418579,
381
+ "rewards/chosen": -0.06678199768066406,
382
+ "rewards/margins": 0.8301635980606079,
383
+ "rewards/rejected": -0.8972938656806946,
384
  "step": 125
385
  },
386
  {
387
+ "epoch": 1.4921090387374463,
388
+ "grad_norm": 33.42224905779865,
389
+ "learning_rate": 5.333333333333333e-08,
390
  "logits/chosen": NaN,
391
  "logits/rejected": NaN,
392
+ "logps/chosen": -228.01406860351562,
393
+ "logps/rejected": -385.5078125,
394
+ "loss": 0.5231,
395
  "rewards/accuracies": 0.7593749761581421,
396
+ "rewards/chosen": -0.06652259826660156,
397
+ "rewards/margins": 1.106951117515564,
398
+ "rewards/rejected": -1.173893690109253,
399
  "step": 130
400
  },
401
  {
402
+ "epoch": 1.5494978479196555,
403
+ "grad_norm": 35.377034170414966,
404
+ "learning_rate": 4.727272727272727e-08,
405
  "logits/chosen": NaN,
406
  "logits/rejected": NaN,
407
+ "logps/chosen": -276.4312438964844,
408
+ "logps/rejected": -374.94219970703125,
409
+ "loss": 0.5493,
410
+ "rewards/accuracies": 0.753125011920929,
411
+ "rewards/chosen": -0.07003593444824219,
412
+ "rewards/margins": 0.8382889032363892,
413
+ "rewards/rejected": -0.9086562991142273,
414
  "step": 135
415
  },
416
  {
417
+ "epoch": 1.6068866571018652,
418
+ "grad_norm": 42.98104883723141,
419
+ "learning_rate": 4.121212121212121e-08,
420
  "logits/chosen": NaN,
421
  "logits/rejected": NaN,
422
+ "logps/chosen": -245.8046875,
423
+ "logps/rejected": -332.34844970703125,
424
+ "loss": 0.5553,
425
+ "rewards/accuracies": 0.731249988079071,
426
+ "rewards/chosen": -0.09261999279260635,
427
+ "rewards/margins": 0.8664749264717102,
428
+ "rewards/rejected": -0.9593642950057983,
429
  "step": 140
430
  },
431
  {
432
+ "epoch": 1.6642754662840746,
433
+ "grad_norm": 41.194723091835506,
434
+ "learning_rate": 3.5151515151515146e-08,
435
  "logits/chosen": NaN,
436
+ "logits/rejected": NaN,
437
+ "logps/chosen": -271.9007873535156,
438
+ "logps/rejected": -372.2890625,
439
+ "loss": 0.576,
440
+ "rewards/accuracies": 0.703125,
441
+ "rewards/chosen": -0.10253047943115234,
442
+ "rewards/margins": 0.6510879397392273,
443
+ "rewards/rejected": -0.7538429498672485,
444
  "step": 145
445
  },
446
  {
447
+ "epoch": 1.721664275466284,
448
+ "grad_norm": 44.50624051602098,
449
+ "learning_rate": 2.9090909090909088e-08,
450
  "logits/chosen": NaN,
451
+ "logits/rejected": -0.3272903561592102,
452
+ "logps/chosen": -241.9031219482422,
453
+ "logps/rejected": -336.61407470703125,
454
+ "loss": 0.557,
455
+ "rewards/accuracies": 0.699999988079071,
456
+ "rewards/chosen": -0.09629325568675995,
457
+ "rewards/margins": 0.7580966949462891,
458
+ "rewards/rejected": -0.8545807003974915,
459
  "step": 150
460
  },
461
  {
462
+ "epoch": 1.7790530846484935,
463
+ "grad_norm": 35.414071145451,
464
+ "learning_rate": 2.303030303030303e-08,
465
  "logits/chosen": NaN,
466
  "logits/rejected": NaN,
467
+ "logps/chosen": -249.95938110351562,
468
+ "logps/rejected": -301.9765625,
469
+ "loss": 0.5675,
470
+ "rewards/accuracies": 0.778124988079071,
471
+ "rewards/chosen": -0.08613376319408417,
472
+ "rewards/margins": 0.8244568109512329,
473
+ "rewards/rejected": -0.9111496210098267,
474
  "step": 155
475
  },
476
  {
477
+ "epoch": 1.836441893830703,
478
+ "grad_norm": 38.330459518863954,
479
+ "learning_rate": 1.696969696969697e-08,
480
  "logits/chosen": NaN,
481
+ "logits/rejected": -0.393698126077652,
482
+ "logps/chosen": -262.72186279296875,
483
+ "logps/rejected": -394.54217529296875,
484
+ "loss": 0.5666,
485
+ "rewards/accuracies": 0.71875,
486
+ "rewards/chosen": -0.09622383117675781,
487
+ "rewards/margins": 0.8662067651748657,
488
+ "rewards/rejected": -0.9627344012260437,
489
  "step": 160
490
  },
491
  {
492
+ "epoch": 1.8938307030129125,
493
+ "grad_norm": 41.39148819542555,
494
+ "learning_rate": 1.0909090909090908e-08,
495
  "logits/chosen": NaN,
496
+ "logits/rejected": -0.3541931211948395,
497
+ "logps/chosen": -286.5625,
498
+ "logps/rejected": -391.0406188964844,
499
+ "loss": 0.5354,
500
+ "rewards/accuracies": 0.75,
501
+ "rewards/chosen": -0.07141885906457901,
502
+ "rewards/margins": 1.0486961603164673,
503
+ "rewards/rejected": -1.120294213294983,
504
  "step": 165
505
  },
506
  {
507
+ "epoch": 1.951219512195122,
508
+ "grad_norm": 39.2547263324904,
509
+ "learning_rate": 4.848484848484848e-09,
510
  "logits/chosen": NaN,
511
+ "logits/rejected": -0.39358216524124146,
512
+ "logps/chosen": -267.47265625,
513
+ "logps/rejected": -372.5625,
514
+ "loss": 0.5478,
515
+ "rewards/accuracies": 0.706250011920929,
516
+ "rewards/chosen": -0.05818195268511772,
517
+ "rewards/margins": 1.0078842639923096,
518
+ "rewards/rejected": -1.065637230873108,
519
  "step": 170
520
  },
521
  {
522
+ "epoch": 1.9971305595408895,
523
+ "step": 174,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  "total_flos": 0.0,
525
+ "train_loss": 0.5929731760901966,
526
+ "train_runtime": 2030.2211,
527
+ "train_samples_per_second": 5.493,
528
+ "train_steps_per_second": 0.086
529
  }
530
  ],
531
  "logging_steps": 5,
532
+ "max_steps": 174,
533
  "num_input_tokens_seen": 0,
534
  "num_train_epochs": 2,
535
  "save_steps": 50,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04392d39f00478df0417f7fd9ba7d15085f3dba381f41d035a42742e674b52ff
3
  size 7672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c63478df6f2cc0a8edc66dba8ca06784108c3576df8e676705056d7247719f8
3
  size 7672