ggbetz commited on
Commit
a41735b
·
verified ·
1 Parent(s): 3b3e313

Model save

Browse files
README.md CHANGED
@@ -26,7 +26,7 @@ print(output["generated_text"])
26
 
27
  ## Training procedure
28
 
29
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/8ze83nxy)
30
 
31
 
32
  This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/4qlrj3fp)
30
 
31
 
32
  This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.9917920656634747,
3
  "total_flos": 0.0,
4
- "train_loss": 0.5634572964448196,
5
- "train_runtime": 2180.3656,
6
- "train_samples": 5847,
7
- "train_samples_per_second": 5.363,
8
  "train_steps_per_second": 0.083
9
  }
 
1
  {
2
+ "epoch": 1.996219281663516,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.5393935610549618,
5
+ "train_runtime": 2398.307,
6
+ "train_samples": 6347,
7
+ "train_samples_per_second": 5.293,
8
  "train_steps_per_second": 0.083
9
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:809b40494237bab1dd817a75ff18ab2fb2a3c9578448b01143faca1e0d076763
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f807a55400211dc38fe9871d39bbdb3a1c49f8b79532d4313e1c899fe429e10
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ccc099adaf8b970638fb2c3e0bd2553e21c008d9b29600b8c7ece82589f4fa9
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8dd55c15ea1762fc4462e86e4d79d32fb9df2bf76c79e3c0e53edca6e2575d8
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:465f0940230790d7a7e95d05f2984afe6f813f177e5f094512b09847f0195f89
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:954125f0f6d5961aff8db190d00a9831a874a385c04dedda7d82359384d7233e
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fe633a860b536624a0beef90367bed6891825b7db79ad0e5c99fc57a31a532a
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4c9947d0ccbc72142d87c4ef362037fd555ee53092f6642a644678e9be9d61e
3
  size 1168138808
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.9917920656634747,
3
  "total_flos": 0.0,
4
- "train_loss": 0.5634572964448196,
5
- "train_runtime": 2180.3656,
6
- "train_samples": 5847,
7
- "train_samples_per_second": 5.363,
8
  "train_steps_per_second": 0.083
9
  }
 
1
  {
2
+ "epoch": 1.996219281663516,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.5393935610549618,
5
+ "train_runtime": 2398.307,
6
+ "train_samples": 6347,
7
+ "train_samples_per_second": 5.293,
8
  "train_steps_per_second": 0.083
9
  }
trainer_state.json CHANGED
@@ -1,565 +1,610 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.9917920656634747,
5
  "eval_steps": 500,
6
- "global_step": 182,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.05471956224350205,
13
- "grad_norm": 73.54892222222522,
14
  "learning_rate": 1e-07,
15
  "logits/chosen": NaN,
16
  "logits/rejected": NaN,
17
- "logps/chosen": -379.07501220703125,
18
- "logps/rejected": -433.9624938964844,
19
- "loss": 0.6968,
20
- "rewards/accuracies": 0.18125000596046448,
21
- "rewards/chosen": -0.005663680844008923,
22
- "rewards/margins": -0.008682822808623314,
23
- "rewards/rejected": 0.0030033111106604338,
24
  "step": 5
25
  },
26
  {
27
- "epoch": 0.1094391244870041,
28
- "grad_norm": 81.90895583785112,
29
  "learning_rate": 2e-07,
30
  "logits/chosen": NaN,
31
  "logits/rejected": NaN,
32
- "logps/chosen": -353.57342529296875,
33
- "logps/rejected": -377.31951904296875,
34
- "loss": 0.6873,
35
- "rewards/accuracies": 0.27812498807907104,
36
- "rewards/chosen": 0.004337215330451727,
37
- "rewards/margins": 0.012407870963215828,
38
- "rewards/rejected": -0.00807995442301035,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.16415868673050615,
43
- "grad_norm": 49.86519511858418,
44
- "learning_rate": 1.9418604651162788e-07,
45
  "logits/chosen": NaN,
46
  "logits/rejected": NaN,
47
- "logps/chosen": -287.00311279296875,
48
- "logps/rejected": -337.828125,
49
- "loss": 0.6843,
50
- "rewards/accuracies": 0.31562501192092896,
51
- "rewards/chosen": -0.005427980329841375,
52
- "rewards/margins": 0.02440509758889675,
53
- "rewards/rejected": -0.029846668243408203,
54
  "step": 15
55
  },
56
  {
57
- "epoch": 0.2188782489740082,
58
- "grad_norm": 61.602490673284805,
59
- "learning_rate": 1.883720930232558e-07,
60
  "logits/chosen": NaN,
61
  "logits/rejected": NaN,
62
- "logps/chosen": -344.2406311035156,
63
- "logps/rejected": -458.76251220703125,
64
- "loss": 0.6682,
65
- "rewards/accuracies": 0.37812501192092896,
66
- "rewards/chosen": -0.016874879598617554,
67
- "rewards/margins": 0.07516946643590927,
68
- "rewards/rejected": -0.09207276999950409,
69
  "step": 20
70
  },
71
  {
72
- "epoch": 0.27359781121751026,
73
- "grad_norm": 59.029725792223225,
74
- "learning_rate": 1.825581395348837e-07,
75
  "logits/chosen": NaN,
76
  "logits/rejected": NaN,
77
- "logps/chosen": -315.7281188964844,
78
- "logps/rejected": -453.23828125,
79
- "loss": 0.6414,
80
- "rewards/accuracies": 0.48124998807907104,
81
- "rewards/chosen": -0.011789416894316673,
82
- "rewards/margins": 0.16611814498901367,
83
- "rewards/rejected": -0.1779058277606964,
84
  "step": 25
85
  },
86
  {
87
- "epoch": 0.3283173734610123,
88
- "grad_norm": 47.84812908517649,
89
- "learning_rate": 1.7674418604651164e-07,
90
  "logits/chosen": NaN,
91
  "logits/rejected": NaN,
92
- "logps/chosen": -333.09686279296875,
93
- "logps/rejected": -409.4671936035156,
94
- "loss": 0.6205,
95
- "rewards/accuracies": 0.453125,
96
- "rewards/chosen": -0.0324886329472065,
97
- "rewards/margins": 0.23987388610839844,
98
- "rewards/rejected": -0.27254530787467957,
99
  "step": 30
100
  },
101
  {
102
- "epoch": 0.38303693570451436,
103
- "grad_norm": 52.08971649605601,
104
- "learning_rate": 1.7093023255813953e-07,
105
  "logits/chosen": NaN,
106
  "logits/rejected": NaN,
107
- "logps/chosen": -357.9859313964844,
108
- "logps/rejected": -379.1796875,
109
- "loss": 0.6341,
110
- "rewards/accuracies": 0.4593749940395355,
111
- "rewards/chosen": -0.037075091153383255,
112
- "rewards/margins": 0.252890020608902,
113
- "rewards/rejected": -0.28978270292282104,
114
  "step": 35
115
  },
116
  {
117
- "epoch": 0.4377564979480164,
118
- "grad_norm": 45.7725967987622,
119
- "learning_rate": 1.6511627906976744e-07,
120
  "logits/chosen": NaN,
121
  "logits/rejected": NaN,
122
- "logps/chosen": -323.4984436035156,
123
- "logps/rejected": -402.7093811035156,
124
- "loss": 0.6099,
125
- "rewards/accuracies": 0.503125011920929,
126
- "rewards/chosen": -0.049513887614011765,
127
- "rewards/margins": 0.4467551112174988,
128
- "rewards/rejected": -0.496664434671402,
129
  "step": 40
130
  },
131
  {
132
- "epoch": 0.49247606019151846,
133
- "grad_norm": 42.51194464232103,
134
- "learning_rate": 1.5930232558139534e-07,
135
  "logits/chosen": NaN,
136
- "logits/rejected": NaN,
137
- "logps/chosen": -297.24298095703125,
138
- "logps/rejected": -373.6734313964844,
139
- "loss": 0.598,
140
- "rewards/accuracies": 0.5249999761581421,
141
- "rewards/chosen": -0.05229806900024414,
142
- "rewards/margins": 0.4699579179286957,
143
- "rewards/rejected": -0.5220922231674194,
144
  "step": 45
145
  },
146
  {
147
- "epoch": 0.5471956224350205,
148
- "grad_norm": 44.14119188922749,
149
- "learning_rate": 1.5348837209302325e-07,
150
  "logits/chosen": NaN,
151
  "logits/rejected": NaN,
152
- "logps/chosen": -303.765625,
153
- "logps/rejected": -350.6812438964844,
154
- "loss": 0.6068,
155
- "rewards/accuracies": 0.550000011920929,
156
- "rewards/chosen": -0.028664493933320045,
157
- "rewards/margins": 0.45641660690307617,
158
- "rewards/rejected": -0.48496121168136597,
159
  "step": 50
160
  },
161
  {
162
- "epoch": 0.6019151846785226,
163
- "grad_norm": 45.64723368838045,
164
- "learning_rate": 1.4767441860465114e-07,
165
  "logits/chosen": NaN,
166
- "logits/rejected": -0.1685028076171875,
167
- "logps/chosen": -335.57501220703125,
168
- "logps/rejected": -403.3828125,
169
- "loss": 0.5663,
170
- "rewards/accuracies": 0.59375,
171
- "rewards/chosen": -0.03076200559735298,
172
- "rewards/margins": 0.744637668132782,
173
- "rewards/rejected": -0.7755187749862671,
174
  "step": 55
175
  },
176
  {
177
- "epoch": 0.6566347469220246,
178
- "grad_norm": 54.68823709160605,
179
- "learning_rate": 1.4186046511627906e-07,
180
  "logits/chosen": NaN,
181
- "logits/rejected": -0.2386222779750824,
182
- "logps/chosen": -349.6429748535156,
183
- "logps/rejected": -427.62188720703125,
184
- "loss": 0.5919,
185
- "rewards/accuracies": 0.6187499761581421,
186
- "rewards/chosen": -0.06507845222949982,
187
- "rewards/margins": 0.5661047101020813,
188
- "rewards/rejected": -0.6309539675712585,
189
  "step": 60
190
  },
191
  {
192
- "epoch": 0.7113543091655267,
193
- "grad_norm": 37.812337714738334,
194
- "learning_rate": 1.3604651162790698e-07,
195
  "logits/chosen": NaN,
196
  "logits/rejected": NaN,
197
- "logps/chosen": -391.65313720703125,
198
- "logps/rejected": -404.3343811035156,
199
- "loss": 0.5743,
200
- "rewards/accuracies": 0.5562499761581421,
201
- "rewards/chosen": -0.03319978713989258,
202
- "rewards/margins": 0.6765543222427368,
203
- "rewards/rejected": -0.7098339200019836,
204
  "step": 65
205
  },
206
  {
207
- "epoch": 0.7660738714090287,
208
- "grad_norm": 43.16858074168923,
209
- "learning_rate": 1.302325581395349e-07,
210
  "logits/chosen": NaN,
211
  "logits/rejected": NaN,
212
- "logps/chosen": -348.0687561035156,
213
- "logps/rejected": -420.6000061035156,
214
- "loss": 0.5578,
215
- "rewards/accuracies": 0.574999988079071,
216
- "rewards/chosen": -0.002912330674007535,
217
- "rewards/margins": 0.7157382965087891,
218
- "rewards/rejected": -0.7183740735054016,
219
  "step": 70
220
  },
221
  {
222
- "epoch": 0.8207934336525308,
223
- "grad_norm": 46.03417938024923,
224
- "learning_rate": 1.244186046511628e-07,
225
  "logits/chosen": NaN,
226
  "logits/rejected": NaN,
227
- "logps/chosen": -346.75,
228
- "logps/rejected": -368.25,
229
- "loss": 0.5502,
230
- "rewards/accuracies": 0.653124988079071,
231
- "rewards/chosen": -0.01859130896627903,
232
- "rewards/margins": 0.7938125729560852,
233
- "rewards/rejected": -0.8118951916694641,
234
  "step": 75
235
  },
236
  {
237
- "epoch": 0.8755129958960328,
238
- "grad_norm": 45.68840453931342,
239
- "learning_rate": 1.186046511627907e-07,
240
  "logits/chosen": NaN,
241
- "logits/rejected": NaN,
242
- "logps/chosen": -373.9671936035156,
243
- "logps/rejected": -470.2250061035156,
244
- "loss": 0.55,
245
- "rewards/accuracies": 0.609375,
246
- "rewards/chosen": -0.04471855238080025,
247
- "rewards/margins": 0.8136627078056335,
248
- "rewards/rejected": -0.8582122921943665,
249
  "step": 80
250
  },
251
  {
252
- "epoch": 0.9302325581395349,
253
- "grad_norm": 46.25513385846438,
254
- "learning_rate": 1.127906976744186e-07,
255
  "logits/chosen": NaN,
256
  "logits/rejected": NaN,
257
- "logps/chosen": -306.0453186035156,
258
- "logps/rejected": -382.3812561035156,
259
- "loss": 0.5732,
260
- "rewards/accuracies": 0.640625,
261
- "rewards/chosen": -0.0332220084965229,
262
- "rewards/margins": 0.7451133728027344,
263
- "rewards/rejected": -0.7781906127929688,
264
  "step": 85
265
  },
266
  {
267
- "epoch": 0.9849521203830369,
268
- "grad_norm": 43.05942874643264,
269
- "learning_rate": 1.069767441860465e-07,
270
  "logits/chosen": NaN,
271
  "logits/rejected": NaN,
272
- "logps/chosen": -339.6312561035156,
273
- "logps/rejected": -404.6328125,
274
- "loss": 0.5659,
275
- "rewards/accuracies": 0.612500011920929,
276
- "rewards/chosen": -0.06772689521312714,
277
- "rewards/margins": 0.7259882092475891,
278
- "rewards/rejected": -0.7933288812637329,
279
  "step": 90
280
  },
281
  {
282
- "epoch": 1.039671682626539,
283
- "grad_norm": 46.27154296619937,
284
- "learning_rate": 1.0116279069767442e-07,
285
  "logits/chosen": NaN,
286
  "logits/rejected": NaN,
287
- "logps/chosen": -338.83123779296875,
288
- "logps/rejected": -414.30938720703125,
289
- "loss": 0.5471,
290
- "rewards/accuracies": 0.6625000238418579,
291
- "rewards/chosen": -0.0271759033203125,
292
- "rewards/margins": 0.8508437871932983,
293
- "rewards/rejected": -0.8778969049453735,
294
  "step": 95
295
  },
296
  {
297
- "epoch": 1.094391244870041,
298
- "grad_norm": 37.78475339484884,
299
- "learning_rate": 9.534883720930232e-08,
300
  "logits/chosen": NaN,
301
  "logits/rejected": NaN,
302
- "logps/chosen": -331.1796875,
303
- "logps/rejected": -435.3179626464844,
304
- "loss": 0.5351,
305
- "rewards/accuracies": 0.6968749761581421,
306
- "rewards/chosen": -0.01332016009837389,
307
- "rewards/margins": 0.7190505862236023,
308
- "rewards/rejected": -0.7319396734237671,
309
  "step": 100
310
  },
311
  {
312
- "epoch": 1.1491108071135432,
313
- "grad_norm": 53.87671855674201,
314
- "learning_rate": 8.953488372093023e-08,
315
  "logits/chosen": NaN,
316
  "logits/rejected": NaN,
317
- "logps/chosen": -359.4312438964844,
318
- "logps/rejected": -363.98748779296875,
319
- "loss": 0.5338,
320
- "rewards/accuracies": 0.671875,
321
- "rewards/chosen": 0.01817016676068306,
322
- "rewards/margins": 0.8886383175849915,
323
- "rewards/rejected": -0.8702591061592102,
324
  "step": 105
325
  },
326
  {
327
- "epoch": 1.2038303693570451,
328
- "grad_norm": 37.1769031620794,
329
- "learning_rate": 8.372093023255815e-08,
330
  "logits/chosen": NaN,
331
  "logits/rejected": NaN,
332
- "logps/chosen": -320.11798095703125,
333
- "logps/rejected": -372.38751220703125,
334
- "loss": 0.5261,
335
  "rewards/accuracies": 0.706250011920929,
336
- "rewards/chosen": 0.015477180480957031,
337
- "rewards/margins": 0.9761615991592407,
338
- "rewards/rejected": -0.9608657956123352,
339
  "step": 110
340
  },
341
  {
342
- "epoch": 1.2585499316005473,
343
- "grad_norm": 42.09569803865549,
344
- "learning_rate": 7.790697674418605e-08,
345
  "logits/chosen": NaN,
346
  "logits/rejected": NaN,
347
- "logps/chosen": -358.69219970703125,
348
- "logps/rejected": -394.2437438964844,
349
- "loss": 0.5333,
350
- "rewards/accuracies": 0.703125,
351
- "rewards/chosen": -0.0025016784202307463,
352
- "rewards/margins": 0.9062668085098267,
353
- "rewards/rejected": -0.9096938967704773,
354
  "step": 115
355
  },
356
  {
357
- "epoch": 1.3132694938440492,
358
- "grad_norm": 36.918397640269696,
359
- "learning_rate": 7.209302325581394e-08,
360
  "logits/chosen": NaN,
361
- "logits/rejected": NaN,
362
- "logps/chosen": -322.95001220703125,
363
- "logps/rejected": -362.4125061035156,
364
- "loss": 0.5178,
365
- "rewards/accuracies": 0.6875,
366
- "rewards/chosen": 0.02965698204934597,
367
- "rewards/margins": 1.0709717273712158,
368
- "rewards/rejected": -1.0418059825897217,
369
  "step": 120
370
  },
371
  {
372
- "epoch": 1.3679890560875512,
373
- "grad_norm": 41.25871091693466,
374
- "learning_rate": 6.627906976744185e-08,
375
  "logits/chosen": NaN,
376
  "logits/rejected": NaN,
377
- "logps/chosen": -347.3968811035156,
378
- "logps/rejected": -478.6343688964844,
379
- "loss": 0.5056,
380
- "rewards/accuracies": 0.737500011920929,
381
- "rewards/chosen": 0.011506843380630016,
382
- "rewards/margins": 1.1476867198944092,
383
- "rewards/rejected": -1.135986328125,
384
  "step": 125
385
  },
386
  {
387
- "epoch": 1.4227086183310533,
388
- "grad_norm": 38.50680509320898,
389
- "learning_rate": 6.046511627906976e-08,
390
  "logits/chosen": NaN,
391
  "logits/rejected": NaN,
392
- "logps/chosen": -289.20623779296875,
393
- "logps/rejected": -385.5179748535156,
394
- "loss": 0.5277,
395
- "rewards/accuracies": 0.690625011920929,
396
- "rewards/chosen": 0.012903976254165173,
397
- "rewards/margins": 0.8722091913223267,
398
- "rewards/rejected": -0.8594962954521179,
399
  "step": 130
400
  },
401
  {
402
- "epoch": 1.4774281805745555,
403
- "grad_norm": 47.9748923473641,
404
- "learning_rate": 5.465116279069767e-08,
405
  "logits/chosen": NaN,
406
  "logits/rejected": NaN,
407
- "logps/chosen": -328.2406311035156,
408
- "logps/rejected": -399.08123779296875,
409
- "loss": 0.524,
410
- "rewards/accuracies": 0.715624988079071,
411
- "rewards/chosen": -0.015491103753447533,
412
- "rewards/margins": 1.0500564575195312,
413
- "rewards/rejected": -1.0659011602401733,
414
  "step": 135
415
  },
416
  {
417
- "epoch": 1.5321477428180574,
418
- "grad_norm": 40.692911236757624,
419
- "learning_rate": 4.883720930232558e-08,
420
  "logits/chosen": NaN,
421
  "logits/rejected": NaN,
422
- "logps/chosen": -374.1187438964844,
423
- "logps/rejected": -411.32501220703125,
424
- "loss": 0.4871,
425
- "rewards/accuracies": 0.7093750238418579,
426
- "rewards/chosen": 0.05074119567871094,
427
- "rewards/margins": 1.3334617614746094,
428
- "rewards/rejected": -1.2823364734649658,
429
  "step": 140
430
  },
431
  {
432
- "epoch": 1.5868673050615594,
433
- "grad_norm": 40.76205645702533,
434
- "learning_rate": 4.3023255813953484e-08,
435
  "logits/chosen": NaN,
436
- "logits/rejected": -0.15314331650733948,
437
- "logps/chosen": -364.9437561035156,
438
- "logps/rejected": -455.7124938964844,
439
- "loss": 0.4725,
440
- "rewards/accuracies": 0.765625,
441
- "rewards/chosen": 0.03847331926226616,
442
- "rewards/margins": 1.328369140625,
443
- "rewards/rejected": -1.2897827625274658,
444
  "step": 145
445
  },
446
  {
447
- "epoch": 1.6415868673050615,
448
- "grad_norm": 46.69293795008511,
449
- "learning_rate": 3.7209302325581396e-08,
450
  "logits/chosen": NaN,
451
  "logits/rejected": NaN,
452
- "logps/chosen": -338.5874938964844,
453
- "logps/rejected": -408.1656188964844,
454
- "loss": 0.5119,
455
- "rewards/accuracies": 0.7093750238418579,
456
- "rewards/chosen": 0.028449058532714844,
457
- "rewards/margins": 0.9527389407157898,
458
- "rewards/rejected": -0.9238342046737671,
459
  "step": 150
460
  },
461
  {
462
- "epoch": 1.6963064295485637,
463
- "grad_norm": 42.042163782049805,
464
- "learning_rate": 3.13953488372093e-08,
465
  "logits/chosen": NaN,
466
- "logits/rejected": -0.19841155409812927,
467
- "logps/chosen": -352.875,
468
- "logps/rejected": -418.7562561035156,
469
- "loss": 0.519,
470
- "rewards/accuracies": 0.721875011920929,
471
- "rewards/chosen": 0.008855698630213737,
472
- "rewards/margins": 1.1339629888534546,
473
- "rewards/rejected": -1.1251556873321533,
474
  "step": 155
475
  },
476
  {
477
- "epoch": 1.7510259917920656,
478
- "grad_norm": 45.03337894308017,
479
- "learning_rate": 2.5581395348837208e-08,
480
  "logits/chosen": NaN,
481
  "logits/rejected": NaN,
482
- "logps/chosen": -355.38751220703125,
483
- "logps/rejected": -383.7406311035156,
484
- "loss": 0.5215,
485
- "rewards/accuracies": 0.6968749761581421,
486
- "rewards/chosen": -0.0020130157936364412,
487
- "rewards/margins": 0.9867599606513977,
488
- "rewards/rejected": -0.9886184930801392,
489
  "step": 160
490
  },
491
  {
492
- "epoch": 1.8057455540355676,
493
- "grad_norm": 39.86335782741774,
494
- "learning_rate": 1.9767441860465116e-08,
495
  "logits/chosen": NaN,
496
  "logits/rejected": NaN,
497
- "logps/chosen": -328.53790283203125,
498
- "logps/rejected": -387.0218811035156,
499
- "loss": 0.5333,
500
- "rewards/accuracies": 0.6468750238418579,
501
- "rewards/chosen": -0.010569858364760876,
502
- "rewards/margins": 0.907672107219696,
503
- "rewards/rejected": -0.9181579351425171,
504
  "step": 165
505
  },
506
  {
507
- "epoch": 1.8604651162790697,
508
- "grad_norm": 44.87998150984873,
509
- "learning_rate": 1.3953488372093022e-08,
510
  "logits/chosen": NaN,
511
- "logits/rejected": -0.20475158095359802,
512
- "logps/chosen": -318.3793029785156,
513
- "logps/rejected": -401.95001220703125,
514
- "loss": 0.5183,
515
- "rewards/accuracies": 0.734375,
516
- "rewards/chosen": 0.05231628566980362,
517
- "rewards/margins": 0.9515264630317688,
518
- "rewards/rejected": -0.899548351764679,
519
  "step": 170
520
  },
521
  {
522
- "epoch": 1.915184678522572,
523
- "grad_norm": 44.78373504452225,
524
- "learning_rate": 8.139534883720931e-09,
525
  "logits/chosen": NaN,
526
  "logits/rejected": NaN,
527
- "logps/chosen": -362.09375,
528
- "logps/rejected": -410.56329345703125,
529
- "loss": 0.512,
530
- "rewards/accuracies": 0.6968749761581421,
531
- "rewards/chosen": -0.0063323974609375,
532
- "rewards/margins": 1.0748703479766846,
533
- "rewards/rejected": -1.0807387828826904,
534
  "step": 175
535
  },
536
  {
537
- "epoch": 1.9699042407660738,
538
- "grad_norm": 47.232513404054075,
539
- "learning_rate": 2.3255813953488372e-09,
540
  "logits/chosen": NaN,
541
  "logits/rejected": NaN,
542
- "logps/chosen": -326.29608154296875,
543
- "logps/rejected": -434.44061279296875,
544
- "loss": 0.5055,
545
- "rewards/accuracies": 0.731249988079071,
546
- "rewards/chosen": 0.015459060668945312,
547
- "rewards/margins": 1.0265671014785767,
548
- "rewards/rejected": -1.0105316638946533,
549
  "step": 180
550
  },
551
  {
552
- "epoch": 1.9917920656634747,
553
- "step": 182,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
  "total_flos": 0.0,
555
- "train_loss": 0.5634572964448196,
556
- "train_runtime": 2180.3656,
557
- "train_samples_per_second": 5.363,
558
  "train_steps_per_second": 0.083
559
  }
560
  ],
561
  "logging_steps": 5,
562
- "max_steps": 182,
563
  "num_input_tokens_seen": 0,
564
  "num_train_epochs": 2,
565
  "save_steps": 50,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.996219281663516,
5
  "eval_steps": 500,
6
+ "global_step": 198,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.05040957781978576,
13
+ "grad_norm": 108.41279315573027,
14
  "learning_rate": 1e-07,
15
  "logits/chosen": NaN,
16
  "logits/rejected": NaN,
17
+ "logps/chosen": -394.29998779296875,
18
+ "logps/rejected": -490.4984436035156,
19
+ "loss": 0.6973,
20
+ "rewards/accuracies": 0.12812499701976776,
21
+ "rewards/chosen": -0.0023020743392407894,
22
+ "rewards/margins": -0.009827613830566406,
23
+ "rewards/rejected": 0.007504081819206476,
24
  "step": 5
25
  },
26
  {
27
+ "epoch": 0.10081915563957151,
28
+ "grad_norm": 87.2429177419689,
29
  "learning_rate": 2e-07,
30
  "logits/chosen": NaN,
31
  "logits/rejected": NaN,
32
+ "logps/chosen": -374.5687561035156,
33
+ "logps/rejected": -397.4437561035156,
34
+ "loss": 0.6827,
35
+ "rewards/accuracies": 0.30937498807907104,
36
+ "rewards/chosen": 0.009921550750732422,
37
+ "rewards/margins": 0.029467200860381126,
38
+ "rewards/rejected": -0.01954820193350315,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.15122873345935728,
43
+ "grad_norm": 83.76505879098643,
44
+ "learning_rate": 1.946808510638298e-07,
45
  "logits/chosen": NaN,
46
  "logits/rejected": NaN,
47
+ "logps/chosen": -361.41485595703125,
48
+ "logps/rejected": -441.2671813964844,
49
+ "loss": 0.6832,
50
+ "rewards/accuracies": 0.3187499940395355,
51
+ "rewards/chosen": -0.0191789623349905,
52
+ "rewards/margins": 0.04044074937701225,
53
+ "rewards/rejected": -0.059579335153102875,
54
  "step": 15
55
  },
56
  {
57
+ "epoch": 0.20163831127914303,
58
+ "grad_norm": 55.583186387674,
59
+ "learning_rate": 1.8936170212765957e-07,
60
  "logits/chosen": NaN,
61
  "logits/rejected": NaN,
62
+ "logps/chosen": -373.5375061035156,
63
+ "logps/rejected": -464.6875,
64
+ "loss": 0.6233,
65
+ "rewards/accuracies": 0.4437499940395355,
66
+ "rewards/chosen": -0.021530818194150925,
67
+ "rewards/margins": 0.23837146162986755,
68
+ "rewards/rejected": -0.2597528398036957,
69
  "step": 20
70
  },
71
  {
72
+ "epoch": 0.2520478890989288,
73
+ "grad_norm": 60.717924564454115,
74
+ "learning_rate": 1.8404255319148937e-07,
75
  "logits/chosen": NaN,
76
  "logits/rejected": NaN,
77
+ "logps/chosen": -371.62188720703125,
78
+ "logps/rejected": -478.8374938964844,
79
+ "loss": 0.6328,
80
+ "rewards/accuracies": 0.3968749940395355,
81
+ "rewards/chosen": -0.04529209062457085,
82
+ "rewards/margins": 0.28455600142478943,
83
+ "rewards/rejected": -0.3299552798271179,
84
  "step": 25
85
  },
86
  {
87
+ "epoch": 0.30245746691871456,
88
+ "grad_norm": 58.667280799471335,
89
+ "learning_rate": 1.7872340425531914e-07,
90
  "logits/chosen": NaN,
91
  "logits/rejected": NaN,
92
+ "logps/chosen": -407.125,
93
+ "logps/rejected": -456.94061279296875,
94
+ "loss": 0.6037,
95
+ "rewards/accuracies": 0.4281249940395355,
96
+ "rewards/chosen": -0.06888346374034882,
97
+ "rewards/margins": 0.4735303819179535,
98
+ "rewards/rejected": -0.5425974726676941,
99
  "step": 30
100
  },
101
  {
102
+ "epoch": 0.35286704473850034,
103
+ "grad_norm": 52.677451346587404,
104
+ "learning_rate": 1.7340425531914892e-07,
105
  "logits/chosen": NaN,
106
  "logits/rejected": NaN,
107
+ "logps/chosen": -396.4683532714844,
108
+ "logps/rejected": -433.375,
109
+ "loss": 0.5971,
110
+ "rewards/accuracies": 0.515625,
111
+ "rewards/chosen": -0.10200033336877823,
112
+ "rewards/margins": 0.5540359616279602,
113
+ "rewards/rejected": -0.6558942794799805,
114
  "step": 35
115
  },
116
  {
117
+ "epoch": 0.40327662255828606,
118
+ "grad_norm": 61.023691806589966,
119
+ "learning_rate": 1.6808510638297872e-07,
120
  "logits/chosen": NaN,
121
  "logits/rejected": NaN,
122
+ "logps/chosen": -349.65545654296875,
123
+ "logps/rejected": -466.68438720703125,
124
+ "loss": 0.6065,
125
+ "rewards/accuracies": 0.578125,
126
+ "rewards/chosen": -0.10066480934619904,
127
+ "rewards/margins": 0.6059595346450806,
128
+ "rewards/rejected": -0.7062518000602722,
129
  "step": 40
130
  },
131
  {
132
+ "epoch": 0.45368620037807184,
133
+ "grad_norm": 50.9687221759888,
134
+ "learning_rate": 1.627659574468085e-07,
135
  "logits/chosen": NaN,
136
+ "logits/rejected": -0.1692344695329666,
137
+ "logps/chosen": -325.9609375,
138
+ "logps/rejected": -462.97186279296875,
139
+ "loss": 0.5598,
140
+ "rewards/accuracies": 0.6031249761581421,
141
+ "rewards/chosen": -0.09121231734752655,
142
+ "rewards/margins": 1.08050537109375,
143
+ "rewards/rejected": -1.1715847253799438,
144
  "step": 45
145
  },
146
  {
147
+ "epoch": 0.5040957781978576,
148
+ "grad_norm": 56.54625986325724,
149
+ "learning_rate": 1.574468085106383e-07,
150
  "logits/chosen": NaN,
151
  "logits/rejected": NaN,
152
+ "logps/chosen": -342.57501220703125,
153
+ "logps/rejected": -478.74139404296875,
154
+ "loss": 0.591,
155
+ "rewards/accuracies": 0.574999988079071,
156
+ "rewards/chosen": -0.10402297973632812,
157
+ "rewards/margins": 0.7732677459716797,
158
+ "rewards/rejected": -0.8772258758544922,
159
  "step": 50
160
  },
161
  {
162
+ "epoch": 0.5545053560176434,
163
+ "grad_norm": 52.007398326371984,
164
+ "learning_rate": 1.5212765957446807e-07,
165
  "logits/chosen": NaN,
166
+ "logits/rejected": NaN,
167
+ "logps/chosen": -381.3374938964844,
168
+ "logps/rejected": -456.14373779296875,
169
+ "loss": 0.5847,
170
+ "rewards/accuracies": 0.574999988079071,
171
+ "rewards/chosen": -0.08334217220544815,
172
+ "rewards/margins": 0.8058792352676392,
173
+ "rewards/rejected": -0.889452338218689,
174
  "step": 55
175
  },
176
  {
177
+ "epoch": 0.6049149338374291,
178
+ "grad_norm": 46.410753552087435,
179
+ "learning_rate": 1.4680851063829787e-07,
180
  "logits/chosen": NaN,
181
+ "logits/rejected": NaN,
182
+ "logps/chosen": -358.54998779296875,
183
+ "logps/rejected": -475.54998779296875,
184
+ "loss": 0.5703,
185
+ "rewards/accuracies": 0.596875011920929,
186
+ "rewards/chosen": -0.1341991424560547,
187
+ "rewards/margins": 1.1414505243301392,
188
+ "rewards/rejected": -1.27621328830719,
189
  "step": 60
190
  },
191
  {
192
+ "epoch": 0.6553245116572148,
193
+ "grad_norm": 62.601972259800355,
194
+ "learning_rate": 1.4148936170212768e-07,
195
  "logits/chosen": NaN,
196
  "logits/rejected": NaN,
197
+ "logps/chosen": -358.22265625,
198
+ "logps/rejected": -460.4937438964844,
199
+ "loss": 0.5561,
200
+ "rewards/accuracies": 0.612500011920929,
201
+ "rewards/chosen": -0.09829378128051758,
202
+ "rewards/margins": 1.154931664466858,
203
+ "rewards/rejected": -1.25310218334198,
204
  "step": 65
205
  },
206
  {
207
+ "epoch": 0.7057340894770007,
208
+ "grad_norm": 42.81446910830512,
209
+ "learning_rate": 1.3617021276595742e-07,
210
  "logits/chosen": NaN,
211
  "logits/rejected": NaN,
212
+ "logps/chosen": -350.8062438964844,
213
+ "logps/rejected": -479.98126220703125,
214
+ "loss": 0.5464,
215
+ "rewards/accuracies": 0.668749988079071,
216
+ "rewards/chosen": -0.08439864963293076,
217
+ "rewards/margins": 1.0637038946151733,
218
+ "rewards/rejected": -1.146966576576233,
219
  "step": 70
220
  },
221
  {
222
+ "epoch": 0.7561436672967864,
223
+ "grad_norm": 59.85465445918704,
224
+ "learning_rate": 1.3085106382978723e-07,
225
  "logits/chosen": NaN,
226
  "logits/rejected": NaN,
227
+ "logps/chosen": -339.3656311035156,
228
+ "logps/rejected": -428.65936279296875,
229
+ "loss": 0.5872,
230
+ "rewards/accuracies": 0.643750011920929,
231
+ "rewards/chosen": -0.11314620822668076,
232
+ "rewards/margins": 0.7480255365371704,
233
+ "rewards/rejected": -0.8610885739326477,
234
  "step": 75
235
  },
236
  {
237
+ "epoch": 0.8065532451165721,
238
+ "grad_norm": 77.89913196854276,
239
+ "learning_rate": 1.25531914893617e-07,
240
  "logits/chosen": NaN,
241
+ "logits/rejected": -0.232859805226326,
242
+ "logps/chosen": -367.71563720703125,
243
+ "logps/rejected": -496.9125061035156,
244
+ "loss": 0.5386,
245
+ "rewards/accuracies": 0.6968749761581421,
246
+ "rewards/chosen": -0.09202079474925995,
247
+ "rewards/margins": 1.1276824474334717,
248
+ "rewards/rejected": -1.2203514575958252,
249
  "step": 80
250
  },
251
  {
252
+ "epoch": 0.856962822936358,
253
+ "grad_norm": 51.576853346068596,
254
+ "learning_rate": 1.202127659574468e-07,
255
  "logits/chosen": NaN,
256
  "logits/rejected": NaN,
257
+ "logps/chosen": -352.01873779296875,
258
+ "logps/rejected": -441.85626220703125,
259
+ "loss": 0.5489,
260
+ "rewards/accuracies": 0.699999988079071,
261
+ "rewards/chosen": -0.09004707634449005,
262
+ "rewards/margins": 0.9977798461914062,
263
+ "rewards/rejected": -1.0878921747207642,
264
  "step": 85
265
  },
266
  {
267
+ "epoch": 0.9073724007561437,
268
+ "grad_norm": 51.91948881188928,
269
+ "learning_rate": 1.148936170212766e-07,
270
  "logits/chosen": NaN,
271
  "logits/rejected": NaN,
272
+ "logps/chosen": -391.98748779296875,
273
+ "logps/rejected": -501.89373779296875,
274
+ "loss": 0.5429,
275
+ "rewards/accuracies": 0.668749988079071,
276
+ "rewards/chosen": -0.13425922393798828,
277
+ "rewards/margins": 1.2032638788223267,
278
+ "rewards/rejected": -1.339324951171875,
279
  "step": 90
280
  },
281
  {
282
+ "epoch": 0.9577819785759294,
283
+ "grad_norm": 54.81194235391722,
284
+ "learning_rate": 1.0957446808510638e-07,
285
  "logits/chosen": NaN,
286
  "logits/rejected": NaN,
287
+ "logps/chosen": -380.8531188964844,
288
+ "logps/rejected": -485.4375,
289
+ "loss": 0.5506,
290
+ "rewards/accuracies": 0.6875,
291
+ "rewards/chosen": -0.123291015625,
292
+ "rewards/margins": 1.137838363647461,
293
+ "rewards/rejected": -1.2604999542236328,
294
  "step": 95
295
  },
296
  {
297
+ "epoch": 1.0081915563957151,
298
+ "grad_norm": 51.52692462343263,
299
+ "learning_rate": 1.0425531914893617e-07,
300
  "logits/chosen": NaN,
301
  "logits/rejected": NaN,
302
+ "logps/chosen": -357.92498779296875,
303
+ "logps/rejected": -420.00311279296875,
304
+ "loss": 0.5433,
305
+ "rewards/accuracies": 0.6875,
306
+ "rewards/chosen": -0.118899405002594,
307
+ "rewards/margins": 1.2265655994415283,
308
+ "rewards/rejected": -1.3462097644805908,
309
  "step": 100
310
  },
311
  {
312
+ "epoch": 1.0586011342155008,
313
+ "grad_norm": 48.23673648203366,
314
+ "learning_rate": 9.893617021276596e-08,
315
  "logits/chosen": NaN,
316
  "logits/rejected": NaN,
317
+ "logps/chosen": -420.59375,
318
+ "logps/rejected": -551.4156494140625,
319
+ "loss": 0.4711,
320
+ "rewards/accuracies": 0.715624988079071,
321
+ "rewards/chosen": -0.05655860900878906,
322
+ "rewards/margins": 1.65283203125,
323
+ "rewards/rejected": -1.7101104259490967,
324
  "step": 105
325
  },
326
  {
327
+ "epoch": 1.1090107120352868,
328
+ "grad_norm": 129.8626362436788,
329
+ "learning_rate": 9.361702127659574e-08,
330
  "logits/chosen": NaN,
331
  "logits/rejected": NaN,
332
+ "logps/chosen": -375.9306640625,
333
+ "logps/rejected": -410.8500061035156,
334
+ "loss": 0.5499,
335
  "rewards/accuracies": 0.706250011920929,
336
+ "rewards/chosen": -0.10003051906824112,
337
+ "rewards/margins": 0.8063720464706421,
338
+ "rewards/rejected": -0.906982421875,
339
  "step": 110
340
  },
341
  {
342
+ "epoch": 1.1594202898550725,
343
+ "grad_norm": 37.96460632662538,
344
+ "learning_rate": 8.829787234042553e-08,
345
  "logits/chosen": NaN,
346
  "logits/rejected": NaN,
347
+ "logps/chosen": -395.4546813964844,
348
+ "logps/rejected": -534.5437622070312,
349
+ "loss": 0.4536,
350
+ "rewards/accuracies": 0.7875000238418579,
351
+ "rewards/chosen": -0.0355035774409771,
352
+ "rewards/margins": 1.6931426525115967,
353
+ "rewards/rejected": -1.7283508777618408,
354
  "step": 115
355
  },
356
  {
357
+ "epoch": 1.2098298676748582,
358
+ "grad_norm": 41.75349141526311,
359
+ "learning_rate": 8.297872340425531e-08,
360
  "logits/chosen": NaN,
361
+ "logits/rejected": -0.24345549941062927,
362
+ "logps/chosen": -337.83905029296875,
363
+ "logps/rejected": -414.4765625,
364
+ "loss": 0.4953,
365
+ "rewards/accuracies": 0.796875,
366
+ "rewards/chosen": -0.09084253013134003,
367
+ "rewards/margins": 1.2055069208145142,
368
+ "rewards/rejected": -1.2963898181915283,
369
  "step": 120
370
  },
371
  {
372
+ "epoch": 1.260239445494644,
373
+ "grad_norm": 47.57568984542951,
374
+ "learning_rate": 7.76595744680851e-08,
375
  "logits/chosen": NaN,
376
  "logits/rejected": NaN,
377
+ "logps/chosen": -358.9046936035156,
378
+ "logps/rejected": -469.49688720703125,
379
+ "loss": 0.501,
380
+ "rewards/accuracies": 0.75,
381
+ "rewards/chosen": -0.09036216884851456,
382
+ "rewards/margins": 1.231683373451233,
383
+ "rewards/rejected": -1.3224579095840454,
384
  "step": 125
385
  },
386
  {
387
+ "epoch": 1.3106490233144297,
388
+ "grad_norm": 53.944830915941765,
389
+ "learning_rate": 7.23404255319149e-08,
390
  "logits/chosen": NaN,
391
  "logits/rejected": NaN,
392
+ "logps/chosen": -356.4375,
393
+ "logps/rejected": -489.125,
394
+ "loss": 0.4999,
395
+ "rewards/accuracies": 0.7593749761581421,
396
+ "rewards/chosen": -0.0903778076171875,
397
+ "rewards/margins": 1.2274360656738281,
398
+ "rewards/rejected": -1.3174560070037842,
399
  "step": 130
400
  },
401
  {
402
+ "epoch": 1.3610586011342156,
403
+ "grad_norm": 40.45839840494219,
404
+ "learning_rate": 6.702127659574469e-08,
405
  "logits/chosen": NaN,
406
  "logits/rejected": NaN,
407
+ "logps/chosen": -368.95623779296875,
408
+ "logps/rejected": -460.58282470703125,
409
+ "loss": 0.4856,
410
+ "rewards/accuracies": 0.746874988079071,
411
+ "rewards/chosen": -0.033612824976444244,
412
+ "rewards/margins": 1.4452941417694092,
413
+ "rewards/rejected": -1.479437232017517,
414
  "step": 135
415
  },
416
  {
417
+ "epoch": 1.4114681789540013,
418
+ "grad_norm": 43.63902778407327,
419
+ "learning_rate": 6.170212765957446e-08,
420
  "logits/chosen": NaN,
421
  "logits/rejected": NaN,
422
+ "logps/chosen": -393.62030029296875,
423
+ "logps/rejected": -473.53436279296875,
424
+ "loss": 0.4987,
425
+ "rewards/accuracies": 0.734375,
426
+ "rewards/chosen": -0.03872375562787056,
427
+ "rewards/margins": 1.239990234375,
428
+ "rewards/rejected": -1.2792266607284546,
429
  "step": 140
430
  },
431
  {
432
+ "epoch": 1.461877756773787,
433
+ "grad_norm": 40.93200179183777,
434
+ "learning_rate": 5.638297872340425e-08,
435
  "logits/chosen": NaN,
436
+ "logits/rejected": -0.28594666719436646,
437
+ "logps/chosen": -370.62188720703125,
438
+ "logps/rejected": -508.2562561035156,
439
+ "loss": 0.4891,
440
+ "rewards/accuracies": 0.7250000238418579,
441
+ "rewards/chosen": -0.047638703137636185,
442
+ "rewards/margins": 1.3118622303009033,
443
+ "rewards/rejected": -1.359655737876892,
444
  "step": 145
445
  },
446
  {
447
+ "epoch": 1.5122873345935728,
448
+ "grad_norm": 58.65309693932161,
449
+ "learning_rate": 5.106382978723404e-08,
450
  "logits/chosen": NaN,
451
  "logits/rejected": NaN,
452
+ "logps/chosen": -347.8609313964844,
453
+ "logps/rejected": -473.4750061035156,
454
+ "loss": 0.4814,
455
+ "rewards/accuracies": 0.762499988079071,
456
+ "rewards/chosen": -0.05366211012005806,
457
+ "rewards/margins": 1.2643524408340454,
458
+ "rewards/rejected": -1.3187682628631592,
459
  "step": 150
460
  },
461
  {
462
+ "epoch": 1.5626969124133585,
463
+ "grad_norm": 40.5990127283572,
464
+ "learning_rate": 4.5744680851063826e-08,
465
  "logits/chosen": NaN,
466
+ "logits/rejected": NaN,
467
+ "logps/chosen": -377.1734313964844,
468
+ "logps/rejected": -453.3296813964844,
469
+ "loss": 0.4998,
470
+ "rewards/accuracies": 0.753125011920929,
471
+ "rewards/chosen": -0.07384242862462997,
472
+ "rewards/margins": 1.2074543237686157,
473
+ "rewards/rejected": -1.2816162109375,
474
  "step": 155
475
  },
476
  {
477
+ "epoch": 1.6131064902331445,
478
+ "grad_norm": 32.42610640703847,
479
+ "learning_rate": 4.0425531914893614e-08,
480
  "logits/chosen": NaN,
481
  "logits/rejected": NaN,
482
+ "logps/chosen": -330.2578125,
483
+ "logps/rejected": -398.8812561035156,
484
+ "loss": 0.471,
485
+ "rewards/accuracies": 0.7906249761581421,
486
+ "rewards/chosen": -0.01856536790728569,
487
+ "rewards/margins": 1.3712249994277954,
488
+ "rewards/rejected": -1.3899352550506592,
489
  "step": 160
490
  },
491
  {
492
+ "epoch": 1.66351606805293,
493
+ "grad_norm": 41.01530072372472,
494
+ "learning_rate": 3.51063829787234e-08,
495
  "logits/chosen": NaN,
496
  "logits/rejected": NaN,
497
+ "logps/chosen": -340.2749938964844,
498
+ "logps/rejected": -438.40313720703125,
499
+ "loss": 0.4842,
500
+ "rewards/accuracies": 0.737500011920929,
501
+ "rewards/chosen": -0.0492522232234478,
502
+ "rewards/margins": 1.4962584972381592,
503
+ "rewards/rejected": -1.5454528331756592,
504
  "step": 165
505
  },
506
  {
507
+ "epoch": 1.713925645872716,
508
+ "grad_norm": 40.64613349590343,
509
+ "learning_rate": 2.9787234042553187e-08,
510
  "logits/chosen": NaN,
511
+ "logits/rejected": NaN,
512
+ "logps/chosen": -348.5640563964844,
513
+ "logps/rejected": -449.84844970703125,
514
+ "loss": 0.4915,
515
+ "rewards/accuracies": 0.731249988079071,
516
+ "rewards/chosen": -0.07779388129711151,
517
+ "rewards/margins": 1.278845191001892,
518
+ "rewards/rejected": -1.356591820716858,
519
  "step": 170
520
  },
521
  {
522
+ "epoch": 1.7643352236925016,
523
+ "grad_norm": 41.86507518705149,
524
+ "learning_rate": 2.4468085106382976e-08,
525
  "logits/chosen": NaN,
526
  "logits/rejected": NaN,
527
+ "logps/chosen": -350.2046813964844,
528
+ "logps/rejected": -433.5687561035156,
529
+ "loss": 0.4755,
530
+ "rewards/accuracies": 0.8031250238418579,
531
+ "rewards/chosen": -0.027071380987763405,
532
+ "rewards/margins": 1.4462082386016846,
533
+ "rewards/rejected": -1.4738037586212158,
534
  "step": 175
535
  },
536
  {
537
+ "epoch": 1.8147448015122873,
538
+ "grad_norm": 46.67757830253006,
539
+ "learning_rate": 1.9148936170212764e-08,
540
  "logits/chosen": NaN,
541
  "logits/rejected": NaN,
542
+ "logps/chosen": -398.32501220703125,
543
+ "logps/rejected": -456.9437561035156,
544
+ "loss": 0.4807,
545
+ "rewards/accuracies": 0.746874988079071,
546
+ "rewards/chosen": -0.05414886400103569,
547
+ "rewards/margins": 1.2570632696151733,
548
+ "rewards/rejected": -1.310980200767517,
549
  "step": 180
550
  },
551
  {
552
+ "epoch": 1.865154379332073,
553
+ "grad_norm": 43.636875496682755,
554
+ "learning_rate": 1.3829787234042552e-08,
555
+ "logits/chosen": NaN,
556
+ "logits/rejected": -0.17527160048484802,
557
+ "logps/chosen": -373.87188720703125,
558
+ "logps/rejected": -535.796875,
559
+ "loss": 0.4376,
560
+ "rewards/accuracies": 0.796875,
561
+ "rewards/chosen": -0.026004791259765625,
562
+ "rewards/margins": 1.6229279041290283,
563
+ "rewards/rejected": -1.649743676185608,
564
+ "step": 185
565
+ },
566
+ {
567
+ "epoch": 1.9155639571518588,
568
+ "grad_norm": 41.753332033000945,
569
+ "learning_rate": 8.510638297872339e-09,
570
+ "logits/chosen": NaN,
571
+ "logits/rejected": NaN,
572
+ "logps/chosen": -360.0718688964844,
573
+ "logps/rejected": -468.2124938964844,
574
+ "loss": 0.4872,
575
+ "rewards/accuracies": 0.731249988079071,
576
+ "rewards/chosen": -0.11498375236988068,
577
+ "rewards/margins": 1.2304840087890625,
578
+ "rewards/rejected": -1.345800757408142,
579
+ "step": 190
580
+ },
581
+ {
582
+ "epoch": 1.9659735349716447,
583
+ "grad_norm": 43.377545693326894,
584
+ "learning_rate": 3.1914893617021273e-09,
585
+ "logits/chosen": NaN,
586
+ "logits/rejected": NaN,
587
+ "logps/chosen": -365.515625,
588
+ "logps/rejected": -478.56561279296875,
589
+ "loss": 0.4805,
590
+ "rewards/accuracies": 0.7718750238418579,
591
+ "rewards/chosen": -0.04032173007726669,
592
+ "rewards/margins": 1.3543853759765625,
593
+ "rewards/rejected": -1.39520263671875,
594
+ "step": 195
595
+ },
596
+ {
597
+ "epoch": 1.996219281663516,
598
+ "step": 198,
599
  "total_flos": 0.0,
600
+ "train_loss": 0.5393935610549618,
601
+ "train_runtime": 2398.307,
602
+ "train_samples_per_second": 5.293,
603
  "train_steps_per_second": 0.083
604
  }
605
  ],
606
  "logging_steps": 5,
607
+ "max_steps": 198,
608
  "num_input_tokens_seen": 0,
609
  "num_train_epochs": 2,
610
  "save_steps": 50,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8fdbbe2f1b3f3b85296b6baf64f326cf244b75414a339603d903d8f6b145663
3
  size 7672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04392d39f00478df0417f7fd9ba7d15085f3dba381f41d035a42742e674b52ff
3
  size 7672