yfliao commited on
Commit
3e91657
·
verified ·
1 Parent(s): 2f3998e

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/speechlabx/huggingface/runs/wv9fa7q3)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/speechlabx/huggingface/runs/6hrfnhwk)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.04100553294370786,
4
- "train_runtime": 8329.5214,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.9,
7
- "train_steps_per_second": 0.007
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.04285794010000496,
4
+ "train_runtime": 11336.4845,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.662,
7
+ "train_steps_per_second": 0.005
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1dc8670668bf49e9b95b5db7836b4028b4a53a65aa498880a009ae4ad34ae1ac
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9185ef8824f31f98137ea8afa0a0be60226fa11498b7aa06a548782154c62da2
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1825cfb7168099515cd7dab1c82ecfca113aa9a665683c84039987bd78c5b27d
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65c1f3e4e2659fad3c661a83c78c3a3a5a2b7ed97c0d32db8d1701cf0b34f105
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9c0e7eef7ddb36026815ab0aa3272da0aa29e40635b77567b24e0263ee02c38
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcc0fed3777fa9e2520d98fb836c49aa5fc2f568041062ee1a729ee225a61093
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8df3c77817acf9c80b96124a1b0a72964c9662a8ca23c25dc9d697b598ab2b02
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3352f63906588764a7aaf36fd13178ac8c112d8f1127bc82942f4a8e194782fc
3
  size 1089994880
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.04100553294370786,
4
- "train_runtime": 8329.5214,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.9,
7
- "train_steps_per_second": 0.007
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.04285794010000496,
4
+ "train_runtime": 11336.4845,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.662,
7
+ "train_steps_per_second": 0.005
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.9893390191897654,
5
- "eval_steps": 100,
6
  "global_step": 58,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
@@ -10,824 +10,839 @@
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
- "completion_length": 605.9933242797852,
14
  "epoch": 0.017057569296375266,
15
- "grad_norm": 0.46656012535095215,
16
  "kl": 0.0,
17
  "learning_rate": 5e-07,
18
- "loss": 0.0328,
19
- "reward": 0.6272321715950966,
20
- "reward_std": 0.3389472607523203,
21
- "rewards/accuracy_reward": 0.6272321715950966,
22
  "rewards/format_reward": 0.0,
23
  "step": 1
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
- "completion_length": 595.8381958007812,
28
  "epoch": 0.03411513859275053,
29
- "grad_norm": 0.40490883588790894,
30
  "kl": 0.0,
31
  "learning_rate": 1e-06,
32
- "loss": 0.029,
33
- "reward": 0.5859375260770321,
34
- "reward_std": 0.37348853051662445,
35
- "rewards/accuracy_reward": 0.5859375260770321,
36
  "rewards/format_reward": 0.0,
37
  "step": 2
38
  },
39
  {
40
  "clip_ratio": 0.0,
41
- "completion_length": 616.5044860839844,
42
  "epoch": 0.0511727078891258,
43
- "grad_norm": 0.5747449398040771,
44
- "kl": 0.00019681453704833984,
45
  "learning_rate": 1.5e-06,
46
- "loss": 0.0331,
47
- "reward": 0.6138393133878708,
48
- "reward_std": 0.34731038846075535,
49
- "rewards/accuracy_reward": 0.6138393133878708,
50
  "rewards/format_reward": 0.0,
51
  "step": 3
52
  },
53
  {
54
  "clip_ratio": 0.0,
55
- "completion_length": 593.1674423217773,
56
  "epoch": 0.06823027718550106,
57
- "grad_norm": 0.37437704205513,
58
- "kl": 0.00048792362213134766,
59
  "learning_rate": 2e-06,
60
- "loss": -0.003,
61
- "reward": 0.5725446753203869,
62
- "reward_std": 0.35837166383862495,
63
- "rewards/accuracy_reward": 0.5725446753203869,
64
  "rewards/format_reward": 0.0,
65
  "step": 4
66
  },
67
  {
68
  "clip_ratio": 0.0,
69
- "completion_length": 595.5591735839844,
70
  "epoch": 0.08528784648187633,
71
- "grad_norm": 0.4537689983844757,
72
- "kl": 0.000583648681640625,
73
  "learning_rate": 2.5e-06,
74
- "loss": 0.038,
75
- "reward": 0.604910746216774,
76
- "reward_std": 0.3501625321805477,
77
- "rewards/accuracy_reward": 0.604910746216774,
78
  "rewards/format_reward": 0.0,
79
  "step": 5
80
  },
81
  {
82
  "clip_ratio": 0.0,
83
- "completion_length": 601.1138725280762,
84
  "epoch": 0.1023454157782516,
85
- "grad_norm": 2.8121519088745117,
86
- "kl": 0.002495288848876953,
87
  "learning_rate": 3e-06,
88
- "loss": 0.051,
89
- "reward": 0.6339286044239998,
90
- "reward_std": 0.28945007361471653,
91
- "rewards/accuracy_reward": 0.632812537252903,
92
- "rewards/format_reward": 0.0011160714784637094,
93
  "step": 6
94
  },
95
  {
96
  "clip_ratio": 0.0,
97
- "completion_length": 619.041316986084,
98
  "epoch": 0.11940298507462686,
99
- "grad_norm": 0.6353834867477417,
100
- "kl": 0.0027112960815429688,
101
  "learning_rate": 2.9972633313349763e-06,
102
- "loss": 0.0694,
103
- "reward": 0.6618303805589676,
104
- "reward_std": 0.30959521792829037,
105
- "rewards/accuracy_reward": 0.6618303805589676,
106
  "rewards/format_reward": 0.0,
107
  "step": 7
108
  },
109
  {
110
  "clip_ratio": 0.0,
111
- "completion_length": 630.1774749755859,
112
  "epoch": 0.13646055437100213,
113
- "grad_norm": 0.5084631443023682,
114
- "kl": 0.003963470458984375,
115
  "learning_rate": 2.989063311147081e-06,
116
- "loss": 0.0362,
117
- "reward": 0.6584821715950966,
118
- "reward_std": 0.31060847640037537,
119
- "rewards/accuracy_reward": 0.6584821715950966,
120
  "rewards/format_reward": 0.0,
121
  "step": 8
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
- "completion_length": 605.6975860595703,
126
  "epoch": 0.1535181236673774,
127
- "grad_norm": 0.3929789066314697,
128
- "kl": 0.007457733154296875,
129
  "learning_rate": 2.9754298604207156e-06,
130
- "loss": 0.0784,
131
- "reward": 0.7433036044239998,
132
- "reward_std": 0.28443033434450626,
133
- "rewards/accuracy_reward": 0.7433036044239998,
134
  "rewards/format_reward": 0.0,
135
  "step": 9
136
  },
137
  {
138
  "clip_ratio": 0.0,
139
- "completion_length": 614.2846221923828,
140
  "epoch": 0.17057569296375266,
141
- "grad_norm": 0.46036890149116516,
142
- "kl": 0.020414352416992188,
143
  "learning_rate": 2.956412726139078e-06,
144
- "loss": 0.0579,
145
- "reward": 0.7031250223517418,
146
- "reward_std": 0.26031214371323586,
147
- "rewards/accuracy_reward": 0.7031250223517418,
148
  "rewards/format_reward": 0.0,
149
  "step": 10
150
  },
151
  {
152
  "clip_ratio": 0.0,
153
- "completion_length": 648.0703506469727,
154
  "epoch": 0.18763326226012794,
155
- "grad_norm": 0.40724363923072815,
156
- "kl": 0.003391265869140625,
157
  "learning_rate": 2.9320812997628183e-06,
158
- "loss": 0.057,
159
- "reward": 0.729910746216774,
160
- "reward_std": 0.23080609552562237,
161
- "rewards/accuracy_reward": 0.729910746216774,
162
  "rewards/format_reward": 0.0,
163
  "step": 11
164
  },
165
  {
166
  "clip_ratio": 0.0,
167
- "completion_length": 572.5647506713867,
168
  "epoch": 0.2046908315565032,
169
- "grad_norm": 0.3498363792896271,
170
- "kl": 0.0052776336669921875,
171
  "learning_rate": 2.9025243640281224e-06,
172
- "loss": 0.0529,
173
- "reward": 0.8069196864962578,
174
- "reward_std": 0.22050337865948677,
175
- "rewards/accuracy_reward": 0.8069196864962578,
176
  "rewards/format_reward": 0.0,
177
  "step": 12
178
  },
179
  {
180
  "clip_ratio": 0.0,
181
- "completion_length": 588.2254791259766,
182
  "epoch": 0.22174840085287847,
183
- "grad_norm": 0.5149727463722229,
184
- "kl": 0.0054779052734375,
185
  "learning_rate": 2.8678497689881355e-06,
186
- "loss": 0.0505,
187
- "reward": 0.7767857536673546,
188
- "reward_std": 0.1721354192122817,
189
- "rewards/accuracy_reward": 0.7767857536673546,
190
  "rewards/format_reward": 0.0,
191
  "step": 13
192
  },
193
  {
194
  "clip_ratio": 0.0,
195
- "completion_length": 619.4620895385742,
196
  "epoch": 0.23880597014925373,
197
- "grad_norm": 0.18762321770191193,
198
- "kl": 0.004208564758300781,
199
  "learning_rate": 2.8281840384798147e-06,
200
- "loss": 0.0662,
201
- "reward": 0.7488839626312256,
202
- "reward_std": 0.20387782901525497,
203
- "rewards/accuracy_reward": 0.7488839626312256,
204
  "rewards/format_reward": 0.0,
205
  "step": 14
206
  },
207
  {
208
  "clip_ratio": 0.0,
209
- "completion_length": 595.3817291259766,
210
  "epoch": 0.255863539445629,
211
- "grad_norm": 0.641958475112915,
212
- "kl": 0.004180908203125,
213
  "learning_rate": 2.7836719084521715e-06,
214
- "loss": 0.0681,
215
- "reward": 0.7667411118745804,
216
- "reward_std": 0.2224541725590825,
217
- "rewards/accuracy_reward": 0.7667411118745804,
218
  "rewards/format_reward": 0.0,
219
  "step": 15
220
  },
221
  {
222
  "clip_ratio": 0.0,
223
- "completion_length": 583.5859680175781,
224
  "epoch": 0.27292110874200426,
225
- "grad_norm": 0.741455078125,
226
- "kl": 0.0067882537841796875,
227
  "learning_rate": 2.7344757988404844e-06,
228
- "loss": 0.0656,
229
- "reward": 0.7901786118745804,
230
- "reward_std": 0.21803472563624382,
231
- "rewards/accuracy_reward": 0.7901786118745804,
232
  "rewards/format_reward": 0.0,
233
  "step": 16
234
  },
235
  {
236
  "clip_ratio": 0.0,
237
- "completion_length": 597.1685638427734,
238
  "epoch": 0.2899786780383795,
239
- "grad_norm": 0.23518237471580505,
240
- "kl": 0.003398895263671875,
241
  "learning_rate": 2.680775220913575e-06,
242
- "loss": 0.0435,
243
- "reward": 0.7645089700818062,
244
- "reward_std": 0.17900951812043786,
245
- "rewards/accuracy_reward": 0.7645089700818062,
246
  "rewards/format_reward": 0.0,
247
  "step": 17
248
  },
249
  {
250
  "clip_ratio": 0.0,
251
- "completion_length": 612.630615234375,
252
  "epoch": 0.3070362473347548,
253
- "grad_norm": 0.27309614419937134,
254
- "kl": 0.0040035247802734375,
255
  "learning_rate": 2.6227661222566517e-06,
256
- "loss": 0.055,
257
- "reward": 0.7399553954601288,
258
- "reward_std": 0.2160194292664528,
259
- "rewards/accuracy_reward": 0.7399553954601288,
260
  "rewards/format_reward": 0.0,
261
  "step": 18
262
  },
263
  {
264
  "clip_ratio": 0.0,
265
- "completion_length": 596.2109603881836,
266
  "epoch": 0.32409381663113007,
267
- "grad_norm": 0.1339673399925232,
268
- "kl": 0.0028057098388671875,
269
  "learning_rate": 2.5606601717798212e-06,
270
- "loss": 0.0425,
271
- "reward": 0.7700893059372902,
272
- "reward_std": 0.19220922887325287,
273
- "rewards/accuracy_reward": 0.7700893059372902,
274
  "rewards/format_reward": 0.0,
275
  "step": 19
276
  },
277
  {
278
  "clip_ratio": 0.0,
279
- "completion_length": 552.1049461364746,
280
  "epoch": 0.3411513859275053,
281
- "grad_norm": 0.4360298216342926,
282
- "kl": 0.0038690567016601562,
283
  "learning_rate": 2.4946839873611927e-06,
284
- "loss": 0.0449,
285
- "reward": 0.7912946790456772,
286
- "reward_std": 0.19673781003803015,
287
- "rewards/accuracy_reward": 0.7912946790456772,
288
  "rewards/format_reward": 0.0,
289
  "step": 20
290
  },
291
  {
292
  "clip_ratio": 0.0,
293
- "completion_length": 606.3794937133789,
294
  "epoch": 0.3582089552238806,
295
- "grad_norm": 0.27116331458091736,
296
- "kl": 0.0029468536376953125,
297
  "learning_rate": 2.425078308942815e-06,
298
- "loss": 0.0265,
299
- "reward": 0.7734375298023224,
300
- "reward_std": 0.17621066141873598,
301
- "rewards/accuracy_reward": 0.7734375298023224,
302
  "rewards/format_reward": 0.0,
303
  "step": 21
304
  },
305
  {
306
  "clip_ratio": 0.0,
307
- "completion_length": 585.0826187133789,
308
  "epoch": 0.3752665245202559,
309
- "grad_norm": 0.15653851628303528,
310
- "kl": 0.0028667449951171875,
311
  "learning_rate": 2.3520971200967337e-06,
312
- "loss": 0.0473,
313
- "reward": 0.8035714700818062,
314
- "reward_std": 0.16157799493521452,
315
- "rewards/accuracy_reward": 0.8035714700818062,
316
  "rewards/format_reward": 0.0,
317
  "step": 22
318
  },
319
  {
320
  "clip_ratio": 0.0,
321
- "completion_length": 596.0055999755859,
322
  "epoch": 0.39232409381663114,
323
- "grad_norm": 0.1592758744955063,
324
- "kl": 0.0041027069091796875,
325
  "learning_rate": 2.276006721266485e-06,
326
- "loss": 0.0449,
327
- "reward": 0.7779018208384514,
328
- "reward_std": 0.1884950390085578,
329
- "rewards/accuracy_reward": 0.7779018208384514,
330
  "rewards/format_reward": 0.0,
331
  "step": 23
332
  },
333
  {
334
  "clip_ratio": 0.0,
335
- "completion_length": 592.358283996582,
336
  "epoch": 0.4093816631130064,
337
- "grad_norm": 0.7523362040519714,
338
- "kl": 0.0069179534912109375,
339
  "learning_rate": 2.1970847580656528e-06,
340
- "loss": 0.0609,
341
- "reward": 0.7511161044239998,
342
- "reward_std": 0.19366461411118507,
343
- "rewards/accuracy_reward": 0.7511161044239998,
344
  "rewards/format_reward": 0.0,
345
  "step": 24
346
  },
347
  {
348
  "clip_ratio": 0.0,
349
- "completion_length": 584.0781593322754,
350
  "epoch": 0.42643923240938164,
351
- "grad_norm": 0.1614408791065216,
352
- "kl": 0.0040912628173828125,
353
  "learning_rate": 2.1156192081791355e-06,
354
- "loss": 0.0412,
355
- "reward": 0.731026828289032,
356
- "reward_std": 0.19054656103253365,
357
- "rewards/accuracy_reward": 0.731026828289032,
358
  "rewards/format_reward": 0.0,
359
  "step": 25
360
  },
361
  {
362
  "clip_ratio": 0.0,
363
- "completion_length": 586.0480117797852,
364
  "epoch": 0.44349680170575695,
365
- "grad_norm": 0.14655889570713043,
366
- "kl": 0.0051975250244140625,
367
  "learning_rate": 2.0319073305638034e-06,
368
- "loss": 0.0398,
369
  "reward": 0.753348246216774,
370
- "reward_std": 0.19846755359321833,
371
  "rewards/accuracy_reward": 0.753348246216774,
372
  "rewards/format_reward": 0.0,
373
  "step": 26
374
  },
375
  {
376
  "clip_ratio": 0.0,
377
- "completion_length": 583.4933319091797,
378
  "epoch": 0.4605543710021322,
379
- "grad_norm": 0.2977466285228729,
380
- "kl": 0.0040130615234375,
381
  "learning_rate": 1.9462545807828044e-06,
382
- "loss": 0.0443,
383
- "reward": 0.761160746216774,
384
- "reward_std": 0.17427107319235802,
385
- "rewards/accuracy_reward": 0.761160746216774,
386
  "rewards/format_reward": 0.0,
387
  "step": 27
388
  },
389
  {
390
  "clip_ratio": 0.0,
391
- "completion_length": 565.6919822692871,
392
  "epoch": 0.47761194029850745,
393
- "grad_norm": 0.3453241288661957,
394
- "kl": 0.0048007965087890625,
395
  "learning_rate": 1.8589734964313368e-06,
396
- "loss": 0.0312,
397
- "reward": 0.7801339700818062,
398
- "reward_std": 0.17806075559929013,
399
- "rewards/accuracy_reward": 0.7801339700818062,
400
  "rewards/format_reward": 0.0,
401
  "step": 28
402
  },
403
  {
404
  "clip_ratio": 0.0,
405
- "completion_length": 608.4219055175781,
406
  "epoch": 0.4946695095948827,
407
- "grad_norm": 0.34829476475715637,
408
- "kl": 0.0038909912109375,
409
  "learning_rate": 1.7703825567208588e-06,
410
- "loss": 0.0357,
411
- "reward": 0.7455357536673546,
412
- "reward_std": 0.1893458729609847,
413
- "rewards/accuracy_reward": 0.7455357536673546,
414
  "rewards/format_reward": 0.0,
415
  "step": 29
416
  },
417
  {
418
  "clip_ratio": 0.0,
419
- "completion_length": 638.1015930175781,
420
  "epoch": 0.511727078891258,
421
- "grad_norm": 0.8564308881759644,
422
- "kl": 0.0047130584716796875,
423
  "learning_rate": 1.6808050203829845e-06,
424
- "loss": 0.0605,
425
- "reward": 0.7120536044239998,
426
- "reward_std": 0.1940593123435974,
427
- "rewards/accuracy_reward": 0.7120536044239998,
428
  "rewards/format_reward": 0.0,
429
  "step": 30
430
  },
431
  {
432
  "clip_ratio": 0.0,
433
- "completion_length": 595.8727874755859,
434
  "epoch": 0.5287846481876333,
435
- "grad_norm": 0.4144252836704254,
436
- "kl": 0.0044384002685546875,
437
  "learning_rate": 1.5905677461334292e-06,
438
- "loss": 0.0462,
439
- "reward": 0.7343750298023224,
440
- "reward_std": 0.19265135563910007,
441
- "rewards/accuracy_reward": 0.7343750298023224,
442
  "rewards/format_reward": 0.0,
443
  "step": 31
444
  },
445
  {
446
  "clip_ratio": 0.0,
447
- "completion_length": 583.022331237793,
448
  "epoch": 0.5458422174840085,
449
- "grad_norm": 0.2915865480899811,
450
- "kl": 0.0038242340087890625,
451
  "learning_rate": 1.5e-06,
452
- "loss": 0.0366,
453
- "reward": 0.7656250298023224,
454
- "reward_std": 0.1911288918927312,
455
- "rewards/accuracy_reward": 0.7656250298023224,
456
  "rewards/format_reward": 0.0,
457
  "step": 32
458
  },
459
  {
460
  "clip_ratio": 0.0,
461
- "completion_length": 597.7399787902832,
462
  "epoch": 0.5628997867803838,
463
- "grad_norm": 0.12201324105262756,
464
- "kl": 0.0044403076171875,
465
  "learning_rate": 1.4094322538665708e-06,
466
- "loss": 0.0375,
467
- "reward": 0.7366071790456772,
468
- "reward_std": 0.16133728623390198,
469
- "rewards/accuracy_reward": 0.7366071790456772,
470
  "rewards/format_reward": 0.0,
471
  "step": 33
472
  },
473
  {
474
  "clip_ratio": 0.0,
475
- "completion_length": 589.4151954650879,
476
  "epoch": 0.579957356076759,
477
- "grad_norm": 0.16991916298866272,
478
- "kl": 0.0041408538818359375,
479
  "learning_rate": 1.3191949796170155e-06,
480
- "loss": 0.0318,
481
- "reward": 0.7321428954601288,
482
- "reward_std": 0.16871801298111677,
483
- "rewards/accuracy_reward": 0.7321428954601288,
484
  "rewards/format_reward": 0.0,
485
  "step": 34
486
  },
487
  {
488
  "clip_ratio": 0.0,
489
- "completion_length": 625.5457954406738,
490
  "epoch": 0.5970149253731343,
491
- "grad_norm": 0.18359360098838806,
492
- "kl": 0.0037403106689453125,
493
  "learning_rate": 1.2296174432791415e-06,
494
- "loss": 0.0318,
495
- "reward": 0.7488839775323868,
496
- "reward_std": 0.16130364406853914,
497
- "rewards/accuracy_reward": 0.7488839775323868,
498
  "rewards/format_reward": 0.0,
499
  "step": 35
500
  },
501
  {
502
  "clip_ratio": 0.0,
503
- "completion_length": 558.873908996582,
504
  "epoch": 0.6140724946695096,
505
- "grad_norm": 0.16755032539367676,
506
- "kl": 0.0041866302490234375,
507
  "learning_rate": 1.141026503568664e-06,
508
- "loss": 0.0348,
509
- "reward": 0.750000037252903,
510
- "reward_std": 0.15614807326346636,
511
- "rewards/accuracy_reward": 0.750000037252903,
512
  "rewards/format_reward": 0.0,
513
  "step": 36
514
  },
515
  {
516
  "clip_ratio": 0.0,
517
- "completion_length": 577.9743461608887,
518
  "epoch": 0.6311300639658849,
519
- "grad_norm": 0.22190368175506592,
520
- "kl": 0.0046405792236328125,
521
  "learning_rate": 1.0537454192171958e-06,
522
- "loss": 0.0402,
523
- "reward": 0.8002232536673546,
524
- "reward_std": 0.17436057049781084,
525
- "rewards/accuracy_reward": 0.8002232536673546,
526
  "rewards/format_reward": 0.0,
527
  "step": 37
528
  },
529
  {
530
  "clip_ratio": 0.0,
531
- "completion_length": 581.0558319091797,
532
  "epoch": 0.6481876332622601,
533
- "grad_norm": 0.1611102819442749,
534
- "kl": 0.00406646728515625,
535
  "learning_rate": 9.680926694361964e-07,
536
- "loss": 0.0365,
537
- "reward": 0.7678571715950966,
538
- "reward_std": 0.15295173227787018,
539
- "rewards/accuracy_reward": 0.7678571715950966,
540
  "rewards/format_reward": 0.0,
541
  "step": 38
542
  },
543
  {
544
  "clip_ratio": 0.0,
545
- "completion_length": 554.3147659301758,
546
  "epoch": 0.6652452025586354,
547
- "grad_norm": 0.6986324787139893,
548
- "kl": 0.0059719085693359375,
549
  "learning_rate": 8.843807918208651e-07,
550
- "loss": 0.0387,
551
- "reward": 0.801339328289032,
552
- "reward_std": 0.16045559756457806,
553
- "rewards/accuracy_reward": 0.801339328289032,
554
  "rewards/format_reward": 0.0,
555
  "step": 39
556
  },
557
  {
558
  "clip_ratio": 0.0,
559
- "completion_length": 628.6562652587891,
560
  "epoch": 0.6823027718550106,
561
- "grad_norm": 0.2882983386516571,
562
- "kl": 0.0050792694091796875,
563
  "learning_rate": 8.029152419343472e-07,
564
- "loss": 0.0423,
565
- "reward": 0.7388393208384514,
566
- "reward_std": 0.18197357282042503,
567
- "rewards/accuracy_reward": 0.7388393208384514,
568
  "rewards/format_reward": 0.0,
569
  "step": 40
570
  },
571
  {
572
  "clip_ratio": 0.0,
573
- "completion_length": 604.989990234375,
574
  "epoch": 0.6993603411513859,
575
- "grad_norm": 0.24079355597496033,
576
- "kl": 0.004535675048828125,
577
  "learning_rate": 7.239932787335147e-07,
578
- "loss": 0.0428,
579
- "reward": 0.746651828289032,
580
- "reward_std": 0.21638327650725842,
581
- "rewards/accuracy_reward": 0.746651828289032,
582
  "rewards/format_reward": 0.0,
583
  "step": 41
584
  },
585
  {
586
  "clip_ratio": 0.0,
587
- "completion_length": 588.8884124755859,
588
  "epoch": 0.7164179104477612,
589
- "grad_norm": 0.1857738345861435,
590
- "kl": 0.0051898956298828125,
591
  "learning_rate": 6.479028799032664e-07,
592
- "loss": 0.0348,
593
  "reward": 0.7667410969734192,
594
- "reward_std": 0.1482915743254125,
595
  "rewards/accuracy_reward": 0.7667410969734192,
596
  "rewards/format_reward": 0.0,
597
  "step": 42
598
  },
599
  {
600
  "clip_ratio": 0.0,
601
- "completion_length": 633.1930999755859,
602
  "epoch": 0.7334754797441365,
603
- "grad_norm": 0.12053947895765305,
604
- "kl": 0.004100799560546875,
605
  "learning_rate": 5.749216910571854e-07,
606
- "loss": 0.0326,
607
- "reward": 0.7578125298023224,
608
- "reward_std": 0.18650216422975063,
609
- "rewards/accuracy_reward": 0.7578125298023224,
610
  "rewards/format_reward": 0.0,
611
  "step": 43
612
  },
613
  {
614
  "clip_ratio": 0.0,
615
- "completion_length": 591.989990234375,
616
  "epoch": 0.7505330490405118,
617
- "grad_norm": 0.2785487473011017,
618
- "kl": 0.0066280364990234375,
619
  "learning_rate": 5.053160126388076e-07,
620
- "loss": 0.0232,
621
- "reward": 0.7243303880095482,
622
- "reward_std": 0.16931413393467665,
623
- "rewards/accuracy_reward": 0.7243303880095482,
624
  "rewards/format_reward": 0.0,
625
  "step": 44
626
  },
627
  {
628
  "clip_ratio": 0.0,
629
- "completion_length": 564.8236808776855,
630
  "epoch": 0.767590618336887,
631
- "grad_norm": 0.20789861679077148,
632
- "kl": 0.00405120849609375,
633
  "learning_rate": 4.3933982822017883e-07,
634
- "loss": 0.0318,
635
- "reward": 0.7935268133878708,
636
- "reward_std": 0.15123321348801255,
637
- "rewards/accuracy_reward": 0.7935268133878708,
638
  "rewards/format_reward": 0.0,
639
  "step": 45
640
  },
641
  {
642
  "clip_ratio": 0.0,
643
- "completion_length": 609.4509048461914,
644
  "epoch": 0.7846481876332623,
645
- "grad_norm": 0.5048084855079651,
646
- "kl": 0.0051288604736328125,
647
  "learning_rate": 3.772338777433482e-07,
648
- "loss": 0.0484,
649
- "reward": 0.7388393133878708,
650
- "reward_std": 0.19954787380993366,
651
- "rewards/accuracy_reward": 0.7388393133878708,
652
  "rewards/format_reward": 0.0,
653
  "step": 46
654
  },
655
  {
656
  "clip_ratio": 0.0,
657
- "completion_length": 591.3649749755859,
658
  "epoch": 0.8017057569296375,
659
- "grad_norm": 0.1130928322672844,
660
- "kl": 0.00443267822265625,
661
  "learning_rate": 3.192247790864249e-07,
662
- "loss": 0.0368,
663
- "reward": 0.7533482536673546,
664
- "reward_std": 0.20039592683315277,
665
- "rewards/accuracy_reward": 0.7533482536673546,
666
  "rewards/format_reward": 0.0,
667
  "step": 47
668
  },
669
  {
670
  "clip_ratio": 0.0,
671
- "completion_length": 579.0134162902832,
672
  "epoch": 0.8187633262260128,
673
- "grad_norm": 0.14524802565574646,
674
- "kl": 0.0044956207275390625,
675
  "learning_rate": 2.6552420115951547e-07,
676
- "loss": 0.0361,
677
- "reward": 0.7845982536673546,
678
- "reward_std": 0.1610517231747508,
679
- "rewards/accuracy_reward": 0.7845982536673546,
680
  "rewards/format_reward": 0.0,
681
  "step": 48
682
  },
683
  {
684
  "clip_ratio": 0.0,
685
- "completion_length": 614.7578506469727,
686
  "epoch": 0.835820895522388,
687
- "grad_norm": 0.1255880445241928,
688
- "kl": 0.00421142578125,
689
  "learning_rate": 2.163280915478289e-07,
690
- "loss": 0.0287,
691
- "reward": 0.7544643133878708,
692
- "reward_std": 0.17125373054295778,
693
- "rewards/accuracy_reward": 0.7544643133878708,
694
  "rewards/format_reward": 0.0,
695
  "step": 49
696
  },
697
  {
698
  "clip_ratio": 0.0,
699
- "completion_length": 601.2723503112793,
700
  "epoch": 0.8528784648187633,
701
- "grad_norm": 0.1761648952960968,
702
- "kl": 0.00411224365234375,
703
  "learning_rate": 1.718159615201853e-07,
704
- "loss": 0.0408,
705
- "reward": 0.7388393208384514,
706
- "reward_std": 0.1724097654223442,
707
- "rewards/accuracy_reward": 0.7388393208384514,
708
  "rewards/format_reward": 0.0,
709
  "step": 50
710
  },
711
  {
712
  "clip_ratio": 0.0,
713
- "completion_length": 568.6573905944824,
714
  "epoch": 0.8699360341151386,
715
- "grad_norm": 0.2946619391441345,
716
- "kl": 0.004871368408203125,
717
  "learning_rate": 1.321502310118649e-07,
718
- "loss": 0.0445,
719
- "reward": 0.805803619325161,
720
- "reward_std": 0.16564481239765882,
721
- "rewards/accuracy_reward": 0.805803619325161,
722
  "rewards/format_reward": 0.0,
723
  "step": 51
724
  },
725
  {
726
  "clip_ratio": 0.0,
727
- "completion_length": 621.2734756469727,
728
  "epoch": 0.8869936034115139,
729
- "grad_norm": 0.11113651096820831,
730
- "kl": 0.003757476806640625,
731
  "learning_rate": 9.747563597187792e-08,
732
- "loss": 0.029,
733
- "reward": 0.7589286044239998,
734
- "reward_std": 0.20852677430957556,
735
- "rewards/accuracy_reward": 0.7589286044239998,
736
  "rewards/format_reward": 0.0,
737
  "step": 52
738
  },
739
  {
740
  "clip_ratio": 0.0,
741
- "completion_length": 591.5591697692871,
742
  "epoch": 0.9040511727078892,
743
- "grad_norm": 0.10602231323719025,
744
- "kl": 0.0047321319580078125,
745
  "learning_rate": 6.791870023718161e-08,
746
- "loss": 0.0328,
747
- "reward": 0.718750037252903,
748
- "reward_std": 0.16462033987045288,
749
- "rewards/accuracy_reward": 0.718750037252903,
750
  "rewards/format_reward": 0.0,
751
  "step": 53
752
  },
753
  {
754
  "clip_ratio": 0.0,
755
- "completion_length": 569.8493576049805,
756
  "epoch": 0.9211087420042644,
757
- "grad_norm": 0.17479702830314636,
758
- "kl": 0.0049591064453125,
759
  "learning_rate": 4.358727386092198e-08,
760
- "loss": 0.025,
761
- "reward": 0.7957589700818062,
762
- "reward_std": 0.19463301822543144,
763
- "rewards/accuracy_reward": 0.7957589700818062,
764
  "rewards/format_reward": 0.0,
765
  "step": 54
766
  },
767
  {
768
  "clip_ratio": 0.0,
769
- "completion_length": 610.9352951049805,
770
  "epoch": 0.9381663113006397,
771
- "grad_norm": 0.12260417640209198,
772
- "kl": 0.0051116943359375,
773
  "learning_rate": 2.4570139579284723e-08,
774
- "loss": 0.033,
775
- "reward": 0.7979911118745804,
776
- "reward_std": 0.1830651182681322,
777
- "rewards/accuracy_reward": 0.7979911118745804,
778
  "rewards/format_reward": 0.0,
779
  "step": 55
780
  },
781
  {
782
  "clip_ratio": 0.0,
783
- "completion_length": 577.7366218566895,
784
  "epoch": 0.9552238805970149,
785
- "grad_norm": 0.28994134068489075,
786
- "kl": 0.0058689117431640625,
787
  "learning_rate": 1.093668885291904e-08,
788
- "loss": 0.0189,
789
- "reward": 0.7857143208384514,
790
- "reward_std": 0.1605983767658472,
791
- "rewards/accuracy_reward": 0.7857143208384514,
792
  "rewards/format_reward": 0.0,
793
  "step": 56
794
  },
795
  {
796
  "clip_ratio": 0.0,
797
- "completion_length": 588.8772659301758,
798
  "epoch": 0.9722814498933902,
799
- "grad_norm": 0.16521920263767242,
800
- "kl": 0.00461578369140625,
801
  "learning_rate": 2.736668665023756e-09,
802
- "loss": 0.0413,
803
- "reward": 0.7723214626312256,
804
- "reward_std": 0.19243028853088617,
805
- "rewards/accuracy_reward": 0.7723214626312256,
806
  "rewards/format_reward": 0.0,
807
  "step": 57
808
  },
809
  {
810
  "clip_ratio": 0.0,
811
- "completion_length": 586.1794052124023,
812
  "epoch": 0.9893390191897654,
813
- "grad_norm": 0.31878846883773804,
814
- "kl": 0.004428863525390625,
815
  "learning_rate": 0.0,
816
- "loss": 0.0198,
817
- "reward": 0.764508955180645,
818
- "reward_std": 0.15518809761852026,
819
- "rewards/accuracy_reward": 0.764508955180645,
820
  "rewards/format_reward": 0.0,
821
  "step": 58
822
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
823
  {
824
  "epoch": 0.9893390191897654,
825
  "step": 58,
826
  "total_flos": 0.0,
827
- "train_loss": 0.04100553294370786,
828
- "train_runtime": 8329.5214,
829
- "train_samples_per_second": 0.9,
830
- "train_steps_per_second": 0.007
831
  }
832
  ],
833
  "logging_steps": 1,
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.9893390191897654,
5
+ "eval_steps": 10000,
6
  "global_step": 58,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
 
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
+ "completion_length": 604.5960006713867,
14
  "epoch": 0.017057569296375266,
15
+ "grad_norm": 0.4511799216270447,
16
  "kl": 0.0,
17
  "learning_rate": 5e-07,
18
+ "loss": 0.0272,
19
+ "reward": 0.6261160969734192,
20
+ "reward_std": 0.3447213862091303,
21
+ "rewards/accuracy_reward": 0.6261160969734192,
22
  "rewards/format_reward": 0.0,
23
  "step": 1
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
+ "completion_length": 602.7522583007812,
28
  "epoch": 0.03411513859275053,
29
+ "grad_norm": 0.3016141951084137,
30
  "kl": 0.0,
31
  "learning_rate": 1e-06,
32
+ "loss": 0.0351,
33
+ "reward": 0.6026785969734192,
34
+ "reward_std": 0.35665313154459,
35
+ "rewards/accuracy_reward": 0.6026785969734192,
36
  "rewards/format_reward": 0.0,
37
  "step": 2
38
  },
39
  {
40
  "clip_ratio": 0.0,
41
+ "completion_length": 631.8515930175781,
42
  "epoch": 0.0511727078891258,
43
+ "grad_norm": 0.3726074695587158,
44
+ "kl": 0.00022530555725097656,
45
  "learning_rate": 1.5e-06,
46
+ "loss": 0.0455,
47
+ "reward": 0.6149553880095482,
48
+ "reward_std": 0.35506617091596127,
49
+ "rewards/accuracy_reward": 0.6149553880095482,
50
  "rewards/format_reward": 0.0,
51
  "step": 3
52
  },
53
  {
54
  "clip_ratio": 0.0,
55
+ "completion_length": 596.8404312133789,
56
  "epoch": 0.06823027718550106,
57
+ "grad_norm": 0.7917911410331726,
58
+ "kl": 0.0002683401107788086,
59
  "learning_rate": 2e-06,
60
+ "loss": 0.0196,
61
+ "reward": 0.564732164144516,
62
+ "reward_std": 0.35563987866044044,
63
+ "rewards/accuracy_reward": 0.564732164144516,
64
  "rewards/format_reward": 0.0,
65
  "step": 4
66
  },
67
  {
68
  "clip_ratio": 0.0,
69
+ "completion_length": 590.5893135070801,
70
  "epoch": 0.08528784648187633,
71
+ "grad_norm": 0.5213038325309753,
72
+ "kl": 0.0002919435501098633,
73
  "learning_rate": 2.5e-06,
74
+ "loss": 0.007,
75
+ "reward": 0.640625037252903,
76
+ "reward_std": 0.3181990496814251,
77
+ "rewards/accuracy_reward": 0.640625037252903,
78
  "rewards/format_reward": 0.0,
79
  "step": 5
80
  },
81
  {
82
  "clip_ratio": 0.0,
83
+ "completion_length": 586.8504676818848,
84
  "epoch": 0.1023454157782516,
85
+ "grad_norm": 0.333626925945282,
86
+ "kl": 0.0005509853363037109,
87
  "learning_rate": 3e-06,
88
+ "loss": 0.03,
89
+ "reward": 0.6406250298023224,
90
+ "reward_std": 0.3159851096570492,
91
+ "rewards/accuracy_reward": 0.6406250298023224,
92
+ "rewards/format_reward": 0.0,
93
  "step": 6
94
  },
95
  {
96
  "clip_ratio": 0.0,
97
+ "completion_length": 611.2210083007812,
98
  "epoch": 0.11940298507462686,
99
+ "grad_norm": 0.33720001578330994,
100
+ "kl": 0.0010228157043457031,
101
  "learning_rate": 2.9972633313349763e-06,
102
+ "loss": 0.0505,
103
+ "reward": 0.6774553880095482,
104
+ "reward_std": 0.31844254955649376,
105
+ "rewards/accuracy_reward": 0.6774553880095482,
106
  "rewards/format_reward": 0.0,
107
  "step": 7
108
  },
109
  {
110
  "clip_ratio": 0.0,
111
+ "completion_length": 633.0658798217773,
112
  "epoch": 0.13646055437100213,
113
+ "grad_norm": 40.491729736328125,
114
+ "kl": 0.023244857788085938,
115
  "learning_rate": 2.989063311147081e-06,
116
+ "loss": 0.0587,
117
+ "reward": 0.6495535969734192,
118
+ "reward_std": 0.30738127417862415,
119
+ "rewards/accuracy_reward": 0.6495535969734192,
120
  "rewards/format_reward": 0.0,
121
  "step": 8
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
+ "completion_length": 596.904052734375,
126
  "epoch": 0.1535181236673774,
127
+ "grad_norm": 191.003173828125,
128
+ "kl": 0.11961555480957031,
129
  "learning_rate": 2.9754298604207156e-06,
130
+ "loss": 0.0912,
131
+ "reward": 0.7299107387661934,
132
+ "reward_std": 0.3143673036247492,
133
+ "rewards/accuracy_reward": 0.7299107387661934,
134
  "rewards/format_reward": 0.0,
135
  "step": 9
136
  },
137
  {
138
  "clip_ratio": 0.0,
139
+ "completion_length": 620.3705596923828,
140
  "epoch": 0.17057569296375266,
141
+ "grad_norm": 0.40953710675239563,
142
+ "kl": 0.0033578872680664062,
143
  "learning_rate": 2.956412726139078e-06,
144
+ "loss": 0.083,
145
+ "reward": 0.7209821864962578,
146
+ "reward_std": 0.2721347473561764,
147
+ "rewards/accuracy_reward": 0.7209821864962578,
148
  "rewards/format_reward": 0.0,
149
  "step": 10
150
  },
151
  {
152
  "clip_ratio": 0.0,
153
+ "completion_length": 652.9877548217773,
154
  "epoch": 0.18763326226012794,
155
+ "grad_norm": 0.19379274547100067,
156
+ "kl": 0.003864288330078125,
157
  "learning_rate": 2.9320812997628183e-06,
158
+ "loss": 0.0712,
159
+ "reward": 0.7087053880095482,
160
+ "reward_std": 0.26535856537520885,
161
+ "rewards/accuracy_reward": 0.7087053880095482,
162
  "rewards/format_reward": 0.0,
163
  "step": 11
164
  },
165
  {
166
  "clip_ratio": 0.0,
167
+ "completion_length": 577.8091888427734,
168
  "epoch": 0.2046908315565032,
169
+ "grad_norm": 0.7543983459472656,
170
+ "kl": 0.0048828125,
171
  "learning_rate": 2.9025243640281224e-06,
172
+ "loss": 0.0758,
173
+ "reward": 0.7946428954601288,
174
+ "reward_std": 0.19575820118188858,
175
+ "rewards/accuracy_reward": 0.7946428954601288,
176
  "rewards/format_reward": 0.0,
177
  "step": 12
178
  },
179
  {
180
  "clip_ratio": 0.0,
181
+ "completion_length": 605.303596496582,
182
  "epoch": 0.22174840085287847,
183
+ "grad_norm": 1.7850338220596313,
184
+ "kl": 0.005886077880859375,
185
  "learning_rate": 2.8678497689881355e-06,
186
+ "loss": 0.062,
187
+ "reward": 0.7589286118745804,
188
+ "reward_std": 0.2227060990408063,
189
+ "rewards/accuracy_reward": 0.7589286118745804,
190
  "rewards/format_reward": 0.0,
191
  "step": 13
192
  },
193
  {
194
  "clip_ratio": 0.0,
195
+ "completion_length": 609.9096221923828,
196
  "epoch": 0.23880597014925373,
197
+ "grad_norm": 0.2924557030200958,
198
+ "kl": 0.0028257369995117188,
199
  "learning_rate": 2.8281840384798147e-06,
200
+ "loss": 0.0644,
201
+ "reward": 0.7366071715950966,
202
+ "reward_std": 0.20327048748731613,
203
+ "rewards/accuracy_reward": 0.7366071715950966,
204
  "rewards/format_reward": 0.0,
205
  "step": 14
206
  },
207
  {
208
  "clip_ratio": 0.0,
209
+ "completion_length": 607.3292617797852,
210
  "epoch": 0.255863539445629,
211
+ "grad_norm": 0.19465771317481995,
212
+ "kl": 0.0043182373046875,
213
  "learning_rate": 2.7836719084521715e-06,
214
+ "loss": 0.0529,
215
+ "reward": 0.773437537252903,
216
+ "reward_std": 0.2134528523311019,
217
+ "rewards/accuracy_reward": 0.773437537252903,
218
  "rewards/format_reward": 0.0,
219
  "step": 15
220
  },
221
  {
222
  "clip_ratio": 0.0,
223
+ "completion_length": 582.2745819091797,
224
  "epoch": 0.27292110874200426,
225
+ "grad_norm": 0.19785651564598083,
226
+ "kl": 0.00447845458984375,
227
  "learning_rate": 2.7344757988404844e-06,
228
+ "loss": 0.0635,
229
+ "reward": 0.792410746216774,
230
+ "reward_std": 0.2150930892676115,
231
+ "rewards/accuracy_reward": 0.792410746216774,
232
  "rewards/format_reward": 0.0,
233
  "step": 16
234
  },
235
  {
236
  "clip_ratio": 0.0,
237
+ "completion_length": 607.478816986084,
238
  "epoch": 0.2899786780383795,
239
+ "grad_norm": 2.073848009109497,
240
+ "kl": 0.012720108032226562,
241
  "learning_rate": 2.680775220913575e-06,
242
+ "loss": 0.0424,
243
+ "reward": 0.756696455180645,
244
+ "reward_std": 0.1880332687869668,
245
+ "rewards/accuracy_reward": 0.756696455180645,
246
  "rewards/format_reward": 0.0,
247
  "step": 17
248
  },
249
  {
250
  "clip_ratio": 0.0,
251
+ "completion_length": 610.4665374755859,
252
  "epoch": 0.3070362473347548,
253
+ "grad_norm": 0.23339830338954926,
254
+ "kl": 0.01509857177734375,
255
  "learning_rate": 2.6227661222566517e-06,
256
+ "loss": 0.0432,
257
+ "reward": 0.7343750447034836,
258
+ "reward_std": 0.21946769207715988,
259
+ "rewards/accuracy_reward": 0.7343750447034836,
260
  "rewards/format_reward": 0.0,
261
  "step": 18
262
  },
263
  {
264
  "clip_ratio": 0.0,
265
+ "completion_length": 595.0357475280762,
266
  "epoch": 0.32409381663113007,
267
+ "grad_norm": 0.10285016894340515,
268
+ "kl": 0.0044097900390625,
269
  "learning_rate": 2.5606601717798212e-06,
270
+ "loss": 0.0378,
271
+ "reward": 0.7857143208384514,
272
+ "reward_std": 0.17597838956862688,
273
+ "rewards/accuracy_reward": 0.7857143208384514,
274
  "rewards/format_reward": 0.0,
275
  "step": 19
276
  },
277
  {
278
  "clip_ratio": 0.0,
279
+ "completion_length": 565.811408996582,
280
  "epoch": 0.3411513859275053,
281
+ "grad_norm": 0.14462673664093018,
282
+ "kl": 0.003070831298828125,
283
  "learning_rate": 2.4946839873611927e-06,
284
+ "loss": 0.051,
285
+ "reward": 0.762276828289032,
286
+ "reward_std": 0.21532537415623665,
287
+ "rewards/accuracy_reward": 0.762276828289032,
288
  "rewards/format_reward": 0.0,
289
  "step": 20
290
  },
291
  {
292
  "clip_ratio": 0.0,
293
+ "completion_length": 620.9185562133789,
294
  "epoch": 0.3582089552238806,
295
+ "grad_norm": 0.6581376791000366,
296
+ "kl": 0.00328826904296875,
297
  "learning_rate": 2.425078308942815e-06,
298
+ "loss": 0.0453,
299
+ "reward": 0.757812537252903,
300
+ "reward_std": 0.2050897255539894,
301
+ "rewards/accuracy_reward": 0.757812537252903,
302
  "rewards/format_reward": 0.0,
303
  "step": 21
304
  },
305
  {
306
  "clip_ratio": 0.0,
307
+ "completion_length": 590.5056076049805,
308
  "epoch": 0.3752665245202559,
309
+ "grad_norm": 1.3106849193572998,
310
+ "kl": 0.0048999786376953125,
311
  "learning_rate": 2.3520971200967337e-06,
312
+ "loss": 0.0458,
313
+ "reward": 0.791294664144516,
314
+ "reward_std": 0.20316135231405497,
315
+ "rewards/accuracy_reward": 0.791294664144516,
316
  "rewards/format_reward": 0.0,
317
  "step": 22
318
  },
319
  {
320
  "clip_ratio": 0.0,
321
+ "completion_length": 595.1216735839844,
322
  "epoch": 0.39232409381663114,
323
+ "grad_norm": 0.15704886615276337,
324
+ "kl": 0.00351715087890625,
325
  "learning_rate": 2.276006721266485e-06,
326
+ "loss": 0.0422,
327
+ "reward": 0.7656250298023224,
328
+ "reward_std": 0.19919524434953928,
329
+ "rewards/accuracy_reward": 0.7656250298023224,
330
  "rewards/format_reward": 0.0,
331
  "step": 23
332
  },
333
  {
334
  "clip_ratio": 0.0,
335
+ "completion_length": 597.1718978881836,
336
  "epoch": 0.4093816631130064,
337
+ "grad_norm": 0.12948359549045563,
338
+ "kl": 0.004302978515625,
339
  "learning_rate": 2.1970847580656528e-06,
340
+ "loss": 0.0466,
341
+ "reward": 0.7522321790456772,
342
+ "reward_std": 0.18040904868394136,
343
+ "rewards/accuracy_reward": 0.7522321790456772,
344
  "rewards/format_reward": 0.0,
345
  "step": 24
346
  },
347
  {
348
  "clip_ratio": 0.0,
349
+ "completion_length": 601.4386405944824,
350
  "epoch": 0.42643923240938164,
351
+ "grad_norm": 0.1379498988389969,
352
+ "kl": 0.0035552978515625,
353
  "learning_rate": 2.1156192081791355e-06,
354
+ "loss": 0.0661,
355
+ "reward": 0.718750037252903,
356
+ "reward_std": 0.2221574056893587,
357
+ "rewards/accuracy_reward": 0.718750037252903,
358
  "rewards/format_reward": 0.0,
359
  "step": 25
360
  },
361
  {
362
  "clip_ratio": 0.0,
363
+ "completion_length": 590.9721260070801,
364
  "epoch": 0.44349680170575695,
365
+ "grad_norm": 0.6328846216201782,
366
+ "kl": 0.003810882568359375,
367
  "learning_rate": 2.0319073305638034e-06,
368
+ "loss": 0.0435,
369
  "reward": 0.753348246216774,
370
+ "reward_std": 0.19143945816904306,
371
  "rewards/accuracy_reward": 0.753348246216774,
372
  "rewards/format_reward": 0.0,
373
  "step": 26
374
  },
375
  {
376
  "clip_ratio": 0.0,
377
+ "completion_length": 588.3326263427734,
378
  "epoch": 0.4605543710021322,
379
+ "grad_norm": 0.1565869301557541,
380
+ "kl": 0.0039806365966796875,
381
  "learning_rate": 1.9462545807828044e-06,
382
+ "loss": 0.05,
383
+ "reward": 0.7354911044239998,
384
+ "reward_std": 0.1781614711508155,
385
+ "rewards/accuracy_reward": 0.7354911044239998,
386
  "rewards/format_reward": 0.0,
387
  "step": 27
388
  },
389
  {
390
  "clip_ratio": 0.0,
391
+ "completion_length": 583.0100708007812,
392
  "epoch": 0.47761194029850745,
393
+ "grad_norm": 0.5358207821846008,
394
+ "kl": 0.013216018676757812,
395
  "learning_rate": 1.8589734964313368e-06,
396
+ "loss": 0.0499,
397
+ "reward": 0.7957589700818062,
398
+ "reward_std": 0.17203470412641764,
399
+ "rewards/accuracy_reward": 0.7957589700818062,
400
  "rewards/format_reward": 0.0,
401
  "step": 28
402
  },
403
  {
404
  "clip_ratio": 0.0,
405
+ "completion_length": 617.6105194091797,
406
  "epoch": 0.4946695095948827,
407
+ "grad_norm": 0.10866988450288773,
408
+ "kl": 0.0028591156005859375,
409
  "learning_rate": 1.7703825567208588e-06,
410
+ "loss": 0.0461,
411
+ "reward": 0.7343750298023224,
412
+ "reward_std": 0.1905689835548401,
413
+ "rewards/accuracy_reward": 0.7343750298023224,
414
  "rewards/format_reward": 0.0,
415
  "step": 29
416
  },
417
  {
418
  "clip_ratio": 0.0,
419
+ "completion_length": 628.3661041259766,
420
  "epoch": 0.511727078891258,
421
+ "grad_norm": 0.1341499537229538,
422
+ "kl": 0.003131866455078125,
423
  "learning_rate": 1.6808050203829845e-06,
424
+ "loss": 0.0238,
425
+ "reward": 0.735491119325161,
426
+ "reward_std": 0.17904315516352654,
427
+ "rewards/accuracy_reward": 0.735491119325161,
428
  "rewards/format_reward": 0.0,
429
  "step": 30
430
  },
431
  {
432
  "clip_ratio": 0.0,
433
+ "completion_length": 589.1886444091797,
434
  "epoch": 0.5287846481876333,
435
+ "grad_norm": 0.12772534787654877,
436
+ "kl": 0.004833221435546875,
437
  "learning_rate": 1.5905677461334292e-06,
438
+ "loss": 0.0384,
439
+ "reward": 0.7243303880095482,
440
+ "reward_std": 0.19990051537752151,
441
+ "rewards/accuracy_reward": 0.7243303880095482,
442
  "rewards/format_reward": 0.0,
443
  "step": 31
444
  },
445
  {
446
  "clip_ratio": 0.0,
447
+ "completion_length": 577.6808319091797,
448
  "epoch": 0.5458422174840085,
449
+ "grad_norm": 0.1517443060874939,
450
+ "kl": 0.0039844512939453125,
451
  "learning_rate": 1.5e-06,
452
+ "loss": 0.0365,
453
+ "reward": 0.7633928954601288,
454
+ "reward_std": 0.18407837813720107,
455
+ "rewards/accuracy_reward": 0.7633928954601288,
456
  "rewards/format_reward": 0.0,
457
  "step": 32
458
  },
459
  {
460
  "clip_ratio": 0.0,
461
+ "completion_length": 587.8839569091797,
462
  "epoch": 0.5628997867803838,
463
+ "grad_norm": 0.1479600965976715,
464
+ "kl": 0.0033779144287109375,
465
  "learning_rate": 1.4094322538665708e-06,
466
+ "loss": 0.0441,
467
+ "reward": 0.7466518208384514,
468
+ "reward_std": 0.19517328590154648,
469
+ "rewards/accuracy_reward": 0.7466518208384514,
470
  "rewards/format_reward": 0.0,
471
  "step": 33
472
  },
473
  {
474
  "clip_ratio": 0.0,
475
+ "completion_length": 591.8169937133789,
476
  "epoch": 0.579957356076759,
477
+ "grad_norm": 0.1458793729543686,
478
+ "kl": 0.0035686492919921875,
479
  "learning_rate": 1.3191949796170155e-06,
480
+ "loss": 0.0396,
481
+ "reward": 0.7232143059372902,
482
+ "reward_std": 0.19442316610366106,
483
+ "rewards/accuracy_reward": 0.7232143059372902,
484
  "rewards/format_reward": 0.0,
485
  "step": 34
486
  },
487
  {
488
  "clip_ratio": 0.0,
489
+ "completion_length": 629.7857322692871,
490
  "epoch": 0.5970149253731343,
491
+ "grad_norm": 0.09501513838768005,
492
+ "kl": 0.008701324462890625,
493
  "learning_rate": 1.2296174432791415e-06,
494
+ "loss": 0.0273,
495
+ "reward": 0.7321428954601288,
496
+ "reward_std": 0.19501928705722094,
497
+ "rewards/accuracy_reward": 0.7321428954601288,
498
  "rewards/format_reward": 0.0,
499
  "step": 35
500
  },
501
  {
502
  "clip_ratio": 0.0,
503
+ "completion_length": 568.5536155700684,
504
  "epoch": 0.6140724946695096,
505
+ "grad_norm": 0.1745123714208603,
506
+ "kl": 0.0050048828125,
507
  "learning_rate": 1.141026503568664e-06,
508
+ "loss": 0.0448,
509
+ "reward": 0.7600446715950966,
510
+ "reward_std": 0.18900445476174355,
511
+ "rewards/accuracy_reward": 0.7600446715950966,
512
  "rewards/format_reward": 0.0,
513
  "step": 36
514
  },
515
  {
516
  "clip_ratio": 0.0,
517
+ "completion_length": 580.4486923217773,
518
  "epoch": 0.6311300639658849,
519
+ "grad_norm": 0.16805370151996613,
520
+ "kl": 0.0038299560546875,
521
  "learning_rate": 1.0537454192171958e-06,
522
+ "loss": 0.0328,
523
+ "reward": 0.797991119325161,
524
+ "reward_std": 0.1671001985669136,
525
+ "rewards/accuracy_reward": 0.797991119325161,
526
  "rewards/format_reward": 0.0,
527
  "step": 37
528
  },
529
  {
530
  "clip_ratio": 0.0,
531
+ "completion_length": 574.9922180175781,
532
  "epoch": 0.6481876332622601,
533
+ "grad_norm": 0.1520647555589676,
534
+ "kl": 0.00421142578125,
535
  "learning_rate": 9.680926694361964e-07,
536
+ "loss": 0.0415,
537
+ "reward": 0.7511161118745804,
538
+ "reward_std": 0.18564290553331375,
539
+ "rewards/accuracy_reward": 0.7511161118745804,
540
  "rewards/format_reward": 0.0,
541
  "step": 38
542
  },
543
  {
544
  "clip_ratio": 0.0,
545
+ "completion_length": 555.4922180175781,
546
  "epoch": 0.6652452025586354,
547
+ "grad_norm": 0.1991657167673111,
548
+ "kl": 0.004291534423828125,
549
  "learning_rate": 8.843807918208651e-07,
550
+ "loss": 0.0422,
551
+ "reward": 0.7823661118745804,
552
+ "reward_std": 0.17239855788648129,
553
+ "rewards/accuracy_reward": 0.7823661118745804,
554
  "rewards/format_reward": 0.0,
555
  "step": 39
556
  },
557
  {
558
  "clip_ratio": 0.0,
559
+ "completion_length": 625.3661117553711,
560
  "epoch": 0.6823027718550106,
561
+ "grad_norm": 0.11269740760326385,
562
+ "kl": 0.0033588409423828125,
563
  "learning_rate": 8.029152419343472e-07,
564
+ "loss": 0.0409,
565
+ "reward": 0.7075893133878708,
566
+ "reward_std": 0.20507850963622332,
567
+ "rewards/accuracy_reward": 0.7075893133878708,
568
  "rewards/format_reward": 0.0,
569
  "step": 40
570
  },
571
  {
572
  "clip_ratio": 0.0,
573
+ "completion_length": 610.1897506713867,
574
  "epoch": 0.6993603411513859,
575
+ "grad_norm": 0.21570497751235962,
576
+ "kl": 0.005218505859375,
577
  "learning_rate": 7.239932787335147e-07,
578
+ "loss": 0.0379,
579
+ "reward": 0.7656250298023224,
580
+ "reward_std": 0.19328955188393593,
581
+ "rewards/accuracy_reward": 0.7656250298023224,
582
  "rewards/format_reward": 0.0,
583
  "step": 41
584
  },
585
  {
586
  "clip_ratio": 0.0,
587
+ "completion_length": 601.0435485839844,
588
  "epoch": 0.7164179104477612,
589
+ "grad_norm": 0.17806975543498993,
590
+ "kl": 0.003978729248046875,
591
  "learning_rate": 6.479028799032664e-07,
592
+ "loss": 0.0424,
593
  "reward": 0.7667410969734192,
594
+ "reward_std": 0.2007934171706438,
595
  "rewards/accuracy_reward": 0.7667410969734192,
596
  "rewards/format_reward": 0.0,
597
  "step": 42
598
  },
599
  {
600
  "clip_ratio": 0.0,
601
+ "completion_length": 634.1518173217773,
602
  "epoch": 0.7334754797441365,
603
+ "grad_norm": 0.1565958708524704,
604
+ "kl": 0.005039215087890625,
605
  "learning_rate": 5.749216910571854e-07,
606
+ "loss": 0.0482,
607
+ "reward": 0.7287946790456772,
608
+ "reward_std": 0.22933948785066605,
609
+ "rewards/accuracy_reward": 0.7287946790456772,
610
  "rewards/format_reward": 0.0,
611
  "step": 43
612
  },
613
  {
614
  "clip_ratio": 0.0,
615
+ "completion_length": 586.2723388671875,
616
  "epoch": 0.7505330490405118,
617
+ "grad_norm": 0.13228431344032288,
618
+ "kl": 0.0039825439453125,
619
  "learning_rate": 5.053160126388076e-07,
620
+ "loss": 0.0416,
621
+ "reward": 0.7366071864962578,
622
+ "reward_std": 0.1808932526037097,
623
+ "rewards/accuracy_reward": 0.7366071864962578,
624
  "rewards/format_reward": 0.0,
625
  "step": 44
626
  },
627
  {
628
  "clip_ratio": 0.0,
629
+ "completion_length": 576.3806037902832,
630
  "epoch": 0.767590618336887,
631
+ "grad_norm": 0.13988560438156128,
632
+ "kl": 0.0044498443603515625,
633
  "learning_rate": 4.3933982822017883e-07,
634
+ "loss": 0.0219,
635
+ "reward": 0.7633928954601288,
636
+ "reward_std": 0.17312624771147966,
637
+ "rewards/accuracy_reward": 0.7633928954601288,
638
  "rewards/format_reward": 0.0,
639
  "step": 45
640
  },
641
  {
642
  "clip_ratio": 0.0,
643
+ "completion_length": 604.428596496582,
644
  "epoch": 0.7846481876332623,
645
+ "grad_norm": 0.10653964430093765,
646
+ "kl": 0.0032711029052734375,
647
  "learning_rate": 3.772338777433482e-07,
648
+ "loss": 0.0434,
649
+ "reward": 0.7220982536673546,
650
+ "reward_std": 0.2057167086750269,
651
+ "rewards/accuracy_reward": 0.7220982536673546,
652
  "rewards/format_reward": 0.0,
653
  "step": 46
654
  },
655
  {
656
  "clip_ratio": 0.0,
657
+ "completion_length": 587.4810485839844,
658
  "epoch": 0.8017057569296375,
659
+ "grad_norm": 0.12146904319524765,
660
+ "kl": 0.0043125152587890625,
661
  "learning_rate": 3.192247790864249e-07,
662
+ "loss": 0.0364,
663
+ "reward": 0.718750037252903,
664
+ "reward_std": 0.21366270910948515,
665
+ "rewards/accuracy_reward": 0.718750037252903,
666
  "rewards/format_reward": 0.0,
667
  "step": 47
668
  },
669
  {
670
  "clip_ratio": 0.0,
671
+ "completion_length": 578.7477912902832,
672
  "epoch": 0.8187633262260128,
673
+ "grad_norm": 0.38011038303375244,
674
+ "kl": 0.005626678466796875,
675
  "learning_rate": 2.6552420115951547e-07,
676
+ "loss": 0.0458,
677
+ "reward": 0.768973246216774,
678
+ "reward_std": 0.18335067853331566,
679
+ "rewards/accuracy_reward": 0.768973246216774,
680
  "rewards/format_reward": 0.0,
681
  "step": 48
682
  },
683
  {
684
  "clip_ratio": 0.0,
685
+ "completion_length": 619.3940048217773,
686
  "epoch": 0.835820895522388,
687
+ "grad_norm": 0.12710346281528473,
688
+ "kl": 0.008390426635742188,
689
  "learning_rate": 2.163280915478289e-07,
690
+ "loss": 0.0188,
691
+ "reward": 0.7366071864962578,
692
+ "reward_std": 0.19365339912474155,
693
+ "rewards/accuracy_reward": 0.7366071864962578,
694
  "rewards/format_reward": 0.0,
695
  "step": 49
696
  },
697
  {
698
  "clip_ratio": 0.0,
699
+ "completion_length": 598.0636329650879,
700
  "epoch": 0.8528784648187633,
701
+ "grad_norm": 0.12470484524965286,
702
+ "kl": 0.0068721771240234375,
703
  "learning_rate": 1.718159615201853e-07,
704
+ "loss": 0.0316,
705
+ "reward": 0.7433036044239998,
706
+ "reward_std": 0.18148937169462442,
707
+ "rewards/accuracy_reward": 0.7433036044239998,
708
  "rewards/format_reward": 0.0,
709
  "step": 50
710
  },
711
  {
712
  "clip_ratio": 0.0,
713
+ "completion_length": 563.3772583007812,
714
  "epoch": 0.8699360341151386,
715
+ "grad_norm": 0.1378241628408432,
716
+ "kl": 0.0040340423583984375,
717
  "learning_rate": 1.321502310118649e-07,
718
+ "loss": 0.0543,
719
+ "reward": 0.801339328289032,
720
+ "reward_std": 0.2019382379949093,
721
+ "rewards/accuracy_reward": 0.801339328289032,
722
  "rewards/format_reward": 0.0,
723
  "step": 51
724
  },
725
  {
726
  "clip_ratio": 0.0,
727
+ "completion_length": 614.0346298217773,
728
  "epoch": 0.8869936034115139,
729
+ "grad_norm": 0.1643468588590622,
730
+ "kl": 0.004608154296875,
731
  "learning_rate": 9.747563597187792e-08,
732
+ "loss": 0.0382,
733
+ "reward": 0.7544643133878708,
734
+ "reward_std": 0.22550495527684689,
735
+ "rewards/accuracy_reward": 0.7544643133878708,
736
  "rewards/format_reward": 0.0,
737
  "step": 52
738
  },
739
  {
740
  "clip_ratio": 0.0,
741
+ "completion_length": 577.741081237793,
742
  "epoch": 0.9040511727078892,
743
+ "grad_norm": 0.12994147837162018,
744
+ "kl": 0.0049915313720703125,
745
  "learning_rate": 6.791870023718161e-08,
746
+ "loss": 0.032,
747
+ "reward": 0.7176339626312256,
748
+ "reward_std": 0.19572455808520317,
749
+ "rewards/accuracy_reward": 0.7176339626312256,
750
  "rewards/format_reward": 0.0,
751
  "step": 53
752
  },
753
  {
754
  "clip_ratio": 0.0,
755
+ "completion_length": 555.170783996582,
756
  "epoch": 0.9211087420042644,
757
+ "grad_norm": 0.14271694421768188,
758
+ "kl": 0.005153656005859375,
759
  "learning_rate": 4.358727386092198e-08,
760
+ "loss": 0.0232,
761
+ "reward": 0.8046875447034836,
762
+ "reward_std": 0.2259807214140892,
763
+ "rewards/accuracy_reward": 0.8046875447034836,
764
  "rewards/format_reward": 0.0,
765
  "step": 54
766
  },
767
  {
768
  "clip_ratio": 0.0,
769
+ "completion_length": 601.6172180175781,
770
  "epoch": 0.9381663113006397,
771
+ "grad_norm": 0.15514791011810303,
772
+ "kl": 0.0044956207275390625,
773
  "learning_rate": 2.4570139579284723e-08,
774
+ "loss": 0.0323,
775
+ "reward": 0.8024554029107094,
776
+ "reward_std": 0.1908433297649026,
777
+ "rewards/accuracy_reward": 0.8024554029107094,
778
  "rewards/format_reward": 0.0,
779
  "step": 55
780
  },
781
  {
782
  "clip_ratio": 0.0,
783
+ "completion_length": 574.4933204650879,
784
  "epoch": 0.9552238805970149,
785
+ "grad_norm": 0.18661239743232727,
786
+ "kl": 0.006015777587890625,
787
  "learning_rate": 1.093668885291904e-08,
788
+ "loss": 0.0185,
789
+ "reward": 0.7656250298023224,
790
+ "reward_std": 0.1824802029877901,
791
+ "rewards/accuracy_reward": 0.7656250298023224,
792
  "rewards/format_reward": 0.0,
793
  "step": 56
794
  },
795
  {
796
  "clip_ratio": 0.0,
797
+ "completion_length": 576.9375305175781,
798
  "epoch": 0.9722814498933902,
799
+ "grad_norm": 0.15061481297016144,
800
+ "kl": 0.004833221435546875,
801
  "learning_rate": 2.736668665023756e-09,
802
+ "loss": 0.0344,
803
+ "reward": 0.7745536118745804,
804
+ "reward_std": 0.2012776229530573,
805
+ "rewards/accuracy_reward": 0.7745536118745804,
806
  "rewards/format_reward": 0.0,
807
  "step": 57
808
  },
809
  {
810
  "clip_ratio": 0.0,
811
+ "completion_length": 588.2766265869141,
812
  "epoch": 0.9893390191897654,
813
+ "grad_norm": 0.12136111408472061,
814
+ "kl": 0.0038356781005859375,
815
  "learning_rate": 0.0,
816
+ "loss": 0.0255,
817
+ "reward": 0.7555803880095482,
818
+ "reward_std": 0.18643509317189455,
819
+ "rewards/accuracy_reward": 0.7555803880095482,
820
  "rewards/format_reward": 0.0,
821
  "step": 58
822
  },
823
+ {
824
+ "epoch": 0.9893390191897654,
825
+ "eval_clip_ratio": 0.0,
826
+ "eval_completion_length": 575.1875478726225,
827
+ "eval_kl": 0.011225008736022364,
828
+ "eval_loss": 0.02523801103234291,
829
+ "eval_reward": 0.673665024316349,
830
+ "eval_reward_std": 0.2331223714465912,
831
+ "eval_rewards/accuracy_reward": 0.6736364986378545,
832
+ "eval_rewards/format_reward": 2.8525788586931868e-05,
833
+ "eval_runtime": 4208.1068,
834
+ "eval_samples_per_second": 1.188,
835
+ "eval_steps_per_second": 0.011,
836
+ "step": 58
837
+ },
838
  {
839
  "epoch": 0.9893390191897654,
840
  "step": 58,
841
  "total_flos": 0.0,
842
+ "train_loss": 0.04285794010000496,
843
+ "train_runtime": 11336.4845,
844
+ "train_samples_per_second": 0.662,
845
+ "train_steps_per_second": 0.005
846
  }
847
  ],
848
  "logging_steps": 1,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e9e03ec4cb080ac7b2fea9657a273e1b089c043de80aabdc3468383c37fb320
3
- size 7992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9d34ada9554116268a24d22a2e02b0b01443418c7b20a28d26ba8a9daedcdf9
3
+ size 8056