luodian committed on
Commit
31bdb40
1 Parent(s): 66a125c

7f3cd9fef53682555543772ea1099fc63b98d89e956e41f2914cc5c940857ad0

Browse files
Files changed (6) hide show
  1. added_tokens.json +1 -0
  2. config.json +198 -198
  3. tokenizer.json +9 -0
  4. tokenizer_config.json +9 -0
  5. trainer_state.json +950 -950
  6. training_args.bin +2 -2
added_tokens.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "<|endoftext|>": 151643,
3
  "<|im_end|>": 151645,
4
  "<|im_start|>": 151644
 
1
  {
2
+ "<image>": 151646,
3
  "<|endoftext|>": 151643,
4
  "<|im_end|>": 151645,
5
  "<|im_start|>": 151644
config.json CHANGED
@@ -1,199 +1,199 @@
1
  {
2
- "_name_or_path": "/mnt/bn/vl-research-cn-boli01-hl/checkpoints/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-72B-Instruct-mid_to_final_next_3m_am9_july13",
3
- "architectures": [
4
- "LlavaQwenForCausalLM"
5
- ],
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151643,
8
- "eos_token_id": 151645,
9
- "hidden_act": "silu",
10
- "hidden_size": 8192,
11
- "image_aspect_ratio": "anyres_max_9",
12
- "image_crop_resolution": null,
13
- "image_grid_pinpoints": [
14
- [
15
- 384,
16
- 384
17
- ],
18
- [
19
- 384,
20
- 768
21
- ],
22
- [
23
- 384,
24
- 1152
25
- ],
26
- [
27
- 384,
28
- 1536
29
- ],
30
- [
31
- 384,
32
- 1920
33
- ],
34
- [
35
- 384,
36
- 2304
37
- ],
38
- [
39
- 768,
40
- 384
41
- ],
42
- [
43
- 768,
44
- 768
45
- ],
46
- [
47
- 768,
48
- 1152
49
- ],
50
- [
51
- 768,
52
- 1536
53
- ],
54
- [
55
- 768,
56
- 1920
57
- ],
58
- [
59
- 768,
60
- 2304
61
- ],
62
- [
63
- 1152,
64
- 384
65
- ],
66
- [
67
- 1152,
68
- 768
69
- ],
70
- [
71
- 1152,
72
- 1152
73
- ],
74
- [
75
- 1152,
76
- 1536
77
- ],
78
- [
79
- 1152,
80
- 1920
81
- ],
82
- [
83
- 1152,
84
- 2304
85
- ],
86
- [
87
- 1536,
88
- 384
89
- ],
90
- [
91
- 1536,
92
- 768
93
- ],
94
- [
95
- 1536,
96
- 1152
97
- ],
98
- [
99
- 1536,
100
- 1536
101
- ],
102
- [
103
- 1536,
104
- 1920
105
- ],
106
- [
107
- 1536,
108
- 2304
109
- ],
110
- [
111
- 1920,
112
- 384
113
- ],
114
- [
115
- 1920,
116
- 768
117
- ],
118
- [
119
- 1920,
120
- 1152
121
- ],
122
- [
123
- 1920,
124
- 1536
125
- ],
126
- [
127
- 1920,
128
- 1920
129
- ],
130
- [
131
- 1920,
132
- 2304
133
- ],
134
- [
135
- 2304,
136
- 384
137
- ],
138
- [
139
- 2304,
140
- 768
141
- ],
142
- [
143
- 2304,
144
- 1152
145
- ],
146
- [
147
- 2304,
148
- 1536
149
- ],
150
- [
151
- 2304,
152
- 1920
153
- ],
154
- [
155
- 2304,
156
- 2304
157
- ]
158
- ],
159
- "image_split_resolution": null,
160
- "image_token_index": 151646,
161
- "initializer_range": 0.02,
162
- "intermediate_size": 29568,
163
- "max_position_embeddings": 32768,
164
- "max_window_layers": 80,
165
- "mm_hidden_size": 1152,
166
- "mm_patch_merge_type": "spatial_unpad",
167
- "mm_projector_lr": null,
168
- "mm_projector_type": "mlp2x_gelu",
169
- "mm_resampler_type": null,
170
- "mm_spatial_pool_mode": "bilinear",
171
- "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model",
172
- "mm_use_im_patch_token": false,
173
- "mm_use_im_start_end": false,
174
- "mm_vision_select_feature": "patch",
175
- "mm_vision_select_layer": -2,
176
- "mm_vision_tower": "google/siglip-so400m-patch14-384",
177
- "mm_vision_tower_lr": 2e-07,
178
- "model_type": "llava",
179
- "num_attention_heads": 64,
180
- "num_hidden_layers": 80,
181
- "num_key_value_heads": 8,
182
- "pos_skipping_range": 4096,
183
- "rms_norm_eps": 1e-06,
184
- "rope_scaling": null,
185
- "rope_theta": 1000000.0,
186
- "sliding_window": 131072,
187
- "tie_word_embeddings": false,
188
- "tokenizer_model_max_length": 32768,
189
- "tokenizer_padding_side": "right",
190
- "torch_dtype": "bfloat16",
191
- "transformers_version": "4.40.0.dev0",
192
- "use_cache": true,
193
- "use_mm_proj": true,
194
- "use_pos_skipping": false,
195
- "use_sliding_window": false,
196
- "vision_tower_pretrained": null,
197
- "vocab_size": 152064,
198
- "add_faster_video": false
199
- }
 
1
  {
2
+ "_name_or_path": "/mnt/bn/vl-research/workspace/txiong23/outputs/ai_feedback/llava_next/critic-72b-iterDPO/v1p5Plus_llava-bench/llava-onevision_Qwen2-72b-ov_dpo-iter1_llava-rlhf-llava-criticV1p5Plus-llava-bench-72b_beta0.1_epoch1",
3
+ "architectures": [
4
+ "LlavaQwenForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 8192,
11
+ "image_aspect_ratio": "anyres_max_9",
12
+ "image_crop_resolution": 384,
13
+ "image_grid_pinpoints": [
14
+ [
15
+ 384,
16
+ 384
17
+ ],
18
+ [
19
+ 384,
20
+ 768
21
+ ],
22
+ [
23
+ 384,
24
+ 1152
25
+ ],
26
+ [
27
+ 384,
28
+ 1536
29
+ ],
30
+ [
31
+ 384,
32
+ 1920
33
+ ],
34
+ [
35
+ 384,
36
+ 2304
37
+ ],
38
+ [
39
+ 768,
40
+ 384
41
+ ],
42
+ [
43
+ 768,
44
+ 768
45
+ ],
46
+ [
47
+ 768,
48
+ 1152
49
+ ],
50
+ [
51
+ 768,
52
+ 1536
53
+ ],
54
+ [
55
+ 768,
56
+ 1920
57
+ ],
58
+ [
59
+ 768,
60
+ 2304
61
+ ],
62
+ [
63
+ 1152,
64
+ 384
65
+ ],
66
+ [
67
+ 1152,
68
+ 768
69
+ ],
70
+ [
71
+ 1152,
72
+ 1152
73
+ ],
74
+ [
75
+ 1152,
76
+ 1536
77
+ ],
78
+ [
79
+ 1152,
80
+ 1920
81
+ ],
82
+ [
83
+ 1152,
84
+ 2304
85
+ ],
86
+ [
87
+ 1536,
88
+ 384
89
+ ],
90
+ [
91
+ 1536,
92
+ 768
93
+ ],
94
+ [
95
+ 1536,
96
+ 1152
97
+ ],
98
+ [
99
+ 1536,
100
+ 1536
101
+ ],
102
+ [
103
+ 1536,
104
+ 1920
105
+ ],
106
+ [
107
+ 1536,
108
+ 2304
109
+ ],
110
+ [
111
+ 1920,
112
+ 384
113
+ ],
114
+ [
115
+ 1920,
116
+ 768
117
+ ],
118
+ [
119
+ 1920,
120
+ 1152
121
+ ],
122
+ [
123
+ 1920,
124
+ 1536
125
+ ],
126
+ [
127
+ 1920,
128
+ 1920
129
+ ],
130
+ [
131
+ 1920,
132
+ 2304
133
+ ],
134
+ [
135
+ 2304,
136
+ 384
137
+ ],
138
+ [
139
+ 2304,
140
+ 768
141
+ ],
142
+ [
143
+ 2304,
144
+ 1152
145
+ ],
146
+ [
147
+ 2304,
148
+ 1536
149
+ ],
150
+ [
151
+ 2304,
152
+ 1920
153
+ ],
154
+ [
155
+ 2304,
156
+ 2304
157
+ ]
158
+ ],
159
+ "image_split_resolution": 384,
160
+ "image_token_index": 151646,
161
+ "initializer_range": 0.02,
162
+ "intermediate_size": 29568,
163
+ "max_position_embeddings": 32768,
164
+ "max_window_layers": 80,
165
+ "mm_hidden_size": 1152,
166
+ "mm_newline_position": "one_token",
167
+ "mm_patch_merge_type": "spatial_unpad",
168
+ "mm_projector_lr": null,
169
+ "mm_projector_type": "mlp2x_gelu",
170
+ "mm_resampler_type": null,
171
+ "mm_spatial_pool_mode": "bilinear",
172
+ "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model",
173
+ "mm_use_im_patch_token": false,
174
+ "mm_use_im_start_end": false,
175
+ "mm_vision_select_feature": "patch",
176
+ "mm_vision_select_layer": -2,
177
+ "mm_vision_tower": "google/siglip-so400m-patch14-384",
178
+ "mm_vision_tower_lr": null,
179
+ "model_type": "llava_qwen",
180
+ "num_attention_heads": 64,
181
+ "num_hidden_layers": 80,
182
+ "num_key_value_heads": 8,
183
+ "pos_skipping_range": 4096,
184
+ "rms_norm_eps": 1e-06,
185
+ "rope_scaling": null,
186
+ "rope_theta": 1000000.0,
187
+ "sliding_window": 131072,
188
+ "tie_word_embeddings": false,
189
+ "tokenizer_model_max_length": 32768,
190
+ "tokenizer_padding_side": "right",
191
+ "torch_dtype": "bfloat16",
192
+ "transformers_version": "4.40.0.dev0",
193
+ "use_cache": true,
194
+ "use_mm_proj": true,
195
+ "use_pos_skipping": false,
196
+ "use_sliding_window": false,
197
+ "vision_tower_pretrained": null,
198
+ "vocab_size": 152064
199
+ }
tokenizer.json CHANGED
@@ -29,6 +29,15 @@
29
  "rstrip": false,
30
  "normalized": false,
31
  "special": true
 
 
 
 
 
 
 
 
 
32
  }
33
  ],
34
  "normalizer": {
 
29
  "rstrip": false,
30
  "normalized": false,
31
  "special": true
32
+ },
33
+ {
34
+ "id": 151646,
35
+ "content": "<image>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
  }
42
  ],
43
  "normalizer": {
tokenizer_config.json CHANGED
@@ -24,6 +24,14 @@
24
  "rstrip": false,
25
  "single_word": false,
26
  "special": true
 
 
 
 
 
 
 
 
27
  }
28
  },
29
  "additional_special_tokens": [
@@ -38,6 +46,7 @@
38
  "model_max_length": 32768,
39
  "pad_token": "<|endoftext|>",
40
  "padding_side": "right",
 
41
  "split_special_tokens": false,
42
  "tokenizer_class": "Qwen2Tokenizer",
43
  "unk_token": null
 
24
  "rstrip": false,
25
  "single_word": false,
26
  "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<image>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
  }
36
  },
37
  "additional_special_tokens": [
 
46
  "model_max_length": 32768,
47
  "pad_token": "<|endoftext|>",
48
  "padding_side": "right",
49
+ "processor_class": "LlavaProcessor",
50
  "split_special_tokens": false,
51
  "tokenizer_class": "Qwen2Tokenizer",
52
  "unk_token": null
trainer_state.json CHANGED
@@ -10,1325 +10,1325 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "grad_norm": 29.326078015981345,
14
  "learning_rate": 6.25e-08,
15
- "logps/chosen": -47.87165832519531,
16
- "logps/rejected": -35.03704071044922,
17
- "loss": 0.6939,
18
- "losses/dpo": 0.7437427639961243,
19
- "losses/sft": 0.2519839406013489,
20
- "losses/total": 0.7437427639961243,
21
- "ref_logps/chosen": -47.90069580078125,
22
- "ref_logps/rejected": -35.07575225830078,
23
- "rewards/accuracies": 0.4609375,
24
- "rewards/chosen": 0.0029037208296358585,
25
- "rewards/margins": -0.0009674869943410158,
26
- "rewards/rejected": 0.0038712075911462307,
27
  "step": 1
28
  },
29
  {
30
  "epoch": 0.03,
31
- "grad_norm": 25.98987817588094,
32
  "learning_rate": 1.25e-07,
33
- "logps/chosen": -46.03837966918945,
34
- "logps/rejected": -34.79166030883789,
35
- "loss": 0.6937,
36
- "losses/dpo": 0.711306095123291,
37
- "losses/sft": 0.21511156857013702,
38
- "losses/total": 0.711306095123291,
39
- "ref_logps/chosen": -46.05853271484375,
40
- "ref_logps/rejected": -34.81706237792969,
41
- "rewards/accuracies": 0.5,
42
- "rewards/chosen": 0.0020157406106591225,
43
- "rewards/margins": -0.000524366507306695,
44
- "rewards/rejected": 0.002540107350796461,
45
  "step": 2
46
  },
47
  {
48
  "epoch": 0.04,
49
- "grad_norm": 43.145173675858224,
50
  "learning_rate": 1.875e-07,
51
- "logps/chosen": -41.797569274902344,
52
- "logps/rejected": -31.708539962768555,
53
- "loss": 0.693,
54
- "losses/dpo": 0.7042351365089417,
55
- "losses/sft": 0.18763618171215057,
56
- "losses/total": 0.7042351365089417,
57
- "ref_logps/chosen": -41.833030700683594,
58
- "ref_logps/rejected": -31.735107421875,
59
- "rewards/accuracies": 0.5234375,
60
- "rewards/chosen": 0.003545756684616208,
61
- "rewards/margins": 0.0008889732416719198,
62
- "rewards/rejected": 0.0026567834429442883,
63
  "step": 3
64
  },
65
  {
66
  "epoch": 0.05,
67
- "grad_norm": 31.32790996670384,
68
  "learning_rate": 2.5e-07,
69
- "logps/chosen": -42.71172332763672,
70
- "logps/rejected": -32.757808685302734,
71
- "loss": 0.6927,
72
- "losses/dpo": 0.6976655125617981,
73
- "losses/sft": 0.17784112691879272,
74
- "losses/total": 0.6976655125617981,
75
- "ref_logps/chosen": -42.72623062133789,
76
- "ref_logps/rejected": -32.75667190551758,
77
  "rewards/accuracies": 0.5,
78
- "rewards/chosen": 0.0014508566819131374,
79
- "rewards/margins": 0.0015643269289284945,
80
- "rewards/rejected": -0.00011346983956173062,
81
  "step": 4
82
  },
83
  {
84
  "epoch": 0.07,
85
- "grad_norm": 35.10577986645193,
86
  "learning_rate": 3.1249999999999997e-07,
87
- "logps/chosen": -45.85194396972656,
88
- "logps/rejected": -34.628639221191406,
89
- "loss": 0.689,
90
- "losses/dpo": 0.7395577430725098,
91
- "losses/sft": 0.17383158206939697,
92
- "losses/total": 0.7395577430725098,
93
- "ref_logps/chosen": -45.91680145263672,
94
- "ref_logps/rejected": -34.60468673706055,
95
- "rewards/accuracies": 0.5703125,
96
- "rewards/chosen": 0.006485694088041782,
97
- "rewards/margins": 0.008881103247404099,
98
- "rewards/rejected": -0.002395408693701029,
99
  "step": 5
100
  },
101
  {
102
  "epoch": 0.08,
103
- "grad_norm": 28.260278751569523,
104
  "learning_rate": 3.75e-07,
105
- "logps/chosen": -42.09749221801758,
106
- "logps/rejected": -32.70561599731445,
107
- "loss": 0.6932,
108
- "losses/dpo": 0.6590798497200012,
109
- "losses/sft": 0.18368251621723175,
110
- "losses/total": 0.6590798497200012,
111
- "ref_logps/chosen": -42.06741714477539,
112
- "ref_logps/rejected": -32.67097473144531,
113
- "rewards/accuracies": 0.484375,
114
- "rewards/chosen": -0.0030076471157372,
115
- "rewards/margins": 0.0004564363043755293,
116
- "rewards/rejected": -0.0034640836529433727,
117
  "step": 6
118
  },
119
  {
120
  "epoch": 0.1,
121
- "grad_norm": 45.257780534421805,
122
  "learning_rate": 4.375e-07,
123
- "logps/chosen": -48.16801834106445,
124
- "logps/rejected": -35.98320770263672,
125
- "loss": 0.6931,
126
- "losses/dpo": 0.674820065498352,
127
- "losses/sft": 0.17130310833454132,
128
- "losses/total": 0.674820065498352,
129
- "ref_logps/chosen": -48.16166687011719,
130
- "ref_logps/rejected": -35.96845245361328,
131
  "rewards/accuracies": 0.515625,
132
- "rewards/chosen": -0.0006352070486173034,
133
- "rewards/margins": 0.0008399828802794218,
134
- "rewards/rejected": -0.0014751903945580125,
135
  "step": 7
136
  },
137
  {
138
  "epoch": 0.11,
139
- "grad_norm": 37.963707614132204,
140
  "learning_rate": 5e-07,
141
- "logps/chosen": -46.631561279296875,
142
- "logps/rejected": -34.54258728027344,
143
- "loss": 0.6911,
144
- "losses/dpo": 0.6616916060447693,
145
- "losses/sft": 0.15279927849769592,
146
- "losses/total": 0.6616916060447693,
147
- "ref_logps/chosen": -46.690643310546875,
148
- "ref_logps/rejected": -34.551368713378906,
149
- "rewards/accuracies": 0.5546875,
150
- "rewards/chosen": 0.005908225197345018,
151
- "rewards/margins": 0.005030112341046333,
152
- "rewards/rejected": 0.0008781132637523115,
153
  "step": 8
154
  },
155
  {
156
  "epoch": 0.12,
157
- "grad_norm": 23.24345634411509,
158
  "learning_rate": 4.997080567080816e-07,
159
- "logps/chosen": -45.053184509277344,
160
- "logps/rejected": -35.14673614501953,
161
- "loss": 0.6888,
162
- "losses/dpo": 0.645126461982727,
163
- "losses/sft": 0.1863231658935547,
164
- "losses/total": 0.645126461982727,
165
- "ref_logps/chosen": -45.13517379760742,
166
- "ref_logps/rejected": -35.132957458496094,
167
- "rewards/accuracies": 0.5390625,
168
- "rewards/chosen": 0.008199075236916542,
169
- "rewards/margins": 0.009576688520610332,
170
- "rewards/rejected": -0.0013776118867099285,
171
  "step": 9
172
  },
173
  {
174
  "epoch": 0.14,
175
- "grad_norm": 27.949597341892236,
176
  "learning_rate": 4.988329086794122e-07,
177
- "logps/chosen": -46.718475341796875,
178
- "logps/rejected": -36.01044464111328,
179
- "loss": 0.6845,
180
- "losses/dpo": 0.6536989212036133,
181
- "losses/sft": 0.16235677897930145,
182
- "losses/total": 0.6536989212036133,
183
- "ref_logps/chosen": -46.86553192138672,
184
- "ref_logps/rejected": -35.97478103637695,
185
- "rewards/accuracies": 0.6171875,
186
- "rewards/chosen": 0.0147053562104702,
187
- "rewards/margins": 0.018271632492542267,
188
- "rewards/rejected": -0.0035662769805639982,
189
  "step": 10
190
  },
191
  {
192
  "epoch": 0.15,
193
- "grad_norm": 40.316536183472955,
194
  "learning_rate": 4.973765998627628e-07,
195
- "logps/chosen": -45.7076416015625,
196
- "logps/rejected": -32.744361877441406,
197
- "loss": 0.6758,
198
- "losses/dpo": 0.639275848865509,
199
- "losses/sft": 0.19072100520133972,
200
- "losses/total": 0.639275848865509,
201
- "ref_logps/chosen": -45.953941345214844,
202
- "ref_logps/rejected": -32.63063430786133,
203
- "rewards/accuracies": 0.734375,
204
- "rewards/chosen": 0.024630192667245865,
205
- "rewards/margins": 0.036002762615680695,
206
- "rewards/rejected": -0.01137256994843483,
207
  "step": 11
208
  },
209
  {
210
  "epoch": 0.16,
211
- "grad_norm": 31.231333750699285,
212
  "learning_rate": 4.953425315348533e-07,
213
- "logps/chosen": -48.346229553222656,
214
- "logps/rejected": -35.44029235839844,
215
- "loss": 0.6735,
216
- "losses/dpo": 0.7411879301071167,
217
- "losses/sft": 0.30462783575057983,
218
- "losses/total": 0.7411879301071167,
219
- "ref_logps/chosen": -48.579471588134766,
220
- "ref_logps/rejected": -35.26258087158203,
221
- "rewards/accuracies": 0.75,
222
- "rewards/chosen": 0.023324450477957726,
223
- "rewards/margins": 0.04109576344490051,
224
- "rewards/rejected": -0.017771316692233086,
225
  "step": 12
226
  },
227
  {
228
  "epoch": 0.18,
229
- "grad_norm": 24.02378939332813,
230
  "learning_rate": 4.92735454356513e-07,
231
- "logps/chosen": -43.760799407958984,
232
- "logps/rejected": -32.20792007446289,
233
- "loss": 0.6771,
234
- "losses/dpo": 0.7643380761146545,
235
- "losses/sft": 0.15294401347637177,
236
- "losses/total": 0.7643380761146545,
237
- "ref_logps/chosen": -43.909759521484375,
238
- "ref_logps/rejected": -32.016273498535156,
239
- "rewards/accuracies": 0.65625,
240
- "rewards/chosen": 0.01489595789462328,
241
- "rewards/margins": 0.034060731530189514,
242
- "rewards/rejected": -0.01916477642953396,
243
  "step": 13
244
  },
245
  {
246
  "epoch": 0.19,
247
- "grad_norm": 33.47814491109199,
248
  "learning_rate": 4.895614572772916e-07,
249
- "logps/chosen": -45.79880905151367,
250
- "logps/rejected": -34.85653305053711,
251
- "loss": 0.6669,
252
- "losses/dpo": 0.7224411368370056,
253
- "losses/sft": 0.2095840573310852,
254
- "losses/total": 0.7224411368370056,
255
- "ref_logps/chosen": -46.07813262939453,
256
- "ref_logps/rejected": -34.58377456665039,
257
- "rewards/accuracies": 0.734375,
258
- "rewards/chosen": 0.02793230675160885,
259
- "rewards/margins": 0.055208105593919754,
260
- "rewards/rejected": -0.027275800704956055,
261
  "step": 14
262
  },
263
  {
264
  "epoch": 0.21,
265
- "grad_norm": 47.78782257013143,
266
  "learning_rate": 4.858279533144357e-07,
267
- "logps/chosen": -47.91066360473633,
268
- "logps/rejected": -36.8038330078125,
269
- "loss": 0.6545,
270
- "losses/dpo": 0.5712046027183533,
271
- "losses/sft": 0.20200778543949127,
272
- "losses/total": 0.5712046027183533,
273
- "ref_logps/chosen": -48.32217788696289,
274
- "ref_logps/rejected": -36.395023345947266,
275
- "rewards/accuracies": 0.765625,
276
- "rewards/chosen": 0.04115153104066849,
277
- "rewards/margins": 0.08203274011611938,
278
- "rewards/rejected": -0.040881212800741196,
279
  "step": 15
280
  },
281
  {
282
  "epoch": 0.22,
283
- "grad_norm": 246.97737804069968,
284
  "learning_rate": 4.815436622394441e-07,
285
- "logps/chosen": -46.90559387207031,
286
- "logps/rejected": -36.626888275146484,
287
- "loss": 0.6465,
288
- "losses/dpo": 0.7274478077888489,
289
- "losses/sft": 0.26765260100364685,
290
- "losses/total": 0.7274478077888489,
291
- "ref_logps/chosen": -47.21229934692383,
292
- "ref_logps/rejected": -35.93655776977539,
293
- "rewards/accuracies": 0.78125,
294
- "rewards/chosen": 0.03067046031355858,
295
- "rewards/margins": 0.09970355033874512,
296
- "rewards/rejected": -0.06903309375047684,
297
  "step": 16
298
  },
299
  {
300
  "epoch": 0.23,
301
- "grad_norm": 23.079239827774252,
302
  "learning_rate": 4.767185902126363e-07,
303
- "logps/chosen": -48.87858200073242,
304
- "logps/rejected": -36.90644073486328,
305
- "loss": 0.633,
306
- "losses/dpo": 0.6357161998748779,
307
- "losses/sft": 0.1839471459388733,
308
- "losses/total": 0.6357161998748779,
309
- "ref_logps/chosen": -49.40204620361328,
310
- "ref_logps/rejected": -36.11450958251953,
311
- "rewards/accuracies": 0.8203125,
312
- "rewards/chosen": 0.05234625190496445,
313
- "rewards/margins": 0.13153919577598572,
314
- "rewards/rejected": -0.07919295132160187,
315
  "step": 17
316
  },
317
  {
318
  "epoch": 0.25,
319
- "grad_norm": 25.63300252359878,
320
  "learning_rate": 4.7136400641330245e-07,
321
- "logps/chosen": -46.71650695800781,
322
- "logps/rejected": -37.09510040283203,
323
- "loss": 0.6297,
324
- "losses/dpo": 0.6393631100654602,
325
- "losses/sft": 0.21227942407131195,
326
- "losses/total": 0.6393631100654602,
327
- "ref_logps/chosen": -46.991477966308594,
328
- "ref_logps/rejected": -35.969173431396484,
329
- "rewards/accuracies": 0.8203125,
330
- "rewards/chosen": 0.02749716117978096,
331
- "rewards/margins": 0.14008952677249908,
332
- "rewards/rejected": -0.11259236931800842,
333
  "step": 18
334
  },
335
  {
336
  "epoch": 0.26,
337
- "grad_norm": 26.311859157755837,
338
  "learning_rate": 4.6549241672001225e-07,
339
- "logps/chosen": -43.63357162475586,
340
- "logps/rejected": -34.979026794433594,
341
- "loss": 0.6077,
342
- "losses/dpo": 0.5548383593559265,
343
- "losses/sft": 0.19493867456912994,
344
- "losses/total": 0.5548383593559265,
345
- "ref_logps/chosen": -44.03193664550781,
346
- "ref_logps/rejected": -33.485252380371094,
347
- "rewards/accuracies": 0.8515625,
348
- "rewards/chosen": 0.03983645513653755,
349
- "rewards/margins": 0.18921390175819397,
350
- "rewards/rejected": -0.14937745034694672,
351
  "step": 19
352
  },
353
  {
354
  "epoch": 0.27,
355
- "grad_norm": 28.714173620781665,
356
  "learning_rate": 4.591175345025566e-07,
357
- "logps/chosen": -46.371559143066406,
358
- "logps/rejected": -35.243812561035156,
359
- "loss": 0.609,
360
- "losses/dpo": 0.6410955190658569,
361
- "losses/sft": 0.16183941066265106,
362
- "losses/total": 0.6410955190658569,
363
- "ref_logps/chosen": -46.70909881591797,
364
- "ref_logps/rejected": -33.71453857421875,
365
- "rewards/accuracies": 0.828125,
366
- "rewards/chosen": 0.03375420719385147,
367
- "rewards/margins": 0.18668171763420105,
368
- "rewards/rejected": -0.15292751789093018,
369
  "step": 20
370
  },
371
  {
372
  "epoch": 0.29,
373
- "grad_norm": 26.549036618365495,
374
  "learning_rate": 4.5225424859373684e-07,
375
- "logps/chosen": -41.521549224853516,
376
- "logps/rejected": -34.770103454589844,
377
- "loss": 0.5963,
378
- "losses/dpo": 0.7364767789840698,
379
- "losses/sft": 0.17622552812099457,
380
- "losses/total": 0.7364767789840698,
381
- "ref_logps/chosen": -41.7501106262207,
382
- "ref_logps/rejected": -32.80527114868164,
383
- "rewards/accuracies": 0.859375,
384
- "rewards/chosen": 0.02285606414079666,
385
- "rewards/margins": 0.21933907270431519,
386
- "rewards/rejected": -0.19648301601409912,
387
  "step": 21
388
  },
389
  {
390
  "epoch": 0.3,
391
- "grad_norm": 33.26960463303905,
392
  "learning_rate": 4.4491858851580553e-07,
393
- "logps/chosen": -45.94141387939453,
394
- "logps/rejected": -36.16654968261719,
395
- "loss": 0.5887,
396
- "losses/dpo": 0.495862752199173,
397
- "losses/sft": 0.17526012659072876,
398
- "losses/total": 0.495862752199173,
399
- "ref_logps/chosen": -46.16797637939453,
400
- "ref_logps/rejected": -33.92024612426758,
401
- "rewards/accuracies": 0.84375,
402
- "rewards/chosen": 0.02265631966292858,
403
- "rewards/margins": 0.2472866028547287,
404
- "rewards/rejected": -0.22463028132915497,
405
  "step": 22
406
  },
407
  {
408
  "epoch": 0.32,
409
- "grad_norm": 38.94504011639214,
410
  "learning_rate": 4.3712768704277524e-07,
411
- "logps/chosen": -43.17596435546875,
412
- "logps/rejected": -35.83791732788086,
413
- "loss": 0.5549,
414
- "losses/dpo": 0.6368575692176819,
415
- "losses/sft": 0.20419813692569733,
416
- "losses/total": 0.6368575692176819,
417
- "ref_logps/chosen": -43.439910888671875,
418
- "ref_logps/rejected": -32.738441467285156,
419
- "rewards/accuracies": 0.8828125,
420
- "rewards/chosen": 0.026394736021757126,
421
- "rewards/margins": 0.3363422751426697,
422
- "rewards/rejected": -0.30994755029678345,
423
  "step": 23
424
  },
425
  {
426
  "epoch": 0.33,
427
- "grad_norm": 28.33928817647071,
428
  "learning_rate": 4.2889974018603024e-07,
429
- "logps/chosen": -48.73534393310547,
430
- "logps/rejected": -40.98769760131836,
431
- "loss": 0.5358,
432
- "losses/dpo": 0.6388107538223267,
433
- "losses/sft": 0.21662825345993042,
434
- "losses/total": 0.6388107538223267,
435
- "ref_logps/chosen": -48.840187072753906,
436
- "ref_logps/rejected": -37.24340057373047,
437
- "rewards/accuracies": 0.890625,
438
- "rewards/chosen": 0.010484418831765652,
439
- "rewards/margins": 0.38491398096084595,
440
- "rewards/rejected": -0.3744295537471771,
441
  "step": 24
442
  },
443
  {
444
  "epoch": 0.34,
445
- "grad_norm": 31.571769897086057,
446
  "learning_rate": 4.2025396469669926e-07,
447
- "logps/chosen": -49.65196228027344,
448
- "logps/rejected": -39.15043258666992,
449
- "loss": 0.5317,
450
- "losses/dpo": 0.4821869134902954,
451
- "losses/sft": 0.2129327803850174,
452
- "losses/total": 0.4821869134902954,
453
- "ref_logps/chosen": -49.09580993652344,
454
- "ref_logps/rejected": -34.47374725341797,
455
- "rewards/accuracies": 0.8828125,
456
- "rewards/chosen": -0.05561504885554314,
457
- "rewards/margins": 0.41205331683158875,
458
- "rewards/rejected": -0.467668354511261,
459
  "step": 25
460
  },
461
  {
462
  "epoch": 0.36,
463
- "grad_norm": 20.54896163205101,
464
  "learning_rate": 4.112105531840426e-07,
465
- "logps/chosen": -50.22370529174805,
466
- "logps/rejected": -38.49211120605469,
467
- "loss": 0.5133,
468
- "losses/dpo": 0.6953214406967163,
469
- "losses/sft": 0.1770307421684265,
470
- "losses/total": 0.6953214406967163,
471
- "ref_logps/chosen": -49.23892593383789,
472
- "ref_logps/rejected": -32.732269287109375,
473
- "rewards/accuracies": 0.8671875,
474
- "rewards/chosen": -0.09847792983055115,
475
- "rewards/margins": 0.4775061011314392,
476
- "rewards/rejected": -0.575984001159668,
477
  "step": 26
478
  },
479
  {
480
  "epoch": 0.37,
481
- "grad_norm": 24.210290197713302,
482
  "learning_rate": 4.017906269546778e-07,
483
- "logps/chosen": -48.78424072265625,
484
- "logps/rejected": -39.4119758605957,
485
- "loss": 0.5025,
486
- "losses/dpo": 0.2536649703979492,
487
- "losses/sft": 0.17507979273796082,
488
- "losses/total": 0.2536649703979492,
489
- "ref_logps/chosen": -47.147621154785156,
490
- "ref_logps/rejected": -32.35851287841797,
491
- "rewards/accuracies": 0.84375,
492
- "rewards/chosen": -0.16366226971149445,
493
- "rewards/margins": 0.5416839718818665,
494
- "rewards/rejected": -0.7053462266921997,
495
  "step": 27
496
  },
497
  {
498
  "epoch": 0.38,
499
- "grad_norm": 25.054325101536794,
500
  "learning_rate": 3.920161866827889e-07,
501
- "logps/chosen": -46.48284912109375,
502
- "logps/rejected": -40.55732727050781,
503
- "loss": 0.5225,
504
- "losses/dpo": 0.6159500479698181,
505
- "losses/sft": 0.18471354246139526,
506
- "losses/total": 0.6159500479698181,
507
- "ref_logps/chosen": -44.64717102050781,
508
- "ref_logps/rejected": -34.08299255371094,
509
  "rewards/accuracies": 0.84375,
510
- "rewards/chosen": -0.18356791138648987,
511
- "rewards/margins": 0.46386560797691345,
512
- "rewards/rejected": -0.6474335193634033,
513
  "step": 28
514
  },
515
  {
516
  "epoch": 0.4,
517
- "grad_norm": 25.059885652690767,
518
  "learning_rate": 3.8191006102653317e-07,
519
- "logps/chosen": -50.65240478515625,
520
- "logps/rejected": -44.85976028442383,
521
- "loss": 0.4509,
522
- "losses/dpo": 0.5429763793945312,
523
- "losses/sft": 0.19810011982917786,
524
- "losses/total": 0.5429763793945312,
525
- "ref_logps/chosen": -47.85638427734375,
526
- "ref_logps/rejected": -35.169281005859375,
527
- "rewards/accuracies": 0.9140625,
528
- "rewards/chosen": -0.27960240840911865,
529
- "rewards/margins": 0.6894451975822449,
530
- "rewards/rejected": -0.9690475463867188,
531
  "step": 29
532
  },
533
  {
534
  "epoch": 0.41,
535
- "grad_norm": 19.99856582783424,
536
  "learning_rate": 3.7149585331065145e-07,
537
- "logps/chosen": -49.85383605957031,
538
- "logps/rejected": -45.81809997558594,
539
- "loss": 0.4332,
540
- "losses/dpo": 0.29431843757629395,
541
- "losses/sft": 0.18581561744213104,
542
- "losses/total": 0.29431843757629395,
543
- "ref_logps/chosen": -46.770938873291016,
544
- "ref_logps/rejected": -34.5809326171875,
545
- "rewards/accuracies": 0.8671875,
546
- "rewards/chosen": -0.3082895576953888,
547
- "rewards/margins": 0.8154268264770508,
548
- "rewards/rejected": -1.1237163543701172,
549
  "step": 30
550
  },
551
  {
552
  "epoch": 0.42,
553
- "grad_norm": 34.79633257386577,
554
  "learning_rate": 3.6079788639981036e-07,
555
- "logps/chosen": -52.836326599121094,
556
- "logps/rejected": -46.93244934082031,
557
- "loss": 0.4604,
558
- "losses/dpo": 0.8810983300209045,
559
- "losses/sft": 0.23828193545341492,
560
- "losses/total": 0.8810983300209045,
561
- "ref_logps/chosen": -49.11648178100586,
562
- "ref_logps/rejected": -36.381752014160156,
563
- "rewards/accuracies": 0.8984375,
564
- "rewards/chosen": -0.3719848394393921,
565
- "rewards/margins": 0.6830847263336182,
566
- "rewards/rejected": -1.0550695657730103,
567
  "step": 31
568
  },
569
  {
570
  "epoch": 0.44,
571
- "grad_norm": 23.026509844905394,
572
  "learning_rate": 3.498411458914238e-07,
573
- "logps/chosen": -50.38003921508789,
574
- "logps/rejected": -45.10429763793945,
575
- "loss": 0.4393,
576
- "losses/dpo": 0.15313033759593964,
577
- "losses/sft": 0.19763650000095367,
578
- "losses/total": 0.15313033759593964,
579
- "ref_logps/chosen": -46.028076171875,
580
- "ref_logps/rejected": -33.00657272338867,
581
- "rewards/accuracies": 0.875,
582
- "rewards/chosen": -0.4351964592933655,
583
- "rewards/margins": 0.7745760679244995,
584
- "rewards/rejected": -1.2097725868225098,
585
  "step": 32
586
  },
587
  {
588
  "epoch": 0.45,
589
- "grad_norm": 18.317574609447647,
590
  "learning_rate": 3.3865122176063385e-07,
591
- "logps/chosen": -51.4942512512207,
592
- "logps/rejected": -49.96583557128906,
593
- "loss": 0.4075,
594
- "losses/dpo": 0.1953999102115631,
595
- "losses/sft": 0.29790106415748596,
596
- "losses/total": 0.1953999102115631,
597
- "ref_logps/chosen": -45.6589469909668,
598
- "ref_logps/rejected": -34.858577728271484,
599
- "rewards/accuracies": 0.8515625,
600
- "rewards/chosen": -0.5835303068161011,
601
- "rewards/margins": 0.9271953105926514,
602
- "rewards/rejected": -1.510725736618042,
603
  "step": 33
604
  },
605
  {
606
  "epoch": 0.47,
607
- "grad_norm": 19.255871137244554,
608
  "learning_rate": 3.272542485937368e-07,
609
- "logps/chosen": -50.351234436035156,
610
- "logps/rejected": -48.89935302734375,
611
- "loss": 0.3959,
612
- "losses/dpo": 0.4281933605670929,
613
- "losses/sft": 0.19774244725704193,
614
- "losses/total": 0.4281933605670929,
615
- "ref_logps/chosen": -43.48761749267578,
616
- "ref_logps/rejected": -32.255577087402344,
617
- "rewards/accuracies": 0.859375,
618
- "rewards/chosen": -0.68636155128479,
619
- "rewards/margins": 0.9780160188674927,
620
- "rewards/rejected": -1.6643775701522827,
621
  "step": 34
622
  },
623
  {
624
  "epoch": 0.48,
625
- "grad_norm": 17.53385145494046,
626
  "learning_rate": 3.1567684454964674e-07,
627
- "logps/chosen": -49.46981430053711,
628
- "logps/rejected": -49.80710220336914,
629
- "loss": 0.4011,
630
- "losses/dpo": 0.5663512945175171,
631
- "losses/sft": 0.24904295802116394,
632
- "losses/total": 0.5663512945175171,
633
- "ref_logps/chosen": -42.88325500488281,
634
- "ref_logps/rejected": -33.13590621948242,
635
- "rewards/accuracies": 0.890625,
636
- "rewards/chosen": -0.6586559414863586,
637
- "rewards/margins": 1.0084636211395264,
638
- "rewards/rejected": -1.6671196222305298,
639
  "step": 35
640
  },
641
  {
642
  "epoch": 0.49,
643
- "grad_norm": 157.5390863725062,
644
  "learning_rate": 3.0394604919195157e-07,
645
- "logps/chosen": -50.14772415161133,
646
- "logps/rejected": -49.97753143310547,
647
- "loss": 0.4132,
648
- "losses/dpo": 0.6134005784988403,
649
- "losses/sft": 0.1941785216331482,
650
- "losses/total": 0.6134005784988403,
651
- "ref_logps/chosen": -42.886375427246094,
652
- "ref_logps/rejected": -32.889442443847656,
653
- "rewards/accuracies": 0.859375,
654
- "rewards/chosen": -0.7261347770690918,
655
- "rewards/margins": 0.9826743006706238,
656
- "rewards/rejected": -1.7088091373443604,
657
  "step": 36
658
  },
659
  {
660
  "epoch": 0.51,
661
- "grad_norm": 30.744138000924785,
662
  "learning_rate": 2.920892603367596e-07,
663
- "logps/chosen": -52.53690719604492,
664
- "logps/rejected": -51.7293701171875,
665
- "loss": 0.4345,
666
- "losses/dpo": 0.39982184767723083,
667
- "losses/sft": 0.16318069398403168,
668
- "losses/total": 0.39982184767723083,
669
- "ref_logps/chosen": -44.043270111083984,
670
- "ref_logps/rejected": -33.67184066772461,
671
- "rewards/accuracies": 0.8203125,
672
- "rewards/chosen": -0.8493636250495911,
673
- "rewards/margins": 0.956389307975769,
674
- "rewards/rejected": -1.8057528734207153,
675
  "step": 37
676
  },
677
  {
678
  "epoch": 0.52,
679
- "grad_norm": 18.608606064784283,
680
  "learning_rate": 2.801341700638307e-07,
681
- "logps/chosen": -54.247406005859375,
682
- "logps/rejected": -51.46720886230469,
683
- "loss": 0.4308,
684
- "losses/dpo": 0.7559365630149841,
685
- "losses/sft": 0.20898960530757904,
686
- "losses/total": 0.7559365630149841,
687
- "ref_logps/chosen": -47.05962371826172,
688
- "ref_logps/rejected": -34.95857238769531,
689
- "rewards/accuracies": 0.828125,
690
- "rewards/chosen": -0.7187784910202026,
691
- "rewards/margins": 0.9320851564407349,
692
- "rewards/rejected": -1.6508636474609375,
693
  "step": 38
694
  },
695
  {
696
  "epoch": 0.53,
697
- "grad_norm": 47.98397942977545,
698
  "learning_rate": 2.681087000404406e-07,
699
- "logps/chosen": -53.239768981933594,
700
- "logps/rejected": -52.34550476074219,
701
- "loss": 0.3907,
702
- "losses/dpo": 0.31572413444519043,
703
- "losses/sft": 0.18499067425727844,
704
- "losses/total": 0.31572413444519043,
705
- "ref_logps/chosen": -45.19135284423828,
706
- "ref_logps/rejected": -33.13307189941406,
707
- "rewards/accuracies": 0.90625,
708
- "rewards/chosen": -0.8048416972160339,
709
- "rewards/margins": 1.1164013147354126,
710
- "rewards/rejected": -1.9212429523468018,
711
  "step": 39
712
  },
713
  {
714
  "epoch": 0.55,
715
- "grad_norm": 21.523748609052035,
716
  "learning_rate": 2.5604093630903305e-07,
717
- "logps/chosen": -53.806236267089844,
718
- "logps/rejected": -54.13373565673828,
719
- "loss": 0.3678,
720
- "losses/dpo": 0.6854045391082764,
721
- "losses/sft": 0.21097487211227417,
722
- "losses/total": 0.6854045391082764,
723
- "ref_logps/chosen": -44.96014404296875,
724
- "ref_logps/rejected": -34.04387664794922,
725
- "rewards/accuracies": 0.890625,
726
- "rewards/chosen": -0.8846092224121094,
727
- "rewards/margins": 1.1243770122528076,
728
- "rewards/rejected": -2.008985996246338,
729
  "step": 40
730
  },
731
  {
732
  "epoch": 0.56,
733
- "grad_norm": 20.63046978113073,
734
  "learning_rate": 2.43959063690967e-07,
735
- "logps/chosen": -56.91130065917969,
736
- "logps/rejected": -54.714378356933594,
737
- "loss": 0.3872,
738
- "losses/dpo": 0.1204671785235405,
739
- "losses/sft": 0.17937365174293518,
740
- "losses/total": 0.1204671785235405,
741
- "ref_logps/chosen": -47.74310302734375,
742
- "ref_logps/rejected": -34.866615295410156,
743
- "rewards/accuracies": 0.859375,
744
- "rewards/chosen": -0.9168204069137573,
745
- "rewards/margins": 1.0679559707641602,
746
- "rewards/rejected": -1.984776258468628,
747
  "step": 41
748
  },
749
  {
750
  "epoch": 0.58,
751
- "grad_norm": 27.841791874606287,
752
  "learning_rate": 2.3189129995955942e-07,
753
- "logps/chosen": -56.37548065185547,
754
- "logps/rejected": -55.140594482421875,
755
- "loss": 0.3703,
756
- "losses/dpo": 0.6694349646568298,
757
- "losses/sft": 0.15415219962596893,
758
- "losses/total": 0.6694349646568298,
759
- "ref_logps/chosen": -46.114707946777344,
760
- "ref_logps/rejected": -33.19464111328125,
761
- "rewards/accuracies": 0.8671875,
762
- "rewards/chosen": -1.0260775089263916,
763
- "rewards/margins": 1.16851806640625,
764
- "rewards/rejected": -2.1945955753326416,
765
  "step": 42
766
  },
767
  {
768
  "epoch": 0.59,
769
- "grad_norm": 20.157417684445996,
770
  "learning_rate": 2.1986582993616925e-07,
771
- "logps/chosen": -55.861724853515625,
772
- "logps/rejected": -55.27591323852539,
773
- "loss": 0.4096,
774
- "losses/dpo": 0.253600537776947,
775
- "losses/sft": 0.25442296266555786,
776
- "losses/total": 0.253600537776947,
777
- "ref_logps/chosen": -46.024993896484375,
778
- "ref_logps/rejected": -34.88616180419922,
779
- "rewards/accuracies": 0.859375,
780
- "rewards/chosen": -0.9836731553077698,
781
- "rewards/margins": 1.0553019046783447,
782
- "rewards/rejected": -2.038975238800049,
783
  "step": 43
784
  },
785
  {
786
  "epoch": 0.6,
787
- "grad_norm": 22.91868411351925,
788
  "learning_rate": 2.0791073966324034e-07,
789
- "logps/chosen": -56.3699836730957,
790
- "logps/rejected": -58.20032501220703,
791
- "loss": 0.3645,
792
- "losses/dpo": 0.05803808197379112,
793
- "losses/sft": 0.16261443495750427,
794
- "losses/total": 0.05803808197379112,
795
- "ref_logps/chosen": -46.18814468383789,
796
- "ref_logps/rejected": -35.7181396484375,
797
- "rewards/accuracies": 0.8828125,
798
- "rewards/chosen": -1.018183708190918,
799
- "rewards/margins": 1.230034351348877,
800
- "rewards/rejected": -2.248218059539795,
801
  "step": 44
802
  },
803
  {
804
  "epoch": 0.62,
805
- "grad_norm": 24.665726952614282,
806
  "learning_rate": 1.960539508080485e-07,
807
- "logps/chosen": -55.33811569213867,
808
- "logps/rejected": -56.2475700378418,
809
- "loss": 0.4363,
810
- "losses/dpo": 0.6756047606468201,
811
- "losses/sft": 0.1989610195159912,
812
- "losses/total": 0.6756047606468201,
813
- "ref_logps/chosen": -42.876373291015625,
814
- "ref_logps/rejected": -33.306602478027344,
815
- "rewards/accuracies": 0.8125,
816
- "rewards/chosen": -1.2461739778518677,
817
- "rewards/margins": 1.0479230880737305,
818
- "rewards/rejected": -2.2940969467163086,
819
  "step": 45
820
  },
821
  {
822
  "epoch": 0.63,
823
- "grad_norm": 35.04495782063734,
824
  "learning_rate": 1.8432315545035327e-07,
825
- "logps/chosen": -59.337791442871094,
826
- "logps/rejected": -60.82359313964844,
827
- "loss": 0.3701,
828
- "losses/dpo": 0.24237556755542755,
829
- "losses/sft": 0.14872561395168304,
830
- "losses/total": 0.24237556755542755,
831
- "ref_logps/chosen": -46.916419982910156,
832
- "ref_logps/rejected": -36.144935607910156,
833
- "rewards/accuracies": 0.8671875,
834
- "rewards/chosen": -1.2421373128890991,
835
- "rewards/margins": 1.2257287502288818,
836
- "rewards/rejected": -2.4678661823272705,
837
  "step": 46
838
  },
839
  {
840
  "epoch": 0.64,
841
- "grad_norm": 18.874251761700755,
842
  "learning_rate": 1.7274575140626315e-07,
843
- "logps/chosen": -60.359886169433594,
844
- "logps/rejected": -56.043479919433594,
845
- "loss": 0.3903,
846
- "losses/dpo": 0.6876823902130127,
847
- "losses/sft": 0.163571298122406,
848
- "losses/total": 0.6876823902130127,
849
- "ref_logps/chosen": -49.23930358886719,
850
- "ref_logps/rejected": -34.02153778076172,
851
- "rewards/accuracies": 0.8984375,
852
- "rewards/chosen": -1.1120576858520508,
853
- "rewards/margins": 1.0901365280151367,
854
- "rewards/rejected": -2.2021942138671875,
855
  "step": 47
856
  },
857
  {
858
  "epoch": 0.66,
859
- "grad_norm": 29.114539057876968,
860
  "learning_rate": 1.6134877823936607e-07,
861
- "logps/chosen": -60.98393249511719,
862
- "logps/rejected": -58.489444732666016,
863
- "loss": 0.4011,
864
- "losses/dpo": 0.03265048563480377,
865
- "losses/sft": 0.14689283072948456,
866
- "losses/total": 0.03265048563480377,
867
- "ref_logps/chosen": -49.34606170654297,
868
- "ref_logps/rejected": -36.67803955078125,
869
- "rewards/accuracies": 0.8671875,
870
- "rewards/chosen": -1.1637871265411377,
871
- "rewards/margins": 1.0173530578613281,
872
- "rewards/rejected": -2.181140184402466,
873
  "step": 48
874
  },
875
  {
876
  "epoch": 0.67,
877
- "grad_norm": 21.107662898541907,
878
  "learning_rate": 1.5015885410857614e-07,
879
- "logps/chosen": -60.81307601928711,
880
- "logps/rejected": -59.90397262573242,
881
- "loss": 0.3897,
882
- "losses/dpo": 0.33075177669525146,
883
- "losses/sft": 0.214824840426445,
884
- "losses/total": 0.33075177669525146,
885
- "ref_logps/chosen": -46.25496292114258,
886
- "ref_logps/rejected": -33.91436004638672,
887
- "rewards/accuracies": 0.859375,
888
- "rewards/chosen": -1.4558112621307373,
889
- "rewards/margins": 1.143149971961975,
890
- "rewards/rejected": -2.598961114883423,
891
  "step": 49
892
  },
893
  {
894
  "epoch": 0.68,
895
- "grad_norm": 26.95108201172052,
896
  "learning_rate": 1.392021136001897e-07,
897
- "logps/chosen": -56.23418426513672,
898
- "logps/rejected": -56.328125,
899
- "loss": 0.3964,
900
- "losses/dpo": 0.03794693946838379,
901
- "losses/sft": 0.19881302118301392,
902
- "losses/total": 0.03794693946838379,
903
- "ref_logps/chosen": -42.96794891357422,
904
- "ref_logps/rejected": -32.164451599121094,
905
- "rewards/accuracies": 0.875,
906
- "rewards/chosen": -1.3266233205795288,
907
- "rewards/margins": 1.089743971824646,
908
- "rewards/rejected": -2.416367530822754,
909
  "step": 50
910
  },
911
  {
912
  "epoch": 0.7,
913
- "grad_norm": 33.76828619344551,
914
  "learning_rate": 1.2850414668934847e-07,
915
- "logps/chosen": -61.50416946411133,
916
- "logps/rejected": -59.79325485229492,
917
- "loss": 0.3827,
918
- "losses/dpo": 0.5413109660148621,
919
- "losses/sft": 0.30467280745506287,
920
- "losses/total": 0.5413109660148621,
921
- "ref_logps/chosen": -48.96829605102539,
922
- "ref_logps/rejected": -35.99717330932617,
923
- "rewards/accuracies": 0.9375,
924
- "rewards/chosen": -1.2535876035690308,
925
- "rewards/margins": 1.1260210275650024,
926
- "rewards/rejected": -2.379608631134033,
927
  "step": 51
928
  },
929
  {
930
  "epoch": 0.71,
931
- "grad_norm": 16.559964106722745,
932
  "learning_rate": 1.1808993897346678e-07,
933
- "logps/chosen": -58.611270904541016,
934
- "logps/rejected": -58.919395446777344,
935
- "loss": 0.3796,
936
- "losses/dpo": 0.3290981352329254,
937
- "losses/sft": 0.19547075033187866,
938
- "losses/total": 0.3290981352329254,
939
- "ref_logps/chosen": -46.96087646484375,
940
- "ref_logps/rejected": -36.086090087890625,
941
- "rewards/accuracies": 0.90625,
942
- "rewards/chosen": -1.1650400161743164,
943
- "rewards/margins": 1.1182900667190552,
944
- "rewards/rejected": -2.283329963684082,
945
  "step": 52
946
  },
947
  {
948
  "epoch": 0.73,
949
- "grad_norm": 25.26391431571928,
950
  "learning_rate": 1.0798381331721107e-07,
951
- "logps/chosen": -58.2769775390625,
952
- "logps/rejected": -57.12656021118164,
953
- "loss": 0.3707,
954
- "losses/dpo": 0.3912191092967987,
955
- "losses/sft": 0.20826196670532227,
956
- "losses/total": 0.3912191092967987,
957
- "ref_logps/chosen": -46.01140213012695,
958
- "ref_logps/rejected": -32.54326629638672,
959
- "rewards/accuracies": 0.859375,
960
- "rewards/chosen": -1.226557731628418,
961
- "rewards/margins": 1.2317723035812378,
962
- "rewards/rejected": -2.4583301544189453,
963
  "step": 53
964
  },
965
  {
966
  "epoch": 0.74,
967
- "grad_norm": 18.669814077600197,
968
  "learning_rate": 9.82093730453222e-08,
969
- "logps/chosen": -57.36506271362305,
970
- "logps/rejected": -57.83528137207031,
971
- "loss": 0.4249,
972
- "losses/dpo": 0.28024712204933167,
973
- "losses/sft": 0.21661897003650665,
974
- "losses/total": 0.28024712204933167,
975
- "ref_logps/chosen": -44.405941009521484,
976
- "ref_logps/rejected": -34.53661346435547,
977
- "rewards/accuracies": 0.8671875,
978
- "rewards/chosen": -1.295912265777588,
979
- "rewards/margins": 1.0339548587799072,
980
- "rewards/rejected": -2.329867124557495,
981
  "step": 54
982
  },
983
  {
984
  "epoch": 0.75,
985
- "grad_norm": 17.65819121351904,
986
  "learning_rate": 8.87894468159574e-08,
987
- "logps/chosen": -60.354469299316406,
988
- "logps/rejected": -60.50645065307617,
989
- "loss": 0.3985,
990
- "losses/dpo": 0.9817911386489868,
991
- "losses/sft": 0.1904633343219757,
992
- "losses/total": 0.9817911386489868,
993
- "ref_logps/chosen": -46.499290466308594,
994
- "ref_logps/rejected": -34.763404846191406,
995
- "rewards/accuracies": 0.8359375,
996
- "rewards/chosen": -1.3855178356170654,
997
- "rewards/margins": 1.1887872219085693,
998
- "rewards/rejected": -2.5743050575256348,
999
  "step": 55
1000
  },
1001
  {
1002
  "epoch": 0.77,
1003
- "grad_norm": 23.90292670438398,
1004
  "learning_rate": 7.974603530330067e-08,
1005
- "logps/chosen": -55.58333206176758,
1006
- "logps/rejected": -55.52084732055664,
1007
- "loss": 0.3777,
1008
- "losses/dpo": 0.04075286537408829,
1009
- "losses/sft": 0.22049269080162048,
1010
- "losses/total": 0.04075286537408829,
1011
- "ref_logps/chosen": -43.25560760498047,
1012
- "ref_logps/rejected": -31.006759643554688,
1013
- "rewards/accuracies": 0.8828125,
1014
- "rewards/chosen": -1.2327725887298584,
1015
- "rewards/margins": 1.2186365127563477,
1016
- "rewards/rejected": -2.451408863067627,
1017
  "step": 56
1018
  },
1019
  {
1020
  "epoch": 0.78,
1021
- "grad_norm": 28.08593658686289,
1022
  "learning_rate": 7.110025981396975e-08,
1023
- "logps/chosen": -58.75514221191406,
1024
- "logps/rejected": -58.784584045410156,
1025
- "loss": 0.4449,
1026
- "losses/dpo": 0.4793856143951416,
1027
- "losses/sft": 0.20940393209457397,
1028
- "losses/total": 0.4793856143951416,
1029
- "ref_logps/chosen": -45.29600524902344,
1030
- "ref_logps/rejected": -34.97162628173828,
1031
- "rewards/accuracies": 0.8046875,
1032
- "rewards/chosen": -1.3459134101867676,
1033
- "rewards/margins": 1.0353822708129883,
1034
- "rewards/rejected": -2.381295680999756,
1035
  "step": 57
1036
  },
1037
  {
1038
  "epoch": 0.79,
1039
- "grad_norm": 24.077339089176505,
1040
  "learning_rate": 6.28723129572247e-08,
1041
- "logps/chosen": -55.75697326660156,
1042
- "logps/rejected": -56.72669219970703,
1043
- "loss": 0.3567,
1044
- "losses/dpo": 0.21238191425800323,
1045
- "losses/sft": 0.1661817878484726,
1046
- "losses/total": 0.21238191425800323,
1047
- "ref_logps/chosen": -44.3855094909668,
1048
- "ref_logps/rejected": -32.21479797363281,
1049
- "rewards/accuracies": 0.890625,
1050
- "rewards/chosen": -1.137147068977356,
1051
- "rewards/margins": 1.314042568206787,
1052
- "rewards/rejected": -2.4511895179748535,
1053
  "step": 58
1054
  },
1055
  {
1056
  "epoch": 0.81,
1057
- "grad_norm": 43.46612828134844,
1058
  "learning_rate": 5.508141148419443e-08,
1059
- "logps/chosen": -61.76049041748047,
1060
- "logps/rejected": -62.041648864746094,
1061
- "loss": 0.3688,
1062
- "losses/dpo": 0.27996987104415894,
1063
- "losses/sft": 0.1737639456987381,
1064
- "losses/total": 0.27996987104415894,
1065
- "ref_logps/chosen": -49.25553894042969,
1066
- "ref_logps/rejected": -36.210182189941406,
1067
- "rewards/accuracies": 0.84375,
1068
- "rewards/chosen": -1.250495195388794,
1069
- "rewards/margins": 1.3326513767242432,
1070
- "rewards/rejected": -2.583146572113037,
1071
  "step": 59
1072
  },
1073
  {
1074
  "epoch": 0.82,
1075
- "grad_norm": 22.779198271573037,
1076
  "learning_rate": 4.774575140626316e-08,
1077
- "logps/chosen": -55.46681594848633,
1078
- "logps/rejected": -57.17453384399414,
1079
- "loss": 0.3531,
1080
- "losses/dpo": 0.046613942831754684,
1081
- "losses/sft": 0.20427729189395905,
1082
- "losses/total": 0.046613942831754684,
1083
- "ref_logps/chosen": -42.29081726074219,
1084
- "ref_logps/rejected": -30.75497817993164,
1085
- "rewards/accuracies": 0.8984375,
1086
- "rewards/chosen": -1.3175995349884033,
1087
- "rewards/margins": 1.3243558406829834,
1088
- "rewards/rejected": -2.6419553756713867,
1089
  "step": 60
1090
  },
1091
  {
1092
  "epoch": 0.84,
1093
- "grad_norm": 20.59368424342303,
1094
  "learning_rate": 4.0882465497443313e-08,
1095
- "logps/chosen": -58.52223587036133,
1096
- "logps/rejected": -56.04042053222656,
1097
- "loss": 0.3923,
1098
- "losses/dpo": 0.26003214716911316,
1099
- "losses/sft": 0.17392012476921082,
1100
- "losses/total": 0.26003214716911316,
1101
- "ref_logps/chosen": -48.404632568359375,
1102
- "ref_logps/rejected": -34.86602783203125,
1103
- "rewards/accuracies": 0.890625,
1104
- "rewards/chosen": -1.0117601156234741,
1105
- "rewards/margins": 1.1056792736053467,
1106
- "rewards/rejected": -2.1174392700195312,
1107
  "step": 61
1108
  },
1109
  {
1110
  "epoch": 0.85,
1111
- "grad_norm": 23.660376428219948,
1112
  "learning_rate": 3.450758327998768e-08,
1113
- "logps/chosen": -60.401039123535156,
1114
- "logps/rejected": -60.10982131958008,
1115
- "loss": 0.3902,
1116
- "losses/dpo": 0.01773645170032978,
1117
- "losses/sft": 0.17717282474040985,
1118
- "losses/total": 0.01773645170032978,
1119
- "ref_logps/chosen": -48.241943359375,
1120
- "ref_logps/rejected": -34.582366943359375,
1121
- "rewards/accuracies": 0.890625,
1122
- "rewards/chosen": -1.215909719467163,
1123
- "rewards/margins": 1.3368357419967651,
1124
- "rewards/rejected": -2.5527453422546387,
1125
  "step": 62
1126
  },
1127
  {
1128
  "epoch": 0.86,
1129
- "grad_norm": 86.96881294099092,
1130
  "learning_rate": 2.863599358669755e-08,
1131
- "logps/chosen": -56.905418395996094,
1132
- "logps/rejected": -56.808746337890625,
1133
- "loss": 0.3944,
1134
- "losses/dpo": 0.15065120160579681,
1135
- "losses/sft": 0.22477349638938904,
1136
- "losses/total": 0.15065120160579681,
1137
- "ref_logps/chosen": -44.15583038330078,
1138
- "ref_logps/rejected": -33.21840286254883,
1139
- "rewards/accuracies": 0.828125,
1140
- "rewards/chosen": -1.2749593257904053,
1141
- "rewards/margins": 1.0840749740600586,
1142
- "rewards/rejected": -2.359034299850464,
1143
  "step": 63
1144
  },
1145
  {
1146
  "epoch": 0.88,
1147
- "grad_norm": 18.8337077576639,
1148
  "learning_rate": 2.3281409787363648e-08,
1149
- "logps/chosen": -57.604774475097656,
1150
- "logps/rejected": -57.78453063964844,
1151
- "loss": 0.3863,
1152
- "losses/dpo": 0.41682732105255127,
1153
- "losses/sft": 0.16616390645503998,
1154
- "losses/total": 0.41682732105255127,
1155
- "ref_logps/chosen": -43.315818786621094,
1156
- "ref_logps/rejected": -31.524248123168945,
1157
- "rewards/accuracies": 0.8671875,
1158
- "rewards/chosen": -1.4288955926895142,
1159
- "rewards/margins": 1.1971325874328613,
1160
- "rewards/rejected": -2.626028537750244,
1161
  "step": 64
1162
  },
1163
  {
1164
  "epoch": 0.89,
1165
- "grad_norm": 374.1054719017444,
1166
  "learning_rate": 1.845633776055591e-08,
1167
- "logps/chosen": -57.63691711425781,
1168
- "logps/rejected": -58.3455810546875,
1169
- "loss": 0.3882,
1170
- "losses/dpo": 0.26508828997612,
1171
- "losses/sft": 0.2718198001384735,
1172
- "losses/total": 0.26508828997612,
1173
- "ref_logps/chosen": -44.429481506347656,
1174
- "ref_logps/rejected": -33.13744354248047,
1175
- "rewards/accuracies": 0.875,
1176
- "rewards/chosen": -1.3207435607910156,
1177
- "rewards/margins": 1.2000699043273926,
1178
- "rewards/rejected": -2.520813465118408,
1179
  "step": 65
1180
  },
1181
  {
1182
  "epoch": 0.9,
1183
- "grad_norm": 26.70970124014032,
1184
  "learning_rate": 1.4172046685564209e-08,
1185
- "logps/chosen": -58.663551330566406,
1186
- "logps/rejected": -58.07282257080078,
1187
- "loss": 0.3962,
1188
- "losses/dpo": 0.08177483081817627,
1189
- "losses/sft": 0.18531636893749237,
1190
- "losses/total": 0.08177483081817627,
1191
- "ref_logps/chosen": -45.821983337402344,
1192
- "ref_logps/rejected": -33.62261199951172,
1193
- "rewards/accuracies": 0.875,
1194
- "rewards/chosen": -1.2841567993164062,
1195
- "rewards/margins": 1.1608643531799316,
1196
- "rewards/rejected": -2.445021390914917,
1197
  "step": 66
1198
  },
1199
  {
1200
  "epoch": 0.92,
1201
- "grad_norm": 25.593261462625442,
1202
  "learning_rate": 1.0438542722708444e-08,
1203
- "logps/chosen": -59.08097839355469,
1204
- "logps/rejected": -59.16502380371094,
1205
- "loss": 0.3836,
1206
- "losses/dpo": 0.02788337506353855,
1207
- "losses/sft": 0.19819076359272003,
1208
- "losses/total": 0.02788337506353855,
1209
- "ref_logps/chosen": -45.94892883300781,
1210
- "ref_logps/rejected": -33.597511291503906,
1211
- "rewards/accuracies": 0.8828125,
1212
- "rewards/chosen": -1.3132052421569824,
1213
- "rewards/margins": 1.2435462474822998,
1214
- "rewards/rejected": -2.556751251220703,
1215
  "step": 67
1216
  },
1217
  {
1218
  "epoch": 0.93,
1219
- "grad_norm": 25.28796063034412,
1220
  "learning_rate": 7.2645456434869965e-09,
1221
- "logps/chosen": -57.95222473144531,
1222
- "logps/rejected": -58.91720199584961,
1223
- "loss": 0.3915,
1224
- "losses/dpo": 1.2907841205596924,
1225
- "losses/sft": 0.20458956062793732,
1226
- "losses/total": 1.2907841205596924,
1227
- "ref_logps/chosen": -45.50114440917969,
1228
- "ref_logps/rejected": -35.063446044921875,
1229
- "rewards/accuracies": 0.890625,
1230
- "rewards/chosen": -1.2451080083847046,
1231
- "rewards/margins": 1.140267252922058,
1232
- "rewards/rejected": -2.385375499725342,
1233
  "step": 68
1234
  },
1235
  {
1236
  "epoch": 0.95,
1237
- "grad_norm": 30.554099185463503,
1238
  "learning_rate": 4.657468465146641e-09,
1239
- "logps/chosen": -57.99516296386719,
1240
- "logps/rejected": -55.496768951416016,
1241
- "loss": 0.3752,
1242
- "losses/dpo": 0.20264464616775513,
1243
- "losses/sft": 0.17493540048599243,
1244
- "losses/total": 0.20264464616775513,
1245
- "ref_logps/chosen": -47.58026123046875,
1246
- "ref_logps/rejected": -33.345062255859375,
1247
- "rewards/accuracies": 0.890625,
1248
- "rewards/chosen": -1.041489839553833,
1249
- "rewards/margins": 1.1736811399459839,
1250
- "rewards/rejected": -2.2151710987091064,
1251
  "step": 69
1252
  },
1253
  {
1254
  "epoch": 0.96,
1255
- "grad_norm": 21.555895701368716,
1256
  "learning_rate": 2.6234001372372193e-09,
1257
- "logps/chosen": -55.79784393310547,
1258
- "logps/rejected": -54.85697555541992,
1259
- "loss": 0.4513,
1260
- "losses/dpo": 0.6288288235664368,
1261
- "losses/sft": 0.25858786702156067,
1262
- "losses/total": 0.6288288235664368,
1263
- "ref_logps/chosen": -42.008121490478516,
1264
- "ref_logps/rejected": -31.47281265258789,
1265
- "rewards/accuracies": 0.828125,
1266
- "rewards/chosen": -1.3789721727371216,
1267
- "rewards/margins": 0.9594441056251526,
1268
- "rewards/rejected": -2.338416337966919,
1269
  "step": 70
1270
  },
1271
  {
1272
  "epoch": 0.97,
1273
- "grad_norm": 21.73384383499147,
1274
  "learning_rate": 1.167091320587843e-09,
1275
- "logps/chosen": -56.99696350097656,
1276
- "logps/rejected": -59.2013053894043,
1277
- "loss": 0.3554,
1278
- "losses/dpo": 0.09169570356607437,
1279
- "losses/sft": 0.20991858839988708,
1280
- "losses/total": 0.09169570356607437,
1281
- "ref_logps/chosen": -42.36278533935547,
1282
- "ref_logps/rejected": -31.79424476623535,
1283
- "rewards/accuracies": 0.890625,
1284
- "rewards/chosen": -1.463417887687683,
1285
- "rewards/margins": 1.2772881984710693,
1286
- "rewards/rejected": -2.740705966949463,
1287
  "step": 71
1288
  },
1289
  {
1290
  "epoch": 0.99,
1291
- "grad_norm": 30.958564799186906,
1292
  "learning_rate": 2.9194329191833953e-10,
1293
- "logps/chosen": -58.35291290283203,
1294
- "logps/rejected": -56.74859619140625,
1295
- "loss": 0.3706,
1296
- "losses/dpo": 0.3077165484428406,
1297
- "losses/sft": 0.17356029152870178,
1298
- "losses/total": 0.3077165484428406,
1299
- "ref_logps/chosen": -44.90869903564453,
1300
- "ref_logps/rejected": -31.324697494506836,
1301
- "rewards/accuracies": 0.890625,
1302
- "rewards/chosen": -1.34442138671875,
1303
- "rewards/margins": 1.197968602180481,
1304
- "rewards/rejected": -2.5423898696899414,
1305
  "step": 72
1306
  },
1307
  {
1308
  "epoch": 1.0,
1309
- "grad_norm": 20.514487251091158,
1310
  "learning_rate": 0.0,
1311
- "logps/chosen": -55.3281135559082,
1312
- "logps/rejected": -54.42873764038086,
1313
- "loss": 0.4185,
1314
- "losses/dpo": 0.45331382751464844,
1315
- "losses/sft": 0.16170088946819305,
1316
- "losses/total": 0.45331382751464844,
1317
- "ref_logps/chosen": -42.832916259765625,
1318
- "ref_logps/rejected": -31.545093536376953,
1319
- "rewards/accuracies": 0.875,
1320
- "rewards/chosen": -1.2495195865631104,
1321
- "rewards/margins": 1.0388449430465698,
1322
- "rewards/rejected": -2.2883644104003906,
1323
  "step": 73
1324
  },
1325
  {
1326
  "epoch": 1.0,
1327
  "step": 73,
1328
  "total_flos": 0.0,
1329
- "train_loss": 0.4880054197082781,
1330
- "train_runtime": 1195.1879,
1331
- "train_samples_per_second": 7.883,
1332
  "train_steps_per_second": 0.061
1333
  }
1334
  ],
@@ -1336,7 +1336,7 @@
1336
  "max_steps": 73,
1337
  "num_input_tokens_seen": 0,
1338
  "num_train_epochs": 1,
1339
- "save_steps": 1000,
1340
  "total_flos": 0.0,
1341
  "train_batch_size": 1,
1342
  "trial_name": null,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "grad_norm": 31.510919537856985,
14
  "learning_rate": 6.25e-08,
15
+ "logps/chosen": -49.24467849731445,
16
+ "logps/rejected": -39.578514099121094,
17
+ "loss": 0.7008,
18
+ "losses/dpo": 0.6596390604972839,
19
+ "losses/sft": 0.17909570038318634,
20
+ "losses/total": 0.6596390604972839,
21
+ "ref_logps/chosen": -49.112640380859375,
22
+ "ref_logps/rejected": -39.590065002441406,
23
+ "rewards/accuracies": 0.4296875,
24
+ "rewards/chosen": -0.013204257935285568,
25
+ "rewards/margins": -0.01435948722064495,
26
+ "rewards/rejected": 0.001155228354036808,
27
  "step": 1
28
  },
29
  {
30
  "epoch": 0.03,
31
+ "grad_norm": 50.71439940508435,
32
  "learning_rate": 1.25e-07,
33
+ "logps/chosen": -53.134193420410156,
34
+ "logps/rejected": -47.03993225097656,
35
+ "loss": 0.6899,
36
+ "losses/dpo": 0.6913646459579468,
37
+ "losses/sft": 0.1710137277841568,
38
+ "losses/total": 0.6913646459579468,
39
+ "ref_logps/chosen": -53.1795768737793,
40
+ "ref_logps/rejected": -47.00617599487305,
41
+ "rewards/accuracies": 0.5390625,
42
+ "rewards/chosen": 0.004538209177553654,
43
+ "rewards/margins": 0.007913690991699696,
44
+ "rewards/rejected": -0.0033754808828234673,
45
  "step": 2
46
  },
47
  {
48
  "epoch": 0.04,
49
+ "grad_norm": 42.035900231625114,
50
  "learning_rate": 1.875e-07,
51
+ "logps/chosen": -51.67204666137695,
52
+ "logps/rejected": -42.43119812011719,
53
+ "loss": 0.6908,
54
+ "losses/dpo": 0.6752724647521973,
55
+ "losses/sft": 0.1361423283815384,
56
+ "losses/total": 0.6752724647521973,
57
+ "ref_logps/chosen": -51.709320068359375,
58
+ "ref_logps/rejected": -42.41019058227539,
59
+ "rewards/accuracies": 0.5390625,
60
+ "rewards/chosen": 0.0037269049789756536,
61
+ "rewards/margins": 0.00582832982763648,
62
+ "rewards/rejected": -0.002101423917338252,
63
  "step": 3
64
  },
65
  {
66
  "epoch": 0.05,
67
+ "grad_norm": 37.529489917527485,
68
  "learning_rate": 2.5e-07,
69
+ "logps/chosen": -49.27662658691406,
70
+ "logps/rejected": -42.45417404174805,
71
+ "loss": 0.6949,
72
+ "losses/dpo": 0.6901852488517761,
73
+ "losses/sft": 0.18635118007659912,
74
+ "losses/total": 0.6901852488517761,
75
+ "ref_logps/chosen": -49.26988220214844,
76
+ "ref_logps/rejected": -42.47301483154297,
77
  "rewards/accuracies": 0.5,
78
+ "rewards/chosen": -0.0006745874416083097,
79
+ "rewards/margins": -0.0025582504458725452,
80
+ "rewards/rejected": 0.0018836627714335918,
81
  "step": 4
82
  },
83
  {
84
  "epoch": 0.07,
85
+ "grad_norm": 33.99395990426859,
86
  "learning_rate": 3.1249999999999997e-07,
87
+ "logps/chosen": -48.10963439941406,
88
+ "logps/rejected": -40.53562927246094,
89
+ "loss": 0.693,
90
+ "losses/dpo": 0.6474289894104004,
91
+ "losses/sft": 0.1237928494811058,
92
+ "losses/total": 0.6474289894104004,
93
+ "ref_logps/chosen": -48.120365142822266,
94
+ "ref_logps/rejected": -40.53252410888672,
95
+ "rewards/accuracies": 0.4921875,
96
+ "rewards/chosen": 0.001073037856258452,
97
+ "rewards/margins": 0.001383528346195817,
98
+ "rewards/rejected": -0.00031049124663695693,
99
  "step": 5
100
  },
101
  {
102
  "epoch": 0.08,
103
+ "grad_norm": 33.57616672063881,
104
  "learning_rate": 3.75e-07,
105
+ "logps/chosen": -50.0238037109375,
106
+ "logps/rejected": -41.26359558105469,
107
+ "loss": 0.6976,
108
+ "losses/dpo": 0.6670458316802979,
109
+ "losses/sft": 0.15494827926158905,
110
+ "losses/total": 0.6670458316802979,
111
+ "ref_logps/chosen": -49.96715545654297,
112
+ "ref_logps/rejected": -41.28278732299805,
113
+ "rewards/accuracies": 0.5,
114
+ "rewards/chosen": -0.005665063392370939,
115
+ "rewards/margins": -0.007583992555737495,
116
+ "rewards/rejected": 0.0019189291633665562,
117
  "step": 6
118
  },
119
  {
120
  "epoch": 0.1,
121
+ "grad_norm": 32.82497056854739,
122
  "learning_rate": 4.375e-07,
123
+ "logps/chosen": -51.68867111206055,
124
+ "logps/rejected": -43.37660217285156,
125
+ "loss": 0.6921,
126
+ "losses/dpo": 0.6866487264633179,
127
+ "losses/sft": 0.13379694521427155,
128
+ "losses/total": 0.6866487264633179,
129
+ "ref_logps/chosen": -51.77002716064453,
130
+ "ref_logps/rejected": -43.42585754394531,
131
  "rewards/accuracies": 0.515625,
132
+ "rewards/chosen": 0.008136047050356865,
133
+ "rewards/margins": 0.0032104291021823883,
134
+ "rewards/rejected": 0.004925617948174477,
135
  "step": 7
136
  },
137
  {
138
  "epoch": 0.11,
139
+ "grad_norm": 38.79691230760384,
140
  "learning_rate": 5e-07,
141
+ "logps/chosen": -49.79241180419922,
142
+ "logps/rejected": -42.46006774902344,
143
+ "loss": 0.695,
144
+ "losses/dpo": 0.6527105569839478,
145
+ "losses/sft": 0.15517215430736542,
146
+ "losses/total": 0.6527105569839478,
147
+ "ref_logps/chosen": -49.76615905761719,
148
+ "ref_logps/rejected": -42.45863342285156,
149
+ "rewards/accuracies": 0.4765625,
150
+ "rewards/chosen": -0.0026245731860399246,
151
+ "rewards/margins": -0.002481258474290371,
152
+ "rewards/rejected": -0.00014331366401165724,
153
  "step": 8
154
  },
155
  {
156
  "epoch": 0.12,
157
+ "grad_norm": 226.56518786863586,
158
  "learning_rate": 4.997080567080816e-07,
159
+ "logps/chosen": -54.879547119140625,
160
+ "logps/rejected": -42.98774719238281,
161
+ "loss": 0.6853,
162
+ "losses/dpo": 0.6064831614494324,
163
+ "losses/sft": 0.2268030345439911,
164
+ "losses/total": 0.6064831614494324,
165
+ "ref_logps/chosen": -54.92854309082031,
166
+ "ref_logps/rejected": -42.86476135253906,
167
+ "rewards/accuracies": 0.5546875,
168
+ "rewards/chosen": 0.004900116473436356,
169
+ "rewards/margins": 0.017198245972394943,
170
+ "rewards/rejected": -0.012298129498958588,
171
  "step": 9
172
  },
173
  {
174
  "epoch": 0.14,
175
+ "grad_norm": 50.10962021770019,
176
  "learning_rate": 4.988329086794122e-07,
177
+ "logps/chosen": -52.08982467651367,
178
+ "logps/rejected": -43.10820388793945,
179
+ "loss": 0.6755,
180
+ "losses/dpo": 0.7082348465919495,
181
+ "losses/sft": 0.1494147777557373,
182
+ "losses/total": 0.7082348465919495,
183
+ "ref_logps/chosen": -52.199188232421875,
184
+ "ref_logps/rejected": -42.8469123840332,
185
+ "rewards/accuracies": 0.7109375,
186
+ "rewards/chosen": 0.010936222970485687,
187
+ "rewards/margins": 0.0370652973651886,
188
+ "rewards/rejected": -0.02612907439470291,
189
  "step": 10
190
  },
191
  {
192
  "epoch": 0.15,
193
+ "grad_norm": 50.96166024464587,
194
  "learning_rate": 4.973765998627628e-07,
195
+ "logps/chosen": -47.0179443359375,
196
+ "logps/rejected": -42.876380920410156,
197
+ "loss": 0.6764,
198
+ "losses/dpo": 0.7454954385757446,
199
+ "losses/sft": 0.11485376209020615,
200
+ "losses/total": 0.7454954385757446,
201
+ "ref_logps/chosen": -47.048309326171875,
202
+ "ref_logps/rejected": -42.54939651489258,
203
+ "rewards/accuracies": 0.671875,
204
+ "rewards/chosen": 0.003036007285118103,
205
+ "rewards/margins": 0.03573445603251457,
206
+ "rewards/rejected": -0.03269844502210617,
207
  "step": 11
208
  },
209
  {
210
  "epoch": 0.16,
211
+ "grad_norm": 43.95886877722781,
212
  "learning_rate": 4.953425315348533e-07,
213
+ "logps/chosen": -47.39234161376953,
214
+ "logps/rejected": -42.44198226928711,
215
+ "loss": 0.6802,
216
+ "losses/dpo": 0.6931471824645996,
217
+ "losses/sft": 0.1578434258699417,
218
+ "losses/total": 0.6931471824645996,
219
+ "ref_logps/chosen": -47.38432312011719,
220
+ "ref_logps/rejected": -42.15151596069336,
221
+ "rewards/accuracies": 0.5859375,
222
+ "rewards/chosen": -0.0008019641973078251,
223
+ "rewards/margins": 0.028244582936167717,
224
+ "rewards/rejected": -0.029046546667814255,
225
  "step": 12
226
  },
227
  {
228
  "epoch": 0.18,
229
+ "grad_norm": 35.92010395880078,
230
  "learning_rate": 4.92735454356513e-07,
231
+ "logps/chosen": -47.71138381958008,
232
+ "logps/rejected": -40.651554107666016,
233
+ "loss": 0.6671,
234
+ "losses/dpo": 0.6601188778877258,
235
+ "losses/sft": 0.12200119346380234,
236
+ "losses/total": 0.6601188778877258,
237
+ "ref_logps/chosen": -47.742637634277344,
238
+ "ref_logps/rejected": -40.12622833251953,
239
+ "rewards/accuracies": 0.6875,
240
+ "rewards/chosen": 0.003125616116449237,
241
+ "rewards/margins": 0.055658161640167236,
242
+ "rewards/rejected": -0.052532538771629333,
243
  "step": 13
244
  },
245
  {
246
  "epoch": 0.19,
247
+ "grad_norm": 38.42448037262164,
248
  "learning_rate": 4.895614572772916e-07,
249
+ "logps/chosen": -49.50225830078125,
250
+ "logps/rejected": -41.507652282714844,
251
+ "loss": 0.6657,
252
+ "losses/dpo": 0.7294200658798218,
253
+ "losses/sft": 0.15328460931777954,
254
+ "losses/total": 0.7294200658798218,
255
+ "ref_logps/chosen": -49.43043518066406,
256
+ "ref_logps/rejected": -40.846160888671875,
257
+ "rewards/accuracies": 0.6953125,
258
+ "rewards/chosen": -0.007182045839726925,
259
+ "rewards/margins": 0.058967407792806625,
260
+ "rewards/rejected": -0.06614945828914642,
261
  "step": 14
262
  },
263
  {
264
  "epoch": 0.21,
265
+ "grad_norm": 35.51673879153569,
266
  "learning_rate": 4.858279533144357e-07,
267
+ "logps/chosen": -50.58116149902344,
268
+ "logps/rejected": -44.15226745605469,
269
+ "loss": 0.6648,
270
+ "losses/dpo": 0.7195499539375305,
271
+ "losses/sft": 0.12852783501148224,
272
+ "losses/total": 0.7195499539375305,
273
+ "ref_logps/chosen": -50.35996627807617,
274
+ "ref_logps/rejected": -43.29161071777344,
275
+ "rewards/accuracies": 0.671875,
276
+ "rewards/chosen": -0.02211933210492134,
277
+ "rewards/margins": 0.06394598633050919,
278
+ "rewards/rejected": -0.08606532216072083,
279
  "step": 15
280
  },
281
  {
282
  "epoch": 0.22,
283
+ "grad_norm": 34.20405799205637,
284
  "learning_rate": 4.815436622394441e-07,
285
+ "logps/chosen": -53.629356384277344,
286
+ "logps/rejected": -45.9289665222168,
287
+ "loss": 0.6455,
288
+ "losses/dpo": 0.6931471824645996,
289
+ "losses/sft": 0.14978539943695068,
290
+ "losses/total": 0.6931471824645996,
291
+ "ref_logps/chosen": -53.546119689941406,
292
+ "ref_logps/rejected": -44.75611114501953,
293
+ "rewards/accuracies": 0.7109375,
294
+ "rewards/chosen": -0.00832361076027155,
295
+ "rewards/margins": 0.10896164178848267,
296
+ "rewards/rejected": -0.11728526651859283,
297
  "step": 16
298
  },
299
  {
300
  "epoch": 0.23,
301
+ "grad_norm": 51.47866505976329,
302
  "learning_rate": 4.767185902126363e-07,
303
+ "logps/chosen": -52.55246353149414,
304
+ "logps/rejected": -48.530906677246094,
305
+ "loss": 0.6445,
306
+ "losses/dpo": 0.6025716662406921,
307
+ "losses/sft": 0.17564229667186737,
308
+ "losses/total": 0.6025716662406921,
309
+ "ref_logps/chosen": -52.30644989013672,
310
+ "ref_logps/rejected": -47.189701080322266,
311
+ "rewards/accuracies": 0.6875,
312
+ "rewards/chosen": -0.02460121549665928,
313
+ "rewards/margins": 0.10951894521713257,
314
+ "rewards/rejected": -0.1341201663017273,
315
  "step": 17
316
  },
317
  {
318
  "epoch": 0.25,
319
+ "grad_norm": 42.49951077408584,
320
  "learning_rate": 4.7136400641330245e-07,
321
+ "logps/chosen": -52.193199157714844,
322
+ "logps/rejected": -43.489959716796875,
323
+ "loss": 0.6225,
324
+ "losses/dpo": 0.4434754252433777,
325
+ "losses/sft": 0.13364653289318085,
326
+ "losses/total": 0.4434754252433777,
327
+ "ref_logps/chosen": -51.82938003540039,
328
+ "ref_logps/rejected": -41.477630615234375,
329
+ "rewards/accuracies": 0.7265625,
330
+ "rewards/chosen": -0.03638196364045143,
331
+ "rewards/margins": 0.16485059261322021,
332
+ "rewards/rejected": -0.20123255252838135,
333
  "step": 18
334
  },
335
  {
336
  "epoch": 0.26,
337
+ "grad_norm": 34.30863345836372,
338
  "learning_rate": 4.6549241672001225e-07,
339
+ "logps/chosen": -49.611228942871094,
340
+ "logps/rejected": -46.756103515625,
341
+ "loss": 0.6071,
342
+ "losses/dpo": 0.6045699715614319,
343
+ "losses/sft": 0.13909928500652313,
344
+ "losses/total": 0.6045699715614319,
345
+ "ref_logps/chosen": -49.14532470703125,
346
+ "ref_logps/rejected": -44.29924774169922,
347
+ "rewards/accuracies": 0.7890625,
348
+ "rewards/chosen": -0.046590566635131836,
349
+ "rewards/margins": 0.19909515976905823,
350
+ "rewards/rejected": -0.24568572640419006,
351
  "step": 19
352
  },
353
  {
354
  "epoch": 0.27,
355
+ "grad_norm": 35.537459009303014,
356
  "learning_rate": 4.591175345025566e-07,
357
+ "logps/chosen": -50.094032287597656,
358
+ "logps/rejected": -46.121707916259766,
359
+ "loss": 0.6058,
360
+ "losses/dpo": 0.5356044173240662,
361
+ "losses/sft": 0.1512974053621292,
362
+ "losses/total": 0.5356044173240662,
363
+ "ref_logps/chosen": -49.32575607299805,
364
+ "ref_logps/rejected": -43.3276481628418,
365
+ "rewards/accuracies": 0.7578125,
366
+ "rewards/chosen": -0.07682754099369049,
367
+ "rewards/margins": 0.20257848501205444,
368
+ "rewards/rejected": -0.27940604090690613,
369
  "step": 20
370
  },
371
  {
372
  "epoch": 0.29,
373
+ "grad_norm": 82.79188209516856,
374
  "learning_rate": 4.5225424859373684e-07,
375
+ "logps/chosen": -53.73488235473633,
376
+ "logps/rejected": -48.036128997802734,
377
+ "loss": 0.5936,
378
+ "losses/dpo": 0.6363024711608887,
379
+ "losses/sft": 0.18370339274406433,
380
+ "losses/total": 0.6363024711608887,
381
+ "ref_logps/chosen": -52.835113525390625,
382
+ "ref_logps/rejected": -44.653133392333984,
383
+ "rewards/accuracies": 0.75,
384
+ "rewards/chosen": -0.08997656404972076,
385
+ "rewards/margins": 0.24832308292388916,
386
+ "rewards/rejected": -0.33829963207244873,
387
  "step": 21
388
  },
389
  {
390
  "epoch": 0.3,
391
+ "grad_norm": 35.68500953981497,
392
  "learning_rate": 4.4491858851580553e-07,
393
+ "logps/chosen": -52.28797149658203,
394
+ "logps/rejected": -48.59185791015625,
395
+ "loss": 0.5956,
396
+ "losses/dpo": 0.6950743198394775,
397
+ "losses/sft": 0.3061237335205078,
398
+ "losses/total": 0.6950743198394775,
399
+ "ref_logps/chosen": -50.30400085449219,
400
+ "ref_logps/rejected": -43.93888854980469,
401
+ "rewards/accuracies": 0.6953125,
402
+ "rewards/chosen": -0.19839699566364288,
403
+ "rewards/margins": 0.2669002413749695,
404
+ "rewards/rejected": -0.46529728174209595,
405
  "step": 22
406
  },
407
  {
408
  "epoch": 0.32,
409
+ "grad_norm": 51.2013048213769,
410
  "learning_rate": 4.3712768704277524e-07,
411
+ "logps/chosen": -50.60584259033203,
412
+ "logps/rejected": -49.11386489868164,
413
+ "loss": 0.5413,
414
+ "losses/dpo": 1.0638338327407837,
415
+ "losses/sft": 0.1654026359319687,
416
+ "losses/total": 1.0638338327407837,
417
+ "ref_logps/chosen": -48.48411560058594,
418
+ "ref_logps/rejected": -42.86372375488281,
419
+ "rewards/accuracies": 0.7890625,
420
+ "rewards/chosen": -0.2121725082397461,
421
+ "rewards/margins": 0.4128417372703552,
422
+ "rewards/rejected": -0.6250141859054565,
423
  "step": 23
424
  },
425
  {
426
  "epoch": 0.33,
427
+ "grad_norm": 26.152258770479694,
428
  "learning_rate": 4.2889974018603024e-07,
429
+ "logps/chosen": -53.90440368652344,
430
+ "logps/rejected": -50.59031295776367,
431
+ "loss": 0.5421,
432
+ "losses/dpo": 0.7262043952941895,
433
+ "losses/sft": 0.18153545260429382,
434
+ "losses/total": 0.7262043952941895,
435
+ "ref_logps/chosen": -51.315208435058594,
436
+ "ref_logps/rejected": -43.73783874511719,
437
+ "rewards/accuracies": 0.734375,
438
+ "rewards/chosen": -0.2589200735092163,
439
+ "rewards/margins": 0.4263269305229187,
440
+ "rewards/rejected": -0.6852469444274902,
441
  "step": 24
442
  },
443
  {
444
  "epoch": 0.34,
445
+ "grad_norm": 49.33026538547341,
446
  "learning_rate": 4.2025396469669926e-07,
447
+ "logps/chosen": -53.82769775390625,
448
+ "logps/rejected": -50.70613098144531,
449
+ "loss": 0.5517,
450
+ "losses/dpo": 0.41891151666641235,
451
+ "losses/sft": 0.1567077487707138,
452
+ "losses/total": 0.41891151666641235,
453
+ "ref_logps/chosen": -50.04922103881836,
454
+ "ref_logps/rejected": -42.47876739501953,
455
+ "rewards/accuracies": 0.7109375,
456
+ "rewards/chosen": -0.37784749269485474,
457
+ "rewards/margins": 0.4448884427547455,
458
+ "rewards/rejected": -0.8227359056472778,
459
  "step": 25
460
  },
461
  {
462
  "epoch": 0.36,
463
+ "grad_norm": 38.54013817065724,
464
  "learning_rate": 4.112105531840426e-07,
465
+ "logps/chosen": -56.83380889892578,
466
+ "logps/rejected": -55.79877471923828,
467
+ "loss": 0.5145,
468
+ "losses/dpo": 0.25203394889831543,
469
+ "losses/sft": 0.15007992088794708,
470
+ "losses/total": 0.25203394889831543,
471
+ "ref_logps/chosen": -51.92417907714844,
472
+ "ref_logps/rejected": -45.255348205566406,
473
+ "rewards/accuracies": 0.78125,
474
+ "rewards/chosen": -0.49096283316612244,
475
+ "rewards/margins": 0.5633795261383057,
476
+ "rewards/rejected": -1.0543423891067505,
477
  "step": 26
478
  },
479
  {
480
  "epoch": 0.37,
481
+ "grad_norm": 35.25775541354148,
482
  "learning_rate": 4.017906269546778e-07,
483
+ "logps/chosen": -57.45189666748047,
484
+ "logps/rejected": -53.96529769897461,
485
+ "loss": 0.5329,
486
+ "losses/dpo": 0.3053484857082367,
487
+ "losses/sft": 0.12551482021808624,
488
+ "losses/total": 0.3053484857082367,
489
+ "ref_logps/chosen": -52.189697265625,
490
+ "ref_logps/rejected": -44.003353118896484,
491
+ "rewards/accuracies": 0.765625,
492
+ "rewards/chosen": -0.5262198448181152,
493
+ "rewards/margins": 0.46997466683387756,
494
+ "rewards/rejected": -0.99619460105896,
495
  "step": 27
496
  },
497
  {
498
  "epoch": 0.38,
499
+ "grad_norm": 23.64859158816546,
500
  "learning_rate": 3.920161866827889e-07,
501
+ "logps/chosen": -57.529579162597656,
502
+ "logps/rejected": -56.24253845214844,
503
+ "loss": 0.4882,
504
+ "losses/dpo": 0.67308509349823,
505
+ "losses/sft": 0.12211936712265015,
506
+ "losses/total": 0.67308509349823,
507
+ "ref_logps/chosen": -51.290321350097656,
508
+ "ref_logps/rejected": -43.70161437988281,
509
  "rewards/accuracies": 0.84375,
510
+ "rewards/chosen": -0.6239261627197266,
511
+ "rewards/margins": 0.6301661133766174,
512
+ "rewards/rejected": -1.2540922164916992,
513
  "step": 28
514
  },
515
  {
516
  "epoch": 0.4,
517
+ "grad_norm": 34.075060841718205,
518
  "learning_rate": 3.8191006102653317e-07,
519
+ "logps/chosen": -54.39567947387695,
520
+ "logps/rejected": -53.69304656982422,
521
+ "loss": 0.515,
522
+ "losses/dpo": 0.3636208176612854,
523
+ "losses/sft": 0.15627621114253998,
524
+ "losses/total": 0.3636208176612854,
525
+ "ref_logps/chosen": -46.61610412597656,
526
+ "ref_logps/rejected": -40.169898986816406,
527
+ "rewards/accuracies": 0.7890625,
528
+ "rewards/chosen": -0.7779572010040283,
529
+ "rewards/margins": 0.5743571519851685,
530
+ "rewards/rejected": -1.3523142337799072,
531
  "step": 29
532
  },
533
  {
534
  "epoch": 0.41,
535
+ "grad_norm": 39.075544491508126,
536
  "learning_rate": 3.7149585331065145e-07,
537
+ "logps/chosen": -55.852046966552734,
538
+ "logps/rejected": -56.0112419128418,
539
+ "loss": 0.4987,
540
+ "losses/dpo": 0.9463083744049072,
541
+ "losses/sft": 0.1410434991121292,
542
+ "losses/total": 0.9463083744049072,
543
+ "ref_logps/chosen": -48.19220733642578,
544
+ "ref_logps/rejected": -42.009246826171875,
545
+ "rewards/accuracies": 0.78125,
546
+ "rewards/chosen": -0.765984058380127,
547
+ "rewards/margins": 0.634215772151947,
548
+ "rewards/rejected": -1.4001998901367188,
549
  "step": 30
550
  },
551
  {
552
  "epoch": 0.42,
553
+ "grad_norm": 27.598875905640913,
554
  "learning_rate": 3.6079788639981036e-07,
555
+ "logps/chosen": -60.89286804199219,
556
+ "logps/rejected": -58.91151428222656,
557
+ "loss": 0.477,
558
+ "losses/dpo": 0.5266605615615845,
559
+ "losses/sft": 0.14658664166927338,
560
+ "losses/total": 0.5266605615615845,
561
+ "ref_logps/chosen": -51.61863327026367,
562
+ "ref_logps/rejected": -42.28534698486328,
563
+ "rewards/accuracies": 0.84375,
564
+ "rewards/chosen": -0.9274235963821411,
565
+ "rewards/margins": 0.7351932525634766,
566
+ "rewards/rejected": -1.6626169681549072,
567
  "step": 31
568
  },
569
  {
570
  "epoch": 0.44,
571
+ "grad_norm": 28.756497878941868,
572
  "learning_rate": 3.498411458914238e-07,
573
+ "logps/chosen": -60.40979766845703,
574
+ "logps/rejected": -60.34819793701172,
575
+ "loss": 0.4847,
576
+ "losses/dpo": 0.4884541630744934,
577
+ "losses/sft": 0.19494454562664032,
578
+ "losses/total": 0.4884541630744934,
579
+ "ref_logps/chosen": -52.04460906982422,
580
+ "ref_logps/rejected": -44.97260665893555,
581
+ "rewards/accuracies": 0.8125,
582
+ "rewards/chosen": -0.8365185260772705,
583
+ "rewards/margins": 0.701040506362915,
584
+ "rewards/rejected": -1.537559151649475,
585
  "step": 32
586
  },
587
  {
588
  "epoch": 0.45,
589
+ "grad_norm": 28.434577214059978,
590
  "learning_rate": 3.3865122176063385e-07,
591
+ "logps/chosen": -57.96968460083008,
592
+ "logps/rejected": -56.83502960205078,
593
+ "loss": 0.4677,
594
+ "losses/dpo": 0.5375417470932007,
595
+ "losses/sft": 0.148137629032135,
596
+ "losses/total": 0.5375417470932007,
597
+ "ref_logps/chosen": -47.51142120361328,
598
+ "ref_logps/rejected": -38.619873046875,
599
+ "rewards/accuracies": 0.796875,
600
+ "rewards/chosen": -1.0458261966705322,
601
+ "rewards/margins": 0.7756892442703247,
602
+ "rewards/rejected": -1.8215153217315674,
603
  "step": 33
604
  },
605
  {
606
  "epoch": 0.47,
607
+ "grad_norm": 29.49134519608509,
608
  "learning_rate": 3.272542485937368e-07,
609
+ "logps/chosen": -62.14280700683594,
610
+ "logps/rejected": -61.54853820800781,
611
+ "loss": 0.4961,
612
+ "losses/dpo": 0.7278311848640442,
613
+ "losses/sft": 0.1993408501148224,
614
+ "losses/total": 0.7278311848640442,
615
+ "ref_logps/chosen": -51.82103729248047,
616
+ "ref_logps/rejected": -44.025291442871094,
617
+ "rewards/accuracies": 0.75,
618
+ "rewards/chosen": -1.0321770906448364,
619
+ "rewards/margins": 0.7201482653617859,
620
+ "rewards/rejected": -1.7523252964019775,
621
  "step": 34
622
  },
623
  {
624
  "epoch": 0.48,
625
+ "grad_norm": 26.468466564434074,
626
  "learning_rate": 3.1567684454964674e-07,
627
+ "logps/chosen": -56.682411193847656,
628
+ "logps/rejected": -58.0302734375,
629
+ "loss": 0.4396,
630
+ "losses/dpo": 0.5649744868278503,
631
+ "losses/sft": 0.129312202334404,
632
+ "losses/total": 0.5649744868278503,
633
+ "ref_logps/chosen": -47.770896911621094,
634
+ "ref_logps/rejected": -40.46588897705078,
635
+ "rewards/accuracies": 0.78125,
636
+ "rewards/chosen": -0.8911517858505249,
637
+ "rewards/margins": 0.8652870059013367,
638
+ "rewards/rejected": -1.7564387321472168,
639
  "step": 35
640
  },
641
  {
642
  "epoch": 0.49,
643
+ "grad_norm": 25.595938355510107,
644
  "learning_rate": 3.0394604919195157e-07,
645
+ "logps/chosen": -61.79121398925781,
646
+ "logps/rejected": -62.89993667602539,
647
+ "loss": 0.4745,
648
+ "losses/dpo": 0.2738940417766571,
649
+ "losses/sft": 0.15838046371936798,
650
+ "losses/total": 0.2738940417766571,
651
+ "ref_logps/chosen": -49.21980285644531,
652
+ "ref_logps/rejected": -42.15373992919922,
653
+ "rewards/accuracies": 0.7578125,
654
+ "rewards/chosen": -1.257140874862671,
655
+ "rewards/margins": 0.8174787759780884,
656
+ "rewards/rejected": -2.074619770050049,
657
  "step": 36
658
  },
659
  {
660
  "epoch": 0.51,
661
+ "grad_norm": 24.43093018755541,
662
  "learning_rate": 2.920892603367596e-07,
663
+ "logps/chosen": -60.27898406982422,
664
+ "logps/rejected": -61.00114822387695,
665
+ "loss": 0.4791,
666
+ "losses/dpo": 0.5097879767417908,
667
+ "losses/sft": 0.17959001660346985,
668
+ "losses/total": 0.5097879767417908,
669
+ "ref_logps/chosen": -49.49871826171875,
670
+ "ref_logps/rejected": -42.78453063964844,
671
+ "rewards/accuracies": 0.7890625,
672
+ "rewards/chosen": -1.078026533126831,
673
+ "rewards/margins": 0.7436352372169495,
674
+ "rewards/rejected": -1.8216617107391357,
675
  "step": 37
676
  },
677
  {
678
  "epoch": 0.52,
679
+ "grad_norm": 27.08759050745191,
680
  "learning_rate": 2.801341700638307e-07,
681
+ "logps/chosen": -62.44047164916992,
682
+ "logps/rejected": -62.224884033203125,
683
+ "loss": 0.4767,
684
+ "losses/dpo": 0.37803658843040466,
685
+ "losses/sft": 0.18652431666851044,
686
+ "losses/total": 0.37803658843040466,
687
+ "ref_logps/chosen": -50.557559967041016,
688
+ "ref_logps/rejected": -41.98173522949219,
689
+ "rewards/accuracies": 0.78125,
690
+ "rewards/chosen": -1.1882915496826172,
691
+ "rewards/margins": 0.8360234498977661,
692
+ "rewards/rejected": -2.0243148803710938,
693
  "step": 38
694
  },
695
  {
696
  "epoch": 0.53,
697
+ "grad_norm": 33.43704122805882,
698
  "learning_rate": 2.681087000404406e-07,
699
+ "logps/chosen": -65.03423309326172,
700
+ "logps/rejected": -65.04768371582031,
701
+ "loss": 0.5084,
702
+ "losses/dpo": 0.7027170658111572,
703
+ "losses/sft": 0.12946006655693054,
704
+ "losses/total": 0.7027170658111572,
705
+ "ref_logps/chosen": -51.817020416259766,
706
+ "ref_logps/rejected": -44.10376739501953,
707
+ "rewards/accuracies": 0.71875,
708
+ "rewards/chosen": -1.3217215538024902,
709
+ "rewards/margins": 0.7726694345474243,
710
+ "rewards/rejected": -2.094391107559204,
711
  "step": 39
712
  },
713
  {
714
  "epoch": 0.55,
715
+ "grad_norm": 129.45570096870608,
716
  "learning_rate": 2.5604093630903305e-07,
717
+ "logps/chosen": -59.94195556640625,
718
+ "logps/rejected": -63.19525909423828,
719
+ "loss": 0.4527,
720
+ "losses/dpo": 0.7064068913459778,
721
+ "losses/sft": 0.13724827766418457,
722
+ "losses/total": 0.7064068913459778,
723
+ "ref_logps/chosen": -47.65526580810547,
724
+ "ref_logps/rejected": -42.33349609375,
725
+ "rewards/accuracies": 0.796875,
726
+ "rewards/chosen": -1.2286689281463623,
727
+ "rewards/margins": 0.8575077056884766,
728
+ "rewards/rejected": -2.086176872253418,
729
  "step": 40
730
  },
731
  {
732
  "epoch": 0.56,
733
+ "grad_norm": 24.012368893482737,
734
  "learning_rate": 2.43959063690967e-07,
735
+ "logps/chosen": -61.86077880859375,
736
+ "logps/rejected": -66.92259979248047,
737
+ "loss": 0.4553,
738
+ "losses/dpo": 0.3154614567756653,
739
+ "losses/sft": 0.16294528543949127,
740
+ "losses/total": 0.3154614567756653,
741
+ "ref_logps/chosen": -48.4053955078125,
742
+ "ref_logps/rejected": -44.16739273071289,
743
+ "rewards/accuracies": 0.796875,
744
+ "rewards/chosen": -1.3455381393432617,
745
+ "rewards/margins": 0.9299829006195068,
746
+ "rewards/rejected": -2.2755210399627686,
747
  "step": 41
748
  },
749
  {
750
  "epoch": 0.58,
751
+ "grad_norm": 102.16273879292123,
752
  "learning_rate": 2.3189129995955942e-07,
753
+ "logps/chosen": -64.28011322021484,
754
+ "logps/rejected": -64.86600494384766,
755
+ "loss": 0.4642,
756
+ "losses/dpo": 0.4425658583641052,
757
+ "losses/sft": 0.11157938092947006,
758
+ "losses/total": 0.4425658583641052,
759
+ "ref_logps/chosen": -53.148860931396484,
760
+ "ref_logps/rejected": -45.69209289550781,
761
+ "rewards/accuracies": 0.8125,
762
+ "rewards/chosen": -1.113125205039978,
763
+ "rewards/margins": 0.8042662143707275,
764
+ "rewards/rejected": -1.9173914194107056,
765
  "step": 42
766
  },
767
  {
768
  "epoch": 0.59,
769
+ "grad_norm": 27.45890404880792,
770
  "learning_rate": 2.1986582993616925e-07,
771
+ "logps/chosen": -64.73760223388672,
772
+ "logps/rejected": -63.79928207397461,
773
+ "loss": 0.4607,
774
+ "losses/dpo": 0.08850211650133133,
775
+ "losses/sft": 0.1770356148481369,
776
+ "losses/total": 0.08850211650133133,
777
+ "ref_logps/chosen": -52.7093391418457,
778
+ "ref_logps/rejected": -42.18277359008789,
779
+ "rewards/accuracies": 0.78125,
780
+ "rewards/chosen": -1.2028264999389648,
781
+ "rewards/margins": 0.9588243961334229,
782
+ "rewards/rejected": -2.1616508960723877,
783
  "step": 43
784
  },
785
  {
786
  "epoch": 0.6,
787
+ "grad_norm": 35.977504794321945,
788
  "learning_rate": 2.0791073966324034e-07,
789
+ "logps/chosen": -65.26310729980469,
790
+ "logps/rejected": -66.70973205566406,
791
+ "loss": 0.4531,
792
+ "losses/dpo": 0.7271055579185486,
793
+ "losses/sft": 0.1396849900484085,
794
+ "losses/total": 0.7271055579185486,
795
+ "ref_logps/chosen": -53.6609001159668,
796
+ "ref_logps/rejected": -46.01152801513672,
797
+ "rewards/accuracies": 0.8515625,
798
+ "rewards/chosen": -1.1602208614349365,
799
+ "rewards/margins": 0.909599781036377,
800
+ "rewards/rejected": -2.0698208808898926,
801
  "step": 44
802
  },
803
  {
804
  "epoch": 0.62,
805
+ "grad_norm": 30.8655762680209,
806
  "learning_rate": 1.960539508080485e-07,
807
+ "logps/chosen": -62.69697952270508,
808
+ "logps/rejected": -64.87743377685547,
809
+ "loss": 0.4482,
810
+ "losses/dpo": 0.7160241603851318,
811
+ "losses/sft": 0.16245654225349426,
812
+ "losses/total": 0.7160241603851318,
813
+ "ref_logps/chosen": -49.983802795410156,
814
+ "ref_logps/rejected": -43.17237091064453,
815
+ "rewards/accuracies": 0.796875,
816
+ "rewards/chosen": -1.2713178396224976,
817
+ "rewards/margins": 0.8991883993148804,
818
+ "rewards/rejected": -2.170506238937378,
819
  "step": 45
820
  },
821
  {
822
  "epoch": 0.63,
823
+ "grad_norm": 42.203067350847334,
824
  "learning_rate": 1.8432315545035327e-07,
825
+ "logps/chosen": -65.14309692382812,
826
+ "logps/rejected": -64.79584503173828,
827
+ "loss": 0.4638,
828
+ "losses/dpo": 0.647499144077301,
829
+ "losses/sft": 0.17309847474098206,
830
+ "losses/total": 0.647499144077301,
831
+ "ref_logps/chosen": -51.99322509765625,
832
+ "ref_logps/rejected": -42.78443145751953,
833
+ "rewards/accuracies": 0.828125,
834
+ "rewards/chosen": -1.3149867057800293,
835
+ "rewards/margins": 0.8861545324325562,
836
+ "rewards/rejected": -2.201141357421875,
837
  "step": 46
838
  },
839
  {
840
  "epoch": 0.64,
841
+ "grad_norm": 26.70333995315738,
842
  "learning_rate": 1.7274575140626315e-07,
843
+ "logps/chosen": -64.62788391113281,
844
+ "logps/rejected": -66.00929260253906,
845
+ "loss": 0.3755,
846
+ "losses/dpo": 0.7572497129440308,
847
+ "losses/sft": 0.16420619189739227,
848
+ "losses/total": 0.7572497129440308,
849
+ "ref_logps/chosen": -53.670265197753906,
850
+ "ref_logps/rejected": -43.973365783691406,
851
+ "rewards/accuracies": 0.8671875,
852
+ "rewards/chosen": -1.0957624912261963,
853
+ "rewards/margins": 1.1078301668167114,
854
+ "rewards/rejected": -2.203592538833618,
855
  "step": 47
856
  },
857
  {
858
  "epoch": 0.66,
859
+ "grad_norm": 30.007336064883145,
860
  "learning_rate": 1.6134877823936607e-07,
861
+ "logps/chosen": -60.07398223876953,
862
+ "logps/rejected": -63.938331604003906,
863
+ "loss": 0.4222,
864
+ "losses/dpo": 0.4377414286136627,
865
+ "losses/sft": 0.11289513111114502,
866
+ "losses/total": 0.4377414286136627,
867
+ "ref_logps/chosen": -48.226722717285156,
868
+ "ref_logps/rejected": -42.14054870605469,
869
+ "rewards/accuracies": 0.765625,
870
+ "rewards/chosen": -1.1847256422042847,
871
+ "rewards/margins": 0.995052695274353,
872
+ "rewards/rejected": -2.1797783374786377,
873
  "step": 48
874
  },
875
  {
876
  "epoch": 0.67,
877
+ "grad_norm": 33.18269229637362,
878
  "learning_rate": 1.5015885410857614e-07,
879
+ "logps/chosen": -63.292476654052734,
880
+ "logps/rejected": -66.98123168945312,
881
+ "loss": 0.4246,
882
+ "losses/dpo": 0.3581680655479431,
883
+ "losses/sft": 0.17332157492637634,
884
+ "losses/total": 0.3581680655479431,
885
+ "ref_logps/chosen": -50.916664123535156,
886
+ "ref_logps/rejected": -45.03084182739258,
887
+ "rewards/accuracies": 0.84375,
888
+ "rewards/chosen": -1.2375812530517578,
889
+ "rewards/margins": 0.9574571847915649,
890
+ "rewards/rejected": -2.195038318634033,
891
  "step": 49
892
  },
893
  {
894
  "epoch": 0.68,
895
+ "grad_norm": 32.535671402481796,
896
  "learning_rate": 1.392021136001897e-07,
897
+ "logps/chosen": -61.653507232666016,
898
+ "logps/rejected": -66.69647979736328,
899
+ "loss": 0.3784,
900
+ "losses/dpo": 0.14524562656879425,
901
+ "losses/sft": 0.1722523272037506,
902
+ "losses/total": 0.14524562656879425,
903
+ "ref_logps/chosen": -51.076236724853516,
904
+ "ref_logps/rejected": -44.246944427490234,
905
+ "rewards/accuracies": 0.8046875,
906
+ "rewards/chosen": -1.0577270984649658,
907
+ "rewards/margins": 1.1872267723083496,
908
+ "rewards/rejected": -2.2449538707733154,
909
  "step": 50
910
  },
911
  {
912
  "epoch": 0.7,
913
+ "grad_norm": 25.494002931399418,
914
  "learning_rate": 1.2850414668934847e-07,
915
+ "logps/chosen": -63.76778030395508,
916
+ "logps/rejected": -66.36669921875,
917
+ "loss": 0.4239,
918
+ "losses/dpo": 0.5600922107696533,
919
+ "losses/sft": 0.13496388494968414,
920
+ "losses/total": 0.5600922107696533,
921
+ "ref_logps/chosen": -49.28026580810547,
922
+ "ref_logps/rejected": -41.501556396484375,
923
+ "rewards/accuracies": 0.84375,
924
+ "rewards/chosen": -1.4487512111663818,
925
+ "rewards/margins": 1.0377633571624756,
926
+ "rewards/rejected": -2.4865145683288574,
927
  "step": 51
928
  },
929
  {
930
  "epoch": 0.71,
931
+ "grad_norm": 24.257143077705184,
932
  "learning_rate": 1.1808993897346678e-07,
933
+ "logps/chosen": -60.98112487792969,
934
+ "logps/rejected": -66.6905517578125,
935
+ "loss": 0.4463,
936
+ "losses/dpo": 0.6072797775268555,
937
+ "losses/sft": 0.14662015438079834,
938
+ "losses/total": 0.6072797775268555,
939
+ "ref_logps/chosen": -48.362701416015625,
940
+ "ref_logps/rejected": -44.13787841796875,
941
+ "rewards/accuracies": 0.7734375,
942
+ "rewards/chosen": -1.2618422508239746,
943
+ "rewards/margins": 0.9934254288673401,
944
+ "rewards/rejected": -2.25526762008667,
945
  "step": 52
946
  },
947
  {
948
  "epoch": 0.73,
949
+ "grad_norm": 28.99400991059714,
950
  "learning_rate": 1.0798381331721107e-07,
951
+ "logps/chosen": -61.79584884643555,
952
+ "logps/rejected": -64.41800689697266,
953
+ "loss": 0.4073,
954
+ "losses/dpo": 0.17208106815814972,
955
+ "losses/sft": 0.14093177020549774,
956
+ "losses/total": 0.17208106815814972,
957
+ "ref_logps/chosen": -49.47767639160156,
958
+ "ref_logps/rejected": -41.640419006347656,
959
+ "rewards/accuracies": 0.828125,
960
+ "rewards/chosen": -1.2318172454833984,
961
+ "rewards/margins": 1.0459411144256592,
962
+ "rewards/rejected": -2.2777583599090576,
963
  "step": 53
964
  },
965
  {
966
  "epoch": 0.74,
967
+ "grad_norm": 32.15451904929163,
968
  "learning_rate": 9.82093730453222e-08,
969
+ "logps/chosen": -62.59501647949219,
970
+ "logps/rejected": -63.971126556396484,
971
+ "loss": 0.5058,
972
+ "losses/dpo": 0.3638117015361786,
973
+ "losses/sft": 0.14573359489440918,
974
+ "losses/total": 0.3638117015361786,
975
+ "ref_logps/chosen": -51.01821517944336,
976
+ "ref_logps/rejected": -43.79859161376953,
977
+ "rewards/accuracies": 0.7421875,
978
+ "rewards/chosen": -1.157679796218872,
979
+ "rewards/margins": 0.8595736026763916,
980
+ "rewards/rejected": -2.0172533988952637,
981
  "step": 54
982
  },
983
  {
984
  "epoch": 0.75,
985
+ "grad_norm": 29.71547196346147,
986
  "learning_rate": 8.87894468159574e-08,
987
+ "logps/chosen": -63.908302307128906,
988
+ "logps/rejected": -65.4727554321289,
989
+ "loss": 0.4043,
990
+ "losses/dpo": 1.4689037799835205,
991
+ "losses/sft": 0.19870857894420624,
992
+ "losses/total": 1.4689037799835205,
993
+ "ref_logps/chosen": -49.52888870239258,
994
+ "ref_logps/rejected": -39.608917236328125,
995
+ "rewards/accuracies": 0.8203125,
996
+ "rewards/chosen": -1.437941312789917,
997
+ "rewards/margins": 1.148442029953003,
998
+ "rewards/rejected": -2.586383581161499,
999
  "step": 55
1000
  },
1001
  {
1002
  "epoch": 0.77,
1003
+ "grad_norm": 40.070212554436814,
1004
  "learning_rate": 7.974603530330067e-08,
1005
+ "logps/chosen": -63.663780212402344,
1006
+ "logps/rejected": -65.04582977294922,
1007
+ "loss": 0.4551,
1008
+ "losses/dpo": 0.7427124977111816,
1009
+ "losses/sft": 0.1415950208902359,
1010
+ "losses/total": 0.7427124977111816,
1011
+ "ref_logps/chosen": -52.447853088378906,
1012
+ "ref_logps/rejected": -44.403289794921875,
1013
+ "rewards/accuracies": 0.78125,
1014
+ "rewards/chosen": -1.1215929985046387,
1015
+ "rewards/margins": 0.9426611661911011,
1016
+ "rewards/rejected": -2.0642542839050293,
1017
  "step": 56
1018
  },
1019
  {
1020
  "epoch": 0.78,
1021
+ "grad_norm": 50.773025060467745,
1022
  "learning_rate": 7.110025981396975e-08,
1023
+ "logps/chosen": -64.57611083984375,
1024
+ "logps/rejected": -67.85655212402344,
1025
+ "loss": 0.4677,
1026
+ "losses/dpo": 0.7480742335319519,
1027
+ "losses/sft": 0.1996467560529709,
1028
+ "losses/total": 0.7480742335319519,
1029
+ "ref_logps/chosen": -49.66741943359375,
1030
+ "ref_logps/rejected": -43.6092414855957,
1031
+ "rewards/accuracies": 0.796875,
1032
+ "rewards/chosen": -1.4908695220947266,
1033
+ "rewards/margins": 0.9338614344596863,
1034
+ "rewards/rejected": -2.4247307777404785,
1035
  "step": 57
1036
  },
1037
  {
1038
  "epoch": 0.79,
1039
+ "grad_norm": 151.98374859438076,
1040
  "learning_rate": 6.28723129572247e-08,
1041
+ "logps/chosen": -63.19204330444336,
1042
+ "logps/rejected": -67.11865234375,
1043
+ "loss": 0.4125,
1044
+ "losses/dpo": 0.191142737865448,
1045
+ "losses/sft": 0.1258397251367569,
1046
+ "losses/total": 0.191142737865448,
1047
+ "ref_logps/chosen": -49.3205680847168,
1048
+ "ref_logps/rejected": -42.308128356933594,
1049
+ "rewards/accuracies": 0.859375,
1050
+ "rewards/chosen": -1.387147307395935,
1051
+ "rewards/margins": 1.0939054489135742,
1052
+ "rewards/rejected": -2.481052875518799,
1053
  "step": 58
1054
  },
1055
  {
1056
  "epoch": 0.81,
1057
+ "grad_norm": 68.1040384525775,
1058
  "learning_rate": 5.508141148419443e-08,
1059
+ "logps/chosen": -66.73199462890625,
1060
+ "logps/rejected": -66.16500854492188,
1061
+ "loss": 0.416,
1062
+ "losses/dpo": 0.4806699752807617,
1063
+ "losses/sft": 0.17691649496555328,
1064
+ "losses/total": 0.4806699752807617,
1065
+ "ref_logps/chosen": -52.95691680908203,
1066
+ "ref_logps/rejected": -42.309608459472656,
1067
+ "rewards/accuracies": 0.828125,
1068
+ "rewards/chosen": -1.3775070905685425,
1069
+ "rewards/margins": 1.0080331563949585,
1070
+ "rewards/rejected": -2.38554048538208,
1071
  "step": 59
1072
  },
1073
  {
1074
  "epoch": 0.82,
1075
+ "grad_norm": 40.930870088616274,
1076
  "learning_rate": 4.774575140626316e-08,
1077
+ "logps/chosen": -65.13594055175781,
1078
+ "logps/rejected": -67.62340545654297,
1079
+ "loss": 0.4084,
1080
+ "losses/dpo": 0.3780416250228882,
1081
+ "losses/sft": 0.070173479616642,
1082
+ "losses/total": 0.3780416250228882,
1083
+ "ref_logps/chosen": -50.85914611816406,
1084
+ "ref_logps/rejected": -41.8093147277832,
1085
+ "rewards/accuracies": 0.828125,
1086
+ "rewards/chosen": -1.4276790618896484,
1087
+ "rewards/margins": 1.153730034828186,
1088
+ "rewards/rejected": -2.581409215927124,
1089
  "step": 60
1090
  },
1091
  {
1092
  "epoch": 0.84,
1093
+ "grad_norm": 30.260562657822298,
1094
  "learning_rate": 4.0882465497443313e-08,
1095
+ "logps/chosen": -63.07252883911133,
1096
+ "logps/rejected": -69.03398132324219,
1097
+ "loss": 0.4369,
1098
+ "losses/dpo": 0.36678266525268555,
1099
+ "losses/sft": 0.12201398611068726,
1100
+ "losses/total": 0.36678266525268555,
1101
+ "ref_logps/chosen": -49.61937713623047,
1102
+ "ref_logps/rejected": -44.632652282714844,
1103
+ "rewards/accuracies": 0.78125,
1104
+ "rewards/chosen": -1.3453152179718018,
1105
+ "rewards/margins": 1.0948173999786377,
1106
+ "rewards/rejected": -2.4401326179504395,
1107
  "step": 61
1108
  },
1109
  {
1110
  "epoch": 0.85,
1111
+ "grad_norm": 24.402238672879346,
1112
  "learning_rate": 3.450758327998768e-08,
1113
+ "logps/chosen": -63.65578842163086,
1114
+ "logps/rejected": -67.62722778320312,
1115
+ "loss": 0.3813,
1116
+ "losses/dpo": 0.7685420513153076,
1117
+ "losses/sft": 0.19448544085025787,
1118
+ "losses/total": 0.7685420513153076,
1119
+ "ref_logps/chosen": -51.653221130371094,
1120
+ "ref_logps/rejected": -43.50916290283203,
1121
+ "rewards/accuracies": 0.8359375,
1122
+ "rewards/chosen": -1.2002568244934082,
1123
+ "rewards/margins": 1.2115496397018433,
1124
+ "rewards/rejected": -2.411806583404541,
1125
  "step": 62
1126
  },
1127
  {
1128
  "epoch": 0.86,
1129
+ "grad_norm": 24.77840549005978,
1130
  "learning_rate": 2.863599358669755e-08,
1131
+ "logps/chosen": -65.08932495117188,
1132
+ "logps/rejected": -66.61253356933594,
1133
+ "loss": 0.4291,
1134
+ "losses/dpo": 1.248164176940918,
1135
+ "losses/sft": 0.15433447062969208,
1136
+ "losses/total": 1.248164176940918,
1137
+ "ref_logps/chosen": -52.82990646362305,
1138
+ "ref_logps/rejected": -44.81475830078125,
1139
+ "rewards/accuracies": 0.8046875,
1140
+ "rewards/chosen": -1.2259418964385986,
1141
+ "rewards/margins": 0.9538350105285645,
1142
+ "rewards/rejected": -2.179776906967163,
1143
  "step": 63
1144
  },
1145
  {
1146
  "epoch": 0.88,
1147
+ "grad_norm": 37.580661698465605,
1148
  "learning_rate": 2.3281409787363648e-08,
1149
+ "logps/chosen": -61.57237243652344,
1150
+ "logps/rejected": -65.20365142822266,
1151
+ "loss": 0.4063,
1152
+ "losses/dpo": 0.09678905457258224,
1153
+ "losses/sft": 0.15513929724693298,
1154
+ "losses/total": 0.09678905457258224,
1155
+ "ref_logps/chosen": -48.36906814575195,
1156
+ "ref_logps/rejected": -41.755638122558594,
1157
+ "rewards/accuracies": 0.84375,
1158
+ "rewards/chosen": -1.3203303813934326,
1159
+ "rewards/margins": 1.0244702100753784,
1160
+ "rewards/rejected": -2.3448004722595215,
1161
  "step": 64
1162
  },
1163
  {
1164
  "epoch": 0.89,
1165
+ "grad_norm": 52.86928073665909,
1166
  "learning_rate": 1.845633776055591e-08,
1167
+ "logps/chosen": -63.794559478759766,
1168
+ "logps/rejected": -68.59591674804688,
1169
+ "loss": 0.4236,
1170
+ "losses/dpo": 0.5401971936225891,
1171
+ "losses/sft": 0.1492851972579956,
1172
+ "losses/total": 0.5401971936225891,
1173
+ "ref_logps/chosen": -51.490814208984375,
1174
+ "ref_logps/rejected": -46.14380645751953,
1175
+ "rewards/accuracies": 0.8125,
1176
+ "rewards/chosen": -1.2303749322891235,
1177
+ "rewards/margins": 1.0148357152938843,
1178
+ "rewards/rejected": -2.245210647583008,
1179
  "step": 65
1180
  },
1181
  {
1182
  "epoch": 0.9,
1183
+ "grad_norm": 26.186736903994795,
1184
  "learning_rate": 1.4172046685564209e-08,
1185
+ "logps/chosen": -63.65757751464844,
1186
+ "logps/rejected": -66.78644561767578,
1187
+ "loss": 0.4453,
1188
+ "losses/dpo": 0.3751629889011383,
1189
+ "losses/sft": 0.20083920657634735,
1190
+ "losses/total": 0.3751629889011383,
1191
+ "ref_logps/chosen": -50.14271545410156,
1192
+ "ref_logps/rejected": -43.08906555175781,
1193
+ "rewards/accuracies": 0.78125,
1194
+ "rewards/chosen": -1.3514864444732666,
1195
+ "rewards/margins": 1.0182515382766724,
1196
+ "rewards/rejected": -2.3697378635406494,
1197
  "step": 66
1198
  },
1199
  {
1200
  "epoch": 0.92,
1201
+ "grad_norm": 32.512692665801914,
1202
  "learning_rate": 1.0438542722708444e-08,
1203
+ "logps/chosen": -61.425235748291016,
1204
+ "logps/rejected": -62.99015426635742,
1205
+ "loss": 0.4034,
1206
+ "losses/dpo": 0.4745466709136963,
1207
+ "losses/sft": 0.15217916667461395,
1208
+ "losses/total": 0.4745466709136963,
1209
+ "ref_logps/chosen": -49.22258758544922,
1210
+ "ref_logps/rejected": -39.8096923828125,
1211
+ "rewards/accuracies": 0.84375,
1212
+ "rewards/chosen": -1.2202647924423218,
1213
+ "rewards/margins": 1.0977811813354492,
1214
+ "rewards/rejected": -2.3180460929870605,
1215
  "step": 67
1216
  },
1217
  {
1218
  "epoch": 0.93,
1219
+ "grad_norm": 28.142561520937964,
1220
  "learning_rate": 7.2645456434869965e-09,
1221
+ "logps/chosen": -60.124473571777344,
1222
+ "logps/rejected": -64.52591705322266,
1223
+ "loss": 0.4073,
1224
+ "losses/dpo": 0.5824078917503357,
1225
+ "losses/sft": 0.15901198983192444,
1226
+ "losses/total": 0.5824078917503357,
1227
+ "ref_logps/chosen": -49.08730697631836,
1228
+ "ref_logps/rejected": -42.66169357299805,
1229
+ "rewards/accuracies": 0.8203125,
1230
+ "rewards/chosen": -1.1037168502807617,
1231
+ "rewards/margins": 1.0827052593231201,
1232
+ "rewards/rejected": -2.1864218711853027,
1233
  "step": 68
1234
  },
1235
  {
1236
  "epoch": 0.95,
1237
+ "grad_norm": 26.50196447879399,
1238
  "learning_rate": 4.657468465146641e-09,
1239
+ "logps/chosen": -63.6911735534668,
1240
+ "logps/rejected": -68.59868621826172,
1241
+ "loss": 0.3973,
1242
+ "losses/dpo": 0.40227654576301575,
1243
+ "losses/sft": 0.18853811919689178,
1244
+ "losses/total": 0.40227654576301575,
1245
+ "ref_logps/chosen": -51.581443786621094,
1246
+ "ref_logps/rejected": -44.77821350097656,
1247
+ "rewards/accuracies": 0.8515625,
1248
+ "rewards/chosen": -1.210972785949707,
1249
+ "rewards/margins": 1.1710745096206665,
1250
+ "rewards/rejected": -2.382047176361084,
1251
  "step": 69
1252
  },
1253
  {
1254
  "epoch": 0.96,
1255
+ "grad_norm": 56.71695513519511,
1256
  "learning_rate": 2.6234001372372193e-09,
1257
+ "logps/chosen": -63.47015380859375,
1258
+ "logps/rejected": -65.77841186523438,
1259
+ "loss": 0.4462,
1260
+ "losses/dpo": 0.6931471824645996,
1261
+ "losses/sft": 0.16308167576789856,
1262
+ "losses/total": 0.6931471824645996,
1263
+ "ref_logps/chosen": -50.47125244140625,
1264
+ "ref_logps/rejected": -43.315406799316406,
1265
+ "rewards/accuracies": 0.8203125,
1266
+ "rewards/chosen": -1.2998902797698975,
1267
+ "rewards/margins": 0.9464105367660522,
1268
+ "rewards/rejected": -2.24630069732666,
1269
  "step": 70
1270
  },
1271
  {
1272
  "epoch": 0.97,
1273
+ "grad_norm": 23.674451477206517,
1274
  "learning_rate": 1.167091320587843e-09,
1275
+ "logps/chosen": -62.78862762451172,
1276
+ "logps/rejected": -67.10327911376953,
1277
+ "loss": 0.4313,
1278
+ "losses/dpo": 0.12619969248771667,
1279
+ "losses/sft": 0.16695602238178253,
1280
+ "losses/total": 0.12619969248771667,
1281
+ "ref_logps/chosen": -50.83470153808594,
1282
+ "ref_logps/rejected": -45.14189147949219,
1283
+ "rewards/accuracies": 0.8046875,
1284
+ "rewards/chosen": -1.1953924894332886,
1285
+ "rewards/margins": 1.000746250152588,
1286
+ "rewards/rejected": -2.196138858795166,
1287
  "step": 71
1288
  },
1289
  {
1290
  "epoch": 0.99,
1291
+ "grad_norm": 26.52528545158649,
1292
  "learning_rate": 2.9194329191833953e-10,
1293
+ "logps/chosen": -63.891353607177734,
1294
+ "logps/rejected": -66.84733581542969,
1295
+ "loss": 0.4241,
1296
+ "losses/dpo": 0.2748945951461792,
1297
+ "losses/sft": 0.1392793506383896,
1298
+ "losses/total": 0.2748945951461792,
1299
+ "ref_logps/chosen": -51.043392181396484,
1300
+ "ref_logps/rejected": -43.61123275756836,
1301
+ "rewards/accuracies": 0.7734375,
1302
+ "rewards/chosen": -1.2847963571548462,
1303
+ "rewards/margins": 1.0388145446777344,
1304
+ "rewards/rejected": -2.323610782623291,
1305
  "step": 72
1306
  },
1307
  {
1308
  "epoch": 1.0,
1309
+ "grad_norm": 32.933679564175364,
1310
  "learning_rate": 0.0,
1311
+ "logps/chosen": -64.90662384033203,
1312
+ "logps/rejected": -65.81568145751953,
1313
+ "loss": 0.3989,
1314
+ "losses/dpo": 0.6386691927909851,
1315
+ "losses/sft": 0.17253142595291138,
1316
+ "losses/total": 0.6386691927909851,
1317
+ "ref_logps/chosen": -53.89427185058594,
1318
+ "ref_logps/rejected": -43.687034606933594,
1319
+ "rewards/accuracies": 0.8515625,
1320
+ "rewards/chosen": -1.1012351512908936,
1321
+ "rewards/margins": 1.1116297245025635,
1322
+ "rewards/rejected": -2.212864875793457,
1323
  "step": 73
1324
  },
1325
  {
1326
  "epoch": 1.0,
1327
  "step": 73,
1328
  "total_flos": 0.0,
1329
+ "train_loss": 0.5157321454727486,
1330
+ "train_runtime": 1200.6075,
1331
+ "train_samples_per_second": 7.848,
1332
  "train_steps_per_second": 0.061
1333
  }
1334
  ],
 
1336
  "max_steps": 73,
1337
  "num_input_tokens_seen": 0,
1338
  "num_train_epochs": 1,
1339
+ "save_steps": 20000,
1340
  "total_flos": 0.0,
1341
  "train_batch_size": 1,
1342
  "trial_name": null,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2be64989f4fc627b4d3149eec1d20f187177a9a5a04e580e903943c9a25a406
3
- size 8056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b583f6e9c5632b56ce3bbd7c1ce1021b5fd031b803ec813530cd0ad4a3507e0
3
+ size 8184