Ligeng-Zhu committed (verified)
Commit 692c299 · Parent(s): 1d148e2

Model save
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ligeng-zhu/openr1/runs/vdr82ewr)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ligeng-zhu/openr1/runs/cy51opva)
 
 
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
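For context, GRPO fine-tuning of this kind is typically driven through TRL's `GRPOTrainer`. The sketch below is a minimal, hypothetical recipe: the base model, dataset, and reward function are placeholders (the actual run appears to combine accuracy and format rewards, per the `rewards/*` keys in `trainer_state.json` below), not the exact configuration behind this checkpoint.

```python
# Minimal GRPO sketch with TRL's GRPOTrainer. Everything below is illustrative:
# the base model, dataset, and reward are placeholders, not this run's recipe.
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

def toy_reward(completions, **kwargs):
    # Toy reward that prefers shorter completions; the real run scores
    # answer accuracy and output format instead.
    return [-abs(200 - len(c)) / 200.0 for c in completions]

dataset = load_dataset("trl-lib/tldr", split="train")  # placeholder dataset

training_args = GRPOConfig(output_dir="grpo-run", logging_steps=5, learning_rate=3e-6)
trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # placeholder base model
    reward_funcs=toy_reward,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```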
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.0011118109807661124,
-    "train_runtime": 12310.5993,
+    "train_loss": -0.00014772719968559928,
+    "train_runtime": 12261.9552,
     "train_samples": 7500,
-    "train_samples_per_second": 0.609,
+    "train_samples_per_second": 0.612,
     "train_steps_per_second": 0.005
 }
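As a quick sanity check on these metrics (assuming the usual Trainer convention that throughput is samples divided by wall-clock runtime):

```python
# Assumed relationship: train_samples_per_second ≈ train_samples / train_runtime.
print(7500 / 12261.9552)  # ≈ 0.6117 -> reported as 0.612 (new run)
print(7500 / 12310.5993)  # ≈ 0.6093 -> reported as 0.609 (previous run)
```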
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cbf1ead14d78145d667212b8ac3487eb9fa5ff73e4cb6a8358492679d825929a
+oid sha256:3fc6f55571bcb95d956c7cfd35906a43b7b4fd1867862496319e593bc781eb20
 size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1bdbbe9882c2c5a4f7d2fc7e6f2d285786c7cb3a31c4176c4e64b38432afa4b8
+oid sha256:52e0ae9a5a539b827261fdcb81a2b56701a1707f6960c3c3e145bae58042fc7c
 size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1c94aaa97e6c96bf27f564944be524064e26fd0489128f5ec7beef219eba1ef4
+oid sha256:9e494758064a66d38bdfc7c5c6f3113a52a908d21dec04a41cf7abbe30735b23
 size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:296842bdd672617796c8ed82d8055cf9eb9448c72044d2a7e49b6617863ec8f4
+oid sha256:bfb2e4c79185c8542c9c7453a93b773a3a530bcfd583a45661655853795ecbc6
 size 1089994880
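The safetensors shards are Git LFS pointers, so the `oid sha256:` values above are plain SHA-256 digests of the shard contents. A small sketch for confirming a downloaded shard matches the new revision; the local path is a placeholder for wherever the file was pulled to:

```python
# Recompute a shard's SHA-256 and compare it with the oid in its LFS pointer.
# The local path is a placeholder; fetch the actual file first (e.g. git lfs pull).
import hashlib

def sha256_of(path, chunk_size=1 << 20):
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "3fc6f55571bcb95d956c7cfd35906a43b7b4fd1867862496319e593bc781eb20"
assert sha256_of("model-00001-of-00004.safetensors") == expected
```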
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.0011118109807661124,
-    "train_runtime": 12310.5993,
+    "train_loss": -0.00014772719968559928,
+    "train_runtime": 12261.9552,
     "train_samples": 7500,
-    "train_samples_per_second": 0.609,
+    "train_samples_per_second": 0.612,
     "train_steps_per_second": 0.005
 }
trainer_state.json CHANGED
@@ -9,161 +9,161 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "completion_length": 612.6890861511231,
+      "completion_length": 614.0942222595215,
       "epoch": 0.08528784648187633,
-      "grad_norm": 499.6991882324219,
-      "kl": 0.08026816844940185,
+      "grad_norm": 39.48369216918945,
+      "kl": 0.002408742904663086,
       "learning_rate": 2.5e-06,
-      "loss": 0.0032,
-      "reward": 0.634151815623045,
-      "reward_std": 0.3310549106448889,
-      "rewards/accuracy_reward": 0.6339286021888256,
-      "rewards/format_reward": 0.00022321429569274187,
+      "loss": 0.0001,
+      "reward": 0.6303571693599224,
+      "reward_std": 0.3278109859675169,
+      "rewards/accuracy_reward": 0.6299107424914837,
+      "rewards/format_reward": 0.00044642859138548373,
       "step": 5
     },
     {
-      "completion_length": 622.5966781616211,
+      "completion_length": 618.1955627441406,
       "epoch": 0.17057569296375266,
-      "grad_norm": 1.3974753618240356,
-      "kl": 0.03140735626220703,
+      "grad_norm": 0.26418083906173706,
+      "kl": 0.0033366203308105467,
       "learning_rate": 2.956412726139078e-06,
-      "loss": 0.0013,
-      "reward": 0.6975446715950966,
-      "reward_std": 0.29021961949765684,
-      "rewards/accuracy_reward": 0.6968750283122063,
-      "rewards/format_reward": 0.0006696428870782256,
+      "loss": 0.0001,
+      "reward": 0.6823660999536514,
+      "reward_std": 0.2967432256788015,
+      "rewards/accuracy_reward": 0.6821428865194321,
+      "rewards/format_reward": 0.00022321429569274187,
       "step": 10
     },
     {
-      "completion_length": 620.2056060791016,
+      "completion_length": 615.4616325378418,
       "epoch": 0.255863539445629,
-      "grad_norm": 6.291464328765869,
-      "kl": 0.005489921569824219,
+      "grad_norm": 0.29952046275138855,
+      "kl": 0.004865837097167969,
       "learning_rate": 2.7836719084521715e-06,
       "loss": 0.0002,
-      "reward": 0.7453125342726707,
-      "reward_std": 0.23736504185944796,
-      "rewards/accuracy_reward": 0.7453125342726707,
+      "reward": 0.7544643208384514,
+      "reward_std": 0.22131893783807755,
+      "rewards/accuracy_reward": 0.7544643208384514,
       "rewards/format_reward": 0.0,
       "step": 15
     },
     {
-      "completion_length": 604.9377510070801,
+      "completion_length": 594.9377471923829,
       "epoch": 0.3411513859275053,
-      "grad_norm": 0.1435093730688095,
-      "kl": 0.0034137725830078124,
+      "grad_norm": 0.26904380321502686,
+      "kl": 0.0036653518676757813,
       "learning_rate": 2.4946839873611927e-06,
       "loss": 0.0001,
-      "reward": 0.7725446790456771,
-      "reward_std": 0.204142040386796,
-      "rewards/accuracy_reward": 0.7725446790456771,
+      "reward": 0.7558036029338837,
+      "reward_std": 0.20889290906488894,
+      "rewards/accuracy_reward": 0.7558036029338837,
       "rewards/format_reward": 0.0,
       "step": 20
     },
     {
-      "completion_length": 610.2647560119628,
+      "completion_length": 605.4913223266601,
      "epoch": 0.42643923240938164,
-      "grad_norm": 6.51456880569458,
-      "kl": 0.008059120178222657,
+      "grad_norm": 0.12021861970424652,
+      "kl": 0.003992271423339844,
       "learning_rate": 2.1156192081791355e-06,
-      "loss": 0.0003,
-      "reward": 0.7638393253087997,
-      "reward_std": 0.1955505058169365,
-      "rewards/accuracy_reward": 0.7638393253087997,
+      "loss": 0.0002,
+      "reward": 0.7569196805357933,
+      "reward_std": 0.19887337032705546,
+      "rewards/accuracy_reward": 0.7569196805357933,
       "rewards/format_reward": 0.0,
       "step": 25
     },
     {
-      "completion_length": 610.1558280944824,
+      "completion_length": 609.8823944091797,
       "epoch": 0.511727078891258,
-      "grad_norm": 0.07075676321983337,
-      "kl": 0.0029842376708984373,
+      "grad_norm": 0.09571157395839691,
+      "kl": 0.0034656524658203125,
       "learning_rate": 1.6808050203829845e-06,
       "loss": 0.0001,
-      "reward": 0.7504464626312256,
-      "reward_std": 0.18517553191632033,
-      "rewards/accuracy_reward": 0.7504464626312256,
+      "reward": 0.7562500357627868,
+      "reward_std": 0.1893269034102559,
+      "rewards/accuracy_reward": 0.7562500357627868,
       "rewards/format_reward": 0.0,
       "step": 30
     },
     {
-      "completion_length": 599.5821647644043,
+      "completion_length": 594.9317237854004,
       "epoch": 0.5970149253731343,
-      "grad_norm": 0.11963143199682236,
-      "kl": 0.002887916564941406,
+      "grad_norm": 0.17406129837036133,
+      "kl": 0.01944389343261719,
       "learning_rate": 1.2296174432791415e-06,
-      "loss": 0.0001,
-      "reward": 0.7441964641213417,
-      "reward_std": 0.1850216191262007,
-      "rewards/accuracy_reward": 0.7441964641213417,
+      "loss": 0.0008,
+      "reward": 0.7457589656114578,
+      "reward_std": 0.1772445771843195,
+      "rewards/accuracy_reward": 0.7457589656114578,
       "rewards/format_reward": 0.0,
       "step": 35
     },
     {
-      "completion_length": 585.4386421203614,
+      "completion_length": 587.7542655944824,
       "epoch": 0.6823027718550106,
-      "grad_norm": 0.12116171419620514,
-      "kl": 0.003470611572265625,
+      "grad_norm": 0.3893604278564453,
+      "kl": 0.0033367156982421877,
       "learning_rate": 8.029152419343472e-07,
       "loss": 0.0001,
-      "reward": 0.7640625357627868,
-      "reward_std": 0.17753117084503173,
-      "rewards/accuracy_reward": 0.7640625357627868,
+      "reward": 0.7665178909897804,
+      "reward_std": 0.17746288534253835,
+      "rewards/accuracy_reward": 0.7665178909897804,
       "rewards/format_reward": 0.0,
       "step": 40
     },
     {
-      "completion_length": 604.5248001098632,
+      "completion_length": 608.1580612182618,
       "epoch": 0.767590618336887,
-      "grad_norm": 0.18742159008979797,
-      "kl": 0.0029314041137695314,
+      "grad_norm": 0.13640980422496796,
+      "kl": 0.0030500411987304686,
       "learning_rate": 4.3933982822017883e-07,
       "loss": 0.0001,
-      "reward": 0.752678605914116,
-      "reward_std": 0.18469745945185423,
-      "rewards/accuracy_reward": 0.752678605914116,
+      "reward": 0.7549107506871223,
+      "reward_std": 0.18457430368289351,
+      "rewards/accuracy_reward": 0.7549107506871223,
       "rewards/format_reward": 0.0,
       "step": 45
     },
     {
-      "completion_length": 605.77993850708,
+      "completion_length": 602.2951156616211,
       "epoch": 0.8528784648187633,
-      "grad_norm": 0.10905129462480545,
-      "kl": 0.002948570251464844,
+      "grad_norm": 0.217549666762352,
+      "kl": 0.003106689453125,
       "learning_rate": 1.718159615201853e-07,
       "loss": 0.0001,
-      "reward": 0.7511161014437675,
-      "reward_std": 0.18544870764017105,
-      "rewards/accuracy_reward": 0.7511161014437675,
+      "reward": 0.7488839671015739,
+      "reward_std": 0.18157500196248294,
+      "rewards/accuracy_reward": 0.7488839671015739,
       "rewards/format_reward": 0.0,
       "step": 50
     },
     {
-      "completion_length": 598.5527030944825,
+      "completion_length": 599.9350708007812,
       "epoch": 0.9381663113006397,
-      "grad_norm": 0.11327774077653885,
-      "kl": 0.012871551513671874,
+      "grad_norm": 0.23885449767112732,
+      "kl": 0.0033361434936523436,
       "learning_rate": 2.4570139579284723e-08,
-      "loss": 0.0005,
-      "reward": 0.7783482506871223,
-      "reward_std": 0.19646856598556042,
-      "rewards/accuracy_reward": 0.7783482506871223,
+      "loss": 0.0001,
+      "reward": 0.776116107404232,
+      "reward_std": 0.19774878825992345,
+      "rewards/accuracy_reward": 0.776116107404232,
       "rewards/format_reward": 0.0,
       "step": 55
     },
     {
-      "completion_length": 588.8050842285156,
+      "completion_length": 583.1644630432129,
       "epoch": 0.9893390191897654,
-      "kl": 0.0034154256184895835,
-      "reward": 0.7719494377573332,
-      "reward_std": 0.1814334474814435,
-      "rewards/accuracy_reward": 0.7719494377573332,
+      "kl": 0.0035022099812825522,
+      "reward": 0.7760417039195696,
+      "reward_std": 0.16163485000530878,
+      "rewards/accuracy_reward": 0.7760417039195696,
       "rewards/format_reward": 0.0,
       "step": 58,
       "total_flos": 0.0,
-      "train_loss": 0.0011118109807661124,
-      "train_runtime": 12310.5993,
-      "train_samples_per_second": 0.609,
+      "train_loss": -0.00014772719968559928,
+      "train_runtime": 12261.9552,
+      "train_samples_per_second": 0.612,
       "train_steps_per_second": 0.005
     }
   ],
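Since `log_history` is plain JSON, the per-step metrics above can be pulled out programmatically. A small sketch; the path assumes a local clone of this repository:

```python
# Print reward and KL per logged step from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "reward" in entry:  # every logged step in this run carries reward/kl
        print(f'step {entry["step"]:>3}: reward={entry["reward"]:.4f}, kl={entry["kl"]:.5f}')
```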
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a2b3dab96973f117202445b041ef4453ffe0e5c64232ffa41708ca0867bd02e
+oid sha256:ad87e5a39e36d78f4464aed52557072e88434fa4b15d87a8e6183f26c1addbbc
 size 7544
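`training_args.bin` is the pickled training-arguments object that `transformers.Trainer` writes with `torch.save`, so it can be inspected locally. A hypothetical sketch; the exact class and attributes depend on the transformers/TRL versions used for the run:

```python
# Load the saved training arguments; weights_only=False is needed on recent
# PyTorch versions because this file is an arbitrary pickled object, not tensors.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)                              # e.g. a GRPO/TrainingArguments config class
print(args.learning_rate, args.per_device_train_batch_size)
```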