cwaud committed
Commit a5c0395
1 Parent(s): 61b3a89

Training in progress, step 35, checkpoint

last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
- "down_proj",
- "q_proj",
  "up_proj",
  "gate_proj",
  "o_proj",
  "k_proj",
- "v_proj"
+ "v_proj",
+ "down_proj",
+ "q_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e83f643c5a9ab1cfeea362359ac6a525e971df1e3983567f604cb597906c1754
+ oid sha256:ba4278cca7255c9ddaed50ffb682eee63c6d1a424636d63a067aa111a5265137
  size 45118424
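adapter_model.safetensors keeps the same byte size (45118424) while its content hash changes: the LoRA weights were overwritten by the new checkpoint. A small sketch for inspecting the updated adapter weights locally, assuming the file has been fetched through Git LFS and the safetensors package is installed:

```python
# Minimal sketch, assuming the checkpoint was pulled via git-lfs and
# `safetensors` is installed; the path mirrors the repo layout above.
from safetensors.torch import load_file

adapter_weights = load_file("last-checkpoint/adapter_model.safetensors")
print(f"{len(adapter_weights)} tensors")
for name, tensor in list(adapter_weights.items())[:4]:
    print(name, tuple(tensor.shape))  # LoRA A/B matrices for the target modules
```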
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:432df0593e0c92f4f103861a5875d814800331c5703ce69afd65fbbe0b1d89bd
+ oid sha256:b272124104fc85c680ad1fb7b8107ff9da2af68217c1025da825177c6ddaa9d7
  size 23159290
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0f5adca46d5768b2528d11214bf50c169d41422785e8b729921c5399a7fed5df
+ oid sha256:d12df93e39f0749d364ca6814b0bfacff84881459c9967e57146b8e6e2eb11d7
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bb578e75c11a81e85dda67a691f96ba4793a02960f1409fd3e1511aac873491a
+ oid sha256:cb862dbd6d6a9c776ecb1c74360f055af4c6285cf030af3020acd182d15c5ef8
  size 1064
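adapter_model.safetensors, optimizer.pt, rng_state.pth, and scheduler.pt are stored via Git LFS, so each diff only touches the three-line pointer: the spec version, the SHA-256 oid of the content, and the size in bytes. The hashes change with every checkpoint while the sizes stay constant. A sketch for checking a fetched file against its pointer; the oid and size below are the post-commit values for adapter_model.safetensors shown earlier:

```python
# Minimal sketch: check a downloaded LFS object against its pointer file.
# The oid/size used in the example call are the post-commit values for
# last-checkpoint/adapter_model.safetensors.
import hashlib
import os

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True if the file's size and SHA-256 digest match the LFS pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

print(verify_lfs_object(
    "last-checkpoint/adapter_model.safetensors",
    "ba4278cca7255c9ddaed50ffb682eee63c6d1a424636d63a067aa111a5265137",
    45118424,
))
```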
last-checkpoint/trainer_state.json CHANGED
@@ -1,118 +1,277 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.02386634844868735,
- "eval_steps": 3,
- "global_step": 10,
+ "epoch": 0.08353221957040573,
+ "eval_steps": 25,
+ "global_step": 35,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
  "epoch": 0.002386634844868735,
- "grad_norm": 1.0580705404281616,
- "learning_rate": 2e-05,
+ "grad_norm": 1.0874019861221313,
+ "learning_rate": 1e-05,
  "loss": 1.0747,
  "step": 1
  },
  {
  "epoch": 0.002386634844868735,
  "eval_loss": 1.0123302936553955,
- "eval_runtime": 27.4242,
- "eval_samples_per_second": 6.454,
- "eval_steps_per_second": 3.245,
+ "eval_runtime": 27.9511,
+ "eval_samples_per_second": 6.332,
+ "eval_steps_per_second": 3.184,
  "step": 1
  },
  {
  "epoch": 0.00477326968973747,
- "grad_norm": 0.7454516291618347,
- "learning_rate": 4e-05,
+ "grad_norm": 0.748105525970459,
+ "learning_rate": 2e-05,
  "loss": 0.7862,
  "step": 2
  },
  {
  "epoch": 0.007159904534606206,
- "grad_norm": 0.6492141485214233,
- "learning_rate": 6e-05,
- "loss": 0.8378,
- "step": 3
- },
- {
- "epoch": 0.007159904534606206,
- "eval_loss": 1.0094887018203735,
- "eval_runtime": 27.4338,
- "eval_samples_per_second": 6.452,
- "eval_steps_per_second": 3.244,
+ "grad_norm": 0.6411569118499756,
+ "learning_rate": 3e-05,
+ "loss": 0.8414,
  "step": 3
  },
  {
  "epoch": 0.00954653937947494,
- "grad_norm": 0.5810346007347107,
- "learning_rate": 8e-05,
- "loss": 0.7888,
+ "grad_norm": 0.5958303213119507,
+ "learning_rate": 4e-05,
+ "loss": 0.7947,
  "step": 4
  },
  {
  "epoch": 0.011933174224343675,
- "grad_norm": 0.6462458372116089,
- "learning_rate": 0.0001,
- "loss": 0.7506,
+ "grad_norm": 0.6730267405509949,
+ "learning_rate": 5e-05,
+ "loss": 0.76,
  "step": 5
  },
  {
  "epoch": 0.014319809069212411,
- "grad_norm": 1.0376014709472656,
- "learning_rate": 0.00012,
- "loss": 0.8917,
- "step": 6
- },
- {
- "epoch": 0.014319809069212411,
- "eval_loss": 0.9692097306251526,
- "eval_runtime": 27.4583,
- "eval_samples_per_second": 6.446,
- "eval_steps_per_second": 3.241,
+ "grad_norm": 1.1415467262268066,
+ "learning_rate": 6e-05,
+ "loss": 0.9148,
  "step": 6
  },
  {
  "epoch": 0.016706443914081145,
- "grad_norm": 0.5357475280761719,
- "learning_rate": 0.00014,
- "loss": 0.7908,
+ "grad_norm": 0.598889946937561,
+ "learning_rate": 7e-05,
+ "loss": 0.8089,
  "step": 7
  },
  {
  "epoch": 0.01909307875894988,
- "grad_norm": 0.8745409846305847,
- "learning_rate": 0.00016,
- "loss": 1.1039,
+ "grad_norm": 1.1247347593307495,
+ "learning_rate": 8e-05,
+ "loss": 1.1527,
  "step": 8
  },
  {
  "epoch": 0.021479713603818614,
- "grad_norm": 0.4521085321903229,
- "learning_rate": 0.00018,
- "loss": 0.6845,
- "step": 9
- },
- {
- "epoch": 0.021479713603818614,
- "eval_loss": 0.9187015295028687,
- "eval_runtime": 27.4853,
- "eval_samples_per_second": 6.44,
- "eval_steps_per_second": 3.238,
+ "grad_norm": 0.5591338872909546,
+ "learning_rate": 9e-05,
+ "loss": 0.711,
  "step": 9
  },
  {
  "epoch": 0.02386634844868735,
- "grad_norm": 0.5216010808944702,
- "learning_rate": 0.0002,
- "loss": 0.7988,
+ "grad_norm": 0.6751295328140259,
+ "learning_rate": 0.0001,
+ "loss": 0.8312,
  "step": 10
+ },
+ {
+ "epoch": 0.026252983293556086,
+ "grad_norm": 0.7573989033699036,
+ "learning_rate": 9.99695413509548e-05,
+ "loss": 1.1809,
+ "step": 11
+ },
+ {
+ "epoch": 0.028639618138424822,
+ "grad_norm": 0.7879427671432495,
+ "learning_rate": 9.987820251299122e-05,
+ "loss": 1.0638,
+ "step": 12
+ },
+ {
+ "epoch": 0.031026252983293555,
+ "grad_norm": 0.6389354467391968,
+ "learning_rate": 9.972609476841367e-05,
+ "loss": 0.8152,
+ "step": 13
+ },
+ {
+ "epoch": 0.03341288782816229,
+ "grad_norm": 0.479705810546875,
+ "learning_rate": 9.951340343707852e-05,
+ "loss": 0.6732,
+ "step": 14
+ },
+ {
+ "epoch": 0.03579952267303103,
+ "grad_norm": 0.5184096097946167,
+ "learning_rate": 9.924038765061042e-05,
+ "loss": 1.0368,
+ "step": 15
+ },
+ {
+ "epoch": 0.03818615751789976,
+ "grad_norm": 0.4770895838737488,
+ "learning_rate": 9.890738003669029e-05,
+ "loss": 0.8109,
+ "step": 16
+ },
+ {
+ "epoch": 0.0405727923627685,
+ "grad_norm": 0.5425803065299988,
+ "learning_rate": 9.851478631379982e-05,
+ "loss": 0.88,
+ "step": 17
+ },
+ {
+ "epoch": 0.04295942720763723,
+ "grad_norm": 0.5870300531387329,
+ "learning_rate": 9.806308479691595e-05,
+ "loss": 0.906,
+ "step": 18
+ },
+ {
+ "epoch": 0.045346062052505964,
+ "grad_norm": 0.5628218054771423,
+ "learning_rate": 9.755282581475769e-05,
+ "loss": 0.6757,
+ "step": 19
+ },
+ {
+ "epoch": 0.0477326968973747,
+ "grad_norm": 0.5832799673080444,
+ "learning_rate": 9.698463103929542e-05,
+ "loss": 0.8508,
+ "step": 20
+ },
+ {
+ "epoch": 0.050119331742243436,
+ "grad_norm": 0.5866987109184265,
+ "learning_rate": 9.635919272833938e-05,
+ "loss": 0.872,
+ "step": 21
+ },
+ {
+ "epoch": 0.05250596658711217,
+ "grad_norm": 0.604156494140625,
+ "learning_rate": 9.567727288213005e-05,
+ "loss": 0.6487,
+ "step": 22
+ },
+ {
+ "epoch": 0.05489260143198091,
+ "grad_norm": 0.5820953249931335,
+ "learning_rate": 9.493970231495835e-05,
+ "loss": 0.8686,
+ "step": 23
+ },
+ {
+ "epoch": 0.057279236276849645,
+ "grad_norm": 0.6393556594848633,
+ "learning_rate": 9.414737964294636e-05,
+ "loss": 0.781,
+ "step": 24
+ },
+ {
+ "epoch": 0.059665871121718374,
+ "grad_norm": 0.6221751570701599,
+ "learning_rate": 9.330127018922194e-05,
+ "loss": 0.8032,
+ "step": 25
+ },
+ {
+ "epoch": 0.059665871121718374,
+ "eval_loss": 0.8647359013557434,
+ "eval_runtime": 27.9002,
+ "eval_samples_per_second": 6.344,
+ "eval_steps_per_second": 3.19,
+ "step": 25
+ },
+ {
+ "epoch": 0.06205250596658711,
+ "grad_norm": 0.7337064146995544,
+ "learning_rate": 9.24024048078213e-05,
+ "loss": 0.9311,
+ "step": 26
+ },
+ {
+ "epoch": 0.06443914081145585,
+ "grad_norm": 0.4561159908771515,
+ "learning_rate": 9.145187862775209e-05,
+ "loss": 1.0223,
+ "step": 27
+ },
+ {
+ "epoch": 0.06682577565632458,
+ "grad_norm": 0.42628154158592224,
+ "learning_rate": 9.045084971874738e-05,
+ "loss": 0.6906,
+ "step": 28
+ },
+ {
+ "epoch": 0.06921241050119331,
+ "grad_norm": 0.6458117365837097,
+ "learning_rate": 8.940053768033609e-05,
+ "loss": 0.8969,
+ "step": 29
+ },
+ {
+ "epoch": 0.07159904534606205,
+ "grad_norm": 0.6923385262489319,
+ "learning_rate": 8.83022221559489e-05,
+ "loss": 1.0892,
+ "step": 30
+ },
+ {
+ "epoch": 0.07398568019093078,
+ "grad_norm": 0.5989975929260254,
+ "learning_rate": 8.715724127386972e-05,
+ "loss": 0.6379,
+ "step": 31
+ },
+ {
+ "epoch": 0.07637231503579953,
+ "grad_norm": 0.4596601724624634,
+ "learning_rate": 8.596699001693255e-05,
+ "loss": 0.4745,
+ "step": 32
+ },
+ {
+ "epoch": 0.07875894988066826,
+ "grad_norm": 0.5209793448448181,
+ "learning_rate": 8.473291852294987e-05,
+ "loss": 0.8658,
+ "step": 33
+ },
+ {
+ "epoch": 0.081145584725537,
+ "grad_norm": 0.5878239870071411,
+ "learning_rate": 8.345653031794292e-05,
+ "loss": 0.9537,
+ "step": 34
+ },
+ {
+ "epoch": 0.08353221957040573,
+ "grad_norm": 0.5229330062866211,
+ "learning_rate": 8.213938048432697e-05,
+ "loss": 0.6711,
+ "step": 35
  }
  ],
  "logging_steps": 1,
- "max_steps": 10,
+ "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 5,
@@ -123,12 +282,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": true
+ "should_training_stop": false
  },
  "attributes": {}
  }
  },
- "total_flos": 1935445139128320.0,
+ "total_flos": 6774057986949120.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bf1f8d66d1d9f86da26660a52881d9c4934db98a9e88bd9f6f9ed55aa44562b5
- size 6776
+ oid sha256:14dcbf12068500e1009d5791cfb900ba9d71f3d534a83271e8a5061a0b3cbb92
+ size 6712
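training_args.bin also shrinks slightly (6776 to 6712 bytes), consistent with the changed run configuration seen in trainer_state.json (max_steps 10 to 100, eval_steps 3 to 25). The .pt/.pth/.bin files in this checkpoint are ordinary torch.save artifacts written by the Hugging Face Trainer; a sketch for inspecting them locally, assuming the checkpoint has been fetched through Git LFS:

```python
# Minimal sketch: inspect the pickled checkpoint artifacts, assuming they were
# written by transformers.Trainer and fetched locally via git-lfs.
# weights_only=False is required because these files contain arbitrary pickled
# objects; that is acceptable here only because the checkpoint is trusted.
import torch

ckpt = "last-checkpoint"

optimizer_state = torch.load(f"{ckpt}/optimizer.pt", map_location="cpu", weights_only=False)
scheduler_state = torch.load(f"{ckpt}/scheduler.pt", map_location="cpu", weights_only=False)
rng_state = torch.load(f"{ckpt}/rng_state.pth", map_location="cpu", weights_only=False)

print(optimizer_state.keys())   # typically 'state' and 'param_groups'
print(scheduler_state)          # LR scheduler state_dict
print(rng_state.keys())         # python/numpy/cpu (and cuda, if saved) RNG states
```

Resuming the run with trainer.train(resume_from_checkpoint="last-checkpoint") reads these same files to restore optimizer, scheduler, and RNG state.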