winnieyangwannan commited on
Commit
d76c973
·
verified ·
1 Parent(s): 492eed1

Training in progress, step 200, checkpoint

Browse files
checkpoint-200/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "gate_proj",
27
  "k_proj",
28
- "o_proj",
29
- "up_proj",
30
  "q_proj",
 
 
31
  "down_proj",
32
- "v_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "v_proj",
27
  "k_proj",
 
 
28
  "q_proj",
29
+ "gate_proj",
30
+ "o_proj",
31
  "down_proj",
32
+ "up_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-200/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b03bcbae977e98dbb28d30db84d89b812576d7e3ce37764495370dd5428077db
3
  size 145287696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96091c94f705abfd93fa76b5e26ab01fc038a2adae9c9a94b8037c14a5080d71
3
  size 145287696
checkpoint-200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd226b3871c92c7488e458ece7628ca7b519dda3cf5c0f8bae5893a01e870046
3
  size 290833618
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7dfe306b99fb0015a85f65c81466ed472ff036879b74eb14adbf2d53284c7b4
3
  size 290833618
checkpoint-200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bd9a592c4a9e7cd2c08cb7c6ad796d33a83626e1e6f48d4851887e8d955063d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5183b16b2feb017749158123747811a04a1ee75226ab2848edc3b8ea64b8634e
3
  size 14244
checkpoint-200/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03d636bf275cca0830f6f24ec5b2f05fe140d74159c6c75ee797e07d187203c0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:399ce82c9e1658d463c19de3c20108c706a76b0356e5e266301d089ef0200499
3
  size 1064
checkpoint-200/trainer_state.json CHANGED
@@ -1,191 +1,319 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.21739130434782608,
5
- "eval_steps": 50,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.010869565217391304,
13
- "grad_norm": 2.4570870399475098,
14
- "learning_rate": 4.981884057971015e-05,
15
- "loss": 2.3612,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.021739130434782608,
20
- "grad_norm": 1.2042421102523804,
21
- "learning_rate": 4.963768115942029e-05,
22
- "loss": 1.1915,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.03260869565217391,
27
- "grad_norm": 1.2019174098968506,
28
- "learning_rate": 4.945652173913044e-05,
29
- "loss": 0.8992,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.043478260869565216,
34
- "grad_norm": 1.0546120405197144,
35
- "learning_rate": 4.9275362318840584e-05,
36
- "loss": 0.7276,
 
 
 
 
 
 
 
 
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.05434782608695652,
41
- "grad_norm": 0.754091203212738,
42
- "learning_rate": 4.909420289855073e-05,
43
- "loss": 0.6419,
 
 
 
 
 
 
 
 
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.05434782608695652,
48
- "eval_loss": 0.7905128598213196,
49
- "eval_runtime": 10.8418,
50
- "eval_samples_per_second": 44.273,
51
- "eval_steps_per_second": 2.767,
52
  "step": 50
53
  },
54
  {
55
- "epoch": 0.06521739130434782,
56
- "grad_norm": 0.7751966714859009,
57
- "learning_rate": 4.891304347826087e-05,
58
- "loss": 0.5971,
 
 
 
 
 
 
 
 
59
  "step": 60
60
  },
61
  {
62
- "epoch": 0.07608695652173914,
63
- "grad_norm": 0.6874057650566101,
64
- "learning_rate": 4.873188405797102e-05,
65
- "loss": 0.608,
66
  "step": 70
67
  },
68
  {
69
- "epoch": 0.08695652173913043,
70
- "grad_norm": 0.7145748734474182,
71
- "learning_rate": 4.855072463768116e-05,
72
- "loss": 0.6111,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  "step": 80
74
  },
75
  {
76
- "epoch": 0.09782608695652174,
77
- "grad_norm": 0.8841484189033508,
78
- "learning_rate": 4.836956521739131e-05,
79
- "loss": 0.5848,
80
  "step": 90
81
  },
82
  {
83
- "epoch": 0.10869565217391304,
84
- "grad_norm": 0.7937784790992737,
85
- "learning_rate": 4.818840579710145e-05,
86
- "loss": 0.5699,
 
 
 
 
 
 
 
 
87
  "step": 100
88
  },
89
  {
90
- "epoch": 0.10869565217391304,
91
- "eval_loss": 0.6956210732460022,
92
- "eval_runtime": 10.863,
93
- "eval_samples_per_second": 44.187,
94
- "eval_steps_per_second": 2.762,
95
  "step": 100
96
  },
97
  {
98
- "epoch": 0.11956521739130435,
99
- "grad_norm": 0.7818441987037659,
100
- "learning_rate": 4.80072463768116e-05,
101
- "loss": 0.5946,
102
  "step": 110
103
  },
104
  {
105
- "epoch": 0.13043478260869565,
106
- "grad_norm": 0.8666340112686157,
107
- "learning_rate": 4.782608695652174e-05,
108
- "loss": 0.5469,
 
 
 
 
 
 
 
 
109
  "step": 120
110
  },
111
  {
112
- "epoch": 0.14130434782608695,
113
- "grad_norm": 0.7637468576431274,
114
- "learning_rate": 4.764492753623189e-05,
115
- "loss": 0.5307,
 
 
 
 
 
 
 
 
116
  "step": 130
117
  },
118
  {
119
- "epoch": 0.15217391304347827,
120
- "grad_norm": 0.8282362222671509,
121
- "learning_rate": 4.746376811594203e-05,
122
- "loss": 0.5312,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  "step": 140
124
  },
125
  {
126
- "epoch": 0.16304347826086957,
127
- "grad_norm": 0.9675197601318359,
128
- "learning_rate": 4.7282608695652177e-05,
129
- "loss": 0.5486,
130
  "step": 150
131
  },
132
  {
133
- "epoch": 0.16304347826086957,
134
- "eval_loss": 0.6567058563232422,
135
- "eval_runtime": 10.8376,
136
- "eval_samples_per_second": 44.29,
137
- "eval_steps_per_second": 2.768,
138
  "step": 150
139
  },
140
  {
141
- "epoch": 0.17391304347826086,
142
- "grad_norm": 0.7035924792289734,
143
- "learning_rate": 4.710144927536232e-05,
144
- "loss": 0.5553,
 
 
 
 
 
 
 
 
145
  "step": 160
146
  },
147
  {
148
- "epoch": 0.18478260869565216,
149
- "grad_norm": 0.8957257866859436,
150
- "learning_rate": 4.6920289855072464e-05,
151
- "loss": 0.54,
152
  "step": 170
153
  },
154
  {
155
- "epoch": 0.1956521739130435,
156
- "grad_norm": 0.8544663190841675,
157
- "learning_rate": 4.673913043478261e-05,
158
- "loss": 0.55,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  "step": 180
160
  },
161
  {
162
- "epoch": 0.20652173913043478,
163
- "grad_norm": 0.7599456310272217,
164
- "learning_rate": 4.655797101449276e-05,
165
- "loss": 0.5102,
166
  "step": 190
167
  },
168
  {
169
- "epoch": 0.21739130434782608,
170
- "grad_norm": 0.9151259064674377,
171
- "learning_rate": 4.63768115942029e-05,
172
- "loss": 0.5372,
 
 
 
 
 
 
 
 
173
  "step": 200
174
  },
175
  {
176
- "epoch": 0.21739130434782608,
177
- "eval_loss": 0.6319016218185425,
178
- "eval_runtime": 10.8237,
179
- "eval_samples_per_second": 44.347,
180
- "eval_steps_per_second": 2.772,
181
  "step": 200
182
  }
183
  ],
184
  "logging_steps": 10,
185
- "max_steps": 2760,
186
  "num_input_tokens_seen": 0,
187
  "num_train_epochs": 3,
188
- "save_steps": 100,
189
  "stateful_callbacks": {
190
  "TrainerControl": {
191
  "args": {
@@ -198,8 +326,8 @@
198
  "attributes": {}
199
  }
200
  },
201
- "total_flos": 1.7520160047366144e+16,
202
- "train_batch_size": 16,
203
  "trial_name": null,
204
  "trial_params": null
205
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.43478260869565216,
5
+ "eval_steps": 10,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.021739130434782608,
13
+ "grad_norm": 2.2049312591552734,
14
+ "learning_rate": 4.963768115942029e-05,
15
+ "loss": 2.3459,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.021739130434782608,
20
+ "eval_loss": 1.5330805778503418,
21
+ "eval_runtime": 9.6608,
22
+ "eval_samples_per_second": 49.685,
23
+ "eval_steps_per_second": 1.553,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.043478260869565216,
28
+ "grad_norm": 1.1927831172943115,
29
+ "learning_rate": 4.9275362318840584e-05,
30
+ "loss": 1.1669,
31
+ "step": 20
32
+ },
33
+ {
34
+ "epoch": 0.043478260869565216,
35
+ "eval_loss": 1.0271039009094238,
36
+ "eval_runtime": 9.7215,
37
+ "eval_samples_per_second": 49.375,
38
+ "eval_steps_per_second": 1.543,
39
  "step": 20
40
  },
41
  {
42
+ "epoch": 0.06521739130434782,
43
+ "grad_norm": 1.0509072542190552,
44
+ "learning_rate": 4.891304347826087e-05,
45
+ "loss": 0.8327,
46
  "step": 30
47
  },
48
  {
49
+ "epoch": 0.06521739130434782,
50
+ "eval_loss": 0.9039635062217712,
51
+ "eval_runtime": 9.7818,
52
+ "eval_samples_per_second": 49.071,
53
+ "eval_steps_per_second": 1.533,
54
+ "step": 30
55
+ },
56
+ {
57
+ "epoch": 0.08695652173913043,
58
+ "grad_norm": 0.8860757350921631,
59
+ "learning_rate": 4.855072463768116e-05,
60
+ "loss": 0.731,
61
  "step": 40
62
  },
63
  {
64
+ "epoch": 0.08695652173913043,
65
+ "eval_loss": 0.8067704439163208,
66
+ "eval_runtime": 9.8118,
67
+ "eval_samples_per_second": 48.921,
68
+ "eval_steps_per_second": 1.529,
69
+ "step": 40
70
+ },
71
+ {
72
+ "epoch": 0.10869565217391304,
73
+ "grad_norm": 0.57489013671875,
74
+ "learning_rate": 4.818840579710145e-05,
75
+ "loss": 0.6501,
76
  "step": 50
77
  },
78
  {
79
+ "epoch": 0.10869565217391304,
80
+ "eval_loss": 0.7662845849990845,
81
+ "eval_runtime": 9.8455,
82
+ "eval_samples_per_second": 48.753,
83
+ "eval_steps_per_second": 1.524,
84
  "step": 50
85
  },
86
  {
87
+ "epoch": 0.13043478260869565,
88
+ "grad_norm": 0.5091506242752075,
89
+ "learning_rate": 4.782608695652174e-05,
90
+ "loss": 0.6288,
91
+ "step": 60
92
+ },
93
+ {
94
+ "epoch": 0.13043478260869565,
95
+ "eval_loss": 0.7400590181350708,
96
+ "eval_runtime": 9.8714,
97
+ "eval_samples_per_second": 48.626,
98
+ "eval_steps_per_second": 1.52,
99
  "step": 60
100
  },
101
  {
102
+ "epoch": 0.15217391304347827,
103
+ "grad_norm": 0.5590857863426208,
104
+ "learning_rate": 4.746376811594203e-05,
105
+ "loss": 0.5838,
106
  "step": 70
107
  },
108
  {
109
+ "epoch": 0.15217391304347827,
110
+ "eval_loss": 0.7244360446929932,
111
+ "eval_runtime": 9.8787,
112
+ "eval_samples_per_second": 48.589,
113
+ "eval_steps_per_second": 1.518,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.17391304347826086,
118
+ "grad_norm": 0.553692638874054,
119
+ "learning_rate": 4.710144927536232e-05,
120
+ "loss": 0.5979,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.17391304347826086,
125
+ "eval_loss": 0.7010239958763123,
126
+ "eval_runtime": 9.881,
127
+ "eval_samples_per_second": 48.578,
128
+ "eval_steps_per_second": 1.518,
129
  "step": 80
130
  },
131
  {
132
+ "epoch": 0.1956521739130435,
133
+ "grad_norm": 0.624009370803833,
134
+ "learning_rate": 4.673913043478261e-05,
135
+ "loss": 0.5894,
136
  "step": 90
137
  },
138
  {
139
+ "epoch": 0.1956521739130435,
140
+ "eval_loss": 0.6886861324310303,
141
+ "eval_runtime": 9.8875,
142
+ "eval_samples_per_second": 48.546,
143
+ "eval_steps_per_second": 1.517,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 0.21739130434782608,
148
+ "grad_norm": 0.6356642246246338,
149
+ "learning_rate": 4.63768115942029e-05,
150
+ "loss": 0.5625,
151
  "step": 100
152
  },
153
  {
154
+ "epoch": 0.21739130434782608,
155
+ "eval_loss": 0.6777426600456238,
156
+ "eval_runtime": 9.8957,
157
+ "eval_samples_per_second": 48.506,
158
+ "eval_steps_per_second": 1.516,
159
  "step": 100
160
  },
161
  {
162
+ "epoch": 0.2391304347826087,
163
+ "grad_norm": 0.5606981515884399,
164
+ "learning_rate": 4.601449275362319e-05,
165
+ "loss": 0.5701,
166
  "step": 110
167
  },
168
  {
169
+ "epoch": 0.2391304347826087,
170
+ "eval_loss": 0.6700096130371094,
171
+ "eval_runtime": 9.8935,
172
+ "eval_samples_per_second": 48.516,
173
+ "eval_steps_per_second": 1.516,
174
+ "step": 110
175
+ },
176
+ {
177
+ "epoch": 0.2608695652173913,
178
+ "grad_norm": 0.584280252456665,
179
+ "learning_rate": 4.565217391304348e-05,
180
+ "loss": 0.5365,
181
  "step": 120
182
  },
183
  {
184
+ "epoch": 0.2608695652173913,
185
+ "eval_loss": 0.6583032011985779,
186
+ "eval_runtime": 9.9026,
187
+ "eval_samples_per_second": 48.472,
188
+ "eval_steps_per_second": 1.515,
189
+ "step": 120
190
+ },
191
+ {
192
+ "epoch": 0.2826086956521739,
193
+ "grad_norm": 0.6070062518119812,
194
+ "learning_rate": 4.528985507246377e-05,
195
+ "loss": 0.5227,
196
  "step": 130
197
  },
198
  {
199
+ "epoch": 0.2826086956521739,
200
+ "eval_loss": 0.6501919627189636,
201
+ "eval_runtime": 9.8992,
202
+ "eval_samples_per_second": 48.489,
203
+ "eval_steps_per_second": 1.515,
204
+ "step": 130
205
+ },
206
+ {
207
+ "epoch": 0.30434782608695654,
208
+ "grad_norm": 0.6242979764938354,
209
+ "learning_rate": 4.492753623188406e-05,
210
+ "loss": 0.5305,
211
+ "step": 140
212
+ },
213
+ {
214
+ "epoch": 0.30434782608695654,
215
+ "eval_loss": 0.6420691013336182,
216
+ "eval_runtime": 9.8828,
217
+ "eval_samples_per_second": 48.569,
218
+ "eval_steps_per_second": 1.518,
219
  "step": 140
220
  },
221
  {
222
+ "epoch": 0.32608695652173914,
223
+ "grad_norm": 0.6196560859680176,
224
+ "learning_rate": 4.456521739130435e-05,
225
+ "loss": 0.5334,
226
  "step": 150
227
  },
228
  {
229
+ "epoch": 0.32608695652173914,
230
+ "eval_loss": 0.6367022395133972,
231
+ "eval_runtime": 9.8886,
232
+ "eval_samples_per_second": 48.541,
233
+ "eval_steps_per_second": 1.517,
234
  "step": 150
235
  },
236
  {
237
+ "epoch": 0.34782608695652173,
238
+ "grad_norm": 0.6115924119949341,
239
+ "learning_rate": 4.4202898550724645e-05,
240
+ "loss": 0.5163,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 0.34782608695652173,
245
+ "eval_loss": 0.6340358853340149,
246
+ "eval_runtime": 9.9159,
247
+ "eval_samples_per_second": 48.407,
248
+ "eval_steps_per_second": 1.513,
249
  "step": 160
250
  },
251
  {
252
+ "epoch": 0.3695652173913043,
253
+ "grad_norm": 0.5742267966270447,
254
+ "learning_rate": 4.384057971014493e-05,
255
+ "loss": 0.5156,
256
  "step": 170
257
  },
258
  {
259
+ "epoch": 0.3695652173913043,
260
+ "eval_loss": 0.6300316452980042,
261
+ "eval_runtime": 9.8819,
262
+ "eval_samples_per_second": 48.574,
263
+ "eval_steps_per_second": 1.518,
264
+ "step": 170
265
+ },
266
+ {
267
+ "epoch": 0.391304347826087,
268
+ "grad_norm": 0.5701326727867126,
269
+ "learning_rate": 4.347826086956522e-05,
270
+ "loss": 0.5087,
271
+ "step": 180
272
+ },
273
+ {
274
+ "epoch": 0.391304347826087,
275
+ "eval_loss": 0.6247866153717041,
276
+ "eval_runtime": 9.8798,
277
+ "eval_samples_per_second": 48.584,
278
+ "eval_steps_per_second": 1.518,
279
  "step": 180
280
  },
281
  {
282
+ "epoch": 0.41304347826086957,
283
+ "grad_norm": 0.6273636221885681,
284
+ "learning_rate": 4.3115942028985515e-05,
285
+ "loss": 0.5132,
286
  "step": 190
287
  },
288
  {
289
+ "epoch": 0.41304347826086957,
290
+ "eval_loss": 0.6192939877510071,
291
+ "eval_runtime": 9.8693,
292
+ "eval_samples_per_second": 48.636,
293
+ "eval_steps_per_second": 1.52,
294
+ "step": 190
295
+ },
296
+ {
297
+ "epoch": 0.43478260869565216,
298
+ "grad_norm": 0.6055566668510437,
299
+ "learning_rate": 4.27536231884058e-05,
300
+ "loss": 0.5136,
301
  "step": 200
302
  },
303
  {
304
+ "epoch": 0.43478260869565216,
305
+ "eval_loss": 0.615530788898468,
306
+ "eval_runtime": 9.8728,
307
+ "eval_samples_per_second": 48.618,
308
+ "eval_steps_per_second": 1.519,
309
  "step": 200
310
  }
311
  ],
312
  "logging_steps": 10,
313
+ "max_steps": 1380,
314
  "num_input_tokens_seen": 0,
315
  "num_train_epochs": 3,
316
+ "save_steps": 10,
317
  "stateful_callbacks": {
318
  "TrainerControl": {
319
  "args": {
 
326
  "attributes": {}
327
  }
328
  },
329
+ "total_flos": 3.683737449529344e+16,
330
+ "train_batch_size": 32,
331
  "trial_name": null,
332
  "trial_params": null
333
  }
checkpoint-200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81786d634f359bee447d9c2dad023b92502900c87e9208470ed326b1204c5b1a
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cdf2cbd96a7d8012e1f3da0569783099ba26a3ecbec680ff36cde09793d0889
3
  size 5816