pranay-43 committed on
Commit
3af14bd
·
verified ·
1 Parent(s): 5bf57cf

End of training

Browse files
README.md CHANGED
@@ -22,7 +22,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.9126310534198702
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -32,8 +32,8 @@ should probably proofread and complete it, then remove this comment. -->
32
 
33
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.3206
36
- - Accuracy: 0.9126
37
 
38
  ## Model description
39
 
@@ -61,52 +61,17 @@ The following hyperparameters were used during training:
61
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
  - lr_scheduler_type: linear
63
  - lr_scheduler_warmup_ratio: 0.1
64
- - num_epochs: 40
65
 
66
  ### Training results
67
 
68
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
69
- |:-------------:|:-------:|:----:|:---------------:|:--------:|
70
- | 1.1235 | 0.9841 | 31 | 1.0728 | 0.6565 |
71
- | 0.8201 | 2.0 | 63 | 0.8410 | 0.6915 |
72
- | 0.634 | 2.9841 | 94 | 0.6349 | 0.7773 |
73
- | 0.561 | 4.0 | 126 | 0.5739 | 0.7958 |
74
- | 0.4839 | 4.9841 | 157 | 0.4544 | 0.8382 |
75
- | 0.4073 | 6.0 | 189 | 0.4398 | 0.8462 |
76
- | 0.4035 | 6.9841 | 220 | 0.4284 | 0.8487 |
77
- | 0.3609 | 8.0 | 252 | 0.3886 | 0.8542 |
78
- | 0.3196 | 8.9841 | 283 | 0.4561 | 0.8432 |
79
- | 0.2723 | 10.0 | 315 | 0.3703 | 0.8697 |
80
- | 0.2521 | 10.9841 | 346 | 0.3639 | 0.8722 |
81
- | 0.2644 | 12.0 | 378 | 0.3288 | 0.8832 |
82
- | 0.2282 | 12.9841 | 409 | 0.3625 | 0.8712 |
83
- | 0.2435 | 14.0 | 441 | 0.3175 | 0.8962 |
84
- | 0.2051 | 14.9841 | 472 | 0.3649 | 0.8707 |
85
- | 0.1922 | 16.0 | 504 | 0.3022 | 0.8952 |
86
- | 0.1824 | 16.9841 | 535 | 0.3596 | 0.8752 |
87
- | 0.1799 | 18.0 | 567 | 0.3293 | 0.8942 |
88
- | 0.1562 | 18.9841 | 598 | 0.3204 | 0.8992 |
89
- | 0.1582 | 20.0 | 630 | 0.3467 | 0.8837 |
90
- | 0.1516 | 20.9841 | 661 | 0.3247 | 0.8942 |
91
- | 0.1285 | 22.0 | 693 | 0.3304 | 0.8912 |
92
- | 0.1454 | 22.9841 | 724 | 0.3031 | 0.8957 |
93
- | 0.1548 | 24.0 | 756 | 0.3086 | 0.8992 |
94
- | 0.1041 | 24.9841 | 787 | 0.2945 | 0.9021 |
95
- | 0.1161 | 26.0 | 819 | 0.2968 | 0.9106 |
96
- | 0.1141 | 26.9841 | 850 | 0.2805 | 0.9096 |
97
- | 0.1078 | 28.0 | 882 | 0.3178 | 0.9011 |
98
- | 0.1192 | 28.9841 | 913 | 0.3182 | 0.9041 |
99
- | 0.0977 | 30.0 | 945 | 0.3000 | 0.9061 |
100
- | 0.1011 | 30.9841 | 976 | 0.3065 | 0.9041 |
101
- | 0.0865 | 32.0 | 1008 | 0.3193 | 0.9051 |
102
- | 0.0845 | 32.9841 | 1039 | 0.3047 | 0.9121 |
103
- | 0.0823 | 34.0 | 1071 | 0.3037 | 0.9116 |
104
- | 0.0809 | 34.9841 | 1102 | 0.3329 | 0.9011 |
105
- | 0.0789 | 36.0 | 1134 | 0.3215 | 0.9121 |
106
- | 0.0724 | 36.9841 | 1165 | 0.3273 | 0.9096 |
107
- | 0.0722 | 38.0 | 1197 | 0.3096 | 0.9091 |
108
- | 0.0811 | 38.9841 | 1228 | 0.3206 | 0.9126 |
109
- | 0.0659 | 39.3651 | 1240 | 0.3216 | 0.9126 |
110
 
111
 
112
  ### Framework versions
 
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
+ value: 0.994671729544341
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
32
 
33
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
+ - Loss: 0.0256
36
+ - Accuracy: 0.9947
37
 
38
  ## Model description
39
 
 
61
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
  - lr_scheduler_type: linear
63
  - lr_scheduler_warmup_ratio: 0.1
64
+ - num_epochs: 5
65
 
66
  ### Training results
67
 
68
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
69
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
70
+ | 0.0268 | 0.9990 | 255 | 0.0256 | 0.9947 |
71
+ | 0.0167 | 1.9980 | 510 | 0.0275 | 0.9947 |
72
+ | 0.0177 | 2.9971 | 765 | 0.0268 | 0.9936 |
73
+ | 0.0158 | 4.0 | 1021 | 0.0238 | 0.9945 |
74
+ | 0.0112 | 4.9951 | 1275 | 0.0259 | 0.9944 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
 
77
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 39.36507936507937,
3
- "eval_accuracy": 0.9126310534198702,
4
- "eval_loss": 0.320569783449173,
5
- "eval_runtime": 25.2588,
6
- "eval_samples_per_second": 79.299,
7
- "eval_steps_per_second": 1.267,
8
- "total_flos": 7.840972523253768e+18,
9
- "train_loss": 0.24690783341084757,
10
- "train_runtime": 6145.2319,
11
- "train_samples_per_second": 52.151,
12
- "train_steps_per_second": 0.202
13
  }
 
1
  {
2
+ "epoch": 4.995102840352596,
3
+ "eval_accuracy": 0.994671729544341,
4
+ "eval_loss": 0.025641364976763725,
5
+ "eval_runtime": 102.6469,
6
+ "eval_samples_per_second": 159.07,
7
+ "eval_steps_per_second": 2.494,
8
+ "total_flos": 8.109813238393209e+18,
9
+ "train_loss": 0.030976083857171675,
10
+ "train_runtime": 4473.2861,
11
+ "train_samples_per_second": 73.002,
12
+ "train_steps_per_second": 0.285
13
  }
config.json CHANGED
@@ -17,24 +17,14 @@
17
  "hidden_dropout_prob": 0.0,
18
  "hidden_size": 768,
19
  "id2label": {
20
- "0": "akiec",
21
- "1": "bcc",
22
- "2": "bkl",
23
- "3": "df",
24
- "4": "mel",
25
- "5": "nv",
26
- "6": "vasc"
27
  },
28
  "image_size": 224,
29
  "initializer_range": 0.02,
30
  "label2id": {
31
- "akiec": 0,
32
- "bcc": 1,
33
- "bkl": 2,
34
- "df": 3,
35
- "mel": 4,
36
- "nv": 5,
37
- "vasc": 6
38
  },
39
  "layer_norm_eps": 1e-05,
40
  "mlp_ratio": 4.0,
 
17
  "hidden_dropout_prob": 0.0,
18
  "hidden_size": 768,
19
  "id2label": {
20
+ "0": "images_class_0",
21
+ "1": "images_class_1"
 
 
 
 
 
22
  },
23
  "image_size": 224,
24
  "initializer_range": 0.02,
25
  "label2id": {
26
+ "images_class_0": 0,
27
+ "images_class_1": 1
 
 
 
 
 
28
  },
29
  "layer_norm_eps": 1e-05,
30
  "mlp_ratio": 4.0,
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 39.36507936507937,
3
- "eval_accuracy": 0.9126310534198702,
4
- "eval_loss": 0.320569783449173,
5
- "eval_runtime": 25.2588,
6
- "eval_samples_per_second": 79.299,
7
- "eval_steps_per_second": 1.267
8
  }
 
1
  {
2
+ "epoch": 4.995102840352596,
3
+ "eval_accuracy": 0.994671729544341,
4
+ "eval_loss": 0.025641364976763725,
5
+ "eval_runtime": 102.6469,
6
+ "eval_samples_per_second": 159.07,
7
+ "eval_steps_per_second": 2.494
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b535bb8ce3f7d269ecb963663144b8b175ee46aaf079e873210516f57e9b4cd
3
- size 110358212
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e99d284193dff48e7408acb1e292b749fb80dfded99dd7a9cac006f9cedcd63
3
+ size 110342832
runs/Sep01_13-00-27_c4735777ea3c/events.out.tfevents.1725195641.c4735777ea3c.36.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:460c1abd62a221991453d26571558712f0be7b82e9d8844149ce04890437fb34
3
+ size 5415
runs/Sep01_13-01-10_c4735777ea3c/events.out.tfevents.1725195681.c4735777ea3c.36.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a546d17d197aa389b0ba06ae202fe68078c9f0d6a0c76655956e79a7e7f0e5f
3
+ size 34131
runs/Sep01_13-01-10_c4735777ea3c/events.out.tfevents.1725200277.c4735777ea3c.36.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ede407413d2c9b7ea17e049da582d1cee5f855b6df7e41b4c24169224716570a
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 39.36507936507937,
3
- "total_flos": 7.840972523253768e+18,
4
- "train_loss": 0.24690783341084757,
5
- "train_runtime": 6145.2319,
6
- "train_samples_per_second": 52.151,
7
- "train_steps_per_second": 0.202
8
  }
 
1
  {
2
+ "epoch": 4.995102840352596,
3
+ "total_flos": 8.109813238393209e+18,
4
+ "train_loss": 0.030976083857171675,
5
+ "train_runtime": 4473.2861,
6
+ "train_samples_per_second": 73.002,
7
+ "train_steps_per_second": 0.285
8
  }
trainer_state.json CHANGED
@@ -1,1255 +1,961 @@
1
  {
2
- "best_metric": 0.9126310534198702,
3
- "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-1228",
4
- "epoch": 39.36507936507937,
5
  "eval_steps": 500,
6
- "global_step": 1240,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.31746031746031744,
13
- "grad_norm": 7.819032192230225,
14
- "learning_rate": 4.032258064516129e-06,
15
- "loss": 1.711,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.6349206349206349,
20
- "grad_norm": 6.724701404571533,
21
- "learning_rate": 8.064516129032258e-06,
22
- "loss": 1.4406,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.9523809523809523,
27
- "grad_norm": 3.3038227558135986,
28
- "learning_rate": 1.2096774193548388e-05,
29
- "loss": 1.1235,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.9841269841269841,
34
- "eval_accuracy": 0.6565152271592611,
35
- "eval_loss": 1.0728416442871094,
36
- "eval_runtime": 34.2718,
37
- "eval_samples_per_second": 58.445,
38
- "eval_steps_per_second": 0.934,
39
- "step": 31
40
- },
41
- {
42
- "epoch": 1.2698412698412698,
43
- "grad_norm": 6.524947166442871,
44
- "learning_rate": 1.6129032258064517e-05,
45
- "loss": 0.929,
46
  "step": 40
47
  },
48
  {
49
- "epoch": 1.5873015873015874,
50
- "grad_norm": 4.941163539886475,
51
- "learning_rate": 2.0161290322580645e-05,
52
- "loss": 0.8707,
53
  "step": 50
54
  },
55
  {
56
- "epoch": 1.9047619047619047,
57
- "grad_norm": 5.925607204437256,
58
- "learning_rate": 2.4193548387096777e-05,
59
- "loss": 0.8201,
60
  "step": 60
61
  },
62
  {
63
- "epoch": 2.0,
64
- "eval_accuracy": 0.691462805791313,
65
- "eval_loss": 0.8410265445709229,
66
- "eval_runtime": 24.8815,
67
- "eval_samples_per_second": 80.502,
68
- "eval_steps_per_second": 1.286,
69
- "step": 63
70
- },
71
- {
72
- "epoch": 2.2222222222222223,
73
- "grad_norm": 3.6991262435913086,
74
- "learning_rate": 2.822580645161291e-05,
75
- "loss": 0.7149,
76
  "step": 70
77
  },
78
  {
79
- "epoch": 2.5396825396825395,
80
- "grad_norm": 9.60767936706543,
81
- "learning_rate": 3.2258064516129034e-05,
82
- "loss": 0.6696,
83
  "step": 80
84
  },
85
  {
86
- "epoch": 2.857142857142857,
87
- "grad_norm": 4.910393714904785,
88
- "learning_rate": 3.6290322580645165e-05,
89
- "loss": 0.634,
90
  "step": 90
91
  },
92
  {
93
- "epoch": 2.984126984126984,
94
- "eval_accuracy": 0.7773339990014978,
95
- "eval_loss": 0.6348706483840942,
96
- "eval_runtime": 24.8572,
97
- "eval_samples_per_second": 80.58,
98
- "eval_steps_per_second": 1.287,
99
- "step": 94
100
- },
101
- {
102
- "epoch": 3.1746031746031744,
103
- "grad_norm": 9.863725662231445,
104
- "learning_rate": 4.032258064516129e-05,
105
- "loss": 0.5768,
106
  "step": 100
107
  },
108
  {
109
- "epoch": 3.492063492063492,
110
- "grad_norm": 6.171677112579346,
111
- "learning_rate": 4.435483870967742e-05,
112
- "loss": 0.5521,
113
  "step": 110
114
  },
115
  {
116
- "epoch": 3.8095238095238093,
117
- "grad_norm": 8.241861343383789,
118
- "learning_rate": 4.8387096774193554e-05,
119
- "loss": 0.561,
120
  "step": 120
121
  },
122
  {
123
- "epoch": 4.0,
124
- "eval_accuracy": 0.7958062905641538,
125
- "eval_loss": 0.5738771557807922,
126
- "eval_runtime": 24.4894,
127
- "eval_samples_per_second": 81.79,
128
- "eval_steps_per_second": 1.307,
129
- "step": 126
130
- },
131
- {
132
- "epoch": 4.1269841269841265,
133
- "grad_norm": 6.913752555847168,
134
- "learning_rate": 4.973118279569893e-05,
135
- "loss": 0.4906,
136
  "step": 130
137
  },
138
  {
139
- "epoch": 4.444444444444445,
140
- "grad_norm": 4.96409797668457,
141
- "learning_rate": 4.92831541218638e-05,
142
- "loss": 0.4588,
143
  "step": 140
144
  },
145
  {
146
- "epoch": 4.761904761904762,
147
- "grad_norm": 11.189750671386719,
148
- "learning_rate": 4.8835125448028677e-05,
149
- "loss": 0.4839,
150
  "step": 150
151
  },
152
  {
153
- "epoch": 4.984126984126984,
154
- "eval_accuracy": 0.8382426360459311,
155
- "eval_loss": 0.4543745219707489,
156
- "eval_runtime": 24.7801,
157
- "eval_samples_per_second": 80.831,
158
- "eval_steps_per_second": 1.291,
159
- "step": 157
160
- },
161
- {
162
- "epoch": 5.079365079365079,
163
- "grad_norm": 4.5059404373168945,
164
- "learning_rate": 4.8387096774193554e-05,
165
- "loss": 0.4574,
166
  "step": 160
167
  },
168
  {
169
- "epoch": 5.396825396825397,
170
- "grad_norm": 10.079263687133789,
171
- "learning_rate": 4.7939068100358424e-05,
172
- "loss": 0.4431,
173
  "step": 170
174
  },
175
  {
176
- "epoch": 5.714285714285714,
177
- "grad_norm": 8.396503448486328,
178
- "learning_rate": 4.74910394265233e-05,
179
- "loss": 0.4073,
180
  "step": 180
181
  },
182
  {
183
- "epoch": 6.0,
184
- "eval_accuracy": 0.8462306540189716,
185
- "eval_loss": 0.4398318827152252,
186
- "eval_runtime": 24.2749,
187
- "eval_samples_per_second": 82.513,
188
- "eval_steps_per_second": 1.318,
189
- "step": 189
190
- },
191
- {
192
- "epoch": 6.031746031746032,
193
- "grad_norm": 4.483707904815674,
194
- "learning_rate": 4.704301075268818e-05,
195
- "loss": 0.4442,
196
  "step": 190
197
  },
198
  {
199
- "epoch": 6.349206349206349,
200
- "grad_norm": 7.485744476318359,
201
- "learning_rate": 4.659498207885305e-05,
202
- "loss": 0.3724,
203
  "step": 200
204
  },
205
  {
206
- "epoch": 6.666666666666667,
207
- "grad_norm": 6.369739055633545,
208
- "learning_rate": 4.614695340501792e-05,
209
- "loss": 0.3718,
210
  "step": 210
211
  },
212
  {
213
- "epoch": 6.984126984126984,
214
- "grad_norm": 9.252875328063965,
215
- "learning_rate": 4.56989247311828e-05,
216
- "loss": 0.4035,
217
- "step": 220
218
- },
219
- {
220
- "epoch": 6.984126984126984,
221
- "eval_accuracy": 0.8487269096355466,
222
- "eval_loss": 0.4284241795539856,
223
- "eval_runtime": 24.7921,
224
- "eval_samples_per_second": 80.792,
225
- "eval_steps_per_second": 1.291,
226
  "step": 220
227
  },
228
  {
229
- "epoch": 7.301587301587301,
230
- "grad_norm": 11.154105186462402,
231
- "learning_rate": 4.5250896057347674e-05,
232
- "loss": 0.3237,
233
  "step": 230
234
  },
235
  {
236
- "epoch": 7.619047619047619,
237
- "grad_norm": 5.150094509124756,
238
- "learning_rate": 4.4802867383512545e-05,
239
- "loss": 0.3502,
240
  "step": 240
241
  },
242
  {
243
- "epoch": 7.936507936507937,
244
- "grad_norm": 5.770071506500244,
245
- "learning_rate": 4.435483870967742e-05,
246
- "loss": 0.3609,
247
  "step": 250
248
  },
249
  {
250
- "epoch": 8.0,
251
- "eval_accuracy": 0.854218671992012,
252
- "eval_loss": 0.38860228657722473,
253
- "eval_runtime": 24.3118,
254
- "eval_samples_per_second": 82.388,
255
- "eval_steps_per_second": 1.316,
256
- "step": 252
257
  },
258
  {
259
- "epoch": 8.253968253968253,
260
- "grad_norm": 9.657562255859375,
261
- "learning_rate": 4.390681003584229e-05,
262
- "loss": 0.3115,
263
  "step": 260
264
  },
265
  {
266
- "epoch": 8.571428571428571,
267
- "grad_norm": 6.7540812492370605,
268
- "learning_rate": 4.345878136200717e-05,
269
- "loss": 0.306,
270
  "step": 270
271
  },
272
  {
273
- "epoch": 8.88888888888889,
274
- "grad_norm": 9.384312629699707,
275
- "learning_rate": 4.301075268817205e-05,
276
- "loss": 0.3196,
277
  "step": 280
278
  },
279
  {
280
- "epoch": 8.984126984126984,
281
- "eval_accuracy": 0.8432351472790813,
282
- "eval_loss": 0.45614635944366455,
283
- "eval_runtime": 25.2083,
284
- "eval_samples_per_second": 79.458,
285
- "eval_steps_per_second": 1.269,
286
- "step": 283
287
- },
288
- {
289
- "epoch": 9.206349206349206,
290
- "grad_norm": 5.623856544494629,
291
- "learning_rate": 4.256272401433692e-05,
292
- "loss": 0.3088,
293
  "step": 290
294
  },
295
  {
296
- "epoch": 9.523809523809524,
297
- "grad_norm": 8.197659492492676,
298
- "learning_rate": 4.2114695340501795e-05,
299
- "loss": 0.2964,
300
  "step": 300
301
  },
302
  {
303
- "epoch": 9.841269841269842,
304
- "grad_norm": 6.508030414581299,
305
- "learning_rate": 4.166666666666667e-05,
306
- "loss": 0.2723,
307
  "step": 310
308
  },
309
  {
310
- "epoch": 10.0,
311
- "eval_accuracy": 0.8696954568147778,
312
- "eval_loss": 0.3703114986419678,
313
- "eval_runtime": 24.8054,
314
- "eval_samples_per_second": 80.749,
315
- "eval_steps_per_second": 1.29,
316
- "step": 315
317
- },
318
- {
319
- "epoch": 10.158730158730158,
320
- "grad_norm": 6.234573841094971,
321
- "learning_rate": 4.121863799283154e-05,
322
- "loss": 0.2525,
323
  "step": 320
324
  },
325
  {
326
- "epoch": 10.476190476190476,
327
- "grad_norm": 5.158910274505615,
328
- "learning_rate": 4.077060931899642e-05,
329
- "loss": 0.2707,
330
  "step": 330
331
  },
332
  {
333
- "epoch": 10.793650793650794,
334
- "grad_norm": 7.1439971923828125,
335
- "learning_rate": 4.032258064516129e-05,
336
- "loss": 0.2521,
337
  "step": 340
338
  },
339
  {
340
- "epoch": 10.984126984126984,
341
- "eval_accuracy": 0.872191712431353,
342
- "eval_loss": 0.3638671636581421,
343
- "eval_runtime": 24.7377,
344
- "eval_samples_per_second": 80.97,
345
- "eval_steps_per_second": 1.294,
346
- "step": 346
347
- },
348
- {
349
- "epoch": 11.11111111111111,
350
- "grad_norm": 6.113388538360596,
351
- "learning_rate": 3.987455197132617e-05,
352
- "loss": 0.271,
353
  "step": 350
354
  },
355
  {
356
- "epoch": 11.428571428571429,
357
- "grad_norm": 8.854401588439941,
358
- "learning_rate": 3.9426523297491045e-05,
359
- "loss": 0.2671,
360
  "step": 360
361
  },
362
  {
363
- "epoch": 11.746031746031747,
364
- "grad_norm": 8.250101089477539,
365
- "learning_rate": 3.8978494623655915e-05,
366
- "loss": 0.2644,
367
  "step": 370
368
  },
369
  {
370
- "epoch": 12.0,
371
- "eval_accuracy": 0.8831752371442836,
372
- "eval_loss": 0.32876166701316833,
373
- "eval_runtime": 24.8834,
374
- "eval_samples_per_second": 80.495,
375
- "eval_steps_per_second": 1.286,
376
- "step": 378
377
- },
378
- {
379
- "epoch": 12.063492063492063,
380
- "grad_norm": 6.271592140197754,
381
- "learning_rate": 3.8530465949820786e-05,
382
- "loss": 0.2441,
383
  "step": 380
384
  },
385
  {
386
- "epoch": 12.380952380952381,
387
- "grad_norm": 6.686671733856201,
388
- "learning_rate": 3.808243727598566e-05,
389
- "loss": 0.2624,
390
  "step": 390
391
  },
392
  {
393
- "epoch": 12.698412698412698,
394
- "grad_norm": 6.1020989418029785,
395
- "learning_rate": 3.763440860215054e-05,
396
- "loss": 0.2282,
397
  "step": 400
398
  },
399
  {
400
- "epoch": 12.984126984126984,
401
- "eval_accuracy": 0.871193210184723,
402
- "eval_loss": 0.36250728368759155,
403
- "eval_runtime": 24.5366,
404
- "eval_samples_per_second": 81.633,
405
- "eval_steps_per_second": 1.304,
406
- "step": 409
407
- },
408
- {
409
- "epoch": 13.015873015873016,
410
- "grad_norm": 7.433064937591553,
411
- "learning_rate": 3.718637992831541e-05,
412
- "loss": 0.2257,
413
  "step": 410
414
  },
415
  {
416
- "epoch": 13.333333333333334,
417
- "grad_norm": 6.756991863250732,
418
- "learning_rate": 3.673835125448029e-05,
419
- "loss": 0.2036,
420
  "step": 420
421
  },
422
  {
423
- "epoch": 13.65079365079365,
424
- "grad_norm": 6.9047393798828125,
425
- "learning_rate": 3.6290322580645165e-05,
426
- "loss": 0.2335,
427
  "step": 430
428
  },
429
  {
430
- "epoch": 13.968253968253968,
431
- "grad_norm": 9.594013214111328,
432
- "learning_rate": 3.5842293906810036e-05,
433
- "loss": 0.2435,
434
  "step": 440
435
  },
436
  {
437
- "epoch": 14.0,
438
- "eval_accuracy": 0.8961557663504743,
439
- "eval_loss": 0.3175117075443268,
440
- "eval_runtime": 24.4645,
441
- "eval_samples_per_second": 81.874,
442
- "eval_steps_per_second": 1.308,
443
- "step": 441
444
- },
445
- {
446
- "epoch": 14.285714285714286,
447
- "grad_norm": 8.859041213989258,
448
- "learning_rate": 3.539426523297491e-05,
449
- "loss": 0.2068,
450
  "step": 450
451
  },
452
  {
453
- "epoch": 14.603174603174603,
454
- "grad_norm": 7.455759048461914,
455
- "learning_rate": 3.494623655913979e-05,
456
- "loss": 0.2284,
457
  "step": 460
458
  },
459
  {
460
- "epoch": 14.920634920634921,
461
- "grad_norm": 5.559782028198242,
462
- "learning_rate": 3.449820788530466e-05,
463
- "loss": 0.2051,
464
  "step": 470
465
  },
466
  {
467
- "epoch": 14.984126984126984,
468
- "eval_accuracy": 0.8706939590614079,
469
- "eval_loss": 0.3648614287376404,
470
- "eval_runtime": 24.4773,
471
- "eval_samples_per_second": 81.831,
472
- "eval_steps_per_second": 1.307,
473
- "step": 472
474
- },
475
- {
476
- "epoch": 15.238095238095237,
477
- "grad_norm": 5.328235149383545,
478
- "learning_rate": 3.405017921146954e-05,
479
- "loss": 0.2018,
480
  "step": 480
481
  },
482
  {
483
- "epoch": 15.555555555555555,
484
- "grad_norm": 7.363594055175781,
485
- "learning_rate": 3.360215053763441e-05,
486
- "loss": 0.1869,
487
  "step": 490
488
  },
489
  {
490
- "epoch": 15.873015873015873,
491
- "grad_norm": 10.774445533752441,
492
- "learning_rate": 3.3154121863799286e-05,
493
- "loss": 0.1922,
494
  "step": 500
495
  },
496
  {
497
- "epoch": 16.0,
498
- "eval_accuracy": 0.8951572641038442,
499
- "eval_loss": 0.30218109488487244,
500
- "eval_runtime": 24.9213,
501
- "eval_samples_per_second": 80.373,
502
- "eval_steps_per_second": 1.284,
503
- "step": 504
504
  },
505
  {
506
- "epoch": 16.19047619047619,
507
- "grad_norm": 6.6121039390563965,
508
- "learning_rate": 3.270609318996416e-05,
509
- "loss": 0.1825,
 
 
510
  "step": 510
511
  },
512
  {
513
- "epoch": 16.507936507936506,
514
- "grad_norm": 6.801867485046387,
515
- "learning_rate": 3.2258064516129034e-05,
516
- "loss": 0.1741,
517
  "step": 520
518
  },
519
  {
520
- "epoch": 16.825396825396826,
521
- "grad_norm": 6.467336177825928,
522
- "learning_rate": 3.1810035842293904e-05,
523
- "loss": 0.1824,
524
  "step": 530
525
  },
526
  {
527
- "epoch": 16.984126984126984,
528
- "eval_accuracy": 0.8751872191712431,
529
- "eval_loss": 0.35959309339523315,
530
- "eval_runtime": 24.721,
531
- "eval_samples_per_second": 81.024,
532
- "eval_steps_per_second": 1.294,
533
- "step": 535
534
- },
535
- {
536
- "epoch": 17.142857142857142,
537
- "grad_norm": 7.638742923736572,
538
- "learning_rate": 3.136200716845878e-05,
539
- "loss": 0.1841,
540
  "step": 540
541
  },
542
  {
543
- "epoch": 17.46031746031746,
544
- "grad_norm": 12.39472484588623,
545
- "learning_rate": 3.091397849462366e-05,
546
- "loss": 0.1808,
547
  "step": 550
548
  },
549
  {
550
- "epoch": 17.77777777777778,
551
- "grad_norm": 6.454930305480957,
552
- "learning_rate": 3.046594982078853e-05,
553
- "loss": 0.1799,
554
  "step": 560
555
  },
556
  {
557
- "epoch": 18.0,
558
- "eval_accuracy": 0.8941587618572142,
559
- "eval_loss": 0.32931479811668396,
560
- "eval_runtime": 24.732,
561
- "eval_samples_per_second": 80.988,
562
- "eval_steps_per_second": 1.294,
563
- "step": 567
564
- },
565
- {
566
- "epoch": 18.095238095238095,
567
- "grad_norm": 6.038789749145508,
568
- "learning_rate": 3.0017921146953403e-05,
569
- "loss": 0.1544,
570
  "step": 570
571
  },
572
  {
573
- "epoch": 18.41269841269841,
574
- "grad_norm": 6.287538051605225,
575
- "learning_rate": 2.9569892473118284e-05,
576
- "loss": 0.1709,
577
  "step": 580
578
  },
579
  {
580
- "epoch": 18.73015873015873,
581
- "grad_norm": 6.68491268157959,
582
- "learning_rate": 2.9121863799283154e-05,
583
- "loss": 0.1562,
584
  "step": 590
585
  },
586
  {
587
- "epoch": 18.984126984126984,
588
- "eval_accuracy": 0.8991512730903645,
589
- "eval_loss": 0.3203687369823456,
590
- "eval_runtime": 24.7468,
591
- "eval_samples_per_second": 80.94,
592
- "eval_steps_per_second": 1.293,
593
- "step": 598
594
- },
595
- {
596
- "epoch": 19.047619047619047,
597
- "grad_norm": 8.029606819152832,
598
- "learning_rate": 2.8673835125448028e-05,
599
- "loss": 0.1582,
600
  "step": 600
601
  },
602
  {
603
- "epoch": 19.365079365079364,
604
- "grad_norm": 5.551252365112305,
605
- "learning_rate": 2.822580645161291e-05,
606
- "loss": 0.1585,
607
  "step": 610
608
  },
609
  {
610
- "epoch": 19.682539682539684,
611
- "grad_norm": 4.9365997314453125,
612
- "learning_rate": 2.777777777777778e-05,
613
- "loss": 0.1618,
614
  "step": 620
615
  },
616
  {
617
- "epoch": 20.0,
618
- "grad_norm": 6.0366997718811035,
619
- "learning_rate": 2.7329749103942653e-05,
620
- "loss": 0.1582,
621
  "step": 630
622
  },
623
  {
624
- "epoch": 20.0,
625
- "eval_accuracy": 0.8836744882675986,
626
- "eval_loss": 0.3466728925704956,
627
- "eval_runtime": 24.9872,
628
- "eval_samples_per_second": 80.161,
629
- "eval_steps_per_second": 1.281,
630
- "step": 630
631
- },
632
- {
633
- "epoch": 20.317460317460316,
634
- "grad_norm": 8.681838989257812,
635
- "learning_rate": 2.6881720430107527e-05,
636
- "loss": 0.1645,
637
  "step": 640
638
  },
639
  {
640
- "epoch": 20.634920634920636,
641
- "grad_norm": 7.834855556488037,
642
- "learning_rate": 2.6433691756272404e-05,
643
- "loss": 0.1509,
644
  "step": 650
645
  },
646
  {
647
- "epoch": 20.952380952380953,
648
- "grad_norm": 5.852224349975586,
649
- "learning_rate": 2.5985663082437278e-05,
650
- "loss": 0.1516,
651
  "step": 660
652
  },
653
  {
654
- "epoch": 20.984126984126984,
655
- "eval_accuracy": 0.8941587618572142,
656
- "eval_loss": 0.32471397519111633,
657
- "eval_runtime": 25.4422,
658
- "eval_samples_per_second": 78.727,
659
- "eval_steps_per_second": 1.258,
660
- "step": 661
661
- },
662
- {
663
- "epoch": 21.26984126984127,
664
- "grad_norm": 5.681866645812988,
665
- "learning_rate": 2.5537634408602152e-05,
666
- "loss": 0.1411,
667
  "step": 670
668
  },
669
  {
670
- "epoch": 21.58730158730159,
671
- "grad_norm": 4.268200874328613,
672
- "learning_rate": 2.5089605734767026e-05,
673
- "loss": 0.131,
674
  "step": 680
675
  },
676
  {
677
- "epoch": 21.904761904761905,
678
- "grad_norm": 4.7456488609313965,
679
- "learning_rate": 2.46415770609319e-05,
680
- "loss": 0.1285,
681
  "step": 690
682
  },
683
  {
684
- "epoch": 22.0,
685
- "eval_accuracy": 0.891163255117324,
686
- "eval_loss": 0.33040687441825867,
687
- "eval_runtime": 24.24,
688
- "eval_samples_per_second": 82.632,
689
- "eval_steps_per_second": 1.32,
690
- "step": 693
691
- },
692
- {
693
- "epoch": 22.22222222222222,
694
- "grad_norm": 5.077579021453857,
695
- "learning_rate": 2.4193548387096777e-05,
696
- "loss": 0.1238,
697
  "step": 700
698
  },
699
  {
700
- "epoch": 22.53968253968254,
701
- "grad_norm": 5.799654483795166,
702
- "learning_rate": 2.374551971326165e-05,
703
- "loss": 0.1202,
704
  "step": 710
705
  },
706
  {
707
- "epoch": 22.857142857142858,
708
- "grad_norm": 4.615148544311523,
709
- "learning_rate": 2.3297491039426525e-05,
710
- "loss": 0.1454,
711
  "step": 720
712
  },
713
  {
714
- "epoch": 22.984126984126984,
715
- "eval_accuracy": 0.8956565152271593,
716
- "eval_loss": 0.30313894152641296,
717
- "eval_runtime": 24.5031,
718
- "eval_samples_per_second": 81.745,
719
- "eval_steps_per_second": 1.306,
720
- "step": 724
721
- },
722
- {
723
- "epoch": 23.174603174603174,
724
- "grad_norm": 4.609182834625244,
725
- "learning_rate": 2.28494623655914e-05,
726
- "loss": 0.1063,
727
  "step": 730
728
  },
729
  {
730
- "epoch": 23.49206349206349,
731
- "grad_norm": 6.323965549468994,
732
- "learning_rate": 2.2401433691756272e-05,
733
- "loss": 0.1177,
734
  "step": 740
735
  },
736
  {
737
- "epoch": 23.80952380952381,
738
- "grad_norm": 8.178818702697754,
739
- "learning_rate": 2.1953405017921146e-05,
740
- "loss": 0.1548,
741
  "step": 750
742
  },
743
  {
744
- "epoch": 24.0,
745
- "eval_accuracy": 0.8991512730903645,
746
- "eval_loss": 0.30859947204589844,
747
- "eval_runtime": 24.3496,
748
- "eval_samples_per_second": 82.26,
749
- "eval_steps_per_second": 1.314,
750
- "step": 756
751
  },
752
  {
753
- "epoch": 24.126984126984127,
754
- "grad_norm": 5.583525657653809,
755
- "learning_rate": 2.1505376344086024e-05,
756
- "loss": 0.1293,
757
- "step": 760
 
 
758
  },
759
  {
760
- "epoch": 24.444444444444443,
761
- "grad_norm": 3.8900668621063232,
762
- "learning_rate": 2.1057347670250897e-05,
763
- "loss": 0.1229,
764
  "step": 770
765
  },
766
  {
767
- "epoch": 24.761904761904763,
768
- "grad_norm": 5.418626308441162,
769
- "learning_rate": 2.060931899641577e-05,
770
- "loss": 0.1041,
771
  "step": 780
772
  },
773
  {
774
- "epoch": 24.984126984126984,
775
- "eval_accuracy": 0.9021467798302546,
776
- "eval_loss": 0.29453128576278687,
777
- "eval_runtime": 24.6344,
778
- "eval_samples_per_second": 81.309,
779
- "eval_steps_per_second": 1.299,
780
- "step": 787
781
- },
782
- {
783
- "epoch": 25.07936507936508,
784
- "grad_norm": 3.8356785774230957,
785
- "learning_rate": 2.0161290322580645e-05,
786
- "loss": 0.1131,
787
  "step": 790
788
  },
789
  {
790
- "epoch": 25.396825396825395,
791
- "grad_norm": 5.460204124450684,
792
- "learning_rate": 1.9713261648745522e-05,
793
- "loss": 0.1192,
794
  "step": 800
795
  },
796
  {
797
- "epoch": 25.714285714285715,
798
- "grad_norm": 4.820821285247803,
799
- "learning_rate": 1.9265232974910393e-05,
800
- "loss": 0.1161,
801
  "step": 810
802
  },
803
  {
804
- "epoch": 26.0,
805
- "eval_accuracy": 0.91063404892661,
806
- "eval_loss": 0.2968132495880127,
807
- "eval_runtime": 24.1187,
808
- "eval_samples_per_second": 83.047,
809
- "eval_steps_per_second": 1.327,
810
- "step": 819
811
- },
812
- {
813
- "epoch": 26.03174603174603,
814
- "grad_norm": 8.177849769592285,
815
- "learning_rate": 1.881720430107527e-05,
816
- "loss": 0.1178,
817
  "step": 820
818
  },
819
  {
820
- "epoch": 26.349206349206348,
821
- "grad_norm": 6.220778942108154,
822
- "learning_rate": 1.8369175627240144e-05,
823
- "loss": 0.1076,
824
  "step": 830
825
  },
826
  {
827
- "epoch": 26.666666666666668,
828
- "grad_norm": 7.938710689544678,
829
- "learning_rate": 1.7921146953405018e-05,
830
- "loss": 0.1146,
831
  "step": 840
832
  },
833
  {
834
- "epoch": 26.984126984126984,
835
- "grad_norm": 5.643862247467041,
836
- "learning_rate": 1.7473118279569895e-05,
837
- "loss": 0.1141,
838
- "step": 850
839
- },
840
- {
841
- "epoch": 26.984126984126984,
842
- "eval_accuracy": 0.90963554667998,
843
- "eval_loss": 0.2805466949939728,
844
- "eval_runtime": 24.6132,
845
- "eval_samples_per_second": 81.379,
846
- "eval_steps_per_second": 1.3,
847
  "step": 850
848
  },
849
  {
850
- "epoch": 27.3015873015873,
851
- "grad_norm": 4.576446533203125,
852
- "learning_rate": 1.702508960573477e-05,
853
- "loss": 0.1126,
854
  "step": 860
855
  },
856
  {
857
- "epoch": 27.61904761904762,
858
- "grad_norm": 3.556596279144287,
859
- "learning_rate": 1.6577060931899643e-05,
860
- "loss": 0.1058,
861
  "step": 870
862
  },
863
  {
864
- "epoch": 27.936507936507937,
865
- "grad_norm": 6.387381553649902,
866
- "learning_rate": 1.6129032258064517e-05,
867
- "loss": 0.1078,
868
  "step": 880
869
  },
870
  {
871
- "epoch": 28.0,
872
- "eval_accuracy": 0.9011482775836246,
873
- "eval_loss": 0.31776800751686096,
874
- "eval_runtime": 24.1357,
875
- "eval_samples_per_second": 82.989,
876
- "eval_steps_per_second": 1.326,
877
- "step": 882
878
- },
879
- {
880
- "epoch": 28.253968253968253,
881
- "grad_norm": 3.115154266357422,
882
- "learning_rate": 1.568100358422939e-05,
883
- "loss": 0.0964,
884
  "step": 890
885
  },
886
  {
887
- "epoch": 28.571428571428573,
888
- "grad_norm": 5.499573707580566,
889
- "learning_rate": 1.5232974910394265e-05,
890
- "loss": 0.1023,
891
  "step": 900
892
  },
893
  {
894
- "epoch": 28.88888888888889,
895
- "grad_norm": 6.842806816101074,
896
- "learning_rate": 1.4784946236559142e-05,
897
- "loss": 0.1192,
898
  "step": 910
899
  },
900
  {
901
- "epoch": 28.984126984126984,
902
- "eval_accuracy": 0.9041437843235147,
903
- "eval_loss": 0.3182041049003601,
904
- "eval_runtime": 24.1925,
905
- "eval_samples_per_second": 82.794,
906
- "eval_steps_per_second": 1.323,
907
- "step": 913
908
- },
909
- {
910
- "epoch": 29.206349206349206,
911
- "grad_norm": 6.668429374694824,
912
- "learning_rate": 1.4336917562724014e-05,
913
- "loss": 0.1,
914
  "step": 920
915
  },
916
  {
917
- "epoch": 29.523809523809526,
918
- "grad_norm": 4.984645843505859,
919
- "learning_rate": 1.388888888888889e-05,
920
- "loss": 0.0921,
921
  "step": 930
922
  },
923
  {
924
- "epoch": 29.841269841269842,
925
- "grad_norm": 5.513997554779053,
926
- "learning_rate": 1.3440860215053763e-05,
927
- "loss": 0.0977,
928
  "step": 940
929
  },
930
  {
931
- "epoch": 30.0,
932
- "eval_accuracy": 0.9061407888167748,
933
- "eval_loss": 0.29999998211860657,
934
- "eval_runtime": 24.4501,
935
- "eval_samples_per_second": 81.922,
936
- "eval_steps_per_second": 1.309,
937
- "step": 945
938
- },
939
- {
940
- "epoch": 30.158730158730158,
941
- "grad_norm": 4.359825611114502,
942
- "learning_rate": 1.2992831541218639e-05,
943
- "loss": 0.0855,
944
  "step": 950
945
  },
946
  {
947
- "epoch": 30.476190476190474,
948
- "grad_norm": 5.671274662017822,
949
- "learning_rate": 1.2544802867383513e-05,
950
- "loss": 0.0911,
951
  "step": 960
952
  },
953
  {
954
- "epoch": 30.793650793650794,
955
- "grad_norm": 8.433524131774902,
956
- "learning_rate": 1.2096774193548388e-05,
957
- "loss": 0.1011,
958
  "step": 970
959
  },
960
  {
961
- "epoch": 30.984126984126984,
962
- "eval_accuracy": 0.9041437843235147,
963
- "eval_loss": 0.3065091669559479,
964
- "eval_runtime": 36.3982,
965
- "eval_samples_per_second": 55.03,
966
- "eval_steps_per_second": 0.879,
967
- "step": 976
968
- },
969
- {
970
- "epoch": 31.11111111111111,
971
- "grad_norm": 5.782038688659668,
972
- "learning_rate": 1.1648745519713262e-05,
973
- "loss": 0.0967,
974
  "step": 980
975
  },
976
  {
977
- "epoch": 31.428571428571427,
978
- "grad_norm": 5.8083271980285645,
979
- "learning_rate": 1.1200716845878136e-05,
980
- "loss": 0.0983,
981
  "step": 990
982
  },
983
  {
984
- "epoch": 31.746031746031747,
985
- "grad_norm": 4.4911675453186035,
986
- "learning_rate": 1.0752688172043012e-05,
987
- "loss": 0.0865,
988
  "step": 1000
989
  },
990
  {
991
- "epoch": 32.0,
992
- "eval_accuracy": 0.9051422865701447,
993
- "eval_loss": 0.3192664086818695,
994
- "eval_runtime": 24.8956,
995
- "eval_samples_per_second": 80.456,
996
- "eval_steps_per_second": 1.285,
997
- "step": 1008
998
- },
999
- {
1000
- "epoch": 32.06349206349206,
1001
- "grad_norm": 4.185471534729004,
1002
- "learning_rate": 1.0304659498207886e-05,
1003
- "loss": 0.0821,
1004
  "step": 1010
1005
  },
1006
  {
1007
- "epoch": 32.38095238095238,
1008
- "grad_norm": 4.876070499420166,
1009
- "learning_rate": 9.856630824372761e-06,
1010
- "loss": 0.0843,
1011
  "step": 1020
1012
  },
1013
  {
1014
- "epoch": 32.698412698412696,
1015
- "grad_norm": 5.560541152954102,
1016
- "learning_rate": 9.408602150537635e-06,
1017
- "loss": 0.0845,
 
 
 
 
 
 
 
 
 
1018
  "step": 1030
1019
  },
1020
  {
1021
- "epoch": 32.98412698412698,
1022
- "eval_accuracy": 0.9121318022965552,
1023
- "eval_loss": 0.30474382638931274,
1024
- "eval_runtime": 24.6177,
1025
- "eval_samples_per_second": 81.364,
1026
- "eval_steps_per_second": 1.3,
1027
- "step": 1039
1028
- },
1029
- {
1030
- "epoch": 33.01587301587302,
1031
- "grad_norm": 3.8526599407196045,
1032
- "learning_rate": 8.960573476702509e-06,
1033
- "loss": 0.0979,
1034
  "step": 1040
1035
  },
1036
  {
1037
- "epoch": 33.333333333333336,
1038
- "grad_norm": 5.010209083557129,
1039
- "learning_rate": 8.512544802867385e-06,
1040
- "loss": 0.0751,
1041
  "step": 1050
1042
  },
1043
  {
1044
- "epoch": 33.65079365079365,
1045
- "grad_norm": 4.15861701965332,
1046
- "learning_rate": 8.064516129032258e-06,
1047
- "loss": 0.09,
1048
  "step": 1060
1049
  },
1050
  {
1051
- "epoch": 33.96825396825397,
1052
- "grad_norm": 5.510195255279541,
1053
- "learning_rate": 7.616487455197132e-06,
1054
- "loss": 0.0823,
1055
  "step": 1070
1056
  },
1057
  {
1058
- "epoch": 34.0,
1059
- "eval_accuracy": 0.9116325511732402,
1060
- "eval_loss": 0.3036876320838928,
1061
- "eval_runtime": 24.6681,
1062
- "eval_samples_per_second": 81.198,
1063
- "eval_steps_per_second": 1.297,
1064
- "step": 1071
1065
- },
1066
- {
1067
- "epoch": 34.285714285714285,
1068
- "grad_norm": 4.8823771476745605,
1069
- "learning_rate": 7.168458781362007e-06,
1070
- "loss": 0.0866,
1071
  "step": 1080
1072
  },
1073
  {
1074
- "epoch": 34.6031746031746,
1075
- "grad_norm": 4.3600006103515625,
1076
- "learning_rate": 6.720430107526882e-06,
1077
- "loss": 0.0784,
1078
  "step": 1090
1079
  },
1080
  {
1081
- "epoch": 34.92063492063492,
1082
- "grad_norm": 5.2714009284973145,
1083
- "learning_rate": 6.2724014336917564e-06,
1084
- "loss": 0.0809,
1085
  "step": 1100
1086
  },
1087
  {
1088
- "epoch": 34.98412698412698,
1089
- "eval_accuracy": 0.9011482775836246,
1090
- "eval_loss": 0.3329264521598816,
1091
- "eval_runtime": 24.4806,
1092
- "eval_samples_per_second": 81.82,
1093
- "eval_steps_per_second": 1.307,
1094
- "step": 1102
1095
- },
1096
- {
1097
- "epoch": 35.23809523809524,
1098
- "grad_norm": 3.9122490882873535,
1099
- "learning_rate": 5.824372759856631e-06,
1100
- "loss": 0.0704,
1101
  "step": 1110
1102
  },
1103
  {
1104
- "epoch": 35.55555555555556,
1105
- "grad_norm": 4.517535209655762,
1106
- "learning_rate": 5.376344086021506e-06,
1107
- "loss": 0.0778,
1108
  "step": 1120
1109
  },
1110
  {
1111
- "epoch": 35.87301587301587,
1112
- "grad_norm": 4.29276704788208,
1113
- "learning_rate": 4.928315412186381e-06,
1114
- "loss": 0.0789,
1115
  "step": 1130
1116
  },
1117
  {
1118
- "epoch": 36.0,
1119
- "eval_accuracy": 0.9121318022965552,
1120
- "eval_loss": 0.3215394914150238,
1121
- "eval_runtime": 24.2645,
1122
- "eval_samples_per_second": 82.549,
1123
- "eval_steps_per_second": 1.319,
1124
- "step": 1134
1125
- },
1126
- {
1127
- "epoch": 36.19047619047619,
1128
- "grad_norm": 3.785637140274048,
1129
- "learning_rate": 4.4802867383512545e-06,
1130
- "loss": 0.0764,
1131
  "step": 1140
1132
  },
1133
  {
1134
- "epoch": 36.507936507936506,
1135
- "grad_norm": 5.800527572631836,
1136
- "learning_rate": 4.032258064516129e-06,
1137
- "loss": 0.0735,
1138
  "step": 1150
1139
  },
1140
  {
1141
- "epoch": 36.82539682539682,
1142
- "grad_norm": 3.3133223056793213,
1143
- "learning_rate": 3.5842293906810035e-06,
1144
- "loss": 0.0724,
1145
  "step": 1160
1146
  },
1147
  {
1148
- "epoch": 36.98412698412698,
1149
- "eval_accuracy": 0.90963554667998,
1150
- "eval_loss": 0.3272792100906372,
1151
- "eval_runtime": 24.4675,
1152
- "eval_samples_per_second": 81.864,
1153
- "eval_steps_per_second": 1.308,
1154
- "step": 1165
1155
- },
1156
- {
1157
- "epoch": 37.142857142857146,
1158
- "grad_norm": 5.3057050704956055,
1159
- "learning_rate": 3.1362007168458782e-06,
1160
- "loss": 0.0763,
1161
  "step": 1170
1162
  },
1163
  {
1164
- "epoch": 37.46031746031746,
1165
- "grad_norm": 3.7078752517700195,
1166
- "learning_rate": 2.688172043010753e-06,
1167
- "loss": 0.0757,
1168
  "step": 1180
1169
  },
1170
  {
1171
- "epoch": 37.77777777777778,
1172
- "grad_norm": 4.193051338195801,
1173
- "learning_rate": 2.2401433691756272e-06,
1174
- "loss": 0.0722,
1175
  "step": 1190
1176
  },
1177
  {
1178
- "epoch": 38.0,
1179
- "eval_accuracy": 0.909136295556665,
1180
- "eval_loss": 0.30959054827690125,
1181
- "eval_runtime": 24.6671,
1182
- "eval_samples_per_second": 81.201,
1183
- "eval_steps_per_second": 1.297,
1184
- "step": 1197
1185
- },
1186
- {
1187
- "epoch": 38.095238095238095,
1188
- "grad_norm": 4.277989387512207,
1189
- "learning_rate": 1.7921146953405017e-06,
1190
- "loss": 0.0793,
1191
  "step": 1200
1192
  },
1193
  {
1194
- "epoch": 38.41269841269841,
1195
- "grad_norm": 5.62402868270874,
1196
- "learning_rate": 1.3440860215053765e-06,
1197
- "loss": 0.0737,
1198
  "step": 1210
1199
  },
1200
  {
1201
- "epoch": 38.73015873015873,
1202
- "grad_norm": 2.6461453437805176,
1203
- "learning_rate": 8.960573476702509e-07,
1204
- "loss": 0.0811,
1205
  "step": 1220
1206
  },
1207
  {
1208
- "epoch": 38.98412698412698,
1209
- "eval_accuracy": 0.9126310534198702,
1210
- "eval_loss": 0.320569783449173,
1211
- "eval_runtime": 24.7673,
1212
- "eval_samples_per_second": 80.873,
1213
- "eval_steps_per_second": 1.292,
1214
- "step": 1228
1215
- },
1216
- {
1217
- "epoch": 39.04761904761905,
1218
- "grad_norm": 4.357304573059082,
1219
- "learning_rate": 4.4802867383512544e-07,
1220
- "loss": 0.0855,
1221
  "step": 1230
1222
  },
1223
  {
1224
- "epoch": 39.36507936507937,
1225
- "grad_norm": 3.4464972019195557,
1226
- "learning_rate": 0.0,
1227
- "loss": 0.0659,
1228
  "step": 1240
1229
  },
1230
  {
1231
- "epoch": 39.36507936507937,
1232
- "eval_accuracy": 0.9126310534198702,
1233
- "eval_loss": 0.32159659266471863,
1234
- "eval_runtime": 25.3948,
1235
- "eval_samples_per_second": 78.874,
1236
- "eval_steps_per_second": 1.26,
1237
- "step": 1240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1238
  },
1239
  {
1240
- "epoch": 39.36507936507937,
1241
- "step": 1240,
1242
- "total_flos": 7.840972523253768e+18,
1243
- "train_loss": 0.24690783341084757,
1244
- "train_runtime": 6145.2319,
1245
- "train_samples_per_second": 52.151,
1246
- "train_steps_per_second": 0.202
1247
  }
1248
  ],
1249
  "logging_steps": 10,
1250
- "max_steps": 1240,
1251
  "num_input_tokens_seen": 0,
1252
- "num_train_epochs": 40,
1253
  "save_steps": 500,
1254
  "stateful_callbacks": {
1255
  "TrainerControl": {
@@ -1263,7 +969,7 @@
1263
  "attributes": {}
1264
  }
1265
  },
1266
- "total_flos": 7.840972523253768e+18,
1267
  "train_batch_size": 64,
1268
  "trial_name": null,
1269
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.994671729544341,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-255",
4
+ "epoch": 4.995102840352596,
5
  "eval_steps": 500,
6
+ "global_step": 1275,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.039177277179236046,
13
+ "grad_norm": 9.329336166381836,
14
+ "learning_rate": 3.90625e-06,
15
+ "loss": 0.7,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.07835455435847209,
20
+ "grad_norm": 4.470575332641602,
21
+ "learning_rate": 7.8125e-06,
22
+ "loss": 0.4115,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.11753183153770813,
27
+ "grad_norm": 1.0650607347488403,
28
+ "learning_rate": 1.171875e-05,
29
+ "loss": 0.1272,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.15670910871694418,
34
+ "grad_norm": 0.7108937501907349,
35
+ "learning_rate": 1.5625e-05,
36
+ "loss": 0.0479,
 
 
 
 
 
 
 
 
 
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.1958863858961802,
41
+ "grad_norm": 1.0625723600387573,
42
+ "learning_rate": 1.953125e-05,
43
+ "loss": 0.0359,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.23506366307541626,
48
+ "grad_norm": 0.1246357336640358,
49
+ "learning_rate": 2.34375e-05,
50
+ "loss": 0.0227,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.2742409402546523,
55
+ "grad_norm": 0.5148888826370239,
56
+ "learning_rate": 2.734375e-05,
57
+ "loss": 0.0295,
 
 
 
 
 
 
 
 
 
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.31341821743388837,
62
+ "grad_norm": 0.9743920564651489,
63
+ "learning_rate": 3.125e-05,
64
+ "loss": 0.0294,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.3525954946131244,
69
+ "grad_norm": 0.31461623311042786,
70
+ "learning_rate": 3.5156250000000004e-05,
71
+ "loss": 0.0289,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.3917727717923604,
76
+ "grad_norm": 1.0115059614181519,
77
+ "learning_rate": 3.90625e-05,
78
+ "loss": 0.0391,
 
 
 
 
 
 
 
 
 
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.4309500489715965,
83
+ "grad_norm": 0.08507003635168076,
84
+ "learning_rate": 4.2968750000000004e-05,
85
+ "loss": 0.018,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.4701273261508325,
90
+ "grad_norm": 0.1573338657617569,
91
+ "learning_rate": 4.6875e-05,
92
+ "loss": 0.0214,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.5093046033300686,
97
+ "grad_norm": 0.4784347712993622,
98
+ "learning_rate": 4.9912816041848304e-05,
99
+ "loss": 0.0264,
 
 
 
 
 
 
 
 
 
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.5484818805093046,
104
+ "grad_norm": 0.2329588085412979,
105
+ "learning_rate": 4.94768962510898e-05,
106
+ "loss": 0.0277,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.5876591576885406,
111
+ "grad_norm": 0.05806314945220947,
112
+ "learning_rate": 4.90409764603313e-05,
113
+ "loss": 0.0221,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.6268364348677767,
118
+ "grad_norm": 1.9614442586898804,
119
+ "learning_rate": 4.86050566695728e-05,
120
+ "loss": 0.0311,
 
 
 
 
 
 
 
 
 
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 0.6660137120470128,
125
+ "grad_norm": 1.0261849164962769,
126
+ "learning_rate": 4.8169136878814306e-05,
127
+ "loss": 0.0334,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 0.7051909892262488,
132
+ "grad_norm": 0.30455902218818665,
133
+ "learning_rate": 4.7733217088055796e-05,
134
+ "loss": 0.0271,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.7443682664054848,
139
+ "grad_norm": 0.38795918226242065,
140
+ "learning_rate": 4.72972972972973e-05,
141
+ "loss": 0.018,
 
 
 
 
 
 
 
 
 
142
  "step": 190
143
  },
144
  {
145
+ "epoch": 0.7835455435847208,
146
+ "grad_norm": 0.25275129079818726,
147
+ "learning_rate": 4.68613775065388e-05,
148
+ "loss": 0.0331,
149
  "step": 200
150
  },
151
  {
152
+ "epoch": 0.8227228207639569,
153
+ "grad_norm": 0.15323331952095032,
154
+ "learning_rate": 4.642545771578029e-05,
155
+ "loss": 0.0242,
156
  "step": 210
157
  },
158
  {
159
+ "epoch": 0.861900097943193,
160
+ "grad_norm": 0.4740907847881317,
161
+ "learning_rate": 4.59895379250218e-05,
162
+ "loss": 0.0232,
 
 
 
 
 
 
 
 
 
163
  "step": 220
164
  },
165
  {
166
+ "epoch": 0.901077375122429,
167
+ "grad_norm": 0.5308133363723755,
168
+ "learning_rate": 4.55536181342633e-05,
169
+ "loss": 0.0177,
170
  "step": 230
171
  },
172
  {
173
+ "epoch": 0.940254652301665,
174
+ "grad_norm": 0.985102653503418,
175
+ "learning_rate": 4.51176983435048e-05,
176
+ "loss": 0.0301,
177
  "step": 240
178
  },
179
  {
180
+ "epoch": 0.9794319294809011,
181
+ "grad_norm": 0.6651228070259094,
182
+ "learning_rate": 4.4681778552746294e-05,
183
+ "loss": 0.0268,
184
  "step": 250
185
  },
186
  {
187
+ "epoch": 0.9990205680705191,
188
+ "eval_accuracy": 0.994671729544341,
189
+ "eval_loss": 0.025641364976763725,
190
+ "eval_runtime": 164.878,
191
+ "eval_samples_per_second": 99.031,
192
+ "eval_steps_per_second": 1.553,
193
+ "step": 255
194
  },
195
  {
196
+ "epoch": 1.018609206660137,
197
+ "grad_norm": 0.157534658908844,
198
+ "learning_rate": 4.42458587619878e-05,
199
+ "loss": 0.0175,
200
  "step": 260
201
  },
202
  {
203
+ "epoch": 1.0577864838393731,
204
+ "grad_norm": 0.3950434923171997,
205
+ "learning_rate": 4.3809938971229295e-05,
206
+ "loss": 0.0205,
207
  "step": 270
208
  },
209
  {
210
+ "epoch": 1.0969637610186092,
211
+ "grad_norm": 0.771115243434906,
212
+ "learning_rate": 4.337401918047079e-05,
213
+ "loss": 0.032,
214
  "step": 280
215
  },
216
  {
217
+ "epoch": 1.1361410381978452,
218
+ "grad_norm": 0.24406306445598602,
219
+ "learning_rate": 4.2938099389712295e-05,
220
+ "loss": 0.0261,
 
 
 
 
 
 
 
 
 
221
  "step": 290
222
  },
223
  {
224
+ "epoch": 1.1753183153770812,
225
+ "grad_norm": 0.5867096781730652,
226
+ "learning_rate": 4.250217959895379e-05,
227
+ "loss": 0.0297,
228
  "step": 300
229
  },
230
  {
231
+ "epoch": 1.2144955925563172,
232
+ "grad_norm": 0.3914402723312378,
233
+ "learning_rate": 4.2066259808195296e-05,
234
+ "loss": 0.0245,
235
  "step": 310
236
  },
237
  {
238
+ "epoch": 1.2536728697355533,
239
+ "grad_norm": 0.2989954352378845,
240
+ "learning_rate": 4.163034001743679e-05,
241
+ "loss": 0.0275,
 
 
 
 
 
 
 
 
 
242
  "step": 320
243
  },
244
  {
245
+ "epoch": 1.2928501469147895,
246
+ "grad_norm": 0.46654826402664185,
247
+ "learning_rate": 4.119442022667829e-05,
248
+ "loss": 0.031,
249
  "step": 330
250
  },
251
  {
252
+ "epoch": 1.3320274240940255,
253
+ "grad_norm": 0.17611253261566162,
254
+ "learning_rate": 4.0758500435919793e-05,
255
+ "loss": 0.0247,
256
  "step": 340
257
  },
258
  {
259
+ "epoch": 1.3712047012732616,
260
+ "grad_norm": 0.31176942586898804,
261
+ "learning_rate": 4.032258064516129e-05,
262
+ "loss": 0.0242,
 
 
 
 
 
 
 
 
 
263
  "step": 350
264
  },
265
  {
266
+ "epoch": 1.4103819784524976,
267
+ "grad_norm": 0.5007725954055786,
268
+ "learning_rate": 3.9886660854402794e-05,
269
+ "loss": 0.033,
270
  "step": 360
271
  },
272
  {
273
+ "epoch": 1.4495592556317336,
274
+ "grad_norm": 0.6336463093757629,
275
+ "learning_rate": 3.945074106364429e-05,
276
+ "loss": 0.0127,
277
  "step": 370
278
  },
279
  {
280
+ "epoch": 1.4887365328109696,
281
+ "grad_norm": 0.349540650844574,
282
+ "learning_rate": 3.9014821272885795e-05,
283
+ "loss": 0.0327,
 
 
 
 
 
 
 
 
 
284
  "step": 380
285
  },
286
  {
287
+ "epoch": 1.5279138099902057,
288
+ "grad_norm": 0.48749464750289917,
289
+ "learning_rate": 3.857890148212729e-05,
290
+ "loss": 0.0247,
291
  "step": 390
292
  },
293
  {
294
+ "epoch": 1.5670910871694417,
295
+ "grad_norm": 0.21512505412101746,
296
+ "learning_rate": 3.814298169136879e-05,
297
+ "loss": 0.0244,
298
  "step": 400
299
  },
300
  {
301
+ "epoch": 1.606268364348678,
302
+ "grad_norm": 0.1843574196100235,
303
+ "learning_rate": 3.770706190061029e-05,
304
+ "loss": 0.0238,
 
 
 
 
 
 
 
 
 
305
  "step": 410
306
  },
307
  {
308
+ "epoch": 1.645445641527914,
309
+ "grad_norm": 0.17209237813949585,
310
+ "learning_rate": 3.727114210985179e-05,
311
+ "loss": 0.0159,
312
  "step": 420
313
  },
314
  {
315
+ "epoch": 1.68462291870715,
316
+ "grad_norm": 0.17134279012680054,
317
+ "learning_rate": 3.6835222319093286e-05,
318
+ "loss": 0.0147,
319
  "step": 430
320
  },
321
  {
322
+ "epoch": 1.723800195886386,
323
+ "grad_norm": 0.3783511519432068,
324
+ "learning_rate": 3.639930252833479e-05,
325
+ "loss": 0.026,
326
  "step": 440
327
  },
328
  {
329
+ "epoch": 1.762977473065622,
330
+ "grad_norm": 0.20875829458236694,
331
+ "learning_rate": 3.596338273757629e-05,
332
+ "loss": 0.0169,
 
 
 
 
 
 
 
 
 
333
  "step": 450
334
  },
335
  {
336
+ "epoch": 1.802154750244858,
337
+ "grad_norm": 0.061811015009880066,
338
+ "learning_rate": 3.552746294681778e-05,
339
+ "loss": 0.0115,
340
  "step": 460
341
  },
342
  {
343
+ "epoch": 1.841332027424094,
344
+ "grad_norm": 0.5182923078536987,
345
+ "learning_rate": 3.509154315605929e-05,
346
+ "loss": 0.0217,
347
  "step": 470
348
  },
349
  {
350
+ "epoch": 1.88050930460333,
351
+ "grad_norm": 0.3887523114681244,
352
+ "learning_rate": 3.465562336530079e-05,
353
+ "loss": 0.0189,
 
 
 
 
 
 
 
 
 
354
  "step": 480
355
  },
356
  {
357
+ "epoch": 1.9196865817825661,
358
+ "grad_norm": 0.13711315393447876,
359
+ "learning_rate": 3.421970357454228e-05,
360
+ "loss": 0.0132,
361
  "step": 490
362
  },
363
  {
364
+ "epoch": 1.9588638589618022,
365
+ "grad_norm": 0.3663609027862549,
366
+ "learning_rate": 3.3783783783783784e-05,
367
+ "loss": 0.025,
368
  "step": 500
369
  },
370
  {
371
+ "epoch": 1.9980411361410382,
372
+ "grad_norm": 0.28243473172187805,
373
+ "learning_rate": 3.334786399302529e-05,
374
+ "loss": 0.0167,
375
+ "step": 510
 
 
376
  },
377
  {
378
+ "epoch": 1.9980411361410382,
379
+ "eval_accuracy": 0.994671729544341,
380
+ "eval_loss": 0.027501454576849937,
381
+ "eval_runtime": 109.9626,
382
+ "eval_samples_per_second": 148.487,
383
+ "eval_steps_per_second": 2.328,
384
  "step": 510
385
  },
386
  {
387
+ "epoch": 2.037218413320274,
388
+ "grad_norm": 0.20075927674770355,
389
+ "learning_rate": 3.2911944202266785e-05,
390
+ "loss": 0.0189,
391
  "step": 520
392
  },
393
  {
394
+ "epoch": 2.0763956904995102,
395
+ "grad_norm": 0.5090253353118896,
396
+ "learning_rate": 3.247602441150828e-05,
397
+ "loss": 0.0158,
398
  "step": 530
399
  },
400
  {
401
+ "epoch": 2.1155729676787463,
402
+ "grad_norm": 0.20879769325256348,
403
+ "learning_rate": 3.2040104620749785e-05,
404
+ "loss": 0.0198,
 
 
 
 
 
 
 
 
 
405
  "step": 540
406
  },
407
  {
408
+ "epoch": 2.1547502448579823,
409
+ "grad_norm": 0.7945526242256165,
410
+ "learning_rate": 3.160418482999128e-05,
411
+ "loss": 0.0244,
412
  "step": 550
413
  },
414
  {
415
+ "epoch": 2.1939275220372183,
416
+ "grad_norm": 0.49813878536224365,
417
+ "learning_rate": 3.116826503923278e-05,
418
+ "loss": 0.0325,
419
  "step": 560
420
  },
421
  {
422
+ "epoch": 2.2331047992164543,
423
+ "grad_norm": 0.417555034160614,
424
+ "learning_rate": 3.073234524847428e-05,
425
+ "loss": 0.0245,
 
 
 
 
 
 
 
 
 
426
  "step": 570
427
  },
428
  {
429
+ "epoch": 2.2722820763956904,
430
+ "grad_norm": 0.13541927933692932,
431
+ "learning_rate": 3.0296425457715783e-05,
432
+ "loss": 0.0176,
433
  "step": 580
434
  },
435
  {
436
+ "epoch": 2.3114593535749264,
437
+ "grad_norm": 0.22694525122642517,
438
+ "learning_rate": 2.9860505666957283e-05,
439
+ "loss": 0.0199,
440
  "step": 590
441
  },
442
  {
443
+ "epoch": 2.3506366307541624,
444
+ "grad_norm": 0.06209595128893852,
445
+ "learning_rate": 2.942458587619878e-05,
446
+ "loss": 0.0127,
 
 
 
 
 
 
 
 
 
447
  "step": 600
448
  },
449
  {
450
+ "epoch": 2.389813907933399,
451
+ "grad_norm": 0.4733225703239441,
452
+ "learning_rate": 2.898866608544028e-05,
453
+ "loss": 0.0334,
454
  "step": 610
455
  },
456
  {
457
+ "epoch": 2.4289911851126345,
458
+ "grad_norm": 0.37968209385871887,
459
+ "learning_rate": 2.855274629468178e-05,
460
+ "loss": 0.028,
461
  "step": 620
462
  },
463
  {
464
+ "epoch": 2.468168462291871,
465
+ "grad_norm": 0.1550379991531372,
466
+ "learning_rate": 2.8116826503923278e-05,
467
+ "loss": 0.0227,
468
  "step": 630
469
  },
470
  {
471
+ "epoch": 2.5073457394711065,
472
+ "grad_norm": 0.4125171899795532,
473
+ "learning_rate": 2.7680906713164778e-05,
474
+ "loss": 0.0149,
 
 
 
 
 
 
 
 
 
475
  "step": 640
476
  },
477
  {
478
+ "epoch": 2.546523016650343,
479
+ "grad_norm": 0.6104760766029358,
480
+ "learning_rate": 2.724498692240628e-05,
481
+ "loss": 0.0256,
482
  "step": 650
483
  },
484
  {
485
+ "epoch": 2.585700293829579,
486
+ "grad_norm": 0.4356853663921356,
487
+ "learning_rate": 2.6809067131647782e-05,
488
+ "loss": 0.0222,
489
  "step": 660
490
  },
491
  {
492
+ "epoch": 2.624877571008815,
493
+ "grad_norm": 0.3686465322971344,
494
+ "learning_rate": 2.6373147340889275e-05,
495
+ "loss": 0.0213,
 
 
 
 
 
 
 
 
 
496
  "step": 670
497
  },
498
  {
499
+ "epoch": 2.664054848188051,
500
+ "grad_norm": 0.30900245904922485,
501
+ "learning_rate": 2.5937227550130776e-05,
502
+ "loss": 0.0226,
503
  "step": 680
504
  },
505
  {
506
+ "epoch": 2.703232125367287,
507
+ "grad_norm": 0.1763727217912674,
508
+ "learning_rate": 2.550130775937228e-05,
509
+ "loss": 0.0175,
510
  "step": 690
511
  },
512
  {
513
+ "epoch": 2.742409402546523,
514
+ "grad_norm": 0.7033935785293579,
515
+ "learning_rate": 2.5065387968613773e-05,
516
+ "loss": 0.0161,
 
 
 
 
 
 
 
 
 
517
  "step": 700
518
  },
519
  {
520
+ "epoch": 2.781586679725759,
521
+ "grad_norm": 0.4877593219280243,
522
+ "learning_rate": 2.4629468177855277e-05,
523
+ "loss": 0.0245,
524
  "step": 710
525
  },
526
  {
527
+ "epoch": 2.820763956904995,
528
+ "grad_norm": 0.22476495802402496,
529
+ "learning_rate": 2.4193548387096777e-05,
530
+ "loss": 0.0158,
531
  "step": 720
532
  },
533
  {
534
+ "epoch": 2.859941234084231,
535
+ "grad_norm": 0.5083233118057251,
536
+ "learning_rate": 2.3757628596338274e-05,
537
+ "loss": 0.02,
 
 
 
 
 
 
 
 
 
538
  "step": 730
539
  },
540
  {
541
+ "epoch": 2.899118511263467,
542
+ "grad_norm": 0.131087526679039,
543
+ "learning_rate": 2.3321708805579774e-05,
544
+ "loss": 0.021,
545
  "step": 740
546
  },
547
  {
548
+ "epoch": 2.9382957884427032,
549
+ "grad_norm": 0.2617853581905365,
550
+ "learning_rate": 2.2885789014821274e-05,
551
+ "loss": 0.0204,
552
  "step": 750
553
  },
554
  {
555
+ "epoch": 2.9774730656219393,
556
+ "grad_norm": 0.19207285344600677,
557
+ "learning_rate": 2.244986922406277e-05,
558
+ "loss": 0.0177,
559
+ "step": 760
 
 
560
  },
561
  {
562
+ "epoch": 2.997061704211557,
563
+ "eval_accuracy": 0.9935693287604116,
564
+ "eval_loss": 0.026763953268527985,
565
+ "eval_runtime": 112.7695,
566
+ "eval_samples_per_second": 144.791,
567
+ "eval_steps_per_second": 2.27,
568
+ "step": 765
569
  },
570
  {
571
+ "epoch": 3.0166503428011753,
572
+ "grad_norm": 0.7242885231971741,
573
+ "learning_rate": 2.2013949433304275e-05,
574
+ "loss": 0.0206,
575
  "step": 770
576
  },
577
  {
578
+ "epoch": 3.0558276199804113,
579
+ "grad_norm": 0.6204590201377869,
580
+ "learning_rate": 2.1578029642545772e-05,
581
+ "loss": 0.0167,
582
  "step": 780
583
  },
584
  {
585
+ "epoch": 3.0950048971596473,
586
+ "grad_norm": 0.14467577636241913,
587
+ "learning_rate": 2.1142109851787272e-05,
588
+ "loss": 0.0207,
 
 
 
 
 
 
 
 
 
589
  "step": 790
590
  },
591
  {
592
+ "epoch": 3.1341821743388834,
593
+ "grad_norm": 0.3296276032924652,
594
+ "learning_rate": 2.0706190061028772e-05,
595
+ "loss": 0.0177,
596
  "step": 800
597
  },
598
  {
599
+ "epoch": 3.1733594515181194,
600
+ "grad_norm": 0.7694735527038574,
601
+ "learning_rate": 2.0270270270270273e-05,
602
+ "loss": 0.0141,
603
  "step": 810
604
  },
605
  {
606
+ "epoch": 3.2125367286973554,
607
+ "grad_norm": 0.35446447134017944,
608
+ "learning_rate": 1.983435047951177e-05,
609
+ "loss": 0.0206,
 
 
 
 
 
 
 
 
 
610
  "step": 820
611
  },
612
  {
613
+ "epoch": 3.2517140058765914,
614
+ "grad_norm": 1.089401125907898,
615
+ "learning_rate": 1.939843068875327e-05,
616
+ "loss": 0.0326,
617
  "step": 830
618
  },
619
  {
620
+ "epoch": 3.2908912830558275,
621
+ "grad_norm": 0.21801254153251648,
622
+ "learning_rate": 1.896251089799477e-05,
623
+ "loss": 0.0162,
624
  "step": 840
625
  },
626
  {
627
+ "epoch": 3.3300685602350635,
628
+ "grad_norm": 0.42867550253868103,
629
+ "learning_rate": 1.8526591107236267e-05,
630
+ "loss": 0.0195,
 
 
 
 
 
 
 
 
 
631
  "step": 850
632
  },
633
  {
634
+ "epoch": 3.3692458374143,
635
+ "grad_norm": 0.24043036997318268,
636
+ "learning_rate": 1.809067131647777e-05,
637
+ "loss": 0.0155,
638
  "step": 860
639
  },
640
  {
641
+ "epoch": 3.4084231145935355,
642
+ "grad_norm": 0.4469415545463562,
643
+ "learning_rate": 1.7654751525719268e-05,
644
+ "loss": 0.0193,
645
  "step": 870
646
  },
647
  {
648
+ "epoch": 3.447600391772772,
649
+ "grad_norm": 0.14562171697616577,
650
+ "learning_rate": 1.7218831734960768e-05,
651
+ "loss": 0.0169,
652
  "step": 880
653
  },
654
  {
655
+ "epoch": 3.486777668952008,
656
+ "grad_norm": 0.4943673610687256,
657
+ "learning_rate": 1.6782911944202268e-05,
658
+ "loss": 0.0281,
 
 
 
 
 
 
 
 
 
659
  "step": 890
660
  },
661
  {
662
+ "epoch": 3.525954946131244,
663
+ "grad_norm": 0.5602672696113586,
664
+ "learning_rate": 1.6346992153443765e-05,
665
+ "loss": 0.0252,
666
  "step": 900
667
  },
668
  {
669
+ "epoch": 3.56513222331048,
670
+ "grad_norm": 0.6581624150276184,
671
+ "learning_rate": 1.591107236268527e-05,
672
+ "loss": 0.0202,
673
  "step": 910
674
  },
675
  {
676
+ "epoch": 3.604309500489716,
677
+ "grad_norm": 0.45326659083366394,
678
+ "learning_rate": 1.5475152571926766e-05,
679
+ "loss": 0.0257,
 
 
 
 
 
 
 
 
 
680
  "step": 920
681
  },
682
  {
683
+ "epoch": 3.643486777668952,
684
+ "grad_norm": 1.0565999746322632,
685
+ "learning_rate": 1.5039232781168266e-05,
686
+ "loss": 0.02,
687
  "step": 930
688
  },
689
  {
690
+ "epoch": 3.682664054848188,
691
+ "grad_norm": 0.2989865839481354,
692
+ "learning_rate": 1.4603312990409764e-05,
693
+ "loss": 0.0163,
694
  "step": 940
695
  },
696
  {
697
+ "epoch": 3.721841332027424,
698
+ "grad_norm": 0.09961768984794617,
699
+ "learning_rate": 1.4167393199651266e-05,
700
+ "loss": 0.0121,
 
 
 
 
 
 
 
 
 
701
  "step": 950
702
  },
703
  {
704
+ "epoch": 3.76101860920666,
705
+ "grad_norm": 0.6661494970321655,
706
+ "learning_rate": 1.3731473408892765e-05,
707
+ "loss": 0.0123,
708
  "step": 960
709
  },
710
  {
711
+ "epoch": 3.8001958863858962,
712
+ "grad_norm": 0.4994729459285736,
713
+ "learning_rate": 1.3295553618134264e-05,
714
+ "loss": 0.0261,
715
  "step": 970
716
  },
717
  {
718
+ "epoch": 3.8393731635651323,
719
+ "grad_norm": 0.5843683481216431,
720
+ "learning_rate": 1.2859633827375764e-05,
721
+ "loss": 0.0222,
 
 
 
 
 
 
 
 
 
722
  "step": 980
723
  },
724
  {
725
+ "epoch": 3.8785504407443683,
726
+ "grad_norm": 0.30400168895721436,
727
+ "learning_rate": 1.2423714036617264e-05,
728
+ "loss": 0.0217,
729
  "step": 990
730
  },
731
  {
732
+ "epoch": 3.9177277179236043,
733
+ "grad_norm": 0.1775442361831665,
734
+ "learning_rate": 1.1987794245858763e-05,
735
+ "loss": 0.0194,
736
  "step": 1000
737
  },
738
  {
739
+ "epoch": 3.9569049951028403,
740
+ "grad_norm": 0.754188060760498,
741
+ "learning_rate": 1.1551874455100261e-05,
742
+ "loss": 0.0235,
 
 
 
 
 
 
 
 
 
743
  "step": 1010
744
  },
745
  {
746
+ "epoch": 3.9960822722820764,
747
+ "grad_norm": 0.2706276774406433,
748
+ "learning_rate": 1.1115954664341762e-05,
749
+ "loss": 0.0158,
750
  "step": 1020
751
  },
752
  {
753
+ "epoch": 4.0,
754
+ "eval_accuracy": 0.9944879960803528,
755
+ "eval_loss": 0.023838121443986893,
756
+ "eval_runtime": 105.6855,
757
+ "eval_samples_per_second": 154.496,
758
+ "eval_steps_per_second": 2.422,
759
+ "step": 1021
760
+ },
761
+ {
762
+ "epoch": 4.035259549461313,
763
+ "grad_norm": 0.7036624550819397,
764
+ "learning_rate": 1.0680034873583262e-05,
765
+ "loss": 0.0212,
766
  "step": 1030
767
  },
768
  {
769
+ "epoch": 4.074436826640548,
770
+ "grad_norm": 0.3211575746536255,
771
+ "learning_rate": 1.024411508282476e-05,
772
+ "loss": 0.0158,
 
 
 
 
 
 
 
 
 
773
  "step": 1040
774
  },
775
  {
776
+ "epoch": 4.113614103819785,
777
+ "grad_norm": 0.28554221987724304,
778
+ "learning_rate": 9.80819529206626e-06,
779
+ "loss": 0.0166,
780
  "step": 1050
781
  },
782
  {
783
+ "epoch": 4.1527913809990205,
784
+ "grad_norm": 0.23619802296161652,
785
+ "learning_rate": 9.372275501307761e-06,
786
+ "loss": 0.022,
787
  "step": 1060
788
  },
789
  {
790
+ "epoch": 4.191968658178257,
791
+ "grad_norm": 0.24213068187236786,
792
+ "learning_rate": 8.93635571054926e-06,
793
+ "loss": 0.0201,
794
  "step": 1070
795
  },
796
  {
797
+ "epoch": 4.2311459353574925,
798
+ "grad_norm": 0.5181974172592163,
799
+ "learning_rate": 8.500435919790758e-06,
800
+ "loss": 0.0289,
 
 
 
 
 
 
 
 
 
801
  "step": 1080
802
  },
803
  {
804
+ "epoch": 4.270323212536729,
805
+ "grad_norm": 0.9876229166984558,
806
+ "learning_rate": 8.064516129032258e-06,
807
+ "loss": 0.0198,
808
  "step": 1090
809
  },
810
  {
811
+ "epoch": 4.3095004897159646,
812
+ "grad_norm": 1.2217401266098022,
813
+ "learning_rate": 7.628596338273758e-06,
814
+ "loss": 0.0197,
815
  "step": 1100
816
  },
817
  {
818
+ "epoch": 4.348677766895201,
819
+ "grad_norm": 0.6681068539619446,
820
+ "learning_rate": 7.192676547515258e-06,
821
+ "loss": 0.0217,
 
 
 
 
 
 
 
 
 
822
  "step": 1110
823
  },
824
  {
825
+ "epoch": 4.387855044074437,
826
+ "grad_norm": 0.2516974210739136,
827
+ "learning_rate": 6.7567567567567575e-06,
828
+ "loss": 0.015,
829
  "step": 1120
830
  },
831
  {
832
+ "epoch": 4.427032321253673,
833
+ "grad_norm": 0.6431485414505005,
834
+ "learning_rate": 6.320836965998257e-06,
835
+ "loss": 0.0165,
836
  "step": 1130
837
  },
838
  {
839
+ "epoch": 4.466209598432909,
840
+ "grad_norm": 0.8847171664237976,
841
+ "learning_rate": 5.884917175239756e-06,
842
+ "loss": 0.0169,
 
 
 
 
 
 
 
 
 
843
  "step": 1140
844
  },
845
  {
846
+ "epoch": 4.505386875612145,
847
+ "grad_norm": 0.13626788556575775,
848
+ "learning_rate": 5.448997384481256e-06,
849
+ "loss": 0.0135,
850
  "step": 1150
851
  },
852
  {
853
+ "epoch": 4.544564152791381,
854
+ "grad_norm": 0.19009195268154144,
855
+ "learning_rate": 5.013077593722755e-06,
856
+ "loss": 0.0082,
857
  "step": 1160
858
  },
859
  {
860
+ "epoch": 4.583741429970617,
861
+ "grad_norm": 0.5194115042686462,
862
+ "learning_rate": 4.577157802964255e-06,
863
+ "loss": 0.0186,
 
 
 
 
 
 
 
 
 
864
  "step": 1170
865
  },
866
  {
867
+ "epoch": 4.622918707149853,
868
+ "grad_norm": 0.3963209092617035,
869
+ "learning_rate": 4.141238012205754e-06,
870
+ "loss": 0.0211,
871
  "step": 1180
872
  },
873
  {
874
+ "epoch": 4.662095984329089,
875
+ "grad_norm": 0.392490953207016,
876
+ "learning_rate": 3.7053182214472536e-06,
877
+ "loss": 0.0121,
878
  "step": 1190
879
  },
880
  {
881
+ "epoch": 4.701273261508325,
882
+ "grad_norm": 0.2609846293926239,
883
+ "learning_rate": 3.2693984306887534e-06,
884
+ "loss": 0.024,
 
 
 
 
 
 
 
 
 
885
  "step": 1200
886
  },
887
  {
888
+ "epoch": 4.740450538687561,
889
+ "grad_norm": 0.9114863872528076,
890
+ "learning_rate": 2.8334786399302533e-06,
891
+ "loss": 0.022,
892
  "step": 1210
893
  },
894
  {
895
+ "epoch": 4.779627815866798,
896
+ "grad_norm": 0.3400118947029114,
897
+ "learning_rate": 2.3975588491717523e-06,
898
+ "loss": 0.0139,
899
  "step": 1220
900
  },
901
  {
902
+ "epoch": 4.818805093046033,
903
+ "grad_norm": 0.8354482650756836,
904
+ "learning_rate": 1.961639058413252e-06,
905
+ "loss": 0.0168,
 
 
 
 
 
 
 
 
 
906
  "step": 1230
907
  },
908
  {
909
+ "epoch": 4.857982370225269,
910
+ "grad_norm": 0.4545687139034271,
911
+ "learning_rate": 1.5257192676547516e-06,
912
+ "loss": 0.0176,
913
  "step": 1240
914
  },
915
  {
916
+ "epoch": 4.897159647404505,
917
+ "grad_norm": 0.6158453226089478,
918
+ "learning_rate": 1.0897994768962512e-06,
919
+ "loss": 0.0169,
920
+ "step": 1250
921
+ },
922
+ {
923
+ "epoch": 4.936336924583742,
924
+ "grad_norm": 0.11892726272344589,
925
+ "learning_rate": 6.538796861377506e-07,
926
+ "loss": 0.0196,
927
+ "step": 1260
928
+ },
929
+ {
930
+ "epoch": 4.975514201762977,
931
+ "grad_norm": 0.5337911248207092,
932
+ "learning_rate": 2.179598953792502e-07,
933
+ "loss": 0.0112,
934
+ "step": 1270
935
+ },
936
+ {
937
+ "epoch": 4.995102840352596,
938
+ "eval_accuracy": 0.9943655071043606,
939
+ "eval_loss": 0.02594069205224514,
940
+ "eval_runtime": 104.7525,
941
+ "eval_samples_per_second": 155.872,
942
+ "eval_steps_per_second": 2.444,
943
+ "step": 1275
944
  },
945
  {
946
+ "epoch": 4.995102840352596,
947
+ "step": 1275,
948
+ "total_flos": 8.109813238393209e+18,
949
+ "train_loss": 0.030976083857171675,
950
+ "train_runtime": 4473.2861,
951
+ "train_samples_per_second": 73.002,
952
+ "train_steps_per_second": 0.285
953
  }
954
  ],
955
  "logging_steps": 10,
956
+ "max_steps": 1275,
957
  "num_input_tokens_seen": 0,
958
+ "num_train_epochs": 5,
959
  "save_steps": 500,
960
  "stateful_callbacks": {
961
  "TrainerControl": {
 
969
  "attributes": {}
970
  }
971
  },
972
+ "total_flos": 8.109813238393209e+18,
973
  "train_batch_size": 64,
974
  "trial_name": null,
975
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d198d97c8dad6592f41037d1dc6ee08e107c4ed3176ecb5883f0d1b69e853b4d
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c28f4f40287fbfa917bd2f4f97b7457f8e5f3e4b2a551453613ec650e854b4e
3
  size 5240