smilemikan commited on
Commit
d337238
1 Parent(s): e88ac22

Delete checkpoint-46500

Browse files
checkpoint-46500/config.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "_name_or_path": "facebook/nllb-200-distilled-600M",
3
- "activation_dropout": 0.0,
4
- "activation_function": "relu",
5
- "architectures": [
6
- "M2M100ForConditionalGeneration"
7
- ],
8
- "attention_dropout": 0.1,
9
- "bos_token_id": 0,
10
- "d_model": 1024,
11
- "decoder_attention_heads": 16,
12
- "decoder_ffn_dim": 4096,
13
- "decoder_layerdrop": 0,
14
- "decoder_layers": 12,
15
- "decoder_start_token_id": 2,
16
- "dropout": 0.1,
17
- "encoder_attention_heads": 16,
18
- "encoder_ffn_dim": 4096,
19
- "encoder_layerdrop": 0,
20
- "encoder_layers": 12,
21
- "eos_token_id": 2,
22
- "init_std": 0.02,
23
- "is_encoder_decoder": true,
24
- "max_length": 200,
25
- "max_position_embeddings": 1024,
26
- "model_type": "m2m_100",
27
- "num_hidden_layers": 12,
28
- "pad_token_id": 1,
29
- "scale_embedding": true,
30
- "tokenizer_class": "NllbTokenizer",
31
- "torch_dtype": "float32",
32
- "transformers_version": "4.33.0",
33
- "use_cache": true,
34
- "vocab_size": 256205
35
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-46500/generation_config.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "bos_token_id": 0,
3
- "decoder_start_token_id": 2,
4
- "eos_token_id": 2,
5
- "max_length": 200,
6
- "pad_token_id": 1,
7
- "transformers_version": "4.33.0"
8
- }
 
 
 
 
 
 
 
 
 
checkpoint-46500/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7a344ff111594f24e44411d1e8aa99c38d0b1af1bd7640f6868a15925646357
3
- size 5125261
 
 
 
 
checkpoint-46500/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e988178a3fc162b80bb253888d4c2fa94a93520ea79e12fd78e7d68e2f12b067
3
- size 2460465086
 
 
 
 
checkpoint-46500/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5005c70400b18400623be58b3cc38bbcdc65ddde2a51e70887b655d6feec549
3
- size 14244
 
 
 
 
checkpoint-46500/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f7075068dc4257cf73679b291b55e3f872a2390010c80c52c4b761fed90b4b4
3
- size 1064
 
 
 
 
checkpoint-46500/sentencepiece.bpe.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a
3
- size 4852054
 
 
 
 
checkpoint-46500/special_tokens_map.json DELETED
@@ -1,220 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "ace_Arab",
4
- "ace_Latn",
5
- "acm_Arab",
6
- "acq_Arab",
7
- "aeb_Arab",
8
- "afr_Latn",
9
- "ajp_Arab",
10
- "aka_Latn",
11
- "amh_Ethi",
12
- "apc_Arab",
13
- "arb_Arab",
14
- "ars_Arab",
15
- "ary_Arab",
16
- "arz_Arab",
17
- "asm_Beng",
18
- "ast_Latn",
19
- "awa_Deva",
20
- "ayr_Latn",
21
- "azb_Arab",
22
- "azj_Latn",
23
- "bak_Cyrl",
24
- "bam_Latn",
25
- "ban_Latn",
26
- "bel_Cyrl",
27
- "bem_Latn",
28
- "ben_Beng",
29
- "bho_Deva",
30
- "bjn_Arab",
31
- "bjn_Latn",
32
- "bod_Tibt",
33
- "bos_Latn",
34
- "bug_Latn",
35
- "bul_Cyrl",
36
- "cat_Latn",
37
- "ceb_Latn",
38
- "ces_Latn",
39
- "cjk_Latn",
40
- "ckb_Arab",
41
- "crh_Latn",
42
- "cym_Latn",
43
- "dan_Latn",
44
- "deu_Latn",
45
- "dik_Latn",
46
- "dyu_Latn",
47
- "dzo_Tibt",
48
- "ell_Grek",
49
- "eng_Latn",
50
- "epo_Latn",
51
- "est_Latn",
52
- "eus_Latn",
53
- "ewe_Latn",
54
- "fao_Latn",
55
- "pes_Arab",
56
- "fij_Latn",
57
- "fin_Latn",
58
- "fon_Latn",
59
- "fra_Latn",
60
- "fur_Latn",
61
- "fuv_Latn",
62
- "gla_Latn",
63
- "gle_Latn",
64
- "glg_Latn",
65
- "grn_Latn",
66
- "guj_Gujr",
67
- "hat_Latn",
68
- "hau_Latn",
69
- "heb_Hebr",
70
- "hin_Deva",
71
- "hne_Deva",
72
- "hrv_Latn",
73
- "hun_Latn",
74
- "hye_Armn",
75
- "ibo_Latn",
76
- "ilo_Latn",
77
- "ind_Latn",
78
- "isl_Latn",
79
- "ita_Latn",
80
- "jav_Latn",
81
- "jpn_Jpan",
82
- "kab_Latn",
83
- "kac_Latn",
84
- "kam_Latn",
85
- "kan_Knda",
86
- "kas_Arab",
87
- "kas_Deva",
88
- "kat_Geor",
89
- "knc_Arab",
90
- "knc_Latn",
91
- "kaz_Cyrl",
92
- "kbp_Latn",
93
- "kea_Latn",
94
- "khm_Khmr",
95
- "kik_Latn",
96
- "kin_Latn",
97
- "kir_Cyrl",
98
- "kmb_Latn",
99
- "kon_Latn",
100
- "kor_Hang",
101
- "kmr_Latn",
102
- "lao_Laoo",
103
- "lvs_Latn",
104
- "lij_Latn",
105
- "lim_Latn",
106
- "lin_Latn",
107
- "lit_Latn",
108
- "lmo_Latn",
109
- "ltg_Latn",
110
- "ltz_Latn",
111
- "lua_Latn",
112
- "lug_Latn",
113
- "luo_Latn",
114
- "lus_Latn",
115
- "mag_Deva",
116
- "mai_Deva",
117
- "mal_Mlym",
118
- "mar_Deva",
119
- "min_Latn",
120
- "mkd_Cyrl",
121
- "plt_Latn",
122
- "mlt_Latn",
123
- "mni_Beng",
124
- "khk_Cyrl",
125
- "mos_Latn",
126
- "mri_Latn",
127
- "zsm_Latn",
128
- "mya_Mymr",
129
- "nld_Latn",
130
- "nno_Latn",
131
- "nob_Latn",
132
- "npi_Deva",
133
- "nso_Latn",
134
- "nus_Latn",
135
- "nya_Latn",
136
- "oci_Latn",
137
- "gaz_Latn",
138
- "ory_Orya",
139
- "pag_Latn",
140
- "pan_Guru",
141
- "pap_Latn",
142
- "pol_Latn",
143
- "por_Latn",
144
- "prs_Arab",
145
- "pbt_Arab",
146
- "quy_Latn",
147
- "ron_Latn",
148
- "run_Latn",
149
- "rus_Cyrl",
150
- "sag_Latn",
151
- "san_Deva",
152
- "sat_Beng",
153
- "scn_Latn",
154
- "shn_Mymr",
155
- "sin_Sinh",
156
- "slk_Latn",
157
- "slv_Latn",
158
- "smo_Latn",
159
- "sna_Latn",
160
- "snd_Arab",
161
- "som_Latn",
162
- "sot_Latn",
163
- "spa_Latn",
164
- "als_Latn",
165
- "srd_Latn",
166
- "srp_Cyrl",
167
- "ssw_Latn",
168
- "sun_Latn",
169
- "swe_Latn",
170
- "swh_Latn",
171
- "szl_Latn",
172
- "tam_Taml",
173
- "tat_Cyrl",
174
- "tel_Telu",
175
- "tgk_Cyrl",
176
- "tgl_Latn",
177
- "tha_Thai",
178
- "tir_Ethi",
179
- "taq_Latn",
180
- "taq_Tfng",
181
- "tpi_Latn",
182
- "tsn_Latn",
183
- "tso_Latn",
184
- "tuk_Latn",
185
- "tum_Latn",
186
- "tur_Latn",
187
- "twi_Latn",
188
- "tzm_Tfng",
189
- "uig_Arab",
190
- "ukr_Cyrl",
191
- "umb_Latn",
192
- "urd_Arab",
193
- "uzn_Latn",
194
- "vec_Latn",
195
- "vie_Latn",
196
- "war_Latn",
197
- "wol_Latn",
198
- "xho_Latn",
199
- "ydd_Hebr",
200
- "yor_Latn",
201
- "yue_Hant",
202
- "zho_Hans",
203
- "zho_Hant",
204
- "zul_Latn",
205
- "ain_Jpan"
206
- ],
207
- "bos_token": "<s>",
208
- "cls_token": "<s>",
209
- "eos_token": "</s>",
210
- "mask_token": {
211
- "content": "<mask>",
212
- "lstrip": true,
213
- "normalized": true,
214
- "rstrip": false,
215
- "single_word": false
216
- },
217
- "pad_token": "<pad>",
218
- "sep_token": "</s>",
219
- "unk_token": "<unk>"
220
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-46500/tokenizer_config.json DELETED
@@ -1,25 +0,0 @@
1
- {
2
- "additional_special_tokens": null,
3
- "bos_token": "<s>",
4
- "clean_up_tokenization_spaces": true,
5
- "cls_token": "<s>",
6
- "eos_token": "</s>",
7
- "legacy_behaviour": false,
8
- "mask_token": {
9
- "__type": "AddedToken",
10
- "content": "<mask>",
11
- "lstrip": true,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "model_max_length": 1024,
17
- "pad_token": "<pad>",
18
- "sep_token": "</s>",
19
- "sp_model_kwargs": {},
20
- "src_lang": null,
21
- "tgt_lang": null,
22
- "tokenizer_class": "NllbTokenizer",
23
- "tokenizer_file": null,
24
- "unk_token": "<unk>"
25
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-46500/trainer_state.json DELETED
@@ -1,1325 +0,0 @@
1
- {
2
- "best_metric": 1.3308771848678589,
3
- "best_model_checkpoint": "nllb-finetuned-jpn-to-ain-2/checkpoint-46500",
4
- "epoch": 10.24454725710509,
5
- "eval_steps": 500,
6
- "global_step": 46500,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.11,
13
- "learning_rate": 1.9833333333333335e-05,
14
- "loss": 4.6196,
15
- "step": 500
16
- },
17
- {
18
- "epoch": 0.11,
19
- "eval_loss": 3.7452237606048584,
20
- "eval_runtime": 43.586,
21
- "eval_samples_per_second": 208.278,
22
- "eval_steps_per_second": 13.032,
23
- "step": 500
24
- },
25
- {
26
- "epoch": 0.22,
27
- "learning_rate": 1.9666666666666666e-05,
28
- "loss": 3.5915,
29
- "step": 1000
30
- },
31
- {
32
- "epoch": 0.22,
33
- "eval_loss": 3.2064383029937744,
34
- "eval_runtime": 43.0726,
35
- "eval_samples_per_second": 210.76,
36
- "eval_steps_per_second": 13.187,
37
- "step": 1000
38
- },
39
- {
40
- "epoch": 0.33,
41
- "learning_rate": 1.95e-05,
42
- "loss": 3.1484,
43
- "step": 1500
44
- },
45
- {
46
- "epoch": 0.33,
47
- "eval_loss": 2.8919074535369873,
48
- "eval_runtime": 42.8255,
49
- "eval_samples_per_second": 211.977,
50
- "eval_steps_per_second": 13.263,
51
- "step": 1500
52
- },
53
- {
54
- "epoch": 0.44,
55
- "learning_rate": 1.9333333333333333e-05,
56
- "loss": 2.902,
57
- "step": 2000
58
- },
59
- {
60
- "epoch": 0.44,
61
- "eval_loss": 2.679006338119507,
62
- "eval_runtime": 43.6162,
63
- "eval_samples_per_second": 208.134,
64
- "eval_steps_per_second": 13.023,
65
- "step": 2000
66
- },
67
- {
68
- "epoch": 0.55,
69
- "learning_rate": 1.916666666666667e-05,
70
- "loss": 2.7296,
71
- "step": 2500
72
- },
73
- {
74
- "epoch": 0.55,
75
- "eval_loss": 2.5206823348999023,
76
- "eval_runtime": 42.8291,
77
- "eval_samples_per_second": 211.959,
78
- "eval_steps_per_second": 13.262,
79
- "step": 2500
80
- },
81
- {
82
- "epoch": 0.66,
83
- "learning_rate": 1.9000333333333335e-05,
84
- "loss": 2.6105,
85
- "step": 3000
86
- },
87
- {
88
- "epoch": 0.66,
89
- "eval_loss": 2.394014835357666,
90
- "eval_runtime": 42.8022,
91
- "eval_samples_per_second": 212.092,
92
- "eval_steps_per_second": 13.27,
93
- "step": 3000
94
- },
95
- {
96
- "epoch": 0.77,
97
- "learning_rate": 1.883366666666667e-05,
98
- "loss": 2.5068,
99
- "step": 3500
100
- },
101
- {
102
- "epoch": 0.77,
103
- "eval_loss": 2.30385422706604,
104
- "eval_runtime": 44.6981,
105
- "eval_samples_per_second": 203.096,
106
- "eval_steps_per_second": 12.707,
107
- "step": 3500
108
- },
109
- {
110
- "epoch": 0.88,
111
- "learning_rate": 1.8667000000000003e-05,
112
- "loss": 2.3848,
113
- "step": 4000
114
- },
115
- {
116
- "epoch": 0.88,
117
- "eval_loss": 2.2251899242401123,
118
- "eval_runtime": 42.8042,
119
- "eval_samples_per_second": 212.082,
120
- "eval_steps_per_second": 13.27,
121
- "step": 4000
122
- },
123
- {
124
- "epoch": 0.99,
125
- "learning_rate": 1.8500333333333337e-05,
126
- "loss": 2.3391,
127
- "step": 4500
128
- },
129
- {
130
- "epoch": 0.99,
131
- "eval_loss": 2.148716449737549,
132
- "eval_runtime": 43.0552,
133
- "eval_samples_per_second": 210.845,
134
- "eval_steps_per_second": 13.192,
135
- "step": 4500
136
- },
137
- {
138
- "epoch": 1.1,
139
- "learning_rate": 1.8334e-05,
140
- "loss": 2.1994,
141
- "step": 5000
142
- },
143
- {
144
- "epoch": 1.1,
145
- "eval_loss": 2.091869354248047,
146
- "eval_runtime": 43.5857,
147
- "eval_samples_per_second": 208.279,
148
- "eval_steps_per_second": 13.032,
149
- "step": 5000
150
- },
151
- {
152
- "epoch": 1.21,
153
- "learning_rate": 1.8167333333333335e-05,
154
- "loss": 2.1446,
155
- "step": 5500
156
- },
157
- {
158
- "epoch": 1.21,
159
- "eval_loss": 2.0449471473693848,
160
- "eval_runtime": 43.2545,
161
- "eval_samples_per_second": 209.874,
162
- "eval_steps_per_second": 13.132,
163
- "step": 5500
164
- },
165
- {
166
- "epoch": 1.32,
167
- "learning_rate": 1.800066666666667e-05,
168
- "loss": 2.1012,
169
- "step": 6000
170
- },
171
- {
172
- "epoch": 1.32,
173
- "eval_loss": 2.0015575885772705,
174
- "eval_runtime": 43.2116,
175
- "eval_samples_per_second": 210.083,
176
- "eval_steps_per_second": 13.145,
177
- "step": 6000
178
- },
179
- {
180
- "epoch": 1.43,
181
- "learning_rate": 1.7834000000000002e-05,
182
- "loss": 2.0542,
183
- "step": 6500
184
- },
185
- {
186
- "epoch": 1.43,
187
- "eval_loss": 1.951575517654419,
188
- "eval_runtime": 43.3919,
189
- "eval_samples_per_second": 209.209,
190
- "eval_steps_per_second": 13.09,
191
- "step": 6500
192
- },
193
- {
194
- "epoch": 1.54,
195
- "learning_rate": 1.7667333333333333e-05,
196
- "loss": 2.0216,
197
- "step": 7000
198
- },
199
- {
200
- "epoch": 1.54,
201
- "eval_loss": 1.9161829948425293,
202
- "eval_runtime": 43.6948,
203
- "eval_samples_per_second": 207.759,
204
- "eval_steps_per_second": 12.999,
205
- "step": 7000
206
- },
207
- {
208
- "epoch": 1.65,
209
- "learning_rate": 1.7500666666666667e-05,
210
- "loss": 1.9926,
211
- "step": 7500
212
- },
213
- {
214
- "epoch": 1.65,
215
- "eval_loss": 1.8790709972381592,
216
- "eval_runtime": 44.1328,
217
- "eval_samples_per_second": 205.698,
218
- "eval_steps_per_second": 12.87,
219
- "step": 7500
220
- },
221
- {
222
- "epoch": 1.76,
223
- "learning_rate": 1.7334e-05,
224
- "loss": 1.9278,
225
- "step": 8000
226
- },
227
- {
228
- "epoch": 1.76,
229
- "eval_loss": 1.845486044883728,
230
- "eval_runtime": 44.2649,
231
- "eval_samples_per_second": 205.084,
232
- "eval_steps_per_second": 12.832,
233
- "step": 8000
234
- },
235
- {
236
- "epoch": 1.87,
237
- "learning_rate": 1.7167333333333334e-05,
238
- "loss": 1.9305,
239
- "step": 8500
240
- },
241
- {
242
- "epoch": 1.87,
243
- "eval_loss": 1.8124595880508423,
244
- "eval_runtime": 43.561,
245
- "eval_samples_per_second": 208.398,
246
- "eval_steps_per_second": 13.039,
247
- "step": 8500
248
- },
249
- {
250
- "epoch": 1.98,
251
- "learning_rate": 1.7000666666666668e-05,
252
- "loss": 1.905,
253
- "step": 9000
254
- },
255
- {
256
- "epoch": 1.98,
257
- "eval_loss": 1.7888203859329224,
258
- "eval_runtime": 43.4924,
259
- "eval_samples_per_second": 208.726,
260
- "eval_steps_per_second": 13.06,
261
- "step": 9000
262
- },
263
- {
264
- "epoch": 2.09,
265
- "learning_rate": 1.6834666666666667e-05,
266
- "loss": 1.828,
267
- "step": 9500
268
- },
269
- {
270
- "epoch": 2.09,
271
- "eval_loss": 1.7763384580612183,
272
- "eval_runtime": 43.5326,
273
- "eval_samples_per_second": 208.534,
274
- "eval_steps_per_second": 13.048,
275
- "step": 9500
276
- },
277
- {
278
- "epoch": 2.2,
279
- "learning_rate": 1.6668e-05,
280
- "loss": 1.808,
281
- "step": 10000
282
- },
283
- {
284
- "epoch": 2.2,
285
- "eval_loss": 1.7431403398513794,
286
- "eval_runtime": 43.6327,
287
- "eval_samples_per_second": 208.055,
288
- "eval_steps_per_second": 13.018,
289
- "step": 10000
290
- },
291
- {
292
- "epoch": 2.31,
293
- "learning_rate": 1.6501333333333334e-05,
294
- "loss": 1.7434,
295
- "step": 10500
296
- },
297
- {
298
- "epoch": 2.31,
299
- "eval_loss": 1.7204526662826538,
300
- "eval_runtime": 44.0612,
301
- "eval_samples_per_second": 206.032,
302
- "eval_steps_per_second": 12.891,
303
- "step": 10500
304
- },
305
- {
306
- "epoch": 2.42,
307
- "learning_rate": 1.6334666666666668e-05,
308
- "loss": 1.7527,
309
- "step": 11000
310
- },
311
- {
312
- "epoch": 2.42,
313
- "eval_loss": 1.7068791389465332,
314
- "eval_runtime": 43.4892,
315
- "eval_samples_per_second": 208.741,
316
- "eval_steps_per_second": 13.061,
317
- "step": 11000
318
- },
319
- {
320
- "epoch": 2.53,
321
- "learning_rate": 1.6168333333333336e-05,
322
- "loss": 1.6977,
323
- "step": 11500
324
- },
325
- {
326
- "epoch": 2.53,
327
- "eval_loss": 1.6881217956542969,
328
- "eval_runtime": 43.1769,
329
- "eval_samples_per_second": 210.252,
330
- "eval_steps_per_second": 13.155,
331
- "step": 11500
332
- },
333
- {
334
- "epoch": 2.64,
335
- "learning_rate": 1.600166666666667e-05,
336
- "loss": 1.6854,
337
- "step": 12000
338
- },
339
- {
340
- "epoch": 2.64,
341
- "eval_loss": 1.6784749031066895,
342
- "eval_runtime": 42.7581,
343
- "eval_samples_per_second": 212.311,
344
- "eval_steps_per_second": 13.284,
345
- "step": 12000
346
- },
347
- {
348
- "epoch": 2.75,
349
- "learning_rate": 1.5835e-05,
350
- "loss": 1.6603,
351
- "step": 12500
352
- },
353
- {
354
- "epoch": 2.75,
355
- "eval_loss": 1.6570212841033936,
356
- "eval_runtime": 42.6669,
357
- "eval_samples_per_second": 212.765,
358
- "eval_steps_per_second": 13.312,
359
- "step": 12500
360
- },
361
- {
362
- "epoch": 2.86,
363
- "learning_rate": 1.5668333333333334e-05,
364
- "loss": 1.6725,
365
- "step": 13000
366
- },
367
- {
368
- "epoch": 2.86,
369
- "eval_loss": 1.6376469135284424,
370
- "eval_runtime": 43.1686,
371
- "eval_samples_per_second": 210.292,
372
- "eval_steps_per_second": 13.158,
373
- "step": 13000
374
- },
375
- {
376
- "epoch": 2.97,
377
- "learning_rate": 1.5501666666666668e-05,
378
- "loss": 1.6384,
379
- "step": 13500
380
- },
381
- {
382
- "epoch": 2.97,
383
- "eval_loss": 1.6240772008895874,
384
- "eval_runtime": 43.2327,
385
- "eval_samples_per_second": 209.98,
386
- "eval_steps_per_second": 13.138,
387
- "step": 13500
388
- },
389
- {
390
- "epoch": 3.08,
391
- "learning_rate": 1.5335e-05,
392
- "loss": 1.5917,
393
- "step": 14000
394
- },
395
- {
396
- "epoch": 3.08,
397
- "eval_loss": 1.6128900051116943,
398
- "eval_runtime": 42.9712,
399
- "eval_samples_per_second": 211.258,
400
- "eval_steps_per_second": 13.218,
401
- "step": 14000
402
- },
403
- {
404
- "epoch": 3.19,
405
- "learning_rate": 1.5168333333333334e-05,
406
- "loss": 1.591,
407
- "step": 14500
408
- },
409
- {
410
- "epoch": 3.19,
411
- "eval_loss": 1.6054280996322632,
412
- "eval_runtime": 43.0919,
413
- "eval_samples_per_second": 210.666,
414
- "eval_steps_per_second": 13.181,
415
- "step": 14500
416
- },
417
- {
418
- "epoch": 3.3,
419
- "learning_rate": 1.5001666666666667e-05,
420
- "loss": 1.5936,
421
- "step": 15000
422
- },
423
- {
424
- "epoch": 3.3,
425
- "eval_loss": 1.5842323303222656,
426
- "eval_runtime": 43.5874,
427
- "eval_samples_per_second": 208.271,
428
- "eval_steps_per_second": 13.031,
429
- "step": 15000
430
- },
431
- {
432
- "epoch": 3.41,
433
- "learning_rate": 1.4835000000000001e-05,
434
- "loss": 1.6031,
435
- "step": 15500
436
- },
437
- {
438
- "epoch": 3.41,
439
- "eval_loss": 1.5732085704803467,
440
- "eval_runtime": 42.6594,
441
- "eval_samples_per_second": 212.802,
442
- "eval_steps_per_second": 13.315,
443
- "step": 15500
444
- },
445
- {
446
- "epoch": 3.53,
447
- "learning_rate": 1.4668666666666669e-05,
448
- "loss": 1.554,
449
- "step": 16000
450
- },
451
- {
452
- "epoch": 3.53,
453
- "eval_loss": 1.5651994943618774,
454
- "eval_runtime": 42.8927,
455
- "eval_samples_per_second": 211.645,
456
- "eval_steps_per_second": 13.242,
457
- "step": 16000
458
- },
459
- {
460
- "epoch": 3.64,
461
- "learning_rate": 1.4502000000000001e-05,
462
- "loss": 1.5284,
463
- "step": 16500
464
- },
465
- {
466
- "epoch": 3.64,
467
- "eval_loss": 1.5577419996261597,
468
- "eval_runtime": 43.6635,
469
- "eval_samples_per_second": 207.908,
470
- "eval_steps_per_second": 13.009,
471
- "step": 16500
472
- },
473
- {
474
- "epoch": 3.75,
475
- "learning_rate": 1.4335666666666667e-05,
476
- "loss": 1.5219,
477
- "step": 17000
478
- },
479
- {
480
- "epoch": 3.75,
481
- "eval_loss": 1.544019103050232,
482
- "eval_runtime": 43.4005,
483
- "eval_samples_per_second": 209.168,
484
- "eval_steps_per_second": 13.087,
485
- "step": 17000
486
- },
487
- {
488
- "epoch": 3.86,
489
- "learning_rate": 1.4169000000000001e-05,
490
- "loss": 1.5173,
491
- "step": 17500
492
- },
493
- {
494
- "epoch": 3.86,
495
- "eval_loss": 1.5335613489151,
496
- "eval_runtime": 43.512,
497
- "eval_samples_per_second": 208.632,
498
- "eval_steps_per_second": 13.054,
499
- "step": 17500
500
- },
501
- {
502
- "epoch": 3.97,
503
- "learning_rate": 1.4002333333333335e-05,
504
- "loss": 1.4781,
505
- "step": 18000
506
- },
507
- {
508
- "epoch": 3.97,
509
- "eval_loss": 1.5262504816055298,
510
- "eval_runtime": 43.3365,
511
- "eval_samples_per_second": 209.477,
512
- "eval_steps_per_second": 13.107,
513
- "step": 18000
514
- },
515
- {
516
- "epoch": 4.08,
517
- "learning_rate": 1.3835666666666667e-05,
518
- "loss": 1.4771,
519
- "step": 18500
520
- },
521
- {
522
- "epoch": 4.08,
523
- "eval_loss": 1.519882082939148,
524
- "eval_runtime": 43.8272,
525
- "eval_samples_per_second": 207.132,
526
- "eval_steps_per_second": 12.96,
527
- "step": 18500
528
- },
529
- {
530
- "epoch": 4.19,
531
- "learning_rate": 1.3669e-05,
532
- "loss": 1.4613,
533
- "step": 19000
534
- },
535
- {
536
- "epoch": 4.19,
537
- "eval_loss": 1.5142260789871216,
538
- "eval_runtime": 43.3978,
539
- "eval_samples_per_second": 209.181,
540
- "eval_steps_per_second": 13.088,
541
- "step": 19000
542
- },
543
- {
544
- "epoch": 4.3,
545
- "learning_rate": 1.3502333333333335e-05,
546
- "loss": 1.452,
547
- "step": 19500
548
- },
549
- {
550
- "epoch": 4.3,
551
- "eval_loss": 1.503504753112793,
552
- "eval_runtime": 43.2384,
553
- "eval_samples_per_second": 209.952,
554
- "eval_steps_per_second": 13.136,
555
- "step": 19500
556
- },
557
- {
558
- "epoch": 4.41,
559
- "learning_rate": 1.3335666666666667e-05,
560
- "loss": 1.4563,
561
- "step": 20000
562
- },
563
- {
564
- "epoch": 4.41,
565
- "eval_loss": 1.495379090309143,
566
- "eval_runtime": 43.6248,
567
- "eval_samples_per_second": 208.093,
568
- "eval_steps_per_second": 13.02,
569
- "step": 20000
570
- },
571
- {
572
- "epoch": 4.52,
573
- "learning_rate": 1.3169e-05,
574
- "loss": 1.46,
575
- "step": 20500
576
- },
577
- {
578
- "epoch": 4.52,
579
- "eval_loss": 1.4834085702896118,
580
- "eval_runtime": 43.9437,
581
- "eval_samples_per_second": 206.583,
582
- "eval_steps_per_second": 12.926,
583
- "step": 20500
584
- },
585
- {
586
- "epoch": 4.63,
587
- "learning_rate": 1.3002333333333334e-05,
588
- "loss": 1.4284,
589
- "step": 21000
590
- },
591
- {
592
- "epoch": 4.63,
593
- "eval_loss": 1.4811300039291382,
594
- "eval_runtime": 43.2207,
595
- "eval_samples_per_second": 210.038,
596
- "eval_steps_per_second": 13.142,
597
- "step": 21000
598
- },
599
- {
600
- "epoch": 4.74,
601
- "learning_rate": 1.2836000000000002e-05,
602
- "loss": 1.4527,
603
- "step": 21500
604
- },
605
- {
606
- "epoch": 4.74,
607
- "eval_loss": 1.4702121019363403,
608
- "eval_runtime": 43.7967,
609
- "eval_samples_per_second": 207.276,
610
- "eval_steps_per_second": 12.969,
611
- "step": 21500
612
- },
613
- {
614
- "epoch": 4.85,
615
- "learning_rate": 1.2669333333333334e-05,
616
- "loss": 1.4375,
617
- "step": 22000
618
- },
619
- {
620
- "epoch": 4.85,
621
- "eval_loss": 1.4648058414459229,
622
- "eval_runtime": 43.2759,
623
- "eval_samples_per_second": 209.77,
624
- "eval_steps_per_second": 13.125,
625
- "step": 22000
626
- },
627
- {
628
- "epoch": 4.96,
629
- "learning_rate": 1.2502666666666668e-05,
630
- "loss": 1.4093,
631
- "step": 22500
632
- },
633
- {
634
- "epoch": 4.96,
635
- "eval_loss": 1.452415108680725,
636
- "eval_runtime": 43.4383,
637
- "eval_samples_per_second": 208.986,
638
- "eval_steps_per_second": 13.076,
639
- "step": 22500
640
- },
641
- {
642
- "epoch": 5.07,
643
- "learning_rate": 1.2336000000000002e-05,
644
- "loss": 1.3688,
645
- "step": 23000
646
- },
647
- {
648
- "epoch": 5.07,
649
- "eval_loss": 1.4525853395462036,
650
- "eval_runtime": 44.7597,
651
- "eval_samples_per_second": 202.816,
652
- "eval_steps_per_second": 12.69,
653
- "step": 23000
654
- },
655
- {
656
- "epoch": 5.18,
657
- "learning_rate": 1.2169333333333336e-05,
658
- "loss": 1.3704,
659
- "step": 23500
660
- },
661
- {
662
- "epoch": 5.18,
663
- "eval_loss": 1.4470324516296387,
664
- "eval_runtime": 45.0353,
665
- "eval_samples_per_second": 201.575,
666
- "eval_steps_per_second": 12.612,
667
- "step": 23500
668
- },
669
- {
670
- "epoch": 5.29,
671
- "learning_rate": 1.2003e-05,
672
- "loss": 1.3672,
673
- "step": 24000
674
- },
675
- {
676
- "epoch": 5.29,
677
- "eval_loss": 1.4429727792739868,
678
- "eval_runtime": 48.0671,
679
- "eval_samples_per_second": 188.861,
680
- "eval_steps_per_second": 11.817,
681
- "step": 24000
682
- },
683
- {
684
- "epoch": 5.4,
685
- "learning_rate": 1.1836333333333334e-05,
686
- "loss": 1.3484,
687
- "step": 24500
688
- },
689
- {
690
- "epoch": 5.4,
691
- "eval_loss": 1.4372690916061401,
692
- "eval_runtime": 45.4091,
693
- "eval_samples_per_second": 199.916,
694
- "eval_steps_per_second": 12.508,
695
- "step": 24500
696
- },
697
- {
698
- "epoch": 5.51,
699
- "learning_rate": 1.1669666666666668e-05,
700
- "loss": 1.3446,
701
- "step": 25000
702
- },
703
- {
704
- "epoch": 5.51,
705
- "eval_loss": 1.4315961599349976,
706
- "eval_runtime": 44.7878,
707
- "eval_samples_per_second": 202.689,
708
- "eval_steps_per_second": 12.682,
709
- "step": 25000
710
- },
711
- {
712
- "epoch": 5.62,
713
- "learning_rate": 1.1503000000000002e-05,
714
- "loss": 1.3445,
715
- "step": 25500
716
- },
717
- {
718
- "epoch": 5.62,
719
- "eval_loss": 1.4315000772476196,
720
- "eval_runtime": 45.134,
721
- "eval_samples_per_second": 201.134,
722
- "eval_steps_per_second": 12.585,
723
- "step": 25500
724
- },
725
- {
726
- "epoch": 5.73,
727
- "learning_rate": 1.1336333333333334e-05,
728
- "loss": 1.3236,
729
- "step": 26000
730
- },
731
- {
732
- "epoch": 5.73,
733
- "eval_loss": 1.424834132194519,
734
- "eval_runtime": 44.2677,
735
- "eval_samples_per_second": 205.07,
736
- "eval_steps_per_second": 12.831,
737
- "step": 26000
738
- },
739
- {
740
- "epoch": 5.84,
741
- "learning_rate": 1.1169666666666667e-05,
742
- "loss": 1.3061,
743
- "step": 26500
744
- },
745
- {
746
- "epoch": 5.84,
747
- "eval_loss": 1.4215062856674194,
748
- "eval_runtime": 44.2542,
749
- "eval_samples_per_second": 205.133,
750
- "eval_steps_per_second": 12.835,
751
- "step": 26500
752
- },
753
- {
754
- "epoch": 5.95,
755
- "learning_rate": 1.1003000000000001e-05,
756
- "loss": 1.3083,
757
- "step": 27000
758
- },
759
- {
760
- "epoch": 5.95,
761
- "eval_loss": 1.4195761680603027,
762
- "eval_runtime": 44.381,
763
- "eval_samples_per_second": 204.547,
764
- "eval_steps_per_second": 12.798,
765
- "step": 27000
766
- },
767
- {
768
- "epoch": 6.06,
769
- "learning_rate": 1.0836333333333333e-05,
770
- "loss": 1.2689,
771
- "step": 27500
772
- },
773
- {
774
- "epoch": 6.06,
775
- "eval_loss": 1.4157874584197998,
776
- "eval_runtime": 44.2987,
777
- "eval_samples_per_second": 204.927,
778
- "eval_steps_per_second": 12.822,
779
- "step": 27500
780
- },
781
- {
782
- "epoch": 6.17,
783
- "learning_rate": 1.0669666666666667e-05,
784
- "loss": 1.2813,
785
- "step": 28000
786
- },
787
- {
788
- "epoch": 6.17,
789
- "eval_loss": 1.4157196283340454,
790
- "eval_runtime": 44.6759,
791
- "eval_samples_per_second": 203.197,
792
- "eval_steps_per_second": 12.714,
793
- "step": 28000
794
- },
795
- {
796
- "epoch": 6.28,
797
- "learning_rate": 1.0503333333333335e-05,
798
- "loss": 1.2863,
799
- "step": 28500
800
- },
801
- {
802
- "epoch": 6.28,
803
- "eval_loss": 1.41006338596344,
804
- "eval_runtime": 44.666,
805
- "eval_samples_per_second": 203.242,
806
- "eval_steps_per_second": 12.717,
807
- "step": 28500
808
- },
809
- {
810
- "epoch": 6.39,
811
- "learning_rate": 1.0336666666666669e-05,
812
- "loss": 1.2668,
813
- "step": 29000
814
- },
815
- {
816
- "epoch": 6.39,
817
- "eval_loss": 1.403244972229004,
818
- "eval_runtime": 44.6624,
819
- "eval_samples_per_second": 203.258,
820
- "eval_steps_per_second": 12.718,
821
- "step": 29000
822
- },
823
- {
824
- "epoch": 6.5,
825
- "learning_rate": 1.017e-05,
826
- "loss": 1.2631,
827
- "step": 29500
828
- },
829
- {
830
- "epoch": 6.5,
831
- "eval_loss": 1.4050610065460205,
832
- "eval_runtime": 44.987,
833
- "eval_samples_per_second": 201.792,
834
- "eval_steps_per_second": 12.626,
835
- "step": 29500
836
- },
837
- {
838
- "epoch": 6.61,
839
- "learning_rate": 1.0003333333333333e-05,
840
- "loss": 1.2543,
841
- "step": 30000
842
- },
843
- {
844
- "epoch": 6.61,
845
- "eval_loss": 1.3968815803527832,
846
- "eval_runtime": 45.3856,
847
- "eval_samples_per_second": 200.019,
848
- "eval_steps_per_second": 12.515,
849
- "step": 30000
850
- },
851
- {
852
- "epoch": 6.72,
853
- "learning_rate": 9.836666666666668e-06,
854
- "loss": 1.2552,
855
- "step": 30500
856
- },
857
- {
858
- "epoch": 6.72,
859
- "eval_loss": 1.3993921279907227,
860
- "eval_runtime": 44.6433,
861
- "eval_samples_per_second": 203.345,
862
- "eval_steps_per_second": 12.723,
863
- "step": 30500
864
- },
865
- {
866
- "epoch": 6.83,
867
- "learning_rate": 9.67e-06,
868
- "loss": 1.2626,
869
- "step": 31000
870
- },
871
- {
872
- "epoch": 6.83,
873
- "eval_loss": 1.387181043624878,
874
- "eval_runtime": 44.4948,
875
- "eval_samples_per_second": 204.024,
876
- "eval_steps_per_second": 12.766,
877
- "step": 31000
878
- },
879
- {
880
- "epoch": 6.94,
881
- "learning_rate": 9.503333333333334e-06,
882
- "loss": 1.2396,
883
- "step": 31500
884
- },
885
- {
886
- "epoch": 6.94,
887
- "eval_loss": 1.3914097547531128,
888
- "eval_runtime": 44.936,
889
- "eval_samples_per_second": 202.021,
890
- "eval_steps_per_second": 12.64,
891
- "step": 31500
892
- },
893
- {
894
- "epoch": 7.05,
895
- "learning_rate": 9.336666666666666e-06,
896
- "loss": 1.2419,
897
- "step": 32000
898
- },
899
- {
900
- "epoch": 7.05,
901
- "eval_loss": 1.3853869438171387,
902
- "eval_runtime": 44.4747,
903
- "eval_samples_per_second": 204.116,
904
- "eval_steps_per_second": 12.771,
905
- "step": 32000
906
- },
907
- {
908
- "epoch": 7.16,
909
- "learning_rate": 9.17e-06,
910
- "loss": 1.2421,
911
- "step": 32500
912
- },
913
- {
914
- "epoch": 7.16,
915
- "eval_loss": 1.3801844120025635,
916
- "eval_runtime": 44.8621,
917
- "eval_samples_per_second": 202.353,
918
- "eval_steps_per_second": 12.661,
919
- "step": 32500
920
- },
921
- {
922
- "epoch": 7.27,
923
- "learning_rate": 9.003333333333334e-06,
924
- "loss": 1.2177,
925
- "step": 33000
926
- },
927
- {
928
- "epoch": 7.27,
929
- "eval_loss": 1.380096673965454,
930
- "eval_runtime": 45.1593,
931
- "eval_samples_per_second": 201.022,
932
- "eval_steps_per_second": 12.578,
933
- "step": 33000
934
- },
935
- {
936
- "epoch": 7.38,
937
- "learning_rate": 8.836666666666668e-06,
938
- "loss": 1.2292,
939
- "step": 33500
940
- },
941
- {
942
- "epoch": 7.38,
943
- "eval_loss": 1.3730745315551758,
944
- "eval_runtime": 44.6177,
945
- "eval_samples_per_second": 203.462,
946
- "eval_steps_per_second": 12.73,
947
- "step": 33500
948
- },
949
- {
950
- "epoch": 7.49,
951
- "learning_rate": 8.67e-06,
952
- "loss": 1.2131,
953
- "step": 34000
954
- },
955
- {
956
- "epoch": 7.49,
957
- "eval_loss": 1.370631217956543,
958
- "eval_runtime": 45.0076,
959
- "eval_samples_per_second": 201.699,
960
- "eval_steps_per_second": 12.62,
961
- "step": 34000
962
- },
963
- {
964
- "epoch": 7.6,
965
- "learning_rate": 8.503666666666668e-06,
966
- "loss": 1.227,
967
- "step": 34500
968
- },
969
- {
970
- "epoch": 7.6,
971
- "eval_loss": 1.3661019802093506,
972
- "eval_runtime": 44.9507,
973
- "eval_samples_per_second": 201.955,
974
- "eval_steps_per_second": 12.636,
975
- "step": 34500
976
- },
977
- {
978
- "epoch": 7.71,
979
- "learning_rate": 8.337e-06,
980
- "loss": 1.2158,
981
- "step": 35000
982
- },
983
- {
984
- "epoch": 7.71,
985
- "eval_loss": 1.3633307218551636,
986
- "eval_runtime": 44.5235,
987
- "eval_samples_per_second": 203.892,
988
- "eval_steps_per_second": 12.757,
989
- "step": 35000
990
- },
991
- {
992
- "epoch": 7.82,
993
- "learning_rate": 8.170333333333334e-06,
994
- "loss": 1.2194,
995
- "step": 35500
996
- },
997
- {
998
- "epoch": 7.82,
999
- "eval_loss": 1.3616106510162354,
1000
- "eval_runtime": 45.4373,
1001
- "eval_samples_per_second": 199.792,
1002
- "eval_steps_per_second": 12.501,
1003
- "step": 35500
1004
- },
1005
- {
1006
- "epoch": 7.93,
1007
- "learning_rate": 8.004e-06,
1008
- "loss": 1.2261,
1009
- "step": 36000
1010
- },
1011
- {
1012
- "epoch": 7.93,
1013
- "eval_loss": 1.357163429260254,
1014
- "eval_runtime": 45.4325,
1015
- "eval_samples_per_second": 199.813,
1016
- "eval_steps_per_second": 12.502,
1017
- "step": 36000
1018
- },
1019
- {
1020
- "epoch": 8.04,
1021
- "learning_rate": 7.837666666666666e-06,
1022
- "loss": 1.218,
1023
- "step": 36500
1024
- },
1025
- {
1026
- "epoch": 8.04,
1027
- "eval_loss": 1.3611406087875366,
1028
- "eval_runtime": 44.5114,
1029
- "eval_samples_per_second": 203.948,
1030
- "eval_steps_per_second": 12.761,
1031
- "step": 36500
1032
- },
1033
- {
1034
- "epoch": 8.15,
1035
- "learning_rate": 7.671e-06,
1036
- "loss": 1.1658,
1037
- "step": 37000
1038
- },
1039
- {
1040
- "epoch": 8.15,
1041
- "eval_loss": 1.356821060180664,
1042
- "eval_runtime": 44.6544,
1043
- "eval_samples_per_second": 203.295,
1044
- "eval_steps_per_second": 12.72,
1045
- "step": 37000
1046
- },
1047
- {
1048
- "epoch": 8.26,
1049
- "learning_rate": 7.504333333333334e-06,
1050
- "loss": 1.1652,
1051
- "step": 37500
1052
- },
1053
- {
1054
- "epoch": 8.26,
1055
- "eval_loss": 1.35381281375885,
1056
- "eval_runtime": 45.0638,
1057
- "eval_samples_per_second": 201.448,
1058
- "eval_steps_per_second": 12.604,
1059
- "step": 37500
1060
- },
1061
- {
1062
- "epoch": 8.37,
1063
- "learning_rate": 7.3376666666666675e-06,
1064
- "loss": 1.1686,
1065
- "step": 38000
1066
- },
1067
- {
1068
- "epoch": 8.37,
1069
- "eval_loss": 1.3533384799957275,
1070
- "eval_runtime": 44.2795,
1071
- "eval_samples_per_second": 205.016,
1072
- "eval_steps_per_second": 12.828,
1073
- "step": 38000
1074
- },
1075
- {
1076
- "epoch": 8.48,
1077
- "learning_rate": 7.1710000000000005e-06,
1078
- "loss": 1.1666,
1079
- "step": 38500
1080
- },
1081
- {
1082
- "epoch": 8.48,
1083
- "eval_loss": 1.3513332605361938,
1084
- "eval_runtime": 44.2333,
1085
- "eval_samples_per_second": 205.23,
1086
- "eval_steps_per_second": 12.841,
1087
- "step": 38500
1088
- },
1089
- {
1090
- "epoch": 8.59,
1091
- "learning_rate": 7.004333333333334e-06,
1092
- "loss": 1.1827,
1093
- "step": 39000
1094
- },
1095
- {
1096
- "epoch": 8.59,
1097
- "eval_loss": 1.3496123552322388,
1098
- "eval_runtime": 44.8817,
1099
- "eval_samples_per_second": 202.265,
1100
- "eval_steps_per_second": 12.655,
1101
- "step": 39000
1102
- },
1103
- {
1104
- "epoch": 8.7,
1105
- "learning_rate": 6.837666666666667e-06,
1106
- "loss": 1.1643,
1107
- "step": 39500
1108
- },
1109
- {
1110
- "epoch": 8.7,
1111
- "eval_loss": 1.3474962711334229,
1112
- "eval_runtime": 44.7028,
1113
- "eval_samples_per_second": 203.074,
1114
- "eval_steps_per_second": 12.706,
1115
- "step": 39500
1116
- },
1117
- {
1118
- "epoch": 8.81,
1119
- "learning_rate": 6.671000000000001e-06,
1120
- "loss": 1.1651,
1121
- "step": 40000
1122
- },
1123
- {
1124
- "epoch": 8.81,
1125
- "eval_loss": 1.3451054096221924,
1126
- "eval_runtime": 44.3384,
1127
- "eval_samples_per_second": 204.743,
1128
- "eval_steps_per_second": 12.811,
1129
- "step": 40000
1130
- },
1131
- {
1132
- "epoch": 8.92,
1133
- "learning_rate": 6.504333333333334e-06,
1134
- "loss": 1.1696,
1135
- "step": 40500
1136
- },
1137
- {
1138
- "epoch": 8.92,
1139
- "eval_loss": 1.3412760496139526,
1140
- "eval_runtime": 44.0851,
1141
- "eval_samples_per_second": 205.92,
1142
- "eval_steps_per_second": 12.884,
1143
- "step": 40500
1144
- },
1145
- {
1146
- "epoch": 9.03,
1147
- "learning_rate": 6.337666666666668e-06,
1148
- "loss": 1.1582,
1149
- "step": 41000
1150
- },
1151
- {
1152
- "epoch": 9.03,
1153
- "eval_loss": 1.3448957204818726,
1154
- "eval_runtime": 43.2139,
1155
- "eval_samples_per_second": 210.071,
1156
- "eval_steps_per_second": 13.144,
1157
- "step": 41000
1158
- },
1159
- {
1160
- "epoch": 9.14,
1161
- "learning_rate": 6.171000000000001e-06,
1162
- "loss": 1.152,
1163
- "step": 41500
1164
- },
1165
- {
1166
- "epoch": 9.14,
1167
- "eval_loss": 1.3436229228973389,
1168
- "eval_runtime": 43.5301,
1169
- "eval_samples_per_second": 208.545,
1170
- "eval_steps_per_second": 13.048,
1171
- "step": 41500
1172
- },
1173
- {
1174
- "epoch": 9.25,
1175
- "learning_rate": 6.004666666666668e-06,
1176
- "loss": 1.1177,
1177
- "step": 42000
1178
- },
1179
- {
1180
- "epoch": 9.25,
1181
- "eval_loss": 1.3422951698303223,
1182
- "eval_runtime": 43.2428,
1183
- "eval_samples_per_second": 209.931,
1184
- "eval_steps_per_second": 13.135,
1185
- "step": 42000
1186
- },
1187
- {
1188
- "epoch": 9.36,
1189
- "learning_rate": 5.838000000000001e-06,
1190
- "loss": 1.1195,
1191
- "step": 42500
1192
- },
1193
- {
1194
- "epoch": 9.36,
1195
- "eval_loss": 1.340394139289856,
1196
- "eval_runtime": 43.8891,
1197
- "eval_samples_per_second": 206.839,
1198
- "eval_steps_per_second": 12.942,
1199
- "step": 42500
1200
- },
1201
- {
1202
- "epoch": 9.47,
1203
- "learning_rate": 5.6713333333333345e-06,
1204
- "loss": 1.1389,
1205
- "step": 43000
1206
- },
1207
- {
1208
- "epoch": 9.47,
1209
- "eval_loss": 1.3390538692474365,
1210
- "eval_runtime": 43.1572,
1211
- "eval_samples_per_second": 210.347,
1212
- "eval_steps_per_second": 13.161,
1213
- "step": 43000
1214
- },
1215
- {
1216
- "epoch": 9.58,
1217
- "learning_rate": 5.5046666666666674e-06,
1218
- "loss": 1.1316,
1219
- "step": 43500
1220
- },
1221
- {
1222
- "epoch": 9.58,
1223
- "eval_loss": 1.3349236249923706,
1224
- "eval_runtime": 43.9132,
1225
- "eval_samples_per_second": 206.726,
1226
- "eval_steps_per_second": 12.935,
1227
- "step": 43500
1228
- },
1229
- {
1230
- "epoch": 9.69,
1231
- "learning_rate": 5.3383333333333345e-06,
1232
- "loss": 1.1376,
1233
- "step": 44000
1234
- },
1235
- {
1236
- "epoch": 9.69,
1237
- "eval_loss": 1.3336918354034424,
1238
- "eval_runtime": 43.0007,
1239
- "eval_samples_per_second": 211.113,
1240
- "eval_steps_per_second": 13.209,
1241
- "step": 44000
1242
- },
1243
- {
1244
- "epoch": 9.8,
1245
- "learning_rate": 5.171666666666667e-06,
1246
- "loss": 1.1407,
1247
- "step": 44500
1248
- },
1249
- {
1250
- "epoch": 9.8,
1251
- "eval_loss": 1.33429753780365,
1252
- "eval_runtime": 43.2578,
1253
- "eval_samples_per_second": 209.858,
1254
- "eval_steps_per_second": 13.131,
1255
- "step": 44500
1256
- },
1257
- {
1258
- "epoch": 9.91,
1259
- "learning_rate": 5.0053333333333344e-06,
1260
- "loss": 1.1149,
1261
- "step": 45000
1262
- },
1263
- {
1264
- "epoch": 9.91,
1265
- "eval_bleu": 29.77431864288693,
1266
- "eval_loss": 1.3337864875793457,
1267
- "eval_runtime": 1068.0893,
1268
- "eval_samples_per_second": 8.499,
1269
- "eval_steps_per_second": 0.532,
1270
- "step": 45000
1271
- },
1272
- {
1273
- "epoch": 10.02,
1274
- "learning_rate": 4.839000000000001e-06,
1275
- "loss": 1.1168,
1276
- "step": 45500
1277
- },
1278
- {
1279
- "epoch": 10.02,
1280
- "eval_bleu": 28.66210620849173,
1281
- "eval_loss": 1.3359324932098389,
1282
- "eval_runtime": 1076.242,
1283
- "eval_samples_per_second": 8.435,
1284
- "eval_steps_per_second": 0.528,
1285
- "step": 45500
1286
- },
1287
- {
1288
- "epoch": 10.13,
1289
- "learning_rate": 4.6723333333333335e-06,
1290
- "loss": 1.1081,
1291
- "step": 46000
1292
- },
1293
- {
1294
- "epoch": 10.13,
1295
- "eval_bleu": 29.761643801576938,
1296
- "eval_loss": 1.3331745862960815,
1297
- "eval_runtime": 1056.7209,
1298
- "eval_samples_per_second": 8.591,
1299
- "eval_steps_per_second": 0.538,
1300
- "step": 46000
1301
- },
1302
- {
1303
- "epoch": 10.24,
1304
- "learning_rate": 4.505666666666667e-06,
1305
- "loss": 1.1125,
1306
- "step": 46500
1307
- },
1308
- {
1309
- "epoch": 10.24,
1310
- "eval_bleu": 28.907829338126366,
1311
- "eval_loss": 1.3308771848678589,
1312
- "eval_runtime": 1080.0134,
1313
- "eval_samples_per_second": 8.405,
1314
- "eval_steps_per_second": 0.526,
1315
- "step": 46500
1316
- }
1317
- ],
1318
- "logging_steps": 500,
1319
- "max_steps": 60000,
1320
- "num_train_epochs": 14,
1321
- "save_steps": 500,
1322
- "total_flos": 4.712260602062438e+16,
1323
- "trial_name": null,
1324
- "trial_params": null
1325
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-46500/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c221b6456f284bee19a1f4343a680b9b37a4aa94da144f788d5bc7ff04bf7c05
3
- size 4664