damand2061 committed on
Commit
1ee98b4
1 Parent(s): 89b4d93

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ tags:
4
+ - autotrain
5
+ - text2text-generation
6
+ base_model: Helsinki-NLP/opus-mt-id-en
7
+ widget:
8
+ - text: "I love AutoTrain"
9
+ datasets:
10
+ - jakartaresearch/inglish
11
+ ---
12
+
13
+ # Model Trained Using AutoTrain
14
+
15
+ - Problem type: Seq2Seq
16
+
17
+ ## Validation Metrics
18
+ loss: 0.59325110912323
19
+
20
+ rouge1: 87.5985
21
+
22
+ rouge2: 74.3003
23
+
24
+ rougeL: 86.0508
25
+
26
+ rougeLsum: 86.0787
27
+
28
+ gen_len: 26.3516
29
+
30
+ runtime: 2119.396
31
+
32
+ samples_per_second: 1.369
33
+
34
+ steps_per_second: 0.343
35
+
36
+ epoch: 3.0
checkpoint-13053/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Helsinki-NLP/opus-mt-id-en",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "swish",
6
+ "add_bias_logits": false,
7
+ "add_final_layer_norm": false,
8
+ "architectures": [
9
+ "MarianMTModel"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 0,
13
+ "classif_dropout": 0.0,
14
+ "classifier_dropout": 0.0,
15
+ "d_model": 512,
16
+ "decoder_attention_heads": 8,
17
+ "decoder_ffn_dim": 2048,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_start_token_id": 54795,
21
+ "decoder_vocab_size": 54796,
22
+ "dropout": 0.1,
23
+ "encoder_attention_heads": 8,
24
+ "encoder_ffn_dim": 2048,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 6,
27
+ "eos_token_id": 0,
28
+ "forced_eos_token_id": 0,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 54795,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.45.0",
54
+ "use_cache": false,
55
+ "vocab_size": 54796
56
+ }
checkpoint-13053/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 54795
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 54795,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 6,
13
+ "pad_token_id": 54795,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.45.0"
16
+ }
checkpoint-13053/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:035031492308c1194f63ad8f86e4efdfac16883fea241e6f258a665eefc7df44
3
+ size 289024432
checkpoint-13053/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f24d81b21d29db64e894dc43f78546f631e6cb09428cec27f96845388356ea1b
3
+ size 577756858
checkpoint-13053/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eaa780b7a09ca669d4544214a7c043f89bd755db1731f14e59309c144d1fec07
3
+ size 13990
checkpoint-13053/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81603a74ebf2e6244facfd08eaae3e3fa41ccb6beca5c72c17e37c98244e6ec9
3
+ size 1064
checkpoint-13053/source.spm ADDED
Binary file (801 kB). View file
 
checkpoint-13053/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
checkpoint-13053/target.spm ADDED
Binary file (796 kB). View file
 
checkpoint-13053/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "54795": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "eos_token": "</s>",
30
+ "model_max_length": 512,
31
+ "pad_token": "<pad>",
32
+ "separate_vocabs": false,
33
+ "source_lang": "id",
34
+ "sp_model_kwargs": {},
35
+ "target_lang": "en",
36
+ "tokenizer_class": "MarianTokenizer",
37
+ "unk_token": "<unk>"
38
+ }
checkpoint-13053/trainer_state.json ADDED
@@ -0,0 +1,3735 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.59325110912323,
3
+ "best_model_checkpoint": "autotrain-i56bj-d90g7/checkpoint-13053",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 13053,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005745805561939784,
13
+ "grad_norm": 18.06277084350586,
14
+ "learning_rate": 9.571209800918836e-07,
15
+ "loss": 1.3779,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.011491611123879569,
20
+ "grad_norm": 15.910608291625977,
21
+ "learning_rate": 1.914241960183767e-06,
22
+ "loss": 1.1469,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.01723741668581935,
27
+ "grad_norm": 23.948566436767578,
28
+ "learning_rate": 2.871362940275651e-06,
29
+ "loss": 1.2746,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.022983222247759137,
34
+ "grad_norm": 25.929588317871094,
35
+ "learning_rate": 3.828483920367534e-06,
36
+ "loss": 1.2256,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.02872902780969892,
41
+ "grad_norm": 16.358503341674805,
42
+ "learning_rate": 4.785604900459419e-06,
43
+ "loss": 1.1802,
44
+ "step": 125
45
+ },
46
+ {
47
+ "epoch": 0.0344748333716387,
48
+ "grad_norm": 11.254109382629395,
49
+ "learning_rate": 5.742725880551302e-06,
50
+ "loss": 1.0634,
51
+ "step": 150
52
+ },
53
+ {
54
+ "epoch": 0.04022063893357849,
55
+ "grad_norm": 28.07424545288086,
56
+ "learning_rate": 6.699846860643186e-06,
57
+ "loss": 1.315,
58
+ "step": 175
59
+ },
60
+ {
61
+ "epoch": 0.045966444495518274,
62
+ "grad_norm": 16.383939743041992,
63
+ "learning_rate": 7.656967840735069e-06,
64
+ "loss": 1.17,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 0.051712250057458053,
69
+ "grad_norm": 22.179977416992188,
70
+ "learning_rate": 8.614088820826952e-06,
71
+ "loss": 1.0369,
72
+ "step": 225
73
+ },
74
+ {
75
+ "epoch": 0.05745805561939784,
76
+ "grad_norm": 13.887266159057617,
77
+ "learning_rate": 9.571209800918838e-06,
78
+ "loss": 1.0267,
79
+ "step": 250
80
+ },
81
+ {
82
+ "epoch": 0.06320386118133763,
83
+ "grad_norm": 19.803924560546875,
84
+ "learning_rate": 1.052833078101072e-05,
85
+ "loss": 1.0433,
86
+ "step": 275
87
+ },
88
+ {
89
+ "epoch": 0.0689496667432774,
90
+ "grad_norm": 13.864017486572266,
91
+ "learning_rate": 1.1485451761102605e-05,
92
+ "loss": 1.0669,
93
+ "step": 300
94
+ },
95
+ {
96
+ "epoch": 0.0746954723052172,
97
+ "grad_norm": 8.618718147277832,
98
+ "learning_rate": 1.2442572741194487e-05,
99
+ "loss": 1.0531,
100
+ "step": 325
101
+ },
102
+ {
103
+ "epoch": 0.08044127786715698,
104
+ "grad_norm": 13.573511123657227,
105
+ "learning_rate": 1.3399693721286372e-05,
106
+ "loss": 1.084,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 0.08618708342909676,
111
+ "grad_norm": 15.934146881103516,
112
+ "learning_rate": 1.4356814701378254e-05,
113
+ "loss": 1.0139,
114
+ "step": 375
115
+ },
116
+ {
117
+ "epoch": 0.09193288899103655,
118
+ "grad_norm": 17.006610870361328,
119
+ "learning_rate": 1.5313935681470137e-05,
120
+ "loss": 1.0439,
121
+ "step": 400
122
+ },
123
+ {
124
+ "epoch": 0.09767869455297633,
125
+ "grad_norm": 15.182875633239746,
126
+ "learning_rate": 1.6271056661562023e-05,
127
+ "loss": 1.1166,
128
+ "step": 425
129
+ },
130
+ {
131
+ "epoch": 0.10342450011491611,
132
+ "grad_norm": 8.765186309814453,
133
+ "learning_rate": 1.7228177641653905e-05,
134
+ "loss": 0.9745,
135
+ "step": 450
136
+ },
137
+ {
138
+ "epoch": 0.1091703056768559,
139
+ "grad_norm": 23.071510314941406,
140
+ "learning_rate": 1.818529862174579e-05,
141
+ "loss": 1.2088,
142
+ "step": 475
143
+ },
144
+ {
145
+ "epoch": 0.11491611123879568,
146
+ "grad_norm": 16.805238723754883,
147
+ "learning_rate": 1.9142419601837675e-05,
148
+ "loss": 0.9662,
149
+ "step": 500
150
+ },
151
+ {
152
+ "epoch": 0.12066191680073546,
153
+ "grad_norm": 16.67304039001465,
154
+ "learning_rate": 2.0099540581929557e-05,
155
+ "loss": 1.068,
156
+ "step": 525
157
+ },
158
+ {
159
+ "epoch": 0.12640772236267525,
160
+ "grad_norm": 20.331649780273438,
161
+ "learning_rate": 2.105666156202144e-05,
162
+ "loss": 1.0481,
163
+ "step": 550
164
+ },
165
+ {
166
+ "epoch": 0.13215352792461504,
167
+ "grad_norm": 18.258352279663086,
168
+ "learning_rate": 2.2013782542113324e-05,
169
+ "loss": 0.9451,
170
+ "step": 575
171
+ },
172
+ {
173
+ "epoch": 0.1378993334865548,
174
+ "grad_norm": 43.41960906982422,
175
+ "learning_rate": 2.297090352220521e-05,
176
+ "loss": 1.2434,
177
+ "step": 600
178
+ },
179
+ {
180
+ "epoch": 0.1436451390484946,
181
+ "grad_norm": 14.547021865844727,
182
+ "learning_rate": 2.392802450229709e-05,
183
+ "loss": 0.884,
184
+ "step": 625
185
+ },
186
+ {
187
+ "epoch": 0.1493909446104344,
188
+ "grad_norm": 19.18711280822754,
189
+ "learning_rate": 2.4885145482388973e-05,
190
+ "loss": 1.1211,
191
+ "step": 650
192
+ },
193
+ {
194
+ "epoch": 0.15513675017237416,
195
+ "grad_norm": 12.683270454406738,
196
+ "learning_rate": 2.584226646248086e-05,
197
+ "loss": 1.1419,
198
+ "step": 675
199
+ },
200
+ {
201
+ "epoch": 0.16088255573431395,
202
+ "grad_norm": 12.683286666870117,
203
+ "learning_rate": 2.6799387442572744e-05,
204
+ "loss": 0.9859,
205
+ "step": 700
206
+ },
207
+ {
208
+ "epoch": 0.16662836129625375,
209
+ "grad_norm": 17.171892166137695,
210
+ "learning_rate": 2.775650842266463e-05,
211
+ "loss": 1.1642,
212
+ "step": 725
213
+ },
214
+ {
215
+ "epoch": 0.1723741668581935,
216
+ "grad_norm": 16.442211151123047,
217
+ "learning_rate": 2.8713629402756508e-05,
218
+ "loss": 1.116,
219
+ "step": 750
220
+ },
221
+ {
222
+ "epoch": 0.1781199724201333,
223
+ "grad_norm": 16.613231658935547,
224
+ "learning_rate": 2.9670750382848396e-05,
225
+ "loss": 0.882,
226
+ "step": 775
227
+ },
228
+ {
229
+ "epoch": 0.1838657779820731,
230
+ "grad_norm": 18.972503662109375,
231
+ "learning_rate": 3.0627871362940275e-05,
232
+ "loss": 0.9289,
233
+ "step": 800
234
+ },
235
+ {
236
+ "epoch": 0.18961158354401286,
237
+ "grad_norm": 12.045616149902344,
238
+ "learning_rate": 3.158499234303216e-05,
239
+ "loss": 1.0779,
240
+ "step": 825
241
+ },
242
+ {
243
+ "epoch": 0.19535738910595266,
244
+ "grad_norm": 10.648724555969238,
245
+ "learning_rate": 3.2542113323124045e-05,
246
+ "loss": 1.0184,
247
+ "step": 850
248
+ },
249
+ {
250
+ "epoch": 0.20110319466789245,
251
+ "grad_norm": 12.485301971435547,
252
+ "learning_rate": 3.349923430321593e-05,
253
+ "loss": 0.9873,
254
+ "step": 875
255
+ },
256
+ {
257
+ "epoch": 0.20684900022983221,
258
+ "grad_norm": 23.347667694091797,
259
+ "learning_rate": 3.445635528330781e-05,
260
+ "loss": 1.1136,
261
+ "step": 900
262
+ },
263
+ {
264
+ "epoch": 0.212594805791772,
265
+ "grad_norm": 22.95267105102539,
266
+ "learning_rate": 3.54134762633997e-05,
267
+ "loss": 0.9551,
268
+ "step": 925
269
+ },
270
+ {
271
+ "epoch": 0.2183406113537118,
272
+ "grad_norm": 17.17110824584961,
273
+ "learning_rate": 3.637059724349158e-05,
274
+ "loss": 1.1647,
275
+ "step": 950
276
+ },
277
+ {
278
+ "epoch": 0.22408641691565157,
279
+ "grad_norm": 11.29631519317627,
280
+ "learning_rate": 3.732771822358346e-05,
281
+ "loss": 1.0254,
282
+ "step": 975
283
+ },
284
+ {
285
+ "epoch": 0.22983222247759136,
286
+ "grad_norm": 18.52350425720215,
287
+ "learning_rate": 3.828483920367535e-05,
288
+ "loss": 0.9791,
289
+ "step": 1000
290
+ },
291
+ {
292
+ "epoch": 0.23557802803953115,
293
+ "grad_norm": 24.836822509765625,
294
+ "learning_rate": 3.924196018376723e-05,
295
+ "loss": 1.0647,
296
+ "step": 1025
297
+ },
298
+ {
299
+ "epoch": 0.24132383360147092,
300
+ "grad_norm": 12.823792457580566,
301
+ "learning_rate": 4.0199081163859114e-05,
302
+ "loss": 0.9141,
303
+ "step": 1050
304
+ },
305
+ {
306
+ "epoch": 0.2470696391634107,
307
+ "grad_norm": 11.589208602905273,
308
+ "learning_rate": 4.1156202143950996e-05,
309
+ "loss": 1.1041,
310
+ "step": 1075
311
+ },
312
+ {
313
+ "epoch": 0.2528154447253505,
314
+ "grad_norm": 21.15415382385254,
315
+ "learning_rate": 4.211332312404288e-05,
316
+ "loss": 1.0452,
317
+ "step": 1100
318
+ },
319
+ {
320
+ "epoch": 0.25856125028729027,
321
+ "grad_norm": 15.532148361206055,
322
+ "learning_rate": 4.3070444104134766e-05,
323
+ "loss": 0.9655,
324
+ "step": 1125
325
+ },
326
+ {
327
+ "epoch": 0.2643070558492301,
328
+ "grad_norm": 25.468048095703125,
329
+ "learning_rate": 4.402756508422665e-05,
330
+ "loss": 0.9519,
331
+ "step": 1150
332
+ },
333
+ {
334
+ "epoch": 0.27005286141116985,
335
+ "grad_norm": 15.610981941223145,
336
+ "learning_rate": 4.498468606431853e-05,
337
+ "loss": 1.0293,
338
+ "step": 1175
339
+ },
340
+ {
341
+ "epoch": 0.2757986669731096,
342
+ "grad_norm": 17.443798065185547,
343
+ "learning_rate": 4.594180704441042e-05,
344
+ "loss": 0.9748,
345
+ "step": 1200
346
+ },
347
+ {
348
+ "epoch": 0.28154447253504944,
349
+ "grad_norm": 9.4945650100708,
350
+ "learning_rate": 4.68989280245023e-05,
351
+ "loss": 1.0328,
352
+ "step": 1225
353
+ },
354
+ {
355
+ "epoch": 0.2872902780969892,
356
+ "grad_norm": 14.447216033935547,
357
+ "learning_rate": 4.785604900459418e-05,
358
+ "loss": 1.0917,
359
+ "step": 1250
360
+ },
361
+ {
362
+ "epoch": 0.29303608365892897,
363
+ "grad_norm": 15.632061958312988,
364
+ "learning_rate": 4.881316998468607e-05,
365
+ "loss": 1.1806,
366
+ "step": 1275
367
+ },
368
+ {
369
+ "epoch": 0.2987818892208688,
370
+ "grad_norm": 11.10702896118164,
371
+ "learning_rate": 4.9770290964777946e-05,
372
+ "loss": 0.9064,
373
+ "step": 1300
374
+ },
375
+ {
376
+ "epoch": 0.30452769478280856,
377
+ "grad_norm": 14.962434768676758,
378
+ "learning_rate": 4.991912828807355e-05,
379
+ "loss": 1.0403,
380
+ "step": 1325
381
+ },
382
+ {
383
+ "epoch": 0.3102735003447483,
384
+ "grad_norm": 7.605042934417725,
385
+ "learning_rate": 4.9812718140801915e-05,
386
+ "loss": 1.0427,
387
+ "step": 1350
388
+ },
389
+ {
390
+ "epoch": 0.31601930590668814,
391
+ "grad_norm": 15.476686477661133,
392
+ "learning_rate": 4.9706307993530265e-05,
393
+ "loss": 1.1261,
394
+ "step": 1375
395
+ },
396
+ {
397
+ "epoch": 0.3217651114686279,
398
+ "grad_norm": 15.57099723815918,
399
+ "learning_rate": 4.959989784625862e-05,
400
+ "loss": 1.0964,
401
+ "step": 1400
402
+ },
403
+ {
404
+ "epoch": 0.32751091703056767,
405
+ "grad_norm": 19.747634887695312,
406
+ "learning_rate": 4.949348769898698e-05,
407
+ "loss": 1.1034,
408
+ "step": 1425
409
+ },
410
+ {
411
+ "epoch": 0.3332567225925075,
412
+ "grad_norm": 16.88288116455078,
413
+ "learning_rate": 4.9387077551715334e-05,
414
+ "loss": 1.0307,
415
+ "step": 1450
416
+ },
417
+ {
418
+ "epoch": 0.33900252815444726,
419
+ "grad_norm": 14.693648338317871,
420
+ "learning_rate": 4.928066740444369e-05,
421
+ "loss": 1.0929,
422
+ "step": 1475
423
+ },
424
+ {
425
+ "epoch": 0.344748333716387,
426
+ "grad_norm": 19.459819793701172,
427
+ "learning_rate": 4.917425725717205e-05,
428
+ "loss": 1.0146,
429
+ "step": 1500
430
+ },
431
+ {
432
+ "epoch": 0.35049413927832684,
433
+ "grad_norm": 13.217524528503418,
434
+ "learning_rate": 4.90678471099004e-05,
435
+ "loss": 1.1282,
436
+ "step": 1525
437
+ },
438
+ {
439
+ "epoch": 0.3562399448402666,
440
+ "grad_norm": 16.83650016784668,
441
+ "learning_rate": 4.896143696262876e-05,
442
+ "loss": 0.938,
443
+ "step": 1550
444
+ },
445
+ {
446
+ "epoch": 0.3619857504022064,
447
+ "grad_norm": 15.622553825378418,
448
+ "learning_rate": 4.885502681535711e-05,
449
+ "loss": 1.1303,
450
+ "step": 1575
451
+ },
452
+ {
453
+ "epoch": 0.3677315559641462,
454
+ "grad_norm": 18.172714233398438,
455
+ "learning_rate": 4.8748616668085473e-05,
456
+ "loss": 1.0853,
457
+ "step": 1600
458
+ },
459
+ {
460
+ "epoch": 0.37347736152608596,
461
+ "grad_norm": 25.019132614135742,
462
+ "learning_rate": 4.864220652081383e-05,
463
+ "loss": 1.012,
464
+ "step": 1625
465
+ },
466
+ {
467
+ "epoch": 0.3792231670880257,
468
+ "grad_norm": 21.372913360595703,
469
+ "learning_rate": 4.853579637354218e-05,
470
+ "loss": 1.1325,
471
+ "step": 1650
472
+ },
473
+ {
474
+ "epoch": 0.38496897264996555,
475
+ "grad_norm": 18.59086036682129,
476
+ "learning_rate": 4.842938622627054e-05,
477
+ "loss": 1.0089,
478
+ "step": 1675
479
+ },
480
+ {
481
+ "epoch": 0.3907147782119053,
482
+ "grad_norm": 10.926801681518555,
483
+ "learning_rate": 4.832297607899889e-05,
484
+ "loss": 0.8773,
485
+ "step": 1700
486
+ },
487
+ {
488
+ "epoch": 0.3964605837738451,
489
+ "grad_norm": 59.61377716064453,
490
+ "learning_rate": 4.8216565931727256e-05,
491
+ "loss": 1.0163,
492
+ "step": 1725
493
+ },
494
+ {
495
+ "epoch": 0.4022063893357849,
496
+ "grad_norm": 12.620614051818848,
497
+ "learning_rate": 4.8110155784455606e-05,
498
+ "loss": 1.1281,
499
+ "step": 1750
500
+ },
501
+ {
502
+ "epoch": 0.40795219489772466,
503
+ "grad_norm": 32.67042541503906,
504
+ "learning_rate": 4.800374563718397e-05,
505
+ "loss": 1.0002,
506
+ "step": 1775
507
+ },
508
+ {
509
+ "epoch": 0.41369800045966443,
510
+ "grad_norm": 20.0575008392334,
511
+ "learning_rate": 4.789733548991232e-05,
512
+ "loss": 1.0719,
513
+ "step": 1800
514
+ },
515
+ {
516
+ "epoch": 0.41944380602160425,
517
+ "grad_norm": 13.503483772277832,
518
+ "learning_rate": 4.7790925342640675e-05,
519
+ "loss": 1.1353,
520
+ "step": 1825
521
+ },
522
+ {
523
+ "epoch": 0.425189611583544,
524
+ "grad_norm": 21.6424560546875,
525
+ "learning_rate": 4.768451519536903e-05,
526
+ "loss": 0.9627,
527
+ "step": 1850
528
+ },
529
+ {
530
+ "epoch": 0.4309354171454838,
531
+ "grad_norm": 44.92792892456055,
532
+ "learning_rate": 4.757810504809739e-05,
533
+ "loss": 0.949,
534
+ "step": 1875
535
+ },
536
+ {
537
+ "epoch": 0.4366812227074236,
538
+ "grad_norm": 16.71977424621582,
539
+ "learning_rate": 4.7471694900825745e-05,
540
+ "loss": 0.9162,
541
+ "step": 1900
542
+ },
543
+ {
544
+ "epoch": 0.44242702826936336,
545
+ "grad_norm": 16.909381866455078,
546
+ "learning_rate": 4.73652847535541e-05,
547
+ "loss": 0.9035,
548
+ "step": 1925
549
+ },
550
+ {
551
+ "epoch": 0.44817283383130313,
552
+ "grad_norm": 17.321500778198242,
553
+ "learning_rate": 4.725887460628246e-05,
554
+ "loss": 1.0233,
555
+ "step": 1950
556
+ },
557
+ {
558
+ "epoch": 0.45391863939324295,
559
+ "grad_norm": 24.249675750732422,
560
+ "learning_rate": 4.7152464459010814e-05,
561
+ "loss": 1.1362,
562
+ "step": 1975
563
+ },
564
+ {
565
+ "epoch": 0.4596644449551827,
566
+ "grad_norm": 19.54537010192871,
567
+ "learning_rate": 4.704605431173917e-05,
568
+ "loss": 0.9597,
569
+ "step": 2000
570
+ },
571
+ {
572
+ "epoch": 0.4654102505171225,
573
+ "grad_norm": 10.297558784484863,
574
+ "learning_rate": 4.693964416446753e-05,
575
+ "loss": 1.0074,
576
+ "step": 2025
577
+ },
578
+ {
579
+ "epoch": 0.4711560560790623,
580
+ "grad_norm": 17.777502059936523,
581
+ "learning_rate": 4.6833234017195884e-05,
582
+ "loss": 1.0467,
583
+ "step": 2050
584
+ },
585
+ {
586
+ "epoch": 0.47690186164100207,
587
+ "grad_norm": 25.018428802490234,
588
+ "learning_rate": 4.6726823869924234e-05,
589
+ "loss": 1.0512,
590
+ "step": 2075
591
+ },
592
+ {
593
+ "epoch": 0.48264766720294183,
594
+ "grad_norm": 12.033584594726562,
595
+ "learning_rate": 4.66204137226526e-05,
596
+ "loss": 1.0026,
597
+ "step": 2100
598
+ },
599
+ {
600
+ "epoch": 0.48839347276488165,
601
+ "grad_norm": 11.690512657165527,
602
+ "learning_rate": 4.651400357538095e-05,
603
+ "loss": 1.0681,
604
+ "step": 2125
605
+ },
606
+ {
607
+ "epoch": 0.4941392783268214,
608
+ "grad_norm": 16.697847366333008,
609
+ "learning_rate": 4.640759342810931e-05,
610
+ "loss": 1.1947,
611
+ "step": 2150
612
+ },
613
+ {
614
+ "epoch": 0.4998850838887612,
615
+ "grad_norm": 16.905946731567383,
616
+ "learning_rate": 4.630118328083766e-05,
617
+ "loss": 1.0865,
618
+ "step": 2175
619
+ },
620
+ {
621
+ "epoch": 0.505630889450701,
622
+ "grad_norm": 12.390742301940918,
623
+ "learning_rate": 4.619477313356602e-05,
624
+ "loss": 1.0294,
625
+ "step": 2200
626
+ },
627
+ {
628
+ "epoch": 0.5113766950126408,
629
+ "grad_norm": 15.317625045776367,
630
+ "learning_rate": 4.608836298629437e-05,
631
+ "loss": 1.1887,
632
+ "step": 2225
633
+ },
634
+ {
635
+ "epoch": 0.5171225005745805,
636
+ "grad_norm": 15.924113273620605,
637
+ "learning_rate": 4.598195283902273e-05,
638
+ "loss": 1.0063,
639
+ "step": 2250
640
+ },
641
+ {
642
+ "epoch": 0.5228683061365204,
643
+ "grad_norm": 11.048434257507324,
644
+ "learning_rate": 4.5875542691751086e-05,
645
+ "loss": 0.886,
646
+ "step": 2275
647
+ },
648
+ {
649
+ "epoch": 0.5286141116984602,
650
+ "grad_norm": 16.680204391479492,
651
+ "learning_rate": 4.576913254447944e-05,
652
+ "loss": 1.1075,
653
+ "step": 2300
654
+ },
655
+ {
656
+ "epoch": 0.5343599172603999,
657
+ "grad_norm": 14.081778526306152,
658
+ "learning_rate": 4.56627223972078e-05,
659
+ "loss": 1.1626,
660
+ "step": 2325
661
+ },
662
+ {
663
+ "epoch": 0.5401057228223397,
664
+ "grad_norm": 25.836400985717773,
665
+ "learning_rate": 4.5556312249936155e-05,
666
+ "loss": 1.0299,
667
+ "step": 2350
668
+ },
669
+ {
670
+ "epoch": 0.5458515283842795,
671
+ "grad_norm": 14.950949668884277,
672
+ "learning_rate": 4.544990210266451e-05,
673
+ "loss": 1.0186,
674
+ "step": 2375
675
+ },
676
+ {
677
+ "epoch": 0.5515973339462192,
678
+ "grad_norm": 13.539344787597656,
679
+ "learning_rate": 4.534349195539287e-05,
680
+ "loss": 1.0529,
681
+ "step": 2400
682
+ },
683
+ {
684
+ "epoch": 0.5573431395081591,
685
+ "grad_norm": 11.866837501525879,
686
+ "learning_rate": 4.5237081808121225e-05,
687
+ "loss": 1.0269,
688
+ "step": 2425
689
+ },
690
+ {
691
+ "epoch": 0.5630889450700989,
692
+ "grad_norm": 23.597551345825195,
693
+ "learning_rate": 4.513067166084958e-05,
694
+ "loss": 1.0083,
695
+ "step": 2450
696
+ },
697
+ {
698
+ "epoch": 0.5688347506320386,
699
+ "grad_norm": 15.675888061523438,
700
+ "learning_rate": 4.502426151357794e-05,
701
+ "loss": 1.0386,
702
+ "step": 2475
703
+ },
704
+ {
705
+ "epoch": 0.5745805561939784,
706
+ "grad_norm": 12.158834457397461,
707
+ "learning_rate": 4.491785136630629e-05,
708
+ "loss": 1.1082,
709
+ "step": 2500
710
+ },
711
+ {
712
+ "epoch": 0.5803263617559182,
713
+ "grad_norm": 14.904141426086426,
714
+ "learning_rate": 4.481144121903465e-05,
715
+ "loss": 1.0783,
716
+ "step": 2525
717
+ },
718
+ {
719
+ "epoch": 0.5860721673178579,
720
+ "grad_norm": 29.646265029907227,
721
+ "learning_rate": 4.4705031071763e-05,
722
+ "loss": 1.0602,
723
+ "step": 2550
724
+ },
725
+ {
726
+ "epoch": 0.5918179728797978,
727
+ "grad_norm": 18.369321823120117,
728
+ "learning_rate": 4.4598620924491364e-05,
729
+ "loss": 0.9937,
730
+ "step": 2575
731
+ },
732
+ {
733
+ "epoch": 0.5975637784417376,
734
+ "grad_norm": 12.290249824523926,
735
+ "learning_rate": 4.4492210777219714e-05,
736
+ "loss": 0.9421,
737
+ "step": 2600
738
+ },
739
+ {
740
+ "epoch": 0.6033095840036773,
741
+ "grad_norm": 18.439208984375,
742
+ "learning_rate": 4.438580062994808e-05,
743
+ "loss": 1.0252,
744
+ "step": 2625
745
+ },
746
+ {
747
+ "epoch": 0.6090553895656171,
748
+ "grad_norm": 22.04486846923828,
749
+ "learning_rate": 4.4279390482676434e-05,
750
+ "loss": 0.8886,
751
+ "step": 2650
752
+ },
753
+ {
754
+ "epoch": 0.6148011951275569,
755
+ "grad_norm": 22.83307647705078,
756
+ "learning_rate": 4.4172980335404783e-05,
757
+ "loss": 0.9552,
758
+ "step": 2675
759
+ },
760
+ {
761
+ "epoch": 0.6205470006894966,
762
+ "grad_norm": 19.321020126342773,
763
+ "learning_rate": 4.406657018813315e-05,
764
+ "loss": 1.0092,
765
+ "step": 2700
766
+ },
767
+ {
768
+ "epoch": 0.6262928062514365,
769
+ "grad_norm": 14.00854778289795,
770
+ "learning_rate": 4.3960160040861496e-05,
771
+ "loss": 0.8749,
772
+ "step": 2725
773
+ },
774
+ {
775
+ "epoch": 0.6320386118133763,
776
+ "grad_norm": 7.2084269523620605,
777
+ "learning_rate": 4.385374989358986e-05,
778
+ "loss": 1.0578,
779
+ "step": 2750
780
+ },
781
+ {
782
+ "epoch": 0.637784417375316,
783
+ "grad_norm": 12.39623737335205,
784
+ "learning_rate": 4.374733974631821e-05,
785
+ "loss": 1.0612,
786
+ "step": 2775
787
+ },
788
+ {
789
+ "epoch": 0.6435302229372558,
790
+ "grad_norm": 26.27788734436035,
791
+ "learning_rate": 4.3640929599046566e-05,
792
+ "loss": 0.968,
793
+ "step": 2800
794
+ },
795
+ {
796
+ "epoch": 0.6492760284991956,
797
+ "grad_norm": 11.55069637298584,
798
+ "learning_rate": 4.353451945177492e-05,
799
+ "loss": 0.9956,
800
+ "step": 2825
801
+ },
802
+ {
803
+ "epoch": 0.6550218340611353,
804
+ "grad_norm": 17.346981048583984,
805
+ "learning_rate": 4.342810930450328e-05,
806
+ "loss": 0.984,
807
+ "step": 2850
808
+ },
809
+ {
810
+ "epoch": 0.6607676396230752,
811
+ "grad_norm": 11.784706115722656,
812
+ "learning_rate": 4.3321699157231636e-05,
813
+ "loss": 1.0254,
814
+ "step": 2875
815
+ },
816
+ {
817
+ "epoch": 0.666513445185015,
818
+ "grad_norm": 14.726810455322266,
819
+ "learning_rate": 4.321528900995999e-05,
820
+ "loss": 0.9698,
821
+ "step": 2900
822
+ },
823
+ {
824
+ "epoch": 0.6722592507469547,
825
+ "grad_norm": 18.386871337890625,
826
+ "learning_rate": 4.310887886268835e-05,
827
+ "loss": 0.9413,
828
+ "step": 2925
829
+ },
830
+ {
831
+ "epoch": 0.6780050563088945,
832
+ "grad_norm": 29.44155502319336,
833
+ "learning_rate": 4.3002468715416705e-05,
834
+ "loss": 1.0491,
835
+ "step": 2950
836
+ },
837
+ {
838
+ "epoch": 0.6837508618708343,
839
+ "grad_norm": 15.0762939453125,
840
+ "learning_rate": 4.289605856814506e-05,
841
+ "loss": 0.8545,
842
+ "step": 2975
843
+ },
844
+ {
845
+ "epoch": 0.689496667432774,
846
+ "grad_norm": 21.399852752685547,
847
+ "learning_rate": 4.278964842087342e-05,
848
+ "loss": 1.0221,
849
+ "step": 3000
850
+ },
851
+ {
852
+ "epoch": 0.6952424729947139,
853
+ "grad_norm": 20.22688865661621,
854
+ "learning_rate": 4.2683238273601775e-05,
855
+ "loss": 0.9472,
856
+ "step": 3025
857
+ },
858
+ {
859
+ "epoch": 0.7009882785566537,
860
+ "grad_norm": 21.75887680053711,
861
+ "learning_rate": 4.257682812633013e-05,
862
+ "loss": 1.1666,
863
+ "step": 3050
864
+ },
865
+ {
866
+ "epoch": 0.7067340841185934,
867
+ "grad_norm": 19.081144332885742,
868
+ "learning_rate": 4.247041797905849e-05,
869
+ "loss": 1.0754,
870
+ "step": 3075
871
+ },
872
+ {
873
+ "epoch": 0.7124798896805332,
874
+ "grad_norm": 11.539175987243652,
875
+ "learning_rate": 4.236400783178684e-05,
876
+ "loss": 0.9631,
877
+ "step": 3100
878
+ },
879
+ {
880
+ "epoch": 0.718225695242473,
881
+ "grad_norm": 19.64055824279785,
882
+ "learning_rate": 4.22575976845152e-05,
883
+ "loss": 1.0522,
884
+ "step": 3125
885
+ },
886
+ {
887
+ "epoch": 0.7239715008044127,
888
+ "grad_norm": 15.160033226013184,
889
+ "learning_rate": 4.215118753724355e-05,
890
+ "loss": 0.8811,
891
+ "step": 3150
892
+ },
893
+ {
894
+ "epoch": 0.7297173063663526,
895
+ "grad_norm": 12.39316177368164,
896
+ "learning_rate": 4.2044777389971914e-05,
897
+ "loss": 0.8472,
898
+ "step": 3175
899
+ },
900
+ {
901
+ "epoch": 0.7354631119282924,
902
+ "grad_norm": 7.085254669189453,
903
+ "learning_rate": 4.1938367242700264e-05,
904
+ "loss": 0.9831,
905
+ "step": 3200
906
+ },
907
+ {
908
+ "epoch": 0.7412089174902321,
909
+ "grad_norm": 15.343351364135742,
910
+ "learning_rate": 4.183195709542862e-05,
911
+ "loss": 1.084,
912
+ "step": 3225
913
+ },
914
+ {
915
+ "epoch": 0.7469547230521719,
916
+ "grad_norm": 11.71967601776123,
917
+ "learning_rate": 4.1725546948156977e-05,
918
+ "loss": 1.0677,
919
+ "step": 3250
920
+ },
921
+ {
922
+ "epoch": 0.7527005286141117,
923
+ "grad_norm": 14.376879692077637,
924
+ "learning_rate": 4.161913680088533e-05,
925
+ "loss": 0.9266,
926
+ "step": 3275
927
+ },
928
+ {
929
+ "epoch": 0.7584463341760515,
930
+ "grad_norm": 12.832955360412598,
931
+ "learning_rate": 4.151272665361369e-05,
932
+ "loss": 1.069,
933
+ "step": 3300
934
+ },
935
+ {
936
+ "epoch": 0.7641921397379913,
937
+ "grad_norm": 8.518636703491211,
938
+ "learning_rate": 4.1406316506342046e-05,
939
+ "loss": 1.0591,
940
+ "step": 3325
941
+ },
942
+ {
943
+ "epoch": 0.7699379452999311,
944
+ "grad_norm": 13.678732872009277,
945
+ "learning_rate": 4.12999063590704e-05,
946
+ "loss": 0.8587,
947
+ "step": 3350
948
+ },
949
+ {
950
+ "epoch": 0.7756837508618708,
951
+ "grad_norm": 16.9919376373291,
952
+ "learning_rate": 4.119349621179876e-05,
953
+ "loss": 1.0463,
954
+ "step": 3375
955
+ },
956
+ {
957
+ "epoch": 0.7814295564238106,
958
+ "grad_norm": 19.827529907226562,
959
+ "learning_rate": 4.1087086064527116e-05,
960
+ "loss": 1.0071,
961
+ "step": 3400
962
+ },
963
+ {
964
+ "epoch": 0.7871753619857504,
965
+ "grad_norm": 16.30826759338379,
966
+ "learning_rate": 4.098067591725547e-05,
967
+ "loss": 0.9613,
968
+ "step": 3425
969
+ },
970
+ {
971
+ "epoch": 0.7929211675476902,
972
+ "grad_norm": 19.365869522094727,
973
+ "learning_rate": 4.087426576998383e-05,
974
+ "loss": 0.8334,
975
+ "step": 3450
976
+ },
977
+ {
978
+ "epoch": 0.79866697310963,
979
+ "grad_norm": 10.79730224609375,
980
+ "learning_rate": 4.076785562271218e-05,
981
+ "loss": 0.9526,
982
+ "step": 3475
983
+ },
984
+ {
985
+ "epoch": 0.8044127786715698,
986
+ "grad_norm": 10.032505989074707,
987
+ "learning_rate": 4.066144547544054e-05,
988
+ "loss": 1.0273,
989
+ "step": 3500
990
+ },
991
+ {
992
+ "epoch": 0.8101585842335095,
993
+ "grad_norm": 15.712605476379395,
994
+ "learning_rate": 4.055503532816889e-05,
995
+ "loss": 1.0953,
996
+ "step": 3525
997
+ },
998
+ {
999
+ "epoch": 0.8159043897954493,
1000
+ "grad_norm": 18.788259506225586,
1001
+ "learning_rate": 4.0448625180897255e-05,
1002
+ "loss": 1.1593,
1003
+ "step": 3550
1004
+ },
1005
+ {
1006
+ "epoch": 0.8216501953573891,
1007
+ "grad_norm": 12.3018159866333,
1008
+ "learning_rate": 4.0342215033625605e-05,
1009
+ "loss": 0.943,
1010
+ "step": 3575
1011
+ },
1012
+ {
1013
+ "epoch": 0.8273960009193289,
1014
+ "grad_norm": 26.096878051757812,
1015
+ "learning_rate": 4.023580488635397e-05,
1016
+ "loss": 0.9589,
1017
+ "step": 3600
1018
+ },
1019
+ {
1020
+ "epoch": 0.8331418064812687,
1021
+ "grad_norm": 9.981711387634277,
1022
+ "learning_rate": 4.0129394739082324e-05,
1023
+ "loss": 0.9157,
1024
+ "step": 3625
1025
+ },
1026
+ {
1027
+ "epoch": 0.8388876120432085,
1028
+ "grad_norm": 19.37105941772461,
1029
+ "learning_rate": 4.0022984591810674e-05,
1030
+ "loss": 1.1807,
1031
+ "step": 3650
1032
+ },
1033
+ {
1034
+ "epoch": 0.8446334176051482,
1035
+ "grad_norm": 20.733781814575195,
1036
+ "learning_rate": 3.991657444453904e-05,
1037
+ "loss": 0.9934,
1038
+ "step": 3675
1039
+ },
1040
+ {
1041
+ "epoch": 0.850379223167088,
1042
+ "grad_norm": 27.081636428833008,
1043
+ "learning_rate": 3.981016429726739e-05,
1044
+ "loss": 1.1039,
1045
+ "step": 3700
1046
+ },
1047
+ {
1048
+ "epoch": 0.8561250287290278,
1049
+ "grad_norm": 20.710731506347656,
1050
+ "learning_rate": 3.970375414999575e-05,
1051
+ "loss": 1.0698,
1052
+ "step": 3725
1053
+ },
1054
+ {
1055
+ "epoch": 0.8618708342909676,
1056
+ "grad_norm": 12.305147171020508,
1057
+ "learning_rate": 3.95973440027241e-05,
1058
+ "loss": 0.9086,
1059
+ "step": 3750
1060
+ },
1061
+ {
1062
+ "epoch": 0.8676166398529074,
1063
+ "grad_norm": 8.804189682006836,
1064
+ "learning_rate": 3.949093385545246e-05,
1065
+ "loss": 0.8438,
1066
+ "step": 3775
1067
+ },
1068
+ {
1069
+ "epoch": 0.8733624454148472,
1070
+ "grad_norm": 14.99170207977295,
1071
+ "learning_rate": 3.938452370818081e-05,
1072
+ "loss": 0.8631,
1073
+ "step": 3800
1074
+ },
1075
+ {
1076
+ "epoch": 0.8791082509767869,
1077
+ "grad_norm": 18.036231994628906,
1078
+ "learning_rate": 3.927811356090917e-05,
1079
+ "loss": 1.0129,
1080
+ "step": 3825
1081
+ },
1082
+ {
1083
+ "epoch": 0.8848540565387267,
1084
+ "grad_norm": 11.981534957885742,
1085
+ "learning_rate": 3.9171703413637526e-05,
1086
+ "loss": 0.9894,
1087
+ "step": 3850
1088
+ },
1089
+ {
1090
+ "epoch": 0.8905998621006666,
1091
+ "grad_norm": 15.030149459838867,
1092
+ "learning_rate": 3.906529326636588e-05,
1093
+ "loss": 1.0499,
1094
+ "step": 3875
1095
+ },
1096
+ {
1097
+ "epoch": 0.8963456676626063,
1098
+ "grad_norm": 15.344982147216797,
1099
+ "learning_rate": 3.895888311909424e-05,
1100
+ "loss": 0.9134,
1101
+ "step": 3900
1102
+ },
1103
+ {
1104
+ "epoch": 0.9020914732245461,
1105
+ "grad_norm": 25.36429214477539,
1106
+ "learning_rate": 3.8852472971822596e-05,
1107
+ "loss": 0.9781,
1108
+ "step": 3925
1109
+ },
1110
+ {
1111
+ "epoch": 0.9078372787864859,
1112
+ "grad_norm": 14.226202011108398,
1113
+ "learning_rate": 3.874606282455095e-05,
1114
+ "loss": 0.8312,
1115
+ "step": 3950
1116
+ },
1117
+ {
1118
+ "epoch": 0.9135830843484256,
1119
+ "grad_norm": 43.66263198852539,
1120
+ "learning_rate": 3.863965267727931e-05,
1121
+ "loss": 0.9736,
1122
+ "step": 3975
1123
+ },
1124
+ {
1125
+ "epoch": 0.9193288899103654,
1126
+ "grad_norm": 23.34619140625,
1127
+ "learning_rate": 3.8533242530007665e-05,
1128
+ "loss": 1.0665,
1129
+ "step": 4000
1130
+ },
1131
+ {
1132
+ "epoch": 0.9250746954723053,
1133
+ "grad_norm": 25.17789649963379,
1134
+ "learning_rate": 3.842683238273602e-05,
1135
+ "loss": 1.0652,
1136
+ "step": 4025
1137
+ },
1138
+ {
1139
+ "epoch": 0.930820501034245,
1140
+ "grad_norm": 9.696635246276855,
1141
+ "learning_rate": 3.832042223546438e-05,
1142
+ "loss": 0.867,
1143
+ "step": 4050
1144
+ },
1145
+ {
1146
+ "epoch": 0.9365663065961848,
1147
+ "grad_norm": 9.90356731414795,
1148
+ "learning_rate": 3.821401208819273e-05,
1149
+ "loss": 0.7973,
1150
+ "step": 4075
1151
+ },
1152
+ {
1153
+ "epoch": 0.9423121121581246,
1154
+ "grad_norm": 22.405452728271484,
1155
+ "learning_rate": 3.810760194092109e-05,
1156
+ "loss": 0.8214,
1157
+ "step": 4100
1158
+ },
1159
+ {
1160
+ "epoch": 0.9480579177200643,
1161
+ "grad_norm": 24.39476776123047,
1162
+ "learning_rate": 3.800119179364944e-05,
1163
+ "loss": 0.9049,
1164
+ "step": 4125
1165
+ },
1166
+ {
1167
+ "epoch": 0.9538037232820041,
1168
+ "grad_norm": 9.647842407226562,
1169
+ "learning_rate": 3.7894781646377804e-05,
1170
+ "loss": 0.8708,
1171
+ "step": 4150
1172
+ },
1173
+ {
1174
+ "epoch": 0.959549528843944,
1175
+ "grad_norm": 26.49906349182129,
1176
+ "learning_rate": 3.7788371499106154e-05,
1177
+ "loss": 0.9184,
1178
+ "step": 4175
1179
+ },
1180
+ {
1181
+ "epoch": 0.9652953344058837,
1182
+ "grad_norm": 20.982925415039062,
1183
+ "learning_rate": 3.768196135183451e-05,
1184
+ "loss": 0.9092,
1185
+ "step": 4200
1186
+ },
1187
+ {
1188
+ "epoch": 0.9710411399678235,
1189
+ "grad_norm": 11.060940742492676,
1190
+ "learning_rate": 3.757555120456287e-05,
1191
+ "loss": 0.8978,
1192
+ "step": 4225
1193
+ },
1194
+ {
1195
+ "epoch": 0.9767869455297633,
1196
+ "grad_norm": 9.029313087463379,
1197
+ "learning_rate": 3.7469141057291224e-05,
1198
+ "loss": 0.7428,
1199
+ "step": 4250
1200
+ },
1201
+ {
1202
+ "epoch": 0.982532751091703,
1203
+ "grad_norm": 10.593424797058105,
1204
+ "learning_rate": 3.736273091001958e-05,
1205
+ "loss": 0.9591,
1206
+ "step": 4275
1207
+ },
1208
+ {
1209
+ "epoch": 0.9882785566536428,
1210
+ "grad_norm": 15.341836929321289,
1211
+ "learning_rate": 3.725632076274794e-05,
1212
+ "loss": 0.8476,
1213
+ "step": 4300
1214
+ },
1215
+ {
1216
+ "epoch": 0.9940243622155827,
1217
+ "grad_norm": 13.930924415588379,
1218
+ "learning_rate": 3.714991061547629e-05,
1219
+ "loss": 1.0338,
1220
+ "step": 4325
1221
+ },
1222
+ {
1223
+ "epoch": 0.9997701677775224,
1224
+ "grad_norm": 10.704158782958984,
1225
+ "learning_rate": 3.704350046820465e-05,
1226
+ "loss": 0.923,
1227
+ "step": 4350
1228
+ },
1229
+ {
1230
+ "epoch": 1.0,
1231
+ "eval_gen_len": 26.2909,
1232
+ "eval_loss": 0.734386682510376,
1233
+ "eval_rouge1": 83.7735,
1234
+ "eval_rouge2": 66.5715,
1235
+ "eval_rougeL": 81.6112,
1236
+ "eval_rougeLsum": 81.6279,
1237
+ "eval_runtime": 2086.9324,
1238
+ "eval_samples_per_second": 1.39,
1239
+ "eval_steps_per_second": 0.348,
1240
+ "step": 4351
1241
+ },
1242
+ {
1243
+ "epoch": 1.0055159733394623,
1244
+ "grad_norm": 7.619024276733398,
1245
+ "learning_rate": 3.6937090320933006e-05,
1246
+ "loss": 0.5463,
1247
+ "step": 4375
1248
+ },
1249
+ {
1250
+ "epoch": 1.011261778901402,
1251
+ "grad_norm": 6.126559734344482,
1252
+ "learning_rate": 3.683068017366136e-05,
1253
+ "loss": 0.5629,
1254
+ "step": 4400
1255
+ },
1256
+ {
1257
+ "epoch": 1.0170075844633417,
1258
+ "grad_norm": 20.718738555908203,
1259
+ "learning_rate": 3.672427002638972e-05,
1260
+ "loss": 0.5278,
1261
+ "step": 4425
1262
+ },
1263
+ {
1264
+ "epoch": 1.0227533900252817,
1265
+ "grad_norm": 6.006888389587402,
1266
+ "learning_rate": 3.6617859879118076e-05,
1267
+ "loss": 0.5893,
1268
+ "step": 4450
1269
+ },
1270
+ {
1271
+ "epoch": 1.0284991955872214,
1272
+ "grad_norm": 14.341240882873535,
1273
+ "learning_rate": 3.651144973184643e-05,
1274
+ "loss": 0.6282,
1275
+ "step": 4475
1276
+ },
1277
+ {
1278
+ "epoch": 1.034245001149161,
1279
+ "grad_norm": 16.240156173706055,
1280
+ "learning_rate": 3.640503958457478e-05,
1281
+ "loss": 0.6328,
1282
+ "step": 4500
1283
+ },
1284
+ {
1285
+ "epoch": 1.039990806711101,
1286
+ "grad_norm": 8.577178955078125,
1287
+ "learning_rate": 3.6298629437303145e-05,
1288
+ "loss": 0.6274,
1289
+ "step": 4525
1290
+ },
1291
+ {
1292
+ "epoch": 1.0457366122730407,
1293
+ "grad_norm": 13.634942054748535,
1294
+ "learning_rate": 3.6192219290031495e-05,
1295
+ "loss": 0.6302,
1296
+ "step": 4550
1297
+ },
1298
+ {
1299
+ "epoch": 1.0514824178349804,
1300
+ "grad_norm": 13.95753002166748,
1301
+ "learning_rate": 3.608580914275986e-05,
1302
+ "loss": 0.6617,
1303
+ "step": 4575
1304
+ },
1305
+ {
1306
+ "epoch": 1.0572282233969204,
1307
+ "grad_norm": 16.2097110748291,
1308
+ "learning_rate": 3.597939899548821e-05,
1309
+ "loss": 0.612,
1310
+ "step": 4600
1311
+ },
1312
+ {
1313
+ "epoch": 1.06297402895886,
1314
+ "grad_norm": 17.29051971435547,
1315
+ "learning_rate": 3.5872988848216565e-05,
1316
+ "loss": 0.5602,
1317
+ "step": 4625
1318
+ },
1319
+ {
1320
+ "epoch": 1.0687198345207998,
1321
+ "grad_norm": 11.388086318969727,
1322
+ "learning_rate": 3.576657870094493e-05,
1323
+ "loss": 0.4798,
1324
+ "step": 4650
1325
+ },
1326
+ {
1327
+ "epoch": 1.0744656400827397,
1328
+ "grad_norm": 13.426841735839844,
1329
+ "learning_rate": 3.566016855367328e-05,
1330
+ "loss": 0.5345,
1331
+ "step": 4675
1332
+ },
1333
+ {
1334
+ "epoch": 1.0802114456446794,
1335
+ "grad_norm": 14.696466445922852,
1336
+ "learning_rate": 3.555375840640164e-05,
1337
+ "loss": 0.6485,
1338
+ "step": 4700
1339
+ },
1340
+ {
1341
+ "epoch": 1.0859572512066191,
1342
+ "grad_norm": 12.150015830993652,
1343
+ "learning_rate": 3.544734825912999e-05,
1344
+ "loss": 0.6087,
1345
+ "step": 4725
1346
+ },
1347
+ {
1348
+ "epoch": 1.091703056768559,
1349
+ "grad_norm": 9.841524124145508,
1350
+ "learning_rate": 3.534093811185835e-05,
1351
+ "loss": 0.5868,
1352
+ "step": 4750
1353
+ },
1354
+ {
1355
+ "epoch": 1.0974488623304988,
1356
+ "grad_norm": 8.271934509277344,
1357
+ "learning_rate": 3.5234527964586704e-05,
1358
+ "loss": 0.5462,
1359
+ "step": 4775
1360
+ },
1361
+ {
1362
+ "epoch": 1.1031946678924385,
1363
+ "grad_norm": 7.936254978179932,
1364
+ "learning_rate": 3.512811781731506e-05,
1365
+ "loss": 0.5379,
1366
+ "step": 4800
1367
+ },
1368
+ {
1369
+ "epoch": 1.1089404734543784,
1370
+ "grad_norm": 11.007326126098633,
1371
+ "learning_rate": 3.502170767004342e-05,
1372
+ "loss": 0.5492,
1373
+ "step": 4825
1374
+ },
1375
+ {
1376
+ "epoch": 1.1146862790163181,
1377
+ "grad_norm": 19.896902084350586,
1378
+ "learning_rate": 3.4915297522771773e-05,
1379
+ "loss": 0.6375,
1380
+ "step": 4850
1381
+ },
1382
+ {
1383
+ "epoch": 1.1204320845782578,
1384
+ "grad_norm": 22.039011001586914,
1385
+ "learning_rate": 3.480888737550013e-05,
1386
+ "loss": 0.5828,
1387
+ "step": 4875
1388
+ },
1389
+ {
1390
+ "epoch": 1.1261778901401978,
1391
+ "grad_norm": 6.415574550628662,
1392
+ "learning_rate": 3.4702477228228486e-05,
1393
+ "loss": 0.4254,
1394
+ "step": 4900
1395
+ },
1396
+ {
1397
+ "epoch": 1.1319236957021375,
1398
+ "grad_norm": 12.866458892822266,
1399
+ "learning_rate": 3.459606708095684e-05,
1400
+ "loss": 0.566,
1401
+ "step": 4925
1402
+ },
1403
+ {
1404
+ "epoch": 1.1376695012640772,
1405
+ "grad_norm": 22.729032516479492,
1406
+ "learning_rate": 3.44896569336852e-05,
1407
+ "loss": 0.5428,
1408
+ "step": 4950
1409
+ },
1410
+ {
1411
+ "epoch": 1.143415306826017,
1412
+ "grad_norm": 11.15864372253418,
1413
+ "learning_rate": 3.4383246786413556e-05,
1414
+ "loss": 0.628,
1415
+ "step": 4975
1416
+ },
1417
+ {
1418
+ "epoch": 1.1491611123879568,
1419
+ "grad_norm": 9.711397171020508,
1420
+ "learning_rate": 3.427683663914191e-05,
1421
+ "loss": 0.4856,
1422
+ "step": 5000
1423
+ },
1424
+ {
1425
+ "epoch": 1.1549069179498965,
1426
+ "grad_norm": 13.280930519104004,
1427
+ "learning_rate": 3.417042649187027e-05,
1428
+ "loss": 0.5113,
1429
+ "step": 5025
1430
+ },
1431
+ {
1432
+ "epoch": 1.1606527235118365,
1433
+ "grad_norm": 35.520687103271484,
1434
+ "learning_rate": 3.406401634459862e-05,
1435
+ "loss": 0.4617,
1436
+ "step": 5050
1437
+ },
1438
+ {
1439
+ "epoch": 1.1663985290737762,
1440
+ "grad_norm": 31.962560653686523,
1441
+ "learning_rate": 3.395760619732698e-05,
1442
+ "loss": 0.5678,
1443
+ "step": 5075
1444
+ },
1445
+ {
1446
+ "epoch": 1.1721443346357159,
1447
+ "grad_norm": 18.267778396606445,
1448
+ "learning_rate": 3.385119605005533e-05,
1449
+ "loss": 0.5897,
1450
+ "step": 5100
1451
+ },
1452
+ {
1453
+ "epoch": 1.1778901401976558,
1454
+ "grad_norm": 16.51255989074707,
1455
+ "learning_rate": 3.3744785902783695e-05,
1456
+ "loss": 0.5746,
1457
+ "step": 5125
1458
+ },
1459
+ {
1460
+ "epoch": 1.1836359457595955,
1461
+ "grad_norm": 6.914166450500488,
1462
+ "learning_rate": 3.3638375755512045e-05,
1463
+ "loss": 0.593,
1464
+ "step": 5150
1465
+ },
1466
+ {
1467
+ "epoch": 1.1893817513215352,
1468
+ "grad_norm": 10.85730266571045,
1469
+ "learning_rate": 3.35319656082404e-05,
1470
+ "loss": 0.5905,
1471
+ "step": 5175
1472
+ },
1473
+ {
1474
+ "epoch": 1.1951275568834752,
1475
+ "grad_norm": 17.28081512451172,
1476
+ "learning_rate": 3.342555546096876e-05,
1477
+ "loss": 0.5473,
1478
+ "step": 5200
1479
+ },
1480
+ {
1481
+ "epoch": 1.2008733624454149,
1482
+ "grad_norm": 13.884358406066895,
1483
+ "learning_rate": 3.3319145313697114e-05,
1484
+ "loss": 0.5982,
1485
+ "step": 5225
1486
+ },
1487
+ {
1488
+ "epoch": 1.2066191680073546,
1489
+ "grad_norm": 9.417092323303223,
1490
+ "learning_rate": 3.321273516642547e-05,
1491
+ "loss": 0.6085,
1492
+ "step": 5250
1493
+ },
1494
+ {
1495
+ "epoch": 1.2123649735692945,
1496
+ "grad_norm": 10.19940185546875,
1497
+ "learning_rate": 3.310632501915383e-05,
1498
+ "loss": 0.56,
1499
+ "step": 5275
1500
+ },
1501
+ {
1502
+ "epoch": 1.2181107791312342,
1503
+ "grad_norm": 17.75829315185547,
1504
+ "learning_rate": 3.2999914871882184e-05,
1505
+ "loss": 0.5455,
1506
+ "step": 5300
1507
+ },
1508
+ {
1509
+ "epoch": 1.223856584693174,
1510
+ "grad_norm": 13.822357177734375,
1511
+ "learning_rate": 3.289350472461054e-05,
1512
+ "loss": 0.5122,
1513
+ "step": 5325
1514
+ },
1515
+ {
1516
+ "epoch": 1.2296023902551139,
1517
+ "grad_norm": 6.675373077392578,
1518
+ "learning_rate": 3.27870945773389e-05,
1519
+ "loss": 0.6045,
1520
+ "step": 5350
1521
+ },
1522
+ {
1523
+ "epoch": 1.2353481958170536,
1524
+ "grad_norm": 12.710549354553223,
1525
+ "learning_rate": 3.2680684430067254e-05,
1526
+ "loss": 0.5715,
1527
+ "step": 5375
1528
+ },
1529
+ {
1530
+ "epoch": 1.2410940013789933,
1531
+ "grad_norm": 14.400224685668945,
1532
+ "learning_rate": 3.257427428279561e-05,
1533
+ "loss": 0.5742,
1534
+ "step": 5400
1535
+ },
1536
+ {
1537
+ "epoch": 1.2468398069409332,
1538
+ "grad_norm": 14.449501037597656,
1539
+ "learning_rate": 3.2467864135523967e-05,
1540
+ "loss": 0.6402,
1541
+ "step": 5425
1542
+ },
1543
+ {
1544
+ "epoch": 1.252585612502873,
1545
+ "grad_norm": 10.527132034301758,
1546
+ "learning_rate": 3.236145398825232e-05,
1547
+ "loss": 0.6159,
1548
+ "step": 5450
1549
+ },
1550
+ {
1551
+ "epoch": 1.2583314180648126,
1552
+ "grad_norm": 10.82150650024414,
1553
+ "learning_rate": 3.225504384098067e-05,
1554
+ "loss": 0.5755,
1555
+ "step": 5475
1556
+ },
1557
+ {
1558
+ "epoch": 1.2640772236267526,
1559
+ "grad_norm": 7.481512069702148,
1560
+ "learning_rate": 3.2148633693709036e-05,
1561
+ "loss": 0.5496,
1562
+ "step": 5500
1563
+ },
1564
+ {
1565
+ "epoch": 1.2698230291886923,
1566
+ "grad_norm": 14.726329803466797,
1567
+ "learning_rate": 3.2042223546437386e-05,
1568
+ "loss": 0.4598,
1569
+ "step": 5525
1570
+ },
1571
+ {
1572
+ "epoch": 1.275568834750632,
1573
+ "grad_norm": 17.87427520751953,
1574
+ "learning_rate": 3.193581339916575e-05,
1575
+ "loss": 0.5756,
1576
+ "step": 5550
1577
+ },
1578
+ {
1579
+ "epoch": 1.281314640312572,
1580
+ "grad_norm": 10.091878890991211,
1581
+ "learning_rate": 3.18294032518941e-05,
1582
+ "loss": 0.5296,
1583
+ "step": 5575
1584
+ },
1585
+ {
1586
+ "epoch": 1.2870604458745116,
1587
+ "grad_norm": 20.161104202270508,
1588
+ "learning_rate": 3.1722993104622455e-05,
1589
+ "loss": 0.5092,
1590
+ "step": 5600
1591
+ },
1592
+ {
1593
+ "epoch": 1.2928062514364513,
1594
+ "grad_norm": 26.924951553344727,
1595
+ "learning_rate": 3.161658295735082e-05,
1596
+ "loss": 0.5627,
1597
+ "step": 5625
1598
+ },
1599
+ {
1600
+ "epoch": 1.2985520569983913,
1601
+ "grad_norm": 11.02285099029541,
1602
+ "learning_rate": 3.151017281007917e-05,
1603
+ "loss": 0.5273,
1604
+ "step": 5650
1605
+ },
1606
+ {
1607
+ "epoch": 1.304297862560331,
1608
+ "grad_norm": 26.79828643798828,
1609
+ "learning_rate": 3.140376266280753e-05,
1610
+ "loss": 0.4949,
1611
+ "step": 5675
1612
+ },
1613
+ {
1614
+ "epoch": 1.3100436681222707,
1615
+ "grad_norm": 13.996374130249023,
1616
+ "learning_rate": 3.129735251553588e-05,
1617
+ "loss": 0.5387,
1618
+ "step": 5700
1619
+ },
1620
+ {
1621
+ "epoch": 1.3157894736842106,
1622
+ "grad_norm": 18.909332275390625,
1623
+ "learning_rate": 3.119094236826424e-05,
1624
+ "loss": 0.5246,
1625
+ "step": 5725
1626
+ },
1627
+ {
1628
+ "epoch": 1.3215352792461503,
1629
+ "grad_norm": 13.408002853393555,
1630
+ "learning_rate": 3.1084532220992595e-05,
1631
+ "loss": 0.4895,
1632
+ "step": 5750
1633
+ },
1634
+ {
1635
+ "epoch": 1.32728108480809,
1636
+ "grad_norm": 13.344757080078125,
1637
+ "learning_rate": 3.097812207372095e-05,
1638
+ "loss": 0.4562,
1639
+ "step": 5775
1640
+ },
1641
+ {
1642
+ "epoch": 1.33302689037003,
1643
+ "grad_norm": 16.318279266357422,
1644
+ "learning_rate": 3.087171192644931e-05,
1645
+ "loss": 0.6114,
1646
+ "step": 5800
1647
+ },
1648
+ {
1649
+ "epoch": 1.3387726959319697,
1650
+ "grad_norm": 14.15111255645752,
1651
+ "learning_rate": 3.0765301779177664e-05,
1652
+ "loss": 0.5826,
1653
+ "step": 5825
1654
+ },
1655
+ {
1656
+ "epoch": 1.3445185014939094,
1657
+ "grad_norm": 10.535929679870605,
1658
+ "learning_rate": 3.065889163190602e-05,
1659
+ "loss": 0.546,
1660
+ "step": 5850
1661
+ },
1662
+ {
1663
+ "epoch": 1.3502643070558493,
1664
+ "grad_norm": 14.012350082397461,
1665
+ "learning_rate": 3.055248148463438e-05,
1666
+ "loss": 0.6077,
1667
+ "step": 5875
1668
+ },
1669
+ {
1670
+ "epoch": 1.356010112617789,
1671
+ "grad_norm": 15.3707914352417,
1672
+ "learning_rate": 3.044607133736273e-05,
1673
+ "loss": 0.5442,
1674
+ "step": 5900
1675
+ },
1676
+ {
1677
+ "epoch": 1.3617559181797287,
1678
+ "grad_norm": 28.015796661376953,
1679
+ "learning_rate": 3.033966119009109e-05,
1680
+ "loss": 0.4813,
1681
+ "step": 5925
1682
+ },
1683
+ {
1684
+ "epoch": 1.3675017237416687,
1685
+ "grad_norm": 10.887642860412598,
1686
+ "learning_rate": 3.0233251042819443e-05,
1687
+ "loss": 0.6075,
1688
+ "step": 5950
1689
+ },
1690
+ {
1691
+ "epoch": 1.3732475293036084,
1692
+ "grad_norm": 13.379786491394043,
1693
+ "learning_rate": 3.0126840895547803e-05,
1694
+ "loss": 0.6452,
1695
+ "step": 5975
1696
+ },
1697
+ {
1698
+ "epoch": 1.378993334865548,
1699
+ "grad_norm": 19.68934440612793,
1700
+ "learning_rate": 3.0020430748276156e-05,
1701
+ "loss": 0.5204,
1702
+ "step": 6000
1703
+ },
1704
+ {
1705
+ "epoch": 1.384739140427488,
1706
+ "grad_norm": 11.586589813232422,
1707
+ "learning_rate": 2.991402060100451e-05,
1708
+ "loss": 0.4538,
1709
+ "step": 6025
1710
+ },
1711
+ {
1712
+ "epoch": 1.3904849459894277,
1713
+ "grad_norm": 16.921403884887695,
1714
+ "learning_rate": 2.980761045373287e-05,
1715
+ "loss": 0.6548,
1716
+ "step": 6050
1717
+ },
1718
+ {
1719
+ "epoch": 1.3962307515513674,
1720
+ "grad_norm": 12.915703773498535,
1721
+ "learning_rate": 2.9701200306461226e-05,
1722
+ "loss": 0.5857,
1723
+ "step": 6075
1724
+ },
1725
+ {
1726
+ "epoch": 1.4019765571133074,
1727
+ "grad_norm": 11.560894966125488,
1728
+ "learning_rate": 2.9594790159189582e-05,
1729
+ "loss": 0.6351,
1730
+ "step": 6100
1731
+ },
1732
+ {
1733
+ "epoch": 1.407722362675247,
1734
+ "grad_norm": 12.070367813110352,
1735
+ "learning_rate": 2.948838001191794e-05,
1736
+ "loss": 0.5254,
1737
+ "step": 6125
1738
+ },
1739
+ {
1740
+ "epoch": 1.4134681682371868,
1741
+ "grad_norm": 6.56376314163208,
1742
+ "learning_rate": 2.9381969864646292e-05,
1743
+ "loss": 0.489,
1744
+ "step": 6150
1745
+ },
1746
+ {
1747
+ "epoch": 1.4192139737991267,
1748
+ "grad_norm": 7.615925312042236,
1749
+ "learning_rate": 2.9275559717374652e-05,
1750
+ "loss": 0.4299,
1751
+ "step": 6175
1752
+ },
1753
+ {
1754
+ "epoch": 1.4249597793610664,
1755
+ "grad_norm": 26.307531356811523,
1756
+ "learning_rate": 2.9169149570103005e-05,
1757
+ "loss": 0.5573,
1758
+ "step": 6200
1759
+ },
1760
+ {
1761
+ "epoch": 1.4307055849230061,
1762
+ "grad_norm": 17.957910537719727,
1763
+ "learning_rate": 2.9062739422831365e-05,
1764
+ "loss": 0.5165,
1765
+ "step": 6225
1766
+ },
1767
+ {
1768
+ "epoch": 1.436451390484946,
1769
+ "grad_norm": 5.903622150421143,
1770
+ "learning_rate": 2.8956329275559718e-05,
1771
+ "loss": 0.6125,
1772
+ "step": 6250
1773
+ },
1774
+ {
1775
+ "epoch": 1.4421971960468858,
1776
+ "grad_norm": 71.71666717529297,
1777
+ "learning_rate": 2.8849919128288078e-05,
1778
+ "loss": 0.5896,
1779
+ "step": 6275
1780
+ },
1781
+ {
1782
+ "epoch": 1.4479430016088255,
1783
+ "grad_norm": 7.325091361999512,
1784
+ "learning_rate": 2.874350898101643e-05,
1785
+ "loss": 0.5207,
1786
+ "step": 6300
1787
+ },
1788
+ {
1789
+ "epoch": 1.4536888071707654,
1790
+ "grad_norm": 7.871235370635986,
1791
+ "learning_rate": 2.8637098833744784e-05,
1792
+ "loss": 0.4792,
1793
+ "step": 6325
1794
+ },
1795
+ {
1796
+ "epoch": 1.4594346127327051,
1797
+ "grad_norm": 3.092621326446533,
1798
+ "learning_rate": 2.8530688686473144e-05,
1799
+ "loss": 0.4546,
1800
+ "step": 6350
1801
+ },
1802
+ {
1803
+ "epoch": 1.4651804182946448,
1804
+ "grad_norm": 11.23499870300293,
1805
+ "learning_rate": 2.8424278539201497e-05,
1806
+ "loss": 0.4942,
1807
+ "step": 6375
1808
+ },
1809
+ {
1810
+ "epoch": 1.4709262238565848,
1811
+ "grad_norm": 16.10143280029297,
1812
+ "learning_rate": 2.8317868391929857e-05,
1813
+ "loss": 0.4757,
1814
+ "step": 6400
1815
+ },
1816
+ {
1817
+ "epoch": 1.4766720294185245,
1818
+ "grad_norm": 13.106595039367676,
1819
+ "learning_rate": 2.821145824465821e-05,
1820
+ "loss": 0.5392,
1821
+ "step": 6425
1822
+ },
1823
+ {
1824
+ "epoch": 1.4824178349804642,
1825
+ "grad_norm": 8.037747383117676,
1826
+ "learning_rate": 2.8105048097386567e-05,
1827
+ "loss": 0.5916,
1828
+ "step": 6450
1829
+ },
1830
+ {
1831
+ "epoch": 1.4881636405424041,
1832
+ "grad_norm": 14.93556022644043,
1833
+ "learning_rate": 2.7998637950114927e-05,
1834
+ "loss": 0.564,
1835
+ "step": 6475
1836
+ },
1837
+ {
1838
+ "epoch": 1.4939094461043438,
1839
+ "grad_norm": 10.39865493774414,
1840
+ "learning_rate": 2.789222780284328e-05,
1841
+ "loss": 0.5026,
1842
+ "step": 6500
1843
+ },
1844
+ {
1845
+ "epoch": 1.4996552516662836,
1846
+ "grad_norm": 10.64941120147705,
1847
+ "learning_rate": 2.778581765557164e-05,
1848
+ "loss": 0.4656,
1849
+ "step": 6525
1850
+ },
1851
+ {
1852
+ "epoch": 1.5054010572282235,
1853
+ "grad_norm": 10.483504295349121,
1854
+ "learning_rate": 2.7679407508299993e-05,
1855
+ "loss": 0.5126,
1856
+ "step": 6550
1857
+ },
1858
+ {
1859
+ "epoch": 1.5111468627901632,
1860
+ "grad_norm": 7.613571643829346,
1861
+ "learning_rate": 2.7572997361028346e-05,
1862
+ "loss": 0.5551,
1863
+ "step": 6575
1864
+ },
1865
+ {
1866
+ "epoch": 1.516892668352103,
1867
+ "grad_norm": 14.762700080871582,
1868
+ "learning_rate": 2.7466587213756706e-05,
1869
+ "loss": 0.7115,
1870
+ "step": 6600
1871
+ },
1872
+ {
1873
+ "epoch": 1.5226384739140428,
1874
+ "grad_norm": 15.398651123046875,
1875
+ "learning_rate": 2.736017706648506e-05,
1876
+ "loss": 0.5283,
1877
+ "step": 6625
1878
+ },
1879
+ {
1880
+ "epoch": 1.5283842794759825,
1881
+ "grad_norm": 5.248310089111328,
1882
+ "learning_rate": 2.725376691921342e-05,
1883
+ "loss": 0.4443,
1884
+ "step": 6650
1885
+ },
1886
+ {
1887
+ "epoch": 1.5341300850379223,
1888
+ "grad_norm": 11.633146286010742,
1889
+ "learning_rate": 2.7147356771941772e-05,
1890
+ "loss": 0.4418,
1891
+ "step": 6675
1892
+ },
1893
+ {
1894
+ "epoch": 1.5398758905998622,
1895
+ "grad_norm": 13.065744400024414,
1896
+ "learning_rate": 2.704094662467013e-05,
1897
+ "loss": 0.5505,
1898
+ "step": 6700
1899
+ },
1900
+ {
1901
+ "epoch": 1.545621696161802,
1902
+ "grad_norm": 10.502464294433594,
1903
+ "learning_rate": 2.6934536477398485e-05,
1904
+ "loss": 0.5268,
1905
+ "step": 6725
1906
+ },
1907
+ {
1908
+ "epoch": 1.5513675017237416,
1909
+ "grad_norm": 42.0969352722168,
1910
+ "learning_rate": 2.6828126330126842e-05,
1911
+ "loss": 0.4736,
1912
+ "step": 6750
1913
+ },
1914
+ {
1915
+ "epoch": 1.5571133072856815,
1916
+ "grad_norm": 6.398384094238281,
1917
+ "learning_rate": 2.6721716182855198e-05,
1918
+ "loss": 0.5209,
1919
+ "step": 6775
1920
+ },
1921
+ {
1922
+ "epoch": 1.5628591128476212,
1923
+ "grad_norm": 3.2121517658233643,
1924
+ "learning_rate": 2.6615306035583555e-05,
1925
+ "loss": 0.5577,
1926
+ "step": 6800
1927
+ },
1928
+ {
1929
+ "epoch": 1.568604918409561,
1930
+ "grad_norm": 48.83503723144531,
1931
+ "learning_rate": 2.6508895888311915e-05,
1932
+ "loss": 0.5862,
1933
+ "step": 6825
1934
+ },
1935
+ {
1936
+ "epoch": 1.5743507239715009,
1937
+ "grad_norm": 10.185980796813965,
1938
+ "learning_rate": 2.6402485741040268e-05,
1939
+ "loss": 0.5377,
1940
+ "step": 6850
1941
+ },
1942
+ {
1943
+ "epoch": 1.5800965295334406,
1944
+ "grad_norm": 5.717852592468262,
1945
+ "learning_rate": 2.629607559376862e-05,
1946
+ "loss": 0.447,
1947
+ "step": 6875
1948
+ },
1949
+ {
1950
+ "epoch": 1.5858423350953803,
1951
+ "grad_norm": 10.820260047912598,
1952
+ "learning_rate": 2.618966544649698e-05,
1953
+ "loss": 0.5178,
1954
+ "step": 6900
1955
+ },
1956
+ {
1957
+ "epoch": 1.5915881406573202,
1958
+ "grad_norm": 7.412465572357178,
1959
+ "learning_rate": 2.6083255299225334e-05,
1960
+ "loss": 0.5001,
1961
+ "step": 6925
1962
+ },
1963
+ {
1964
+ "epoch": 1.59733394621926,
1965
+ "grad_norm": 23.02524757385254,
1966
+ "learning_rate": 2.5976845151953694e-05,
1967
+ "loss": 0.4905,
1968
+ "step": 6950
1969
+ },
1970
+ {
1971
+ "epoch": 1.6030797517811997,
1972
+ "grad_norm": 7.75547981262207,
1973
+ "learning_rate": 2.5870435004682047e-05,
1974
+ "loss": 0.592,
1975
+ "step": 6975
1976
+ },
1977
+ {
1978
+ "epoch": 1.6088255573431396,
1979
+ "grad_norm": 44.980621337890625,
1980
+ "learning_rate": 2.57640248574104e-05,
1981
+ "loss": 0.5954,
1982
+ "step": 7000
1983
+ },
1984
+ {
1985
+ "epoch": 1.6145713629050793,
1986
+ "grad_norm": 8.900413513183594,
1987
+ "learning_rate": 2.565761471013876e-05,
1988
+ "loss": 0.5001,
1989
+ "step": 7025
1990
+ },
1991
+ {
1992
+ "epoch": 1.620317168467019,
1993
+ "grad_norm": 12.892452239990234,
1994
+ "learning_rate": 2.5551204562867113e-05,
1995
+ "loss": 0.5148,
1996
+ "step": 7050
1997
+ },
1998
+ {
1999
+ "epoch": 1.626062974028959,
2000
+ "grad_norm": 17.05414390563965,
2001
+ "learning_rate": 2.5444794415595473e-05,
2002
+ "loss": 0.5795,
2003
+ "step": 7075
2004
+ },
2005
+ {
2006
+ "epoch": 1.6318087795908987,
2007
+ "grad_norm": 15.482218742370605,
2008
+ "learning_rate": 2.533838426832383e-05,
2009
+ "loss": 0.5684,
2010
+ "step": 7100
2011
+ },
2012
+ {
2013
+ "epoch": 1.6375545851528384,
2014
+ "grad_norm": 7.356892108917236,
2015
+ "learning_rate": 2.5231974121052183e-05,
2016
+ "loss": 0.5952,
2017
+ "step": 7125
2018
+ },
2019
+ {
2020
+ "epoch": 1.6433003907147783,
2021
+ "grad_norm": 16.71323013305664,
2022
+ "learning_rate": 2.5125563973780543e-05,
2023
+ "loss": 0.7416,
2024
+ "step": 7150
2025
+ },
2026
+ {
2027
+ "epoch": 1.649046196276718,
2028
+ "grad_norm": 12.637115478515625,
2029
+ "learning_rate": 2.5019153826508896e-05,
2030
+ "loss": 0.6563,
2031
+ "step": 7175
2032
+ },
2033
+ {
2034
+ "epoch": 1.6547920018386577,
2035
+ "grad_norm": 13.602217674255371,
2036
+ "learning_rate": 2.4912743679237252e-05,
2037
+ "loss": 0.4979,
2038
+ "step": 7200
2039
+ },
2040
+ {
2041
+ "epoch": 1.6605378074005976,
2042
+ "grad_norm": 10.359466552734375,
2043
+ "learning_rate": 2.480633353196561e-05,
2044
+ "loss": 0.5067,
2045
+ "step": 7225
2046
+ },
2047
+ {
2048
+ "epoch": 1.6662836129625374,
2049
+ "grad_norm": 7.33120059967041,
2050
+ "learning_rate": 2.4699923384693965e-05,
2051
+ "loss": 0.4785,
2052
+ "step": 7250
2053
+ },
2054
+ {
2055
+ "epoch": 1.672029418524477,
2056
+ "grad_norm": 9.278757095336914,
2057
+ "learning_rate": 2.4593513237422322e-05,
2058
+ "loss": 0.4941,
2059
+ "step": 7275
2060
+ },
2061
+ {
2062
+ "epoch": 1.677775224086417,
2063
+ "grad_norm": 6.453770160675049,
2064
+ "learning_rate": 2.448710309015068e-05,
2065
+ "loss": 0.5033,
2066
+ "step": 7300
2067
+ },
2068
+ {
2069
+ "epoch": 1.6835210296483567,
2070
+ "grad_norm": 8.040416717529297,
2071
+ "learning_rate": 2.4380692942879035e-05,
2072
+ "loss": 0.4577,
2073
+ "step": 7325
2074
+ },
2075
+ {
2076
+ "epoch": 1.6892668352102964,
2077
+ "grad_norm": 13.009758949279785,
2078
+ "learning_rate": 2.4274282795607388e-05,
2079
+ "loss": 0.5811,
2080
+ "step": 7350
2081
+ },
2082
+ {
2083
+ "epoch": 1.6950126407722363,
2084
+ "grad_norm": 12.394170761108398,
2085
+ "learning_rate": 2.4167872648335745e-05,
2086
+ "loss": 0.4945,
2087
+ "step": 7375
2088
+ },
2089
+ {
2090
+ "epoch": 1.700758446334176,
2091
+ "grad_norm": 16.887958526611328,
2092
+ "learning_rate": 2.40614625010641e-05,
2093
+ "loss": 0.5528,
2094
+ "step": 7400
2095
+ },
2096
+ {
2097
+ "epoch": 1.7065042518961158,
2098
+ "grad_norm": 5.690896511077881,
2099
+ "learning_rate": 2.3955052353792458e-05,
2100
+ "loss": 0.4227,
2101
+ "step": 7425
2102
+ },
2103
+ {
2104
+ "epoch": 1.7122500574580557,
2105
+ "grad_norm": 5.632653713226318,
2106
+ "learning_rate": 2.3848642206520818e-05,
2107
+ "loss": 0.6466,
2108
+ "step": 7450
2109
+ },
2110
+ {
2111
+ "epoch": 1.7179958630199954,
2112
+ "grad_norm": 14.108399391174316,
2113
+ "learning_rate": 2.374223205924917e-05,
2114
+ "loss": 0.4858,
2115
+ "step": 7475
2116
+ },
2117
+ {
2118
+ "epoch": 1.7237416685819351,
2119
+ "grad_norm": 12.757282257080078,
2120
+ "learning_rate": 2.3635821911977527e-05,
2121
+ "loss": 0.5619,
2122
+ "step": 7500
2123
+ },
2124
+ {
2125
+ "epoch": 1.729487474143875,
2126
+ "grad_norm": 3.3859102725982666,
2127
+ "learning_rate": 2.3529411764705884e-05,
2128
+ "loss": 0.5808,
2129
+ "step": 7525
2130
+ },
2131
+ {
2132
+ "epoch": 1.7352332797058148,
2133
+ "grad_norm": 13.51447582244873,
2134
+ "learning_rate": 2.342300161743424e-05,
2135
+ "loss": 0.5598,
2136
+ "step": 7550
2137
+ },
2138
+ {
2139
+ "epoch": 1.7409790852677545,
2140
+ "grad_norm": 12.655472755432129,
2141
+ "learning_rate": 2.3316591470162597e-05,
2142
+ "loss": 0.4214,
2143
+ "step": 7575
2144
+ },
2145
+ {
2146
+ "epoch": 1.7467248908296944,
2147
+ "grad_norm": 22.180927276611328,
2148
+ "learning_rate": 2.3210181322890953e-05,
2149
+ "loss": 0.5946,
2150
+ "step": 7600
2151
+ },
2152
+ {
2153
+ "epoch": 1.752470696391634,
2154
+ "grad_norm": 11.127065658569336,
2155
+ "learning_rate": 2.3103771175619306e-05,
2156
+ "loss": 0.6091,
2157
+ "step": 7625
2158
+ },
2159
+ {
2160
+ "epoch": 1.7582165019535738,
2161
+ "grad_norm": 14.059673309326172,
2162
+ "learning_rate": 2.2997361028347663e-05,
2163
+ "loss": 0.5442,
2164
+ "step": 7650
2165
+ },
2166
+ {
2167
+ "epoch": 1.7639623075155137,
2168
+ "grad_norm": 9.362860679626465,
2169
+ "learning_rate": 2.289095088107602e-05,
2170
+ "loss": 0.5622,
2171
+ "step": 7675
2172
+ },
2173
+ {
2174
+ "epoch": 1.7697081130774535,
2175
+ "grad_norm": 11.744709968566895,
2176
+ "learning_rate": 2.2784540733804376e-05,
2177
+ "loss": 0.4207,
2178
+ "step": 7700
2179
+ },
2180
+ {
2181
+ "epoch": 1.7754539186393932,
2182
+ "grad_norm": 11.617082595825195,
2183
+ "learning_rate": 2.2678130586532732e-05,
2184
+ "loss": 0.5576,
2185
+ "step": 7725
2186
+ },
2187
+ {
2188
+ "epoch": 1.781199724201333,
2189
+ "grad_norm": 7.126068592071533,
2190
+ "learning_rate": 2.257172043926109e-05,
2191
+ "loss": 0.5096,
2192
+ "step": 7750
2193
+ },
2194
+ {
2195
+ "epoch": 1.7869455297632728,
2196
+ "grad_norm": 6.728802680969238,
2197
+ "learning_rate": 2.2465310291989445e-05,
2198
+ "loss": 0.3561,
2199
+ "step": 7775
2200
+ },
2201
+ {
2202
+ "epoch": 1.7926913353252125,
2203
+ "grad_norm": 6.852474212646484,
2204
+ "learning_rate": 2.2358900144717802e-05,
2205
+ "loss": 0.5567,
2206
+ "step": 7800
2207
+ },
2208
+ {
2209
+ "epoch": 1.7984371408871525,
2210
+ "grad_norm": 9.070609092712402,
2211
+ "learning_rate": 2.225248999744616e-05,
2212
+ "loss": 0.6369,
2213
+ "step": 7825
2214
+ },
2215
+ {
2216
+ "epoch": 1.8041829464490922,
2217
+ "grad_norm": 12.296309471130371,
2218
+ "learning_rate": 2.2146079850174515e-05,
2219
+ "loss": 0.5887,
2220
+ "step": 7850
2221
+ },
2222
+ {
2223
+ "epoch": 1.8099287520110319,
2224
+ "grad_norm": 13.876431465148926,
2225
+ "learning_rate": 2.203966970290287e-05,
2226
+ "loss": 0.5461,
2227
+ "step": 7875
2228
+ },
2229
+ {
2230
+ "epoch": 1.8156745575729718,
2231
+ "grad_norm": 8.236191749572754,
2232
+ "learning_rate": 2.1933259555631225e-05,
2233
+ "loss": 0.4896,
2234
+ "step": 7900
2235
+ },
2236
+ {
2237
+ "epoch": 1.8214203631349115,
2238
+ "grad_norm": 32.38478088378906,
2239
+ "learning_rate": 2.182684940835958e-05,
2240
+ "loss": 0.5381,
2241
+ "step": 7925
2242
+ },
2243
+ {
2244
+ "epoch": 1.8271661686968512,
2245
+ "grad_norm": 7.219331741333008,
2246
+ "learning_rate": 2.1720439261087938e-05,
2247
+ "loss": 0.4077,
2248
+ "step": 7950
2249
+ },
2250
+ {
2251
+ "epoch": 1.8329119742587912,
2252
+ "grad_norm": 10.3890962600708,
2253
+ "learning_rate": 2.1614029113816294e-05,
2254
+ "loss": 0.547,
2255
+ "step": 7975
2256
+ },
2257
+ {
2258
+ "epoch": 1.8386577798207309,
2259
+ "grad_norm": 7.473533630371094,
2260
+ "learning_rate": 2.150761896654465e-05,
2261
+ "loss": 0.5187,
2262
+ "step": 8000
2263
+ },
2264
+ {
2265
+ "epoch": 1.8444035853826706,
2266
+ "grad_norm": 9.898606300354004,
2267
+ "learning_rate": 2.1401208819273007e-05,
2268
+ "loss": 0.5642,
2269
+ "step": 8025
2270
+ },
2271
+ {
2272
+ "epoch": 1.8501493909446105,
2273
+ "grad_norm": 10.344517707824707,
2274
+ "learning_rate": 2.129479867200136e-05,
2275
+ "loss": 0.4615,
2276
+ "step": 8050
2277
+ },
2278
+ {
2279
+ "epoch": 1.8558951965065502,
2280
+ "grad_norm": 34.38172912597656,
2281
+ "learning_rate": 2.118838852472972e-05,
2282
+ "loss": 0.5293,
2283
+ "step": 8075
2284
+ },
2285
+ {
2286
+ "epoch": 1.86164100206849,
2287
+ "grad_norm": 12.77538776397705,
2288
+ "learning_rate": 2.1081978377458077e-05,
2289
+ "loss": 0.4196,
2290
+ "step": 8100
2291
+ },
2292
+ {
2293
+ "epoch": 1.8673868076304299,
2294
+ "grad_norm": 9.176766395568848,
2295
+ "learning_rate": 2.0975568230186433e-05,
2296
+ "loss": 0.4989,
2297
+ "step": 8125
2298
+ },
2299
+ {
2300
+ "epoch": 1.8731326131923696,
2301
+ "grad_norm": 9.476819038391113,
2302
+ "learning_rate": 2.086915808291479e-05,
2303
+ "loss": 0.5131,
2304
+ "step": 8150
2305
+ },
2306
+ {
2307
+ "epoch": 1.8788784187543093,
2308
+ "grad_norm": 12.82066822052002,
2309
+ "learning_rate": 2.0762747935643143e-05,
2310
+ "loss": 0.5161,
2311
+ "step": 8175
2312
+ },
2313
+ {
2314
+ "epoch": 1.8846242243162492,
2315
+ "grad_norm": 6.587464332580566,
2316
+ "learning_rate": 2.06563377883715e-05,
2317
+ "loss": 0.5613,
2318
+ "step": 8200
2319
+ },
2320
+ {
2321
+ "epoch": 1.890370029878189,
2322
+ "grad_norm": 14.604435920715332,
2323
+ "learning_rate": 2.0549927641099856e-05,
2324
+ "loss": 0.5146,
2325
+ "step": 8225
2326
+ },
2327
+ {
2328
+ "epoch": 1.8961158354401286,
2329
+ "grad_norm": 25.261781692504883,
2330
+ "learning_rate": 2.0443517493828213e-05,
2331
+ "loss": 0.4847,
2332
+ "step": 8250
2333
+ },
2334
+ {
2335
+ "epoch": 1.9018616410020686,
2336
+ "grad_norm": 15.835251808166504,
2337
+ "learning_rate": 2.033710734655657e-05,
2338
+ "loss": 0.5851,
2339
+ "step": 8275
2340
+ },
2341
+ {
2342
+ "epoch": 1.9076074465640083,
2343
+ "grad_norm": 25.110139846801758,
2344
+ "learning_rate": 2.0230697199284926e-05,
2345
+ "loss": 0.4597,
2346
+ "step": 8300
2347
+ },
2348
+ {
2349
+ "epoch": 1.913353252125948,
2350
+ "grad_norm": 24.496837615966797,
2351
+ "learning_rate": 2.012428705201328e-05,
2352
+ "loss": 0.5487,
2353
+ "step": 8325
2354
+ },
2355
+ {
2356
+ "epoch": 1.919099057687888,
2357
+ "grad_norm": 11.73768424987793,
2358
+ "learning_rate": 2.0017876904741635e-05,
2359
+ "loss": 0.4469,
2360
+ "step": 8350
2361
+ },
2362
+ {
2363
+ "epoch": 1.9248448632498276,
2364
+ "grad_norm": 10.91761589050293,
2365
+ "learning_rate": 1.9911466757469992e-05,
2366
+ "loss": 0.6221,
2367
+ "step": 8375
2368
+ },
2369
+ {
2370
+ "epoch": 1.9305906688117673,
2371
+ "grad_norm": 5.8950724601745605,
2372
+ "learning_rate": 1.9805056610198348e-05,
2373
+ "loss": 0.447,
2374
+ "step": 8400
2375
+ },
2376
+ {
2377
+ "epoch": 1.9363364743737073,
2378
+ "grad_norm": 8.51844310760498,
2379
+ "learning_rate": 1.9698646462926705e-05,
2380
+ "loss": 0.5028,
2381
+ "step": 8425
2382
+ },
2383
+ {
2384
+ "epoch": 1.942082279935647,
2385
+ "grad_norm": 8.514192581176758,
2386
+ "learning_rate": 1.9592236315655065e-05,
2387
+ "loss": 0.5334,
2388
+ "step": 8450
2389
+ },
2390
+ {
2391
+ "epoch": 1.9478280854975867,
2392
+ "grad_norm": 17.922142028808594,
2393
+ "learning_rate": 1.9485826168383418e-05,
2394
+ "loss": 0.5718,
2395
+ "step": 8475
2396
+ },
2397
+ {
2398
+ "epoch": 1.9535738910595266,
2399
+ "grad_norm": 16.389118194580078,
2400
+ "learning_rate": 1.9379416021111774e-05,
2401
+ "loss": 0.44,
2402
+ "step": 8500
2403
+ },
2404
+ {
2405
+ "epoch": 1.9593196966214663,
2406
+ "grad_norm": 21.868207931518555,
2407
+ "learning_rate": 1.927300587384013e-05,
2408
+ "loss": 0.5156,
2409
+ "step": 8525
2410
+ },
2411
+ {
2412
+ "epoch": 1.965065502183406,
2413
+ "grad_norm": 16.75226593017578,
2414
+ "learning_rate": 1.9166595726568487e-05,
2415
+ "loss": 0.5354,
2416
+ "step": 8550
2417
+ },
2418
+ {
2419
+ "epoch": 1.970811307745346,
2420
+ "grad_norm": 10.739360809326172,
2421
+ "learning_rate": 1.9060185579296844e-05,
2422
+ "loss": 0.4722,
2423
+ "step": 8575
2424
+ },
2425
+ {
2426
+ "epoch": 1.9765571133072857,
2427
+ "grad_norm": 12.929302215576172,
2428
+ "learning_rate": 1.8953775432025197e-05,
2429
+ "loss": 0.551,
2430
+ "step": 8600
2431
+ },
2432
+ {
2433
+ "epoch": 1.9823029188692254,
2434
+ "grad_norm": 17.41213035583496,
2435
+ "learning_rate": 1.8847365284753554e-05,
2436
+ "loss": 0.5017,
2437
+ "step": 8625
2438
+ },
2439
+ {
2440
+ "epoch": 1.9880487244311653,
2441
+ "grad_norm": 5.805027484893799,
2442
+ "learning_rate": 1.874095513748191e-05,
2443
+ "loss": 0.5878,
2444
+ "step": 8650
2445
+ },
2446
+ {
2447
+ "epoch": 1.993794529993105,
2448
+ "grad_norm": 11.490523338317871,
2449
+ "learning_rate": 1.8634544990210267e-05,
2450
+ "loss": 0.5627,
2451
+ "step": 8675
2452
+ },
2453
+ {
2454
+ "epoch": 1.9995403355550447,
2455
+ "grad_norm": 13.531903266906738,
2456
+ "learning_rate": 1.8528134842938623e-05,
2457
+ "loss": 0.5863,
2458
+ "step": 8700
2459
+ },
2460
+ {
2461
+ "epoch": 2.0,
2462
+ "eval_gen_len": 26.2389,
2463
+ "eval_loss": 0.6237149238586426,
2464
+ "eval_rouge1": 86.4062,
2465
+ "eval_rouge2": 71.9313,
2466
+ "eval_rougeL": 84.7508,
2467
+ "eval_rougeLsum": 84.7631,
2468
+ "eval_runtime": 2064.9224,
2469
+ "eval_samples_per_second": 1.405,
2470
+ "eval_steps_per_second": 0.352,
2471
+ "step": 8702
2472
+ },
2473
+ {
2474
+ "epoch": 2.0052861411169847,
2475
+ "grad_norm": 9.042722702026367,
2476
+ "learning_rate": 1.842172469566698e-05,
2477
+ "loss": 0.3464,
2478
+ "step": 8725
2479
+ },
2480
+ {
2481
+ "epoch": 2.0110319466789246,
2482
+ "grad_norm": 29.441604614257812,
2483
+ "learning_rate": 1.8315314548395336e-05,
2484
+ "loss": 0.3289,
2485
+ "step": 8750
2486
+ },
2487
+ {
2488
+ "epoch": 2.016777752240864,
2489
+ "grad_norm": 6.572937488555908,
2490
+ "learning_rate": 1.8208904401123693e-05,
2491
+ "loss": 0.2474,
2492
+ "step": 8775
2493
+ },
2494
+ {
2495
+ "epoch": 2.022523557802804,
2496
+ "grad_norm": 5.759756088256836,
2497
+ "learning_rate": 1.810249425385205e-05,
2498
+ "loss": 0.2546,
2499
+ "step": 8800
2500
+ },
2501
+ {
2502
+ "epoch": 2.028269363364744,
2503
+ "grad_norm": 13.856329917907715,
2504
+ "learning_rate": 1.7996084106580406e-05,
2505
+ "loss": 0.2817,
2506
+ "step": 8825
2507
+ },
2508
+ {
2509
+ "epoch": 2.0340151689266834,
2510
+ "grad_norm": 11.948515892028809,
2511
+ "learning_rate": 1.7889673959308762e-05,
2512
+ "loss": 0.3477,
2513
+ "step": 8850
2514
+ },
2515
+ {
2516
+ "epoch": 2.0397609744886234,
2517
+ "grad_norm": 7.310947418212891,
2518
+ "learning_rate": 1.7783263812037115e-05,
2519
+ "loss": 0.2743,
2520
+ "step": 8875
2521
+ },
2522
+ {
2523
+ "epoch": 2.0455067800505633,
2524
+ "grad_norm": 8.42832088470459,
2525
+ "learning_rate": 1.7676853664765472e-05,
2526
+ "loss": 0.3924,
2527
+ "step": 8900
2528
+ },
2529
+ {
2530
+ "epoch": 2.051252585612503,
2531
+ "grad_norm": 8.289580345153809,
2532
+ "learning_rate": 1.757044351749383e-05,
2533
+ "loss": 0.3083,
2534
+ "step": 8925
2535
+ },
2536
+ {
2537
+ "epoch": 2.0569983911744427,
2538
+ "grad_norm": 5.149430751800537,
2539
+ "learning_rate": 1.7464033370222185e-05,
2540
+ "loss": 0.301,
2541
+ "step": 8950
2542
+ },
2543
+ {
2544
+ "epoch": 2.0627441967363827,
2545
+ "grad_norm": 10.75927734375,
2546
+ "learning_rate": 1.735762322295054e-05,
2547
+ "loss": 0.3943,
2548
+ "step": 8975
2549
+ },
2550
+ {
2551
+ "epoch": 2.068490002298322,
2552
+ "grad_norm": 7.442399501800537,
2553
+ "learning_rate": 1.7251213075678898e-05,
2554
+ "loss": 0.29,
2555
+ "step": 9000
2556
+ },
2557
+ {
2558
+ "epoch": 2.074235807860262,
2559
+ "grad_norm": 7.649430751800537,
2560
+ "learning_rate": 1.714480292840725e-05,
2561
+ "loss": 0.2922,
2562
+ "step": 9025
2563
+ },
2564
+ {
2565
+ "epoch": 2.079981613422202,
2566
+ "grad_norm": 14.108525276184082,
2567
+ "learning_rate": 1.7038392781135608e-05,
2568
+ "loss": 0.3514,
2569
+ "step": 9050
2570
+ },
2571
+ {
2572
+ "epoch": 2.0857274189841415,
2573
+ "grad_norm": 15.770670890808105,
2574
+ "learning_rate": 1.6931982633863968e-05,
2575
+ "loss": 0.302,
2576
+ "step": 9075
2577
+ },
2578
+ {
2579
+ "epoch": 2.0914732245460814,
2580
+ "grad_norm": 10.099161148071289,
2581
+ "learning_rate": 1.6825572486592324e-05,
2582
+ "loss": 0.2739,
2583
+ "step": 9100
2584
+ },
2585
+ {
2586
+ "epoch": 2.0972190301080214,
2587
+ "grad_norm": 9.134102821350098,
2588
+ "learning_rate": 1.671916233932068e-05,
2589
+ "loss": 0.3254,
2590
+ "step": 9125
2591
+ },
2592
+ {
2593
+ "epoch": 2.102964835669961,
2594
+ "grad_norm": 19.84739875793457,
2595
+ "learning_rate": 1.6612752192049037e-05,
2596
+ "loss": 0.2823,
2597
+ "step": 9150
2598
+ },
2599
+ {
2600
+ "epoch": 2.1087106412319008,
2601
+ "grad_norm": 8.968664169311523,
2602
+ "learning_rate": 1.650634204477739e-05,
2603
+ "loss": 0.2593,
2604
+ "step": 9175
2605
+ },
2606
+ {
2607
+ "epoch": 2.1144564467938407,
2608
+ "grad_norm": 3.067753791809082,
2609
+ "learning_rate": 1.6399931897505747e-05,
2610
+ "loss": 0.262,
2611
+ "step": 9200
2612
+ },
2613
+ {
2614
+ "epoch": 2.12020225235578,
2615
+ "grad_norm": 2.221193552017212,
2616
+ "learning_rate": 1.6293521750234103e-05,
2617
+ "loss": 0.3989,
2618
+ "step": 9225
2619
+ },
2620
+ {
2621
+ "epoch": 2.12594805791772,
2622
+ "grad_norm": 8.303793907165527,
2623
+ "learning_rate": 1.618711160296246e-05,
2624
+ "loss": 0.2848,
2625
+ "step": 9250
2626
+ },
2627
+ {
2628
+ "epoch": 2.13169386347966,
2629
+ "grad_norm": 8.77238941192627,
2630
+ "learning_rate": 1.6080701455690816e-05,
2631
+ "loss": 0.273,
2632
+ "step": 9275
2633
+ },
2634
+ {
2635
+ "epoch": 2.1374396690415995,
2636
+ "grad_norm": 9.391227722167969,
2637
+ "learning_rate": 1.597429130841917e-05,
2638
+ "loss": 0.2648,
2639
+ "step": 9300
2640
+ },
2641
+ {
2642
+ "epoch": 2.1431854746035395,
2643
+ "grad_norm": 7.9943389892578125,
2644
+ "learning_rate": 1.5867881161147526e-05,
2645
+ "loss": 0.2839,
2646
+ "step": 9325
2647
+ },
2648
+ {
2649
+ "epoch": 2.1489312801654794,
2650
+ "grad_norm": 8.543972969055176,
2651
+ "learning_rate": 1.5761471013875882e-05,
2652
+ "loss": 0.2569,
2653
+ "step": 9350
2654
+ },
2655
+ {
2656
+ "epoch": 2.154677085727419,
2657
+ "grad_norm": 6.456871032714844,
2658
+ "learning_rate": 1.565506086660424e-05,
2659
+ "loss": 0.2514,
2660
+ "step": 9375
2661
+ },
2662
+ {
2663
+ "epoch": 2.160422891289359,
2664
+ "grad_norm": 10.609663009643555,
2665
+ "learning_rate": 1.5548650719332595e-05,
2666
+ "loss": 0.3059,
2667
+ "step": 9400
2668
+ },
2669
+ {
2670
+ "epoch": 2.1661686968512988,
2671
+ "grad_norm": 12.719677925109863,
2672
+ "learning_rate": 1.5442240572060952e-05,
2673
+ "loss": 0.2691,
2674
+ "step": 9425
2675
+ },
2676
+ {
2677
+ "epoch": 2.1719145024132382,
2678
+ "grad_norm": 6.143183708190918,
2679
+ "learning_rate": 1.533583042478931e-05,
2680
+ "loss": 0.2422,
2681
+ "step": 9450
2682
+ },
2683
+ {
2684
+ "epoch": 2.177660307975178,
2685
+ "grad_norm": 20.21449089050293,
2686
+ "learning_rate": 1.5229420277517665e-05,
2687
+ "loss": 0.3001,
2688
+ "step": 9475
2689
+ },
2690
+ {
2691
+ "epoch": 2.183406113537118,
2692
+ "grad_norm": 4.3389973640441895,
2693
+ "learning_rate": 1.5123010130246022e-05,
2694
+ "loss": 0.2947,
2695
+ "step": 9500
2696
+ },
2697
+ {
2698
+ "epoch": 2.1891519190990576,
2699
+ "grad_norm": 10.764538764953613,
2700
+ "learning_rate": 1.5016599982974378e-05,
2701
+ "loss": 0.2926,
2702
+ "step": 9525
2703
+ },
2704
+ {
2705
+ "epoch": 2.1948977246609975,
2706
+ "grad_norm": 5.7259321212768555,
2707
+ "learning_rate": 1.4910189835702735e-05,
2708
+ "loss": 0.281,
2709
+ "step": 9550
2710
+ },
2711
+ {
2712
+ "epoch": 2.2006435302229375,
2713
+ "grad_norm": 7.039416790008545,
2714
+ "learning_rate": 1.4803779688431091e-05,
2715
+ "loss": 0.2707,
2716
+ "step": 9575
2717
+ },
2718
+ {
2719
+ "epoch": 2.206389335784877,
2720
+ "grad_norm": 9.577095985412598,
2721
+ "learning_rate": 1.4697369541159444e-05,
2722
+ "loss": 0.3298,
2723
+ "step": 9600
2724
+ },
2725
+ {
2726
+ "epoch": 2.212135141346817,
2727
+ "grad_norm": 5.981830596923828,
2728
+ "learning_rate": 1.45909593938878e-05,
2729
+ "loss": 0.3344,
2730
+ "step": 9625
2731
+ },
2732
+ {
2733
+ "epoch": 2.217880946908757,
2734
+ "grad_norm": 5.922014236450195,
2735
+ "learning_rate": 1.4484549246616157e-05,
2736
+ "loss": 0.2731,
2737
+ "step": 9650
2738
+ },
2739
+ {
2740
+ "epoch": 2.2236267524706963,
2741
+ "grad_norm": 9.63442325592041,
2742
+ "learning_rate": 1.4378139099344515e-05,
2743
+ "loss": 0.3236,
2744
+ "step": 9675
2745
+ },
2746
+ {
2747
+ "epoch": 2.2293725580326362,
2748
+ "grad_norm": 15.069372177124023,
2749
+ "learning_rate": 1.4271728952072872e-05,
2750
+ "loss": 0.3373,
2751
+ "step": 9700
2752
+ },
2753
+ {
2754
+ "epoch": 2.235118363594576,
2755
+ "grad_norm": 9.148941993713379,
2756
+ "learning_rate": 1.4165318804801225e-05,
2757
+ "loss": 0.3207,
2758
+ "step": 9725
2759
+ },
2760
+ {
2761
+ "epoch": 2.2408641691565157,
2762
+ "grad_norm": 6.5600385665893555,
2763
+ "learning_rate": 1.4058908657529582e-05,
2764
+ "loss": 0.2614,
2765
+ "step": 9750
2766
+ },
2767
+ {
2768
+ "epoch": 2.2466099747184556,
2769
+ "grad_norm": 12.141286849975586,
2770
+ "learning_rate": 1.3952498510257938e-05,
2771
+ "loss": 0.3223,
2772
+ "step": 9775
2773
+ },
2774
+ {
2775
+ "epoch": 2.2523557802803955,
2776
+ "grad_norm": 6.805424213409424,
2777
+ "learning_rate": 1.3846088362986295e-05,
2778
+ "loss": 0.3697,
2779
+ "step": 9800
2780
+ },
2781
+ {
2782
+ "epoch": 2.258101585842335,
2783
+ "grad_norm": 13.576851844787598,
2784
+ "learning_rate": 1.3739678215714651e-05,
2785
+ "loss": 0.36,
2786
+ "step": 9825
2787
+ },
2788
+ {
2789
+ "epoch": 2.263847391404275,
2790
+ "grad_norm": 5.8922576904296875,
2791
+ "learning_rate": 1.363326806844301e-05,
2792
+ "loss": 0.2679,
2793
+ "step": 9850
2794
+ },
2795
+ {
2796
+ "epoch": 2.2695931969662144,
2797
+ "grad_norm": 3.609133720397949,
2798
+ "learning_rate": 1.3526857921171363e-05,
2799
+ "loss": 0.298,
2800
+ "step": 9875
2801
+ },
2802
+ {
2803
+ "epoch": 2.2753390025281544,
2804
+ "grad_norm": 10.248851776123047,
2805
+ "learning_rate": 1.3420447773899719e-05,
2806
+ "loss": 0.3184,
2807
+ "step": 9900
2808
+ },
2809
+ {
2810
+ "epoch": 2.2810848080900943,
2811
+ "grad_norm": 8.504976272583008,
2812
+ "learning_rate": 1.3314037626628076e-05,
2813
+ "loss": 0.3057,
2814
+ "step": 9925
2815
+ },
2816
+ {
2817
+ "epoch": 2.286830613652034,
2818
+ "grad_norm": 3.3555614948272705,
2819
+ "learning_rate": 1.3207627479356432e-05,
2820
+ "loss": 0.2886,
2821
+ "step": 9950
2822
+ },
2823
+ {
2824
+ "epoch": 2.2925764192139737,
2825
+ "grad_norm": 6.76616096496582,
2826
+ "learning_rate": 1.3101217332084789e-05,
2827
+ "loss": 0.2674,
2828
+ "step": 9975
2829
+ },
2830
+ {
2831
+ "epoch": 2.2983222247759136,
2832
+ "grad_norm": 9.159728050231934,
2833
+ "learning_rate": 1.2994807184813143e-05,
2834
+ "loss": 0.3071,
2835
+ "step": 10000
2836
+ },
2837
+ {
2838
+ "epoch": 2.304068030337853,
2839
+ "grad_norm": 18.63709831237793,
2840
+ "learning_rate": 1.28883970375415e-05,
2841
+ "loss": 0.2902,
2842
+ "step": 10025
2843
+ },
2844
+ {
2845
+ "epoch": 2.309813835899793,
2846
+ "grad_norm": 1.2971268892288208,
2847
+ "learning_rate": 1.2781986890269857e-05,
2848
+ "loss": 0.322,
2849
+ "step": 10050
2850
+ },
2851
+ {
2852
+ "epoch": 2.315559641461733,
2853
+ "grad_norm": 11.695150375366211,
2854
+ "learning_rate": 1.2675576742998213e-05,
2855
+ "loss": 0.2625,
2856
+ "step": 10075
2857
+ },
2858
+ {
2859
+ "epoch": 2.321305447023673,
2860
+ "grad_norm": 17.39932632446289,
2861
+ "learning_rate": 1.256916659572657e-05,
2862
+ "loss": 0.2826,
2863
+ "step": 10100
2864
+ },
2865
+ {
2866
+ "epoch": 2.3270512525856124,
2867
+ "grad_norm": 5.18493127822876,
2868
+ "learning_rate": 1.2462756448454924e-05,
2869
+ "loss": 0.2516,
2870
+ "step": 10125
2871
+ },
2872
+ {
2873
+ "epoch": 2.3327970581475523,
2874
+ "grad_norm": 16.71261978149414,
2875
+ "learning_rate": 1.2356346301183281e-05,
2876
+ "loss": 0.3449,
2877
+ "step": 10150
2878
+ },
2879
+ {
2880
+ "epoch": 2.338542863709492,
2881
+ "grad_norm": 10.863037109375,
2882
+ "learning_rate": 1.2249936153911637e-05,
2883
+ "loss": 0.2464,
2884
+ "step": 10175
2885
+ },
2886
+ {
2887
+ "epoch": 2.3442886692714318,
2888
+ "grad_norm": 4.433645725250244,
2889
+ "learning_rate": 1.2143526006639994e-05,
2890
+ "loss": 0.3322,
2891
+ "step": 10200
2892
+ },
2893
+ {
2894
+ "epoch": 2.3500344748333717,
2895
+ "grad_norm": 7.951370716094971,
2896
+ "learning_rate": 1.203711585936835e-05,
2897
+ "loss": 0.3509,
2898
+ "step": 10225
2899
+ },
2900
+ {
2901
+ "epoch": 2.3557802803953116,
2902
+ "grad_norm": 10.264397621154785,
2903
+ "learning_rate": 1.1930705712096705e-05,
2904
+ "loss": 0.2962,
2905
+ "step": 10250
2906
+ },
2907
+ {
2908
+ "epoch": 2.361526085957251,
2909
+ "grad_norm": 8.093633651733398,
2910
+ "learning_rate": 1.1824295564825062e-05,
2911
+ "loss": 0.4046,
2912
+ "step": 10275
2913
+ },
2914
+ {
2915
+ "epoch": 2.367271891519191,
2916
+ "grad_norm": 12.440337181091309,
2917
+ "learning_rate": 1.1717885417553418e-05,
2918
+ "loss": 0.3199,
2919
+ "step": 10300
2920
+ },
2921
+ {
2922
+ "epoch": 2.3730176970811305,
2923
+ "grad_norm": 7.12372350692749,
2924
+ "learning_rate": 1.1611475270281775e-05,
2925
+ "loss": 0.2804,
2926
+ "step": 10325
2927
+ },
2928
+ {
2929
+ "epoch": 2.3787635026430705,
2930
+ "grad_norm": 13.004124641418457,
2931
+ "learning_rate": 1.1505065123010131e-05,
2932
+ "loss": 0.254,
2933
+ "step": 10350
2934
+ },
2935
+ {
2936
+ "epoch": 2.3845093082050104,
2937
+ "grad_norm": 8.472222328186035,
2938
+ "learning_rate": 1.1398654975738488e-05,
2939
+ "loss": 0.3731,
2940
+ "step": 10375
2941
+ },
2942
+ {
2943
+ "epoch": 2.3902551137669503,
2944
+ "grad_norm": 4.128398895263672,
2945
+ "learning_rate": 1.1292244828466843e-05,
2946
+ "loss": 0.263,
2947
+ "step": 10400
2948
+ },
2949
+ {
2950
+ "epoch": 2.39600091932889,
2951
+ "grad_norm": 10.770340919494629,
2952
+ "learning_rate": 1.11858346811952e-05,
2953
+ "loss": 0.305,
2954
+ "step": 10425
2955
+ },
2956
+ {
2957
+ "epoch": 2.4017467248908297,
2958
+ "grad_norm": 12.2786226272583,
2959
+ "learning_rate": 1.1079424533923556e-05,
2960
+ "loss": 0.3019,
2961
+ "step": 10450
2962
+ },
2963
+ {
2964
+ "epoch": 2.4074925304527692,
2965
+ "grad_norm": 6.7361063957214355,
2966
+ "learning_rate": 1.0973014386651912e-05,
2967
+ "loss": 0.3148,
2968
+ "step": 10475
2969
+ },
2970
+ {
2971
+ "epoch": 2.413238336014709,
2972
+ "grad_norm": 9.457742691040039,
2973
+ "learning_rate": 1.0866604239380269e-05,
2974
+ "loss": 0.2865,
2975
+ "step": 10500
2976
+ },
2977
+ {
2978
+ "epoch": 2.418984141576649,
2979
+ "grad_norm": 3.7694594860076904,
2980
+ "learning_rate": 1.0760194092108624e-05,
2981
+ "loss": 0.246,
2982
+ "step": 10525
2983
+ },
2984
+ {
2985
+ "epoch": 2.424729947138589,
2986
+ "grad_norm": 7.730304718017578,
2987
+ "learning_rate": 1.065378394483698e-05,
2988
+ "loss": 0.335,
2989
+ "step": 10550
2990
+ },
2991
+ {
2992
+ "epoch": 2.4304757527005285,
2993
+ "grad_norm": 10.514723777770996,
2994
+ "learning_rate": 1.0547373797565337e-05,
2995
+ "loss": 0.2765,
2996
+ "step": 10575
2997
+ },
2998
+ {
2999
+ "epoch": 2.4362215582624684,
3000
+ "grad_norm": 5.863102436065674,
3001
+ "learning_rate": 1.0440963650293691e-05,
3002
+ "loss": 0.2872,
3003
+ "step": 10600
3004
+ },
3005
+ {
3006
+ "epoch": 2.441967363824408,
3007
+ "grad_norm": 11.08028793334961,
3008
+ "learning_rate": 1.0334553503022048e-05,
3009
+ "loss": 0.2951,
3010
+ "step": 10625
3011
+ },
3012
+ {
3013
+ "epoch": 2.447713169386348,
3014
+ "grad_norm": 7.661574363708496,
3015
+ "learning_rate": 1.0228143355750404e-05,
3016
+ "loss": 0.2148,
3017
+ "step": 10650
3018
+ },
3019
+ {
3020
+ "epoch": 2.453458974948288,
3021
+ "grad_norm": 8.94846248626709,
3022
+ "learning_rate": 1.0121733208478761e-05,
3023
+ "loss": 0.278,
3024
+ "step": 10675
3025
+ },
3026
+ {
3027
+ "epoch": 2.4592047805102277,
3028
+ "grad_norm": 4.757375717163086,
3029
+ "learning_rate": 1.0015323061207118e-05,
3030
+ "loss": 0.2581,
3031
+ "step": 10700
3032
+ },
3033
+ {
3034
+ "epoch": 2.464950586072167,
3035
+ "grad_norm": 5.684369087219238,
3036
+ "learning_rate": 9.908912913935474e-06,
3037
+ "loss": 0.2654,
3038
+ "step": 10725
3039
+ },
3040
+ {
3041
+ "epoch": 2.470696391634107,
3042
+ "grad_norm": 8.778314590454102,
3043
+ "learning_rate": 9.802502766663829e-06,
3044
+ "loss": 0.3054,
3045
+ "step": 10750
3046
+ },
3047
+ {
3048
+ "epoch": 2.476442197196047,
3049
+ "grad_norm": 9.916199684143066,
3050
+ "learning_rate": 9.696092619392185e-06,
3051
+ "loss": 0.2818,
3052
+ "step": 10775
3053
+ },
3054
+ {
3055
+ "epoch": 2.4821880027579866,
3056
+ "grad_norm": 14.061298370361328,
3057
+ "learning_rate": 9.589682472120542e-06,
3058
+ "loss": 0.2872,
3059
+ "step": 10800
3060
+ },
3061
+ {
3062
+ "epoch": 2.4879338083199265,
3063
+ "grad_norm": 3.405550241470337,
3064
+ "learning_rate": 9.483272324848898e-06,
3065
+ "loss": 0.2891,
3066
+ "step": 10825
3067
+ },
3068
+ {
3069
+ "epoch": 2.4936796138818664,
3070
+ "grad_norm": 13.360318183898926,
3071
+ "learning_rate": 9.376862177577255e-06,
3072
+ "loss": 0.2764,
3073
+ "step": 10850
3074
+ },
3075
+ {
3076
+ "epoch": 2.499425419443806,
3077
+ "grad_norm": 9.567176818847656,
3078
+ "learning_rate": 9.27045203030561e-06,
3079
+ "loss": 0.3287,
3080
+ "step": 10875
3081
+ },
3082
+ {
3083
+ "epoch": 2.505171225005746,
3084
+ "grad_norm": 22.88526725769043,
3085
+ "learning_rate": 9.164041883033966e-06,
3086
+ "loss": 0.2867,
3087
+ "step": 10900
3088
+ },
3089
+ {
3090
+ "epoch": 2.5109170305676853,
3091
+ "grad_norm": 8.176691055297852,
3092
+ "learning_rate": 9.057631735762323e-06,
3093
+ "loss": 0.313,
3094
+ "step": 10925
3095
+ },
3096
+ {
3097
+ "epoch": 2.5166628361296253,
3098
+ "grad_norm": 9.478363990783691,
3099
+ "learning_rate": 8.951221588490678e-06,
3100
+ "loss": 0.3178,
3101
+ "step": 10950
3102
+ },
3103
+ {
3104
+ "epoch": 2.522408641691565,
3105
+ "grad_norm": 11.561506271362305,
3106
+ "learning_rate": 8.844811441219034e-06,
3107
+ "loss": 0.3263,
3108
+ "step": 10975
3109
+ },
3110
+ {
3111
+ "epoch": 2.528154447253505,
3112
+ "grad_norm": 21.103870391845703,
3113
+ "learning_rate": 8.738401293947392e-06,
3114
+ "loss": 0.392,
3115
+ "step": 11000
3116
+ },
3117
+ {
3118
+ "epoch": 2.5339002528154446,
3119
+ "grad_norm": 9.002408981323242,
3120
+ "learning_rate": 8.631991146675747e-06,
3121
+ "loss": 0.2399,
3122
+ "step": 11025
3123
+ },
3124
+ {
3125
+ "epoch": 2.5396460583773846,
3126
+ "grad_norm": 12.6602201461792,
3127
+ "learning_rate": 8.525580999404104e-06,
3128
+ "loss": 0.3234,
3129
+ "step": 11050
3130
+ },
3131
+ {
3132
+ "epoch": 2.545391863939324,
3133
+ "grad_norm": 7.497732639312744,
3134
+ "learning_rate": 8.41917085213246e-06,
3135
+ "loss": 0.3384,
3136
+ "step": 11075
3137
+ },
3138
+ {
3139
+ "epoch": 2.551137669501264,
3140
+ "grad_norm": 7.916686058044434,
3141
+ "learning_rate": 8.312760704860815e-06,
3142
+ "loss": 0.262,
3143
+ "step": 11100
3144
+ },
3145
+ {
3146
+ "epoch": 2.556883475063204,
3147
+ "grad_norm": 9.539471626281738,
3148
+ "learning_rate": 8.206350557589172e-06,
3149
+ "loss": 0.2634,
3150
+ "step": 11125
3151
+ },
3152
+ {
3153
+ "epoch": 2.562629280625144,
3154
+ "grad_norm": 10.455008506774902,
3155
+ "learning_rate": 8.099940410317528e-06,
3156
+ "loss": 0.354,
3157
+ "step": 11150
3158
+ },
3159
+ {
3160
+ "epoch": 2.5683750861870833,
3161
+ "grad_norm": 13.299657821655273,
3162
+ "learning_rate": 7.993530263045885e-06,
3163
+ "loss": 0.2795,
3164
+ "step": 11175
3165
+ },
3166
+ {
3167
+ "epoch": 2.5741208917490233,
3168
+ "grad_norm": 11.104608535766602,
3169
+ "learning_rate": 7.887120115774241e-06,
3170
+ "loss": 0.2768,
3171
+ "step": 11200
3172
+ },
3173
+ {
3174
+ "epoch": 2.5798666973109627,
3175
+ "grad_norm": 7.9573493003845215,
3176
+ "learning_rate": 7.780709968502596e-06,
3177
+ "loss": 0.3175,
3178
+ "step": 11225
3179
+ },
3180
+ {
3181
+ "epoch": 2.5856125028729027,
3182
+ "grad_norm": 6.329565525054932,
3183
+ "learning_rate": 7.674299821230952e-06,
3184
+ "loss": 0.3128,
3185
+ "step": 11250
3186
+ },
3187
+ {
3188
+ "epoch": 2.5913583084348426,
3189
+ "grad_norm": 6.937751770019531,
3190
+ "learning_rate": 7.56788967395931e-06,
3191
+ "loss": 0.2232,
3192
+ "step": 11275
3193
+ },
3194
+ {
3195
+ "epoch": 2.5971041139967825,
3196
+ "grad_norm": 8.591595649719238,
3197
+ "learning_rate": 7.461479526687665e-06,
3198
+ "loss": 0.2984,
3199
+ "step": 11300
3200
+ },
3201
+ {
3202
+ "epoch": 2.602849919558722,
3203
+ "grad_norm": 9.453631401062012,
3204
+ "learning_rate": 7.355069379416021e-06,
3205
+ "loss": 0.2491,
3206
+ "step": 11325
3207
+ },
3208
+ {
3209
+ "epoch": 2.608595725120662,
3210
+ "grad_norm": 6.31212854385376,
3211
+ "learning_rate": 7.248659232144378e-06,
3212
+ "loss": 0.3115,
3213
+ "step": 11350
3214
+ },
3215
+ {
3216
+ "epoch": 2.6143415306826014,
3217
+ "grad_norm": 6.614916801452637,
3218
+ "learning_rate": 7.142249084872733e-06,
3219
+ "loss": 0.2884,
3220
+ "step": 11375
3221
+ },
3222
+ {
3223
+ "epoch": 2.6200873362445414,
3224
+ "grad_norm": 8.102603912353516,
3225
+ "learning_rate": 7.03583893760109e-06,
3226
+ "loss": 0.2584,
3227
+ "step": 11400
3228
+ },
3229
+ {
3230
+ "epoch": 2.6258331418064813,
3231
+ "grad_norm": 4.263225555419922,
3232
+ "learning_rate": 6.929428790329446e-06,
3233
+ "loss": 0.2357,
3234
+ "step": 11425
3235
+ },
3236
+ {
3237
+ "epoch": 2.6315789473684212,
3238
+ "grad_norm": 8.729228019714355,
3239
+ "learning_rate": 6.823018643057802e-06,
3240
+ "loss": 0.2669,
3241
+ "step": 11450
3242
+ },
3243
+ {
3244
+ "epoch": 2.6373247529303607,
3245
+ "grad_norm": 6.767869472503662,
3246
+ "learning_rate": 6.716608495786159e-06,
3247
+ "loss": 0.2634,
3248
+ "step": 11475
3249
+ },
3250
+ {
3251
+ "epoch": 2.6430705584923007,
3252
+ "grad_norm": 6.409708499908447,
3253
+ "learning_rate": 6.610198348514515e-06,
3254
+ "loss": 0.2606,
3255
+ "step": 11500
3256
+ },
3257
+ {
3258
+ "epoch": 2.64881636405424,
3259
+ "grad_norm": 6.780210018157959,
3260
+ "learning_rate": 6.503788201242871e-06,
3261
+ "loss": 0.2801,
3262
+ "step": 11525
3263
+ },
3264
+ {
3265
+ "epoch": 2.65456216961618,
3266
+ "grad_norm": 9.45854663848877,
3267
+ "learning_rate": 6.397378053971227e-06,
3268
+ "loss": 0.2315,
3269
+ "step": 11550
3270
+ },
3271
+ {
3272
+ "epoch": 2.66030797517812,
3273
+ "grad_norm": 8.216423988342285,
3274
+ "learning_rate": 6.290967906699584e-06,
3275
+ "loss": 0.2897,
3276
+ "step": 11575
3277
+ },
3278
+ {
3279
+ "epoch": 2.66605378074006,
3280
+ "grad_norm": 10.603631973266602,
3281
+ "learning_rate": 6.1845577594279395e-06,
3282
+ "loss": 0.2884,
3283
+ "step": 11600
3284
+ },
3285
+ {
3286
+ "epoch": 2.6717995863019994,
3287
+ "grad_norm": 18.06307601928711,
3288
+ "learning_rate": 6.078147612156296e-06,
3289
+ "loss": 0.2767,
3290
+ "step": 11625
3291
+ },
3292
+ {
3293
+ "epoch": 2.6775453918639394,
3294
+ "grad_norm": 8.301953315734863,
3295
+ "learning_rate": 5.971737464884652e-06,
3296
+ "loss": 0.305,
3297
+ "step": 11650
3298
+ },
3299
+ {
3300
+ "epoch": 2.683291197425879,
3301
+ "grad_norm": 3.897515296936035,
3302
+ "learning_rate": 5.865327317613007e-06,
3303
+ "loss": 0.2622,
3304
+ "step": 11675
3305
+ },
3306
+ {
3307
+ "epoch": 2.689037002987819,
3308
+ "grad_norm": 11.311945915222168,
3309
+ "learning_rate": 5.758917170341364e-06,
3310
+ "loss": 0.2443,
3311
+ "step": 11700
3312
+ },
3313
+ {
3314
+ "epoch": 2.6947828085497587,
3315
+ "grad_norm": 11.095697402954102,
3316
+ "learning_rate": 5.65250702306972e-06,
3317
+ "loss": 0.2892,
3318
+ "step": 11725
3319
+ },
3320
+ {
3321
+ "epoch": 2.7005286141116986,
3322
+ "grad_norm": 4.947134017944336,
3323
+ "learning_rate": 5.546096875798076e-06,
3324
+ "loss": 0.325,
3325
+ "step": 11750
3326
+ },
3327
+ {
3328
+ "epoch": 2.706274419673638,
3329
+ "grad_norm": 7.467726707458496,
3330
+ "learning_rate": 5.439686728526433e-06,
3331
+ "loss": 0.3117,
3332
+ "step": 11775
3333
+ },
3334
+ {
3335
+ "epoch": 2.712020225235578,
3336
+ "grad_norm": 9.866068840026855,
3337
+ "learning_rate": 5.333276581254789e-06,
3338
+ "loss": 0.3054,
3339
+ "step": 11800
3340
+ },
3341
+ {
3342
+ "epoch": 2.7177660307975176,
3343
+ "grad_norm": 8.738960266113281,
3344
+ "learning_rate": 5.226866433983145e-06,
3345
+ "loss": 0.3043,
3346
+ "step": 11825
3347
+ },
3348
+ {
3349
+ "epoch": 2.7235118363594575,
3350
+ "grad_norm": 12.69951057434082,
3351
+ "learning_rate": 5.120456286711501e-06,
3352
+ "loss": 0.2877,
3353
+ "step": 11850
3354
+ },
3355
+ {
3356
+ "epoch": 2.7292576419213974,
3357
+ "grad_norm": 12.757490158081055,
3358
+ "learning_rate": 5.014046139439857e-06,
3359
+ "loss": 0.3413,
3360
+ "step": 11875
3361
+ },
3362
+ {
3363
+ "epoch": 2.7350034474833373,
3364
+ "grad_norm": 8.377620697021484,
3365
+ "learning_rate": 4.9076359921682135e-06,
3366
+ "loss": 0.2688,
3367
+ "step": 11900
3368
+ },
3369
+ {
3370
+ "epoch": 2.740749253045277,
3371
+ "grad_norm": 10.275845527648926,
3372
+ "learning_rate": 4.801225844896569e-06,
3373
+ "loss": 0.2605,
3374
+ "step": 11925
3375
+ },
3376
+ {
3377
+ "epoch": 2.7464950586072168,
3378
+ "grad_norm": 8.497511863708496,
3379
+ "learning_rate": 4.694815697624926e-06,
3380
+ "loss": 0.3221,
3381
+ "step": 11950
3382
+ },
3383
+ {
3384
+ "epoch": 2.7522408641691563,
3385
+ "grad_norm": 2.008751153945923,
3386
+ "learning_rate": 4.588405550353282e-06,
3387
+ "loss": 0.276,
3388
+ "step": 11975
3389
+ },
3390
+ {
3391
+ "epoch": 2.757986669731096,
3392
+ "grad_norm": 9.221789360046387,
3393
+ "learning_rate": 4.481995403081638e-06,
3394
+ "loss": 0.2717,
3395
+ "step": 12000
3396
+ },
3397
+ {
3398
+ "epoch": 2.763732475293036,
3399
+ "grad_norm": 12.64124870300293,
3400
+ "learning_rate": 4.375585255809994e-06,
3401
+ "loss": 0.3022,
3402
+ "step": 12025
3403
+ },
3404
+ {
3405
+ "epoch": 2.769478280854976,
3406
+ "grad_norm": 7.941061496734619,
3407
+ "learning_rate": 4.26917510853835e-06,
3408
+ "loss": 0.2974,
3409
+ "step": 12050
3410
+ },
3411
+ {
3412
+ "epoch": 2.7752240864169155,
3413
+ "grad_norm": 11.986599922180176,
3414
+ "learning_rate": 4.1627649612667066e-06,
3415
+ "loss": 0.2933,
3416
+ "step": 12075
3417
+ },
3418
+ {
3419
+ "epoch": 2.7809698919788555,
3420
+ "grad_norm": 8.333921432495117,
3421
+ "learning_rate": 4.056354813995063e-06,
3422
+ "loss": 0.3154,
3423
+ "step": 12100
3424
+ },
3425
+ {
3426
+ "epoch": 2.786715697540795,
3427
+ "grad_norm": 8.222199440002441,
3428
+ "learning_rate": 3.949944666723419e-06,
3429
+ "loss": 0.2618,
3430
+ "step": 12125
3431
+ },
3432
+ {
3433
+ "epoch": 2.792461503102735,
3434
+ "grad_norm": 5.828108310699463,
3435
+ "learning_rate": 3.843534519451775e-06,
3436
+ "loss": 0.3117,
3437
+ "step": 12150
3438
+ },
3439
+ {
3440
+ "epoch": 2.798207308664675,
3441
+ "grad_norm": 4.506043910980225,
3442
+ "learning_rate": 3.7371243721801314e-06,
3443
+ "loss": 0.2742,
3444
+ "step": 12175
3445
+ },
3446
+ {
3447
+ "epoch": 2.8039531142266148,
3448
+ "grad_norm": 6.610835552215576,
3449
+ "learning_rate": 3.6307142249084875e-06,
3450
+ "loss": 0.2821,
3451
+ "step": 12200
3452
+ },
3453
+ {
3454
+ "epoch": 2.8096989197885542,
3455
+ "grad_norm": 9.316131591796875,
3456
+ "learning_rate": 3.524304077636843e-06,
3457
+ "loss": 0.3721,
3458
+ "step": 12225
3459
+ },
3460
+ {
3461
+ "epoch": 2.815444725350494,
3462
+ "grad_norm": 13.79557991027832,
3463
+ "learning_rate": 3.4178939303652e-06,
3464
+ "loss": 0.3067,
3465
+ "step": 12250
3466
+ },
3467
+ {
3468
+ "epoch": 2.8211905309124337,
3469
+ "grad_norm": 3.6889641284942627,
3470
+ "learning_rate": 3.3114837830935558e-06,
3471
+ "loss": 0.3536,
3472
+ "step": 12275
3473
+ },
3474
+ {
3475
+ "epoch": 2.8269363364743736,
3476
+ "grad_norm": 9.081113815307617,
3477
+ "learning_rate": 3.205073635821912e-06,
3478
+ "loss": 0.2818,
3479
+ "step": 12300
3480
+ },
3481
+ {
3482
+ "epoch": 2.8326821420363135,
3483
+ "grad_norm": 6.250842571258545,
3484
+ "learning_rate": 3.0986634885502684e-06,
3485
+ "loss": 0.2985,
3486
+ "step": 12325
3487
+ },
3488
+ {
3489
+ "epoch": 2.8384279475982535,
3490
+ "grad_norm": 7.818601608276367,
3491
+ "learning_rate": 2.9922533412786245e-06,
3492
+ "loss": 0.3299,
3493
+ "step": 12350
3494
+ },
3495
+ {
3496
+ "epoch": 2.844173753160193,
3497
+ "grad_norm": 3.7356948852539062,
3498
+ "learning_rate": 2.8858431940069806e-06,
3499
+ "loss": 0.3068,
3500
+ "step": 12375
3501
+ },
3502
+ {
3503
+ "epoch": 2.849919558722133,
3504
+ "grad_norm": 8.613180160522461,
3505
+ "learning_rate": 2.7794330467353367e-06,
3506
+ "loss": 0.2359,
3507
+ "step": 12400
3508
+ },
3509
+ {
3510
+ "epoch": 2.8556653642840724,
3511
+ "grad_norm": 5.272718906402588,
3512
+ "learning_rate": 2.673022899463693e-06,
3513
+ "loss": 0.2459,
3514
+ "step": 12425
3515
+ },
3516
+ {
3517
+ "epoch": 2.8614111698460123,
3518
+ "grad_norm": 5.97663688659668,
3519
+ "learning_rate": 2.5666127521920493e-06,
3520
+ "loss": 0.3391,
3521
+ "step": 12450
3522
+ },
3523
+ {
3524
+ "epoch": 2.8671569754079522,
3525
+ "grad_norm": 9.63822078704834,
3526
+ "learning_rate": 2.4602026049204054e-06,
3527
+ "loss": 0.3068,
3528
+ "step": 12475
3529
+ },
3530
+ {
3531
+ "epoch": 2.872902780969892,
3532
+ "grad_norm": 4.8020172119140625,
3533
+ "learning_rate": 2.3537924576487615e-06,
3534
+ "loss": 0.2816,
3535
+ "step": 12500
3536
+ },
3537
+ {
3538
+ "epoch": 2.8786485865318316,
3539
+ "grad_norm": 8.880352020263672,
3540
+ "learning_rate": 2.2473823103771176e-06,
3541
+ "loss": 0.2288,
3542
+ "step": 12525
3543
+ },
3544
+ {
3545
+ "epoch": 2.8843943920937716,
3546
+ "grad_norm": 12.863115310668945,
3547
+ "learning_rate": 2.1409721631054737e-06,
3548
+ "loss": 0.2539,
3549
+ "step": 12550
3550
+ },
3551
+ {
3552
+ "epoch": 2.890140197655711,
3553
+ "grad_norm": 18.137331008911133,
3554
+ "learning_rate": 2.03456201583383e-06,
3555
+ "loss": 0.2567,
3556
+ "step": 12575
3557
+ },
3558
+ {
3559
+ "epoch": 2.895886003217651,
3560
+ "grad_norm": 11.29713249206543,
3561
+ "learning_rate": 1.9281518685621863e-06,
3562
+ "loss": 0.2979,
3563
+ "step": 12600
3564
+ },
3565
+ {
3566
+ "epoch": 2.901631808779591,
3567
+ "grad_norm": 5.792328834533691,
3568
+ "learning_rate": 1.8217417212905424e-06,
3569
+ "loss": 0.3037,
3570
+ "step": 12625
3571
+ },
3572
+ {
3573
+ "epoch": 2.907377614341531,
3574
+ "grad_norm": 4.29570198059082,
3575
+ "learning_rate": 1.7153315740188987e-06,
3576
+ "loss": 0.2398,
3577
+ "step": 12650
3578
+ },
3579
+ {
3580
+ "epoch": 2.9131234199034703,
3581
+ "grad_norm": 9.291726112365723,
3582
+ "learning_rate": 1.6089214267472546e-06,
3583
+ "loss": 0.3226,
3584
+ "step": 12675
3585
+ },
3586
+ {
3587
+ "epoch": 2.9188692254654103,
3588
+ "grad_norm": 11.283625602722168,
3589
+ "learning_rate": 1.5025112794756109e-06,
3590
+ "loss": 0.3077,
3591
+ "step": 12700
3592
+ },
3593
+ {
3594
+ "epoch": 2.9246150310273498,
3595
+ "grad_norm": 9.59054946899414,
3596
+ "learning_rate": 1.396101132203967e-06,
3597
+ "loss": 0.2839,
3598
+ "step": 12725
3599
+ },
3600
+ {
3601
+ "epoch": 2.9303608365892897,
3602
+ "grad_norm": 6.355679035186768,
3603
+ "learning_rate": 1.2896909849323233e-06,
3604
+ "loss": 0.2673,
3605
+ "step": 12750
3606
+ },
3607
+ {
3608
+ "epoch": 2.9361066421512296,
3609
+ "grad_norm": 4.829615116119385,
3610
+ "learning_rate": 1.1832808376606794e-06,
3611
+ "loss": 0.3218,
3612
+ "step": 12775
3613
+ },
3614
+ {
3615
+ "epoch": 2.9418524477131696,
3616
+ "grad_norm": 3.804570198059082,
3617
+ "learning_rate": 1.0768706903890355e-06,
3618
+ "loss": 0.2438,
3619
+ "step": 12800
3620
+ },
3621
+ {
3622
+ "epoch": 2.947598253275109,
3623
+ "grad_norm": 9.905104637145996,
3624
+ "learning_rate": 9.704605431173918e-07,
3625
+ "loss": 0.3214,
3626
+ "step": 12825
3627
+ },
3628
+ {
3629
+ "epoch": 2.953344058837049,
3630
+ "grad_norm": 19.192649841308594,
3631
+ "learning_rate": 8.64050395845748e-07,
3632
+ "loss": 0.2366,
3633
+ "step": 12850
3634
+ },
3635
+ {
3636
+ "epoch": 2.9590898643989885,
3637
+ "grad_norm": 8.720338821411133,
3638
+ "learning_rate": 7.576402485741041e-07,
3639
+ "loss": 0.3157,
3640
+ "step": 12875
3641
+ },
3642
+ {
3643
+ "epoch": 2.9648356699609284,
3644
+ "grad_norm": 11.466854095458984,
3645
+ "learning_rate": 6.512301013024603e-07,
3646
+ "loss": 0.3257,
3647
+ "step": 12900
3648
+ },
3649
+ {
3650
+ "epoch": 2.9705814755228683,
3651
+ "grad_norm": 12.517078399658203,
3652
+ "learning_rate": 5.448199540308164e-07,
3653
+ "loss": 0.3519,
3654
+ "step": 12925
3655
+ },
3656
+ {
3657
+ "epoch": 2.9763272810848083,
3658
+ "grad_norm": 7.879445552825928,
3659
+ "learning_rate": 4.384098067591726e-07,
3660
+ "loss": 0.3597,
3661
+ "step": 12950
3662
+ },
3663
+ {
3664
+ "epoch": 2.9820730866467478,
3665
+ "grad_norm": 7.040919780731201,
3666
+ "learning_rate": 3.3199965948752875e-07,
3667
+ "loss": 0.354,
3668
+ "step": 12975
3669
+ },
3670
+ {
3671
+ "epoch": 2.9878188922086877,
3672
+ "grad_norm": 5.912668228149414,
3673
+ "learning_rate": 2.255895122158849e-07,
3674
+ "loss": 0.2328,
3675
+ "step": 13000
3676
+ },
3677
+ {
3678
+ "epoch": 2.993564697770627,
3679
+ "grad_norm": 2.8012125492095947,
3680
+ "learning_rate": 1.1917936494424109e-07,
3681
+ "loss": 0.2318,
3682
+ "step": 13025
3683
+ },
3684
+ {
3685
+ "epoch": 2.999310503332567,
3686
+ "grad_norm": 5.125675201416016,
3687
+ "learning_rate": 1.2769217672597258e-08,
3688
+ "loss": 0.2961,
3689
+ "step": 13050
3690
+ },
3691
+ {
3692
+ "epoch": 3.0,
3693
+ "eval_gen_len": 26.3516,
3694
+ "eval_loss": 0.59325110912323,
3695
+ "eval_rouge1": 87.5985,
3696
+ "eval_rouge2": 74.3003,
3697
+ "eval_rougeL": 86.0508,
3698
+ "eval_rougeLsum": 86.0787,
3699
+ "eval_runtime": 2117.538,
3700
+ "eval_samples_per_second": 1.37,
3701
+ "eval_steps_per_second": 0.343,
3702
+ "step": 13053
3703
+ }
3704
+ ],
3705
+ "logging_steps": 25,
3706
+ "max_steps": 13053,
3707
+ "num_input_tokens_seen": 0,
3708
+ "num_train_epochs": 3,
3709
+ "save_steps": 500,
3710
+ "stateful_callbacks": {
3711
+ "EarlyStoppingCallback": {
3712
+ "args": {
3713
+ "early_stopping_patience": 5,
3714
+ "early_stopping_threshold": 0.01
3715
+ },
3716
+ "attributes": {
3717
+ "early_stopping_patience_counter": 0
3718
+ }
3719
+ },
3720
+ "TrainerControl": {
3721
+ "args": {
3722
+ "should_epoch_stop": false,
3723
+ "should_evaluate": false,
3724
+ "should_log": false,
3725
+ "should_save": true,
3726
+ "should_training_stop": true
3727
+ },
3728
+ "attributes": {}
3729
+ }
3730
+ },
3731
+ "total_flos": 197710094794752.0,
3732
+ "train_batch_size": 2,
3733
+ "trial_name": null,
3734
+ "trial_params": null
3735
+ }
checkpoint-13053/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9c1786ca48fbfd01edce28d1cc0b6b3f5c384e7cc5f3fbe79faa189d691ee3e
3
+ size 5368
checkpoint-13053/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Helsinki-NLP/opus-mt-id-en",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "swish",
6
+ "add_bias_logits": false,
7
+ "add_final_layer_norm": false,
8
+ "architectures": [
9
+ "MarianMTModel"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 0,
13
+ "classif_dropout": 0.0,
14
+ "classifier_dropout": 0.0,
15
+ "d_model": 512,
16
+ "decoder_attention_heads": 8,
17
+ "decoder_ffn_dim": 2048,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_start_token_id": 54795,
21
+ "decoder_vocab_size": 54796,
22
+ "dropout": 0.1,
23
+ "encoder_attention_heads": 8,
24
+ "encoder_ffn_dim": 2048,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 6,
27
+ "eos_token_id": 0,
28
+ "forced_eos_token_id": 0,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 54795,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.45.0",
54
+ "use_cache": true,
55
+ "vocab_size": 54796
56
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 54795
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 54795,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 6,
13
+ "pad_token_id": 54795,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.45.0"
16
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:035031492308c1194f63ad8f86e4efdfac16883fea241e6f258a665eefc7df44
3
+ size 289024432
runs/Oct08_19-23-32_r-damand2061-autotrain-e8fxndl4-de44e-ub1ee/events.out.tfevents.1728415416.r-damand2061-autotrain-e8fxndl4-de44e-ub1ee.103.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b094fa96070d26889ebdd1a19a7659fe20529cfaa93f0ba24809920040d4b14
3
+ size 117907
runs/Oct08_19-23-32_r-damand2061-autotrain-e8fxndl4-de44e-ub1ee/events.out.tfevents.1728436889.r-damand2061-autotrain-e8fxndl4-de44e-ub1ee.103.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d8b79e0cb902cb9ea85942db89c2f265da05814657d88f9b7020944a6d0a6d4
3
+ size 613
source.spm ADDED
Binary file (801 kB). View file
 
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
target.spm ADDED
Binary file (796 kB). View file
 
tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "54795": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "eos_token": "</s>",
30
+ "model_max_length": 512,
31
+ "pad_token": "<pad>",
32
+ "separate_vocabs": false,
33
+ "source_lang": "id",
34
+ "sp_model_kwargs": {},
35
+ "target_lang": "en",
36
+ "tokenizer_class": "MarianTokenizer",
37
+ "unk_token": "<unk>"
38
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9c1786ca48fbfd01edce28d1cc0b6b3f5c384e7cc5f3fbe79faa189d691ee3e
3
+ size 5368
training_params.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_path": "jakartaresearch/inglish",
3
+ "model": "Helsinki-NLP/opus-mt-id-en",
4
+ "username": "damand2061",
5
+ "seed": 42,
6
+ "train_split": "train",
7
+ "valid_split": "validation",
8
+ "project_name": "autotrain-i56bj-d90g7",
9
+ "push_to_hub": true,
10
+ "text_column": "indonesian",
11
+ "target_column": "english",
12
+ "lr": 5e-05,
13
+ "epochs": 3,
14
+ "max_seq_length": 128,
15
+ "max_target_length": 128,
16
+ "batch_size": 2,
17
+ "warmup_ratio": 0.1,
18
+ "gradient_accumulation": 1,
19
+ "optimizer": "adamw_torch",
20
+ "scheduler": "linear",
21
+ "weight_decay": 0.0,
22
+ "max_grad_norm": 1.0,
23
+ "logging_steps": -1,
24
+ "eval_strategy": "epoch",
25
+ "auto_find_batch_size": false,
26
+ "mixed_precision": "fp16",
27
+ "save_total_limit": 1,
28
+ "peft": false,
29
+ "quantization": "int8",
30
+ "lora_r": 16,
31
+ "lora_alpha": 32,
32
+ "lora_dropout": 0.05,
33
+ "target_modules": "all-linear",
34
+ "log": "tensorboard",
35
+ "early_stopping_patience": 5,
36
+ "early_stopping_threshold": 0.01
37
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff