eljavatar committed on
Commit
b884997
1 Parent(s): 90e964d

Upload model finetuned on codet5p-220m using strategy src_fm_fc_dctx

Browse files
README.md CHANGED
@@ -1,3 +1,52 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: bsd-3-clause
3
+ base_model: Salesforce/codet5p-220m
4
+ tags:
5
+ - generated_from_trainer
6
+ model-index:
7
+ - name: dynamtests_01_codet5p_src_fm_fc_dctx
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ # dynamtests_01_codet5p_src_fm_fc_dctx
15
+
16
+ This model is a fine-tuned version of [Salesforce/codet5p-220m](https://huggingface.co/Salesforce/codet5p-220m) on an unknown dataset.
17
+
18
+ ## Model description
19
+
20
+ More information needed
21
+
22
+ ## Intended uses & limitations
23
+
24
+ More information needed
25
+
26
+ ## Training and evaluation data
27
+
28
+ More information needed
29
+
30
+ ## Training procedure
31
+
32
+ ### Training hyperparameters
33
+
34
+ The following hyperparameters were used during training:
35
+ - learning_rate: 2e-05
36
+ - train_batch_size: 8
37
+ - eval_batch_size: 32
38
+ - seed: 42
39
+ - gradient_accumulation_steps: 8
40
+ - total_train_batch_size: 64
41
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
+ - lr_scheduler_type: linear
43
+ - lr_scheduler_warmup_steps: 1974
44
+ - num_epochs: 4
45
+ - mixed_precision_training: Native AMP
46
+
47
+ ### Framework versions
48
+
49
+ - Transformers 4.40.0
50
+ - Pytorch 2.1.0
51
+ - Datasets 3.0.0
52
+ - Tokenizers 0.19.1
added_tokens.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</ECTX>": 32103,
3
+ "</FCTX>": 32101,
4
+ "</PRIVATE_FCTX>": 32105,
5
+ "<ECTX>": 32102,
6
+ "<FCTX>": 32100,
7
+ "<PRIVATE_FCTX>": 32104
8
+ }
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Salesforce/codet5p-220m",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "bos_token_id": 1,
7
+ "classifier_dropout": 0.0,
8
+ "d_ff": 3072,
9
+ "d_kv": 64,
10
+ "d_model": 768,
11
+ "decoder_start_token_id": 0,
12
+ "dense_act_fn": "relu",
13
+ "dropout_rate": 0.1,
14
+ "eos_token_id": 2,
15
+ "feed_forward_proj": "relu",
16
+ "initializer_factor": 1.0,
17
+ "is_encoder_decoder": true,
18
+ "is_gated_act": false,
19
+ "layer_norm_epsilon": 1e-06,
20
+ "model_type": "t5",
21
+ "n_positions": 1024,
22
+ "num_decoder_layers": 12,
23
+ "num_heads": 12,
24
+ "num_layers": 12,
25
+ "output_past": true,
26
+ "pad_token_id": 0,
27
+ "relative_attention_max_distance": 128,
28
+ "relative_attention_num_buckets": 32,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.40.0",
31
+ "use_cache": true,
32
+ "vocab_size": 32106
33
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "decoder_start_token_id": 0,
5
+ "eos_token_id": 2,
6
+ "pad_token_id": 0,
7
+ "transformers_version": "4.40.0"
8
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c89fe1d2448925fbe7c35105204834a24d4703a6a86010241401f99b19e8db6d
3
+ size 891635790
special_tokens_map.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<FCTX>",
4
+ "</FCTX>",
5
+ "<ECTX>",
6
+ "</ECTX>",
7
+ "<PRIVATE_FCTX>",
8
+ "</PRIVATE_FCTX>"
9
+ ],
10
+ "bos_token": {
11
+ "content": "<s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ "cls_token": {
18
+ "content": "<s>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "eos_token": {
25
+ "content": "</s>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "mask_token": {
32
+ "content": "<mask>",
33
+ "lstrip": true,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ "pad_token": {
39
+ "content": "<pad>",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ "sep_token": {
46
+ "content": "</s>",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ "unk_token": {
53
+ "content": "<unk>",
54
+ "lstrip": false,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ }
59
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<pad>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "32100": {
45
+ "content": "<FCTX>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "32101": {
53
+ "content": "</FCTX>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "32102": {
61
+ "content": "<ECTX>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "32103": {
69
+ "content": "</ECTX>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "32104": {
77
+ "content": "<PRIVATE_FCTX>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "32105": {
85
+ "content": "</PRIVATE_FCTX>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ }
92
+ },
93
+ "additional_special_tokens": [
94
+ "<FCTX>",
95
+ "</FCTX>",
96
+ "<ECTX>",
97
+ "</ECTX>",
98
+ "<PRIVATE_FCTX>",
99
+ "</PRIVATE_FCTX>"
100
+ ],
101
+ "bos_token": "<s>",
102
+ "clean_up_tokenization_spaces": true,
103
+ "cls_token": "<s>",
104
+ "eos_token": "</s>",
105
+ "errors": "replace",
106
+ "mask_token": "<mask>",
107
+ "model_max_length": 1024,
108
+ "pad_token": "<pad>",
109
+ "sep_token": "</s>",
110
+ "tokenizer_class": "RobertaTokenizer",
111
+ "trim_offsets": true,
112
+ "unk_token": "<unk>"
113
+ }
trainer_state.json ADDED
@@ -0,0 +1,1161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.0085912942886353,
3
+ "best_model_checkpoint": "/root/finetuning_executions/dynamtests_01_codet5p_src_fm_fc_dctx/checkpoint-39484",
4
+ "epoch": 3.999797396545611,
5
+ "eval_steps": 500,
6
+ "global_step": 39484,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.025325431798612165,
13
+ "grad_norm": 1.3758184909820557,
14
+ "learning_rate": 2.462006079027356e-06,
15
+ "loss": 2.5021,
16
+ "step": 250
17
+ },
18
+ {
19
+ "epoch": 0.05065086359722433,
20
+ "grad_norm": 1.0281180143356323,
21
+ "learning_rate": 4.994934143870315e-06,
22
+ "loss": 1.2333,
23
+ "step": 500
24
+ },
25
+ {
26
+ "epoch": 0.0759762953958365,
27
+ "grad_norm": 1.0643247365951538,
28
+ "learning_rate": 7.527862208713273e-06,
29
+ "loss": 1.1968,
30
+ "step": 750
31
+ },
32
+ {
33
+ "epoch": 0.10130172719444866,
34
+ "grad_norm": 1.0396573543548584,
35
+ "learning_rate": 1.006079027355623e-05,
36
+ "loss": 1.1765,
37
+ "step": 1000
38
+ },
39
+ {
40
+ "epoch": 0.12662715899306085,
41
+ "grad_norm": 1.0461889505386353,
42
+ "learning_rate": 1.259371833839919e-05,
43
+ "loss": 1.154,
44
+ "step": 1250
45
+ },
46
+ {
47
+ "epoch": 0.151952590791673,
48
+ "grad_norm": 0.9765501022338867,
49
+ "learning_rate": 1.5126646403242148e-05,
50
+ "loss": 1.1434,
51
+ "step": 1500
52
+ },
53
+ {
54
+ "epoch": 0.17727802259028516,
55
+ "grad_norm": 1.5702638626098633,
56
+ "learning_rate": 1.765957446808511e-05,
57
+ "loss": 1.1293,
58
+ "step": 1750
59
+ },
60
+ {
61
+ "epoch": 0.20260345438889732,
62
+ "grad_norm": 0.9935035705566406,
63
+ "learning_rate": 1.998986936816849e-05,
64
+ "loss": 1.1071,
65
+ "step": 2000
66
+ },
67
+ {
68
+ "epoch": 0.2279288861875095,
69
+ "grad_norm": 0.9715584516525269,
70
+ "learning_rate": 1.985657158091176e-05,
71
+ "loss": 1.0968,
72
+ "step": 2250
73
+ },
74
+ {
75
+ "epoch": 0.2532543179861217,
76
+ "grad_norm": 0.9306278228759766,
77
+ "learning_rate": 1.9723273793655026e-05,
78
+ "loss": 1.0923,
79
+ "step": 2500
80
+ },
81
+ {
82
+ "epoch": 0.2785797497847338,
83
+ "grad_norm": 1.0911701917648315,
84
+ "learning_rate": 1.9589976006398295e-05,
85
+ "loss": 1.0749,
86
+ "step": 2750
87
+ },
88
+ {
89
+ "epoch": 0.303905181583346,
90
+ "grad_norm": 1.024587869644165,
91
+ "learning_rate": 1.9456678219141565e-05,
92
+ "loss": 1.0695,
93
+ "step": 3000
94
+ },
95
+ {
96
+ "epoch": 0.32923061338195814,
97
+ "grad_norm": 0.9198426008224487,
98
+ "learning_rate": 1.932338043188483e-05,
99
+ "loss": 1.0677,
100
+ "step": 3250
101
+ },
102
+ {
103
+ "epoch": 0.3545560451805703,
104
+ "grad_norm": 0.97704017162323,
105
+ "learning_rate": 1.91900826446281e-05,
106
+ "loss": 1.0551,
107
+ "step": 3500
108
+ },
109
+ {
110
+ "epoch": 0.3798814769791825,
111
+ "grad_norm": 0.8315013647079468,
112
+ "learning_rate": 1.905678485737137e-05,
113
+ "loss": 1.0388,
114
+ "step": 3750
115
+ },
116
+ {
117
+ "epoch": 0.40520690877779464,
118
+ "grad_norm": 0.8992215991020203,
119
+ "learning_rate": 1.8923487070114637e-05,
120
+ "loss": 1.0451,
121
+ "step": 4000
122
+ },
123
+ {
124
+ "epoch": 0.4305323405764068,
125
+ "grad_norm": 1.0379681587219238,
126
+ "learning_rate": 1.8790189282857906e-05,
127
+ "loss": 1.0355,
128
+ "step": 4250
129
+ },
130
+ {
131
+ "epoch": 0.455857772375019,
132
+ "grad_norm": 0.9491779804229736,
133
+ "learning_rate": 1.8656891495601176e-05,
134
+ "loss": 1.0374,
135
+ "step": 4500
136
+ },
137
+ {
138
+ "epoch": 0.48118320417363114,
139
+ "grad_norm": 0.8982920050621033,
140
+ "learning_rate": 1.8523593708344442e-05,
141
+ "loss": 1.0267,
142
+ "step": 4750
143
+ },
144
+ {
145
+ "epoch": 0.5065086359722434,
146
+ "grad_norm": 0.8307796716690063,
147
+ "learning_rate": 1.8390295921087712e-05,
148
+ "loss": 1.0161,
149
+ "step": 5000
150
+ },
151
+ {
152
+ "epoch": 0.5318340677708555,
153
+ "grad_norm": 0.9225348830223083,
154
+ "learning_rate": 1.825699813383098e-05,
155
+ "loss": 1.0142,
156
+ "step": 5250
157
+ },
158
+ {
159
+ "epoch": 0.5571594995694676,
160
+ "grad_norm": 0.8506412506103516,
161
+ "learning_rate": 1.8123700346574248e-05,
162
+ "loss": 1.0091,
163
+ "step": 5500
164
+ },
165
+ {
166
+ "epoch": 0.5824849313680798,
167
+ "grad_norm": 0.9221129417419434,
168
+ "learning_rate": 1.7990402559317517e-05,
169
+ "loss": 1.0004,
170
+ "step": 5750
171
+ },
172
+ {
173
+ "epoch": 0.607810363166692,
174
+ "grad_norm": 0.9478843212127686,
175
+ "learning_rate": 1.7857104772060784e-05,
176
+ "loss": 1.0046,
177
+ "step": 6000
178
+ },
179
+ {
180
+ "epoch": 0.6331357949653041,
181
+ "grad_norm": 2.4678380489349365,
182
+ "learning_rate": 1.7723806984804053e-05,
183
+ "loss": 1.0008,
184
+ "step": 6250
185
+ },
186
+ {
187
+ "epoch": 0.6584612267639163,
188
+ "grad_norm": 0.8661640286445618,
189
+ "learning_rate": 1.7590509197547323e-05,
190
+ "loss": 1.0027,
191
+ "step": 6500
192
+ },
193
+ {
194
+ "epoch": 0.6837866585625285,
195
+ "grad_norm": 0.8918209671974182,
196
+ "learning_rate": 1.7457211410290592e-05,
197
+ "loss": 0.9995,
198
+ "step": 6750
199
+ },
200
+ {
201
+ "epoch": 0.7091120903611406,
202
+ "grad_norm": 0.8873420357704163,
203
+ "learning_rate": 1.732391362303386e-05,
204
+ "loss": 0.9932,
205
+ "step": 7000
206
+ },
207
+ {
208
+ "epoch": 0.7344375221597528,
209
+ "grad_norm": 0.8874693512916565,
210
+ "learning_rate": 1.719061583577713e-05,
211
+ "loss": 0.9785,
212
+ "step": 7250
213
+ },
214
+ {
215
+ "epoch": 0.759762953958365,
216
+ "grad_norm": 0.8625825643539429,
217
+ "learning_rate": 1.7057318048520395e-05,
218
+ "loss": 0.9817,
219
+ "step": 7500
220
+ },
221
+ {
222
+ "epoch": 0.7850883857569771,
223
+ "grad_norm": 0.9080167412757874,
224
+ "learning_rate": 1.6924020261263664e-05,
225
+ "loss": 0.9783,
226
+ "step": 7750
227
+ },
228
+ {
229
+ "epoch": 0.8104138175555893,
230
+ "grad_norm": 0.9515653848648071,
231
+ "learning_rate": 1.6790722474006934e-05,
232
+ "loss": 0.984,
233
+ "step": 8000
234
+ },
235
+ {
236
+ "epoch": 0.8357392493542015,
237
+ "grad_norm": 0.8340937495231628,
238
+ "learning_rate": 1.66574246867502e-05,
239
+ "loss": 0.9819,
240
+ "step": 8250
241
+ },
242
+ {
243
+ "epoch": 0.8610646811528136,
244
+ "grad_norm": 0.9471110701560974,
245
+ "learning_rate": 1.652412689949347e-05,
246
+ "loss": 0.974,
247
+ "step": 8500
248
+ },
249
+ {
250
+ "epoch": 0.8863901129514258,
251
+ "grad_norm": 0.8862300515174866,
252
+ "learning_rate": 1.639082911223674e-05,
253
+ "loss": 0.9653,
254
+ "step": 8750
255
+ },
256
+ {
257
+ "epoch": 0.911715544750038,
258
+ "grad_norm": 0.8293271064758301,
259
+ "learning_rate": 1.6257531324980006e-05,
260
+ "loss": 0.9615,
261
+ "step": 9000
262
+ },
263
+ {
264
+ "epoch": 0.9370409765486502,
265
+ "grad_norm": 0.9127717614173889,
266
+ "learning_rate": 1.6124233537723275e-05,
267
+ "loss": 0.9671,
268
+ "step": 9250
269
+ },
270
+ {
271
+ "epoch": 0.9623664083472623,
272
+ "grad_norm": 0.8955530524253845,
273
+ "learning_rate": 1.599093575046654e-05,
274
+ "loss": 0.9606,
275
+ "step": 9500
276
+ },
277
+ {
278
+ "epoch": 0.9876918401458745,
279
+ "grad_norm": 0.9058429002761841,
280
+ "learning_rate": 1.585763796320981e-05,
281
+ "loss": 0.9586,
282
+ "step": 9750
283
+ },
284
+ {
285
+ "epoch": 0.9999493491364028,
286
+ "eval_loss": 1.0199180841445923,
287
+ "eval_runtime": 1639.424,
288
+ "eval_samples_per_second": 40.384,
289
+ "eval_steps_per_second": 1.262,
290
+ "step": 9871
291
+ },
292
+ {
293
+ "epoch": 1.0130172719444868,
294
+ "grad_norm": 0.8560991287231445,
295
+ "learning_rate": 1.572434017595308e-05,
296
+ "loss": 0.939,
297
+ "step": 10000
298
+ },
299
+ {
300
+ "epoch": 1.0383427037430988,
301
+ "grad_norm": 0.8284989595413208,
302
+ "learning_rate": 1.559104238869635e-05,
303
+ "loss": 0.9317,
304
+ "step": 10250
305
+ },
306
+ {
307
+ "epoch": 1.063668135541711,
308
+ "grad_norm": 0.8807125091552734,
309
+ "learning_rate": 1.5457744601439617e-05,
310
+ "loss": 0.9363,
311
+ "step": 10500
312
+ },
313
+ {
314
+ "epoch": 1.088993567340323,
315
+ "grad_norm": 0.8796108961105347,
316
+ "learning_rate": 1.5324446814182886e-05,
317
+ "loss": 0.9236,
318
+ "step": 10750
319
+ },
320
+ {
321
+ "epoch": 1.1143189991389353,
322
+ "grad_norm": 0.8540758490562439,
323
+ "learning_rate": 1.5191149026926156e-05,
324
+ "loss": 0.9233,
325
+ "step": 11000
326
+ },
327
+ {
328
+ "epoch": 1.1396444309375475,
329
+ "grad_norm": 0.8657658696174622,
330
+ "learning_rate": 1.5057851239669424e-05,
331
+ "loss": 0.9346,
332
+ "step": 11250
333
+ },
334
+ {
335
+ "epoch": 1.1649698627361595,
336
+ "grad_norm": 0.8995893001556396,
337
+ "learning_rate": 1.4924553452412692e-05,
338
+ "loss": 0.917,
339
+ "step": 11500
340
+ },
341
+ {
342
+ "epoch": 1.1902952945347718,
343
+ "grad_norm": 0.9290043115615845,
344
+ "learning_rate": 1.479125566515596e-05,
345
+ "loss": 0.9338,
346
+ "step": 11750
347
+ },
348
+ {
349
+ "epoch": 1.215620726333384,
350
+ "grad_norm": 0.8952407836914062,
351
+ "learning_rate": 1.4657957877899228e-05,
352
+ "loss": 0.9257,
353
+ "step": 12000
354
+ },
355
+ {
356
+ "epoch": 1.240946158131996,
357
+ "grad_norm": 0.8839919567108154,
358
+ "learning_rate": 1.4524660090642497e-05,
359
+ "loss": 0.9104,
360
+ "step": 12250
361
+ },
362
+ {
363
+ "epoch": 1.2662715899306083,
364
+ "grad_norm": 0.9677265286445618,
365
+ "learning_rate": 1.4391362303385765e-05,
366
+ "loss": 0.9259,
367
+ "step": 12500
368
+ },
369
+ {
370
+ "epoch": 1.2915970217292205,
371
+ "grad_norm": 0.9325098395347595,
372
+ "learning_rate": 1.4258064516129033e-05,
373
+ "loss": 0.9203,
374
+ "step": 12750
375
+ },
376
+ {
377
+ "epoch": 1.3169224535278325,
378
+ "grad_norm": 0.778640866279602,
379
+ "learning_rate": 1.4124766728872301e-05,
380
+ "loss": 0.912,
381
+ "step": 13000
382
+ },
383
+ {
384
+ "epoch": 1.3422478853264448,
385
+ "grad_norm": 0.8638414144515991,
386
+ "learning_rate": 1.399146894161557e-05,
387
+ "loss": 0.9181,
388
+ "step": 13250
389
+ },
390
+ {
391
+ "epoch": 1.367573317125057,
392
+ "grad_norm": 1.0181560516357422,
393
+ "learning_rate": 1.3858171154358839e-05,
394
+ "loss": 0.9146,
395
+ "step": 13500
396
+ },
397
+ {
398
+ "epoch": 1.392898748923669,
399
+ "grad_norm": 0.9884174466133118,
400
+ "learning_rate": 1.3724873367102107e-05,
401
+ "loss": 0.9037,
402
+ "step": 13750
403
+ },
404
+ {
405
+ "epoch": 1.4182241807222813,
406
+ "grad_norm": 1.1058709621429443,
407
+ "learning_rate": 1.3591575579845375e-05,
408
+ "loss": 0.9159,
409
+ "step": 14000
410
+ },
411
+ {
412
+ "epoch": 1.4435496125208935,
413
+ "grad_norm": 1.0129822492599487,
414
+ "learning_rate": 1.3458277792588642e-05,
415
+ "loss": 0.8993,
416
+ "step": 14250
417
+ },
418
+ {
419
+ "epoch": 1.4688750443195056,
420
+ "grad_norm": 0.8782840967178345,
421
+ "learning_rate": 1.3324980005331914e-05,
422
+ "loss": 0.9061,
423
+ "step": 14500
424
+ },
425
+ {
426
+ "epoch": 1.4942004761181178,
427
+ "grad_norm": 0.9015256762504578,
428
+ "learning_rate": 1.3191682218075182e-05,
429
+ "loss": 0.9122,
430
+ "step": 14750
431
+ },
432
+ {
433
+ "epoch": 1.51952590791673,
434
+ "grad_norm": 0.9482327103614807,
435
+ "learning_rate": 1.305838443081845e-05,
436
+ "loss": 0.9091,
437
+ "step": 15000
438
+ },
439
+ {
440
+ "epoch": 1.544851339715342,
441
+ "grad_norm": 0.8400648236274719,
442
+ "learning_rate": 1.2925086643561718e-05,
443
+ "loss": 0.9064,
444
+ "step": 15250
445
+ },
446
+ {
447
+ "epoch": 1.5701767715139543,
448
+ "grad_norm": 0.9606112837791443,
449
+ "learning_rate": 1.2791788856304987e-05,
450
+ "loss": 0.8983,
451
+ "step": 15500
452
+ },
453
+ {
454
+ "epoch": 1.5955022033125665,
455
+ "grad_norm": 0.944854736328125,
456
+ "learning_rate": 1.2658491069048255e-05,
457
+ "loss": 0.9055,
458
+ "step": 15750
459
+ },
460
+ {
461
+ "epoch": 1.6208276351111786,
462
+ "grad_norm": 0.8674355745315552,
463
+ "learning_rate": 1.2525193281791523e-05,
464
+ "loss": 0.8895,
465
+ "step": 16000
466
+ },
467
+ {
468
+ "epoch": 1.6461530669097908,
469
+ "grad_norm": 0.8632267713546753,
470
+ "learning_rate": 1.2391895494534791e-05,
471
+ "loss": 0.8876,
472
+ "step": 16250
473
+ },
474
+ {
475
+ "epoch": 1.671478498708403,
476
+ "grad_norm": 0.903851330280304,
477
+ "learning_rate": 1.2258597707278059e-05,
478
+ "loss": 0.9007,
479
+ "step": 16500
480
+ },
481
+ {
482
+ "epoch": 1.696803930507015,
483
+ "grad_norm": 0.9242746829986572,
484
+ "learning_rate": 1.2125833111170356e-05,
485
+ "loss": 0.8953,
486
+ "step": 16750
487
+ },
488
+ {
489
+ "epoch": 1.7221293623056273,
490
+ "grad_norm": 0.9627535343170166,
491
+ "learning_rate": 1.1992535323913624e-05,
492
+ "loss": 0.8881,
493
+ "step": 17000
494
+ },
495
+ {
496
+ "epoch": 1.7474547941042395,
497
+ "grad_norm": 0.8524439334869385,
498
+ "learning_rate": 1.1859237536656893e-05,
499
+ "loss": 0.8913,
500
+ "step": 17250
501
+ },
502
+ {
503
+ "epoch": 1.7727802259028516,
504
+ "grad_norm": 0.9666581749916077,
505
+ "learning_rate": 1.1726472940549186e-05,
506
+ "loss": 0.8972,
507
+ "step": 17500
508
+ },
509
+ {
510
+ "epoch": 1.7981056577014638,
511
+ "grad_norm": 0.8809413909912109,
512
+ "learning_rate": 1.1593175153292454e-05,
513
+ "loss": 0.8858,
514
+ "step": 17750
515
+ },
516
+ {
517
+ "epoch": 1.823431089500076,
518
+ "grad_norm": 0.903626024723053,
519
+ "learning_rate": 1.1459877366035726e-05,
520
+ "loss": 0.8907,
521
+ "step": 18000
522
+ },
523
+ {
524
+ "epoch": 1.848756521298688,
525
+ "grad_norm": 0.8203657865524292,
526
+ "learning_rate": 1.1326579578778994e-05,
527
+ "loss": 0.8849,
528
+ "step": 18250
529
+ },
530
+ {
531
+ "epoch": 1.8740819530973003,
532
+ "grad_norm": 0.8978894948959351,
533
+ "learning_rate": 1.1193281791522262e-05,
534
+ "loss": 0.8831,
535
+ "step": 18500
536
+ },
537
+ {
538
+ "epoch": 1.8994073848959125,
539
+ "grad_norm": 0.9283676743507385,
540
+ "learning_rate": 1.105998400426553e-05,
541
+ "loss": 0.8828,
542
+ "step": 18750
543
+ },
544
+ {
545
+ "epoch": 1.9247328166945246,
546
+ "grad_norm": 0.9514400959014893,
547
+ "learning_rate": 1.09266862170088e-05,
548
+ "loss": 0.8761,
549
+ "step": 19000
550
+ },
551
+ {
552
+ "epoch": 1.9500582484931368,
553
+ "grad_norm": 0.8809083104133606,
554
+ "learning_rate": 1.0793388429752067e-05,
555
+ "loss": 0.8861,
556
+ "step": 19250
557
+ },
558
+ {
559
+ "epoch": 1.975383680291749,
560
+ "grad_norm": 0.8767380118370056,
561
+ "learning_rate": 1.0660090642495335e-05,
562
+ "loss": 0.8811,
563
+ "step": 19500
564
+ },
565
+ {
566
+ "epoch": 2.0,
567
+ "eval_loss": 1.0132982730865479,
568
+ "eval_runtime": 1638.7739,
569
+ "eval_samples_per_second": 40.4,
570
+ "eval_steps_per_second": 1.263,
571
+ "step": 19743
572
+ },
573
+ {
574
+ "epoch": 2.000709112090361,
575
+ "grad_norm": 0.9499657154083252,
576
+ "learning_rate": 1.0526792855238603e-05,
577
+ "loss": 0.8821,
578
+ "step": 19750
579
+ },
580
+ {
581
+ "epoch": 2.0260345438889735,
582
+ "grad_norm": 0.7973400950431824,
583
+ "learning_rate": 1.0393495067981871e-05,
584
+ "loss": 0.8608,
585
+ "step": 20000
586
+ },
587
+ {
588
+ "epoch": 2.0513599756875855,
589
+ "grad_norm": 0.9120876789093018,
590
+ "learning_rate": 1.0260197280725142e-05,
591
+ "loss": 0.8574,
592
+ "step": 20250
593
+ },
594
+ {
595
+ "epoch": 2.0766854074861976,
596
+ "grad_norm": 0.8534213304519653,
597
+ "learning_rate": 1.012689949346841e-05,
598
+ "loss": 0.8604,
599
+ "step": 20500
600
+ },
601
+ {
602
+ "epoch": 2.1020108392848096,
603
+ "grad_norm": 0.8855152726173401,
604
+ "learning_rate": 9.993601706211678e-06,
605
+ "loss": 0.8696,
606
+ "step": 20750
607
+ },
608
+ {
609
+ "epoch": 2.127336271083422,
610
+ "grad_norm": 0.9473037123680115,
611
+ "learning_rate": 9.860303918954946e-06,
612
+ "loss": 0.859,
613
+ "step": 21000
614
+ },
615
+ {
616
+ "epoch": 2.152661702882034,
617
+ "grad_norm": 0.9424493908882141,
618
+ "learning_rate": 9.727006131698214e-06,
619
+ "loss": 0.8577,
620
+ "step": 21250
621
+ },
622
+ {
623
+ "epoch": 2.177987134680646,
624
+ "grad_norm": 0.8066829442977905,
625
+ "learning_rate": 9.593708344441484e-06,
626
+ "loss": 0.8611,
627
+ "step": 21500
628
+ },
629
+ {
630
+ "epoch": 2.2033125664792585,
631
+ "grad_norm": 0.9358799457550049,
632
+ "learning_rate": 9.460410557184752e-06,
633
+ "loss": 0.8572,
634
+ "step": 21750
635
+ },
636
+ {
637
+ "epoch": 2.2286379982778706,
638
+ "grad_norm": 0.9982028007507324,
639
+ "learning_rate": 9.32711276992802e-06,
640
+ "loss": 0.8492,
641
+ "step": 22000
642
+ },
643
+ {
644
+ "epoch": 2.2539634300764826,
645
+ "grad_norm": 0.8830463290214539,
646
+ "learning_rate": 9.19381498267129e-06,
647
+ "loss": 0.8539,
648
+ "step": 22250
649
+ },
650
+ {
651
+ "epoch": 2.279288861875095,
652
+ "grad_norm": 0.9708883762359619,
653
+ "learning_rate": 9.061050386563584e-06,
654
+ "loss": 0.8495,
655
+ "step": 22500
656
+ },
657
+ {
658
+ "epoch": 2.304614293673707,
659
+ "grad_norm": 0.8464154005050659,
660
+ "learning_rate": 8.927752599306852e-06,
661
+ "loss": 0.8519,
662
+ "step": 22750
663
+ },
664
+ {
665
+ "epoch": 2.329939725472319,
666
+ "grad_norm": 0.9446752667427063,
667
+ "learning_rate": 8.79445481205012e-06,
668
+ "loss": 0.8545,
669
+ "step": 23000
670
+ },
671
+ {
672
+ "epoch": 2.3552651572709316,
673
+ "grad_norm": 0.9621785283088684,
674
+ "learning_rate": 8.66115702479339e-06,
675
+ "loss": 0.8482,
676
+ "step": 23250
677
+ },
678
+ {
679
+ "epoch": 2.3805905890695436,
680
+ "grad_norm": 0.9067039489746094,
681
+ "learning_rate": 8.527859237536658e-06,
682
+ "loss": 0.8582,
683
+ "step": 23500
684
+ },
685
+ {
686
+ "epoch": 2.4059160208681556,
687
+ "grad_norm": 0.9858378767967224,
688
+ "learning_rate": 8.394561450279926e-06,
689
+ "loss": 0.8565,
690
+ "step": 23750
691
+ },
692
+ {
693
+ "epoch": 2.431241452666768,
694
+ "grad_norm": 0.9362533092498779,
695
+ "learning_rate": 8.261263663023195e-06,
696
+ "loss": 0.8531,
697
+ "step": 24000
698
+ },
699
+ {
700
+ "epoch": 2.45656688446538,
701
+ "grad_norm": 0.9225192666053772,
702
+ "learning_rate": 8.127965875766463e-06,
703
+ "loss": 0.8555,
704
+ "step": 24250
705
+ },
706
+ {
707
+ "epoch": 2.481892316263992,
708
+ "grad_norm": 0.9358901381492615,
709
+ "learning_rate": 7.994668088509731e-06,
710
+ "loss": 0.8523,
711
+ "step": 24500
712
+ },
713
+ {
714
+ "epoch": 2.5072177480626046,
715
+ "grad_norm": 0.9531691670417786,
716
+ "learning_rate": 7.861370301252999e-06,
717
+ "loss": 0.8443,
718
+ "step": 24750
719
+ },
720
+ {
721
+ "epoch": 2.5325431798612166,
722
+ "grad_norm": 0.9370359182357788,
723
+ "learning_rate": 7.728072513996269e-06,
724
+ "loss": 0.8451,
725
+ "step": 25000
726
+ },
727
+ {
728
+ "epoch": 2.5578686116598286,
729
+ "grad_norm": 0.8840625882148743,
730
+ "learning_rate": 7.5947747267395365e-06,
731
+ "loss": 0.8533,
732
+ "step": 25250
733
+ },
734
+ {
735
+ "epoch": 2.583194043458441,
736
+ "grad_norm": 0.9283475875854492,
737
+ "learning_rate": 7.461476939482805e-06,
738
+ "loss": 0.8425,
739
+ "step": 25500
740
+ },
741
+ {
742
+ "epoch": 2.608519475257053,
743
+ "grad_norm": 0.908301055431366,
744
+ "learning_rate": 7.328179152226073e-06,
745
+ "loss": 0.8433,
746
+ "step": 25750
747
+ },
748
+ {
749
+ "epoch": 2.633844907055665,
750
+ "grad_norm": 0.9126138091087341,
751
+ "learning_rate": 7.194881364969342e-06,
752
+ "loss": 0.8401,
753
+ "step": 26000
754
+ },
755
+ {
756
+ "epoch": 2.6591703388542776,
757
+ "grad_norm": 0.8935621976852417,
758
+ "learning_rate": 7.061583577712611e-06,
759
+ "loss": 0.8418,
760
+ "step": 26250
761
+ },
762
+ {
763
+ "epoch": 2.6844957706528896,
764
+ "grad_norm": 0.8745056986808777,
765
+ "learning_rate": 6.928285790455879e-06,
766
+ "loss": 0.837,
767
+ "step": 26500
768
+ },
769
+ {
770
+ "epoch": 2.7098212024515016,
771
+ "grad_norm": 0.948512077331543,
772
+ "learning_rate": 6.795521194348175e-06,
773
+ "loss": 0.8411,
774
+ "step": 26750
775
+ },
776
+ {
777
+ "epoch": 2.735146634250114,
778
+ "grad_norm": 1.008754014968872,
779
+ "learning_rate": 6.6622234070914425e-06,
780
+ "loss": 0.8355,
781
+ "step": 27000
782
+ },
783
+ {
784
+ "epoch": 2.760472066048726,
785
+ "grad_norm": 1.0162386894226074,
786
+ "learning_rate": 6.528925619834712e-06,
787
+ "loss": 0.8405,
788
+ "step": 27250
789
+ },
790
+ {
791
+ "epoch": 2.785797497847338,
792
+ "grad_norm": 0.9260863661766052,
793
+ "learning_rate": 6.39562783257798e-06,
794
+ "loss": 0.8354,
795
+ "step": 27500
796
+ },
797
+ {
798
+ "epoch": 2.8111229296459506,
799
+ "grad_norm": 0.9513674378395081,
800
+ "learning_rate": 6.262330045321248e-06,
801
+ "loss": 0.8392,
802
+ "step": 27750
803
+ },
804
+ {
805
+ "epoch": 2.8364483614445626,
806
+ "grad_norm": 1.0211256742477417,
807
+ "learning_rate": 6.129032258064517e-06,
808
+ "loss": 0.8324,
809
+ "step": 28000
810
+ },
811
+ {
812
+ "epoch": 2.8617737932431746,
813
+ "grad_norm": 0.9345864057540894,
814
+ "learning_rate": 5.995734470807785e-06,
815
+ "loss": 0.8412,
816
+ "step": 28250
817
+ },
818
+ {
819
+ "epoch": 2.887099225041787,
820
+ "grad_norm": 0.8973652124404907,
821
+ "learning_rate": 5.8624366835510535e-06,
822
+ "loss": 0.8368,
823
+ "step": 28500
824
+ },
825
+ {
826
+ "epoch": 2.912424656840399,
827
+ "grad_norm": 0.8682575225830078,
828
+ "learning_rate": 5.7291388962943215e-06,
829
+ "loss": 0.8316,
830
+ "step": 28750
831
+ },
832
+ {
833
+ "epoch": 2.937750088639011,
834
+ "grad_norm": 0.9307655096054077,
835
+ "learning_rate": 5.595841109037591e-06,
836
+ "loss": 0.8365,
837
+ "step": 29000
838
+ },
839
+ {
840
+ "epoch": 2.9630755204376236,
841
+ "grad_norm": 1.0200505256652832,
842
+ "learning_rate": 5.462543321780859e-06,
843
+ "loss": 0.8395,
844
+ "step": 29250
845
+ },
846
+ {
847
+ "epoch": 2.9884009522362356,
848
+ "grad_norm": 1.0397480726242065,
849
+ "learning_rate": 5.329245534524128e-06,
850
+ "loss": 0.834,
851
+ "step": 29500
852
+ },
853
+ {
854
+ "epoch": 2.999949349136403,
855
+ "eval_loss": 1.0098419189453125,
856
+ "eval_runtime": 1638.8769,
857
+ "eval_samples_per_second": 40.398,
858
+ "eval_steps_per_second": 1.262,
859
+ "step": 29614
860
+ },
861
+ {
862
+ "epoch": 3.0137263840348476,
863
+ "grad_norm": 1.0044381618499756,
864
+ "learning_rate": 5.195947747267396e-06,
865
+ "loss": 0.8305,
866
+ "step": 29750
867
+ },
868
+ {
869
+ "epoch": 3.03905181583346,
870
+ "grad_norm": 1.0036746263504028,
871
+ "learning_rate": 5.062649960010664e-06,
872
+ "loss": 0.8284,
873
+ "step": 30000
874
+ },
875
+ {
876
+ "epoch": 3.064377247632072,
877
+ "grad_norm": 0.8896681070327759,
878
+ "learning_rate": 4.9293521727539325e-06,
879
+ "loss": 0.8296,
880
+ "step": 30250
881
+ },
882
+ {
883
+ "epoch": 3.089702679430684,
884
+ "grad_norm": 0.9313392639160156,
885
+ "learning_rate": 4.796587576646228e-06,
886
+ "loss": 0.8249,
887
+ "step": 30500
888
+ },
889
+ {
890
+ "epoch": 3.1150281112292966,
891
+ "grad_norm": 0.9054111838340759,
892
+ "learning_rate": 4.663822980538523e-06,
893
+ "loss": 0.8217,
894
+ "step": 30750
895
+ },
896
+ {
897
+ "epoch": 3.1403535430279086,
898
+ "grad_norm": 0.9691897630691528,
899
+ "learning_rate": 4.530525193281792e-06,
900
+ "loss": 0.8182,
901
+ "step": 31000
902
+ },
903
+ {
904
+ "epoch": 3.1656789748265206,
905
+ "grad_norm": 1.0348809957504272,
906
+ "learning_rate": 4.39722740602506e-06,
907
+ "loss": 0.8205,
908
+ "step": 31250
909
+ },
910
+ {
911
+ "epoch": 3.191004406625133,
912
+ "grad_norm": 0.8919842839241028,
913
+ "learning_rate": 4.263929618768329e-06,
914
+ "loss": 0.8142,
915
+ "step": 31500
916
+ },
917
+ {
918
+ "epoch": 3.216329838423745,
919
+ "grad_norm": 0.8951621651649475,
920
+ "learning_rate": 4.130631831511598e-06,
921
+ "loss": 0.8264,
922
+ "step": 31750
923
+ },
924
+ {
925
+ "epoch": 3.241655270222357,
926
+ "grad_norm": 0.9754297733306885,
927
+ "learning_rate": 3.9973340442548655e-06,
928
+ "loss": 0.8176,
929
+ "step": 32000
930
+ },
931
+ {
932
+ "epoch": 3.2669807020209696,
933
+ "grad_norm": 1.0148935317993164,
934
+ "learning_rate": 3.864036256998134e-06,
935
+ "loss": 0.8252,
936
+ "step": 32250
937
+ },
938
+ {
939
+ "epoch": 3.2923061338195816,
940
+ "grad_norm": 0.9025924801826477,
941
+ "learning_rate": 3.7307384697414027e-06,
942
+ "loss": 0.816,
943
+ "step": 32500
944
+ },
945
+ {
946
+ "epoch": 3.3176315656181936,
947
+ "grad_norm": 0.9092098474502563,
948
+ "learning_rate": 3.597440682484671e-06,
949
+ "loss": 0.8201,
950
+ "step": 32750
951
+ },
952
+ {
953
+ "epoch": 3.342956997416806,
954
+ "grad_norm": 0.9289584755897522,
955
+ "learning_rate": 3.4641428952279394e-06,
956
+ "loss": 0.8161,
957
+ "step": 33000
958
+ },
959
+ {
960
+ "epoch": 3.368282429215418,
961
+ "grad_norm": 0.9961521029472351,
962
+ "learning_rate": 3.3308451079712077e-06,
963
+ "loss": 0.8188,
964
+ "step": 33250
965
+ },
966
+ {
967
+ "epoch": 3.39360786101403,
968
+ "grad_norm": 1.0093194246292114,
969
+ "learning_rate": 3.1975473207144765e-06,
970
+ "loss": 0.8188,
971
+ "step": 33500
972
+ },
973
+ {
974
+ "epoch": 3.4189332928126426,
975
+ "grad_norm": 1.016876220703125,
976
+ "learning_rate": 3.064249533457745e-06,
977
+ "loss": 0.8119,
978
+ "step": 33750
979
+ },
980
+ {
981
+ "epoch": 3.4442587246112546,
982
+ "grad_norm": 1.0821869373321533,
983
+ "learning_rate": 2.9309517462010133e-06,
984
+ "loss": 0.8221,
985
+ "step": 34000
986
+ },
987
+ {
988
+ "epoch": 3.4695841564098666,
989
+ "grad_norm": 0.9766045808792114,
990
+ "learning_rate": 2.797653958944282e-06,
991
+ "loss": 0.8132,
992
+ "step": 34250
993
+ },
994
+ {
995
+ "epoch": 3.494909588208479,
996
+ "grad_norm": 0.8767127990722656,
997
+ "learning_rate": 2.6643561716875504e-06,
998
+ "loss": 0.8166,
999
+ "step": 34500
1000
+ },
1001
+ {
1002
+ "epoch": 3.520235020007091,
1003
+ "grad_norm": 0.9653995633125305,
1004
+ "learning_rate": 2.5310583844308183e-06,
1005
+ "loss": 0.8074,
1006
+ "step": 34750
1007
+ },
1008
+ {
1009
+ "epoch": 3.545560451805703,
1010
+ "grad_norm": 0.8945389986038208,
1011
+ "learning_rate": 2.397760597174087e-06,
1012
+ "loss": 0.8176,
1013
+ "step": 35000
1014
+ },
1015
+ {
1016
+ "epoch": 3.5708858836043156,
1017
+ "grad_norm": 0.9447450637817383,
1018
+ "learning_rate": 2.2649960010663825e-06,
1019
+ "loss": 0.8235,
1020
+ "step": 35250
1021
+ },
1022
+ {
1023
+ "epoch": 3.5962113154029276,
1024
+ "grad_norm": 1.0400015115737915,
1025
+ "learning_rate": 2.131698213809651e-06,
1026
+ "loss": 0.8136,
1027
+ "step": 35500
1028
+ },
1029
+ {
1030
+ "epoch": 3.6215367472015396,
1031
+ "grad_norm": 0.9300839900970459,
1032
+ "learning_rate": 1.9984004265529192e-06,
1033
+ "loss": 0.8197,
1034
+ "step": 35750
1035
+ },
1036
+ {
1037
+ "epoch": 3.646862179000152,
1038
+ "grad_norm": 0.9101824164390564,
1039
+ "learning_rate": 1.865102639296188e-06,
1040
+ "loss": 0.8078,
1041
+ "step": 36000
1042
+ },
1043
+ {
1044
+ "epoch": 3.672187610798764,
1045
+ "grad_norm": 0.9514500498771667,
1046
+ "learning_rate": 1.7318048520394562e-06,
1047
+ "loss": 0.8084,
1048
+ "step": 36250
1049
+ },
1050
+ {
1051
+ "epoch": 3.697513042597376,
1052
+ "grad_norm": 0.9441540241241455,
1053
+ "learning_rate": 1.5985070647827248e-06,
1054
+ "loss": 0.8213,
1055
+ "step": 36500
1056
+ },
1057
+ {
1058
+ "epoch": 3.7228384743959886,
1059
+ "grad_norm": 1.0184293985366821,
1060
+ "learning_rate": 1.4652092775259933e-06,
1061
+ "loss": 0.8094,
1062
+ "step": 36750
1063
+ },
1064
+ {
1065
+ "epoch": 3.7481639061946006,
1066
+ "grad_norm": 0.991316556930542,
1067
+ "learning_rate": 1.3319114902692617e-06,
1068
+ "loss": 0.8106,
1069
+ "step": 37000
1070
+ },
1071
+ {
1072
+ "epoch": 3.7734893379932126,
1073
+ "grad_norm": 0.9887702465057373,
1074
+ "learning_rate": 1.19861370301253e-06,
1075
+ "loss": 0.8185,
1076
+ "step": 37250
1077
+ },
1078
+ {
1079
+ "epoch": 3.798814769791825,
1080
+ "grad_norm": 0.9897658228874207,
1081
+ "learning_rate": 1.0653159157557984e-06,
1082
+ "loss": 0.8069,
1083
+ "step": 37500
1084
+ },
1085
+ {
1086
+ "epoch": 3.824140201590437,
1087
+ "grad_norm": 0.9137114882469177,
1088
+ "learning_rate": 9.32018128499067e-07,
1089
+ "loss": 0.8114,
1090
+ "step": 37750
1091
+ },
1092
+ {
1093
+ "epoch": 3.849465633389049,
1094
+ "grad_norm": 0.9579175710678101,
1095
+ "learning_rate": 7.987203412423355e-07,
1096
+ "loss": 0.819,
1097
+ "step": 38000
1098
+ },
1099
+ {
1100
+ "epoch": 3.8747910651876616,
1101
+ "grad_norm": 0.9389879107475281,
1102
+ "learning_rate": 6.654225539856039e-07,
1103
+ "loss": 0.8165,
1104
+ "step": 38250
1105
+ },
1106
+ {
1107
+ "epoch": 3.9001164969862736,
1108
+ "grad_norm": 0.9765516519546509,
1109
+ "learning_rate": 5.321247667288723e-07,
1110
+ "loss": 0.8191,
1111
+ "step": 38500
1112
+ },
1113
+ {
1114
+ "epoch": 3.9254419287848856,
1115
+ "grad_norm": 1.0299735069274902,
1116
+ "learning_rate": 3.9882697947214085e-07,
1117
+ "loss": 0.8182,
1118
+ "step": 38750
1119
+ },
1120
+ {
1121
+ "epoch": 3.950767360583498,
1122
+ "grad_norm": 0.9844255447387695,
1123
+ "learning_rate": 2.6552919221540927e-07,
1124
+ "loss": 0.8175,
1125
+ "step": 39000
1126
+ },
1127
+ {
1128
+ "epoch": 3.97609279238211,
1129
+ "grad_norm": 0.9425697922706604,
1130
+ "learning_rate": 1.3223140495867768e-07,
1131
+ "loss": 0.813,
1132
+ "step": 39250
1133
+ },
1134
+ {
1135
+ "epoch": 3.999797396545611,
1136
+ "eval_loss": 1.0085912942886353,
1137
+ "eval_runtime": 1638.7267,
1138
+ "eval_samples_per_second": 40.401,
1139
+ "eval_steps_per_second": 1.263,
1140
+ "step": 39484
1141
+ },
1142
+ {
1143
+ "epoch": 3.999797396545611,
1144
+ "step": 39484,
1145
+ "total_flos": 3.077643948911493e+18,
1146
+ "train_loss": 0.9114745237365729,
1147
+ "train_runtime": 159688.734,
1148
+ "train_samples_per_second": 15.825,
1149
+ "train_steps_per_second": 0.247
1150
+ }
1151
+ ],
1152
+ "logging_steps": 250,
1153
+ "max_steps": 39484,
1154
+ "num_input_tokens_seen": 0,
1155
+ "num_train_epochs": 4,
1156
+ "save_steps": 500,
1157
+ "total_flos": 3.077643948911493e+18,
1158
+ "train_batch_size": 8,
1159
+ "trial_name": null,
1160
+ "trial_params": null
1161
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aecc845e0935bcd44dbbe3b9f46d94173a31bb0bc3a77f8b9c2482246435ad0f
3
+ size 5240
vocab.json ADDED
The diff for this file is too large to render. See raw diff