Osamarafique998 committed

Commit f452abf
1 Parent(s): f0eb694

Upload folder using huggingface_hub
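
The commit message points to huggingface_hub's upload flow. Below is a minimal sketch of the kind of call that produces a commit like this one; the repo id and local folder path are placeholders, as neither is shown on this page:

```python
# Sketch only: repo_id and folder_path are assumptions, not taken from this commit.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./vicuna-7b-v1.3",       # assumed local folder with config + weight shards
    repo_id="Osamarafique998/my-model",   # hypothetical repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```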
config.json CHANGED
@@ -1,61 +1,23 @@
  {
- "_name_or_path": "lmsys/fastchat-t5-3b-v1.0",
+ "_name_or_path": "lmsys/vicuna-7b-v1.3",
  "architectures": [
- "T5ForConditionalGeneration"
+ "LlamaForCausalLM"
  ],
- "d_ff": 5120,
- "d_kv": 64,
- "d_model": 2048,
- "decoder_start_token_id": 0,
- "dense_act_fn": "gelu_new",
- "dropout_rate": 0.1,
- "eos_token_id": 1,
- "feed_forward_proj": "gated-gelu",
- "initializer_factor": 1.0,
- "is_encoder_decoder": true,
- "is_gated_act": true,
- "layer_norm_epsilon": 1e-06,
- "model_type": "t5",
- "n_positions": 512,
- "num_decoder_layers": 24,
- "num_heads": 32,
- "num_layers": 24,
- "output_past": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 2048,
+ "model_type": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
  "pad_token_id": 0,
- "relative_attention_max_distance": 128,
- "relative_attention_num_buckets": 32,
- "task_specific_params": {
- "summarization": {
- "early_stopping": true,
- "length_penalty": 2.0,
- "max_length": 200,
- "min_length": 30,
- "no_repeat_ngram_size": 3,
- "num_beams": 4,
- "prefix": "summarize: "
- },
- "translation_en_to_de": {
- "early_stopping": true,
- "max_length": 300,
- "num_beams": 4,
- "prefix": "translate English to German: "
- },
- "translation_en_to_fr": {
- "early_stopping": true,
- "max_length": 300,
- "num_beams": 4,
- "prefix": "translate English to French: "
- },
- "translation_en_to_ro": {
- "early_stopping": true,
- "max_length": 300,
- "num_beams": 4,
- "prefix": "translate English to Romanian: "
- }
- },
+ "rms_norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
- "use_cache": true,
- "vocab_size": 32110
+ "use_cache": false,
+ "vocab_size": 32000
  }
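
The new config describes a decoder-only LLaMA checkpoint (`LlamaForCausalLM`) in place of the previous T5 encoder-decoder, so downstream code must now load it as a causal LM. A quick, weights-free way to see which architecture a config resolves to; the repo id below is a placeholder:

```python
# Sketch: inspect the committed config without downloading the weight shards.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Osamarafique998/my-model")  # hypothetical repo id
print(config.model_type)     # "llama" after this commit, "t5" before it
print(config.architectures)  # ["LlamaForCausalLM"]
```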
generation_config.json CHANGED
@@ -1,7 +1,7 @@
  {
  "_from_model_config": true,
- "decoder_start_token_id": 0,
- "eos_token_id": 1,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
  "pad_token_id": 0,
  "transformers_version": "4.28.1"
  }
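
generation_config.json supplies the default special-token ids that `generate()` falls back on; this change swaps T5's convention (decoder_start_token_id 0, eos 1) for LLaMA's (bos 1, eos 2). A minimal sketch of how these defaults are represented in transformers:

```python
# Sketch: these are the same fields the committed generation_config.json sets.
from transformers import GenerationConfig

gen_config = GenerationConfig(bos_token_id=1, eos_token_id=2, pad_token_id=0)
print(gen_config.eos_token_id)  # 2: generation stops once this id is produced
```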
pytorch_model-00001-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2403c0246b7c95b8ff32b84bff67ee4411e331d98ebea8a13f7ea23197c91410
+ size 9877993490
pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da5be4f0bec4194afbeb22992907940f2795323555bf12758dc310ea16bdc57c
+ size 9894805046
pytorch_model-00003-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5a57b1c09c0a633885c66777e2add57cd5f3c8582c91f21e7ea5ca4fb19f939
+ size 7180993529
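
Each of the three `.bin` entries is a Git LFS pointer: the repository tracks only the payload's SHA-256 (`oid`) and byte size, while the shard itself lives in LFS storage. The sizes sum to roughly 27 GB, consistent with a ~7B-parameter float32 checkpoint. A downloaded shard can be checked against its pointer, e.g.:

```python
# Sketch: verify a downloaded shard against the sha256 recorded in its LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

expected = "2403c0246b7c95b8ff32b84bff67ee4411e331d98ebea8a13f7ea23197c91410"
assert sha256_of("pytorch_model-00001-of-00003.bin") == expected
```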
pytorch_model.bin.index.json CHANGED
@@ -1,567 +1,330 @@
  {
  "metadata": {
- "total_size": 11924824064
  },
  "weight_map": {
- "decoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.0.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.1.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.10.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.11.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.12.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.13.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.14.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.15.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.16.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.17.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.17.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.18.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.19.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.2.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.20.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.21.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.22.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.23.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "decoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.3.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.4.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.5.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.6.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.7.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.8.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.block.9.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
- "decoder.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
- "encoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.0.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.1.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.10.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.11.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.12.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.13.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.14.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.15.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.16.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.17.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.18.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.19.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.2.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.20.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.21.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.22.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.23.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.3.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.4.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.5.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.6.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.7.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.8.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.9.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
- "encoder.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
- "lm_head.weight": "pytorch_model-00002-of-00002.bin",
- "shared.weight": "pytorch_model-00001-of-00002.bin"
  }
  }

  {
  "metadata": {
+ "total_size": 26953670656
  },
  "weight_map": {
+ "lm_head.weight": "pytorch_model-00003-of-00003.bin",
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
68
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
69
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
70
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
71
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
72
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
73
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
74
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
75
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
76
+ "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
77
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
78
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
79
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
80
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
81
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
82
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
83
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
84
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
85
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
86
+ "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
87
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
88
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
89
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
90
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
91
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
92
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
93
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
94
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
95
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
96
+ "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
97
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
98
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
99
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
100
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
101
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
102
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
103
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
104
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
105
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
106
+ "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
107
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
108
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
109
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
110
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
111
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
112
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
113
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
114
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
115
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
116
+ "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
117
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
118
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
119
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
120
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
121
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
122
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
123
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
124
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
125
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
126
+ "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
127
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
128
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
129
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
130
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
131
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
132
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
133
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
134
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
135
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
136
+ "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
137
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
138
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
139
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
140
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
141
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
142
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
143
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
144
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
145
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
146
+ "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
147
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
148
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
149
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
150
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
151
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
152
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
153
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
154
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
155
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
156
+ "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
157
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
158
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
159
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
160
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
161
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
162
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
163
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
164
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
165
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
166
+ "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
167
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
168
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
169
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
170
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
171
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
172
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
173
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
174
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
175
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
176
+ "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
177
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
178
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
179
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
180
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
181
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
182
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
183
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
184
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
185
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
186
+ "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
187
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
188
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
189
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
190
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
191
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
192
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
193
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
194
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
195
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
196
+ "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
197
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
198
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
199
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
200
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
201
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
202
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
203
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
204
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
205
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
206
+ "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
207
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
208
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
209
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
210
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
211
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
212
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
213
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
214
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
215
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
216
+ "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
217
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
218
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
219
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
220
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
221
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
222
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
223
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
224
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
225
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
226
+ "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
227
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
228
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
229
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
230
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
231
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
232
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
233
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
234
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
235
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
236
+ "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
237
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
238
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
239
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
240
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
241
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
242
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
243
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
244
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
245
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
246
+ "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
247
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
248
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
249
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
250
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
251
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
252
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
253
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
254
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
255
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
256
+ "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
257
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
258
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
259
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
260
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
261
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
262
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
263
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
264
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
265
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
266
+ "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
267
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
268
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
269
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
270
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
271
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
272
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
273
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
274
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
275
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
276
+ "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
277
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
278
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
279
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
280
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
281
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
282
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
283
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
284
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
285
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
286
+ "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
287
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
288
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
289
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
290
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
291
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
292
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
293
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
294
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
295
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
296
+ "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
297
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
298
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
299
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
300
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
301
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
302
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
303
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
304
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
305
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
306
+ "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
307
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
308
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
309
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
310
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
311
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
312
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
313
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
314
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
315
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
316
+ "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
317
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
318
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
319
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
320
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
321
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
322
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
323
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
324
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
325
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
326
+ "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
327
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
328
+ "model.norm.weight": "pytorch_model-00003-of-00003.bin"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
  }
330
  }
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87477d0f8edf067f50c88b4946719703de3bfabd31d09d4b7ddf0a17e7353fe8
3
  size 14583
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e75c96f06b249e57a701db73ce821398e69672027a86d3a44063830602a29ab4
3
  size 14583
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eab3593d5b9b66e3025a62f69b1c19ca9adf61b7912a7c90064de15176284593
3
  size 14583
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dae7f45b6bac644ac207a61f43cba6d4b919a4cac22022bbb02907914422f5d
3
  size 14583
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e902f55f5eff3be1ef7fe2935b104fa959e9cf9c432f0c7fd5f80e7d77de61c5
3
  size 14583
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e53c770fe48635faad7fa341007d771781f1397cd47daab5b58f879ffb65f178
3
  size 14583
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e4b14613ded4830170a976a4e690d15e8aacb2ee65182d9240d54cde3938296
3
  size 14583
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68692af1001e65d02e07ac9974ccf4c332cfb23bc8f89566e1a908b1f2c4a1ed
3
  size 14583
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c050ca74b6f9f8f4a6b3c047a2a39cfd44f9f45a1bb0e99e65e55adad696b85
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8078ef448b63bb065c755468450481cc618ba517b813fe5d1dc3d2326638dd4f
3
  size 627
special_tokens_map.json CHANGED
@@ -1,107 +1,24 @@
1
  {
2
- "additional_special_tokens": [
3
- "<extra_id_0>",
4
- "<extra_id_1>",
5
- "<extra_id_2>",
6
- "<extra_id_3>",
7
- "<extra_id_4>",
8
- "<extra_id_5>",
9
- "<extra_id_6>",
10
- "<extra_id_7>",
11
- "<extra_id_8>",
12
- "<extra_id_9>",
13
- "<extra_id_10>",
14
- "<extra_id_11>",
15
- "<extra_id_12>",
16
- "<extra_id_13>",
17
- "<extra_id_14>",
18
- "<extra_id_15>",
19
- "<extra_id_16>",
20
- "<extra_id_17>",
21
- "<extra_id_18>",
22
- "<extra_id_19>",
23
- "<extra_id_20>",
24
- "<extra_id_21>",
25
- "<extra_id_22>",
26
- "<extra_id_23>",
27
- "<extra_id_24>",
28
- "<extra_id_25>",
29
- "<extra_id_26>",
30
- "<extra_id_27>",
31
- "<extra_id_28>",
32
- "<extra_id_29>",
33
- "<extra_id_30>",
34
- "<extra_id_31>",
35
- "<extra_id_32>",
36
- "<extra_id_33>",
37
- "<extra_id_34>",
38
- "<extra_id_35>",
39
- "<extra_id_36>",
40
- "<extra_id_37>",
41
- "<extra_id_38>",
42
- "<extra_id_39>",
43
- "<extra_id_40>",
44
- "<extra_id_41>",
45
- "<extra_id_42>",
46
- "<extra_id_43>",
47
- "<extra_id_44>",
48
- "<extra_id_45>",
49
- "<extra_id_46>",
50
- "<extra_id_47>",
51
- "<extra_id_48>",
52
- "<extra_id_49>",
53
- "<extra_id_50>",
54
- "<extra_id_51>",
55
- "<extra_id_52>",
56
- "<extra_id_53>",
57
- "<extra_id_54>",
58
- "<extra_id_55>",
59
- "<extra_id_56>",
60
- "<extra_id_57>",
61
- "<extra_id_58>",
62
- "<extra_id_59>",
63
- "<extra_id_60>",
64
- "<extra_id_61>",
65
- "<extra_id_62>",
66
- "<extra_id_63>",
67
- "<extra_id_64>",
68
- "<extra_id_65>",
69
- "<extra_id_66>",
70
- "<extra_id_67>",
71
- "<extra_id_68>",
72
- "<extra_id_69>",
73
- "<extra_id_70>",
74
- "<extra_id_71>",
75
- "<extra_id_72>",
76
- "<extra_id_73>",
77
- "<extra_id_74>",
78
- "<extra_id_75>",
79
- "<extra_id_76>",
80
- "<extra_id_77>",
81
- "<extra_id_78>",
82
- "<extra_id_79>",
83
- "<extra_id_80>",
84
- "<extra_id_81>",
85
- "<extra_id_82>",
86
- "<extra_id_83>",
87
- "<extra_id_84>",
88
- "<extra_id_85>",
89
- "<extra_id_86>",
90
- "<extra_id_87>",
91
- "<extra_id_88>",
92
- "<extra_id_89>",
93
- "<extra_id_90>",
94
- "<extra_id_91>",
95
- "<extra_id_92>",
96
- "<extra_id_93>",
97
- "<extra_id_94>",
98
- "<extra_id_95>",
99
- "<extra_id_96>",
100
- "<extra_id_97>",
101
- "<extra_id_98>",
102
- "<extra_id_99>"
103
- ],
104
- "eos_token": "</s>",
105
- "pad_token": "[PAD]",
106
- "unk_token": "<unk>"
107
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json CHANGED
@@ -1,114 +1,34 @@
1
  {
2
- "additional_special_tokens": [
3
- "<extra_id_0>",
4
- "<extra_id_1>",
5
- "<extra_id_2>",
6
- "<extra_id_3>",
7
- "<extra_id_4>",
8
- "<extra_id_5>",
9
- "<extra_id_6>",
10
- "<extra_id_7>",
11
- "<extra_id_8>",
12
- "<extra_id_9>",
13
- "<extra_id_10>",
14
- "<extra_id_11>",
15
- "<extra_id_12>",
16
- "<extra_id_13>",
17
- "<extra_id_14>",
18
- "<extra_id_15>",
19
- "<extra_id_16>",
20
- "<extra_id_17>",
21
- "<extra_id_18>",
22
- "<extra_id_19>",
23
- "<extra_id_20>",
24
- "<extra_id_21>",
25
- "<extra_id_22>",
26
- "<extra_id_23>",
27
- "<extra_id_24>",
28
- "<extra_id_25>",
29
- "<extra_id_26>",
30
- "<extra_id_27>",
31
- "<extra_id_28>",
32
- "<extra_id_29>",
33
- "<extra_id_30>",
34
- "<extra_id_31>",
35
- "<extra_id_32>",
36
- "<extra_id_33>",
37
- "<extra_id_34>",
38
- "<extra_id_35>",
39
- "<extra_id_36>",
40
- "<extra_id_37>",
41
- "<extra_id_38>",
42
- "<extra_id_39>",
43
- "<extra_id_40>",
44
- "<extra_id_41>",
45
- "<extra_id_42>",
46
- "<extra_id_43>",
47
- "<extra_id_44>",
48
- "<extra_id_45>",
49
- "<extra_id_46>",
50
- "<extra_id_47>",
51
- "<extra_id_48>",
52
- "<extra_id_49>",
53
- "<extra_id_50>",
54
- "<extra_id_51>",
55
- "<extra_id_52>",
56
- "<extra_id_53>",
57
- "<extra_id_54>",
58
- "<extra_id_55>",
59
- "<extra_id_56>",
60
- "<extra_id_57>",
61
- "<extra_id_58>",
62
- "<extra_id_59>",
63
- "<extra_id_60>",
64
- "<extra_id_61>",
65
- "<extra_id_62>",
66
- "<extra_id_63>",
67
- "<extra_id_64>",
68
- "<extra_id_65>",
69
- "<extra_id_66>",
70
- "<extra_id_67>",
71
- "<extra_id_68>",
72
- "<extra_id_69>",
73
- "<extra_id_70>",
74
- "<extra_id_71>",
75
- "<extra_id_72>",
76
- "<extra_id_73>",
77
- "<extra_id_74>",
78
- "<extra_id_75>",
79
- "<extra_id_76>",
80
- "<extra_id_77>",
81
- "<extra_id_78>",
82
- "<extra_id_79>",
83
- "<extra_id_80>",
84
- "<extra_id_81>",
85
- "<extra_id_82>",
86
- "<extra_id_83>",
87
- "<extra_id_84>",
88
- "<extra_id_85>",
89
- "<extra_id_86>",
90
- "<extra_id_87>",
91
- "<extra_id_88>",
92
- "<extra_id_89>",
93
- "<extra_id_90>",
94
- "<extra_id_91>",
95
- "<extra_id_92>",
96
- "<extra_id_93>",
97
- "<extra_id_94>",
98
- "<extra_id_95>",
99
- "<extra_id_96>",
100
- "<extra_id_97>",
101
- "<extra_id_98>",
102
- "<extra_id_99>"
103
- ],
104
- "clean_up_tokenization_spaces": true,
105
- "eos_token": "</s>",
106
- "extra_ids": 100,
107
  "model_max_length": 2048,
108
- "pad_token": "<pad>",
109
  "padding_side": "right",
110
  "sp_model_kwargs": {},
111
- "tokenizer_class": "T5Tokenizer",
112
- "unk_token": "<unk>",
113
- "use_fast": false
 
 
 
 
 
 
114
  }
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "model_max_length": 2048,
22
+ "pad_token": null,
23
  "padding_side": "right",
24
  "sp_model_kwargs": {},
25
+ "tokenizer_class": "LlamaTokenizer",
26
+ "unk_token": {
27
+ "__type": "AddedToken",
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
  }
trainer_state.json CHANGED
@@ -1,1216 +1,916 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.05239030779305828,
5
- "global_step": 200,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
- "learning_rate": 6.9856793573174994e-09,
13
- "loss": 1.5302,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
- "learning_rate": 1.3971358714634999e-08,
19
- "loss": 1.3991,
20
  "step": 2
21
  },
22
  {
23
- "epoch": 0.0,
24
- "learning_rate": 2.09570380719525e-08,
25
- "loss": 1.5873,
26
  "step": 3
27
  },
28
  {
29
- "epoch": 0.0,
30
- "learning_rate": 2.7942717429269998e-08,
31
- "loss": 1.4977,
32
  "step": 4
33
  },
34
  {
35
- "epoch": 0.0,
36
- "learning_rate": 3.49283967865875e-08,
37
- "loss": 1.4561,
38
  "step": 5
39
  },
40
  {
41
- "epoch": 0.0,
42
- "learning_rate": 4.1914076143905e-08,
43
- "loss": 1.4481,
44
  "step": 6
45
  },
46
  {
47
- "epoch": 0.0,
48
- "learning_rate": 4.8899755501222494e-08,
49
- "loss": 1.5704,
50
  "step": 7
51
  },
52
  {
53
- "epoch": 0.0,
54
- "learning_rate": 5.5885434858539996e-08,
55
- "loss": 1.396,
56
  "step": 8
57
  },
58
  {
59
- "epoch": 0.0,
60
- "learning_rate": 6.28711142158575e-08,
61
- "loss": 1.4132,
62
  "step": 9
63
  },
64
  {
65
- "epoch": 0.0,
66
- "learning_rate": 6.9856793573175e-08,
67
- "loss": 1.3079,
68
  "step": 10
69
  },
70
  {
71
- "epoch": 0.0,
72
- "learning_rate": 7.684247293049249e-08,
73
- "loss": 1.381,
74
  "step": 11
75
  },
76
  {
77
- "epoch": 0.0,
78
- "learning_rate": 8.382815228781e-08,
79
- "loss": 1.5932,
80
  "step": 12
81
  },
82
  {
83
- "epoch": 0.0,
84
- "learning_rate": 9.08138316451275e-08,
85
- "loss": 1.4781,
86
  "step": 13
87
  },
88
  {
89
- "epoch": 0.0,
90
- "learning_rate": 9.779951100244499e-08,
91
- "loss": 1.4499,
92
  "step": 14
93
  },
94
  {
95
- "epoch": 0.0,
96
- "learning_rate": 1.047851903597625e-07,
97
- "loss": 1.3174,
98
  "step": 15
99
  },
100
  {
101
- "epoch": 0.0,
102
- "learning_rate": 1.1177086971707999e-07,
103
- "loss": 1.3322,
104
  "step": 16
105
  },
106
  {
107
- "epoch": 0.0,
108
- "learning_rate": 1.1875654907439749e-07,
109
- "loss": 1.3133,
110
  "step": 17
111
  },
112
  {
113
- "epoch": 0.0,
114
- "learning_rate": 1.25742228431715e-07,
115
- "loss": 1.4163,
116
  "step": 18
117
  },
118
  {
119
- "epoch": 0.0,
120
- "learning_rate": 1.327279077890325e-07,
121
- "loss": 1.3745,
122
  "step": 19
123
  },
124
  {
125
- "epoch": 0.01,
126
- "learning_rate": 1.3971358714635e-07,
127
- "loss": 1.5562,
128
  "step": 20
129
  },
130
  {
131
- "epoch": 0.01,
132
- "learning_rate": 1.466992665036675e-07,
133
- "loss": 1.5255,
134
  "step": 21
135
  },
136
  {
137
- "epoch": 0.01,
138
- "learning_rate": 1.5368494586098498e-07,
139
- "loss": 1.6313,
140
  "step": 22
141
  },
142
  {
143
- "epoch": 0.01,
144
- "learning_rate": 1.606706252183025e-07,
145
- "loss": 1.3289,
146
  "step": 23
147
  },
148
  {
149
- "epoch": 0.01,
150
- "learning_rate": 1.6765630457562e-07,
151
- "loss": 1.3399,
152
  "step": 24
153
  },
154
  {
155
- "epoch": 0.01,
156
- "learning_rate": 1.746419839329375e-07,
157
- "loss": 1.3479,
158
  "step": 25
159
  },
160
  {
161
- "epoch": 0.01,
162
- "learning_rate": 1.81627663290255e-07,
163
- "loss": 1.3508,
164
  "step": 26
165
  },
166
  {
167
- "epoch": 0.01,
168
- "learning_rate": 1.8861334264757248e-07,
169
- "loss": 1.5083,
170
  "step": 27
171
  },
172
  {
173
- "epoch": 0.01,
174
- "learning_rate": 1.9559902200488998e-07,
175
- "loss": 1.3118,
176
  "step": 28
177
  },
178
  {
179
- "epoch": 0.01,
180
- "learning_rate": 2.025847013622075e-07,
181
- "loss": 1.3354,
182
  "step": 29
183
  },
184
  {
185
- "epoch": 0.01,
186
- "learning_rate": 2.09570380719525e-07,
187
- "loss": 1.4193,
188
  "step": 30
189
  },
190
  {
191
- "epoch": 0.01,
192
- "learning_rate": 2.165560600768425e-07,
193
- "loss": 1.3794,
194
  "step": 31
195
  },
196
  {
197
- "epoch": 0.01,
198
- "learning_rate": 2.2354173943415998e-07,
199
- "loss": 1.4642,
200
  "step": 32
201
  },
202
  {
203
- "epoch": 0.01,
204
- "learning_rate": 2.3052741879147748e-07,
205
- "loss": 1.5252,
206
  "step": 33
207
  },
208
  {
209
- "epoch": 0.01,
210
- "learning_rate": 2.3751309814879497e-07,
211
- "loss": 1.2811,
212
  "step": 34
213
  },
214
  {
215
- "epoch": 0.01,
216
- "learning_rate": 2.444987775061125e-07,
217
- "loss": 1.5713,
218
  "step": 35
219
  },
220
  {
221
- "epoch": 0.01,
222
- "learning_rate": 2.5148445686343e-07,
223
- "loss": 1.3266,
224
  "step": 36
225
  },
226
  {
227
- "epoch": 0.01,
228
- "learning_rate": 2.584701362207475e-07,
229
- "loss": 1.2845,
230
  "step": 37
231
  },
232
  {
233
- "epoch": 0.01,
234
- "learning_rate": 2.65455815578065e-07,
235
- "loss": 1.3171,
236
  "step": 38
237
  },
238
  {
239
- "epoch": 0.01,
240
- "learning_rate": 2.724414949353825e-07,
241
- "loss": 1.569,
242
  "step": 39
243
  },
244
  {
245
- "epoch": 0.01,
246
- "learning_rate": 2.794271742927e-07,
247
- "loss": 1.3662,
248
  "step": 40
249
  },
250
  {
251
- "epoch": 0.01,
252
- "learning_rate": 2.864128536500175e-07,
253
- "loss": 1.3673,
254
  "step": 41
255
  },
256
  {
257
- "epoch": 0.01,
258
- "learning_rate": 2.93398533007335e-07,
259
- "loss": 1.5369,
260
  "step": 42
261
  },
262
  {
263
- "epoch": 0.01,
264
- "learning_rate": 3.003842123646525e-07,
265
- "loss": 1.2998,
266
  "step": 43
267
  },
268
  {
269
- "epoch": 0.01,
270
- "learning_rate": 3.0736989172196997e-07,
271
- "loss": 1.3358,
272
  "step": 44
273
  },
274
  {
275
- "epoch": 0.01,
276
- "learning_rate": 3.1435557107928746e-07,
277
- "loss": 1.2411,
278
  "step": 45
279
  },
280
  {
281
- "epoch": 0.01,
282
- "learning_rate": 3.21341250436605e-07,
283
- "loss": 1.3119,
284
  "step": 46
285
  },
286
  {
287
- "epoch": 0.01,
288
- "learning_rate": 3.2832692979392245e-07,
289
- "loss": 1.4592,
290
  "step": 47
291
  },
292
  {
293
- "epoch": 0.01,
294
- "learning_rate": 3.3531260915124e-07,
295
- "loss": 1.3613,
296
  "step": 48
297
  },
298
  {
299
- "epoch": 0.01,
300
- "learning_rate": 3.4229828850855744e-07,
301
- "loss": 1.3615,
302
  "step": 49
303
  },
304
  {
305
- "epoch": 0.01,
306
- "learning_rate": 3.49283967865875e-07,
307
- "loss": 1.3016,
308
  "step": 50
309
  },
310
  {
311
- "epoch": 0.01,
312
- "learning_rate": 3.562696472231925e-07,
313
- "loss": 1.4594,
314
  "step": 51
315
  },
316
  {
317
- "epoch": 0.01,
318
- "learning_rate": 3.6325532658051e-07,
319
- "loss": 1.4994,
320
  "step": 52
321
  },
322
  {
323
- "epoch": 0.01,
324
- "learning_rate": 3.7024100593782747e-07,
325
- "loss": 1.4328,
326
  "step": 53
327
  },
328
  {
329
- "epoch": 0.01,
330
- "learning_rate": 3.7722668529514497e-07,
331
- "loss": 1.4001,
332
  "step": 54
333
  },
334
  {
335
- "epoch": 0.01,
336
- "learning_rate": 3.8421236465246246e-07,
337
- "loss": 1.3456,
338
  "step": 55
339
  },
340
  {
341
- "epoch": 0.01,
342
- "learning_rate": 3.9119804400977996e-07,
343
- "loss": 1.3156,
344
  "step": 56
345
  },
346
  {
347
- "epoch": 0.01,
348
- "learning_rate": 3.981837233670975e-07,
349
- "loss": 1.4528,
350
  "step": 57
351
  },
352
  {
353
- "epoch": 0.02,
354
- "learning_rate": 4.05169402724415e-07,
355
- "loss": 1.2765,
356
  "step": 58
357
  },
358
  {
359
- "epoch": 0.02,
360
- "learning_rate": 4.121550820817325e-07,
361
- "loss": 1.3098,
362
  "step": 59
363
  },
364
  {
365
- "epoch": 0.02,
366
- "learning_rate": 4.1914076143905e-07,
367
- "loss": 1.5001,
368
  "step": 60
369
  },
370
  {
371
- "epoch": 0.02,
372
- "learning_rate": 4.261264407963675e-07,
373
- "loss": 1.5602,
374
  "step": 61
375
  },
376
  {
377
- "epoch": 0.02,
378
- "learning_rate": 4.33112120153685e-07,
379
- "loss": 1.3146,
380
  "step": 62
381
  },
382
  {
383
- "epoch": 0.02,
384
- "learning_rate": 4.4009779951100247e-07,
385
- "loss": 1.1927,
386
  "step": 63
387
  },
388
  {
389
- "epoch": 0.02,
390
- "learning_rate": 4.4708347886831996e-07,
391
- "loss": 1.3499,
392
  "step": 64
393
  },
394
  {
395
- "epoch": 0.02,
396
- "learning_rate": 4.540691582256375e-07,
397
- "loss": 1.4147,
398
  "step": 65
399
  },
400
  {
401
- "epoch": 0.02,
402
- "learning_rate": 4.6105483758295495e-07,
403
- "loss": 1.3828,
404
  "step": 66
405
  },
406
  {
407
- "epoch": 0.02,
408
- "learning_rate": 4.680405169402725e-07,
409
- "loss": 1.4336,
410
  "step": 67
411
  },
412
  {
413
- "epoch": 0.02,
414
- "learning_rate": 4.7502619629758994e-07,
415
- "loss": 1.3039,
416
  "step": 68
417
  },
418
  {
419
- "epoch": 0.02,
420
- "learning_rate": 4.820118756549074e-07,
421
- "loss": 1.2953,
422
  "step": 69
423
  },
424
  {
425
- "epoch": 0.02,
426
- "learning_rate": 4.88997555012225e-07,
427
- "loss": 1.2902,
428
  "step": 70
429
  },
430
  {
431
- "epoch": 0.02,
432
- "learning_rate": 4.959832343695424e-07,
433
- "loss": 1.3334,
434
  "step": 71
435
  },
436
  {
437
- "epoch": 0.02,
438
- "learning_rate": 5.0296891372686e-07,
439
- "loss": 1.5598,
440
  "step": 72
441
  },
442
  {
443
- "epoch": 0.02,
444
- "learning_rate": 5.099545930841775e-07,
445
- "loss": 1.3344,
446
  "step": 73
447
  },
448
  {
449
- "epoch": 0.02,
450
- "learning_rate": 5.16940272441495e-07,
451
- "loss": 1.2667,
452
  "step": 74
453
  },
454
  {
455
- "epoch": 0.02,
456
- "learning_rate": 5.239259517988125e-07,
457
- "loss": 1.2021,
458
  "step": 75
459
  },
460
  {
461
- "epoch": 0.02,
462
- "learning_rate": 5.3091163115613e-07,
463
- "loss": 1.4128,
464
  "step": 76
465
  },
466
  {
467
- "epoch": 0.02,
468
- "learning_rate": 5.378973105134475e-07,
469
- "loss": 1.5772,
470
  "step": 77
471
  },
472
  {
473
- "epoch": 0.02,
474
- "learning_rate": 5.44882989870765e-07,
475
- "loss": 1.3694,
476
  "step": 78
477
  },
478
  {
479
- "epoch": 0.02,
480
- "learning_rate": 5.518686692280825e-07,
481
- "loss": 1.4965,
482
  "step": 79
483
  },
484
  {
485
- "epoch": 0.02,
486
- "learning_rate": 5.588543485854e-07,
487
- "loss": 1.277,
488
  "step": 80
489
  },
490
  {
491
- "epoch": 0.02,
492
- "learning_rate": 5.658400279427175e-07,
493
- "loss": 1.3585,
494
  "step": 81
495
  },
496
  {
497
- "epoch": 0.02,
498
- "learning_rate": 5.72825707300035e-07,
499
- "loss": 1.3944,
500
  "step": 82
501
  },
502
  {
503
- "epoch": 0.02,
504
- "learning_rate": 5.798113866573525e-07,
505
- "loss": 1.4473,
506
  "step": 83
507
  },
508
  {
509
- "epoch": 0.02,
510
- "learning_rate": 5.8679706601467e-07,
511
- "loss": 1.1716,
512
  "step": 84
513
  },
514
  {
515
- "epoch": 0.02,
516
- "learning_rate": 5.937827453719875e-07,
517
- "loss": 1.3948,
518
  "step": 85
519
  },
520
  {
521
- "epoch": 0.02,
522
- "learning_rate": 6.00768424729305e-07,
523
- "loss": 1.3183,
524
  "step": 86
525
  },
526
  {
527
- "epoch": 0.02,
528
- "learning_rate": 6.077541040866225e-07,
529
- "loss": 1.4303,
530
  "step": 87
531
  },
532
  {
533
- "epoch": 0.02,
534
- "learning_rate": 6.147397834439399e-07,
535
- "loss": 1.2323,
536
  "step": 88
537
  },
538
  {
539
- "epoch": 0.02,
540
- "learning_rate": 6.217254628012575e-07,
541
- "loss": 1.4077,
542
  "step": 89
543
  },
544
  {
545
- "epoch": 0.02,
546
- "learning_rate": 6.287111421585749e-07,
547
- "loss": 1.3453,
548
  "step": 90
549
  },
550
  {
551
- "epoch": 0.02,
552
- "learning_rate": 6.356968215158925e-07,
553
- "loss": 1.4308,
554
  "step": 91
555
  },
556
  {
557
- "epoch": 0.02,
558
- "learning_rate": 6.4268250087321e-07,
559
- "loss": 1.3942,
560
  "step": 92
561
  },
562
  {
563
- "epoch": 0.02,
564
- "learning_rate": 6.496681802305274e-07,
565
- "loss": 1.7588,
566
  "step": 93
567
  },
568
  {
569
- "epoch": 0.02,
570
- "learning_rate": 6.566538595878449e-07,
571
- "loss": 1.3247,
572
  "step": 94
573
  },
574
  {
575
- "epoch": 0.02,
576
- "learning_rate": 6.636395389451625e-07,
577
- "loss": 1.2695,
578
  "step": 95
579
  },
580
  {
581
- "epoch": 0.03,
582
- "learning_rate": 6.7062521830248e-07,
583
- "loss": 1.2933,
584
  "step": 96
585
  },
586
  {
587
- "epoch": 0.03,
588
- "learning_rate": 6.776108976597975e-07,
589
- "loss": 1.3273,
590
  "step": 97
591
  },
592
  {
593
- "epoch": 0.03,
594
- "learning_rate": 6.845965770171149e-07,
595
- "loss": 1.3102,
596
  "step": 98
597
  },
598
  {
599
- "epoch": 0.03,
600
- "learning_rate": 6.915822563744325e-07,
601
- "loss": 1.3494,
602
  "step": 99
603
  },
604
  {
605
- "epoch": 0.03,
606
- "learning_rate": 6.9856793573175e-07,
607
- "loss": 1.3154,
608
  "step": 100
609
  },
610
  {
611
- "epoch": 0.03,
612
- "learning_rate": 7.055536150890676e-07,
613
- "loss": 1.2514,
614
  "step": 101
615
  },
616
  {
617
- "epoch": 0.03,
618
- "learning_rate": 7.12539294446385e-07,
619
- "loss": 1.2672,
620
  "step": 102
621
  },
622
  {
623
- "epoch": 0.03,
624
- "learning_rate": 7.195249738037025e-07,
625
- "loss": 1.7321,
626
  "step": 103
627
  },
628
  {
629
- "epoch": 0.03,
630
- "learning_rate": 7.2651065316102e-07,
631
- "loss": 1.2989,
632
  "step": 104
633
  },
634
  {
635
- "epoch": 0.03,
636
- "learning_rate": 7.334963325183376e-07,
637
- "loss": 1.5735,
638
  "step": 105
639
  },
640
  {
641
- "epoch": 0.03,
642
- "learning_rate": 7.404820118756549e-07,
643
- "loss": 1.2504,
644
  "step": 106
645
  },
646
  {
647
- "epoch": 0.03,
648
- "learning_rate": 7.474676912329724e-07,
649
- "loss": 1.2209,
650
  "step": 107
651
  },
652
  {
653
- "epoch": 0.03,
654
- "learning_rate": 7.544533705902899e-07,
655
- "loss": 1.31,
656
  "step": 108
657
  },
658
  {
659
- "epoch": 0.03,
660
- "learning_rate": 7.614390499476075e-07,
661
- "loss": 1.3797,
662
  "step": 109
663
  },
664
  {
665
- "epoch": 0.03,
666
- "learning_rate": 7.684247293049249e-07,
667
- "loss": 1.3813,
668
  "step": 110
669
  },
670
  {
671
- "epoch": 0.03,
672
- "learning_rate": 7.754104086622424e-07,
673
- "loss": 1.3116,
674
  "step": 111
675
  },
676
  {
677
- "epoch": 0.03,
678
- "learning_rate": 7.823960880195599e-07,
679
- "loss": 1.2662,
680
  "step": 112
681
  },
682
  {
683
- "epoch": 0.03,
684
- "learning_rate": 7.893817673768775e-07,
685
- "loss": 1.2918,
686
  "step": 113
687
  },
688
  {
689
- "epoch": 0.03,
690
- "learning_rate": 7.96367446734195e-07,
691
- "loss": 1.288,
692
  "step": 114
693
  },
694
  {
695
- "epoch": 0.03,
696
- "learning_rate": 8.033531260915124e-07,
697
- "loss": 1.2962,
698
  "step": 115
699
  },
700
  {
701
- "epoch": 0.03,
702
- "learning_rate": 8.1033880544883e-07,
703
- "loss": 1.4929,
704
  "step": 116
705
  },
706
  {
707
- "epoch": 0.03,
708
- "learning_rate": 8.173244848061475e-07,
709
- "loss": 1.4425,
710
  "step": 117
711
  },
712
  {
713
- "epoch": 0.03,
714
- "learning_rate": 8.24310164163465e-07,
715
- "loss": 1.4805,
716
  "step": 118
717
  },
718
  {
719
- "epoch": 0.03,
720
- "learning_rate": 8.312958435207824e-07,
721
- "loss": 1.6595,
722
  "step": 119
723
  },
724
  {
725
- "epoch": 0.03,
726
- "learning_rate": 8.382815228781e-07,
727
- "loss": 1.3928,
728
  "step": 120
729
  },
730
  {
731
- "epoch": 0.03,
732
- "learning_rate": 8.452672022354175e-07,
733
- "loss": 1.3824,
734
  "step": 121
735
  },
736
  {
737
- "epoch": 0.03,
738
- "learning_rate": 8.52252881592735e-07,
739
- "loss": 1.4096,
740
  "step": 122
741
  },
742
  {
743
- "epoch": 0.03,
744
- "learning_rate": 8.592385609500524e-07,
745
- "loss": 1.4056,
746
  "step": 123
747
  },
748
  {
749
- "epoch": 0.03,
750
- "learning_rate": 8.6622424030737e-07,
751
- "loss": 1.4939,
752
  "step": 124
753
  },
754
  {
755
- "epoch": 0.03,
756
- "learning_rate": 8.732099196646874e-07,
757
- "loss": 1.4946,
758
  "step": 125
759
  },
760
  {
761
- "epoch": 0.03,
762
- "learning_rate": 8.801955990220049e-07,
763
- "loss": 1.2152,
764
  "step": 126
765
  },
766
  {
767
- "epoch": 0.03,
768
- "learning_rate": 8.871812783793225e-07,
769
- "loss": 1.3495,
770
  "step": 127
771
  },
772
  {
773
- "epoch": 0.03,
774
- "learning_rate": 8.941669577366399e-07,
775
- "loss": 1.3502,
776
  "step": 128
777
  },
778
  {
779
- "epoch": 0.03,
780
- "learning_rate": 9.011526370939574e-07,
781
- "loss": 1.4499,
782
  "step": 129
783
  },
784
  {
785
- "epoch": 0.03,
786
- "learning_rate": 9.08138316451275e-07,
787
- "loss": 1.404,
788
  "step": 130
789
  },
790
  {
791
- "epoch": 0.03,
792
- "learning_rate": 9.151239958085925e-07,
793
- "loss": 1.2707,
794
  "step": 131
795
  },
796
  {
797
- "epoch": 0.03,
798
- "learning_rate": 9.221096751659099e-07,
799
- "loss": 1.3779,
800
  "step": 132
801
  },
802
  {
803
- "epoch": 0.03,
804
- "learning_rate": 9.290953545232274e-07,
805
- "loss": 1.4176,
806
  "step": 133
807
  },
808
  {
809
- "epoch": 0.04,
810
- "learning_rate": 9.36081033880545e-07,
811
- "loss": 1.3269,
812
  "step": 134
813
  },
814
  {
815
- "epoch": 0.04,
816
- "learning_rate": 9.430667132378625e-07,
817
- "loss": 1.4951,
818
  "step": 135
819
  },
820
  {
821
- "epoch": 0.04,
822
- "learning_rate": 9.500523925951799e-07,
823
- "loss": 1.3809,
824
  "step": 136
825
  },
826
  {
827
- "epoch": 0.04,
828
- "learning_rate": 9.570380719524975e-07,
829
- "loss": 1.4824,
830
  "step": 137
831
  },
832
  {
833
- "epoch": 0.04,
834
- "learning_rate": 9.640237513098149e-07,
835
- "loss": 1.4929,
836
  "step": 138
837
  },
838
  {
839
- "epoch": 0.04,
840
- "learning_rate": 9.710094306671325e-07,
841
- "loss": 1.3353,
842
  "step": 139
843
  },
844
  {
845
- "epoch": 0.04,
846
- "learning_rate": 9.7799511002445e-07,
847
- "loss": 1.6328,
848
  "step": 140
849
  },
850
  {
851
- "epoch": 0.04,
852
- "learning_rate": 9.849807893817675e-07,
853
- "loss": 1.4769,
854
  "step": 141
855
  },
856
  {
857
- "epoch": 0.04,
858
- "learning_rate": 9.919664687390849e-07,
859
- "loss": 1.5474,
860
  "step": 142
861
  },
862
  {
863
- "epoch": 0.04,
864
- "learning_rate": 9.989521480964025e-07,
865
- "loss": 1.36,
866
  "step": 143
867
  },
868
  {
869
- "epoch": 0.04,
870
- "learning_rate": 1.00593782745372e-06,
871
- "loss": 1.6957,
872
  "step": 144
873
  },
874
  {
875
- "epoch": 0.04,
876
- "learning_rate": 1.0129235068110374e-06,
877
- "loss": 1.1635,
878
  "step": 145
879
  },
880
  {
881
- "epoch": 0.04,
882
- "learning_rate": 1.019909186168355e-06,
883
- "loss": 1.4348,
884
  "step": 146
885
  },
886
  {
887
- "epoch": 0.04,
888
- "learning_rate": 1.0268948655256724e-06,
889
- "loss": 1.4559,
890
  "step": 147
891
  },
892
  {
893
- "epoch": 0.04,
894
- "learning_rate": 1.03388054488299e-06,
895
- "loss": 1.3772,
896
  "step": 148
897
  },
898
  {
899
- "epoch": 0.04,
900
- "learning_rate": 1.0408662242403074e-06,
901
- "loss": 1.3404,
902
  "step": 149
903
  },
904
  {
905
- "epoch": 0.04,
906
- "learning_rate": 1.047851903597625e-06,
907
- "loss": 1.3518,
908
  "step": 150
909
- },
910
- {
911
- "epoch": 0.04,
912
- "learning_rate": 1.0548375829549424e-06,
913
- "loss": 1.2911,
914
- "step": 151
915
- },
916
- {
917
- "epoch": 0.04,
918
- "learning_rate": 1.06182326231226e-06,
919
- "loss": 1.3891,
920
- "step": 152
921
- },
922
- {
923
- "epoch": 0.04,
924
- "learning_rate": 1.0688089416695776e-06,
925
- "loss": 1.3481,
926
- "step": 153
927
- },
928
- {
929
- "epoch": 0.04,
930
- "learning_rate": 1.075794621026895e-06,
931
- "loss": 1.2061,
932
- "step": 154
933
- },
934
- {
935
- "epoch": 0.04,
936
- "learning_rate": 1.0827803003842124e-06,
937
- "loss": 1.3228,
938
- "step": 155
939
- },
940
- {
941
- "epoch": 0.04,
942
- "learning_rate": 1.08976597974153e-06,
943
- "loss": 1.4932,
944
- "step": 156
945
- },
946
- {
947
- "epoch": 0.04,
948
- "learning_rate": 1.0967516590988476e-06,
949
- "loss": 1.0902,
950
- "step": 157
951
- },
952
- {
953
- "epoch": 0.04,
954
- "learning_rate": 1.103737338456165e-06,
955
- "loss": 1.4818,
956
- "step": 158
957
- },
958
- {
959
- "epoch": 0.04,
960
- "learning_rate": 1.1107230178134824e-06,
961
- "loss": 1.3129,
962
- "step": 159
963
- },
964
- {
965
- "epoch": 0.04,
966
- "learning_rate": 1.1177086971708e-06,
967
- "loss": 1.3357,
968
- "step": 160
969
- },
970
- {
971
- "epoch": 0.04,
972
- "learning_rate": 1.1246943765281176e-06,
973
- "loss": 1.609,
974
- "step": 161
975
- },
976
- {
977
- "epoch": 0.04,
978
- "learning_rate": 1.131680055885435e-06,
979
- "loss": 1.2163,
980
- "step": 162
981
- },
982
- {
983
- "epoch": 0.04,
984
- "learning_rate": 1.1386657352427523e-06,
985
- "loss": 1.3282,
986
- "step": 163
987
- },
988
- {
989
- "epoch": 0.04,
990
- "learning_rate": 1.14565141460007e-06,
991
- "loss": 1.3166,
992
- "step": 164
993
- },
994
- {
995
- "epoch": 0.04,
996
- "learning_rate": 1.1526370939573875e-06,
997
- "loss": 1.3252,
998
- "step": 165
999
- },
1000
- {
1001
- "epoch": 0.04,
1002
- "learning_rate": 1.159622773314705e-06,
1003
- "loss": 1.4827,
1004
- "step": 166
1005
- },
1006
- {
1007
- "epoch": 0.04,
1008
- "learning_rate": 1.1666084526720223e-06,
1009
- "loss": 1.4112,
1010
- "step": 167
1011
- },
1012
- {
1013
- "epoch": 0.04,
1014
- "learning_rate": 1.17359413202934e-06,
1015
- "loss": 1.3811,
1016
- "step": 168
1017
- },
1018
- {
1019
- "epoch": 0.04,
1020
- "learning_rate": 1.1805798113866575e-06,
1021
- "loss": 1.2234,
1022
- "step": 169
1023
- },
1024
- {
1025
- "epoch": 0.04,
1026
- "learning_rate": 1.187565490743975e-06,
1027
- "loss": 1.5784,
1028
- "step": 170
1029
- },
1030
- {
1031
- "epoch": 0.04,
1032
- "learning_rate": 1.1945511701012923e-06,
1033
- "loss": 1.4294,
1034
- "step": 171
1035
- },
1036
- {
1037
- "epoch": 0.05,
1038
- "learning_rate": 1.20153684945861e-06,
1039
- "loss": 1.2927,
1040
- "step": 172
1041
- },
1042
- {
1043
- "epoch": 0.05,
1044
- "learning_rate": 1.2085225288159275e-06,
1045
- "loss": 1.3468,
1046
- "step": 173
1047
- },
1048
- {
1049
- "epoch": 0.05,
1050
- "learning_rate": 1.215508208173245e-06,
1051
- "loss": 1.39,
1052
- "step": 174
1053
- },
1054
- {
1055
- "epoch": 0.05,
1056
- "learning_rate": 1.2224938875305625e-06,
1057
- "loss": 1.6262,
1058
- "step": 175
1059
- },
1060
- {
1061
- "epoch": 0.05,
1062
- "learning_rate": 1.2294795668878799e-06,
1063
- "loss": 1.273,
1064
- "step": 176
1065
- },
1066
- {
1067
- "epoch": 0.05,
1068
- "learning_rate": 1.2364652462451975e-06,
1069
- "loss": 1.3019,
1070
- "step": 177
1071
- },
1072
- {
1073
- "epoch": 0.05,
1074
- "learning_rate": 1.243450925602515e-06,
1075
- "loss": 1.4669,
1076
- "step": 178
1077
- },
1078
- {
1079
- "epoch": 0.05,
1080
- "learning_rate": 1.2504366049598325e-06,
1081
- "loss": 1.388,
1082
- "step": 179
1083
- },
1084
- {
1085
- "epoch": 0.05,
1086
- "learning_rate": 1.2574222843171499e-06,
1087
- "loss": 1.2819,
1088
- "step": 180
1089
- },
1090
- {
1091
- "epoch": 0.05,
1092
- "learning_rate": 1.2644079636744675e-06,
1093
- "loss": 1.4056,
1094
- "step": 181
1095
- },
1096
- {
1097
- "epoch": 0.05,
1098
- "learning_rate": 1.271393643031785e-06,
1099
- "loss": 1.4843,
1100
- "step": 182
1101
- },
1102
- {
1103
- "epoch": 0.05,
1104
- "learning_rate": 1.2783793223891024e-06,
1105
- "loss": 1.2905,
1106
- "step": 183
1107
- },
1108
- {
1109
- "epoch": 0.05,
1110
- "learning_rate": 1.28536500174642e-06,
1111
- "loss": 1.4571,
1112
- "step": 184
1113
- },
1114
- {
1115
- "epoch": 0.05,
1116
- "learning_rate": 1.2923506811037376e-06,
1117
- "loss": 1.3495,
1118
- "step": 185
1119
- },
1120
- {
1121
- "epoch": 0.05,
1122
- "learning_rate": 1.2993363604610548e-06,
1123
- "loss": 1.2697,
1124
- "step": 186
1125
- },
1126
- {
1127
- "epoch": 0.05,
1128
- "learning_rate": 1.3063220398183724e-06,
1129
- "loss": 1.3934,
1130
- "step": 187
1131
- },
1132
- {
1133
- "epoch": 0.05,
1134
- "learning_rate": 1.3133077191756898e-06,
1135
- "loss": 1.2987,
1136
- "step": 188
1137
- },
1138
- {
1139
- "epoch": 0.05,
1140
- "learning_rate": 1.3202933985330074e-06,
1141
- "loss": 1.2335,
1142
- "step": 189
1143
- },
1144
- {
1145
- "epoch": 0.05,
1146
- "learning_rate": 1.327279077890325e-06,
1147
- "loss": 1.3375,
1148
- "step": 190
1149
- },
1150
- {
1151
- "epoch": 0.05,
1152
- "learning_rate": 1.3342647572476424e-06,
1153
- "loss": 1.3014,
1154
- "step": 191
1155
- },
1156
- {
1157
- "epoch": 0.05,
1158
- "learning_rate": 1.34125043660496e-06,
1159
- "loss": 1.2776,
1160
- "step": 192
1161
- },
1162
- {
1163
- "epoch": 0.05,
1164
- "learning_rate": 1.3482361159622776e-06,
1165
- "loss": 1.1337,
1166
- "step": 193
1167
- },
1168
- {
1169
- "epoch": 0.05,
1170
- "learning_rate": 1.355221795319595e-06,
1171
- "loss": 1.3029,
1172
- "step": 194
1173
- },
1174
- {
1175
- "epoch": 0.05,
1176
- "learning_rate": 1.3622074746769124e-06,
1177
- "loss": 1.4668,
1178
- "step": 195
1179
- },
1180
- {
1181
- "epoch": 0.05,
1182
- "learning_rate": 1.3691931540342298e-06,
1183
- "loss": 1.3451,
1184
- "step": 196
1185
- },
1186
- {
1187
- "epoch": 0.05,
1188
- "learning_rate": 1.3761788333915474e-06,
1189
- "loss": 1.3971,
1190
- "step": 197
1191
- },
1192
- {
1193
- "epoch": 0.05,
1194
- "learning_rate": 1.383164512748865e-06,
1195
- "loss": 1.3155,
1196
- "step": 198
1197
- },
1198
- {
1199
- "epoch": 0.05,
1200
- "learning_rate": 1.3901501921061826e-06,
1201
- "loss": 1.2804,
1202
- "step": 199
1203
- },
1204
- {
1205
- "epoch": 0.05,
1206
- "learning_rate": 1.3971358714635e-06,
1207
- "loss": 1.3541,
1208
- "step": 200
1209
  }
1210
  ],
1211
- "max_steps": 95425,
1212
  "num_train_epochs": 25,
1213
- "total_flos": 1535405170098176.0,
1214
  "trial_name": null,
1215
  "trial_params": null
1216
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.31978680879413723,
5
+ "global_step": 150,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
+ "learning_rate": 5.681818181818182e-08,
13
+ "loss": 1.4294,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
+ "learning_rate": 1.1363636363636364e-07,
19
+ "loss": 1.3293,
20
  "step": 2
21
  },
22
  {
23
+ "epoch": 0.01,
24
+ "learning_rate": 1.7045454545454545e-07,
25
+ "loss": 1.0797,
26
  "step": 3
27
  },
28
  {
29
+ "epoch": 0.01,
30
+ "learning_rate": 2.2727272727272729e-07,
31
+ "loss": 0.9853,
32
  "step": 4
33
  },
34
  {
35
+ "epoch": 0.01,
36
+ "learning_rate": 2.840909090909091e-07,
37
+ "loss": 0.983,
38
  "step": 5
39
  },
40
  {
41
+ "epoch": 0.01,
42
+ "learning_rate": 3.409090909090909e-07,
43
+ "loss": 0.9868,
44
  "step": 6
45
  },
46
  {
47
+ "epoch": 0.01,
48
+ "learning_rate": 3.9772727272727276e-07,
49
+ "loss": 1.0148,
50
  "step": 7
51
  },
52
  {
53
+ "epoch": 0.02,
54
+ "learning_rate": 4.5454545454545457e-07,
55
+ "loss": 1.0126,
56
  "step": 8
57
  },
58
  {
59
+ "epoch": 0.02,
60
+ "learning_rate": 5.113636363636364e-07,
61
+ "loss": 0.9555,
62
  "step": 9
63
  },
64
  {
65
+ "epoch": 0.02,
66
+ "learning_rate": 5.681818181818182e-07,
67
+ "loss": 1.0721,
68
  "step": 10
69
  },
70
  {
71
+ "epoch": 0.02,
72
+ "learning_rate": 6.25e-07,
73
+ "loss": 1.0352,
74
  "step": 11
75
  },
76
  {
77
+ "epoch": 0.03,
78
+ "learning_rate": 6.818181818181818e-07,
79
+ "loss": 0.9382,
80
  "step": 12
81
  },
82
  {
83
+ "epoch": 0.03,
84
+ "learning_rate": 7.386363636363638e-07,
85
+ "loss": 1.0059,
86
  "step": 13
87
  },
88
  {
89
+ "epoch": 0.03,
90
+ "learning_rate": 7.954545454545455e-07,
91
+ "loss": 0.9997,
92
  "step": 14
93
  },
94
  {
95
+ "epoch": 0.03,
96
+ "learning_rate": 8.522727272727273e-07,
97
+ "loss": 0.9663,
98
  "step": 15
99
  },
100
  {
101
+ "epoch": 0.03,
102
+ "learning_rate": 9.090909090909091e-07,
103
+ "loss": 0.9601,
104
  "step": 16
105
  },
106
  {
107
+ "epoch": 0.04,
108
+ "learning_rate": 9.65909090909091e-07,
109
+ "loss": 0.9271,
110
  "step": 17
111
  },
112
  {
113
+ "epoch": 0.04,
114
+ "learning_rate": 1.0227272727272729e-06,
115
+ "loss": 0.9655,
116
  "step": 18
117
  },
118
  {
119
+ "epoch": 0.04,
120
+ "learning_rate": 1.0795454545454546e-06,
121
+ "loss": 0.8991,
122
  "step": 19
123
  },
124
  {
125
+ "epoch": 0.04,
126
+ "learning_rate": 1.1363636363636364e-06,
127
+ "loss": 0.9081,
128
  "step": 20
129
  },
130
  {
131
+ "epoch": 0.04,
132
+ "learning_rate": 1.1931818181818183e-06,
133
+ "loss": 0.9127,
134
  "step": 21
135
  },
136
  {
137
+ "epoch": 0.05,
138
+ "learning_rate": 1.25e-06,
139
+ "loss": 0.9912,
140
  "step": 22
141
  },
142
  {
143
+ "epoch": 0.05,
144
+ "learning_rate": 1.3068181818181819e-06,
145
+ "loss": 0.9077,
146
  "step": 23
147
  },
148
  {
149
+ "epoch": 0.05,
150
+ "learning_rate": 1.3636363636363636e-06,
151
+ "loss": 0.9739,
152
  "step": 24
153
  },
154
  {
155
+ "epoch": 0.05,
156
+ "learning_rate": 1.4204545454545458e-06,
157
+ "loss": 0.9148,
158
  "step": 25
159
  },
160
  {
161
+ "epoch": 0.06,
162
+ "learning_rate": 1.4772727272727275e-06,
163
+ "loss": 0.8668,
164
  "step": 26
165
  },
166
  {
167
+ "epoch": 0.06,
168
+ "learning_rate": 1.5340909090909093e-06,
169
+ "loss": 0.8951,
170
  "step": 27
171
  },
172
  {
173
+ "epoch": 0.06,
174
+ "learning_rate": 1.590909090909091e-06,
175
+ "loss": 0.842,
176
  "step": 28
177
  },
178
  {
179
+ "epoch": 0.06,
180
+ "learning_rate": 1.6477272727272728e-06,
181
+ "loss": 0.8439,
182
  "step": 29
183
  },
184
  {
185
+ "epoch": 0.06,
186
+ "learning_rate": 1.7045454545454546e-06,
187
+ "loss": 0.8471,
188
  "step": 30
189
  },
190
  {
191
+ "epoch": 0.07,
192
+ "learning_rate": 1.7613636363636365e-06,
193
+ "loss": 0.9474,
194
  "step": 31
195
  },
196
  {
197
+ "epoch": 0.07,
198
+ "learning_rate": 1.8181818181818183e-06,
199
+ "loss": 0.8957,
200
  "step": 32
201
  },
202
  {
203
+ "epoch": 0.07,
204
+ "learning_rate": 1.8750000000000003e-06,
205
+ "loss": 0.8596,
206
  "step": 33
207
  },
208
  {
209
+ "epoch": 0.07,
210
+ "learning_rate": 1.931818181818182e-06,
211
+ "loss": 0.8183,
212
  "step": 34
213
  },
214
  {
215
+ "epoch": 0.07,
216
+ "learning_rate": 1.9886363636363638e-06,
217
+ "loss": 0.9198,
218
  "step": 35
219
  },
220
  {
221
+ "epoch": 0.08,
222
+ "learning_rate": 2.0454545454545457e-06,
223
+ "loss": 0.8402,
224
  "step": 36
225
  },
226
  {
227
+ "epoch": 0.08,
228
+ "learning_rate": 2.1022727272727277e-06,
229
+ "loss": 0.9209,
230
  "step": 37
231
  },
232
  {
233
+ "epoch": 0.08,
234
+ "learning_rate": 2.1590909090909092e-06,
235
+ "loss": 0.8554,
236
  "step": 38
237
  },
238
  {
239
+ "epoch": 0.08,
240
+ "learning_rate": 2.2159090909090912e-06,
241
+ "loss": 0.9276,
242
  "step": 39
243
  },
244
  {
245
+ "epoch": 0.09,
246
+ "learning_rate": 2.2727272727272728e-06,
247
+ "loss": 0.8872,
248
  "step": 40
249
  },
250
  {
251
+ "epoch": 0.09,
252
+ "learning_rate": 2.3295454545454547e-06,
253
+ "loss": 0.9047,
254
  "step": 41
255
  },
256
  {
257
+ "epoch": 0.09,
258
+ "learning_rate": 2.3863636363636367e-06,
259
+ "loss": 0.8514,
260
  "step": 42
261
  },
262
  {
263
+ "epoch": 0.09,
264
+ "learning_rate": 2.4431818181818182e-06,
265
+ "loss": 0.7976,
266
  "step": 43
267
  },
268
  {
269
+ "epoch": 0.09,
270
+ "learning_rate": 2.5e-06,
271
+ "loss": 0.8126,
272
  "step": 44
273
  },
274
  {
275
+ "epoch": 0.1,
276
+ "learning_rate": 2.556818181818182e-06,
277
+ "loss": 0.8536,
278
  "step": 45
279
  },
280
  {
281
+ "epoch": 0.1,
282
+ "learning_rate": 2.6136363636363637e-06,
283
+ "loss": 0.8228,
284
  "step": 46
285
  },
286
  {
287
+ "epoch": 0.1,
288
+ "learning_rate": 2.6704545454545457e-06,
289
+ "loss": 0.8439,
290
  "step": 47
291
  },
292
  {
293
+ "epoch": 0.1,
294
+ "learning_rate": 2.7272727272727272e-06,
295
+ "loss": 0.7859,
296
  "step": 48
297
  },
298
  {
299
+ "epoch": 0.1,
300
+ "learning_rate": 2.784090909090909e-06,
301
+ "loss": 0.8424,
302
  "step": 49
303
  },
304
  {
305
+ "epoch": 0.11,
306
+ "learning_rate": 2.8409090909090916e-06,
307
+ "loss": 0.7674,
308
  "step": 50
309
  },
310
  {
311
+ "epoch": 0.11,
312
+ "learning_rate": 2.897727272727273e-06,
313
+ "loss": 0.8022,
314
  "step": 51
315
  },
316
  {
317
+ "epoch": 0.11,
318
+ "learning_rate": 2.954545454545455e-06,
319
+ "loss": 0.7343,
320
  "step": 52
321
  },
322
  {
323
+ "epoch": 0.11,
324
+ "learning_rate": 3.0113636363636366e-06,
325
+ "loss": 0.7343,
326
  "step": 53
327
  },
328
  {
329
+ "epoch": 0.12,
330
+ "learning_rate": 3.0681818181818186e-06,
331
+ "loss": 0.7848,
332
  "step": 54
333
  },
334
  {
335
+ "epoch": 0.12,
336
+ "learning_rate": 3.125e-06,
337
+ "loss": 0.8391,
338
  "step": 55
339
  },
340
  {
341
+ "epoch": 0.12,
342
+ "learning_rate": 3.181818181818182e-06,
343
+ "loss": 0.7934,
344
  "step": 56
345
  },
346
  {
347
+ "epoch": 0.12,
348
+ "learning_rate": 3.2386363636363637e-06,
349
+ "loss": 0.774,
350
  "step": 57
351
  },
352
  {
353
+ "epoch": 0.12,
354
+ "learning_rate": 3.2954545454545456e-06,
355
+ "loss": 0.7292,
356
  "step": 58
357
  },
358
  {
359
+ "epoch": 0.13,
360
+ "learning_rate": 3.352272727272727e-06,
361
+ "loss": 0.7393,
362
  "step": 59
363
  },
364
  {
365
+ "epoch": 0.13,
366
+ "learning_rate": 3.409090909090909e-06,
367
+ "loss": 0.7484,
368
  "step": 60
369
  },
370
  {
371
+ "epoch": 0.13,
372
+ "learning_rate": 3.4659090909090915e-06,
373
+ "loss": 0.7077,
374
  "step": 61
375
  },
376
  {
377
+ "epoch": 0.13,
378
+ "learning_rate": 3.522727272727273e-06,
379
+ "loss": 0.8003,
380
  "step": 62
381
  },
382
  {
383
+ "epoch": 0.13,
384
+ "learning_rate": 3.579545454545455e-06,
385
+ "loss": 0.7998,
386
  "step": 63
387
  },
388
  {
389
+ "epoch": 0.14,
390
+ "learning_rate": 3.6363636363636366e-06,
391
+ "loss": 0.7882,
392
  "step": 64
393
  },
394
  {
395
+ "epoch": 0.14,
396
+ "learning_rate": 3.6931818181818186e-06,
397
+ "loss": 0.7326,
398
  "step": 65
399
  },
400
  {
401
+ "epoch": 0.14,
402
+ "learning_rate": 3.7500000000000005e-06,
403
+ "loss": 0.7945,
404
  "step": 66
405
  },
406
  {
407
+ "epoch": 0.14,
408
+ "learning_rate": 3.806818181818182e-06,
409
+ "loss": 0.7518,
410
  "step": 67
411
  },
412
  {
413
+ "epoch": 0.14,
414
+ "learning_rate": 3.863636363636364e-06,
415
+ "loss": 0.711,
416
  "step": 68
417
  },
418
  {
419
+ "epoch": 0.15,
420
+ "learning_rate": 3.9204545454545456e-06,
421
+ "loss": 0.7393,
422
  "step": 69
423
  },
424
  {
425
+ "epoch": 0.15,
426
+ "learning_rate": 3.9772727272727275e-06,
427
+ "loss": 0.7285,
428
  "step": 70
429
  },
430
  {
431
+ "epoch": 0.15,
432
+ "learning_rate": 4.0340909090909095e-06,
433
+ "loss": 0.6976,
434
  "step": 71
435
  },
436
  {
437
+ "epoch": 0.15,
438
+ "learning_rate": 4.0909090909090915e-06,
439
+ "loss": 0.7845,
440
  "step": 72
441
  },
442
  {
443
+ "epoch": 0.16,
444
+ "learning_rate": 4.1477272727272734e-06,
445
+ "loss": 0.7104,
446
  "step": 73
447
  },
448
  {
449
+ "epoch": 0.16,
450
+ "learning_rate": 4.204545454545455e-06,
451
+ "loss": 0.7021,
452
  "step": 74
453
  },
454
  {
455
+ "epoch": 0.16,
456
+ "learning_rate": 4.2613636363636365e-06,
457
+ "loss": 0.7425,
458
  "step": 75
459
  },
460
  {
461
+ "epoch": 0.16,
462
+ "learning_rate": 4.3181818181818185e-06,
463
+ "loss": 0.6481,
464
  "step": 76
465
  },
466
  {
467
+ "epoch": 0.16,
468
+ "learning_rate": 4.3750000000000005e-06,
469
+ "loss": 0.7236,
470
  "step": 77
471
  },
472
  {
473
+ "epoch": 0.17,
474
+ "learning_rate": 4.4318181818181824e-06,
475
+ "loss": 0.6932,
476
  "step": 78
477
  },
478
  {
479
+ "epoch": 0.17,
480
+ "learning_rate": 4.4886363636363636e-06,
481
+ "loss": 0.805,
482
  "step": 79
483
  },
484
  {
485
+ "epoch": 0.17,
486
+ "learning_rate": 4.5454545454545455e-06,
487
+ "loss": 0.7581,
488
  "step": 80
489
  },
490
  {
491
+ "epoch": 0.17,
492
+ "learning_rate": 4.6022727272727275e-06,
493
+ "loss": 0.7845,
494
  "step": 81
495
  },
496
  {
497
+ "epoch": 0.17,
498
+ "learning_rate": 4.6590909090909095e-06,
499
+ "loss": 0.722,
500
  "step": 82
501
  },
502
  {
503
+ "epoch": 0.18,
504
+ "learning_rate": 4.715909090909091e-06,
505
+ "loss": 0.7098,
506
  "step": 83
507
  },
508
  {
509
+ "epoch": 0.18,
510
+ "learning_rate": 4.772727272727273e-06,
511
+ "loss": 0.6372,
512
  "step": 84
513
  },
514
  {
515
+ "epoch": 0.18,
516
+ "learning_rate": 4.829545454545455e-06,
517
+ "loss": 0.6828,
518
  "step": 85
519
  },
520
  {
521
+ "epoch": 0.18,
522
+ "learning_rate": 4.8863636363636365e-06,
523
+ "loss": 0.655,
524
  "step": 86
525
  },
526
  {
527
+ "epoch": 0.19,
528
+ "learning_rate": 4.9431818181818184e-06,
529
+ "loss": 0.7154,
530
  "step": 87
531
  },
532
  {
533
+ "epoch": 0.19,
534
+ "learning_rate": 5e-06,
535
+ "loss": 0.7593,
536
  "step": 88
537
  },
538
  {
539
+ "epoch": 0.19,
540
+ "learning_rate": 5.056818181818182e-06,
541
+ "loss": 0.7319,
542
  "step": 89
543
  },
544
  {
545
+ "epoch": 0.19,
546
+ "learning_rate": 5.113636363636364e-06,
547
+ "loss": 0.7033,
548
  "step": 90
549
  },
550
  {
551
+ "epoch": 0.19,
552
+ "learning_rate": 5.170454545454546e-06,
553
+ "loss": 0.6835,
554
  "step": 91
555
  },
556
  {
557
+ "epoch": 0.2,
558
+ "learning_rate": 5.2272727272727274e-06,
559
+ "loss": 0.7718,
560
  "step": 92
561
  },
562
  {
563
+ "epoch": 0.2,
564
+ "learning_rate": 5.28409090909091e-06,
565
+ "loss": 0.7964,
566
  "step": 93
567
  },
568
  {
569
+ "epoch": 0.2,
570
+ "learning_rate": 5.340909090909091e-06,
571
+ "loss": 0.727,
572
  "step": 94
573
  },
574
  {
575
+ "epoch": 0.2,
576
+ "learning_rate": 5.397727272727273e-06,
577
+ "loss": 0.7526,
578
  "step": 95
579
  },
580
  {
581
+ "epoch": 0.2,
582
+ "learning_rate": 5.4545454545454545e-06,
583
+ "loss": 0.7241,
584
  "step": 96
585
  },
586
  {
587
+ "epoch": 0.21,
588
+ "learning_rate": 5.511363636363637e-06,
589
+ "loss": 0.7246,
590
  "step": 97
591
  },
592
  {
593
+ "epoch": 0.21,
594
+ "learning_rate": 5.568181818181818e-06,
595
+ "loss": 0.7095,
596
  "step": 98
597
  },
598
  {
599
+ "epoch": 0.21,
600
+ "learning_rate": 5.625e-06,
601
+ "loss": 0.6755,
602
  "step": 99
603
  },
604
  {
605
+ "epoch": 0.21,
606
+ "learning_rate": 5.681818181818183e-06,
607
+ "loss": 0.6322,
608
  "step": 100
609
  },
610
  {
611
+ "epoch": 0.22,
612
+ "learning_rate": 5.738636363636364e-06,
613
+ "loss": 0.7094,
614
  "step": 101
615
  },
616
  {
617
+ "epoch": 0.22,
618
+ "learning_rate": 5.795454545454546e-06,
619
+ "loss": 0.7256,
620
  "step": 102
621
  },
622
  {
623
+ "epoch": 0.22,
624
+ "learning_rate": 5.852272727272727e-06,
625
+ "loss": 0.7096,
626
  "step": 103
627
  },
628
  {
629
+ "epoch": 0.22,
630
+ "learning_rate": 5.90909090909091e-06,
631
+ "loss": 0.7148,
632
  "step": 104
633
  },
634
  {
635
+ "epoch": 0.22,
636
+ "learning_rate": 5.965909090909091e-06,
637
+ "loss": 0.6763,
638
  "step": 105
639
  },
640
  {
641
+ "epoch": 0.23,
642
+ "learning_rate": 6.022727272727273e-06,
643
+ "loss": 0.6761,
644
  "step": 106
645
  },
646
  {
647
+ "epoch": 0.23,
648
+ "learning_rate": 6.079545454545454e-06,
649
+ "loss": 0.7454,
650
  "step": 107
651
  },
652
  {
653
+ "epoch": 0.23,
654
+ "learning_rate": 6.136363636363637e-06,
655
+ "loss": 0.694,
656
  "step": 108
657
  },
658
  {
659
+ "epoch": 0.23,
660
+ "learning_rate": 6.193181818181818e-06,
661
+ "loss": 0.6797,
662
  "step": 109
663
  },
664
  {
665
+ "epoch": 0.23,
666
+ "learning_rate": 6.25e-06,
667
+ "loss": 0.6294,
668
  "step": 110
669
  },
670
  {
671
+ "epoch": 0.24,
672
+ "learning_rate": 6.306818181818183e-06,
673
+ "loss": 0.709,
674
  "step": 111
675
  },
676
  {
677
+ "epoch": 0.24,
678
+ "learning_rate": 6.363636363636364e-06,
679
+ "loss": 0.6536,
680
  "step": 112
681
  },
682
  {
683
+ "epoch": 0.24,
684
+ "learning_rate": 6.420454545454546e-06,
685
+ "loss": 0.7591,
686
  "step": 113
687
  },
688
  {
689
+ "epoch": 0.24,
690
+ "learning_rate": 6.477272727272727e-06,
691
+ "loss": 0.6292,
692
  "step": 114
693
  },
694
  {
695
+ "epoch": 0.25,
696
+ "learning_rate": 6.53409090909091e-06,
697
+ "loss": 0.7554,
698
  "step": 115
699
  },
700
  {
701
+ "epoch": 0.25,
702
+ "learning_rate": 6.590909090909091e-06,
703
+ "loss": 0.6496,
704
  "step": 116
705
  },
706
  {
707
+ "epoch": 0.25,
708
+ "learning_rate": 6.647727272727273e-06,
709
+ "loss": 0.7014,
710
  "step": 117
711
  },
712
  {
713
+ "epoch": 0.25,
714
+ "learning_rate": 6.704545454545454e-06,
715
+ "loss": 0.6783,
716
  "step": 118
717
  },
718
  {
719
+ "epoch": 0.25,
720
+ "learning_rate": 6.761363636363637e-06,
721
+ "loss": 0.6998,
722
  "step": 119
723
  },
724
  {
725
+ "epoch": 0.26,
726
+ "learning_rate": 6.818181818181818e-06,
727
+ "loss": 0.6159,
728
  "step": 120
729
  },
730
  {
731
+ "epoch": 0.26,
732
+ "learning_rate": 6.875e-06,
733
+ "loss": 0.7181,
734
  "step": 121
735
  },
736
  {
737
+ "epoch": 0.26,
738
+ "learning_rate": 6.931818181818183e-06,
739
+ "loss": 0.7008,
740
  "step": 122
741
  },
742
  {
743
+ "epoch": 0.26,
744
+ "learning_rate": 6.988636363636364e-06,
745
+ "loss": 0.6097,
746
  "step": 123
747
  },
748
  {
749
+ "epoch": 0.26,
750
+ "learning_rate": 7.045454545454546e-06,
751
+ "loss": 0.7091,
752
  "step": 124
753
  },
754
  {
755
+ "epoch": 0.27,
756
+ "learning_rate": 7.102272727272727e-06,
757
+ "loss": 0.6313,
758
  "step": 125
759
  },
760
  {
761
+ "epoch": 0.27,
762
+ "learning_rate": 7.15909090909091e-06,
763
+ "loss": 0.6355,
764
  "step": 126
765
  },
766
  {
767
+ "epoch": 0.27,
768
+ "learning_rate": 7.215909090909091e-06,
769
+ "loss": 0.6559,
770
  "step": 127
771
  },
772
  {
773
+ "epoch": 0.27,
774
+ "learning_rate": 7.272727272727273e-06,
775
+ "loss": 0.6066,
776
  "step": 128
777
  },
778
  {
779
+ "epoch": 0.28,
780
+ "learning_rate": 7.329545454545455e-06,
781
+ "loss": 0.6655,
782
  "step": 129
783
  },
784
  {
785
+ "epoch": 0.28,
786
+ "learning_rate": 7.386363636363637e-06,
787
+ "loss": 0.6623,
788
  "step": 130
789
  },
790
  {
791
+ "epoch": 0.28,
792
+ "learning_rate": 7.443181818181818e-06,
793
+ "loss": 0.6746,
794
  "step": 131
795
  },
796
  {
797
+ "epoch": 0.28,
798
+ "learning_rate": 7.500000000000001e-06,
799
+ "loss": 0.6795,
800
  "step": 132
801
  },
802
  {
803
+ "epoch": 0.28,
804
+ "learning_rate": 7.556818181818183e-06,
805
+ "loss": 0.6885,
806
  "step": 133
807
  },
808
  {
809
+ "epoch": 0.29,
810
+ "learning_rate": 7.613636363636364e-06,
811
+ "loss": 0.6868,
812
  "step": 134
813
  },
814
  {
815
+ "epoch": 0.29,
816
+ "learning_rate": 7.670454545454547e-06,
817
+ "loss": 0.6539,
818
  "step": 135
819
  },
820
  {
821
+ "epoch": 0.29,
822
+ "learning_rate": 7.727272727272727e-06,
823
+ "loss": 0.6463,
824
  "step": 136
825
  },
826
  {
827
+ "epoch": 0.29,
828
+ "learning_rate": 7.784090909090911e-06,
829
+ "loss": 0.6706,
830
  "step": 137
831
  },
832
  {
833
+ "epoch": 0.29,
834
+ "learning_rate": 7.840909090909091e-06,
835
+ "loss": 0.6806,
836
  "step": 138
837
  },
838
  {
839
+ "epoch": 0.3,
840
+ "learning_rate": 7.897727272727273e-06,
841
+ "loss": 0.6643,
842
  "step": 139
843
  },
844
  {
845
+ "epoch": 0.3,
846
+ "learning_rate": 7.954545454545455e-06,
847
+ "loss": 0.6554,
848
  "step": 140
849
  },
850
  {
851
+ "epoch": 0.3,
852
+ "learning_rate": 8.011363636363637e-06,
853
+ "loss": 0.6783,
854
  "step": 141
855
  },
856
  {
857
+ "epoch": 0.3,
858
+ "learning_rate": 8.068181818181819e-06,
859
+ "loss": 0.6641,
860
  "step": 142
861
  },
862
  {
863
+ "epoch": 0.3,
864
+ "learning_rate": 8.125000000000001e-06,
865
+ "loss": 0.6763,
866
  "step": 143
867
  },
868
  {
869
+ "epoch": 0.31,
870
+ "learning_rate": 8.181818181818183e-06,
871
+ "loss": 0.7325,
872
  "step": 144
873
  },
874
  {
875
+ "epoch": 0.31,
876
+ "learning_rate": 8.238636363636365e-06,
877
+ "loss": 0.6366,
878
  "step": 145
879
  },
880
  {
881
+ "epoch": 0.31,
882
+ "learning_rate": 8.295454545454547e-06,
883
+ "loss": 0.7373,
884
  "step": 146
885
  },
886
  {
887
+ "epoch": 0.31,
888
+ "learning_rate": 8.352272727272727e-06,
889
+ "loss": 0.7105,
890
  "step": 147
891
  },
892
  {
893
+ "epoch": 0.32,
894
+ "learning_rate": 8.40909090909091e-06,
895
+ "loss": 0.6865,
896
  "step": 148
897
  },
898
  {
899
+ "epoch": 0.32,
900
+ "learning_rate": 8.465909090909091e-06,
901
+ "loss": 0.6784,
902
  "step": 149
903
  },
904
  {
905
+ "epoch": 0.32,
906
+ "learning_rate": 8.522727272727273e-06,
907
+ "loss": 0.6769,
908
  "step": 150
909
  }
910
  ],
911
+ "max_steps": 11725,
912
  "num_train_epochs": 25,
913
+ "total_flos": 3.974479252291584e+17,
914
  "trial_name": null,
915
  "trial_params": null
916
  }
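
A note on the updated trainer_state.json above: the log records a strictly linear warmup, with the learning rate rising by 5.681818e-08 per optimizer step (5.681818e-08 at step 1, 5e-06 at step 88, 8.5227e-06 at step 150), and the epoch counter tracks step / (max_steps / num_train_epochs) = 150 / (11725 / 25) ≈ 0.32. A minimal Python sketch to verify this from a local copy of the file; the path and the per-step increment are read off this diff, not taken from any tooling in the repo:

import json

# Assumes a local copy of the updated trainer_state.json from this commit.
with open("trainer_state.json") as f:
    state = json.load(f)

# Linear warmup: learning_rate == step * 5.681818e-08 for every logged step
# (5.681818e-08 at step 1, 5e-06 at step 88, 8.5227e-06 at step 150).
LR_PER_STEP = 5.681818181818182e-08
for entry in state["log_history"]:
    assert abs(entry["learning_rate"] - entry["step"] * LR_PER_STEP) < 1e-12

# Epoch bookkeeping: max_steps / num_train_epochs = 11725 / 25 = 469
# optimizer steps per epoch, so step 150 lands at epoch ~0.32.
steps_per_epoch = state["max_steps"] / state["num_train_epochs"]
print(f"{steps_per_epoch:.0f} steps/epoch; step 150 = epoch {150 / steps_per_epoch:.4f}")
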
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd79995eb50e2a9114d5c9c816c56c9095e5e18d9043dc3135caed891cd6c73d
3
- size 3771
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2694e65878114e56cf098ae35983b751a7d3942a83bdcf5cca40a623501a6e7f
3
+ size 3707
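
Note that training_args.bin remains a Git LFS pointer in this diff: only the oid and size change, meaning the serialized TrainingArguments object differs between the two uploads. After fetching the actual file with `git lfs pull`, it can be inspected with torch.load, since transformers.Trainer writes it via torch.save. A hedged sketch (transformers must be importable for the unpickling to succeed):

import torch

# training_args.bin is the pickled TrainingArguments saved by the Trainer.
# weights_only=False is needed on newer PyTorch, where weights-only
# unpickling is the default; on older releases, drop the argument.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.warmup_steps)
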