leonzhou286 committed
Commit 23836cf
1 Parent(s): b9633b2

Upload LlamaMoEForCausalLM

README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- license: mit
+ base_model: meta-llama/Meta-Llama-3-8B-Instruct
  language:
  - en
- base_model: meta-llama/Meta-Llama-3-8B-Instruct
+ license: mit
  ---
 
  # Llama 3 8b Instruct MOE
config.json ADDED
@@ -0,0 +1,369 @@
+ {
+   "_name_or_path": "/notebooks/converted_model",
+   "add_weight_norm": false,
+   "architectures": [
+     "LlamaMoEForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 128000,
+   "calculator_type": "UniversalCalculator",
+   "capacity_factor": 1.25,
+   "drop_tokens": true,
+   "dropped_padding": "zero",
+   "eos_token_id": 128009,
+   "gate_add_noise": true,
+   "gate_balance_loss_weight": 0.01,
+   "gate_network": "mlp",
+   "gate_noise_epsilon": 0.01,
+   "gate_type": "TopKBalancedNoisyGate",
+   "gate_use_balance": true,
+   "gate_use_softmax": true,
+   "gates": "mlp",
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "model_type": "llama_moe",
+   "multiply_gate_scores": true,
+   "num_attention_heads": 32,
+   "num_experts": 8,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "num_selects": 2,
+   "pad_token_id": 0,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 500000.0,
+   "score_scale_factor": 8.0,
+   "size_experts": [
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792],
+     [1792, 1792, 1792, 1792, 1792, 1792, 1792, 1792]
+   ],
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.44.1",
+   "use_cache": true,
+   "vocab_size": 128256
+ }
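
The `size_experts` field carries the MoE-specific structure of this checkpoint: one row per hidden layer (32 rows, matching `num_hidden_layers`) and one entry per expert (8 entries, matching `num_experts`), with each layer's expert widths summing back to the dense `intermediate_size` of 14336. A minimal sanity-check sketch using only the standard library; the local `config.json` path is illustrative and assumes the file from this commit has already been downloaded:

```python
import json

# Illustrative local path; assumes config.json from this commit is present.
with open("config.json") as f:
    cfg = json.load(f)

# One row of expert widths per hidden layer, one entry per expert.
assert len(cfg["size_experts"]) == cfg["num_hidden_layers"]      # 32 layers
for layer_sizes in cfg["size_experts"]:
    assert len(layer_sizes) == cfg["num_experts"]                 # 8 experts per layer
    assert sum(layer_sizes) == cfg["intermediate_size"]           # 8 * 1792 == 14336

print(f'{cfg["num_selects"]} of {cfg["num_experts"]} experts per token, '
      f'gated by {cfg["gate_type"]}')
```

In other words, each layer's 14336-wide Llama 3 MLP has been split into eight 1792-wide experts, and the `TopKBalancedNoisyGate` routes every token to `num_selects = 2` of them.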
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 128000,
+   "eos_token_id": 128009,
+   "pad_token_id": 0,
+   "transformers_version": "4.44.1"
+ }
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d566266796875d08001b3872f0bbde688b94896ac83ea405d4e48bc459cd055
+ size 4975640656
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5204f00c86053f3c7f4794c28815431cd450e8e9eaf19733ba6c90542ac442ec
+ size 4980139832
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a7cecfc025b582eb7316bb368e2346465e4a1f86818bde964add032fa7b4892
+ size 4972047184
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9f85a45ae8268290105e2c6f48c047971db7633854e5966bfed730fe2404f8f
+ size 4972047240
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94c9be217601bac4e3d2a4b4f32548034b0f288aabc42fc694dcf7c029941775
+ size 4992722896
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c781a9934ac4ffa1401d6397674783ae52d9b6b4fbe54f3a9abf215a1961097
+ size 4988824600
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb0dde269058e7ada125951d937383c8553d99e852a47da117b6b95295bd721e
+ size 2248164200
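
Each shard above is stored as a Git LFS pointer rather than the weights themselves: `oid` is the SHA-256 of the real safetensors file and `size` is its length in bytes. A hedged sketch of checking a downloaded shard against its pointer with the standard library only; the values are copied from the `model-00007-of-00007.safetensors` pointer above, and the check is a generic integrity verification, not something required to use the model:

```python
import hashlib

# Copied from the model-00007-of-00007.safetensors LFS pointer above.
expected_oid = "cb0dde269058e7ada125951d937383c8553d99e852a47da117b6b95295bd721e"
expected_size = 2248164200

sha256 = hashlib.sha256()
total = 0
with open("model-00007-of-00007.safetensors", "rb") as f:
    # Stream in 1 MiB chunks so the multi-GB shard never sits in memory at once.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)
        total += len(chunk)

assert total == expected_size, f"unexpected size: {total}"
assert sha256.hexdigest() == expected_oid, "sha256 does not match the LFS pointer"
print("model-00007-of-00007.safetensors matches its LFS pointer")
```

The same check applies to the other six shards by swapping in their respective `oid` and `size` values.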
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff