BAAI
/

hyxmmm committed on
Commit
d63c0a8
·
verified ·
1 Parent(s): 49b8c02

Upload configuration_aquilamoe.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. configuration_aquilamoe.py +143 -0
configuration_aquilamoe.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """ AquilaMoE model configuration"""
15
+ # Copied from transformers.models.mixtral.configuration_mixtral
16
+
17
+ from transformers import PretrainedConfig
18
+ from transformers.utils import logging
19
+
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ class AquilaMoeConfig(PretrainedConfig):
25
+ r"""
26
+ Args:
27
+ vocab_size (`int`, *optional*, defaults to 150000):
28
+ Vocabulary size of the AquilaMoE model. Defines the number of different tokens that can be represented by the
29
+ `input_ids` passed when calling [`AquilaMoE`]
30
+ hidden_size (`int`, *optional*, defaults to 4096):
31
+ Dimension of the hidden representations.
32
+ intermediate_size (`int`, *optional*, defaults to 14336):
33
+ Dimension of the MLP representations.
34
+ num_hidden_layers (`int`, *optional*, defaults to 32):
35
+ Number of hidden layers in the Transformer encoder.
36
+ num_attention_heads (`int`, *optional*, defaults to 32):
37
+ Number of attention heads for each attention layer in the Transformer encoder.
38
+ num_key_value_heads (`int`, *optional*, defaults to 8):
39
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
40
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
41
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
42
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
43
+ by mean-pooling all the original heads within that group. For more details, check out [this
44
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
45
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
46
+ The non-linear activation function (function or string) in the decoder.
47
+ max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
48
+ The maximum sequence length that this model might ever be used with. AquilaMoE's sliding window attention
49
+ allows sequences of up to 4096*32 tokens.
50
+ initializer_range (`float`, *optional*, defaults to 0.02):
51
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
52
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
53
+ The epsilon used by the rms normalization layers.
54
+ use_cache (`bool`, *optional*, defaults to `True`):
55
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
56
+ relevant if `config.is_decoder=True`.
57
+ pad_token_id (`int`, *optional*):
58
+ The id of the padding token.
59
+ bos_token_id (`int`, *optional*, defaults to 1):
60
+ The id of the "beginning-of-sequence" token.
61
+ eos_token_id (`int`, *optional*, defaults to 2):
62
+ The id of the "end-of-sequence" token.
63
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
64
+ Whether the model's input and output word embeddings should be tied.
65
+ rope_theta (`float`, *optional*, defaults to 1000000.0):
66
+ The base period of the RoPE embeddings.
67
+ sliding_window (`int`, *optional*, defaults to 4096):
68
+ Sliding window attention window size. If not specified, will default to `4096`.
69
+ attention_dropout (`float`, *optional*, defaults to 0.0):
70
+ The dropout ratio for the attention probabilities.
71
+ num_experts_per_tok (`int`, *optional*, defaults to 2):
72
+ The number of experts to route per token; can also be interpreted as the `top-k` routing
73
+ parameter
74
+ num_local_experts (`int`, *optional*, defaults to 8):
75
+ Number of experts per Sparse MLP layer.
76
+ output_router_logits (`bool`, *optional*, defaults to `False`):
77
+ Whether or not the router logits should be returned by the model. Enabling this will also
78
+ allow the model to output the auxiliary loss (see `router_aux_loss_coef`).
79
+ router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
80
+ The aux loss factor for the total loss.
81
+
82
+ """
83
+
84
+ model_type = "aquilamoe"
85
+ keys_to_ignore_at_inference = ["past_key_values"]
86
+
87
+ def __init__(
88
+ self,
89
+ vocab_size=150000,
90
+ hidden_size=4096,
91
+ intermediate_size=14336,
92
+ num_hidden_layers=32,
93
+ num_attention_heads=32,
94
+ num_key_value_heads=8,
95
+ hidden_act="silu",
96
+ max_position_embeddings=4096 * 32,
97
+ initializer_range=0.02,
98
+ rms_norm_eps=1e-5,
99
+ use_cache=True,
100
+ pad_token_id=None,
101
+ bos_token_id=1,
102
+ eos_token_id=2,
103
+ tie_word_embeddings=False,
104
+ rope_theta=1e6,
105
+ sliding_window=4096,
106
+ attention_dropout=0.0,
107
+ num_experts_per_tok=2,
108
+ num_local_experts=8,
109
+ output_router_logits=False,
110
+ router_aux_loss_coef=0.001,
111
+ **kwargs,
112
+ ):
113
+ self.vocab_size = vocab_size
114
+ self.max_position_embeddings = max_position_embeddings
115
+ self.hidden_size = hidden_size
116
+ self.intermediate_size = intermediate_size
117
+ self.num_hidden_layers = num_hidden_layers
118
+ self.num_attention_heads = num_attention_heads
119
+ self.sliding_window = sliding_window
120
+
121
+ # for backward compatibility
122
+ if num_key_value_heads is None:
123
+ num_key_value_heads = num_attention_heads
124
+
125
+ self.num_key_value_heads = num_key_value_heads
126
+ self.hidden_act = hidden_act
127
+ self.initializer_range = initializer_range
128
+ self.rms_norm_eps = rms_norm_eps
129
+ self.use_cache = use_cache
130
+ self.rope_theta = rope_theta
131
+ self.attention_dropout = attention_dropout
132
+
133
+ self.num_experts_per_tok = num_experts_per_tok
134
+ self.num_local_experts = num_local_experts
135
+ self.output_router_logits = output_router_logits
136
+ self.router_aux_loss_coef = router_aux_loss_coef
137
+ super().__init__(
138
+ pad_token_id=pad_token_id,
139
+ bos_token_id=bos_token_id,
140
+ eos_token_id=eos_token_id,
141
+ tie_word_embeddings=tie_word_embeddings,
142
+ **kwargs,
143
+ )