File size: 2,360 Bytes
5430157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from transformers import PretrainedConfig
from typing import List


class LMConfig(PretrainedConfig):
    """Configuration object registered under the ``"minimind"`` model type.

    The constructor copies every named argument onto the instance under the
    same attribute name, then hands any remaining keyword arguments to
    ``PretrainedConfig.__init__``.

    Args:
        dim: Defaults to 768.
        n_layers: Defaults to 16.
        n_heads: Defaults to 16.
        n_kv_heads: Defaults to 8.
        vocab_size: Defaults to 6400.
        hidden_dim: Defaults to ``None`` (note the annotation says ``int``).
        multiple_of: Defaults to 64.
        norm_eps: Defaults to 1e-5.
        max_seq_len: Defaults to 512.
        dropout: Defaults to 0.0.
        flash_attn: Defaults to ``True``.
        use_moe: Switch for the MoE settings listed below. According to the
            original author's note, those settings do not apply when this
            is ``False``.
        num_experts_per_tok: Number of experts selected for each token.
        n_routed_experts: Total number of experts.
        n_shared_experts: Shared experts.
            NOTE(review): annotated ``bool`` although the name reads like a
            count -- confirm against the code that consumes this config.
        scoring_func: Scoring function; defaults to ``'softmax'``.
        aux_loss_alpha: Alpha parameter of the auxiliary loss.
        seq_aux: Whether the auxiliary loss is computed at sequence level.
        norm_topk_prob: Whether the top-k probabilities are normalized.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.

    NOTE(review): the meaning of the non-MoE settings is determined by the
    model code that reads this config, which is not visible in this file.
    """

    model_type = "minimind"

    def __init__(
        self,
        dim: int = 768,
        n_layers: int = 16,
        n_heads: int = 16,
        n_kv_heads: int = 8,
        vocab_size: int = 6400,
        hidden_dim: int = None,
        multiple_of: int = 64,
        norm_eps: float = 1e-5,
        max_seq_len: int = 512,
        dropout: float = 0.0,
        flash_attn: bool = True,
        # ---- MoE-specific settings (not applicable unless use_moe) ----
        use_moe: bool = False,
        num_experts_per_tok=2,
        n_routed_experts=4,
        n_shared_experts: bool = True,
        scoring_func='softmax',
        aux_loss_alpha=0.01,
        seq_aux=True,
        norm_topk_prob=True,
        **kwargs,
    ):
        # General model settings, stored verbatim.
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.multiple_of = multiple_of
        self.norm_eps = norm_eps
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.flash_attn = flash_attn

        # MoE settings; see the class docstring for what each one controls.
        self.use_moe = use_moe
        self.num_experts_per_tok = num_experts_per_tok
        self.n_routed_experts = n_routed_experts
        self.n_shared_experts = n_shared_experts
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        self.norm_topk_prob = norm_topk_prob

        # Base-class initialisation runs last and receives only the extras.
        super().__init__(**kwargs)