File size: 4,627 Bytes
92233d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
from transformers.configuration_utils import PretrainedConfig
class LlamaMoEConfig(PretrainedConfig):
    """Configuration container for the ``llama_moe`` model type.

    The constructor stores its arguments as same-named attributes and then
    delegates to ``PretrainedConfig.__init__``. Two things are not a plain
    copy:

    * ``num_key_value_heads`` falls back to ``num_attention_heads`` when it is
      ``None`` (kept for backward compatibility).
    * ``rope_scaling`` is validated by :meth:`_rope_scaling_validation` right
      after it is stored, so an invalid value raises ``ValueError`` from the
      constructor.

    Args:
        vocab_size, hidden_size, intermediate_size, num_hidden_layers,
        num_attention_heads, num_key_value_heads, hidden_act,
        max_position_embeddings, initializer_range, rms_norm_eps, use_cache,
        pretraining_tp, rope_theta, attention_bias, attention_dropout:
            Stored as given (apart from the ``num_key_value_heads`` fallback
            above). NOTE(review): these follow the usual LLaMA hyperparameter
            names; confirm exact semantics against the modeling code.
        rope_scaling: ``None``, or a two-entry dict with keys ``"type"``
            (``"linear"`` or ``"dynamic"``) and ``"factor"`` (a number > 1).
        pad_token_id, bos_token_id, eos_token_id, tie_word_embeddings:
            Not stored here; forwarded to ``PretrainedConfig.__init__``.
        num_experts, num_selects, size_experts: MoE expert settings, stored as
            given. Presumably expert count, experts selected per token and
            per-expert sizes -- TODO confirm against the MoE layers.
        gate_type, gate_network, gate_use_softmax, gate_use_balance,
        gate_balance_loss_weight, gate_add_noise, gate_noise_epsilon:
            MoE gate settings, stored as given. ``gate_noise_epsilon`` is
            grouped under ``TopKBalancedNoisyGate`` in the signature.
        calculator_type, multiply_gate_scores, score_scale_factor,
        add_weight_norm, drop_tokens, dropped_padding, capacity_factor:
            MoE calculator settings, stored as given. The last three are
            grouped under ``SwitchDropTokenCalculator`` in the signature.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``rope_scaling`` is not ``None`` and is malformed.
    """

    model_type = "llama_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        # -------- moe expert configs --------
        num_experts=16,
        num_selects=4,
        size_experts=None,
        # -------- moe gate configs --------
        gate_type="TopKBalancedNoisyGate",
        gate_network="mlp",
        gate_use_softmax=True,
        gate_use_balance=True,
        gate_balance_loss_weight=1e-2,
        gate_add_noise=True,
        # TopKBalancedNoisyGate
        gate_noise_epsilon=1e-2,
        # -------- moe calculator configs --------
        calculator_type="UniversalCalculator",
        multiply_gate_scores=True,
        score_scale_factor=1.0,
        add_weight_norm=False,
        # SwitchDropTokenCalculator
        drop_tokens=True,
        dropped_padding="zero",
        capacity_factor=1.25,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Fail fast: reject a malformed `rope_scaling` before the rest of the
        # configuration is populated.
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # -------- moe expert configs --------
        self.num_experts = num_experts
        self.num_selects = num_selects
        self.size_experts = size_experts

        # -------- moe gate configs --------
        self.gate_type = gate_type
        self.gate_network = gate_network
        self.gate_use_softmax = gate_use_softmax
        self.gate_use_balance = gate_use_balance
        self.gate_balance_loss_weight = gate_balance_loss_weight
        self.gate_add_noise = gate_add_noise
        self.gate_noise_epsilon = gate_noise_epsilon

        # -------- moe calculator configs --------
        self.calculator_type = calculator_type
        self.multiply_gate_scores = multiply_gate_scores
        self.score_scale_factor = score_scale_factor
        self.add_weight_norm = add_weight_norm
        self.drop_tokens = drop_tokens
        self.dropped_padding = dropped_padding
        self.capacity_factor = capacity_factor

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        ``None`` is accepted as-is. Any other value must be a dict with
        exactly two entries: ``"type"`` set to ``"linear"`` or ``"dynamic"``,
        and ``"factor"`` set to a real number strictly greater than 1.
        Integer factors are accepted (e.g. ``2`` from a hand-written JSON
        config); booleans are not.

        Raises:
            ValueError: If ``self.rope_scaling`` violates any rule above.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )

        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)

        # A missing key yields None, which is not in the allowed tuple either.
        if rope_scaling_type not in ("linear", "dynamic"):
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )

        # bool is a subclass of int, so it has to be excluded explicitly.
        factor_is_number = isinstance(rope_scaling_factor, (int, float)) and not isinstance(
            rope_scaling_factor, bool
        )
        if not factor_is_number or rope_scaling_factor <= 1.0:
            raise ValueError(
                f"`rope_scaling`'s factor field must be a number > 1, got {rope_scaling_factor}"
            )
|