pirroh committed on
Commit b634e3d
1 Parent(s): e023a84

Delete configuration_replit_lm.py

Files changed (1)
  1. configuration_replit_lm.py +0 -168
configuration_replit_lm.py DELETED
@@ -1,168 +0,0 @@
-# Copyright 2022 MosaicML Examples authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""Forked for ReplitLM"""
-
-"""A HuggingFace-style model configuration."""
-
-
-from typing import Optional, Tuple, Union
-from transformers import PretrainedConfig
-class ReplitLMConfig(PretrainedConfig):
-    model_type = 'replit_lm'
-
-    def __init__(
-        self,
-        d_model: int = 2048,
-        n_heads: int = 16,
-        n_layers: int = 24,
-        mlp_ratio: int = 4,
-        max_seq_len: int = 2048,
-        vocab_size: int = 50368,
-        attn_pdrop: float = 0.0,
-        resid_pdrop: float = 0.0,
-        emb_pdrop: float = 0.0,
-        attn_impl: str = 'triton',
-        attn_qk_ln: bool = False,
-        attn_clip_qkv: Optional[float] = None,
-        softmax_scale: Optional[float] = None,
-        prefix_lm: Optional[bool] = False,
-        attn_uses_sequence_id: Optional[bool] = False,
-        alibi: bool = False,
-        alibi_bias_max: int = 8,
-        init_device: str = 'cpu',
-        logit_scale: Optional[Union[float, str]] = None,
-        no_bias: bool = False,
-        verbose: int = 0,
-        param_init_fn: str = 'kaiming_normal_',
-        init_div_is_residual: Union[int, float, str, bool] = True,
-        init_std: float = 0.02,
-        emb_init_std: Optional[float] = None,
-        emb_init_uniform_lim: Optional[Union[Tuple[float, float],
-                                             float]] = None,
-        init_gain: float = 0,
-        fan_mode: str = 'fan_in',
-        init_nonlinearity: str = 'relu',
-        embedding_fraction: float = 1.0,
-        low_precision_layernorm: bool = True,
-        use_cache: bool = False,
-        **kwargs,
-    ):
-        """The ReplitLM configuration class.
-
-        Args:
-            d_model (int): The size of the embedding dimension of the model.
-            n_heads (int): The number of attention heads.
-            n_layers (int): The number of layers in the model.
-            mlp_ratio (int): The ratio of the up/down scale in the MLP.
-            max_seq_len (int): The maximum sequence length of the model.
-            vocab_size (int): The size of the vocabulary.
-            attn_pdrop (float): The dropout probability for the attention layers.
-            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
-            emb_pdrop (float): The dropout probability for the embedding layer.
-            attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
-            attn_qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
-            attn_clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
-                this value.
-            softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
-                use the default scale of ``1/sqrt(d_keys)``.
-            prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
-                extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
-                can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
-            attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
-                When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
-                which sub-sequence each token belongs to.
-                Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
-            alibi (bool): Whether to use the alibi bias instead of position embeddings.
-            alibi_bias_max (int): The maximum value of the alibi bias.
-            init_device (str): The device to use for parameter initialization.
-            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
-            no_bias (bool): Whether to use bias in all layers.
-            verbose (int): The verbosity level. 0 is silent.
-            param_init_fn (str): The parameter initialization scheme to use. One of 'default_', 'baseline_', 'kaiming_uniform_',
-                'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or 'xavier_normal_'.
-            init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
-            init_std (float): The standard deviation of the normal distribution used to initialize the model,
-                if using the baseline_ parameter initialization scheme.
-            emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
-            emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
-                used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
-            init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
-            fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
-            init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
-            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
-            low_precision_layernorm (bool): Whether to use low precision layer normalization.
-            use_cache (bool): Whether or not the model should return the last key/values attentions
-        """
-        self.d_model = d_model
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.mlp_ratio = mlp_ratio
-        self.max_seq_len = max_seq_len
-        self.vocab_size = vocab_size
-        self.attn_pdrop = attn_pdrop
-        self.resid_pdrop = resid_pdrop
-        self.emb_pdrop = emb_pdrop
-        self.attn_impl = attn_impl
-        self.attn_qk_ln = attn_qk_ln
-        self.attn_clip_qkv = attn_clip_qkv
-        self.softmax_scale = softmax_scale
-        self.prefix_lm = prefix_lm
-        self.attn_uses_sequence_id = attn_uses_sequence_id
-        self.alibi = alibi
-        self.alibi_bias_max = alibi_bias_max
-        self.init_device = init_device
-        self.logit_scale = logit_scale
-        self.no_bias = no_bias
-        self.verbose = verbose
-        self.param_init_fn = param_init_fn
-        self.init_div_is_residual = init_div_is_residual
-        self.init_std = init_std
-        self.emb_init_std = emb_init_std
-        self.emb_init_uniform_lim = emb_init_uniform_lim
-        self.init_std = init_std
-        self.init_gain = init_gain
-        self.fan_mode = fan_mode
-        self.init_nonlinearity = init_nonlinearity
-        self.embedding_fraction = embedding_fraction
-        self.low_precision_layernorm = low_precision_layernorm
-        self.use_cache = use_cache
-        if 'name' in kwargs:
-            del kwargs['name']
-        if 'loss_fn' in kwargs:
-            del kwargs['loss_fn']
-        super().__init__(**kwargs)
-
-        self._validate_config()
-
-    def _validate_config(self):
-        if self.d_model % self.n_heads != 0:
-            raise ValueError('d_model must be divisible by n_heads')
-        if any(prob < 0 or prob > 1
-               for prob in [self.attn_pdrop, self.resid_pdrop, self.emb_pdrop]):
-            raise ValueError(
-                'attn_pdrop, resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1'
-            )
-        if self.attn_impl not in ['torch', 'flash', 'triton']:
-            raise ValueError(f'Unknown attn_impl={self.attn_impl}')
-        if self.prefix_lm and self.attn_impl not in ['torch', 'triton']:
-            raise NotImplementedError(
-                'prefix_lm only implemented with torch and triton attention.')
-        if self.alibi and self.attn_impl not in ['torch', 'triton']:
-            raise NotImplementedError(
-                'alibi only implemented with torch and triton attention.')
-        if self.attn_uses_sequence_id and self.attn_impl not in [
-                'torch', 'triton'
-        ]:
-            raise NotImplementedError(
-                'attn_uses_sequence_id only implemented with torch and triton attention.'
-            )
-        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
-            raise ValueError(
-                'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'
-            )
-        if isinstance(self.logit_scale,
-                      str) and self.logit_scale != 'inv_sqrt_d_model':
-            raise ValueError(
-                f"{self.logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
-            )
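
For context, the class removed here is a standard HuggingFace PretrainedConfig subclass. Below is a minimal usage sketch (not part of the commit) of how it could be exercised before this deletion, assuming configuration_replit_lm.py is still importable from the working directory; the import path and the example values are assumptions, while the defaults and validation behaviour mirror the code shown in the diff above.

# Minimal usage sketch (assumption: configuration_replit_lm.py is available locally).
from configuration_replit_lm import ReplitLMConfig

# Defaults mirror the __init__ signature in the diff:
# d_model=2048, n_heads=16, n_layers=24, max_seq_len=2048, vocab_size=50368.
cfg = ReplitLMConfig()
print(cfg.model_type)              # 'replit_lm'
print(cfg.d_model // cfg.n_heads)  # 128 -- the per-head dimension

# _validate_config() runs inside __init__ and rejects inconsistent settings,
# for example a d_model that is not divisible by n_heads.
try:
    ReplitLMConfig(d_model=2050, n_heads=16)
except ValueError as err:
    print(err)  # 'd_model must be divisible by n_heads'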