diwank committed
Commit 3175fd2
1 Parent(s): d518181

Upload folder using huggingface_hub
attention.py CHANGED
@@ -7,7 +7,8 @@ import torch.nn as nn
7
  from einops import rearrange
8
  from packaging import version
9
  from torch import nn
10
- from .norm import LPLayerNorm
 
11
 
12
  def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
13
  if original_is_causal and num_query_tokens != num_key_tokens:
@@ -46,7 +47,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_
46
  attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
47
  if is_causal and (not q.size(2) == 1):
48
  s = max(s_q, s_k)
49
- causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
50
  causal_mask = causal_mask.tril()
51
  causal_mask = causal_mask.to(torch.bool)
52
  causal_mask = ~causal_mask
@@ -141,8 +142,8 @@ def triton_flash_attn_fn(query, key, value, n_heads, past_key_value=None, softma
141
  key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
142
  value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
143
  if multiquery:
144
- key = key.expand(*key.shape[:2], n_heads, key.size(-1))
145
- value = value.expand(*value.shape[:2], n_heads, value.size(-1))
146
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
147
  attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
148
  output = attn_output.view(*attn_output.shape[:2], -1)
@@ -155,7 +156,7 @@ class MultiheadAttention(nn.Module):
155
  additive bias.
156
  """
157
 
158
- def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
159
  super().__init__()
160
  self.attn_impl = attn_impl
161
  self.clip_qkv = clip_qkv
@@ -166,13 +167,16 @@ class MultiheadAttention(nn.Module):
166
  if self.softmax_scale is None:
167
  self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
168
  self.attn_dropout_p = attn_pdrop
169
- self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
170
  fuse_splits = (d_model, 2 * d_model)
171
  self.Wqkv._fused = (0, fuse_splits)
172
  if self.qk_ln:
173
- layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
174
- self.q_ln = layernorm_class(self.d_model, device=device)
175
- self.k_ln = layernorm_class(self.d_model, device=device)
176
  if self.attn_impl == 'flash':
177
  self.attn_fn = flash_attn_fn
178
  elif self.attn_impl == 'triton':
@@ -185,13 +189,13 @@ class MultiheadAttention(nn.Module):
185
  warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
186
  else:
187
  raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
188
- self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
189
  self.out_proj._is_residual = True
190
 
191
  def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
192
  qkv = self.Wqkv(x)
193
  if self.clip_qkv:
194
- qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
195
  (query, key, value) = qkv.chunk(3, dim=2)
196
  key_padding_mask = attention_mask
197
  if self.qk_ln:
@@ -208,7 +212,7 @@ class MultiQueryAttention(nn.Module):
208
  additive bias.
209
  """
210
 
211
- def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
212
  super().__init__()
213
  self.attn_impl = attn_impl
214
  self.clip_qkv = clip_qkv
@@ -220,13 +224,16 @@ class MultiQueryAttention(nn.Module):
220
  if self.softmax_scale is None:
221
  self.softmax_scale = 1 / math.sqrt(self.head_dim)
222
  self.attn_dropout_p = attn_pdrop
223
- self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
224
  fuse_splits = (d_model, d_model + self.head_dim)
225
  self.Wqkv._fused = (0, fuse_splits)
226
  if self.qk_ln:
227
- layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
228
- self.q_ln = layernorm_class(d_model, device=device)
229
- self.k_ln = layernorm_class(self.head_dim, device=device)
230
  if self.attn_impl == 'flash':
231
  self.attn_fn = flash_attn_fn
232
  elif self.attn_impl == 'triton':
@@ -239,13 +246,13 @@ class MultiQueryAttention(nn.Module):
239
  warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
240
  else:
241
  raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
242
- self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
243
  self.out_proj._is_residual = True
244
 
245
  def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
246
  qkv = self.Wqkv(x)
247
  if self.clip_qkv:
248
- qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
249
  (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
250
  key_padding_mask = attention_mask
251
  if self.qk_ln:
 
7
  from einops import rearrange
8
  from packaging import version
9
  from torch import nn
10
+ from .fc import FC_CLASS_REGISTRY
11
+ from .norm import NORM_CLASS_REGISTRY
12
 
13
  def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
14
  if original_is_causal and num_query_tokens != num_key_tokens:
 
47
  attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
48
  if is_causal and (not q.size(2) == 1):
49
  s = max(s_q, s_k)
50
+ causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32)
51
  causal_mask = causal_mask.tril()
52
  causal_mask = causal_mask.to(torch.bool)
53
  causal_mask = ~causal_mask
 
142
  key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
143
  value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
144
  if multiquery:
145
+ key = key.repeat(1, 1, n_heads, 1)
146
+ value = value.repeat(1, 1, n_heads, 1)
147
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
148
  attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
149
  output = attn_output.view(*attn_output.shape[:2], -1)
 
156
  additive bias.
157
  """
158
 
159
+ def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', verbose: int=0, device: Optional[str]=None):
160
  super().__init__()
161
  self.attn_impl = attn_impl
162
  self.clip_qkv = clip_qkv
 
167
  if self.softmax_scale is None:
168
  self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
169
  self.attn_dropout_p = attn_pdrop
170
+ fc_kwargs = {}
171
+ if fc_type != 'te':
172
+ fc_kwargs['device'] = device
173
+ self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, 3 * self.d_model, **fc_kwargs)
174
  fuse_splits = (d_model, 2 * d_model)
175
  self.Wqkv._fused = (0, fuse_splits)
176
  if self.qk_ln:
177
+ norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
178
+ self.q_ln = norm_class(self.d_model, device=device)
179
+ self.k_ln = norm_class(self.d_model, device=device)
180
  if self.attn_impl == 'flash':
181
  self.attn_fn = flash_attn_fn
182
  elif self.attn_impl == 'triton':
 
189
  warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
190
  else:
191
  raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
192
+ self.out_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs)
193
  self.out_proj._is_residual = True
194
 
195
  def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
196
  qkv = self.Wqkv(x)
197
  if self.clip_qkv:
198
+ qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
199
  (query, key, value) = qkv.chunk(3, dim=2)
200
  key_padding_mask = attention_mask
201
  if self.qk_ln:
 
212
  additive bias.
213
  """
214
 
215
+ def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', verbose: int=0, device: Optional[str]=None):
216
  super().__init__()
217
  self.attn_impl = attn_impl
218
  self.clip_qkv = clip_qkv
 
224
  if self.softmax_scale is None:
225
  self.softmax_scale = 1 / math.sqrt(self.head_dim)
226
  self.attn_dropout_p = attn_pdrop
227
+ fc_kwargs = {}
228
+ if fc_type != 'te':
229
+ fc_kwargs['device'] = device
230
+ self.Wqkv = FC_CLASS_REGISTRY[fc_type](d_model, d_model + 2 * self.head_dim, **fc_kwargs)
231
  fuse_splits = (d_model, d_model + self.head_dim)
232
  self.Wqkv._fused = (0, fuse_splits)
233
  if self.qk_ln:
234
+ norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
235
+ self.q_ln = norm_class(d_model, device=device)
236
+ self.k_ln = norm_class(self.head_dim, device=device)
237
  if self.attn_impl == 'flash':
238
  self.attn_fn = flash_attn_fn
239
  elif self.attn_impl == 'triton':
 
246
  warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
247
  else:
248
  raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
249
+ self.out_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs)
250
  self.out_proj._is_residual = True
251
 
252
  def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
253
  qkv = self.Wqkv(x)
254
  if self.clip_qkv:
255
+ qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
256
  (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
257
  key_padding_mask = attention_mask
258
  if self.qk_ln:
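
The attention.py changes above swap the hard-coded nn.Linear and LPLayerNorm layers for lookups into FC_CLASS_REGISTRY and NORM_CLASS_REGISTRY, and only forward the device kwarg when fc_type is not 'te'. Below is a minimal standalone sketch of that construction pattern, not the file itself; the registries are stubbed with only the plain torch classes, whereas the real fc.py and norm.py also register 'te' and low-precision variants.

    # Sketch under the assumption of torch-only registries.
    from typing import Optional
    import torch.nn as nn

    FC_CLASS_REGISTRY = {'torch': nn.Linear}
    NORM_CLASS_REGISTRY = {'layernorm': nn.LayerNorm}

    def build_qkv(d_model: int, fc_type: str = 'torch', norm_type: str = 'layernorm',
                  qk_ln: bool = False, device: Optional[str] = None):
        fc_kwargs = {}
        if fc_type != 'te':
            fc_kwargs['device'] = device  # device is only passed to torch layers, mirroring the diff
        Wqkv = FC_CLASS_REGISTRY[fc_type](d_model, 3 * d_model, **fc_kwargs)
        q_ln = k_ln = None
        if qk_ln:
            norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
            q_ln = norm_class(d_model, device=device)
            k_ln = norm_class(d_model, device=device)
        return Wqkv, q_ln, k_ln

    Wqkv, q_ln, k_ln = build_qkv(256, qk_ln=True)  # torch backend, CPU
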
blocks.py CHANGED
@@ -3,31 +3,23 @@ from typing import Dict, Optional, Tuple
3
  import torch
4
  import torch.nn as nn
5
  from .attention import ATTN_CLASS_REGISTRY
 
 
6
  from .norm import NORM_CLASS_REGISTRY
7
 
8
- class MPTMLP(nn.Module):
9
-
10
- def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
11
- super().__init__()
12
- self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
13
- self.act = nn.GELU(approximate='none')
14
- self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
15
- self.down_proj._is_residual = True
16
-
17
- def forward(self, x):
18
- return self.down_proj(self.act(self.up_proj(x)))
19
-
20
  class MPTBlock(nn.Module):
21
 
22
- def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, device: Optional[str]=None, **kwargs):
23
  del kwargs
24
  super().__init__()
25
  norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
26
  attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
27
  self.norm_1 = norm_class(d_model, device=device)
28
- self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, verbose=verbose, device=device)
29
- self.norm_2 = norm_class(d_model, device=device)
30
- self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
 
 
31
  self.resid_attn_dropout = nn.Dropout(resid_pdrop)
32
  self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
33
 
@@ -35,7 +27,9 @@ class MPTBlock(nn.Module):
35
  a = self.norm_1(x)
36
  (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
37
  x = x + self.resid_attn_dropout(b)
38
- m = self.norm_2(x)
 
 
39
  n = self.ffn(m)
40
  x = x + self.resid_ffn_dropout(n)
41
  return (x, attn_weights, past_key_value)
 
3
  import torch
4
  import torch.nn as nn
5
  from .attention import ATTN_CLASS_REGISTRY
6
+ from .fc import FC_CLASS_REGISTRY
7
+ from .ffn import FFN_CLASS_REGISTRY, build_ffn
8
  from .norm import NORM_CLASS_REGISTRY
9

10
  class MPTBlock(nn.Module):
11
 
12
+ def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, ffn_config: Dict={'ffn_type': 'mptmlp'}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, fc_type: str='torch', device: Optional[str]=None, **kwargs):
13
  del kwargs
14
  super().__init__()
15
  norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
16
  attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
17
  self.norm_1 = norm_class(d_model, device=device)
18
+ self.attn = attn_class(d_model=d_model, n_heads=n_heads, attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], norm_type=norm_type, fc_type=fc_type, verbose=verbose, device=device)
19
+ self.norm_2 = None
20
+ if not getattr(FFN_CLASS_REGISTRY[ffn_config['ffn_type']], '_has_norm', False):
21
+ self.norm_2 = norm_class(d_model, device=device)
22
+ self.ffn = build_ffn(d_model=d_model, expansion_ratio=expansion_ratio, device=device, **ffn_config)
23
  self.resid_attn_dropout = nn.Dropout(resid_pdrop)
24
  self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
25
 
 
27
  a = self.norm_1(x)
28
  (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
29
  x = x + self.resid_attn_dropout(b)
30
+ m = x
31
+ if self.norm_2 is not None:
32
+ m = self.norm_2(x)
33
  n = self.ffn(m)
34
  x = x + self.resid_ffn_dropout(n)
35
  return (x, attn_weights, past_key_value)
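
The reworked MPTBlock above makes the pre-FFN norm optional: norm_2 is only built when the chosen FFN class does not carry its own norm (te.LayerNormMLP is tagged with _has_norm = True in ffn.py). A toy sketch of the resulting forward path, with simplified stand-in modules rather than the real block internals:

    import torch
    import torch.nn as nn

    class ToyBlock(nn.Module):
        """Stand-in for the FFN half of MPTBlock; not the real class."""
        def __init__(self, d_model: int, ffn_has_norm: bool):
            super().__init__()
            # skip the external norm when the FFN normalizes internally (the _has_norm case)
            self.norm_2 = None if ffn_has_norm else nn.LayerNorm(d_model)
            self.ffn = nn.Sequential(nn.Linear(d_model, 4 * d_model),
                                     nn.GELU(approximate='none'),
                                     nn.Linear(4 * d_model, d_model))

        def forward(self, x):
            m = x
            if self.norm_2 is not None:
                m = self.norm_2(x)
            return x + self.ffn(m)

    y = ToyBlock(64, ffn_has_norm=False)(torch.randn(2, 8, 64))
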
config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "_name_or_path": "./mpt-30b-orca-hf/",
3
  "architectures": [
4
  "MPTForCausalLM"
5
  ],
@@ -23,6 +22,11 @@
23
  "emb_pdrop": 0,
24
  "embedding_fraction": 1.0,
25
  "expansion_ratio": 4,
26
  "init_config": {
27
  "emb_init_std": null,
28
  "emb_init_uniform_lim": null,
@@ -45,7 +49,7 @@
45
  "norm_type": "low_precision_layernorm",
46
  "resid_pdrop": 0,
47
  "tokenizer_name": "sam-mosaic/gpt-neox-20b-chatml",
48
- "torch_dtype": "bfloat16",
49
  "transformers_version": "4.30.2",
50
  "use_cache": false,
51
  "verbose": 0,
 
1
  {
 
2
  "architectures": [
3
  "MPTForCausalLM"
4
  ],
 
22
  "emb_pdrop": 0,
23
  "embedding_fraction": 1.0,
24
  "expansion_ratio": 4,
25
+ "fc_type": "torch",
26
+ "ffn_config": {
27
+ "fc_type": "torch",
28
+ "ffn_type": "mptmlp"
29
+ },
30
  "init_config": {
31
  "emb_init_std": null,
32
  "emb_init_uniform_lim": null,
 
49
  "norm_type": "low_precision_layernorm",
50
  "resid_pdrop": 0,
51
  "tokenizer_name": "sam-mosaic/gpt-neox-20b-chatml",
52
+ "torch_dtype": "float32",
53
  "transformers_version": "4.30.2",
54
  "use_cache": false,
55
  "verbose": 0,
configuration_mpt.py CHANGED
@@ -1,26 +1,28 @@
1
  """A HuggingFace-style model configuration."""
 
2
  from typing import Dict, Optional, Union
3
  from transformers import PretrainedConfig
4
  attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
 
5
  init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
6
 
7
  class MPTConfig(PretrainedConfig):
8
  model_type = 'mpt'
9
 
10
- def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
11
  """The MPT configuration class.
12
 
13
  Args:
14
  d_model (int): The size of the embedding dimension of the model.
15
  n_heads (int): The number of attention heads.
16
  n_layers (int): The number of layers in the model.
17
- expansion_ratio (int): The ratio of the up/down scale in the MLP.
18
  max_seq_len (int): The maximum sequence length of the model.
19
  vocab_size (int): The size of the vocabulary.
20
  resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
21
  emb_pdrop (float): The dropout probability for the embedding layer.
22
  learned_pos_emb (bool): Whether to use learned positional embeddings
23
- attn_config (Dict): A dictionary used to configure the model's attention module:
24
  attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
25
  attn_pdrop (float): The dropout probability for the attention layers.
26
  attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
@@ -38,6 +40,8 @@ class MPTConfig(PretrainedConfig):
38
  Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
39
  alibi (bool): Whether to use the alibi bias instead of position embeddings.
40
  alibi_bias_max (int): The maximum value of the alibi bias.
 
 
41
  init_device (str): The device to use for parameter initialization.
42
  logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
43
  no_bias (bool): Whether to use bias in all layers.
@@ -61,6 +65,7 @@ class MPTConfig(PretrainedConfig):
61
  init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
62
  ---
63
  See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
 
64
  """
65
  self.d_model = d_model
66
  self.n_heads = n_heads
@@ -72,6 +77,7 @@ class MPTConfig(PretrainedConfig):
72
  self.emb_pdrop = emb_pdrop
73
  self.learned_pos_emb = learned_pos_emb
74
  self.attn_config = attn_config
 
75
  self.init_device = init_device
76
  self.logit_scale = logit_scale
77
  self.no_bias = no_bias
@@ -80,6 +86,7 @@ class MPTConfig(PretrainedConfig):
80
  self.norm_type = norm_type
81
  self.use_cache = use_cache
82
  self.init_config = init_config
 
83
  if 'name' in kwargs:
84
  del kwargs['name']
85
  if 'loss_fn' in kwargs:
@@ -95,6 +102,7 @@ class MPTConfig(PretrainedConfig):
95
 
96
  def _validate_config(self):
97
  self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
 
98
  self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
99
  if self.d_model % self.n_heads != 0:
100
  raise ValueError('d_model must be divisible by n_heads')
@@ -115,4 +123,13 @@ class MPTConfig(PretrainedConfig):
115
  if self.init_config.get('name', None) is None:
116
  raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
117
  if not self.learned_pos_emb and (not self.attn_config['alibi']):
118
- raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.')
1
  """A HuggingFace-style model configuration."""
2
+ import warnings
3
  from typing import Dict, Optional, Union
4
  from transformers import PretrainedConfig
5
  attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
6
+ ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
7
  init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
8
 
9
  class MPTConfig(PretrainedConfig):
10
  model_type = 'mpt'
11
 
12
+ def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, ffn_config: Dict=ffn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, fc_type: str='torch', **kwargs):
13
  """The MPT configuration class.
14
 
15
  Args:
16
  d_model (int): The size of the embedding dimension of the model.
17
  n_heads (int): The number of attention heads.
18
  n_layers (int): The number of layers in the model.
19
+ expansion_ratio (int): The ratio of the up/down scale in the ffn.
20
  max_seq_len (int): The maximum sequence length of the model.
21
  vocab_size (int): The size of the vocabulary.
22
  resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
23
  emb_pdrop (float): The dropout probability for the embedding layer.
24
  learned_pos_emb (bool): Whether to use learned positional embeddings
25
+ attn_config (Dict): A dictionary used to configure the model's attention module:
26
  attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
27
  attn_pdrop (float): The dropout probability for the attention layers.
28
  attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
 
40
  Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
41
  alibi (bool): Whether to use the alibi bias instead of position embeddings.
42
  alibi_bias_max (int): The maximum value of the alibi bias.
43
+ ffn_config (Dict): A dictionary used to configure the model's ffn module:
44
+ ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
45
  init_device (str): The device to use for parameter initialization.
46
  logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
47
  no_bias (bool): Whether to use bias in all layers.
 
65
  init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
66
  ---
67
  See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
68
+ fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
69
  """
70
  self.d_model = d_model
71
  self.n_heads = n_heads
 
77
  self.emb_pdrop = emb_pdrop
78
  self.learned_pos_emb = learned_pos_emb
79
  self.attn_config = attn_config
80
+ self.ffn_config = ffn_config
81
  self.init_device = init_device
82
  self.logit_scale = logit_scale
83
  self.no_bias = no_bias
 
86
  self.norm_type = norm_type
87
  self.use_cache = use_cache
88
  self.init_config = init_config
89
+ self.fc_type = fc_type
90
  if 'name' in kwargs:
91
  del kwargs['name']
92
  if 'loss_fn' in kwargs:
 
102
 
103
  def _validate_config(self):
104
  self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
105
+ self.ffn_config = self._set_config_defaults(self.ffn_config, ffn_config_defaults)
106
  self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
107
  if self.d_model % self.n_heads != 0:
108
  raise ValueError('d_model must be divisible by n_heads')
 
123
  if self.init_config.get('name', None) is None:
124
  raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
125
  if not self.learned_pos_emb and (not self.attn_config['alibi']):
126
+ raise warnings.warn(f'Positional information not being provided to the model using either learned_pos_emb or alibi.')
127
+ if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
128
+ try:
129
+ import transformer_engine.pytorch as te
130
+ except:
131
+ raise ImportError('TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\npip install flash-attn==1.0.6 --no-build-isolation \npip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156')
132
+ if self.ffn_config['ffn_type'] == 'mptmlp':
133
+ self.ffn_config['fc_type'] = self.fc_type
134
+ elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
135
+ self.bias = not self.no_bias
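
The new validation logic above defaults ffn_config, gates the te paths on a successful transformer_engine import, and copies fc_type into ffn_config for the plain MLP. A standalone sketch of that flow (an illustrative helper, not the MPTConfig method itself):

    def validate_ffn(fc_type: str, ffn_config: dict) -> dict:
        ffn_config = {'ffn_type': 'mptmlp', **ffn_config}   # mirrors _set_config_defaults
        if fc_type == 'te' or ffn_config['ffn_type'] == 'te_ln_mlp':
            try:
                import transformer_engine.pytorch as te  # noqa: F401
            except ImportError as e:
                raise ImportError('`fc_type: te` requires TransformerEngine') from e
        if ffn_config['ffn_type'] == 'mptmlp':
            ffn_config['fc_type'] = fc_type                 # plain MLP reuses the global fc_type
        return ffn_config

    print(validate_ffn('torch', {}))   # {'ffn_type': 'mptmlp', 'fc_type': 'torch'}
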
fc.py ADDED
@@ -0,0 +1,7 @@
1
+ from torch import nn
2
+ FC_CLASS_REGISTRY = {'torch': nn.Linear}
3
+ try:
4
+ import transformer_engine.pytorch as te
5
+ FC_CLASS_REGISTRY['te'] = te.Linear
6
+ except:
7
+ pass
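
Usage sketch for the registry fc.py defines above; the 'te' key is only present when transformer_engine imports cleanly, and the `fc` import path assumes the module is importable by that name.

    from fc import FC_CLASS_REGISTRY

    fc_type = 'te' if 'te' in FC_CLASS_REGISTRY else 'torch'
    proj = FC_CLASS_REGISTRY[fc_type](1024, 4096)   # te.Linear or nn.Linear, same call shape
    print(type(proj).__name__)
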
ffn.py ADDED
@@ -0,0 +1,40 @@
1
+ """GPT Blocks used for the GPT Model."""
2
+ from typing import Optional
3
+ import torch
4
+ import torch.nn as nn
5
+ from .attention import ATTN_CLASS_REGISTRY
6
+ from .fc import FC_CLASS_REGISTRY
7
+ from .norm import NORM_CLASS_REGISTRY
8
+ try:
9
+ import transformer_engine.pytorch as te
10
+ except:
11
+ te = None
12
+
13
+ class MPTMLP(nn.Module):
14
+
15
+ def __init__(self, d_model: int, expansion_ratio: int, fc_type: str='torch', device: Optional[str]=None):
16
+ super().__init__()
17
+ fc_kwargs = {}
18
+ if fc_type != 'te':
19
+ fc_kwargs['device'] = device
20
+ self.up_proj = FC_CLASS_REGISTRY[fc_type](d_model, expansion_ratio * d_model, **fc_kwargs)
21
+ self.act = nn.GELU(approximate='none')
22
+ self.down_proj = FC_CLASS_REGISTRY[fc_type](expansion_ratio * d_model, d_model, **fc_kwargs)
23
+ self.down_proj._is_residual = True
24
+
25
+ def forward(self, x):
26
+ return self.down_proj(self.act(self.up_proj(x)))
27
+ FFN_CLASS_REGISTRY = {'mptmlp': MPTMLP}
28
+ if te is not None:
29
+ te.LayerNormMLP._has_norm = True
30
+ FFN_CLASS_REGISTRY['te_ln_mlp'] = te.LayerNormMLP
31
+
32
+ def build_ffn(d_model: int, expansion_ratio: int, fc_type: str='torch', device: Optional[str]=None, **kwargs):
33
+ ffn_type = kwargs.pop('ffn_type')
34
+ if ffn_type == 'mptmlp':
35
+ if kwargs is not None and len(kwargs) > 0:
36
+ raise ValueError(f'MPTMLP got an unexpected keyword argument: {kwargs}')
37
+ return MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, device=device)
38
+ elif ffn_type == 'te_ln_mlp':
39
+ return te.LayerNormMLP(hidden_size=d_model, ffn_hidden_size=d_model * expansion_ratio, **kwargs)
40
+ raise ValueError(f'ffn_type={ffn_type!r} not recognized.')
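
A hedged usage sketch of build_ffn as defined above, exercising only the torch mptmlp path (the te_ln_mlp branch forwards its kwargs straight to te.LayerNormMLP and needs transformer_engine installed); the `ffn` import assumes the module is importable by that name.

    import torch
    from ffn import build_ffn

    ffn = build_ffn(d_model=512, expansion_ratio=4, fc_type='torch',
                    device='cpu', ffn_type='mptmlp')
    out = ffn(torch.randn(2, 16, 512))   # (batch, seq, d_model) in, same shape out
    print(out.shape)
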
modeling_mpt.py CHANGED
@@ -13,12 +13,14 @@ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutpu
13
  from .attention import attn_bias_shape, build_attn_bias
14
  from .blocks import MPTBlock
15
  from .custom_embedding import SharedEmbedding
 
 
16
  from .norm import NORM_CLASS_REGISTRY
17
  from .configuration_mpt import MPTConfig
18
  from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
19
  from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
20
  from .meta_init_context import init_empty_weights
21
- from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
22
  try:
23
  from .flash_attn_triton import flash_attn_func
24
  except:
@@ -40,6 +42,7 @@ class MPTModel(MPTPreTrainedModel):
40
  self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
41
  self.alibi = config.attn_config['alibi']
42
  self.alibi_bias_max = config.attn_config['alibi_bias_max']
 
43
  if config.init_device == 'mixed':
44
  if dist.get_local_rank() == 0:
45
  config.init_device = 'cpu'
@@ -51,7 +54,7 @@ class MPTModel(MPTPreTrainedModel):
51
  norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
52
  self.embedding_fraction = config.embedding_fraction
53
  self.wte = SharedEmbedding(config.vocab_size, config.d_model, device=config.init_device)
54
- if not self.alibi:
55
  self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
56
  self.emb_drop = nn.Dropout(config.emb_pdrop)
57
  self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
@@ -80,7 +83,7 @@ class MPTModel(MPTPreTrainedModel):
80
  def get_input_embeddings(self):
81
  return self.wte
82
 
83
- def set_input_embeddings(self, value):
84
  self.wte = value
85
 
86
  @torch.no_grad()
@@ -166,9 +169,7 @@ class MPTModel(MPTPreTrainedModel):
166
  S = input_ids.size(1)
167
  assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
168
  tok_emb = self.wte(input_ids)
169
- if self.alibi:
170
- x = tok_emb
171
- else:
172
  past_position = 0
173
  if past_key_values is not None:
174
  if len(past_key_values) != self.config.n_layers:
@@ -183,6 +184,8 @@ class MPTModel(MPTPreTrainedModel):
183
  pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
184
  pos_emb = self.wpe(pos)
185
  x = tok_emb + pos_emb
 
 
186
  if self.embedding_fraction == 1:
187
  x = self.emb_drop(x)
188
  else:
@@ -228,7 +231,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
228
  if not config.tie_word_embeddings:
229
  raise ValueError('MPTForCausalLM only supports tied word embeddings')
230
  print(f'Instantiating an MPTForCausalLM model from {__file__}')
231
- self.transformer = MPTModel(config)
232
  for child in self.transformer.children():
233
  if isinstance(child, torch.nn.ModuleList):
234
  continue
@@ -275,9 +278,9 @@ class MPTForCausalLM(MPTPreTrainedModel):
275
  logits *= self.logit_scale
276
  loss = None
277
  if labels is not None:
278
- labels = torch.roll(labels, shifts=-1)
279
- labels[:, -1] = -100
280
- loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
281
  return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
282
 
283
  def param_init_fn(self, module):
 
13
  from .attention import attn_bias_shape, build_attn_bias
14
  from .blocks import MPTBlock
15
  from .custom_embedding import SharedEmbedding
16
+ from .fc import FC_CLASS_REGISTRY
17
+ from .ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
18
  from .norm import NORM_CLASS_REGISTRY
19
  from .configuration_mpt import MPTConfig
20
  from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
21
  from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
22
  from .meta_init_context import init_empty_weights
23
+ from .param_init_fns import generic_param_init_fn_, MODEL_INIT_REGISTRY
24
  try:
25
  from .flash_attn_triton import flash_attn_func
26
  except:
 
42
  self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
43
  self.alibi = config.attn_config['alibi']
44
  self.alibi_bias_max = config.attn_config['alibi_bias_max']
45
+ self.learned_pos_emb = config.learned_pos_emb
46
  if config.init_device == 'mixed':
47
  if dist.get_local_rank() == 0:
48
  config.init_device = 'cpu'
 
54
  norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
55
  self.embedding_fraction = config.embedding_fraction
56
  self.wte = SharedEmbedding(config.vocab_size, config.d_model, device=config.init_device)
57
+ if self.learned_pos_emb:
58
  self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
59
  self.emb_drop = nn.Dropout(config.emb_pdrop)
60
  self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
 
83
  def get_input_embeddings(self):
84
  return self.wte
85
 
86
+ def set_input_embeddings(self, value: nn.Embedding):
87
  self.wte = value
88
 
89
  @torch.no_grad()
 
169
  S = input_ids.size(1)
170
  assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
171
  tok_emb = self.wte(input_ids)
172
+ if self.learned_pos_emb:
 
 
173
  past_position = 0
174
  if past_key_values is not None:
175
  if len(past_key_values) != self.config.n_layers:
 
184
  pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
185
  pos_emb = self.wpe(pos)
186
  x = tok_emb + pos_emb
187
+ else:
188
+ x = tok_emb
189
  if self.embedding_fraction == 1:
190
  x = self.emb_drop(x)
191
  else:
 
231
  if not config.tie_word_embeddings:
232
  raise ValueError('MPTForCausalLM only supports tied word embeddings')
233
  print(f'Instantiating an MPTForCausalLM model from {__file__}')
234
+ self.transformer: MPTModel = MPTModel(config)
235
  for child in self.transformer.children():
236
  if isinstance(child, torch.nn.ModuleList):
237
  continue
 
278
  logits *= self.logit_scale
279
  loss = None
280
  if labels is not None:
281
+ _labels = torch.roll(labels, shifts=-1)
282
+ _labels[:, -1] = -100
283
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.to(logits.device).view(-1))
284
  return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
285
 
286
  def param_init_fn(self, module):
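
The loss computation in MPTForCausalLM now shifts a copy of the labels instead of mutating the caller's tensor: torch.roll returns a new tensor, so only `_labels` gets the -100 sentinel written into its last position. A minimal sketch of that shift with dummy tensors:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, 8, 32)            # (batch, seq, vocab)
    labels = torch.randint(0, 32, (2, 8))

    _labels = torch.roll(labels, shifts=-1)   # next-token targets; `labels` is left untouched
    _labels[:, -1] = -100                     # -100 is cross_entropy's default ignore_index
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.view(-1))
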
norm.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import torch
2
 
3
  def _cast_if_autocast_enabled(tensor):
@@ -53,4 +54,4 @@ class LPRMSNorm(RMSNorm):
53
  downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
54
  with torch.autocast(enabled=False, device_type=x.device.type):
55
  return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
56
- NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
 
1
+ from typing import Dict, Type
2
  import torch
3
 
4
  def _cast_if_autocast_enabled(tensor):
 
54
  downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
55
  with torch.autocast(enabled=False, device_type=x.device.type):
56
  return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
57
+ NORM_CLASS_REGISTRY: Dict[str, Type[torch.nn.Module]] = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
param_init_fns.py CHANGED
@@ -5,7 +5,12 @@ from functools import partial
5
  from typing import Optional, Tuple, Union
6
  import torch
7
  from torch import nn
 
8
  from .norm import NORM_CLASS_REGISTRY
9
 
10
  def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
11
  del kwargs
@@ -44,7 +49,7 @@ def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model:
44
  if init_div_is_residual is not False:
45
  if verbose > 1:
46
  warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')
47
- if isinstance(module, nn.Linear):
48
  if hasattr(module, '_fused'):
49
  fused_init_helper_(module, init_fn_)
50
  else:
@@ -114,6 +119,19 @@ def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model:
114
  module.out_proj.weight.div_(div_is_residual)
115
  if module.out_proj.bias is not None:
116
  torch.nn.init.zeros_(module.out_proj.bias)
117
  else:
118
  for _ in module.parameters(recurse=False):
119
  raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')
 
5
  from typing import Optional, Tuple, Union
6
  import torch
7
  from torch import nn
8
+ from .fc import FC_CLASS_REGISTRY
9
  from .norm import NORM_CLASS_REGISTRY
10
+ try:
11
+ import transformer_engine.pytorch as te
12
+ except:
13
+ te = None
14
 
15
  def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
16
  del kwargs
 
49
  if init_div_is_residual is not False:
50
  if verbose > 1:
51
  warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')
52
+ if isinstance(module, tuple(set(FC_CLASS_REGISTRY.values()))):
53
  if hasattr(module, '_fused'):
54
  fused_init_helper_(module, init_fn_)
55
  else:
 
119
  module.out_proj.weight.div_(div_is_residual)
120
  if module.out_proj.bias is not None:
121
  torch.nn.init.zeros_(module.out_proj.bias)
122
+ elif te is not None and isinstance(module, te.LayerNormMLP):
123
+ if module.layer_norm_weight is not None:
124
+ torch.nn.init.ones_(module.layer_norm_weight)
125
+ if module.layer_norm_bias is not None:
126
+ torch.nn.init.zeros_(module.layer_norm_bias)
127
+ init_fn_(module.fc1_weight)
128
+ if module.fc1_bias is not None:
129
+ torch.nn.init.zeros_(module.fc1_bias)
130
+ init_fn_(module.fc2_weight)
131
+ if module.fc2_bias is not None:
132
+ torch.nn.init.zeros_(module.fc2_bias)
133
+ with torch.no_grad():
134
+ module.fc2_weight.div_(div_is_residual)
135
  else:
136
  for _ in module.parameters(recurse=False):
137
  raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')
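
The widened init dispatch above accepts any class registered in FC_CLASS_REGISTRY (and, when transformer_engine is present, handles te.LayerNormMLP separately). A torch-only sketch of the isinstance check it uses; the `fc` import assumes the module is importable by that name.

    import torch.nn as nn
    from fc import FC_CLASS_REGISTRY

    module = nn.Linear(64, 64)
    if isinstance(module, tuple(set(FC_CLASS_REGISTRY.values()))):
        nn.init.kaiming_normal_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
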
pytorch_model-00001-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c51240f27be83417d07e38ba6ab0541ed5560611291bf403c3064a5f5b830889
3
+ size 9901940807
pytorch_model-00002-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15a418e63d400aa3e9ddd5c4fe64378eaeed49ca20f2d09a4507cfb6209de2d1
3
+ size 9865240711
pytorch_model-00003-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ec366585fe0612110b1251ad7706e9d0d4841657c702b4393d3df6ca9b0f32a
3
+ size 9865240711
pytorch_model-00004-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:853b9552f51a9c076cd9f043f958c6c78802d9512f01dfb9f852b456008ddfc2
3
+ size 9865240711
pytorch_model-00005-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f394f487f012f860fa6e63504aa4ffdae670101b845b966bfb43951bcfb4798f
3
+ size 9865240711
pytorch_model-00006-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d07e4cabbbeeb55b072dab24d8b152a36fbc0ec5021e4211d584f871fea4791
3
+ size 9865240711
pytorch_model-00007-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c2c9d32c24b95b3d674e1d9a43688f4aed426dd1bd54b6038c63d3c0baf282d
3
+ size 9865240711
pytorch_model-00008-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d80b8726b77a8c55a88ca09bf74f85028f6c0bfe93890942da37f44336fcac3
3
+ size 9865240711
pytorch_model-00009-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f18fa796abea986040439eb79a6c8e40da7214b5bbb873cad047c09e30a954b5
3
+ size 9865240711
pytorch_model-00010-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:032ac755ca09655de532d5b2df927e79e8143c102ccf2bec9da54d64b0e51708
3
+ size 9865240711
pytorch_model-00011-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eaf5276708ad788f6f8e1a73f77b36f54a4413ff1277b1c72cdf8c999d286fea
3
+ size 9865240711
pytorch_model-00012-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46ee5a0ab0d96c7b904be86d8f2f7afae664a3cfd9bfaa3cca08ea3419dd7273
3
+ size 9865240711
pytorch_model-00013-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a2f8577fb3fe785e377e032b886d5aa44d5a4786a128d478a66b7d6291369d6
3
+ size 1644197388
pytorch_model.bin.index.json CHANGED
@@ -1,297 +1,298 @@
1
  {
2
  "metadata": {
3
- "total_size": 59914401792
4
  },
5
  "weight_map": {
6
- "transformer.blocks.0.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
7
- "transformer.blocks.0.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
8
- "transformer.blocks.0.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
9
- "transformer.blocks.0.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
10
- "transformer.blocks.0.norm_1.weight": "pytorch_model-00001-of-00007.bin",
11
- "transformer.blocks.0.norm_2.weight": "pytorch_model-00001-of-00007.bin",
12
- "transformer.blocks.1.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
13
- "transformer.blocks.1.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
14
- "transformer.blocks.1.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
15
- "transformer.blocks.1.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
16
- "transformer.blocks.1.norm_1.weight": "pytorch_model-00001-of-00007.bin",
17
- "transformer.blocks.1.norm_2.weight": "pytorch_model-00001-of-00007.bin",
18
- "transformer.blocks.10.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
19
- "transformer.blocks.10.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
20
- "transformer.blocks.10.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
21
- "transformer.blocks.10.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
22
- "transformer.blocks.10.norm_1.weight": "pytorch_model-00002-of-00007.bin",
23
- "transformer.blocks.10.norm_2.weight": "pytorch_model-00002-of-00007.bin",
24
- "transformer.blocks.11.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
25
- "transformer.blocks.11.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
26
- "transformer.blocks.11.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
27
- "transformer.blocks.11.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
28
- "transformer.blocks.11.norm_1.weight": "pytorch_model-00002-of-00007.bin",
29
- "transformer.blocks.11.norm_2.weight": "pytorch_model-00002-of-00007.bin",
30
- "transformer.blocks.12.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
31
- "transformer.blocks.12.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
32
- "transformer.blocks.12.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
33
- "transformer.blocks.12.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
34
- "transformer.blocks.12.norm_1.weight": "pytorch_model-00002-of-00007.bin",
35
- "transformer.blocks.12.norm_2.weight": "pytorch_model-00002-of-00007.bin",
36
- "transformer.blocks.13.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
37
- "transformer.blocks.13.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
38
- "transformer.blocks.13.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
39
- "transformer.blocks.13.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
40
- "transformer.blocks.13.norm_1.weight": "pytorch_model-00002-of-00007.bin",
41
- "transformer.blocks.13.norm_2.weight": "pytorch_model-00002-of-00007.bin",
42
- "transformer.blocks.14.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
43
- "transformer.blocks.14.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
44
- "transformer.blocks.14.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
45
- "transformer.blocks.14.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
46
- "transformer.blocks.14.norm_1.weight": "pytorch_model-00002-of-00007.bin",
47
- "transformer.blocks.14.norm_2.weight": "pytorch_model-00002-of-00007.bin",
48
- "transformer.blocks.15.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
49
- "transformer.blocks.15.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
50
- "transformer.blocks.15.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
51
- "transformer.blocks.15.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
52
- "transformer.blocks.15.norm_1.weight": "pytorch_model-00002-of-00007.bin",
53
- "transformer.blocks.15.norm_2.weight": "pytorch_model-00002-of-00007.bin",
54
- "transformer.blocks.16.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
55
- "transformer.blocks.16.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
56
- "transformer.blocks.16.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
57
- "transformer.blocks.16.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
58
- "transformer.blocks.16.norm_1.weight": "pytorch_model-00003-of-00007.bin",
59
- "transformer.blocks.16.norm_2.weight": "pytorch_model-00003-of-00007.bin",
60
- "transformer.blocks.17.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
61
- "transformer.blocks.17.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
62
- "transformer.blocks.17.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
63
- "transformer.blocks.17.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
64
- "transformer.blocks.17.norm_1.weight": "pytorch_model-00003-of-00007.bin",
65
- "transformer.blocks.17.norm_2.weight": "pytorch_model-00003-of-00007.bin",
66
- "transformer.blocks.18.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
67
- "transformer.blocks.18.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
68
- "transformer.blocks.18.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
69
- "transformer.blocks.18.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
70
- "transformer.blocks.18.norm_1.weight": "pytorch_model-00003-of-00007.bin",
71
- "transformer.blocks.18.norm_2.weight": "pytorch_model-00003-of-00007.bin",
72
- "transformer.blocks.19.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
73
- "transformer.blocks.19.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
74
- "transformer.blocks.19.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
75
- "transformer.blocks.19.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
76
- "transformer.blocks.19.norm_1.weight": "pytorch_model-00003-of-00007.bin",
77
- "transformer.blocks.19.norm_2.weight": "pytorch_model-00003-of-00007.bin",
78
- "transformer.blocks.2.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
79
- "transformer.blocks.2.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
80
- "transformer.blocks.2.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
81
- "transformer.blocks.2.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
82
- "transformer.blocks.2.norm_1.weight": "pytorch_model-00001-of-00007.bin",
83
- "transformer.blocks.2.norm_2.weight": "pytorch_model-00001-of-00007.bin",
84
- "transformer.blocks.20.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
85
- "transformer.blocks.20.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
86
- "transformer.blocks.20.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
87
- "transformer.blocks.20.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
88
- "transformer.blocks.20.norm_1.weight": "pytorch_model-00003-of-00007.bin",
89
- "transformer.blocks.20.norm_2.weight": "pytorch_model-00003-of-00007.bin",
90
- "transformer.blocks.21.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
91
- "transformer.blocks.21.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
92
- "transformer.blocks.21.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
93
- "transformer.blocks.21.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
94
- "transformer.blocks.21.norm_1.weight": "pytorch_model-00003-of-00007.bin",
95
- "transformer.blocks.21.norm_2.weight": "pytorch_model-00003-of-00007.bin",
96
- "transformer.blocks.22.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
97
- "transformer.blocks.22.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
98
- "transformer.blocks.22.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
99
- "transformer.blocks.22.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
100
- "transformer.blocks.22.norm_1.weight": "pytorch_model-00003-of-00007.bin",
101
- "transformer.blocks.22.norm_2.weight": "pytorch_model-00003-of-00007.bin",
102
- "transformer.blocks.23.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
103
- "transformer.blocks.23.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
104
- "transformer.blocks.23.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
105
- "transformer.blocks.23.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
106
- "transformer.blocks.23.norm_1.weight": "pytorch_model-00003-of-00007.bin",
107
- "transformer.blocks.23.norm_2.weight": "pytorch_model-00003-of-00007.bin",
108
- "transformer.blocks.24.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
109
- "transformer.blocks.24.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
110
- "transformer.blocks.24.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
111
- "transformer.blocks.24.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
112
- "transformer.blocks.24.norm_1.weight": "pytorch_model-00004-of-00007.bin",
113
- "transformer.blocks.24.norm_2.weight": "pytorch_model-00004-of-00007.bin",
114
- "transformer.blocks.25.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
115
- "transformer.blocks.25.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
116
- "transformer.blocks.25.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
117
- "transformer.blocks.25.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
118
- "transformer.blocks.25.norm_1.weight": "pytorch_model-00004-of-00007.bin",
119
- "transformer.blocks.25.norm_2.weight": "pytorch_model-00004-of-00007.bin",
120
- "transformer.blocks.26.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
121
- "transformer.blocks.26.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
122
- "transformer.blocks.26.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
123
- "transformer.blocks.26.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
124
- "transformer.blocks.26.norm_1.weight": "pytorch_model-00004-of-00007.bin",
125
- "transformer.blocks.26.norm_2.weight": "pytorch_model-00004-of-00007.bin",
126
- "transformer.blocks.27.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
127
- "transformer.blocks.27.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
128
- "transformer.blocks.27.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
129
- "transformer.blocks.27.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
130
- "transformer.blocks.27.norm_1.weight": "pytorch_model-00004-of-00007.bin",
131
- "transformer.blocks.27.norm_2.weight": "pytorch_model-00004-of-00007.bin",
132
- "transformer.blocks.28.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
133
- "transformer.blocks.28.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
134
- "transformer.blocks.28.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
135
- "transformer.blocks.28.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
136
- "transformer.blocks.28.norm_1.weight": "pytorch_model-00004-of-00007.bin",
137
- "transformer.blocks.28.norm_2.weight": "pytorch_model-00004-of-00007.bin",
138
- "transformer.blocks.29.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
139
- "transformer.blocks.29.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
140
- "transformer.blocks.29.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
141
- "transformer.blocks.29.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
142
- "transformer.blocks.29.norm_1.weight": "pytorch_model-00004-of-00007.bin",
143
- "transformer.blocks.29.norm_2.weight": "pytorch_model-00004-of-00007.bin",
144
- "transformer.blocks.3.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
145
- "transformer.blocks.3.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
146
- "transformer.blocks.3.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
147
- "transformer.blocks.3.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
148
- "transformer.blocks.3.norm_1.weight": "pytorch_model-00001-of-00007.bin",
149
- "transformer.blocks.3.norm_2.weight": "pytorch_model-00001-of-00007.bin",
150
- "transformer.blocks.30.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
151
- "transformer.blocks.30.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
152
- "transformer.blocks.30.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
153
- "transformer.blocks.30.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
154
- "transformer.blocks.30.norm_1.weight": "pytorch_model-00004-of-00007.bin",
155
- "transformer.blocks.30.norm_2.weight": "pytorch_model-00004-of-00007.bin",
156
- "transformer.blocks.31.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
157
- "transformer.blocks.31.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
158
- "transformer.blocks.31.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
159
- "transformer.blocks.31.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
160
- "transformer.blocks.31.norm_1.weight": "pytorch_model-00004-of-00007.bin",
161
- "transformer.blocks.31.norm_2.weight": "pytorch_model-00004-of-00007.bin",
162
- "transformer.blocks.32.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
163
- "transformer.blocks.32.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
164
- "transformer.blocks.32.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
165
- "transformer.blocks.32.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
166
- "transformer.blocks.32.norm_1.weight": "pytorch_model-00005-of-00007.bin",
167
- "transformer.blocks.32.norm_2.weight": "pytorch_model-00005-of-00007.bin",
168
- "transformer.blocks.33.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
169
- "transformer.blocks.33.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
170
- "transformer.blocks.33.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
171
- "transformer.blocks.33.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
172
- "transformer.blocks.33.norm_1.weight": "pytorch_model-00005-of-00007.bin",
173
- "transformer.blocks.33.norm_2.weight": "pytorch_model-00005-of-00007.bin",
174
- "transformer.blocks.34.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
175
- "transformer.blocks.34.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
176
- "transformer.blocks.34.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
177
- "transformer.blocks.34.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
178
- "transformer.blocks.34.norm_1.weight": "pytorch_model-00005-of-00007.bin",
179
- "transformer.blocks.34.norm_2.weight": "pytorch_model-00005-of-00007.bin",
180
- "transformer.blocks.35.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
181
- "transformer.blocks.35.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
182
- "transformer.blocks.35.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
183
- "transformer.blocks.35.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
184
- "transformer.blocks.35.norm_1.weight": "pytorch_model-00005-of-00007.bin",
185
- "transformer.blocks.35.norm_2.weight": "pytorch_model-00005-of-00007.bin",
186
- "transformer.blocks.36.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
187
- "transformer.blocks.36.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
188
- "transformer.blocks.36.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
189
- "transformer.blocks.36.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
190
- "transformer.blocks.36.norm_1.weight": "pytorch_model-00005-of-00007.bin",
191
- "transformer.blocks.36.norm_2.weight": "pytorch_model-00005-of-00007.bin",
192
- "transformer.blocks.37.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
193
- "transformer.blocks.37.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
194
- "transformer.blocks.37.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
195
- "transformer.blocks.37.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
196
- "transformer.blocks.37.norm_1.weight": "pytorch_model-00005-of-00007.bin",
197
- "transformer.blocks.37.norm_2.weight": "pytorch_model-00005-of-00007.bin",
198
- "transformer.blocks.38.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
199
- "transformer.blocks.38.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
200
- "transformer.blocks.38.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
201
- "transformer.blocks.38.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
202
- "transformer.blocks.38.norm_1.weight": "pytorch_model-00005-of-00007.bin",
203
- "transformer.blocks.38.norm_2.weight": "pytorch_model-00005-of-00007.bin",
204
- "transformer.blocks.39.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
205
- "transformer.blocks.39.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
206
- "transformer.blocks.39.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
207
- "transformer.blocks.39.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
208
- "transformer.blocks.39.norm_1.weight": "pytorch_model-00005-of-00007.bin",
209
- "transformer.blocks.39.norm_2.weight": "pytorch_model-00005-of-00007.bin",
210
- "transformer.blocks.4.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
211
- "transformer.blocks.4.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
212
- "transformer.blocks.4.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
213
- "transformer.blocks.4.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
214
- "transformer.blocks.4.norm_1.weight": "pytorch_model-00001-of-00007.bin",
215
- "transformer.blocks.4.norm_2.weight": "pytorch_model-00001-of-00007.bin",
216
- "transformer.blocks.40.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
217
- "transformer.blocks.40.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
218
- "transformer.blocks.40.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
219
- "transformer.blocks.40.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
220
- "transformer.blocks.40.norm_1.weight": "pytorch_model-00006-of-00007.bin",
221
- "transformer.blocks.40.norm_2.weight": "pytorch_model-00006-of-00007.bin",
222
- "transformer.blocks.41.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
223
- "transformer.blocks.41.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
224
- "transformer.blocks.41.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
225
- "transformer.blocks.41.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
226
- "transformer.blocks.41.norm_1.weight": "pytorch_model-00006-of-00007.bin",
227
- "transformer.blocks.41.norm_2.weight": "pytorch_model-00006-of-00007.bin",
228
- "transformer.blocks.42.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
229
- "transformer.blocks.42.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
230
- "transformer.blocks.42.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
231
- "transformer.blocks.42.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
232
- "transformer.blocks.42.norm_1.weight": "pytorch_model-00006-of-00007.bin",
233
- "transformer.blocks.42.norm_2.weight": "pytorch_model-00006-of-00007.bin",
234
- "transformer.blocks.43.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
235
- "transformer.blocks.43.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
236
- "transformer.blocks.43.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
237
- "transformer.blocks.43.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
238
- "transformer.blocks.43.norm_1.weight": "pytorch_model-00006-of-00007.bin",
239
- "transformer.blocks.43.norm_2.weight": "pytorch_model-00006-of-00007.bin",
240
- "transformer.blocks.44.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
241
- "transformer.blocks.44.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
242
- "transformer.blocks.44.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
243
- "transformer.blocks.44.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
244
- "transformer.blocks.44.norm_1.weight": "pytorch_model-00006-of-00007.bin",
245
- "transformer.blocks.44.norm_2.weight": "pytorch_model-00006-of-00007.bin",
246
- "transformer.blocks.45.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
247
- "transformer.blocks.45.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
248
- "transformer.blocks.45.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
249
- "transformer.blocks.45.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
250
- "transformer.blocks.45.norm_1.weight": "pytorch_model-00006-of-00007.bin",
251
- "transformer.blocks.45.norm_2.weight": "pytorch_model-00006-of-00007.bin",
252
- "transformer.blocks.46.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
253
- "transformer.blocks.46.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
254
- "transformer.blocks.46.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
255
- "transformer.blocks.46.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
256
- "transformer.blocks.46.norm_1.weight": "pytorch_model-00006-of-00007.bin",
257
- "transformer.blocks.46.norm_2.weight": "pytorch_model-00006-of-00007.bin",
258
- "transformer.blocks.47.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
259
- "transformer.blocks.47.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
260
- "transformer.blocks.47.ffn.down_proj.weight": "pytorch_model-00007-of-00007.bin",
261
- "transformer.blocks.47.ffn.up_proj.weight": "pytorch_model-00007-of-00007.bin",
262
- "transformer.blocks.47.norm_1.weight": "pytorch_model-00006-of-00007.bin",
263
- "transformer.blocks.47.norm_2.weight": "pytorch_model-00006-of-00007.bin",
264
- "transformer.blocks.5.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
265
- "transformer.blocks.5.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
266
- "transformer.blocks.5.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
267
- "transformer.blocks.5.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
268
- "transformer.blocks.5.norm_1.weight": "pytorch_model-00001-of-00007.bin",
269
- "transformer.blocks.5.norm_2.weight": "pytorch_model-00001-of-00007.bin",
270
- "transformer.blocks.6.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
271
- "transformer.blocks.6.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
272
- "transformer.blocks.6.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
273
- "transformer.blocks.6.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
274
- "transformer.blocks.6.norm_1.weight": "pytorch_model-00001-of-00007.bin",
275
- "transformer.blocks.6.norm_2.weight": "pytorch_model-00001-of-00007.bin",
276
- "transformer.blocks.7.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
277
- "transformer.blocks.7.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
278
- "transformer.blocks.7.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
279
- "transformer.blocks.7.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
280
- "transformer.blocks.7.norm_1.weight": "pytorch_model-00001-of-00007.bin",
281
- "transformer.blocks.7.norm_2.weight": "pytorch_model-00001-of-00007.bin",
282
- "transformer.blocks.8.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
283
- "transformer.blocks.8.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
284
- "transformer.blocks.8.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
285
- "transformer.blocks.8.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
286
- "transformer.blocks.8.norm_1.weight": "pytorch_model-00002-of-00007.bin",
287
- "transformer.blocks.8.norm_2.weight": "pytorch_model-00002-of-00007.bin",
288
- "transformer.blocks.9.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
289
- "transformer.blocks.9.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
290
- "transformer.blocks.9.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
291
- "transformer.blocks.9.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
292
- "transformer.blocks.9.norm_1.weight": "pytorch_model-00002-of-00007.bin",
293
- "transformer.blocks.9.norm_2.weight": "pytorch_model-00002-of-00007.bin",
294
- "transformer.norm_f.weight": "pytorch_model-00007-of-00007.bin",
295
- "transformer.wte.weight": "pytorch_model-00001-of-00007.bin"
 
296
  }
297
  }
 
1
  {
2
  "metadata": {
3
+ "total_size": 120063684608
4
  },
5
  "weight_map": {
6
+ "transformer.blocks.0.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
7
+ "transformer.blocks.0.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
8
+ "transformer.blocks.0.ffn.down_proj.weight": "pytorch_model-00001-of-00013.bin",
9
+ "transformer.blocks.0.ffn.up_proj.weight": "pytorch_model-00001-of-00013.bin",
10
+ "transformer.blocks.0.norm_1.weight": "pytorch_model-00001-of-00013.bin",
11
+ "transformer.blocks.0.norm_2.weight": "pytorch_model-00001-of-00013.bin",
12
+ "transformer.blocks.1.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
13
+ "transformer.blocks.1.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
14
+ "transformer.blocks.1.ffn.down_proj.weight": "pytorch_model-00001-of-00013.bin",
15
+ "transformer.blocks.1.ffn.up_proj.weight": "pytorch_model-00001-of-00013.bin",
16
+ "transformer.blocks.1.norm_1.weight": "pytorch_model-00001-of-00013.bin",
17
+ "transformer.blocks.1.norm_2.weight": "pytorch_model-00001-of-00013.bin",
18
+ "transformer.blocks.10.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
19
+ "transformer.blocks.10.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
20
+ "transformer.blocks.10.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
21
+ "transformer.blocks.10.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
22
+ "transformer.blocks.10.norm_1.weight": "pytorch_model-00003-of-00013.bin",
23
+ "transformer.blocks.10.norm_2.weight": "pytorch_model-00003-of-00013.bin",
24
+ "transformer.blocks.11.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
25
+ "transformer.blocks.11.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
26
+ "transformer.blocks.11.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
27
+ "transformer.blocks.11.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
28
+ "transformer.blocks.11.norm_1.weight": "pytorch_model-00003-of-00013.bin",
29
+ "transformer.blocks.11.norm_2.weight": "pytorch_model-00003-of-00013.bin",
30
+ "transformer.blocks.12.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
31
+ "transformer.blocks.12.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
32
+ "transformer.blocks.12.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
33
+ "transformer.blocks.12.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
34
+ "transformer.blocks.12.norm_1.weight": "pytorch_model-00004-of-00013.bin",
35
+ "transformer.blocks.12.norm_2.weight": "pytorch_model-00004-of-00013.bin",
36
+ "transformer.blocks.13.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
37
+ "transformer.blocks.13.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
38
+ "transformer.blocks.13.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
39
+ "transformer.blocks.13.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
40
+ "transformer.blocks.13.norm_1.weight": "pytorch_model-00004-of-00013.bin",
41
+ "transformer.blocks.13.norm_2.weight": "pytorch_model-00004-of-00013.bin",
42
+ "transformer.blocks.14.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
43
+ "transformer.blocks.14.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
44
+ "transformer.blocks.14.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
45
+ "transformer.blocks.14.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
46
+ "transformer.blocks.14.norm_1.weight": "pytorch_model-00004-of-00013.bin",
47
+ "transformer.blocks.14.norm_2.weight": "pytorch_model-00004-of-00013.bin",
48
+ "transformer.blocks.15.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
49
+ "transformer.blocks.15.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
50
+ "transformer.blocks.15.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
51
+ "transformer.blocks.15.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
52
+ "transformer.blocks.15.norm_1.weight": "pytorch_model-00004-of-00013.bin",
53
+ "transformer.blocks.15.norm_2.weight": "pytorch_model-00004-of-00013.bin",
54
+ "transformer.blocks.16.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
55
+ "transformer.blocks.16.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
56
+ "transformer.blocks.16.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
57
+ "transformer.blocks.16.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
58
+ "transformer.blocks.16.norm_1.weight": "pytorch_model-00005-of-00013.bin",
59
+ "transformer.blocks.16.norm_2.weight": "pytorch_model-00005-of-00013.bin",
60
+ "transformer.blocks.17.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
61
+ "transformer.blocks.17.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
62
+ "transformer.blocks.17.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
63
+ "transformer.blocks.17.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
64
+ "transformer.blocks.17.norm_1.weight": "pytorch_model-00005-of-00013.bin",
65
+ "transformer.blocks.17.norm_2.weight": "pytorch_model-00005-of-00013.bin",
66
+ "transformer.blocks.18.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
67
+ "transformer.blocks.18.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
68
+ "transformer.blocks.18.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
69
+ "transformer.blocks.18.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
70
+ "transformer.blocks.18.norm_1.weight": "pytorch_model-00005-of-00013.bin",
71
+ "transformer.blocks.18.norm_2.weight": "pytorch_model-00005-of-00013.bin",
72
+ "transformer.blocks.19.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
73
+ "transformer.blocks.19.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
74
+ "transformer.blocks.19.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
75
+ "transformer.blocks.19.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
76
+ "transformer.blocks.19.norm_1.weight": "pytorch_model-00005-of-00013.bin",
77
+ "transformer.blocks.19.norm_2.weight": "pytorch_model-00005-of-00013.bin",
78
+ "transformer.blocks.2.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
79
+ "transformer.blocks.2.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
80
+ "transformer.blocks.2.ffn.down_proj.weight": "pytorch_model-00001-of-00013.bin",
81
+ "transformer.blocks.2.ffn.up_proj.weight": "pytorch_model-00001-of-00013.bin",
82
+ "transformer.blocks.2.norm_1.weight": "pytorch_model-00001-of-00013.bin",
83
+ "transformer.blocks.2.norm_2.weight": "pytorch_model-00001-of-00013.bin",
84
+ "transformer.blocks.20.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
85
+ "transformer.blocks.20.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
86
+ "transformer.blocks.20.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
87
+ "transformer.blocks.20.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
88
+ "transformer.blocks.20.norm_1.weight": "pytorch_model-00006-of-00013.bin",
89
+ "transformer.blocks.20.norm_2.weight": "pytorch_model-00006-of-00013.bin",
90
+ "transformer.blocks.21.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
91
+ "transformer.blocks.21.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
92
+ "transformer.blocks.21.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
93
+ "transformer.blocks.21.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
94
+ "transformer.blocks.21.norm_1.weight": "pytorch_model-00006-of-00013.bin",
95
+ "transformer.blocks.21.norm_2.weight": "pytorch_model-00006-of-00013.bin",
96
+ "transformer.blocks.22.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
97
+ "transformer.blocks.22.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
98
+ "transformer.blocks.22.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
99
+ "transformer.blocks.22.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
100
+ "transformer.blocks.22.norm_1.weight": "pytorch_model-00006-of-00013.bin",
101
+ "transformer.blocks.22.norm_2.weight": "pytorch_model-00006-of-00013.bin",
102
+ "transformer.blocks.23.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
103
+ "transformer.blocks.23.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
104
+ "transformer.blocks.23.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
105
+ "transformer.blocks.23.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
106
+ "transformer.blocks.23.norm_1.weight": "pytorch_model-00006-of-00013.bin",
107
+ "transformer.blocks.23.norm_2.weight": "pytorch_model-00006-of-00013.bin",
108
+ "transformer.blocks.24.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
109
+ "transformer.blocks.24.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
110
+ "transformer.blocks.24.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
111
+ "transformer.blocks.24.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
112
+ "transformer.blocks.24.norm_1.weight": "pytorch_model-00007-of-00013.bin",
113
+ "transformer.blocks.24.norm_2.weight": "pytorch_model-00007-of-00013.bin",
114
+ "transformer.blocks.25.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
115
+ "transformer.blocks.25.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
116
+ "transformer.blocks.25.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
117
+ "transformer.blocks.25.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
118
+ "transformer.blocks.25.norm_1.weight": "pytorch_model-00007-of-00013.bin",
119
+ "transformer.blocks.25.norm_2.weight": "pytorch_model-00007-of-00013.bin",
120
+ "transformer.blocks.26.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
121
+ "transformer.blocks.26.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
122
+ "transformer.blocks.26.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
123
+ "transformer.blocks.26.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
124
+ "transformer.blocks.26.norm_1.weight": "pytorch_model-00007-of-00013.bin",
125
+ "transformer.blocks.26.norm_2.weight": "pytorch_model-00007-of-00013.bin",
126
+ "transformer.blocks.27.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
127
+ "transformer.blocks.27.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
128
+ "transformer.blocks.27.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
129
+ "transformer.blocks.27.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
130
+ "transformer.blocks.27.norm_1.weight": "pytorch_model-00007-of-00013.bin",
131
+ "transformer.blocks.27.norm_2.weight": "pytorch_model-00007-of-00013.bin",
132
+ "transformer.blocks.28.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
133
+ "transformer.blocks.28.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
134
+ "transformer.blocks.28.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
135
+ "transformer.blocks.28.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
136
+ "transformer.blocks.28.norm_1.weight": "pytorch_model-00008-of-00013.bin",
137
+ "transformer.blocks.28.norm_2.weight": "pytorch_model-00008-of-00013.bin",
138
+ "transformer.blocks.29.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
139
+ "transformer.blocks.29.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
140
+ "transformer.blocks.29.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
141
+ "transformer.blocks.29.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
142
+ "transformer.blocks.29.norm_1.weight": "pytorch_model-00008-of-00013.bin",
143
+ "transformer.blocks.29.norm_2.weight": "pytorch_model-00008-of-00013.bin",
144
+ "transformer.blocks.3.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
145
+ "transformer.blocks.3.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
146
+ "transformer.blocks.3.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
147
+ "transformer.blocks.3.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
148
+ "transformer.blocks.3.norm_1.weight": "pytorch_model-00001-of-00013.bin",
149
+ "transformer.blocks.3.norm_2.weight": "pytorch_model-00001-of-00013.bin",
150
+ "transformer.blocks.30.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
151
+ "transformer.blocks.30.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
152
+ "transformer.blocks.30.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
153
+ "transformer.blocks.30.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
154
+ "transformer.blocks.30.norm_1.weight": "pytorch_model-00008-of-00013.bin",
155
+ "transformer.blocks.30.norm_2.weight": "pytorch_model-00008-of-00013.bin",
156
+ "transformer.blocks.31.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
157
+ "transformer.blocks.31.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
158
+ "transformer.blocks.31.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
159
+ "transformer.blocks.31.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
160
+ "transformer.blocks.31.norm_1.weight": "pytorch_model-00008-of-00013.bin",
161
+ "transformer.blocks.31.norm_2.weight": "pytorch_model-00008-of-00013.bin",
162
+ "transformer.blocks.32.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
163
+ "transformer.blocks.32.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
164
+ "transformer.blocks.32.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
165
+ "transformer.blocks.32.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
166
+ "transformer.blocks.32.norm_1.weight": "pytorch_model-00009-of-00013.bin",
167
+ "transformer.blocks.32.norm_2.weight": "pytorch_model-00009-of-00013.bin",
168
+ "transformer.blocks.33.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
169
+ "transformer.blocks.33.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
170
+ "transformer.blocks.33.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
171
+ "transformer.blocks.33.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
172
+ "transformer.blocks.33.norm_1.weight": "pytorch_model-00009-of-00013.bin",
173
+ "transformer.blocks.33.norm_2.weight": "pytorch_model-00009-of-00013.bin",
174
+ "transformer.blocks.34.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
175
+ "transformer.blocks.34.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
176
+ "transformer.blocks.34.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
177
+ "transformer.blocks.34.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
178
+ "transformer.blocks.34.norm_1.weight": "pytorch_model-00009-of-00013.bin",
179
+ "transformer.blocks.34.norm_2.weight": "pytorch_model-00009-of-00013.bin",
180
+ "transformer.blocks.35.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
181
+ "transformer.blocks.35.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
182
+ "transformer.blocks.35.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
183
+ "transformer.blocks.35.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
184
+ "transformer.blocks.35.norm_1.weight": "pytorch_model-00009-of-00013.bin",
185
+ "transformer.blocks.35.norm_2.weight": "pytorch_model-00009-of-00013.bin",
186
+ "transformer.blocks.36.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
187
+ "transformer.blocks.36.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
188
+ "transformer.blocks.36.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
189
+ "transformer.blocks.36.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
190
+ "transformer.blocks.36.norm_1.weight": "pytorch_model-00010-of-00013.bin",
191
+ "transformer.blocks.36.norm_2.weight": "pytorch_model-00010-of-00013.bin",
192
+ "transformer.blocks.37.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
193
+ "transformer.blocks.37.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
194
+ "transformer.blocks.37.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
195
+ "transformer.blocks.37.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
196
+ "transformer.blocks.37.norm_1.weight": "pytorch_model-00010-of-00013.bin",
197
+ "transformer.blocks.37.norm_2.weight": "pytorch_model-00010-of-00013.bin",
198
+ "transformer.blocks.38.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
199
+ "transformer.blocks.38.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
200
+ "transformer.blocks.38.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
201
+ "transformer.blocks.38.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
202
+ "transformer.blocks.38.norm_1.weight": "pytorch_model-00010-of-00013.bin",
203
+ "transformer.blocks.38.norm_2.weight": "pytorch_model-00010-of-00013.bin",
204
+ "transformer.blocks.39.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
205
+ "transformer.blocks.39.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
206
+ "transformer.blocks.39.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
207
+ "transformer.blocks.39.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
208
+ "transformer.blocks.39.norm_1.weight": "pytorch_model-00010-of-00013.bin",
209
+ "transformer.blocks.39.norm_2.weight": "pytorch_model-00010-of-00013.bin",
210
+ "transformer.blocks.4.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
211
+ "transformer.blocks.4.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
212
+ "transformer.blocks.4.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
213
+ "transformer.blocks.4.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
214
+ "transformer.blocks.4.norm_1.weight": "pytorch_model-00002-of-00013.bin",
215
+ "transformer.blocks.4.norm_2.weight": "pytorch_model-00002-of-00013.bin",
216
+ "transformer.blocks.40.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
217
+ "transformer.blocks.40.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
218
+ "transformer.blocks.40.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
219
+ "transformer.blocks.40.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
220
+ "transformer.blocks.40.norm_1.weight": "pytorch_model-00011-of-00013.bin",
221
+ "transformer.blocks.40.norm_2.weight": "pytorch_model-00011-of-00013.bin",
222
+ "transformer.blocks.41.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
223
+ "transformer.blocks.41.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
224
+ "transformer.blocks.41.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
225
+ "transformer.blocks.41.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
226
+ "transformer.blocks.41.norm_1.weight": "pytorch_model-00011-of-00013.bin",
227
+ "transformer.blocks.41.norm_2.weight": "pytorch_model-00011-of-00013.bin",
228
+ "transformer.blocks.42.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
229
+ "transformer.blocks.42.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
230
+ "transformer.blocks.42.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
231
+ "transformer.blocks.42.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
232
+ "transformer.blocks.42.norm_1.weight": "pytorch_model-00011-of-00013.bin",
233
+ "transformer.blocks.42.norm_2.weight": "pytorch_model-00011-of-00013.bin",
234
+ "transformer.blocks.43.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
235
+ "transformer.blocks.43.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
236
+ "transformer.blocks.43.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
237
+ "transformer.blocks.43.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
238
+ "transformer.blocks.43.norm_1.weight": "pytorch_model-00011-of-00013.bin",
239
+ "transformer.blocks.43.norm_2.weight": "pytorch_model-00011-of-00013.bin",
240
+ "transformer.blocks.44.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
241
+ "transformer.blocks.44.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
242
+ "transformer.blocks.44.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
243
+ "transformer.blocks.44.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
244
+ "transformer.blocks.44.norm_1.weight": "pytorch_model-00012-of-00013.bin",
245
+ "transformer.blocks.44.norm_2.weight": "pytorch_model-00012-of-00013.bin",
246
+ "transformer.blocks.45.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
247
+ "transformer.blocks.45.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
248
+ "transformer.blocks.45.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
249
+ "transformer.blocks.45.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
250
+ "transformer.blocks.45.norm_1.weight": "pytorch_model-00012-of-00013.bin",
251
+ "transformer.blocks.45.norm_2.weight": "pytorch_model-00012-of-00013.bin",
252
+ "transformer.blocks.46.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
253
+ "transformer.blocks.46.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
254
+ "transformer.blocks.46.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
255
+ "transformer.blocks.46.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
256
+ "transformer.blocks.46.norm_1.weight": "pytorch_model-00012-of-00013.bin",
257
+ "transformer.blocks.46.norm_2.weight": "pytorch_model-00012-of-00013.bin",
258
+ "transformer.blocks.47.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
259
+ "transformer.blocks.47.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
260
+ "transformer.blocks.47.ffn.down_proj.weight": "pytorch_model-00013-of-00013.bin",
261
+ "transformer.blocks.47.ffn.up_proj.weight": "pytorch_model-00013-of-00013.bin",
262
+ "transformer.blocks.47.norm_1.weight": "pytorch_model-00012-of-00013.bin",
263
+ "transformer.blocks.47.norm_2.weight": "pytorch_model-00012-of-00013.bin",
264
+ "transformer.blocks.5.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
265
+ "transformer.blocks.5.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
266
+ "transformer.blocks.5.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
267
+ "transformer.blocks.5.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
268
+ "transformer.blocks.5.norm_1.weight": "pytorch_model-00002-of-00013.bin",
269
+ "transformer.blocks.5.norm_2.weight": "pytorch_model-00002-of-00013.bin",
270
+ "transformer.blocks.6.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
271
+ "transformer.blocks.6.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
272
+ "transformer.blocks.6.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
273
+ "transformer.blocks.6.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
274
+ "transformer.blocks.6.norm_1.weight": "pytorch_model-00002-of-00013.bin",
275
+ "transformer.blocks.6.norm_2.weight": "pytorch_model-00002-of-00013.bin",
276
+ "transformer.blocks.7.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
277
+ "transformer.blocks.7.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
278
+ "transformer.blocks.7.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
279
+ "transformer.blocks.7.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
280
+ "transformer.blocks.7.norm_1.weight": "pytorch_model-00002-of-00013.bin",
281
+ "transformer.blocks.7.norm_2.weight": "pytorch_model-00002-of-00013.bin",
282
+ "transformer.blocks.8.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
283
+ "transformer.blocks.8.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
284
+ "transformer.blocks.8.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
285
+ "transformer.blocks.8.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
286
+ "transformer.blocks.8.norm_1.weight": "pytorch_model-00003-of-00013.bin",
287
+ "transformer.blocks.8.norm_2.weight": "pytorch_model-00003-of-00013.bin",
288
+ "transformer.blocks.9.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
289
+ "transformer.blocks.9.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
290
+ "transformer.blocks.9.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
291
+ "transformer.blocks.9.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
292
+ "transformer.blocks.9.norm_1.weight": "pytorch_model-00003-of-00013.bin",
293
+ "transformer.blocks.9.norm_2.weight": "pytorch_model-00003-of-00013.bin",
294
+ "transformer.norm_f.weight": "pytorch_model-00013-of-00013.bin",
295
+ "transformer.wpe.weight": "pytorch_model-00001-of-00013.bin",
296
+ "transformer.wte.weight": "pytorch_model-00001-of-00013.bin"
297
  }
298
  }
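
The weight_map above is what tells a loader which shard file holds each parameter; this commit simply redistributes the same tensors across 13 shards instead of 7. As a minimal sketch (assuming the standard Hugging Face sharded-checkpoint layout, with the index saved under its usual name pytorch_model.bin.index.json next to the shard files), a single parameter can be resolved to its shard and loaded like this:

import json
import torch

# Hypothetical local paths: the index and shard files are assumed to sit in
# the current working directory after the checkpoint has been downloaded.
with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

param_name = "transformer.blocks.47.ffn.down_proj.weight"
shard_file = index["weight_map"][param_name]  # e.g. "pytorch_model-00013-of-00013.bin"

# Load only the shard that contains the requested tensor, not the full model.
state_dict = torch.load(shard_file, map_location="cpu")
tensor = state_dict[param_name]
print(param_name, tuple(tensor.shape))

The metadata.total_size field (120063684608 bytes, roughly 120 GB) is the combined size of all weight tensors across the shards.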