diwank committed
Commit 3175fd2
1 Parent(s): d518181

Upload folder using huggingface_hub
attention.py CHANGED
@@ -7,7 +7,8 @@ import torch.nn as nn
7
  from einops import rearrange
8
  from packaging import version
9
  from torch import nn
10
- from .norm import LPLayerNorm
 
11
 
12
  def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
13
  if original_is_causal and num_query_tokens != num_key_tokens:
@@ -46,7 +47,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_
46
  attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
47
  if is_causal and (not q.size(2) == 1):
48
  s = max(s_q, s_k)
49
- causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
50
  causal_mask = causal_mask.tril()
51
  causal_mask = causal_mask.to(torch.bool)
52
  causal_mask = ~causal_mask
@@ -141,8 +142,8 @@ def triton_flash_attn_fn(query, key, value, n_heads, past_key_value=None, softma
141
  key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
142
  value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
143
  if multiquery:
144
- key = key.expand(*key.shape[:2], n_heads, key.size(-1))
145
- value = value.expand(*value.shape[:2], n_heads, value.size(-1))
146
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
147
  attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
148
  output = attn_output.view(*attn_output.shape[:2], -1)
@@ -155,7 +156,7 @@ class MultiheadAttention(nn.Module):
155
  additive bias.
156
  """
157
 
158
- def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
159
  super().__init__()
160
  self.attn_impl = attn_impl
161
  self.clip_qkv = clip_qkv
@@ -166,13 +167,16 @@ class MultiheadAttention(nn.Module):
166
  if self.softmax_scale is None:
167
  self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
168
  self.attn_dropout_p = attn_pdrop
169
- self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
170
  fuse_splits = (d_model, 2 * d_model)
171
  self.Wqkv._fused = (0, fuse_splits)
172
  if self.qk_ln:
173
- layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
174
- self.q_ln = layernorm_class(self.d_model, device=device)
175
- self.k_ln = layernorm_class(self.d_model, device=device)
176
  if self.attn_impl == 'flash':
177
  self.attn_fn = flash_attn_fn
178
  elif self.attn_impl == 'triton':
@@ -185,13 +189,13 @@ class MultiheadAttention(nn.Module):
185
  warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
186
  else:
187
  raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
188
- self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
189
  self.out_proj._is_residual = True
190
 
191
  def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
192
  qkv = self.Wqkv(x)
193
  if self.clip_qkv:
194
- qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
195
  (query, key, value) = qkv.chunk(3, dim=2)
196
  key_padding_mask = attention_mask
197
  if self.qk_ln:
@@ -208,7 +212,7 @@ class MultiQueryAttention(nn.Module):
208
  additive bias.
209
  """
210
 
211
- def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
212
  super().__init__()
213
  self.attn_impl = attn_impl
214
  self.clip_qkv = clip_qkv
@@ -220,13 +224,16 @@ class MultiQueryAttention(nn.Module):
220
  if self.softmax_scale is None:
221
  self.softmax_scale = 1 / math.sqrt(self.head_dim)
222
  self.attn_dropout_p = attn_pdrop
223
- self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
224
  fuse_splits = (d_model, d_model + self.head_dim)
225
  self.Wqkv._fused = (0, fuse_splits)
226
  if self.qk_ln:
227
- layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
228
- self.q_ln = layernorm_class(d_model, device=device)
229
- self.k_ln = layernorm_class(self.head_dim, device=device)
230
  if self.attn_impl == 'flash':
231
  self.attn_fn = flash_attn_fn
232
  elif self.attn_impl == 'triton':
@@ -239,13 +246,13 @@ class MultiQueryAttention(nn.Module):
239
  warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
240
  else:
241
  raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
242
- self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
243
  self.out_proj._is_residual = True
244
 
245
  def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
246
  qkv = self.Wqkv(x)
247
  if self.clip_qkv:
248
- qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
249
  (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
250
  key_padding_mask = attention_mask
251
  if self.qk_ln:
 
7
  from einops import rearrange
8
  from packaging import version
9
  from torch import nn
10
+ from .fc import FC_CLASS_REGISTRY
11
+ from .norm import NORM_CLASS_REGISTRY
12
 
13
  def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
14
  if original_is_causal and num_query_tokens != num_key_tokens:
 
47
  attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
48
  if is_causal and (not q.size(2) == 1):
49
  s = max(s_q, s_k)
50
+ causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32)
51
  causal_mask = causal_mask.tril()
52
  causal_mask = causal_mask.to(torch.bool)
53
  causal_mask = ~causal_mask
 
142
  key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
143
  value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
144
  if multiquery:
145
+ key = key.repeat(1, 1, n_heads, 1)
146
+ value = value.repeat(1, 1, n_heads, 1)
147
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
148
  attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
149
  output = attn_output.view(*attn_output.shape[:2], -1)
 
156
  additive bias.
157
  """
158
 
159
+ def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', verbose: int=0, device: Optional[str]=None):
160
  super().__init__()
161
  self.attn_impl = attn_impl
162
  self.clip_qkv = clip_qkv
 
167
  if self.softmax_scale is None:
168
  self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
169
  self.attn_dropout_p = attn_pdrop
170
+ fc_kwargs = {}
171
+ if fc_type != 'te':
172
+ fc_kwargs['device'] = device
173
+ self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, 3 * self.d_model, **fc_kwargs)
174
  fuse_splits = (d_model, 2 * d_model)
175
  self.Wqkv._fused = (0, fuse_splits)
176
  if self.qk_ln:
177
+ norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
178
+ self.q_ln = norm_class(self.d_model, device=device)
179
+ self.k_ln = norm_class(self.d_model, device=device)
180
  if self.attn_impl == 'flash':
181
  self.attn_fn = flash_attn_fn
182
  elif self.attn_impl == 'triton':
 
189
  warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
190
  else:
191
  raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
192
+ self.out_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs)
193
  self.out_proj._is_residual = True
194
 
195
  def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
196
  qkv = self.Wqkv(x)
197
  if self.clip_qkv:
198
+ qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
199
  (query, key, value) = qkv.chunk(3, dim=2)
200
  key_padding_mask = attention_mask
201
  if self.qk_ln:
 
212
  additive bias.
213
  """
214
 
215
+ def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', verbose: int=0, device: Optional[str]=None):
216
  super().__init__()
217
  self.attn_impl = attn_impl
218
  self.clip_qkv = clip_qkv
 
224
  if self.softmax_scale is None:
225
  self.softmax_scale = 1 / math.sqrt(self.head_dim)
226
  self.attn_dropout_p = attn_pdrop
227
+ fc_kwargs = {}
228
+ if fc_type != 'te':
229
+ fc_kwargs['device'] = device
230
+ self.Wqkv = FC_CLASS_REGISTRY[fc_type](d_model, d_model + 2 * self.head_dim, **fc_kwargs)
231
  fuse_splits = (d_model, d_model + self.head_dim)
232
  self.Wqkv._fused = (0, fuse_splits)
233
  if self.qk_ln:
234
+ norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
235
+ self.q_ln = norm_class(d_model, device=device)
236
+ self.k_ln = norm_class(self.head_dim, device=device)
237
  if self.attn_impl == 'flash':
238
  self.attn_fn = flash_attn_fn
239
  elif self.attn_impl == 'triton':
 
246
  warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
247
  else:
248
  raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
249
+ self.out_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs)
250
  self.out_proj._is_residual = True
251
 
252
  def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
253
  qkv = self.Wqkv(x)
254
  if self.clip_qkv:
255
+ qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
256
  (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
257
  key_padding_mask = attention_mask
258
  if self.qk_ln:
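
The attention.py changes above swap the hard-coded nn.Linear and LPLayerNorm layers for lookups into FC_CLASS_REGISTRY and NORM_CLASS_REGISTRY, and only forward the device kwarg when fc_type is not 'te'. Below is a minimal standalone sketch of that construction pattern, not the file itself; the registries are stubbed with only the plain torch classes, whereas the real fc.py and norm.py also register 'te' and low-precision variants.

    # Sketch under the assumption of torch-only registries.
    from typing import Optional
    import torch.nn as nn

    FC_CLASS_REGISTRY = {'torch': nn.Linear}
    NORM_CLASS_REGISTRY = {'layernorm': nn.LayerNorm}

    def build_qkv(d_model: int, fc_type: str = 'torch', norm_type: str = 'layernorm',
                  qk_ln: bool = False, device: Optional[str] = None):
        fc_kwargs = {}
        if fc_type != 'te':
            fc_kwargs['device'] = device  # device is only passed to torch layers, mirroring the diff
        Wqkv = FC_CLASS_REGISTRY[fc_type](d_model, 3 * d_model, **fc_kwargs)
        q_ln = k_ln = None
        if qk_ln:
            norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
            q_ln = norm_class(d_model, device=device)
            k_ln = norm_class(d_model, device=device)
        return Wqkv, q_ln, k_ln

    Wqkv, q_ln, k_ln = build_qkv(256, qk_ln=True)  # torch backend, CPU
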
blocks.py CHANGED
@@ -3,31 +3,23 @@ from typing import Dict, Optional, Tuple
3
  import torch
4
  import torch.nn as nn
5
  from .attention import ATTN_CLASS_REGISTRY
 
 
6
  from .norm import NORM_CLASS_REGISTRY
7
 
8
- class MPTMLP(nn.Module):
9
-
10
- def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
11
- super().__init__()
12
- self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
13
- self.act = nn.GELU(approximate='none')
14
- self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
15
- self.down_proj._is_residual = True
16
-
17
- def forward(self, x):
18
- return self.down_proj(self.act(self.up_proj(x)))
19
-
20
  class MPTBlock(nn.Module):
21
 
22
- def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, device: Optional[str]=None, **kwargs):
23
  del kwargs
24
  super().__init__()
25
  norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
26
  attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
27
  self.norm_1 = norm_class(d_model, device=device)
28
- self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, verbose=verbose, device=device)
29
- self.norm_2 = norm_class(d_model, device=device)
30
- self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
 
 
31
  self.resid_attn_dropout = nn.Dropout(resid_pdrop)
32
  self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
33
 
@@ -35,7 +27,9 @@ class MPTBlock(nn.Module):
35
  a = self.norm_1(x)
36
  (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
37
  x = x + self.resid_attn_dropout(b)
38
- m = self.norm_2(x)
 
 
39
  n = self.ffn(m)
40
  x = x + self.resid_ffn_dropout(n)
41
  return (x, attn_weights, past_key_value)
 
3
  import torch
4
  import torch.nn as nn
5
  from .attention import ATTN_CLASS_REGISTRY
6
+ from .fc import FC_CLASS_REGISTRY
7
+ from .ffn import FFN_CLASS_REGISTRY, build_ffn
8
  from .norm import NORM_CLASS_REGISTRY
9

10
  class MPTBlock(nn.Module):
11
 
12
+ def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, ffn_config: Dict={'ffn_type': 'mptmlp'}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, fc_type: str='torch', device: Optional[str]=None, **kwargs):
13
  del kwargs
14
  super().__init__()
15
  norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
16
  attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
17
  self.norm_1 = norm_class(d_model, device=device)
18
+ self.attn = attn_class(d_model=d_model, n_heads=n_heads, attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], norm_type=norm_type, fc_type=fc_type, verbose=verbose, device=device)
19
+ self.norm_2 = None
20
+ if not getattr(FFN_CLASS_REGISTRY[ffn_config['ffn_type']], '_has_norm', False):
21
+ self.norm_2 = norm_class(d_model, device=device)
22
+ self.ffn = build_ffn(d_model=d_model, expansion_ratio=expansion_ratio, device=device, **ffn_config)
23
  self.resid_attn_dropout = nn.Dropout(resid_pdrop)
24
  self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
25
 
 
27
  a = self.norm_1(x)
28
  (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
29
  x = x + self.resid_attn_dropout(b)
30
+ m = x
31
+ if self.norm_2 is not None:
32
+ m = self.norm_2(x)
33
  n = self.ffn(m)
34
  x = x + self.resid_ffn_dropout(n)
35
  return (x, attn_weights, past_key_value)
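
The reworked MPTBlock above makes the pre-FFN norm optional: norm_2 is only built when the chosen FFN class does not carry its own norm (te.LayerNormMLP is tagged with _has_norm = True in ffn.py). A toy sketch of the resulting forward path, with simplified stand-in modules rather than the real block internals:

    import torch
    import torch.nn as nn

    class ToyBlock(nn.Module):
        """Stand-in for the FFN half of MPTBlock; not the real class."""
        def __init__(self, d_model: int, ffn_has_norm: bool):
            super().__init__()
            # skip the external norm when the FFN normalizes internally (the _has_norm case)
            self.norm_2 = None if ffn_has_norm else nn.LayerNorm(d_model)
            self.ffn = nn.Sequential(nn.Linear(d_model, 4 * d_model),
                                     nn.GELU(approximate='none'),
                                     nn.Linear(4 * d_model, d_model))

        def forward(self, x):
            m = x
            if self.norm_2 is not None:
                m = self.norm_2(x)
            return x + self.ffn(m)

    y = ToyBlock(64, ffn_has_norm=False)(torch.randn(2, 8, 64))
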
config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "_name_or_path": "./mpt-30b-orca-hf/",
3
  "architectures": [
4
  "MPTForCausalLM"
5
  ],
@@ -23,6 +22,11 @@
23
  "emb_pdrop": 0,
24
  "embedding_fraction": 1.0,
25
  "expansion_ratio": 4,
26
  "init_config": {
27
  "emb_init_std": null,
28
  "emb_init_uniform_lim": null,
@@ -45,7 +49,7 @@
45
  "norm_type": "low_precision_layernorm",
46
  "resid_pdrop": 0,
47
  "tokenizer_name": "sam-mosaic/gpt-neox-20b-chatml",
48
- "torch_dtype": "bfloat16",
49
  "transformers_version": "4.30.2",
50
  "use_cache": false,
51
  "verbose": 0,
 
1
  {
 
2
  "architectures": [
3
  "MPTForCausalLM"
4
  ],
 
22
  "emb_pdrop": 0,
23
  "embedding_fraction": 1.0,
24
  "expansion_ratio": 4,
25
+ "fc_type": "torch",
26
+ "ffn_config": {
27
+ "fc_type": "torch",
28
+ "ffn_type": "mptmlp"
29
+ },
30
  "init_config": {
31
  "emb_init_std": null,
32
  "emb_init_uniform_lim": null,
 
49
  "norm_type": "low_precision_layernorm",
50
  "resid_pdrop": 0,
51
  "tokenizer_name": "sam-mosaic/gpt-neox-20b-chatml",
52
+ "torch_dtype": "float32",
53
  "transformers_version": "4.30.2",
54
  "use_cache": false,
55
  "verbose": 0,
configuration_mpt.py CHANGED
@@ -1,26 +1,28 @@
1
  """A HuggingFace-style model configuration."""
 
2
  from typing import Dict, Optional, Union
3
  from transformers import PretrainedConfig
4
  attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
 
5
  init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
6
 
7
  class MPTConfig(PretrainedConfig):
8
  model_type = 'mpt'
9
 
10
- def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
11
  """The MPT configuration class.
12
 
13
  Args:
14
  d_model (int): The size of the embedding dimension of the model.
15
  n_heads (int): The number of attention heads.
16
  n_layers (int): The number of layers in the model.
17
- expansion_ratio (int): The ratio of the up/down scale in the MLP.
18
  max_seq_len (int): The maximum sequence length of the model.
19
  vocab_size (int): The size of the vocabulary.
20
  resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
21
  emb_pdrop (float): The dropout probability for the embedding layer.
22
  learned_pos_emb (bool): Whether to use learned positional embeddings
23
- attn_config (Dict): A dictionary used to configure the model's attention module:
24
  attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
25
  attn_pdrop (float): The dropout probability for the attention layers.
26
  attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
@@ -38,6 +40,8 @@ class MPTConfig(PretrainedConfig):
38
  Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
39
  alibi (bool): Whether to use the alibi bias instead of position embeddings.
40
  alibi_bias_max (int): The maximum value of the alibi bias.
 
 
41
  init_device (str): The device to use for parameter initialization.
42
  logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
43
  no_bias (bool): Whether to use bias in all layers.
@@ -61,6 +65,7 @@ class MPTConfig(PretrainedConfig):
61
  init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
62
  ---
63
  See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
 
64
  """
65
  self.d_model = d_model
66
  self.n_heads = n_heads
@@ -72,6 +77,7 @@ class MPTConfig(PretrainedConfig):
72
  self.emb_pdrop = emb_pdrop
73
  self.learned_pos_emb = learned_pos_emb
74
  self.attn_config = attn_config
 
75
  self.init_device = init_device
76
  self.logit_scale = logit_scale
77
  self.no_bias = no_bias
@@ -80,6 +86,7 @@ class MPTConfig(PretrainedConfig):
80
  self.norm_type = norm_type
81
  self.use_cache = use_cache
82
  self.init_config = init_config
 
83
  if 'name' in kwargs:
84
  del kwargs['name']
85
  if 'loss_fn' in kwargs:
@@ -95,6 +102,7 @@ class MPTConfig(PretrainedConfig):
95
 
96
  def _validate_config(self):
97
  self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
 
98
  self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
99
  if self.d_model % self.n_heads != 0:
100
  raise ValueError('d_model must be divisible by n_heads')
@@ -115,4 +123,13 @@ class MPTConfig(PretrainedConfig):
115
  if self.init_config.get('name', None) is None:
116
  raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
117
  if not self.learned_pos_emb and (not self.attn_config['alibi']):
118
- raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.')
1
  """A HuggingFace-style model configuration."""
2
+ import warnings
3
  from typing import Dict, Optional, Union
4
  from transformers import PretrainedConfig
5
  attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
6
+ ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
7
  init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
8
 
9
  class MPTConfig(PretrainedConfig):
10
  model_type = 'mpt'
11
 
12
+ def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, ffn_config: Dict=ffn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, fc_type: str='torch', **kwargs):
13
  """The MPT configuration class.
14
 
15
  Args:
16
  d_model (int): The size of the embedding dimension of the model.
17
  n_heads (int): The number of attention heads.
18
  n_layers (int): The number of layers in the model.
19
+ expansion_ratio (int): The ratio of the up/down scale in the ffn.
20
  max_seq_len (int): The maximum sequence length of the model.
21
  vocab_size (int): The size of the vocabulary.
22
  resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
23
  emb_pdrop (float): The dropout probability for the embedding layer.
24
  learned_pos_emb (bool): Whether to use learned positional embeddings
25
+ attn_config (Dict): A dictionary used to configure the model's attention module:
26
  attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
27
  attn_pdrop (float): The dropout probability for the attention layers.
28
  attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
 
40
  Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
41
  alibi (bool): Whether to use the alibi bias instead of position embeddings.
42
  alibi_bias_max (int): The maximum value of the alibi bias.
43
+ ffn_config (Dict): A dictionary used to configure the model's ffn module:
44
+ ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
45
  init_device (str): The device to use for parameter initialization.
46
  logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
47
  no_bias (bool): Whether to use bias in all layers.
 
65
  init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
66
  ---
67
  See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
68
+ fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
69
  """
70
  self.d_model = d_model
71
  self.n_heads = n_heads
 
77
  self.emb_pdrop = emb_pdrop
78
  self.learned_pos_emb = learned_pos_emb
79
  self.attn_config = attn_config
80
+ self.ffn_config = ffn_config
81
  self.init_device = init_device
82
  self.logit_scale = logit_scale
83
  self.no_bias = no_bias
 
86
  self.norm_type = norm_type
87
  self.use_cache = use_cache
88
  self.init_config = init_config
89
+ self.fc_type = fc_type
90
  if 'name' in kwargs:
91
  del kwargs['name']
92
  if 'loss_fn' in kwargs:
 
102
 
103
  def _validate_config(self):
104
  self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
105
+ self.ffn_config = self._set_config_defaults(self.ffn_config, ffn_config_defaults)
106
  self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
107
  if self.d_model % self.n_heads != 0:
108
  raise ValueError('d_model must be divisible by n_heads')
 
123
  if self.init_config.get('name', None) is None:
124
  raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
125
  if not self.learned_pos_emb and (not self.attn_config['alibi']):
126
+ raise warnings.warn(f'Positional information not being provided to the model using either learned_pos_emb or alibi.')
127
+ if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
128
+ try:
129
+ import transformer_engine.pytorch as te
130
+ except:
131
+ raise ImportError('TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\npip install flash-attn==1.0.6 --no-build-isolation \npip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156')
132
+ if self.ffn_config['ffn_type'] == 'mptmlp':
133
+ self.ffn_config['fc_type'] = self.fc_type
134
+ elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
135
+ self.bias = not self.no_bias
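
The new validation logic above defaults ffn_config, gates the te paths on a successful transformer_engine import, and copies fc_type into ffn_config for the plain MLP. A standalone sketch of that flow (an illustrative helper, not the MPTConfig method itself):

    def validate_ffn(fc_type: str, ffn_config: dict) -> dict:
        ffn_config = {'ffn_type': 'mptmlp', **ffn_config}   # mirrors _set_config_defaults
        if fc_type == 'te' or ffn_config['ffn_type'] == 'te_ln_mlp':
            try:
                import transformer_engine.pytorch as te  # noqa: F401
            except ImportError as e:
                raise ImportError('`fc_type: te` requires TransformerEngine') from e
        if ffn_config['ffn_type'] == 'mptmlp':
            ffn_config['fc_type'] = fc_type                 # plain MLP reuses the global fc_type
        return ffn_config

    print(validate_ffn('torch', {}))   # {'ffn_type': 'mptmlp', 'fc_type': 'torch'}
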
fc.py ADDED
@@ -0,0 +1,7 @@
1
+ from torch import nn
2
+ FC_CLASS_REGISTRY = {'torch': nn.Linear}
3
+ try:
4
+ import transformer_engine.pytorch as te
5
+ FC_CLASS_REGISTRY['te'] = te.Linear
6
+ except:
7
+ pass
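
Usage sketch for the registry fc.py defines above; the 'te' key is only present when transformer_engine imports cleanly, and the `fc` import path assumes the module is importable by that name.

    from fc import FC_CLASS_REGISTRY

    fc_type = 'te' if 'te' in FC_CLASS_REGISTRY else 'torch'
    proj = FC_CLASS_REGISTRY[fc_type](1024, 4096)   # te.Linear or nn.Linear, same call shape
    print(type(proj).__name__)
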
ffn.py ADDED
@@ -0,0 +1,40 @@
1
+ """GPT Blocks used for the GPT Model."""
2
+ from typing import Optional
3
+ import torch
4
+ import torch.nn as nn
5
+ from .attention import ATTN_CLASS_REGISTRY
6
+ from .fc import FC_CLASS_REGISTRY
7
+ from .norm import NORM_CLASS_REGISTRY
8
+ try:
9
+ import transformer_engine.pytorch as te
10
+ except:
11
+ te = None
12
+
13
+ class MPTMLP(nn.Module):
14
+
15
+ def __init__(self, d_model: int, expansion_ratio: int, fc_type: str='torch', device: Optional[str]=None):
16
+ super().__init__()
17
+ fc_kwargs = {}
18
+ if fc_type != 'te':
19
+ fc_kwargs['device'] = device
20
+ self.up_proj = FC_CLASS_REGISTRY[fc_type](d_model, expansion_ratio * d_model, **fc_kwargs)
21
+ self.act = nn.GELU(approximate='none')
22
+ self.down_proj = FC_CLASS_REGISTRY[fc_type](expansion_ratio * d_model, d_model, **fc_kwargs)
23
+ self.down_proj._is_residual = True
24
+
25
+ def forward(self, x):
26
+ return self.down_proj(self.act(self.up_proj(x)))
27
+ FFN_CLASS_REGISTRY = {'mptmlp': MPTMLP}
28
+ if te is not None:
29
+ te.LayerNormMLP._has_norm = True
30
+ FFN_CLASS_REGISTRY['te_ln_mlp'] = te.LayerNormMLP
31
+
32
+ def build_ffn(d_model: int, expansion_ratio: int, fc_type: str='torch', device: Optional[str]=None, **kwargs):
33
+ ffn_type = kwargs.pop('ffn_type')
34
+ if ffn_type == 'mptmlp':
35
+ if kwargs is not None and len(kwargs) > 0:
36
+ raise ValueError(f'MPTMLP got an unexpected keyword argument: {kwargs}')
37
+ return MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, device=device)
38
+ elif ffn_type == 'te_ln_mlp':
39
+ return te.LayerNormMLP(hidden_size=d_model, ffn_hidden_size=d_model * expansion_ratio, **kwargs)
40
+ raise ValueError(f'ffn_type={ffn_type!r} not recognized.')
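
A hedged usage sketch of build_ffn as defined above, exercising only the torch mptmlp path (the te_ln_mlp branch forwards its kwargs straight to te.LayerNormMLP and needs transformer_engine installed); the `ffn` import assumes the module is importable by that name.

    import torch
    from ffn import build_ffn

    ffn = build_ffn(d_model=512, expansion_ratio=4, fc_type='torch',
                    device='cpu', ffn_type='mptmlp')
    out = ffn(torch.randn(2, 16, 512))   # (batch, seq, d_model) in, same shape out
    print(out.shape)
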
modeling_mpt.py CHANGED
@@ -13,12 +13,14 @@ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutpu
13
  from .attention import attn_bias_shape, build_attn_bias
14
  from .blocks import MPTBlock
15
  from .custom_embedding import SharedEmbedding
 
 
16
  from .norm import NORM_CLASS_REGISTRY
17
  from .configuration_mpt import MPTConfig
18
  from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
19
  from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
20
  from .meta_init_context import init_empty_weights
21
- from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
22
  try:
23
  from .flash_attn_triton import flash_attn_func
24
  except:
@@ -40,6 +42,7 @@ class MPTModel(MPTPreTrainedModel):
40
  self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
41
  self.alibi = config.attn_config['alibi']
42
  self.alibi_bias_max = config.attn_config['alibi_bias_max']
 
43
  if config.init_device == 'mixed':
44
  if dist.get_local_rank() == 0:
45
  config.init_device = 'cpu'
@@ -51,7 +54,7 @@ class MPTModel(MPTPreTrainedModel):
51
  norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
52
  self.embedding_fraction = config.embedding_fraction
53
  self.wte = SharedEmbedding(config.vocab_size, config.d_model, device=config.init_device)
54
- if not self.alibi:
55
  self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
56
  self.emb_drop = nn.Dropout(config.emb_pdrop)
57
  self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
@@ -80,7 +83,7 @@ class MPTModel(MPTPreTrainedModel):
80
  def get_input_embeddings(self):
81
  return self.wte
82
 
83
- def set_input_embeddings(self, value):
84
  self.wte = value
85
 
86
  @torch.no_grad()
@@ -166,9 +169,7 @@ class MPTModel(MPTPreTrainedModel):
166
  S = input_ids.size(1)
167
  assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
168
  tok_emb = self.wte(input_ids)
169
- if self.alibi:
170
- x = tok_emb
171
- else:
172
  past_position = 0
173
  if past_key_values is not None:
174
  if len(past_key_values) != self.config.n_layers:
@@ -183,6 +184,8 @@ class MPTModel(MPTPreTrainedModel):
183
  pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
184
  pos_emb = self.wpe(pos)
185
  x = tok_emb + pos_emb
 
 
186
  if self.embedding_fraction == 1:
187
  x = self.emb_drop(x)
188
  else:
@@ -228,7 +231,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
228
  if not config.tie_word_embeddings:
229
  raise ValueError('MPTForCausalLM only supports tied word embeddings')
230
  print(f'Instantiating an MPTForCausalLM model from {__file__}')
231
- self.transformer = MPTModel(config)
232
  for child in self.transformer.children():
233
  if isinstance(child, torch.nn.ModuleList):
234
  continue
@@ -275,9 +278,9 @@ class MPTForCausalLM(MPTPreTrainedModel):
275
  logits *= self.logit_scale
276
  loss = None
277
  if labels is not None:
278
- labels = torch.roll(labels, shifts=-1)
279
- labels[:, -1] = -100
280
- loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
281
  return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
282
 
283
  def param_init_fn(self, module):
 
13
  from .attention import attn_bias_shape, build_attn_bias
14
  from .blocks import MPTBlock
15
  from .custom_embedding import SharedEmbedding
16
+ from .fc import FC_CLASS_REGISTRY
17
+ from .ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
18
  from .norm import NORM_CLASS_REGISTRY
19
  from .configuration_mpt import MPTConfig
20
  from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
21
  from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
22
  from .meta_init_context import init_empty_weights
23
+ from .param_init_fns import generic_param_init_fn_, MODEL_INIT_REGISTRY
24
  try:
25
  from .flash_attn_triton import flash_attn_func
26
  except:
 
42
  self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
43
  self.alibi = config.attn_config['alibi']
44
  self.alibi_bias_max = config.attn_config['alibi_bias_max']
45
+ self.learned_pos_emb = config.learned_pos_emb
46
  if config.init_device == 'mixed':
47
  if dist.get_local_rank() == 0:
48
  config.init_device = 'cpu'
 
54
  norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
55
  self.embedding_fraction = config.embedding_fraction
56
  self.wte = SharedEmbedding(config.vocab_size, config.d_model, device=config.init_device)
57
+ if self.learned_pos_emb:
58
  self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
59
  self.emb_drop = nn.Dropout(config.emb_pdrop)
60
  self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
 
83
  def get_input_embeddings(self):
84
  return self.wte
85
 
86
+ def set_input_embeddings(self, value: nn.Embedding):
87
  self.wte = value
88
 
89
  @torch.no_grad()
 
169
  S = input_ids.size(1)
170
  assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
171
  tok_emb = self.wte(input_ids)
172
+ if self.learned_pos_emb:
 
 
173
  past_position = 0
174
  if past_key_values is not None:
175
  if len(past_key_values) != self.config.n_layers:
 
184
  pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
185
  pos_emb = self.wpe(pos)
186
  x = tok_emb + pos_emb
187
+ else:
188
+ x = tok_emb
189
  if self.embedding_fraction == 1:
190
  x = self.emb_drop(x)
191
  else:
 
231
  if not config.tie_word_embeddings:
232
  raise ValueError('MPTForCausalLM only supports tied word embeddings')
233
  print(f'Instantiating an MPTForCausalLM model from {__file__}')
234
+ self.transformer: MPTModel = MPTModel(config)
235
  for child in self.transformer.children():
236
  if isinstance(child, torch.nn.ModuleList):
237
  continue
 
278
  logits *= self.logit_scale
279
  loss = None
280
  if labels is not None:
281
+ _labels = torch.roll(labels, shifts=-1)
282
+ _labels[:, -1] = -100
283
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.to(logits.device).view(-1))
284
  return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
285
 
286
  def param_init_fn(self, module):
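
The loss computation in MPTForCausalLM now shifts a copy of the labels instead of mutating the caller's tensor: torch.roll returns a new tensor, so only `_labels` gets the -100 sentinel written into its last position. A minimal sketch of that shift with dummy tensors:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, 8, 32)            # (batch, seq, vocab)
    labels = torch.randint(0, 32, (2, 8))

    _labels = torch.roll(labels, shifts=-1)   # next-token targets; `labels` is left untouched
    _labels[:, -1] = -100                     # -100 is cross_entropy's default ignore_index
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.view(-1))
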
norm.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import torch
2
 
3
  def _cast_if_autocast_enabled(tensor):
@@ -53,4 +54,4 @@ class LPRMSNorm(RMSNorm):
53
  downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
54
  with torch.autocast(enabled=False, device_type=x.device.type):
55
  return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
56
- NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
 
1
+ from typing import Dict, Type
2
  import torch
3
 
4
  def _cast_if_autocast_enabled(tensor):
 
54
  downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
55
  with torch.autocast(enabled=False, device_type=x.device.type):
56
  return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
57
+ NORM_CLASS_REGISTRY: Dict[str, Type[torch.nn.Module]] = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
param_init_fns.py CHANGED
@@ -5,7 +5,12 @@ from functools import partial
5
  from typing import Optional, Tuple, Union
6
  import torch
7
  from torch import nn
 
8
  from .norm import NORM_CLASS_REGISTRY
9
 
10
  def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
11
  del kwargs
@@ -44,7 +49,7 @@ def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model:
44
  if init_div_is_residual is not False:
45
  if verbose > 1:
46
  warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')
47
- if isinstance(module, nn.Linear):
48
  if hasattr(module, '_fused'):
49
  fused_init_helper_(module, init_fn_)
50
  else:
@@ -114,6 +119,19 @@ def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model:
114
  module.out_proj.weight.div_(div_is_residual)
115
  if module.out_proj.bias is not None:
116
  torch.nn.init.zeros_(module.out_proj.bias)
117
  else:
118
  for _ in module.parameters(recurse=False):
119
  raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')
 
5
  from typing import Optional, Tuple, Union
6
  import torch
7
  from torch import nn
8
+ from .fc import FC_CLASS_REGISTRY
9
  from .norm import NORM_CLASS_REGISTRY
10
+ try:
11
+ import transformer_engine.pytorch as te
12
+ except:
13
+ te = None
14
 
15
  def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
16
  del kwargs
 
49
  if init_div_is_residual is not False:
50
  if verbose > 1:
51
  warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')
52
+ if isinstance(module, tuple(set(FC_CLASS_REGISTRY.values()))):
53
  if hasattr(module, '_fused'):
54
  fused_init_helper_(module, init_fn_)
55
  else:
 
119
  module.out_proj.weight.div_(div_is_residual)
120
  if module.out_proj.bias is not None:
121
  torch.nn.init.zeros_(module.out_proj.bias)
122
+ elif te is not None and isinstance(module, te.LayerNormMLP):
123
+ if module.layer_norm_weight is not None:
124
+ torch.nn.init.ones_(module.layer_norm_weight)
125
+ if module.layer_norm_bias is not None:
126
+ torch.nn.init.zeros_(module.layer_norm_bias)
127
+ init_fn_(module.fc1_weight)
128
+ if module.fc1_bias is not None:
129
+ torch.nn.init.zeros_(module.fc1_bias)
130
+ init_fn_(module.fc2_weight)
131
+ if module.fc2_bias is not None:
132
+ torch.nn.init.zeros_(module.fc2_bias)
133
+ with torch.no_grad():
134
+ module.fc2_weight.div_(div_is_residual)
135
  else:
136
  for _ in module.parameters(recurse=False):
137
  raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')
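
The widened init dispatch above accepts any class registered in FC_CLASS_REGISTRY (and, when transformer_engine is present, handles te.LayerNormMLP separately). A torch-only sketch of the isinstance check it uses; the `fc` import assumes the module is importable by that name.

    import torch.nn as nn
    from fc import FC_CLASS_REGISTRY

    module = nn.Linear(64, 64)
    if isinstance(module, tuple(set(FC_CLASS_REGISTRY.values()))):
        nn.init.kaiming_normal_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
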
pytorch_model-00001-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c51240f27be83417d07e38ba6ab0541ed5560611291bf403c3064a5f5b830889
3
+ size 9901940807
pytorch_model-00002-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15a418e63d400aa3e9ddd5c4fe64378eaeed49ca20f2d09a4507cfb6209de2d1
3
+ size 9865240711
pytorch_model-00003-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ec366585fe0612110b1251ad7706e9d0d4841657c702b4393d3df6ca9b0f32a
3
+ size 9865240711
pytorch_model-00004-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:853b9552f51a9c076cd9f043f958c6c78802d9512f01dfb9f852b456008ddfc2
3
+ size 9865240711
pytorch_model-00005-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f394f487f012f860fa6e63504aa4ffdae670101b845b966bfb43951bcfb4798f
3
+ size 9865240711
pytorch_model-00006-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d07e4cabbbeeb55b072dab24d8b152a36fbc0ec5021e4211d584f871fea4791
3
+ size 9865240711
pytorch_model-00007-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c2c9d32c24b95b3d674e1d9a43688f4aed426dd1bd54b6038c63d3c0baf282d
3
+ size 9865240711
pytorch_model-00008-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d80b8726b77a8c55a88ca09bf74f85028f6c0bfe93890942da37f44336fcac3
3
+ size 9865240711
pytorch_model-00009-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f18fa796abea986040439eb79a6c8e40da7214b5bbb873cad047c09e30a954b5
3
+ size 9865240711
pytorch_model-00010-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:032ac755ca09655de532d5b2df927e79e8143c102ccf2bec9da54d64b0e51708
3
+ size 9865240711
pytorch_model-00011-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eaf5276708ad788f6f8e1a73f77b36f54a4413ff1277b1c72cdf8c999d286fea
3
+ size 9865240711
pytorch_model-00012-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46ee5a0ab0d96c7b904be86d8f2f7afae664a3cfd9bfaa3cca08ea3419dd7273
3
+ size 9865240711
pytorch_model-00013-of-00013.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a2f8577fb3fe785e377e032b886d5aa44d5a4786a128d478a66b7d6291369d6
3
+ size 1644197388
pytorch_model.bin.index.json CHANGED
@@ -1,297 +1,298 @@
1
  {
2
  "metadata": {
3
- "total_size": 59914401792
4
  },
5
  "weight_map": {
6
- "transformer.blocks.0.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
7
- "transformer.blocks.0.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
8
- "transformer.blocks.0.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
9
- "transformer.blocks.0.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
10
- "transformer.blocks.0.norm_1.weight": "pytorch_model-00001-of-00007.bin",
11
- "transformer.blocks.0.norm_2.weight": "pytorch_model-00001-of-00007.bin",
12
- "transformer.blocks.1.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
13
- "transformer.blocks.1.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
14
- "transformer.blocks.1.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
15
- "transformer.blocks.1.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
16
- "transformer.blocks.1.norm_1.weight": "pytorch_model-00001-of-00007.bin",
17
- "transformer.blocks.1.norm_2.weight": "pytorch_model-00001-of-00007.bin",
18
- "transformer.blocks.10.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
19
- "transformer.blocks.10.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
20
- "transformer.blocks.10.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
21
- "transformer.blocks.10.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
22
- "transformer.blocks.10.norm_1.weight": "pytorch_model-00002-of-00007.bin",
23
- "transformer.blocks.10.norm_2.weight": "pytorch_model-00002-of-00007.bin",
24
- "transformer.blocks.11.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
25
- "transformer.blocks.11.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
26
- "transformer.blocks.11.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
27
- "transformer.blocks.11.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
28
- "transformer.blocks.11.norm_1.weight": "pytorch_model-00002-of-00007.bin",
29
- "transformer.blocks.11.norm_2.weight": "pytorch_model-00002-of-00007.bin",
30
- "transformer.blocks.12.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
31
- "transformer.blocks.12.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
32
- "transformer.blocks.12.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
33
- "transformer.blocks.12.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
34
- "transformer.blocks.12.norm_1.weight": "pytorch_model-00002-of-00007.bin",
35
- "transformer.blocks.12.norm_2.weight": "pytorch_model-00002-of-00007.bin",
36
- "transformer.blocks.13.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
37
- "transformer.blocks.13.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
38
- "transformer.blocks.13.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
39
- "transformer.blocks.13.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
40
- "transformer.blocks.13.norm_1.weight": "pytorch_model-00002-of-00007.bin",
41
- "transformer.blocks.13.norm_2.weight": "pytorch_model-00002-of-00007.bin",
42
- "transformer.blocks.14.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
43
- "transformer.blocks.14.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
44
- "transformer.blocks.14.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
45
- "transformer.blocks.14.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
46
- "transformer.blocks.14.norm_1.weight": "pytorch_model-00002-of-00007.bin",
47
- "transformer.blocks.14.norm_2.weight": "pytorch_model-00002-of-00007.bin",
48
- "transformer.blocks.15.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
49
- "transformer.blocks.15.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
50
- "transformer.blocks.15.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
51
- "transformer.blocks.15.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
52
- "transformer.blocks.15.norm_1.weight": "pytorch_model-00002-of-00007.bin",
53
- "transformer.blocks.15.norm_2.weight": "pytorch_model-00002-of-00007.bin",
54
- "transformer.blocks.16.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
55
- "transformer.blocks.16.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
56
- "transformer.blocks.16.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
57
- "transformer.blocks.16.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
58
- "transformer.blocks.16.norm_1.weight": "pytorch_model-00003-of-00007.bin",
59
- "transformer.blocks.16.norm_2.weight": "pytorch_model-00003-of-00007.bin",
60
- "transformer.blocks.17.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
61
- "transformer.blocks.17.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
62
- "transformer.blocks.17.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
63
- "transformer.blocks.17.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
64
- "transformer.blocks.17.norm_1.weight": "pytorch_model-00003-of-00007.bin",
65
- "transformer.blocks.17.norm_2.weight": "pytorch_model-00003-of-00007.bin",
66
- "transformer.blocks.18.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
67
- "transformer.blocks.18.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
68
- "transformer.blocks.18.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
69
- "transformer.blocks.18.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
70
- "transformer.blocks.18.norm_1.weight": "pytorch_model-00003-of-00007.bin",
71
- "transformer.blocks.18.norm_2.weight": "pytorch_model-00003-of-00007.bin",
72
- "transformer.blocks.19.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
73
- "transformer.blocks.19.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
74
- "transformer.blocks.19.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
75
- "transformer.blocks.19.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
76
- "transformer.blocks.19.norm_1.weight": "pytorch_model-00003-of-00007.bin",
77
- "transformer.blocks.19.norm_2.weight": "pytorch_model-00003-of-00007.bin",
78
- "transformer.blocks.2.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
79
- "transformer.blocks.2.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
80
- "transformer.blocks.2.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
81
- "transformer.blocks.2.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
82
- "transformer.blocks.2.norm_1.weight": "pytorch_model-00001-of-00007.bin",
83
- "transformer.blocks.2.norm_2.weight": "pytorch_model-00001-of-00007.bin",
84
- "transformer.blocks.20.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
85
- "transformer.blocks.20.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
86
- "transformer.blocks.20.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
87
- "transformer.blocks.20.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
88
- "transformer.blocks.20.norm_1.weight": "pytorch_model-00003-of-00007.bin",
89
- "transformer.blocks.20.norm_2.weight": "pytorch_model-00003-of-00007.bin",
90
- "transformer.blocks.21.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
91
- "transformer.blocks.21.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
92
- "transformer.blocks.21.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
93
- "transformer.blocks.21.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
94
- "transformer.blocks.21.norm_1.weight": "pytorch_model-00003-of-00007.bin",
95
- "transformer.blocks.21.norm_2.weight": "pytorch_model-00003-of-00007.bin",
96
- "transformer.blocks.22.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
97
- "transformer.blocks.22.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
98
- "transformer.blocks.22.ffn.down_proj.weight": "pytorch_model-00003-of-00007.bin",
99
- "transformer.blocks.22.ffn.up_proj.weight": "pytorch_model-00003-of-00007.bin",
100
- "transformer.blocks.22.norm_1.weight": "pytorch_model-00003-of-00007.bin",
101
- "transformer.blocks.22.norm_2.weight": "pytorch_model-00003-of-00007.bin",
102
- "transformer.blocks.23.attn.Wqkv.weight": "pytorch_model-00003-of-00007.bin",
103
- "transformer.blocks.23.attn.out_proj.weight": "pytorch_model-00003-of-00007.bin",
104
- "transformer.blocks.23.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
105
- "transformer.blocks.23.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
106
- "transformer.blocks.23.norm_1.weight": "pytorch_model-00003-of-00007.bin",
107
- "transformer.blocks.23.norm_2.weight": "pytorch_model-00003-of-00007.bin",
108
- "transformer.blocks.24.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
109
- "transformer.blocks.24.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
110
- "transformer.blocks.24.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
111
- "transformer.blocks.24.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
112
- "transformer.blocks.24.norm_1.weight": "pytorch_model-00004-of-00007.bin",
113
- "transformer.blocks.24.norm_2.weight": "pytorch_model-00004-of-00007.bin",
114
- "transformer.blocks.25.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
115
- "transformer.blocks.25.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
116
- "transformer.blocks.25.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
117
- "transformer.blocks.25.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
118
- "transformer.blocks.25.norm_1.weight": "pytorch_model-00004-of-00007.bin",
119
- "transformer.blocks.25.norm_2.weight": "pytorch_model-00004-of-00007.bin",
120
- "transformer.blocks.26.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
121
- "transformer.blocks.26.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
122
- "transformer.blocks.26.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
123
- "transformer.blocks.26.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
124
- "transformer.blocks.26.norm_1.weight": "pytorch_model-00004-of-00007.bin",
125
- "transformer.blocks.26.norm_2.weight": "pytorch_model-00004-of-00007.bin",
126
- "transformer.blocks.27.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
127
- "transformer.blocks.27.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
128
- "transformer.blocks.27.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
129
- "transformer.blocks.27.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
130
- "transformer.blocks.27.norm_1.weight": "pytorch_model-00004-of-00007.bin",
131
- "transformer.blocks.27.norm_2.weight": "pytorch_model-00004-of-00007.bin",
132
- "transformer.blocks.28.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
133
- "transformer.blocks.28.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
134
- "transformer.blocks.28.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
135
- "transformer.blocks.28.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
136
- "transformer.blocks.28.norm_1.weight": "pytorch_model-00004-of-00007.bin",
137
- "transformer.blocks.28.norm_2.weight": "pytorch_model-00004-of-00007.bin",
138
- "transformer.blocks.29.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
139
- "transformer.blocks.29.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
140
- "transformer.blocks.29.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
141
- "transformer.blocks.29.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
142
- "transformer.blocks.29.norm_1.weight": "pytorch_model-00004-of-00007.bin",
143
- "transformer.blocks.29.norm_2.weight": "pytorch_model-00004-of-00007.bin",
144
- "transformer.blocks.3.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
145
- "transformer.blocks.3.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
146
- "transformer.blocks.3.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
147
- "transformer.blocks.3.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
148
- "transformer.blocks.3.norm_1.weight": "pytorch_model-00001-of-00007.bin",
149
- "transformer.blocks.3.norm_2.weight": "pytorch_model-00001-of-00007.bin",
150
- "transformer.blocks.30.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
151
- "transformer.blocks.30.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
152
- "transformer.blocks.30.ffn.down_proj.weight": "pytorch_model-00004-of-00007.bin",
153
- "transformer.blocks.30.ffn.up_proj.weight": "pytorch_model-00004-of-00007.bin",
154
- "transformer.blocks.30.norm_1.weight": "pytorch_model-00004-of-00007.bin",
155
- "transformer.blocks.30.norm_2.weight": "pytorch_model-00004-of-00007.bin",
156
- "transformer.blocks.31.attn.Wqkv.weight": "pytorch_model-00004-of-00007.bin",
157
- "transformer.blocks.31.attn.out_proj.weight": "pytorch_model-00004-of-00007.bin",
158
- "transformer.blocks.31.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
159
- "transformer.blocks.31.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
160
- "transformer.blocks.31.norm_1.weight": "pytorch_model-00004-of-00007.bin",
161
- "transformer.blocks.31.norm_2.weight": "pytorch_model-00004-of-00007.bin",
162
- "transformer.blocks.32.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
163
- "transformer.blocks.32.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
164
- "transformer.blocks.32.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
165
- "transformer.blocks.32.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
166
- "transformer.blocks.32.norm_1.weight": "pytorch_model-00005-of-00007.bin",
167
- "transformer.blocks.32.norm_2.weight": "pytorch_model-00005-of-00007.bin",
168
- "transformer.blocks.33.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
169
- "transformer.blocks.33.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
170
- "transformer.blocks.33.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
171
- "transformer.blocks.33.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
172
- "transformer.blocks.33.norm_1.weight": "pytorch_model-00005-of-00007.bin",
173
- "transformer.blocks.33.norm_2.weight": "pytorch_model-00005-of-00007.bin",
174
- "transformer.blocks.34.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
175
- "transformer.blocks.34.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
176
- "transformer.blocks.34.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
177
- "transformer.blocks.34.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
178
- "transformer.blocks.34.norm_1.weight": "pytorch_model-00005-of-00007.bin",
179
- "transformer.blocks.34.norm_2.weight": "pytorch_model-00005-of-00007.bin",
180
- "transformer.blocks.35.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
181
- "transformer.blocks.35.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
182
- "transformer.blocks.35.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
183
- "transformer.blocks.35.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
184
- "transformer.blocks.35.norm_1.weight": "pytorch_model-00005-of-00007.bin",
185
- "transformer.blocks.35.norm_2.weight": "pytorch_model-00005-of-00007.bin",
186
- "transformer.blocks.36.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
187
- "transformer.blocks.36.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
188
- "transformer.blocks.36.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
189
- "transformer.blocks.36.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
190
- "transformer.blocks.36.norm_1.weight": "pytorch_model-00005-of-00007.bin",
191
- "transformer.blocks.36.norm_2.weight": "pytorch_model-00005-of-00007.bin",
192
- "transformer.blocks.37.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
193
- "transformer.blocks.37.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
194
- "transformer.blocks.37.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
195
- "transformer.blocks.37.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
196
- "transformer.blocks.37.norm_1.weight": "pytorch_model-00005-of-00007.bin",
197
- "transformer.blocks.37.norm_2.weight": "pytorch_model-00005-of-00007.bin",
198
- "transformer.blocks.38.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
199
- "transformer.blocks.38.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
200
- "transformer.blocks.38.ffn.down_proj.weight": "pytorch_model-00005-of-00007.bin",
201
- "transformer.blocks.38.ffn.up_proj.weight": "pytorch_model-00005-of-00007.bin",
202
- "transformer.blocks.38.norm_1.weight": "pytorch_model-00005-of-00007.bin",
203
- "transformer.blocks.38.norm_2.weight": "pytorch_model-00005-of-00007.bin",
204
- "transformer.blocks.39.attn.Wqkv.weight": "pytorch_model-00005-of-00007.bin",
205
- "transformer.blocks.39.attn.out_proj.weight": "pytorch_model-00005-of-00007.bin",
206
- "transformer.blocks.39.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
207
- "transformer.blocks.39.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
208
- "transformer.blocks.39.norm_1.weight": "pytorch_model-00005-of-00007.bin",
209
- "transformer.blocks.39.norm_2.weight": "pytorch_model-00005-of-00007.bin",
210
- "transformer.blocks.4.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
211
- "transformer.blocks.4.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
212
- "transformer.blocks.4.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
213
- "transformer.blocks.4.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
214
- "transformer.blocks.4.norm_1.weight": "pytorch_model-00001-of-00007.bin",
215
- "transformer.blocks.4.norm_2.weight": "pytorch_model-00001-of-00007.bin",
216
- "transformer.blocks.40.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
217
- "transformer.blocks.40.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
218
- "transformer.blocks.40.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
219
- "transformer.blocks.40.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
220
- "transformer.blocks.40.norm_1.weight": "pytorch_model-00006-of-00007.bin",
221
- "transformer.blocks.40.norm_2.weight": "pytorch_model-00006-of-00007.bin",
222
- "transformer.blocks.41.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
223
- "transformer.blocks.41.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
224
- "transformer.blocks.41.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
225
- "transformer.blocks.41.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
226
- "transformer.blocks.41.norm_1.weight": "pytorch_model-00006-of-00007.bin",
227
- "transformer.blocks.41.norm_2.weight": "pytorch_model-00006-of-00007.bin",
228
- "transformer.blocks.42.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
229
- "transformer.blocks.42.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
230
- "transformer.blocks.42.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
231
- "transformer.blocks.42.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
232
- "transformer.blocks.42.norm_1.weight": "pytorch_model-00006-of-00007.bin",
233
- "transformer.blocks.42.norm_2.weight": "pytorch_model-00006-of-00007.bin",
234
- "transformer.blocks.43.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
235
- "transformer.blocks.43.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
236
- "transformer.blocks.43.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
237
- "transformer.blocks.43.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
238
- "transformer.blocks.43.norm_1.weight": "pytorch_model-00006-of-00007.bin",
239
- "transformer.blocks.43.norm_2.weight": "pytorch_model-00006-of-00007.bin",
240
- "transformer.blocks.44.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
241
- "transformer.blocks.44.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
242
- "transformer.blocks.44.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
243
- "transformer.blocks.44.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
244
- "transformer.blocks.44.norm_1.weight": "pytorch_model-00006-of-00007.bin",
245
- "transformer.blocks.44.norm_2.weight": "pytorch_model-00006-of-00007.bin",
246
- "transformer.blocks.45.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
247
- "transformer.blocks.45.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
248
- "transformer.blocks.45.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
249
- "transformer.blocks.45.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
250
- "transformer.blocks.45.norm_1.weight": "pytorch_model-00006-of-00007.bin",
251
- "transformer.blocks.45.norm_2.weight": "pytorch_model-00006-of-00007.bin",
252
- "transformer.blocks.46.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
253
- "transformer.blocks.46.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
254
- "transformer.blocks.46.ffn.down_proj.weight": "pytorch_model-00006-of-00007.bin",
255
- "transformer.blocks.46.ffn.up_proj.weight": "pytorch_model-00006-of-00007.bin",
256
- "transformer.blocks.46.norm_1.weight": "pytorch_model-00006-of-00007.bin",
257
- "transformer.blocks.46.norm_2.weight": "pytorch_model-00006-of-00007.bin",
258
- "transformer.blocks.47.attn.Wqkv.weight": "pytorch_model-00006-of-00007.bin",
259
- "transformer.blocks.47.attn.out_proj.weight": "pytorch_model-00006-of-00007.bin",
260
- "transformer.blocks.47.ffn.down_proj.weight": "pytorch_model-00007-of-00007.bin",
261
- "transformer.blocks.47.ffn.up_proj.weight": "pytorch_model-00007-of-00007.bin",
262
- "transformer.blocks.47.norm_1.weight": "pytorch_model-00006-of-00007.bin",
263
- "transformer.blocks.47.norm_2.weight": "pytorch_model-00006-of-00007.bin",
264
- "transformer.blocks.5.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
265
- "transformer.blocks.5.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
266
- "transformer.blocks.5.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
267
- "transformer.blocks.5.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
268
- "transformer.blocks.5.norm_1.weight": "pytorch_model-00001-of-00007.bin",
269
- "transformer.blocks.5.norm_2.weight": "pytorch_model-00001-of-00007.bin",
270
- "transformer.blocks.6.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
271
- "transformer.blocks.6.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
272
- "transformer.blocks.6.ffn.down_proj.weight": "pytorch_model-00001-of-00007.bin",
273
- "transformer.blocks.6.ffn.up_proj.weight": "pytorch_model-00001-of-00007.bin",
274
- "transformer.blocks.6.norm_1.weight": "pytorch_model-00001-of-00007.bin",
275
- "transformer.blocks.6.norm_2.weight": "pytorch_model-00001-of-00007.bin",
276
- "transformer.blocks.7.attn.Wqkv.weight": "pytorch_model-00001-of-00007.bin",
277
- "transformer.blocks.7.attn.out_proj.weight": "pytorch_model-00001-of-00007.bin",
278
- "transformer.blocks.7.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
279
- "transformer.blocks.7.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
280
- "transformer.blocks.7.norm_1.weight": "pytorch_model-00001-of-00007.bin",
281
- "transformer.blocks.7.norm_2.weight": "pytorch_model-00001-of-00007.bin",
282
- "transformer.blocks.8.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
283
- "transformer.blocks.8.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
284
- "transformer.blocks.8.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
285
- "transformer.blocks.8.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
286
- "transformer.blocks.8.norm_1.weight": "pytorch_model-00002-of-00007.bin",
287
- "transformer.blocks.8.norm_2.weight": "pytorch_model-00002-of-00007.bin",
288
- "transformer.blocks.9.attn.Wqkv.weight": "pytorch_model-00002-of-00007.bin",
289
- "transformer.blocks.9.attn.out_proj.weight": "pytorch_model-00002-of-00007.bin",
290
- "transformer.blocks.9.ffn.down_proj.weight": "pytorch_model-00002-of-00007.bin",
291
- "transformer.blocks.9.ffn.up_proj.weight": "pytorch_model-00002-of-00007.bin",
292
- "transformer.blocks.9.norm_1.weight": "pytorch_model-00002-of-00007.bin",
293
- "transformer.blocks.9.norm_2.weight": "pytorch_model-00002-of-00007.bin",
294
- "transformer.norm_f.weight": "pytorch_model-00007-of-00007.bin",
295
- "transformer.wte.weight": "pytorch_model-00001-of-00007.bin"
 
296
  }
297
  }
 
1
  {
2
  "metadata": {
3
+ "total_size": 120063684608
4
  },
5
  "weight_map": {
6
+ "transformer.blocks.0.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
7
+ "transformer.blocks.0.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
8
+ "transformer.blocks.0.ffn.down_proj.weight": "pytorch_model-00001-of-00013.bin",
9
+ "transformer.blocks.0.ffn.up_proj.weight": "pytorch_model-00001-of-00013.bin",
10
+ "transformer.blocks.0.norm_1.weight": "pytorch_model-00001-of-00013.bin",
11
+ "transformer.blocks.0.norm_2.weight": "pytorch_model-00001-of-00013.bin",
12
+ "transformer.blocks.1.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
13
+ "transformer.blocks.1.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
14
+ "transformer.blocks.1.ffn.down_proj.weight": "pytorch_model-00001-of-00013.bin",
15
+ "transformer.blocks.1.ffn.up_proj.weight": "pytorch_model-00001-of-00013.bin",
16
+ "transformer.blocks.1.norm_1.weight": "pytorch_model-00001-of-00013.bin",
17
+ "transformer.blocks.1.norm_2.weight": "pytorch_model-00001-of-00013.bin",
18
+ "transformer.blocks.10.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
19
+ "transformer.blocks.10.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
20
+ "transformer.blocks.10.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
21
+ "transformer.blocks.10.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
22
+ "transformer.blocks.10.norm_1.weight": "pytorch_model-00003-of-00013.bin",
23
+ "transformer.blocks.10.norm_2.weight": "pytorch_model-00003-of-00013.bin",
24
+ "transformer.blocks.11.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
25
+ "transformer.blocks.11.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
26
+ "transformer.blocks.11.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
27
+ "transformer.blocks.11.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
28
+ "transformer.blocks.11.norm_1.weight": "pytorch_model-00003-of-00013.bin",
29
+ "transformer.blocks.11.norm_2.weight": "pytorch_model-00003-of-00013.bin",
30
+ "transformer.blocks.12.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
31
+ "transformer.blocks.12.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
32
+ "transformer.blocks.12.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
33
+ "transformer.blocks.12.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
34
+ "transformer.blocks.12.norm_1.weight": "pytorch_model-00004-of-00013.bin",
35
+ "transformer.blocks.12.norm_2.weight": "pytorch_model-00004-of-00013.bin",
36
+ "transformer.blocks.13.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
37
+ "transformer.blocks.13.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
38
+ "transformer.blocks.13.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
39
+ "transformer.blocks.13.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
40
+ "transformer.blocks.13.norm_1.weight": "pytorch_model-00004-of-00013.bin",
41
+ "transformer.blocks.13.norm_2.weight": "pytorch_model-00004-of-00013.bin",
42
+ "transformer.blocks.14.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
43
+ "transformer.blocks.14.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
44
+ "transformer.blocks.14.ffn.down_proj.weight": "pytorch_model-00004-of-00013.bin",
45
+ "transformer.blocks.14.ffn.up_proj.weight": "pytorch_model-00004-of-00013.bin",
46
+ "transformer.blocks.14.norm_1.weight": "pytorch_model-00004-of-00013.bin",
47
+ "transformer.blocks.14.norm_2.weight": "pytorch_model-00004-of-00013.bin",
48
+ "transformer.blocks.15.attn.Wqkv.weight": "pytorch_model-00004-of-00013.bin",
49
+ "transformer.blocks.15.attn.out_proj.weight": "pytorch_model-00004-of-00013.bin",
50
+ "transformer.blocks.15.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
51
+ "transformer.blocks.15.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
52
+ "transformer.blocks.15.norm_1.weight": "pytorch_model-00004-of-00013.bin",
53
+ "transformer.blocks.15.norm_2.weight": "pytorch_model-00004-of-00013.bin",
54
+ "transformer.blocks.16.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
55
+ "transformer.blocks.16.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
56
+ "transformer.blocks.16.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
57
+ "transformer.blocks.16.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
58
+ "transformer.blocks.16.norm_1.weight": "pytorch_model-00005-of-00013.bin",
59
+ "transformer.blocks.16.norm_2.weight": "pytorch_model-00005-of-00013.bin",
60
+ "transformer.blocks.17.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
61
+ "transformer.blocks.17.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
62
+ "transformer.blocks.17.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
63
+ "transformer.blocks.17.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
64
+ "transformer.blocks.17.norm_1.weight": "pytorch_model-00005-of-00013.bin",
65
+ "transformer.blocks.17.norm_2.weight": "pytorch_model-00005-of-00013.bin",
66
+ "transformer.blocks.18.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
67
+ "transformer.blocks.18.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
68
+ "transformer.blocks.18.ffn.down_proj.weight": "pytorch_model-00005-of-00013.bin",
69
+ "transformer.blocks.18.ffn.up_proj.weight": "pytorch_model-00005-of-00013.bin",
70
+ "transformer.blocks.18.norm_1.weight": "pytorch_model-00005-of-00013.bin",
71
+ "transformer.blocks.18.norm_2.weight": "pytorch_model-00005-of-00013.bin",
72
+ "transformer.blocks.19.attn.Wqkv.weight": "pytorch_model-00005-of-00013.bin",
73
+ "transformer.blocks.19.attn.out_proj.weight": "pytorch_model-00005-of-00013.bin",
74
+ "transformer.blocks.19.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
75
+ "transformer.blocks.19.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
76
+ "transformer.blocks.19.norm_1.weight": "pytorch_model-00005-of-00013.bin",
77
+ "transformer.blocks.19.norm_2.weight": "pytorch_model-00005-of-00013.bin",
78
+ "transformer.blocks.2.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
79
+ "transformer.blocks.2.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
80
+ "transformer.blocks.2.ffn.down_proj.weight": "pytorch_model-00001-of-00013.bin",
81
+ "transformer.blocks.2.ffn.up_proj.weight": "pytorch_model-00001-of-00013.bin",
82
+ "transformer.blocks.2.norm_1.weight": "pytorch_model-00001-of-00013.bin",
83
+ "transformer.blocks.2.norm_2.weight": "pytorch_model-00001-of-00013.bin",
84
+ "transformer.blocks.20.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
85
+ "transformer.blocks.20.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
86
+ "transformer.blocks.20.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
87
+ "transformer.blocks.20.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
88
+ "transformer.blocks.20.norm_1.weight": "pytorch_model-00006-of-00013.bin",
89
+ "transformer.blocks.20.norm_2.weight": "pytorch_model-00006-of-00013.bin",
90
+ "transformer.blocks.21.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
91
+ "transformer.blocks.21.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
92
+ "transformer.blocks.21.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
93
+ "transformer.blocks.21.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
94
+ "transformer.blocks.21.norm_1.weight": "pytorch_model-00006-of-00013.bin",
95
+ "transformer.blocks.21.norm_2.weight": "pytorch_model-00006-of-00013.bin",
96
+ "transformer.blocks.22.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
97
+ "transformer.blocks.22.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
98
+ "transformer.blocks.22.ffn.down_proj.weight": "pytorch_model-00006-of-00013.bin",
99
+ "transformer.blocks.22.ffn.up_proj.weight": "pytorch_model-00006-of-00013.bin",
100
+ "transformer.blocks.22.norm_1.weight": "pytorch_model-00006-of-00013.bin",
101
+ "transformer.blocks.22.norm_2.weight": "pytorch_model-00006-of-00013.bin",
102
+ "transformer.blocks.23.attn.Wqkv.weight": "pytorch_model-00006-of-00013.bin",
103
+ "transformer.blocks.23.attn.out_proj.weight": "pytorch_model-00006-of-00013.bin",
104
+ "transformer.blocks.23.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
105
+ "transformer.blocks.23.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
106
+ "transformer.blocks.23.norm_1.weight": "pytorch_model-00006-of-00013.bin",
107
+ "transformer.blocks.23.norm_2.weight": "pytorch_model-00006-of-00013.bin",
108
+ "transformer.blocks.24.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
109
+ "transformer.blocks.24.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
110
+ "transformer.blocks.24.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
111
+ "transformer.blocks.24.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
112
+ "transformer.blocks.24.norm_1.weight": "pytorch_model-00007-of-00013.bin",
113
+ "transformer.blocks.24.norm_2.weight": "pytorch_model-00007-of-00013.bin",
114
+ "transformer.blocks.25.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
115
+ "transformer.blocks.25.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
116
+ "transformer.blocks.25.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
117
+ "transformer.blocks.25.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
118
+ "transformer.blocks.25.norm_1.weight": "pytorch_model-00007-of-00013.bin",
119
+ "transformer.blocks.25.norm_2.weight": "pytorch_model-00007-of-00013.bin",
120
+ "transformer.blocks.26.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
121
+ "transformer.blocks.26.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
122
+ "transformer.blocks.26.ffn.down_proj.weight": "pytorch_model-00007-of-00013.bin",
123
+ "transformer.blocks.26.ffn.up_proj.weight": "pytorch_model-00007-of-00013.bin",
124
+ "transformer.blocks.26.norm_1.weight": "pytorch_model-00007-of-00013.bin",
125
+ "transformer.blocks.26.norm_2.weight": "pytorch_model-00007-of-00013.bin",
126
+ "transformer.blocks.27.attn.Wqkv.weight": "pytorch_model-00007-of-00013.bin",
127
+ "transformer.blocks.27.attn.out_proj.weight": "pytorch_model-00007-of-00013.bin",
128
+ "transformer.blocks.27.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
129
+ "transformer.blocks.27.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
130
+ "transformer.blocks.27.norm_1.weight": "pytorch_model-00007-of-00013.bin",
131
+ "transformer.blocks.27.norm_2.weight": "pytorch_model-00007-of-00013.bin",
132
+ "transformer.blocks.28.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
133
+ "transformer.blocks.28.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
134
+ "transformer.blocks.28.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
135
+ "transformer.blocks.28.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
136
+ "transformer.blocks.28.norm_1.weight": "pytorch_model-00008-of-00013.bin",
137
+ "transformer.blocks.28.norm_2.weight": "pytorch_model-00008-of-00013.bin",
138
+ "transformer.blocks.29.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
139
+ "transformer.blocks.29.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
140
+ "transformer.blocks.29.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
141
+ "transformer.blocks.29.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
142
+ "transformer.blocks.29.norm_1.weight": "pytorch_model-00008-of-00013.bin",
143
+ "transformer.blocks.29.norm_2.weight": "pytorch_model-00008-of-00013.bin",
144
+ "transformer.blocks.3.attn.Wqkv.weight": "pytorch_model-00001-of-00013.bin",
145
+ "transformer.blocks.3.attn.out_proj.weight": "pytorch_model-00001-of-00013.bin",
146
+ "transformer.blocks.3.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
147
+ "transformer.blocks.3.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
148
+ "transformer.blocks.3.norm_1.weight": "pytorch_model-00001-of-00013.bin",
149
+ "transformer.blocks.3.norm_2.weight": "pytorch_model-00001-of-00013.bin",
150
+ "transformer.blocks.30.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
151
+ "transformer.blocks.30.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
152
+ "transformer.blocks.30.ffn.down_proj.weight": "pytorch_model-00008-of-00013.bin",
153
+ "transformer.blocks.30.ffn.up_proj.weight": "pytorch_model-00008-of-00013.bin",
154
+ "transformer.blocks.30.norm_1.weight": "pytorch_model-00008-of-00013.bin",
155
+ "transformer.blocks.30.norm_2.weight": "pytorch_model-00008-of-00013.bin",
156
+ "transformer.blocks.31.attn.Wqkv.weight": "pytorch_model-00008-of-00013.bin",
157
+ "transformer.blocks.31.attn.out_proj.weight": "pytorch_model-00008-of-00013.bin",
158
+ "transformer.blocks.31.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
159
+ "transformer.blocks.31.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
160
+ "transformer.blocks.31.norm_1.weight": "pytorch_model-00008-of-00013.bin",
161
+ "transformer.blocks.31.norm_2.weight": "pytorch_model-00008-of-00013.bin",
162
+ "transformer.blocks.32.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
163
+ "transformer.blocks.32.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
164
+ "transformer.blocks.32.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
165
+ "transformer.blocks.32.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
166
+ "transformer.blocks.32.norm_1.weight": "pytorch_model-00009-of-00013.bin",
167
+ "transformer.blocks.32.norm_2.weight": "pytorch_model-00009-of-00013.bin",
168
+ "transformer.blocks.33.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
169
+ "transformer.blocks.33.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
170
+ "transformer.blocks.33.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
171
+ "transformer.blocks.33.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
172
+ "transformer.blocks.33.norm_1.weight": "pytorch_model-00009-of-00013.bin",
173
+ "transformer.blocks.33.norm_2.weight": "pytorch_model-00009-of-00013.bin",
174
+ "transformer.blocks.34.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
175
+ "transformer.blocks.34.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
176
+ "transformer.blocks.34.ffn.down_proj.weight": "pytorch_model-00009-of-00013.bin",
177
+ "transformer.blocks.34.ffn.up_proj.weight": "pytorch_model-00009-of-00013.bin",
178
+ "transformer.blocks.34.norm_1.weight": "pytorch_model-00009-of-00013.bin",
179
+ "transformer.blocks.34.norm_2.weight": "pytorch_model-00009-of-00013.bin",
180
+ "transformer.blocks.35.attn.Wqkv.weight": "pytorch_model-00009-of-00013.bin",
181
+ "transformer.blocks.35.attn.out_proj.weight": "pytorch_model-00009-of-00013.bin",
182
+ "transformer.blocks.35.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
183
+ "transformer.blocks.35.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
184
+ "transformer.blocks.35.norm_1.weight": "pytorch_model-00009-of-00013.bin",
185
+ "transformer.blocks.35.norm_2.weight": "pytorch_model-00009-of-00013.bin",
186
+ "transformer.blocks.36.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
187
+ "transformer.blocks.36.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
188
+ "transformer.blocks.36.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
189
+ "transformer.blocks.36.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
190
+ "transformer.blocks.36.norm_1.weight": "pytorch_model-00010-of-00013.bin",
191
+ "transformer.blocks.36.norm_2.weight": "pytorch_model-00010-of-00013.bin",
192
+ "transformer.blocks.37.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
193
+ "transformer.blocks.37.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
194
+ "transformer.blocks.37.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
195
+ "transformer.blocks.37.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
196
+ "transformer.blocks.37.norm_1.weight": "pytorch_model-00010-of-00013.bin",
197
+ "transformer.blocks.37.norm_2.weight": "pytorch_model-00010-of-00013.bin",
198
+ "transformer.blocks.38.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
199
+ "transformer.blocks.38.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
200
+ "transformer.blocks.38.ffn.down_proj.weight": "pytorch_model-00010-of-00013.bin",
201
+ "transformer.blocks.38.ffn.up_proj.weight": "pytorch_model-00010-of-00013.bin",
202
+ "transformer.blocks.38.norm_1.weight": "pytorch_model-00010-of-00013.bin",
203
+ "transformer.blocks.38.norm_2.weight": "pytorch_model-00010-of-00013.bin",
204
+ "transformer.blocks.39.attn.Wqkv.weight": "pytorch_model-00010-of-00013.bin",
205
+ "transformer.blocks.39.attn.out_proj.weight": "pytorch_model-00010-of-00013.bin",
206
+ "transformer.blocks.39.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
207
+ "transformer.blocks.39.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
208
+ "transformer.blocks.39.norm_1.weight": "pytorch_model-00010-of-00013.bin",
209
+ "transformer.blocks.39.norm_2.weight": "pytorch_model-00010-of-00013.bin",
210
+ "transformer.blocks.4.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
211
+ "transformer.blocks.4.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
212
+ "transformer.blocks.4.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
213
+ "transformer.blocks.4.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
214
+ "transformer.blocks.4.norm_1.weight": "pytorch_model-00002-of-00013.bin",
215
+ "transformer.blocks.4.norm_2.weight": "pytorch_model-00002-of-00013.bin",
216
+ "transformer.blocks.40.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
217
+ "transformer.blocks.40.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
218
+ "transformer.blocks.40.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
219
+ "transformer.blocks.40.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
220
+ "transformer.blocks.40.norm_1.weight": "pytorch_model-00011-of-00013.bin",
221
+ "transformer.blocks.40.norm_2.weight": "pytorch_model-00011-of-00013.bin",
222
+ "transformer.blocks.41.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
223
+ "transformer.blocks.41.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
224
+ "transformer.blocks.41.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
225
+ "transformer.blocks.41.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
226
+ "transformer.blocks.41.norm_1.weight": "pytorch_model-00011-of-00013.bin",
227
+ "transformer.blocks.41.norm_2.weight": "pytorch_model-00011-of-00013.bin",
228
+ "transformer.blocks.42.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
229
+ "transformer.blocks.42.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
230
+ "transformer.blocks.42.ffn.down_proj.weight": "pytorch_model-00011-of-00013.bin",
231
+ "transformer.blocks.42.ffn.up_proj.weight": "pytorch_model-00011-of-00013.bin",
232
+ "transformer.blocks.42.norm_1.weight": "pytorch_model-00011-of-00013.bin",
233
+ "transformer.blocks.42.norm_2.weight": "pytorch_model-00011-of-00013.bin",
234
+ "transformer.blocks.43.attn.Wqkv.weight": "pytorch_model-00011-of-00013.bin",
235
+ "transformer.blocks.43.attn.out_proj.weight": "pytorch_model-00011-of-00013.bin",
236
+ "transformer.blocks.43.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
237
+ "transformer.blocks.43.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
238
+ "transformer.blocks.43.norm_1.weight": "pytorch_model-00011-of-00013.bin",
239
+ "transformer.blocks.43.norm_2.weight": "pytorch_model-00011-of-00013.bin",
240
+ "transformer.blocks.44.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
241
+ "transformer.blocks.44.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
242
+ "transformer.blocks.44.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
243
+ "transformer.blocks.44.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
244
+ "transformer.blocks.44.norm_1.weight": "pytorch_model-00012-of-00013.bin",
245
+ "transformer.blocks.44.norm_2.weight": "pytorch_model-00012-of-00013.bin",
246
+ "transformer.blocks.45.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
247
+ "transformer.blocks.45.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
248
+ "transformer.blocks.45.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
249
+ "transformer.blocks.45.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
250
+ "transformer.blocks.45.norm_1.weight": "pytorch_model-00012-of-00013.bin",
251
+ "transformer.blocks.45.norm_2.weight": "pytorch_model-00012-of-00013.bin",
252
+ "transformer.blocks.46.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
253
+ "transformer.blocks.46.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
254
+ "transformer.blocks.46.ffn.down_proj.weight": "pytorch_model-00012-of-00013.bin",
255
+ "transformer.blocks.46.ffn.up_proj.weight": "pytorch_model-00012-of-00013.bin",
256
+ "transformer.blocks.46.norm_1.weight": "pytorch_model-00012-of-00013.bin",
257
+ "transformer.blocks.46.norm_2.weight": "pytorch_model-00012-of-00013.bin",
258
+ "transformer.blocks.47.attn.Wqkv.weight": "pytorch_model-00012-of-00013.bin",
259
+ "transformer.blocks.47.attn.out_proj.weight": "pytorch_model-00012-of-00013.bin",
260
+ "transformer.blocks.47.ffn.down_proj.weight": "pytorch_model-00013-of-00013.bin",
261
+ "transformer.blocks.47.ffn.up_proj.weight": "pytorch_model-00013-of-00013.bin",
262
+ "transformer.blocks.47.norm_1.weight": "pytorch_model-00012-of-00013.bin",
263
+ "transformer.blocks.47.norm_2.weight": "pytorch_model-00012-of-00013.bin",
264
+ "transformer.blocks.5.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
265
+ "transformer.blocks.5.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
266
+ "transformer.blocks.5.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
267
+ "transformer.blocks.5.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
268
+ "transformer.blocks.5.norm_1.weight": "pytorch_model-00002-of-00013.bin",
269
+ "transformer.blocks.5.norm_2.weight": "pytorch_model-00002-of-00013.bin",
270
+ "transformer.blocks.6.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
271
+ "transformer.blocks.6.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
272
+ "transformer.blocks.6.ffn.down_proj.weight": "pytorch_model-00002-of-00013.bin",
273
+ "transformer.blocks.6.ffn.up_proj.weight": "pytorch_model-00002-of-00013.bin",
274
+ "transformer.blocks.6.norm_1.weight": "pytorch_model-00002-of-00013.bin",
275
+ "transformer.blocks.6.norm_2.weight": "pytorch_model-00002-of-00013.bin",
276
+ "transformer.blocks.7.attn.Wqkv.weight": "pytorch_model-00002-of-00013.bin",
277
+ "transformer.blocks.7.attn.out_proj.weight": "pytorch_model-00002-of-00013.bin",
278
+ "transformer.blocks.7.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
279
+ "transformer.blocks.7.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
280
+ "transformer.blocks.7.norm_1.weight": "pytorch_model-00002-of-00013.bin",
281
+ "transformer.blocks.7.norm_2.weight": "pytorch_model-00002-of-00013.bin",
282
+ "transformer.blocks.8.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
283
+ "transformer.blocks.8.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
284
+ "transformer.blocks.8.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
285
+ "transformer.blocks.8.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
286
+ "transformer.blocks.8.norm_1.weight": "pytorch_model-00003-of-00013.bin",
287
+ "transformer.blocks.8.norm_2.weight": "pytorch_model-00003-of-00013.bin",
288
+ "transformer.blocks.9.attn.Wqkv.weight": "pytorch_model-00003-of-00013.bin",
289
+ "transformer.blocks.9.attn.out_proj.weight": "pytorch_model-00003-of-00013.bin",
290
+ "transformer.blocks.9.ffn.down_proj.weight": "pytorch_model-00003-of-00013.bin",
291
+ "transformer.blocks.9.ffn.up_proj.weight": "pytorch_model-00003-of-00013.bin",
292
+ "transformer.blocks.9.norm_1.weight": "pytorch_model-00003-of-00013.bin",
293
+ "transformer.blocks.9.norm_2.weight": "pytorch_model-00003-of-00013.bin",
294
+ "transformer.norm_f.weight": "pytorch_model-00013-of-00013.bin",
295
+ "transformer.wpe.weight": "pytorch_model-00001-of-00013.bin",
296
+ "transformer.wte.weight": "pytorch_model-00001-of-00013.bin"
297
  }
298
  }
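
The weight_map above is what tells a loader which shard file holds each parameter; this commit simply redistributes the same tensors across 13 shards instead of 7. As a minimal sketch (assuming the standard Hugging Face sharded-checkpoint layout, with the index saved under its usual name pytorch_model.bin.index.json next to the shard files), a single parameter can be resolved to its shard and loaded like this:

import json
import torch

# Hypothetical local paths: the index and shard files are assumed to sit in
# the current working directory after the checkpoint has been downloaded.
with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

param_name = "transformer.blocks.47.ffn.down_proj.weight"
shard_file = index["weight_map"][param_name]  # e.g. "pytorch_model-00013-of-00013.bin"

# Load only the shard that contains the requested tensor, not the full model.
state_dict = torch.load(shard_file, map_location="cpu")
tensor = state_dict[param_name]
print(param_name, tuple(tensor.shape))

The metadata.total_size field (120063684608 bytes, roughly 120 GB) is the combined size of all weight tensors across the shards.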