jiang719 committed on
Commit 2fed580
1 Parent(s): 8039218

Upload folder using huggingface_hub

__init__.py ADDED
File without changes
config.json ADDED
@@ -0,0 +1,60 @@
+ {
+   "_flash_attn_2_enabled": true,
+   "_name_or_path": "None",
+   "additional_vocab_size": 2,
+   "alpha_initializer": "zeros",
+   "alpha_type": "float",
+   "alphas_initializer_range": 0.0,
+   "architectures": [
+     "WebForVisionText2Text"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_vmistral.VMistralConfig",
+     "AutoModelForCausalLM": "modeling_web.WebForVisionText2Text"
+   },
+   "bos_token_id": 1,
+   "cross_layer_interval": 1,
+   "eos_token_id": 2,
+   "freeze_lm_head": false,
+   "freeze_text_layers": false,
+   "freeze_text_module_exceptions": [],
+   "freeze_vision_layers": false,
+   "freeze_vision_module_exceptions": [],
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "image_token_id": 32001,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "max_position_embeddings": 32768,
+   "model_type": "vmistral",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "pad_token_id": 0,
+   "perceiver_config": {
+     "model_type": "vmistral",
+     "qk_layer_norms_perceiver": true,
+     "resampler_depth": 3
+   },
+   "qk_layer_norms": true,
+   "rms_norm_eps": 1e-05,
+   "rope_theta": 10000.0,
+   "sliding_window": 4096,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.41.1",
+   "use_cache": true,
+   "use_resampler": true,
+   "vision_config": {
+     "hidden_size": 1152,
+     "image_size": 960,
+     "intermediate_size": 4304,
+     "model_type": "vmistral",
+     "num_attention_heads": 16,
+     "num_hidden_layers": 27,
+     "patch_size": 14
+   },
+   "vocab_size": 32000,
+   "web_attention_range": 2
+ }
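The `auto_map` entries above wire the checkpoint to the custom classes uploaded in this folder, so loading goes through `trust_remote_code`. A minimal loading sketch follows; the repository id is taken from `MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP` in `configuration_vmistral.py` below and is an assumption about where this upload is hosted.

```python
# Minimal sketch, assuming the repo id below is where this folder lives.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "lt-asset/Waffle_VLM_WebSight"  # assumed, from MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP

# auto_map routes AutoConfig to configuration_vmistral.VMistralConfig
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type, config.vision_config.image_size)  # vmistral 960

# auto_map routes AutoModelForCausalLM to modeling_web.WebForVisionText2Text;
# the four safetensors shards below hold roughly 16 GB of bfloat16 weights.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in config.json
)
```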
configuration_vmistral.py ADDED
@@ -0,0 +1,310 @@
+ # coding=utf-8
+ # Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ VMistral model configuration"""
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+     "lt-asset/Waffle_VLM_WebSight": "https://huggingface.co/lt-asset/Waffle_VLM_WebSight/blob/main/configuration_vmistral.py",
+ }
+
+
+ class VMistralVisionConfig(PretrainedConfig):
+     r"""
+     """
+     model_type = "vmistral"
+
+     def __init__(
+         self,
+         hidden_size=768,
+         intermediate_size=3072,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         num_channels=3,
+         image_size=224,
+         patch_size=32,
+         hidden_act="gelu_pytorch_tanh",
+         layer_norm_eps=1e-6,
+         attention_dropout=0.0,
+         initializer_range=0.02,
+         initializer_factor=1.0,
+         web_attention_range=1,
+         _flash_attn_2_enabled=True,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.initializer_range = initializer_range
+         self.initializer_factor = initializer_factor
+         self.attention_dropout = attention_dropout
+         self.layer_norm_eps = layer_norm_eps
+         self.hidden_act = hidden_act
+         self.web_attention_range = web_attention_range
+         self._flash_attn_2_enabled = _flash_attn_2_enabled
+
+
+ class VMistralPerceiverConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate a
+     Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1.
+
+     [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+     [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         use_resampler (`bool`, *optional*, defaults to `False`):
+             Whether or not to use the resampler
+         resampler_n_latents (`int`, *optional*, defaults to 64):
+             Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
+         resampler_depth (`int`, *optional*, defaults to 6):
+             Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
+         resampler_n_heads (`int`, *optional*, defaults to 16):
+             Number of heads in each Transformer block (for multi-headed self-attention).
+         resampler_head_dim (`int`, *optional*, defaults to 96):
+             Dimensionality of each head projection in the Transformer block.
+         qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
+             Whether or not to use qk layer norms in perceiver
+     """
+     model_type = "vmistral"
+
+     def __init__(
+         self,
+         resampler_n_latents=64,
+         resampler_depth=6,
+         resampler_n_heads=16,
+         resampler_head_dim=96,
+         qk_layer_norms_perceiver=False,
+         **kwargs,
+     ):
+         self.resampler_n_latents = resampler_n_latents
+         self.resampler_depth = resampler_depth
+         self.resampler_n_heads = resampler_n_heads
+         self.resampler_head_dim = resampler_head_dim
+         self.qk_layer_norms_perceiver = qk_layer_norms_perceiver
+
+         super().__init__(**kwargs)
+
+
+ class VMistralConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate a
+     Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1.
+
+     [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+     [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         additional_vocab_size (`int`, *optional*, defaults to 0):
+             Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
+             are always trainable whereas regular vocab tokens can be frozen or not.
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`MistralModel`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 14336):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*, defaults to 8):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+             The maximum sequence length that this model might ever be used with. Mistral's sliding window attention
+             allows sequences of up to 4096*32 tokens.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
+             Initialization type for the alphas.
+         alphas_initializer_range (`float`, *optional*, defaults to 0.0):
+             The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
+             Attention.
+         alpha_type (`str`, *optional*, defaults to `"float"`):
+             Whether the gating alphas should be vectors or single floats.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             The id of the padding token.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             The id of the "beginning-of-sequence" token.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             The id of the "end-of-sequence" token.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether the model's input and output word embeddings should be tied.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         sliding_window (`int`, *optional*, defaults to 4096):
+             Sliding window attention window size. If not specified, will default to `4096`.
+         cross_layer_interval (`int`, *optional*, defaults to 1):
+             Interval for cross attention (from text to image) layers.
+         qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k
+         freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers
+         freeze_text_module_exceptions (`bool`, *optional*, defaults to `[]`):
+             Exceptions to freezing text layers when `freeze_text_layers` is `True`
+         freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze lm head
+         freeze_vision_layers (`bool`, *optional*, defaults to `True`): Whether to freeze vision layers
+         freeze_vision_module_exceptions (`bool`, *optional*, defaults to `[]`):
+             Exceptions to freezing vision layers when `freeze_vision_layers` is `True`
+         use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler
+         vision_config (`IdeficsVisionConfig`, *optional*): Custom vision config or dict
+         perceiver_config (`IdeficsPerceiverConfig`, *optional*): Custom perceiver config or dict
+
+     Example:
+     ```python
+     >>> from transformers import MistralModel, MistralConfig
+
+     >>> # Initializing a Mistral 7B style configuration
+     >>> configuration = MistralConfig()
+
+     >>> # Initializing a model from the Mistral 7B style configuration
+     >>> model = MistralModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+     model_type = "vmistral"
+     is_composition = False
+
+     def __init__(
+         self,
+         additional_vocab_size=0,
+         vocab_size=32000,
+         hidden_size=4096,
+         intermediate_size=14336,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=8,
+         hidden_act="silu",
+         max_position_embeddings=4096 * 32,
+         initializer_range=0.02,
+         alpha_initializer="zeros",
+         alphas_initializer_range=0.0,
+         alpha_type="float",
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=0,  # None in the original configuration_mistral, we set it to the unk_token_id
+         bos_token_id=1,
+         eos_token_id=2,
+         image_token_id=32_001,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         sliding_window=4096,
+         cross_layer_interval=1,
+         qk_layer_norms=False,
+         freeze_text_layers=True,
+         freeze_text_module_exceptions=[],
+         freeze_lm_head=False,
+         freeze_vision_layers=True,
+         freeze_vision_module_exceptions=[],
+         attention_dropout=0.0,
+         _flash_attn_2_enabled=True,
+         use_resampler=False,
+         vision_config=None,
+         perceiver_config=None,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.additional_vocab_size = additional_vocab_size
+         self.image_token_id = image_token_id
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.sliding_window = sliding_window
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.alpha_initializer = alpha_initializer
+         self.alphas_initializer_range = alphas_initializer_range
+         self.alpha_type = alpha_type
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+
+         self.cross_layer_interval = cross_layer_interval
+         self.qk_layer_norms = qk_layer_norms
+         self.freeze_vision_layers = freeze_vision_layers
+
+         self.freeze_text_layers = freeze_text_layers
+         self.freeze_text_module_exceptions = freeze_text_module_exceptions
+         self.freeze_vision_module_exceptions = freeze_vision_module_exceptions
+         self.freeze_lm_head = freeze_lm_head
+
+         self.use_resampler = use_resampler
+         self._flash_attn_2_enabled = _flash_attn_2_enabled
+         self.attention_dropout = attention_dropout
+
+         if perceiver_config is None:
+             self.perceiver_config = VMistralPerceiverConfig()
+         elif isinstance(perceiver_config, dict):
+             self.perceiver_config = VMistralPerceiverConfig(**perceiver_config)
+         elif isinstance(perceiver_config, VMistralPerceiverConfig):
+             self.perceiver_config = perceiver_config
+
+         if vision_config is None:
+             self.vision_config = VMistralVisionConfig()
+         elif isinstance(vision_config, dict):
+             self.vision_config = VMistralVisionConfig(**vision_config)
+         elif isinstance(vision_config, VMistralVisionConfig):
+             self.vision_config = vision_config
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+         # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
+         # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
+         # updates the config object with `kwargs` from from_pretrained, so during the instantiation
+         # of this object many attributes have default values and haven't yet been overridden.
+         # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.
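The configuration classes above can also be instantiated directly. A minimal sketch follows, assuming this folder is on the Python path so `configuration_vmistral` is importable; the argument values mirror config.json above.

```python
from configuration_vmistral import VMistralConfig, VMistralVisionConfig

config = VMistralConfig(
    additional_vocab_size=2,  # extra trainable rows, e.g. for the special "<img>" token
    qk_layer_norms=True,
    use_resampler=True,
    perceiver_config={"qk_layer_norms_perceiver": True, "resampler_depth": 3},
    vision_config=VMistralVisionConfig(hidden_size=1152, image_size=960, patch_size=14),
)
print(config.perceiver_config.resampler_depth)  # 3
print(config.vision_config.image_size)          # 960
```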
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.41.1"
+ }
generation_utils.py ADDED
@@ -0,0 +1,376 @@
+ from typing import Any, Dict, Optional, List
+ import torch
+ from transformers import GenerationMixin
+ from transformers import AutoTokenizer
+ import re
+ import traceback
+
+
+ class WebGenerationMixin(GenerationMixin):
+     def _update_model_kwargs_for_generation(
+         self,
+         outputs,
+         model_kwargs: Dict[str, Any],
+         is_encoder_decoder: bool = False,
+         standardize_cache_format: bool = False,
+     ) -> Dict[str, Any]:
+         # update past_key_values
+
+         model_kwargs["past_key_values"] = self._extract_past_from_model_output(
+             outputs, standardize_cache_format=standardize_cache_format
+         )
+         if getattr(outputs, "state", None) is not None:
+             model_kwargs["state"] = outputs.state
+
+         # update token_type_ids with last value
+         if "token_type_ids" in model_kwargs:
+             token_type_ids = model_kwargs["token_type_ids"]
+             model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+
+         if not is_encoder_decoder:
+             # update attention mask
+             if "web_attention_mask" not in model_kwargs:
+                 attention_mask = model_kwargs["attention_mask"]
+                 model_kwargs["web_attention_mask"] = torch.tril(torch.ones((attention_mask.shape[-1], attention_mask.shape[-1]), dtype=attention_mask.dtype)).unsqueeze(0)
+
+             if "attention_mask" in model_kwargs:
+                 attention_mask = model_kwargs["attention_mask"]
+                 model_kwargs["attention_mask"] = torch.cat(
+                     [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                 )
+
+             model_kwargs["html_tree"] = outputs.html_tree
+
+         else:
+             # update decoder attention mask
+             if "decoder_attention_mask" in model_kwargs:
+                 decoder_attention_mask = model_kwargs["decoder_attention_mask"]
+                 model_kwargs["decoder_attention_mask"] = torch.cat(
+                     [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))],
+                     dim=-1,
+                 )
+
+         if "cache_position" in model_kwargs and model_kwargs["cache_position"] is not None:
+             model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
+         return model_kwargs
+
+     def _reorder_cache(self, past_key_values, beam_idx):
+         raise NotImplementedError(
+             f"Make sure that a `_reorder_cache` function is correctly implemented in {self.__class__.__module__} to"
+             f" enable beam search for {self.__class__}"
+         )
+
+
+ class TreeNode:
+     def __init__(self, content: list, idx: int):
+         self.open_tag: List[str] = content
+         self.end_tag: Optional[List[str]] = None
+         self.self_closing_tag: Optional[List[str]] = None
+         self.text = ""
+
+         self.name: Optional[str] = None
+         self.parent: Optional['TreeNode'] = None  # Use 'TreeNode' as a string for forward reference
+
+         self.open_tag_range: Optional[List[int]] = None
+         self.end_tag_range: Optional[List[int]] = None
+         self.text_range = [-1, -1]
+         self.self_closing_tag_range = [-1, -1]
+
+         self.idx: int = idx
+         self.children: List['TreeNode'] = []  # List of TreeNode instances
+
+
+     def partially_open(self):
+         if not self.open_tag: return False
+         if any('<' in s for s in self.open_tag) and not any('>' in s for s in self.open_tag):
+             return True
+         return False
+
+     def add_child(self, child):
+         assert child.parent is None, "Child already has a parent"
+         assert child not in self.children, "Child is already in children list"
+         child.parent = self
+         self.children.append(child)
+
+     def get_range(self):
+         if self.text:
+             return list(range(*self.text_range))
+         elif self.self_closing_tag:
+             return list(range(*self.self_closing_tag_range))
+         else:
+             attn_range = []
+             if self.open_tag_range:
+                 attn_range += list(range(*self.open_tag_range))
+             if self.end_tag_range:
+                 attn_range += list(range(*self.end_tag_range))
+             return attn_range
+
+     def __repr__(self):
+         return f"Node(name='{self.open_tag}', idx = {self.idx})"
+
+     def print_tree(self, level=0, input_ids=None, tokenizer=None):
+         if level == 0:
+             print("--------")
+         indent = " " * level
+         if self.text:
+             print(f"{indent}{tokenizer.convert_tokens_to_string(self.text).strip()}, level = {level} ")
+         elif self.self_closing_tag:
+             print(f"{indent}{tokenizer.convert_tokens_to_string(self.self_closing_tag).strip()}, level = {level} ")
+         elif self.open_tag:
+             print(f"{indent}{tokenizer.convert_tokens_to_string(self.open_tag).strip()}, level = {level} ")
+             for child in self.children:
+                 child.print_tree(level + 1, input_ids, tokenizer)
+             if self.end_tag:
+                 print(f"{indent}{tokenizer.convert_tokens_to_string(self.end_tag).strip()}, level = {level} ")
+         else:
+             for child in self.children:
+                 child.print_tree(level + 1, input_ids, tokenizer)
+         if level == 0:
+             print("--------")
+
+     def get_tree(self, level=0, input_ids=None, tokenizer=None):
+         tree_str = ""
+
+         indent = " " * level
+         if self.text:
+             tree_str += f"{indent}{tokenizer.convert_tokens_to_string(self.text).strip()} \n"
+         elif self.self_closing_tag:
+             tree_str += f"{indent}{tokenizer.convert_tokens_to_string(self.self_closing_tag).strip()} \n"
+         elif self.open_tag:
+             tree_str += f"{indent}{tokenizer.convert_tokens_to_string(self.open_tag).strip()} \n"
+             for child in self.children:
+                 tree_str += child.get_tree(level + 1, input_ids, tokenizer)
+             if self.end_tag:
+                 tree_str += f"{indent}{tokenizer.convert_tokens_to_string(self.end_tag).strip()} \n"
+         else:
+             for child in self.children:
+                 tree_str += child.get_tree(level + 1, input_ids, tokenizer)
+
+         return tree_str
+
+
+ class TreeBuilder:
+     def __init__(self, tokenizer: AutoTokenizer = None, root: TreeNode = None, cur_node: TreeNode = None):
+         self.tokenizer = tokenizer
+         self.root = TreeNode(None, 0)
+         self.cur_node = self.root
+         self.buffer = []
+         self.buffer_start_index = 0
+         self.idx = 0
+         self.full_attention_list = None
+         self.web_attention_mask = None
+         self.input_ids = None
+         self.void_elements = [
+             "area",
+             "base",
+             "br",
+             "col",
+             "embed",
+             "hr",
+             "img",
+             "input",
+             "link",
+             "meta",
+             "param",
+             "source",
+             "track",
+             "wbr"
+         ]
+
+     def is_empty(self):
+         return self.root is None
+
+     def in_buffer(self, text):
+         if len(self.buffer) == 0:
+             return False
+         return any(text in s for s in self.buffer)
+
+     def find_buffer(self, text):
+         # Iterate over the list of strings with their indices
+         for index, s in enumerate(self.buffer):
+             if text in s:
+                 return index
+         return -1
+
+     # Function to extract xxx from <xxx> or <xxx yyy>
+     def extract_open_tag_name(self, buffer):
+         input_string = self.tokenizer.convert_tokens_to_string(buffer)
+         match = re.search(r'<\s*(\w+)(?:\s+[^>]*)?>', input_string)
+         if match:
+             return match.group(1)
+         return None
+
+     def extract_close_tag_name(self, buffer):
+         # if isinstance(input_string, list):
+         #     input_string = "".join(input_string).replace('Ċ', '\n').replace('Ġ', ' ').replace('ĉ', '\t')
+         input_string = self.tokenizer.convert_tokens_to_string(buffer)
+         match = re.search(r'</\s*(\w+)(?:\s+[^>]*)?>', input_string)
+         if match:
+             return match.group(1)
+         return None
+
+     def is_not_empty_buffer(self):
+         return self.tokenizer.convert_tokens_to_string(self.buffer).strip() != ''
+
+     def get_parent_and_siblings_attention_range(self):
+         attn_range = []
+         if self.cur_node.parent:
+             parent = self.cur_node.parent
+             if parent.open_tag_range:
+                 attn_range += list(range(*parent.open_tag_range))
+             for child in parent.children:
+                 if child is not self.cur_node:
+                     if child.open_tag and child.end_tag:
+                         attn_range += list(range(*child.open_tag_range))
+                         attn_range += list(range(*child.end_tag_range))
+                     elif child.text:
+                         attn_range += list(range(*child.text_range))
+                     elif child.self_closing_tag:
+                         attn_range += list(range(*child.self_closing_tag_range))
+                     else:
+                         raise Exception("??? line 151, get p and s attention range")
+
+         return attn_range
+
+     def update_buffer(self, cur_decoded_token):
+         # open tag situations
+         assert isinstance(cur_decoded_token, list), f"{cur_decoded_token}"
+         self.buffer += cur_decoded_token
+         assert isinstance(cur_decoded_token[0], str)
+         # print(self.buffer)
+         try:
+             # dealing with end tag
+             if self.in_buffer('</') and self.in_buffer('>') and self.find_buffer('</') <= self.find_buffer('>'):
+                 close_tag_name = self.extract_close_tag_name(self.buffer)
+
+                 if self.cur_node.open_tag and not self.cur_node.end_tag:
+                     assert close_tag_name == self.extract_open_tag_name(self.cur_node.open_tag), f"close_tag_name is {close_tag_name}, with buffer: {self.buffer}, open is-----{self.cur_node.open_tag}---"
+                 elif self.cur_node.text or self.cur_node.self_closing_tag or self.cur_node.end_tag:
+                     content = None
+                     if self.cur_node.text: content = self.cur_node.text
+                     elif self.cur_node.self_closing_tag: content = self.cur_node.self_closing_tag
+                     elif self.cur_node.end_tag: content = self.cur_node.end_tag
+                     self.root.print_tree(0, None, self.tokenizer)
+                     raise Exception(f"This should never happen\n {content}, buffer is {self.buffer}")
+
+                 # assert close_tag_name == extract_open_tag_name(self.cur_node.open_tag), f"close_tag_name is {close_tag_name}, with buffer: {self.buffer}, open is-----{self.cur_node.open_tag}---"
+                 else:
+                     raise Exception(f"having end tag without having an open tag\n {self.cur_node.text}")
+
+                 self.cur_node.end_tag = self.buffer[:self.find_buffer('>') + 1]
+                 self.cur_node.end_tag_range = [self.buffer_start_index, self.buffer_start_index + self.find_buffer('>') + 1]
+                 self.buffer_start_index += self.find_buffer('>') + 1
+                 self.buffer = self.buffer[self.find_buffer('>') + 1:]
+             # dealing with open tag
+             elif self.in_buffer('</'):
+                 if self.cur_node.open_tag and not self.cur_node.end_tag:
+                     pass
+                 elif self.cur_node.text or self.cur_node.self_closing_tag or (self.cur_node.open_tag and self.cur_node.end_tag):
+                     cur_end_tag_index = self.find_buffer('</')
+                     # import pdb;pdb.set_trace()
+                     if self.cur_node.text:
+                         self.cur_node.text += self.buffer[:cur_end_tag_index]
+                         self.cur_node.text_range[1] += len(self.buffer[:cur_end_tag_index])
+                     elif self.cur_node.self_closing_tag:
+                         self.cur_node.self_closing_tag += self.buffer[:cur_end_tag_index]
+                         self.cur_node.self_closing_tag_range[1] += len(self.buffer[:cur_end_tag_index])
+                     else:
+                         self.cur_node.end_tag += self.buffer[:cur_end_tag_index]
+                         self.cur_node.end_tag_range[1] += len(self.buffer[:cur_end_tag_index])
+                     self.buffer_start_index += len(self.buffer[:cur_end_tag_index])
+                     self.buffer = self.buffer[cur_end_tag_index:]
+                     self.cur_node = self.cur_node.parent
+                 else:
+                     raise Exception(f"having end tag without having an open tag\n {self.cur_node.text} {self.cur_node} {self.cur_node.parent.open_tag}")
+
+             elif self.in_buffer('<') and self.in_buffer('>'):
+                 # in the case of self_closing tag
+                 if self.in_buffer('/>'):
+                     self.cur_node.open_tag = None
+                     self.cur_node.self_closing_tag = self.buffer[:self.find_buffer(">") + 1]
+                     self.cur_node.self_closing_tag_range = [self.buffer_start_index, self.buffer_start_index + self.find_buffer('>') + 1]
+                 else:
+                     open_tag_name = self.extract_open_tag_name(self.buffer)
+                     if open_tag_name in self.void_elements:
+                         self.cur_node.open_tag = None
+                         self.cur_node.self_closing_tag = self.buffer[:self.find_buffer(">") + 1]
+                         self.cur_node.self_closing_tag_range = [self.buffer_start_index, self.buffer_start_index + self.find_buffer('>') + 1]
+                     else:
+                         self.cur_node.open_tag = self.buffer[:self.find_buffer(">") + 1]
+                         self.cur_node.open_tag_range = [self.buffer_start_index, self.buffer_start_index + self.find_buffer('>') + 1]
+
+                 self.buffer_start_index += self.find_buffer('>') + 1
+                 self.buffer = self.buffer[self.find_buffer(">") + 1:]
+             elif self.in_buffer('<'):
+                 if self.full_attention_list is None:
+                     self.full_attention_list = self.buffer[:-1]
+                     self.buffer = self.buffer[-1:]
+                     self.buffer_start_index = len(self.full_attention_list)
+                 else:
+                     cur_open_tag_index = self.find_buffer('<')
+                     # full open tag, indicating a pair of open and close tags, or a single open tag
+                     if not self.cur_node.partially_open() and self.cur_node.open_tag:
+                         if self.cur_node.end_tag:
+                             self.cur_node.end_tag += self.buffer[:cur_open_tag_index]
+                             self.cur_node.end_tag_range[1] += len(self.buffer[:cur_open_tag_index])
+                             self.buffer_start_index += len(self.buffer[:cur_open_tag_index])
+                             self.buffer = self.buffer[cur_open_tag_index:]
+                             child_node = TreeNode(self.buffer, self.idx)
+                             if self.cur_node.parent:
+                                 self.cur_node.parent.add_child(child_node)
+                             else:
+                                 raise Exception(f"This should never happen, a html element with full open tag should have a parent, {self.cur_node.open_tag}")
+                             self.idx += 1
+                             self.cur_node = child_node
+                         else:
+                             child_node = TreeNode(self.buffer, self.idx)
+                             self.cur_node.add_child(child_node)
+                             self.idx += 1
+                             self.cur_node = child_node
+                     elif self.cur_node.text or self.cur_node.self_closing_tag:
+                         if self.cur_node.text:
+                             self.cur_node.text += self.buffer[:cur_open_tag_index]
+                             self.cur_node.text_range[1] += len(self.buffer[:cur_open_tag_index])
+                         elif self.cur_node.self_closing_tag:
+                             self.cur_node.self_closing_tag += self.buffer[:cur_open_tag_index]
+                             self.cur_node.self_closing_tag_range[1] += len(self.buffer[:cur_open_tag_index])
+
+                         self.buffer_start_index += len(self.buffer[:cur_open_tag_index])
+                         self.buffer = self.buffer[cur_open_tag_index:]
+                         child_node = TreeNode(self.buffer, self.idx)
+                         self.cur_node.parent.add_child(child_node)
+                         self.idx += 1
+                         self.cur_node = child_node
+             # if the current node has an open tag, and we are encountering texts, we create a new text node, and move down a level
+             elif (self.cur_node.open_tag or self.cur_node.self_closing_tag) and not self.in_buffer('<') and self.is_not_empty_buffer():
+                 child_node = TreeNode(None, self.idx)
+                 child_node.text = self.buffer
+                 child_node.text_range[0] = self.buffer_start_index
+                 child_node.text_range[1] = self.buffer_start_index + len(self.buffer)
+
+                 if self.cur_node.end_tag or self.cur_node.self_closing_tag:
+                     self.cur_node.parent.add_child(child_node)
+                 else:
+                     self.cur_node.add_child(child_node)
+
+                 self.idx += 1
+                 self.cur_node = child_node
+                 self.buffer_start_index += len(self.buffer)
+                 self.buffer = []
+             # if the current node does not have an open tag, but we are encountering text, we add to the existing text node
+             elif self.cur_node.text and not self.in_buffer('<') and self.is_not_empty_buffer():
+                 self.cur_node.text += self.buffer
+                 assert self.cur_node.text_range[0] != -1 and self.cur_node.text_range[1] != -1, f"self.cur_node.text_range[0] and [1] should not be -1 but: {self.cur_node.text_range[0]}, {self.cur_node.text_range[1]}"
+                 self.cur_node.text_range[1] += len(self.buffer)
+                 self.buffer_start_index += len(self.buffer)
+                 self.buffer = []
+
+         except Exception as e:
+             traceback.print_exc()
+             raise Exception(e)
+
+         if self.full_attention_list is None:
+             attn_range = list(range(len(self.buffer)))
+         else:
+             attn_range = list(range(len(self.full_attention_list))) + self.get_parent_and_siblings_attention_range() + self.cur_node.get_range() + [i + self.buffer_start_index for i in list(range(len(self.buffer)))]
+         return attn_range
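To make the flow above concrete, here is a small, self-contained sketch of how `TreeBuilder.update_buffer` consumes a stream of decoded tokens and returns the indices the next token may attend to. The `JoinTokenizer` stub is hypothetical and exists only because `TreeBuilder` just needs `convert_tokens_to_string`; during real generation the model's own tokenizer is passed and `update_buffer` receives the newly decoded token(s) at each step.

```python
# Stand-in for AutoTokenizer: TreeBuilder only calls convert_tokens_to_string.
class JoinTokenizer:
    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

builder = TreeBuilder(tokenizer=JoinTokenizer())

# Hand-made token stream for `prompt<div>Hello</div>`, fed one token per step
# the way decoded tokens would arrive during generation.
for token in ["prompt", "<", "div", ">", "Hello", "</", "div", ">"]:
    attn_range = builder.update_buffer([token])

# Indices the next position may attend to; the text inside <div> (index 4) is
# pruned once the element is closed, reflecting the tree-structured attention
# this module builds for the web_attention_mask.
print(attn_range)  # [0, 1, 2, 3, 5, 6, 7]

# Reconstructed HTML structure of what has been generated so far.
print(builder.root.get_tree(tokenizer=builder.tokenizer))
```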
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f3ab17e6766272fd3e1a53624c9b428796aeea8c4a917401c1b7c9814135922
+ size 4895986336
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c9c4577adcbbfe172eea0ddd138bf858bafbb7d40805290ff0b1033a56ec994
+ size 4915916144
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c607a9800a94135d0b2498983d92d63adf64d4e3500310d774bf36a2b230f5a
+ size 4915916176
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24c9fa2dcc493160239537f0b227c04ceb208fe2b2710ad62d3f234e5228a769
+ size 1688301256
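As a quick sanity check (not part of the upload), the four shard sizes above can be compared against the `total_size` recorded in model.safetensors.index.json below; the on-disk files come out slightly larger than `total_size` because each safetensors shard carries its own JSON header in addition to the tensor bytes.

```python
shard_sizes = [4895986336, 4915916144, 4915916176, 1688301256]  # bytes, from the LFS pointers above
print(sum(shard_sizes))  # 16416119912 ≈ 16.4 GB on disk
print(16416014464)       # "total_size" from the index (tensor bytes only)
```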
model.safetensors.index.json ADDED
@@ -0,0 +1,803 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 16416014464
4
+ },
5
+ "weight_map": {
6
+ "lm_head.additional_fc.weight": "model-00004-of-00004.safetensors",
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.additional_embedding.weight": "model-00001-of-00004.safetensors",
9
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
29
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
30
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
31
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
92
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
93
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
95
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
96
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
101
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
102
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
103
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
104
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
105
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
107
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
108
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
109
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
110
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
111
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
112
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
113
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
114
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
115
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
116
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
117
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
119
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
121
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
122
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
123
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
124
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
125
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
126
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
127
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
128
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
138
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
154
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
156
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
157
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
158
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
159
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
160
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
161
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
164
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.28.input_layernorm.weight": "model-00004-of-00004.safetensors",
200
+ "model.layers.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
201
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.28.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
204
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
209
+ "model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
210
+ "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
211
+ "model.layers.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
212
+ "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
213
+ "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
214
+ "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
215
+ "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
216
+ "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
217
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
218
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
219
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
221
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
222
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
223
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
224
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
225
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
226
+ "model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
227
+ "model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
228
+ "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
229
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
230
+ "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
231
+ "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
232
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
233
+ "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
234
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
235
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
236
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
237
+ "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
238
+ "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
239
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
240
+ "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
241
+ "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
242
+ "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
243
+ "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
244
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
245
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
246
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
247
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
248
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
249
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
250
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
251
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
252
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
253
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
254
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
255
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
256
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
257
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
258
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
259
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
260
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
263
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
264
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
265
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
266
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
267
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
272
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
273
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
274
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
275
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
276
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
277
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
278
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
279
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
280
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
281
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
282
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
283
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
284
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
285
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
287
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
288
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
290
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
291
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
292
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
293
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
294
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
295
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
296
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
297
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
298
+ "model.modality_projection.act.fc1.weight": "model-00001-of-00004.safetensors",
299
+ "model.modality_projection.act.fc2.weight": "model-00001-of-00004.safetensors",
300
+ "model.modality_projection.fc1.weight": "model-00001-of-00004.safetensors",
301
+ "model.modality_projection.fc2.weight": "model-00001-of-00004.safetensors",
302
+ "model.norm.weight": "model-00004-of-00004.safetensors",
303
+ "model.perceiver_resampler.blocks.0.0.context_layer_norm.bias": "model-00001-of-00004.safetensors",
304
+ "model.perceiver_resampler.blocks.0.0.context_layer_norm.weight": "model-00001-of-00004.safetensors",
305
+ "model.perceiver_resampler.blocks.0.0.k_layer_norm.bias": "model-00001-of-00004.safetensors",
306
+ "model.perceiver_resampler.blocks.0.0.k_layer_norm.weight": "model-00001-of-00004.safetensors",
307
+ "model.perceiver_resampler.blocks.0.0.k_proj.weight": "model-00001-of-00004.safetensors",
308
+ "model.perceiver_resampler.blocks.0.0.latents_layer_norm.bias": "model-00001-of-00004.safetensors",
309
+ "model.perceiver_resampler.blocks.0.0.latents_layer_norm.weight": "model-00001-of-00004.safetensors",
310
+ "model.perceiver_resampler.blocks.0.0.output_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.perceiver_resampler.blocks.0.0.q_layer_norm.bias": "model-00001-of-00004.safetensors",
312
+ "model.perceiver_resampler.blocks.0.0.q_layer_norm.weight": "model-00001-of-00004.safetensors",
313
+ "model.perceiver_resampler.blocks.0.0.q_proj.weight": "model-00001-of-00004.safetensors",
314
+ "model.perceiver_resampler.blocks.0.0.v_proj.weight": "model-00001-of-00004.safetensors",
315
+ "model.perceiver_resampler.blocks.0.1.c_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.perceiver_resampler.blocks.0.1.fc.weight": "model-00001-of-00004.safetensors",
317
+ "model.perceiver_resampler.blocks.0.1.ln.bias": "model-00001-of-00004.safetensors",
318
+ "model.perceiver_resampler.blocks.0.1.ln.weight": "model-00001-of-00004.safetensors",
319
+ "model.perceiver_resampler.blocks.1.0.context_layer_norm.bias": "model-00001-of-00004.safetensors",
320
+ "model.perceiver_resampler.blocks.1.0.context_layer_norm.weight": "model-00001-of-00004.safetensors",
321
+ "model.perceiver_resampler.blocks.1.0.k_layer_norm.bias": "model-00001-of-00004.safetensors",
322
+ "model.perceiver_resampler.blocks.1.0.k_layer_norm.weight": "model-00001-of-00004.safetensors",
323
+ "model.perceiver_resampler.blocks.1.0.k_proj.weight": "model-00001-of-00004.safetensors",
324
+ "model.perceiver_resampler.blocks.1.0.latents_layer_norm.bias": "model-00001-of-00004.safetensors",
325
+ "model.perceiver_resampler.blocks.1.0.latents_layer_norm.weight": "model-00001-of-00004.safetensors",
326
+ "model.perceiver_resampler.blocks.1.0.output_proj.weight": "model-00001-of-00004.safetensors",
327
+ "model.perceiver_resampler.blocks.1.0.q_layer_norm.bias": "model-00001-of-00004.safetensors",
328
+ "model.perceiver_resampler.blocks.1.0.q_layer_norm.weight": "model-00001-of-00004.safetensors",
329
+ "model.perceiver_resampler.blocks.1.0.q_proj.weight": "model-00001-of-00004.safetensors",
330
+ "model.perceiver_resampler.blocks.1.0.v_proj.weight": "model-00001-of-00004.safetensors",
331
+ "model.perceiver_resampler.blocks.1.1.c_proj.weight": "model-00001-of-00004.safetensors",
332
+ "model.perceiver_resampler.blocks.1.1.fc.weight": "model-00001-of-00004.safetensors",
333
+ "model.perceiver_resampler.blocks.1.1.ln.bias": "model-00001-of-00004.safetensors",
334
+ "model.perceiver_resampler.blocks.1.1.ln.weight": "model-00001-of-00004.safetensors",
335
+ "model.perceiver_resampler.blocks.2.0.context_layer_norm.bias": "model-00001-of-00004.safetensors",
336
+ "model.perceiver_resampler.blocks.2.0.context_layer_norm.weight": "model-00001-of-00004.safetensors",
337
+ "model.perceiver_resampler.blocks.2.0.k_layer_norm.bias": "model-00001-of-00004.safetensors",
338
+ "model.perceiver_resampler.blocks.2.0.k_layer_norm.weight": "model-00001-of-00004.safetensors",
339
+ "model.perceiver_resampler.blocks.2.0.k_proj.weight": "model-00001-of-00004.safetensors",
340
+ "model.perceiver_resampler.blocks.2.0.latents_layer_norm.bias": "model-00001-of-00004.safetensors",
341
+ "model.perceiver_resampler.blocks.2.0.latents_layer_norm.weight": "model-00001-of-00004.safetensors",
342
+ "model.perceiver_resampler.blocks.2.0.output_proj.weight": "model-00001-of-00004.safetensors",
343
+ "model.perceiver_resampler.blocks.2.0.q_layer_norm.bias": "model-00001-of-00004.safetensors",
344
+ "model.perceiver_resampler.blocks.2.0.q_layer_norm.weight": "model-00001-of-00004.safetensors",
345
+ "model.perceiver_resampler.blocks.2.0.q_proj.weight": "model-00001-of-00004.safetensors",
346
+ "model.perceiver_resampler.blocks.2.0.v_proj.weight": "model-00001-of-00004.safetensors",
347
+ "model.perceiver_resampler.blocks.2.1.c_proj.weight": "model-00001-of-00004.safetensors",
348
+ "model.perceiver_resampler.blocks.2.1.fc.weight": "model-00001-of-00004.safetensors",
349
+ "model.perceiver_resampler.blocks.2.1.ln.bias": "model-00001-of-00004.safetensors",
350
+ "model.perceiver_resampler.blocks.2.1.ln.weight": "model-00001-of-00004.safetensors",
351
+ "model.perceiver_resampler.latents": "model-00001-of-00004.safetensors",
352
+ "model.perceiver_resampler.layer_norm.bias": "model-00001-of-00004.safetensors",
353
+ "model.perceiver_resampler.layer_norm.weight": "model-00001-of-00004.safetensors",
354
+ "model.vision_model.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00004.safetensors",
355
+ "model.vision_model.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00004.safetensors",
356
+ "model.vision_model.vision_model.embeddings.position_embedding.weight": "model-00001-of-00004.safetensors",
357
+ "model.vision_model.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00004.safetensors",
358
+ "model.vision_model.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00004.safetensors",
359
+ "model.vision_model.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00004.safetensors",
360
+ "model.vision_model.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00004.safetensors",
361
+ "model.vision_model.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
362
+ "model.vision_model.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
363
+ "model.vision_model.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
364
+ "model.vision_model.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
365
+ "model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
366
+ "model.vision_model.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
367
+ "model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
368
+ "model.vision_model.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
369
+ "model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
370
+ "model.vision_model.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
371
+ "model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
372
+ "model.vision_model.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
373
+ "model.vision_model.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00004.safetensors",
374
+ "model.vision_model.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00004.safetensors",
375
+ "model.vision_model.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00004.safetensors",
376
+ "model.vision_model.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00004.safetensors",
377
+ "model.vision_model.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
378
+ "model.vision_model.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
379
+ "model.vision_model.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
380
+ "model.vision_model.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
381
+ "model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
382
+ "model.vision_model.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
383
+ "model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
384
+ "model.vision_model.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
385
+ "model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
386
+ "model.vision_model.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
387
+ "model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
388
+ "model.vision_model.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
389
+ "model.vision_model.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00004.safetensors",
390
+ "model.vision_model.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00004.safetensors",
391
+ "model.vision_model.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00004.safetensors",
392
+ "model.vision_model.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00004.safetensors",
393
+ "model.vision_model.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
394
+ "model.vision_model.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
395
+ "model.vision_model.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
396
+ "model.vision_model.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
397
+ "model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
398
+ "model.vision_model.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
399
+ "model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
400
+ "model.vision_model.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
401
+ "model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
402
+ "model.vision_model.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
403
+ "model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
404
+ "model.vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
405
+ "model.vision_model.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00004.safetensors",
406
+ "model.vision_model.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00004.safetensors",
407
+ "model.vision_model.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00004.safetensors",
408
+ "model.vision_model.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00004.safetensors",
409
+ "model.vision_model.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
410
+ "model.vision_model.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
411
+ "model.vision_model.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
412
+ "model.vision_model.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
413
+ "model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
414
+ "model.vision_model.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
415
+ "model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
416
+ "model.vision_model.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
417
+ "model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
418
+ "model.vision_model.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
419
+ "model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
420
+ "model.vision_model.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
421
+ "model.vision_model.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00004.safetensors",
422
+ "model.vision_model.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00004.safetensors",
423
+ "model.vision_model.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00004.safetensors",
424
+ "model.vision_model.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00004.safetensors",
425
+ "model.vision_model.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
426
+ "model.vision_model.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
427
+ "model.vision_model.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
428
+ "model.vision_model.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
429
+ "model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
430
+ "model.vision_model.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
431
+ "model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
432
+ "model.vision_model.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
433
+ "model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
434
+ "model.vision_model.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
435
+ "model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
436
+ "model.vision_model.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
437
+ "model.vision_model.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00004.safetensors",
438
+ "model.vision_model.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00004.safetensors",
439
+ "model.vision_model.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00004.safetensors",
440
+ "model.vision_model.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00004.safetensors",
441
+ "model.vision_model.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
442
+ "model.vision_model.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
443
+ "model.vision_model.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
444
+ "model.vision_model.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
445
+ "model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
446
+ "model.vision_model.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
447
+ "model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
448
+ "model.vision_model.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
449
+ "model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
450
+ "model.vision_model.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
451
+ "model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
452
+ "model.vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
453
+ "model.vision_model.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00004.safetensors",
454
+ "model.vision_model.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00004.safetensors",
455
+ "model.vision_model.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00004.safetensors",
456
+ "model.vision_model.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00004.safetensors",
457
+ "model.vision_model.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
458
+ "model.vision_model.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
459
+ "model.vision_model.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
460
+ "model.vision_model.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
461
+ "model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
462
+ "model.vision_model.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
463
+ "model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
464
+ "model.vision_model.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
465
+ "model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
466
+ "model.vision_model.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
467
+ "model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
468
+ "model.vision_model.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
469
+ "model.vision_model.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00004.safetensors",
470
+ "model.vision_model.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00004.safetensors",
471
+ "model.vision_model.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00004.safetensors",
472
+ "model.vision_model.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00004.safetensors",
473
+ "model.vision_model.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
474
+ "model.vision_model.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
475
+ "model.vision_model.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
476
+ "model.vision_model.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
477
+ "model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
478
+ "model.vision_model.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
479
+ "model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
480
+ "model.vision_model.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
481
+ "model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
482
+ "model.vision_model.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
483
+ "model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
484
+ "model.vision_model.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
485
+ "model.vision_model.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00004.safetensors",
486
+ "model.vision_model.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00004.safetensors",
487
+ "model.vision_model.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00004.safetensors",
488
+ "model.vision_model.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00004.safetensors",
489
+ "model.vision_model.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
490
+ "model.vision_model.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
491
+ "model.vision_model.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
492
+ "model.vision_model.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
493
+ "model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
494
+ "model.vision_model.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
495
+ "model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
496
+ "model.vision_model.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
497
+ "model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
498
+ "model.vision_model.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
499
+ "model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
500
+ "model.vision_model.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
501
+ "model.vision_model.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00004.safetensors",
502
+ "model.vision_model.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00004.safetensors",
503
+ "model.vision_model.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00004.safetensors",
504
+ "model.vision_model.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00004.safetensors",
505
+ "model.vision_model.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
506
+ "model.vision_model.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
507
+ "model.vision_model.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
508
+ "model.vision_model.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
509
+ "model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
510
+ "model.vision_model.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
511
+ "model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
512
+ "model.vision_model.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
513
+ "model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
514
+ "model.vision_model.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
515
+ "model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
516
+ "model.vision_model.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
517
+ "model.vision_model.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00004.safetensors",
518
+ "model.vision_model.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00004.safetensors",
519
+ "model.vision_model.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00004.safetensors",
520
+ "model.vision_model.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00004.safetensors",
521
+ "model.vision_model.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
522
+ "model.vision_model.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
523
+ "model.vision_model.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
524
+ "model.vision_model.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
525
+ "model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
526
+ "model.vision_model.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
527
+ "model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
528
+ "model.vision_model.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
529
+ "model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
530
+ "model.vision_model.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
531
+ "model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
532
+ "model.vision_model.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
533
+ "model.vision_model.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00004.safetensors",
534
+ "model.vision_model.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00004.safetensors",
535
+ "model.vision_model.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00004.safetensors",
536
+ "model.vision_model.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00004.safetensors",
537
+ "model.vision_model.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
538
+ "model.vision_model.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
539
+ "model.vision_model.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
540
+ "model.vision_model.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
541
+ "model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
542
+ "model.vision_model.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
543
+ "model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
544
+ "model.vision_model.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
545
+ "model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
546
+ "model.vision_model.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
547
+ "model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
548
+ "model.vision_model.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
549
+ "model.vision_model.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00004.safetensors",
550
+ "model.vision_model.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00004.safetensors",
551
+ "model.vision_model.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00004.safetensors",
552
+ "model.vision_model.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00004.safetensors",
553
+ "model.vision_model.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
554
+ "model.vision_model.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
555
+ "model.vision_model.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
556
+ "model.vision_model.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
557
+ "model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
558
+ "model.vision_model.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
559
+ "model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
560
+ "model.vision_model.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
561
+ "model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
562
+ "model.vision_model.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
563
+ "model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
564
+ "model.vision_model.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
565
+ "model.vision_model.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00004.safetensors",
566
+ "model.vision_model.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00004.safetensors",
567
+ "model.vision_model.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00004.safetensors",
568
+ "model.vision_model.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00004.safetensors",
569
+ "model.vision_model.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
570
+ "model.vision_model.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
571
+ "model.vision_model.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
572
+ "model.vision_model.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
573
+ "model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
574
+ "model.vision_model.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
575
+ "model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
576
+ "model.vision_model.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
577
+ "model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
578
+ "model.vision_model.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
579
+ "model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
580
+ "model.vision_model.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
581
+ "model.vision_model.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00004.safetensors",
582
+ "model.vision_model.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00004.safetensors",
583
+ "model.vision_model.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00004.safetensors",
584
+ "model.vision_model.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00004.safetensors",
585
+ "model.vision_model.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
586
+ "model.vision_model.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
587
+ "model.vision_model.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
588
+ "model.vision_model.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
589
+ "model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
590
+ "model.vision_model.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
591
+ "model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
592
+ "model.vision_model.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
593
+ "model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
594
+ "model.vision_model.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
595
+ "model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
596
+ "model.vision_model.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
597
+ "model.vision_model.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00004.safetensors",
598
+ "model.vision_model.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00004.safetensors",
599
+ "model.vision_model.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00004.safetensors",
600
+ "model.vision_model.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00004.safetensors",
601
+ "model.vision_model.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
602
+ "model.vision_model.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
603
+ "model.vision_model.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
604
+ "model.vision_model.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
605
+ "model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
606
+ "model.vision_model.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
607
+ "model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
608
+ "model.vision_model.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
609
+ "model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
610
+ "model.vision_model.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
611
+ "model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
612
+ "model.vision_model.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
613
+ "model.vision_model.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00004.safetensors",
614
+ "model.vision_model.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00004.safetensors",
615
+ "model.vision_model.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00004.safetensors",
616
+ "model.vision_model.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00004.safetensors",
617
+ "model.vision_model.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
618
+ "model.vision_model.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
619
+ "model.vision_model.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
620
+ "model.vision_model.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
621
+ "model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
622
+ "model.vision_model.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
623
+ "model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
624
+ "model.vision_model.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
625
+ "model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
626
+ "model.vision_model.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
627
+ "model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
628
+ "model.vision_model.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
629
+ "model.vision_model.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00004.safetensors",
630
+ "model.vision_model.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00004.safetensors",
631
+ "model.vision_model.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00004.safetensors",
632
+ "model.vision_model.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00004.safetensors",
633
+ "model.vision_model.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00004.safetensors",
634
+ "model.vision_model.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00004.safetensors",
635
+ "model.vision_model.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00004.safetensors",
636
+ "model.vision_model.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00004.safetensors",
637
+ "model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
638
+ "model.vision_model.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
639
+ "model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
640
+ "model.vision_model.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
641
+ "model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
642
+ "model.vision_model.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
643
+ "model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
644
+ "model.vision_model.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
645
+ "model.vision_model.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00004.safetensors",
646
+ "model.vision_model.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00004.safetensors",
647
+ "model.vision_model.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00004.safetensors",
648
+ "model.vision_model.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00004.safetensors",
649
+ "model.vision_model.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00004.safetensors",
650
+ "model.vision_model.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00004.safetensors",
651
+ "model.vision_model.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00004.safetensors",
652
+ "model.vision_model.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00004.safetensors",
653
+ "model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
654
+ "model.vision_model.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
655
+ "model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
656
+ "model.vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
657
+ "model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
658
+ "model.vision_model.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
659
+ "model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
660
+ "model.vision_model.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
661
+ "model.vision_model.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00004.safetensors",
662
+ "model.vision_model.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00004.safetensors",
663
+ "model.vision_model.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00004.safetensors",
664
+ "model.vision_model.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00004.safetensors",
665
+ "model.vision_model.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00004.safetensors",
666
+ "model.vision_model.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00004.safetensors",
667
+ "model.vision_model.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00004.safetensors",
668
+ "model.vision_model.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00004.safetensors",
669
+ "model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
670
+ "model.vision_model.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
671
+ "model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
672
+ "model.vision_model.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
673
+ "model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
674
+ "model.vision_model.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
675
+ "model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
676
+ "model.vision_model.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
677
+ "model.vision_model.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00004.safetensors",
678
+ "model.vision_model.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00004.safetensors",
679
+ "model.vision_model.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00004.safetensors",
680
+ "model.vision_model.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00004.safetensors",
681
+ "model.vision_model.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
682
+ "model.vision_model.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
683
+ "model.vision_model.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
684
+ "model.vision_model.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
685
+ "model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
686
+ "model.vision_model.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
687
+ "model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
688
+ "model.vision_model.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
689
+ "model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
690
+ "model.vision_model.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
691
+ "model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
692
+ "model.vision_model.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
693
+ "model.vision_model.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00004.safetensors",
694
+ "model.vision_model.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00004.safetensors",
695
+ "model.vision_model.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00004.safetensors",
696
+ "model.vision_model.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00004.safetensors",
697
+ "model.vision_model.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
698
+ "model.vision_model.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
699
+ "model.vision_model.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
700
+ "model.vision_model.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
701
+ "model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
702
+ "model.vision_model.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
703
+ "model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
704
+ "model.vision_model.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
705
+ "model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
706
+ "model.vision_model.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
707
+ "model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
708
+ "model.vision_model.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
709
+ "model.vision_model.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00004.safetensors",
710
+ "model.vision_model.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00004.safetensors",
711
+ "model.vision_model.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00004.safetensors",
712
+ "model.vision_model.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00004.safetensors",
713
+ "model.vision_model.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
714
+ "model.vision_model.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
715
+ "model.vision_model.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
716
+ "model.vision_model.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
717
+ "model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
718
+ "model.vision_model.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
719
+ "model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
720
+ "model.vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
721
+ "model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
722
+ "model.vision_model.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
723
+ "model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
724
+ "model.vision_model.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
725
+ "model.vision_model.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00004.safetensors",
726
+ "model.vision_model.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00004.safetensors",
727
+ "model.vision_model.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00004.safetensors",
728
+ "model.vision_model.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00004.safetensors",
729
+ "model.vision_model.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
730
+ "model.vision_model.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
731
+ "model.vision_model.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
732
+ "model.vision_model.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
733
+ "model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
734
+ "model.vision_model.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
735
+ "model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
736
+ "model.vision_model.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
737
+ "model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
738
+ "model.vision_model.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
739
+ "model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
740
+ "model.vision_model.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
741
+ "model.vision_model.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00004.safetensors",
742
+ "model.vision_model.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00004.safetensors",
743
+ "model.vision_model.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00004.safetensors",
744
+ "model.vision_model.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00004.safetensors",
745
+ "model.vision_model.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
746
+ "model.vision_model.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
747
+ "model.vision_model.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
748
+ "model.vision_model.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
749
+ "model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
750
+ "model.vision_model.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
751
+ "model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
752
+ "model.vision_model.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
753
+ "model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
754
+ "model.vision_model.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
755
+ "model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
756
+ "model.vision_model.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
757
+ "model.vision_model.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00004.safetensors",
758
+ "model.vision_model.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00004.safetensors",
759
+ "model.vision_model.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00004.safetensors",
760
+ "model.vision_model.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00004.safetensors",
761
+ "model.vision_model.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
762
+ "model.vision_model.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
763
+ "model.vision_model.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
764
+ "model.vision_model.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
765
+ "model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
766
+ "model.vision_model.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
767
+ "model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
768
+ "model.vision_model.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
769
+ "model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
770
+ "model.vision_model.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
771
+ "model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
772
+ "model.vision_model.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
773
+ "model.vision_model.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00004.safetensors",
774
+ "model.vision_model.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00004.safetensors",
775
+ "model.vision_model.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00004.safetensors",
776
+ "model.vision_model.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00004.safetensors",
777
+ "model.vision_model.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
778
+ "model.vision_model.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
779
+ "model.vision_model.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
780
+ "model.vision_model.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
781
+ "model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
782
+ "model.vision_model.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
783
+ "model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
784
+ "model.vision_model.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
785
+ "model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
786
+ "model.vision_model.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
787
+ "model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
788
+ "model.vision_model.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
789
+ "model.vision_model.vision_model.head.attention.in_proj_bias": "model-00001-of-00004.safetensors",
790
+ "model.vision_model.vision_model.head.attention.in_proj_weight": "model-00001-of-00004.safetensors",
791
+ "model.vision_model.vision_model.head.attention.out_proj.bias": "model-00001-of-00004.safetensors",
792
+ "model.vision_model.vision_model.head.attention.out_proj.weight": "model-00001-of-00004.safetensors",
793
+ "model.vision_model.vision_model.head.layernorm.bias": "model-00001-of-00004.safetensors",
794
+ "model.vision_model.vision_model.head.layernorm.weight": "model-00001-of-00004.safetensors",
795
+ "model.vision_model.vision_model.head.mlp.fc1.bias": "model-00001-of-00004.safetensors",
796
+ "model.vision_model.vision_model.head.mlp.fc1.weight": "model-00001-of-00004.safetensors",
797
+ "model.vision_model.vision_model.head.mlp.fc2.bias": "model-00001-of-00004.safetensors",
798
+ "model.vision_model.vision_model.head.mlp.fc2.weight": "model-00001-of-00004.safetensors",
799
+ "model.vision_model.vision_model.head.probe": "model-00001-of-00004.safetensors",
800
+ "model.vision_model.vision_model.post_layernorm.bias": "model-00001-of-00004.safetensors",
801
+ "model.vision_model.vision_model.post_layernorm.weight": "model-00001-of-00004.safetensors"
802
+ }
803
+ }
modeling_vmistral.py ADDED
@@ -0,0 +1,1766 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ PyTorch VMistral model."""
21
+ from dataclasses import dataclass
22
+ import inspect
23
+ import math
24
+ import warnings
25
+ from typing import List, Optional, Tuple, Union
26
+
27
+ import torch
28
+ import torch.nn.functional as F
29
+ import torch.utils.checkpoint
30
+ from torch import nn
31
+ from torch.nn import CrossEntropyLoss
32
+ from transformers.activations import ACT2FN
33
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
34
+ from transformers.utils import (
35
+ add_start_docstrings,
36
+ add_start_docstrings_to_model_forward,
37
+ is_flash_attn_2_available,
38
+ replace_return_docstrings,
39
+ )
40
+
41
+ from einops import rearrange, repeat
42
+ from transformers import PreTrainedModel
43
+ from transformers.utils import logging
44
+ from transformers.modeling_outputs import ModelOutput
45
+
46
+ from .configuration_vmistral import VMistralConfig
47
+ from .vision import SiglipVisionModel
48
+
49
+
50
+ if is_flash_attn_2_available():
51
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
52
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
53
+
54
+ _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
55
+
56
+ logger = logging.get_logger(__name__)
57
+
58
+ _CONFIG_FOR_DOC = "VMistralConfig"
59
+
60
+ VMistral_PRETRAINED_MODEL_ARCHIVE_LIST = [
61
+ "HuggingFaceM4/VLM_WebSight_finetuned"
62
+ ]
63
+
64
+ @dataclass
65
+ class VMistralBaseModelOutputWithPast(ModelOutput):
66
+ """
67
+ Base class for VMistral model's outputs that may also contain a past key/values (to speed up sequential decoding).
68
+
69
+ Args:
70
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
71
+ Sequence of hidden-states at the output of the last layer of the model.
72
+
73
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
74
+ hidden_size)` is output.
75
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
76
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
77
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
78
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
79
+ encoder_sequence_length, embed_size_per_head)`.
80
+
81
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
82
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
83
+ input) to speed up sequential decoding.
84
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
85
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
86
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
87
+
88
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
89
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
90
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
91
+ sequence_length)`.
92
+
93
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
94
+ heads.
95
+ image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
96
+ Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
97
+ sequence_length, hidden_size)`.
98
+
99
+ image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
100
+ """
101
+
102
+ last_hidden_state: torch.FloatTensor = None
103
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
104
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
105
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
106
+ image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
107
+
108
+
109
+ @dataclass
110
+ class VMistralCausalLMOutputWithPast(ModelOutput):
111
+ """
112
+ Base class for VMistral causal language model (or autoregressive) outputs.
113
+
114
+ Args:
115
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
116
+ Language modeling loss (for next-token prediction).
117
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
118
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
119
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
120
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
121
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
122
+
123
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
124
+ `past_key_values` input) to speed up sequential decoding.
125
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
126
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
127
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
128
+
129
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
130
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
131
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
132
+ sequence_length)`.
133
+
134
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
135
+ heads.
136
+ image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
137
+ Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
138
+ sequence_length, hidden_size)`.
139
+
140
+ image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
141
+ """
142
+
143
+ loss: Optional[torch.FloatTensor] = None
144
+ logits: torch.FloatTensor = None
145
+ past_key_values: Optional[List[torch.FloatTensor]] = None
146
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
147
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
148
+ image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
149
+
150
+
151
+ def expand_inputs_for_generation(
152
+ input_ids,
153
+ expand_size=1,
154
+ is_encoder_decoder=False,
155
+ attention_mask=None,
156
+ encoder_outputs=None,
157
+ **model_kwargs,
158
+ ):
159
+ expanded_return_idx = (
160
+ torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
161
+ )
162
+ input_ids = input_ids.index_select(0, expanded_return_idx)
163
+ model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None)
164
+ model_kwargs["image_hidden_states"] = model_kwargs.get("image_hidden_states", None)
165
+
166
+ if "token_type_ids" in model_kwargs:
167
+ token_type_ids = model_kwargs["token_type_ids"]
168
+ model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx)
169
+
170
+ if attention_mask is not None:
171
+ model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
172
+
173
+ if model_kwargs["pixel_values"] is not None:
174
+ model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
175
+
176
+ elif model_kwargs["image_hidden_states"] is not None:
177
+ model_kwargs["image_hidden_states"] = model_kwargs["image_hidden_states"].index_select(0, expanded_return_idx)
178
+
179
+ return input_ids, model_kwargs
180
+
181
+
182
+ def update_model_kwargs_for_generation(outputs, model_kwargs):
183
+ # must have this key set to at least None
184
+ if "past_key_values" in outputs:
185
+ model_kwargs["past_key_values"] = outputs.past_key_values
186
+ else:
187
+ model_kwargs["past_key_values"] = None
188
+
189
+ # update token_type_ids with last value
190
+ if "token_type_ids" in model_kwargs:
191
+ token_type_ids = model_kwargs["token_type_ids"]
192
+ model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
193
+
194
+ # update attention masks
195
+ if "attention_mask" in model_kwargs:
196
+ attention_mask = model_kwargs["attention_mask"]
197
+ model_kwargs["attention_mask"] = torch.cat(
198
+ [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
199
+ )
200
+
201
+ # Get the precomputed image_hidden_states
202
+ model_kwargs["image_hidden_states"] = outputs.image_hidden_states
203
+
204
+ return model_kwargs
205
+
206
+
207
+ def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
208
+ token_type_ids = kwargs.get("token_type_ids", None)
209
+ # only last token for inputs_ids if past is defined in kwargs
210
+ if past_key_values:
211
+ input_ids = input_ids[:, -1].unsqueeze(-1)
212
+ if token_type_ids is not None:
213
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
214
+
215
+ attention_mask = kwargs.get("attention_mask", None)
216
+ position_ids = kwargs.get("position_ids", None)
217
+
218
+ if attention_mask is not None and position_ids is None:
219
+ # create position_ids on the fly for batch generation
220
+ position_ids = attention_mask.long().cumsum(-1) - 1
221
+ position_ids.masked_fill_(attention_mask == 0, 1)
222
+ if past_key_values:
223
+ position_ids = position_ids[:, -1].unsqueeze(-1)
224
+
225
+ pixel_values = kwargs.get("pixel_values", None)
226
+ image_hidden_states = kwargs.get("image_hidden_states", None)
227
+
228
+ return {
229
+ "input_ids": input_ids,
230
+ "past_key_values": past_key_values,
231
+ "use_cache": kwargs.get("use_cache"),
232
+ "position_ids": position_ids,
233
+ "attention_mask": attention_mask,
234
+ "token_type_ids": token_type_ids,
235
+ "pixel_values": pixel_values,
236
+ "image_hidden_states": image_hidden_states,
237
+ }
238
+
239
+
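A quick worked example of the `position_ids` computation used in `prepare_inputs_for_generation` above (toy left-padded mask, purely illustrative and not part of the uploaded file):

import torch

mask = torch.tensor([[0, 0, 1, 1, 1]])       # left-padded attention mask
position_ids = mask.long().cumsum(-1) - 1    # tensor([[-1, -1, 0, 1, 2]])
position_ids.masked_fill_(mask == 0, 1)      # tensor([[1, 1, 0, 1, 2]]) -- padded slots get a dummy position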
240
+ def freeze_model(model, module_exceptions=[]):
241
+ mapping = {
242
+ "LayerNorm": nn.LayerNorm,
243
+ "Linear": nn.Linear,
244
+ "Embedding": nn.Embedding,
245
+ }
246
+ module_exceptions_mapped = [mapping[m] for m in module_exceptions]
247
+ for module in model.modules():
248
+ if module_exceptions and any([isinstance(module, t) for t in module_exceptions_mapped]):
249
+ module.requires_grad_(True) # Explicitly setting it to true to avoid any mistakes
250
+ else:
251
+ module.requires_grad_(False)
252
+ return model
253
+
254
+
255
+ class DecoupledEmbedding(nn.Embedding):
256
+ # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html#Embedding
257
+ """
258
+ Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings.
259
+ In practice, the regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0, then it will create `num_additional_embeddings` additional parameters that are always trained.
260
+ If `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
261
+ """
262
+
263
+ def __init__(
264
+ self,
265
+ num_embeddings,
266
+ num_additional_embeddings,
267
+ embedding_dim,
268
+ partially_freeze=False,
269
+ device=None,
270
+ dtype=None,
271
+ padding_idx=None,
272
+ **kwargs,
273
+ ) -> None:
274
+ """
275
+ num_additional_embeddings: int. Number of additional embeddings. Only useful when `partially_freeze=True`.
276
+ partially_freeze: bool. If True, the regular `weight` will be frozen. `additional_embedding` is never frozen.
277
+
278
+ Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`, `max_norm` or `norm_type`. We are not supporting these.
279
+ """
280
+ if padding_idx is not None and padding_idx > num_embeddings:
281
+ raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")
282
+ super().__init__(
283
+ num_embeddings=num_embeddings,
284
+ embedding_dim=embedding_dim,
285
+ device=device,
286
+ dtype=dtype,
287
+ padding_idx=padding_idx,
288
+ **kwargs,
289
+ )
290
+ self.num_embeddings = num_embeddings
291
+ self.padding_idx = padding_idx
292
+ self.num_additional_embeddings = num_additional_embeddings
293
+ self.partially_freeze = partially_freeze
294
+
295
+ if partially_freeze:
296
+ self.weight.requires_grad_(False)
297
+
298
+ if self.num_additional_embeddings > 0:
299
+ self.additional_embedding = nn.Embedding(
300
+ num_embeddings=self.num_additional_embeddings,
301
+ embedding_dim=embedding_dim,
302
+ device=device,
303
+ dtype=dtype,
304
+ )
305
+
306
+ def forward(self, input_ids):
307
+ """
308
+ we have 2 embeddings, with different indices - one pretrained self.weight and another
309
+ self.additional_embedding.weight that is being trained.
310
+
311
+ in order to make a lookup of the input ids, we:
312
+ 1. find out the indices of the entries belonging to the 2nd embedding
313
+ 2. extract those values while subtracting the size of the first embedding (num_embeddings),
314
+ since the 2nd embedding starts from 0 and not num_embeddings
315
+ 3. perform the 2nd embedding lookup
316
+ 4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
317
+ 5. perform the 1st embedding lookup
318
+ 6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup
319
+
320
+ note: for the 1st embedding lookup we could have looked up only the low indices and not do
321
+ the padding, but then we have to create a new tensor and populate it with 2 tensors that are
322
+ spread out across various indices - i.e. not a simple concat - I haven't benchmarked the
323
+ complex case if it's any faster, given that seqlens are usually relatively short it's
324
+ probably not faster or if faster not by much - but might be a good idea to measure.
325
+
326
+ """
327
+ if self.num_additional_embeddings == 0:
328
+ return F.embedding(input_ids, self.weight)  # no additional vocabulary: plain lookup in the regular (possibly frozen) table
329
+
330
+ # Clone so that we don't modify the original input_ids later on
331
+ input_ids = input_ids.clone()
332
+ additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
333
+ input_ids_additional_vocab = input_ids[additional_vocab_indices]
334
+ additional_embeddings = self.additional_embedding(input_ids_additional_vocab - self.num_embeddings)
335
+
336
+ # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
337
+ input_ids[additional_vocab_indices] = 0
338
+ full_vector = F.embedding(input_ids, self.weight)
339
+
340
+ # overwrite the records with high indices
341
+ full_vector[additional_vocab_indices] = additional_embeddings
342
+
343
+ return full_vector
344
+
345
+ def extra_repr(self) -> str:
346
+ return "num_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={}".format(
347
+ self.num_embeddings,
348
+ self.num_additional_embeddings,
349
+ self.embedding_dim,
350
+ self.partially_freeze,
351
+ )
352
+
353
+
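A minimal usage sketch of `DecoupledEmbedding` (the sizes below are hypothetical, chosen only for illustration): ids below `num_embeddings` are looked up in the regular, possibly frozen, table, while ids at or above it are routed to the always-trainable `additional_embedding`.

import torch

emb = DecoupledEmbedding(num_embeddings=8, num_additional_embeddings=2, embedding_dim=4, partially_freeze=True)
ids = torch.tensor([[1, 7, 8, 9]])   # ids 8 and 9 fall in the additional vocabulary
out = emb(ids)                       # rows for 8 and 9 come from emb.additional_embedding
assert out.shape == (1, 4, 4)
assert not emb.weight.requires_grad and emb.additional_embedding.weight.requires_grad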
354
+ class DecoupledLinear(nn.Linear):
355
+ # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear
356
+ """
357
+ Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters.
358
+ In practice, the regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0, then it will create `out_additional_features * in_features` additional parameters that are always trained.
359
+ If `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
360
+ """
361
+
362
+ def __init__(
363
+ self,
364
+ in_features: int,
365
+ out_features: int,
366
+ out_additional_features: int = 0,
367
+ bias: bool = True,
368
+ partially_freeze: bool = True,
369
+ device=None,
370
+ dtype=None,
371
+ ) -> None:
372
+ """
373
+ out_additional_features: int. Number of additional trainable dimensions. Only makes sense when `partially_freeze=True`.
374
+ partially_freeze: bool. If True, the regular `weight` will be frozen and extra parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
375
+ """
376
+ super().__init__(in_features, out_features, bias, device, dtype)
377
+ self.out_additional_features = out_additional_features
378
+ self.partially_freeze = partially_freeze
379
+
380
+ self.in_features = in_features
381
+ self.out_features = out_features
382
+
383
+ if partially_freeze:
384
+ self.weight.requires_grad_(False)
385
+ if bias:
386
+ self.bias.requires_grad_(False)
387
+
388
+ if out_additional_features > 0:
389
+ self.additional_fc = nn.Linear(
390
+ in_features=in_features,
391
+ out_features=out_additional_features,
392
+ bias=bias,
393
+ device=device,
394
+ dtype=dtype,
395
+ )
396
+
397
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
398
+ output = F.linear(input, self.weight, self.bias)
399
+
400
+ if self.out_additional_features > 0:
401
+ additional_features = self.additional_fc(input)
402
+ output = torch.cat((output, additional_features), -1)
403
+
404
+ return output
405
+
406
+ def extra_repr(self) -> str:
407
+ """Overwriting `nn.Linear.extra_repr` to include new parameters."""
408
+ return "in_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}".format(
409
+ self.in_features,
410
+ self.out_features,
411
+ self.out_additional_features,
412
+ self.bias is not None,
413
+ self.partially_freeze,
414
+ )
415
+
416
+
417
+ class SwiGLU(nn.Module):
418
+ def __init__(self, embed_dim) -> None:
419
+ super().__init__()
420
+ self.fc1 = nn.Linear(embed_dim, embed_dim, bias=False)
421
+ self.fc2 = nn.Linear(embed_dim, embed_dim, bias=False)
422
+
423
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
424
+ x_1 = self.fc1(x)
425
+ x_1 = torch.mul(x_1, torch.sigmoid(x_1))
426
+ x_2 = self.fc2(x)
427
+ x = torch.mul(x_1, x_2)
428
+ return x
429
+
430
+
431
+ class ModalityProjection(nn.Module):
432
+ def __init__(self, embed_dim_in, embed_dim_out) -> None:
433
+ super().__init__()
434
+ self.fc1 = nn.Linear(embed_dim_in, embed_dim_out, bias=False)
435
+ self.act = SwiGLU(embed_dim_out)
436
+ self.fc2 = nn.Linear(embed_dim_out, embed_dim_out, bias=False)
437
+
438
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
439
+ x = self.fc1(x)
440
+ x = self.act(x)
441
+ x = self.fc2(x)
442
+ return x
443
+
444
+
445
+ class PerceiverResampler(nn.Module):
446
+ def __init__(
447
+ self, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int, qk_layer_norms: bool
448
+ ) -> None:
449
+ """
450
+ Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
451
+ MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
452
+ returns a Tensor of shape [bsz, n_latents, embed_dim].
453
+ :param embed_dim: Dimensionality of embeddings being fed to the Perceiver Resampler (also dimensionality of
454
+ latent embeddings *returned* by the Perceiver Resampler). Could be e.g., ViT embed_dim, ResNet
455
+ pool dim, and so on.
456
+ :param depth: Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
457
+ :param n_heads: Number of heads in each Transformer block (for multi-headed self-attention).
458
+ :param head_dim: Dimensionality of each head projection in the Transformer block.
459
+ :param n_latents: Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
460
+ """
461
+ super().__init__()
462
+ self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
463
+ self.qk_layer_norms = qk_layer_norms
464
+
465
+ # Create Latents for Perceiver
466
+ self.latents = nn.Parameter(torch.ones(self.n_latents, self.embed_dim))
467
+
468
+ self.intermediate_dim = self.embed_dim * 4
469
+ # Create Transformer Blocks
470
+ self.blocks = nn.ModuleList(
471
+ [
472
+ nn.ModuleList(
473
+ [
474
+ PerceiverAttention(self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms),
475
+ MLP(self.embed_dim, self.intermediate_dim),
476
+ ]
477
+ )
478
+ for _ in range(depth)
479
+ ]
480
+ )
481
+ self.layer_norm = nn.LayerNorm(self.embed_dim)
482
+
483
+ def forward(self, context: torch.Tensor) -> torch.Tensor:
484
+ """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
485
+ latents = repeat(self.latents, "seq embed -> bsz seq embed", bsz=context.shape[0])
486
+
487
+ # Feed through Perceiver Attention blocks...
488
+ for attn, ff in self.blocks:
489
+ latents = attn(context, latents) + latents
490
+ latents = ff(latents) + latents
491
+
492
+ return self.layer_norm(latents)
493
+
494
+
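A shape-only sketch of the resampler defined above (all sizes are made up for illustration; they do not come from this checkpoint's config):

import torch

resampler = PerceiverResampler(embed_dim=64, depth=2, n_heads=4, head_dim=16, n_latents=8, qk_layer_norms=True)
vision_tokens = torch.randn(2, 100, 64)   # (batch, seq, embed_dim) from a vision encoder
latents = resampler(vision_tokens)
assert latents.shape == (2, 8, 64)        # arbitrary-length sequence compressed to n_latents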
495
+ class PerceiverAttention(nn.Module):
496
+ def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool) -> None:
497
+ """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
498
+ super().__init__()
499
+ self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
500
+ self.qk_layer_norms = qk_layer_norms
501
+ # Normalization & Scaling
502
+ self.context_layer_norm = nn.LayerNorm(self.embed_dim)
503
+ self.latents_layer_norm = nn.LayerNorm(self.embed_dim)
504
+ if self.qk_layer_norms:
505
+ self.q_layer_norm = nn.LayerNorm(self.head_dim)
506
+ self.k_layer_norm = nn.LayerNorm(self.head_dim)
507
+
508
+ self.qk_scale = self.head_dim**-0.5
509
+
510
+ # Q, K, V Projection (no bias -- detail from Perceiver/Flamingo Papers).
511
+ self.q_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
512
+ self.k_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
513
+ self.v_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
514
+
515
+ self.output_proj = nn.Linear(self.n_heads * self.head_dim, self.embed_dim, bias=False)
516
+
517
+ def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
518
+ """
519
+ Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!
520
+ :param context: Tensor of shape [bsz, seq, embed_dim] representing long-form context to resample.
521
+ :param latents: Tensor of shape [bsz, n_latents, embed_dim] representing fixed length latents to compress to.
522
+ :return: Tensor of shape [bsz, n_latents, embed_dim] representing attention over latents w/ cross from context.
523
+ """
524
+ context = self.context_layer_norm(context)
525
+ latents = self.latents_layer_norm(latents)
526
+
527
+ # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
528
+ # Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
529
+ q = self.q_proj(latents)
530
+ k = self.k_proj(torch.cat([context, latents], dim=-2))
531
+ v = self.v_proj(torch.cat([context, latents], dim=-2))
532
+
533
+ # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
534
+ # =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
535
+ q, k, v = [rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads) for x in (q, k, v)]
536
+ if self.qk_layer_norms:
537
+ q = self.q_layer_norm(q)
538
+ k = self.k_layer_norm(k)
539
+
540
+ scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
541
+ stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
542
+ attn = stabilized_scores.softmax(dim=-1)
543
+
544
+ # Attend & project back to output...
545
+ resampled = torch.einsum("... i j, ... j d -> ... i d", attn, v)
546
+ return self.output_proj(
547
+ rearrange(resampled, "bsz heads seq embed -> bsz seq (heads embed)", heads=self.n_heads)
548
+ )
549
+
550
+
551
+ class MLP(nn.Module):
552
+ def __init__(self, embed_dim, intermediate_size):
553
+ """Simple MLP block with intermediate_size and embedding size"""
554
+ super().__init__()
555
+ self.embed_dim = embed_dim
556
+ self.ln = nn.LayerNorm(self.embed_dim)
557
+ self.fc = nn.Linear(self.embed_dim, intermediate_size, bias=False)
558
+ self.act = nn.ReLU()
559
+ self.c_proj = nn.Linear(intermediate_size, self.embed_dim, bias=False)
560
+
561
+ def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
562
+ hidden_states = self.ln(hidden_states)
563
+ hidden_states = self.fc(hidden_states)
564
+ hidden_states = self.act(hidden_states)
565
+ hidden_states = self.c_proj(hidden_states)
566
+
567
+ return hidden_states
568
+
569
+
570
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
571
+ def _get_unpad_data(attention_mask):
572
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
573
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
574
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
575
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
576
+ return (
577
+ indices,
578
+ cu_seqlens,
579
+ max_seqlen_in_batch,
580
+ )
581
+
582
+
583
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
584
+ class MistralRMSNorm(nn.Module):
585
+ def __init__(self, hidden_size, eps=1e-6):
586
+ """
587
+ MistralRMSNorm is equivalent to T5LayerNorm
588
+ """
589
+ super().__init__()
590
+ self.weight = nn.Parameter(torch.ones(hidden_size))
591
+ self.variance_epsilon = eps
592
+
593
+ def forward(self, hidden_states):
594
+ input_dtype = hidden_states.dtype
595
+ hidden_states = hidden_states.to(torch.float32)
596
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
597
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
598
+ return self.weight * hidden_states.to(input_dtype)
599
+
600
+
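For reference, the normalization above reduces to the following one-liner on a toy float32 tensor (an illustrative check, not part of the uploaded file):

import torch

x = torch.randn(2, 3, 8)
norm = MistralRMSNorm(hidden_size=8, eps=1e-6)
reference = norm.weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(norm(x), reference, atol=1e-5)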
601
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
602
+ class MistralRotaryEmbedding(nn.Module):
603
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
604
+ super().__init__()
605
+
606
+ self.dim = dim
607
+ self.max_position_embeddings = max_position_embeddings
608
+ self.base = base
609
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
610
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
611
+
612
+ # Build here to make `torch.jit.trace` work.
613
+ self._set_cos_sin_cache(
614
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
615
+ )
616
+
617
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
618
+ self.max_seq_len_cached = seq_len
619
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
620
+
621
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
622
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
623
+ emb = torch.cat((freqs, freqs), dim=-1)
624
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
625
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
626
+
627
+ def forward(self, x, seq_len=None):
628
+ # x: [bs, num_attention_heads, seq_len, head_size]
629
+ if seq_len > self.max_seq_len_cached:
630
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
631
+
632
+ return (
633
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
634
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
635
+ )
636
+
637
+
638
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
639
+ def rotate_half(x):
640
+ """Rotates half the hidden dims of the input."""
641
+ x1 = x[..., : x.shape[-1] // 2]
642
+ x2 = x[..., x.shape[-1] // 2 :]
643
+ return torch.cat((-x2, x1), dim=-1)
644
+
645
+
646
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
647
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
648
+ cos = cos[position_ids].unsqueeze(1) # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim]
649
+ sin = sin[position_ids].unsqueeze(1)
650
+ q_embed = (q * cos) + (rotate_half(q) * sin)
651
+ k_embed = (k * cos) + (rotate_half(k) * sin)
652
+ return q_embed, k_embed
653
+
654
+
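A small shape walk-through for the rotary-embedding helpers above (toy sizes, illustrative only):

import torch

rope = MistralRotaryEmbedding(dim=8, max_position_embeddings=32)
q = torch.randn(1, 2, 5, 8)                  # (batch, heads, seq, head_dim)
k = torch.randn(1, 2, 5, 8)
cos, sin = rope(q, seq_len=5)                # each of shape (5, 8)
position_ids = torch.arange(5).unsqueeze(0)  # (1, 5)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
assert q_rot.shape == q.shape and k_rot.shape == k.shape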
655
+ class MistralMLP(nn.Module):
656
+ def __init__(self, config):
657
+ super().__init__()
658
+ self.config = config
659
+ self.hidden_size = config.hidden_size
660
+ self.intermediate_size = config.intermediate_size
661
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
662
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
663
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
664
+ self.act_fn = ACT2FN[config.hidden_act]
665
+
666
+ def forward(self, x):
667
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
668
+
669
+
670
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
671
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
672
+ """
673
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
674
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
675
+ """
676
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
677
+ if n_rep == 1:
678
+ return hidden_states
679
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
680
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
681
+
682
+
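And a toy illustration of the grouped-query expansion performed by `repeat_kv` (hypothetical sizes): 2 key/value heads are repeated to serve 8 query heads.

import torch

kv = torch.randn(1, 2, 10, 64)      # (batch, num_key_value_heads, seq, head_dim)
expanded = repeat_kv(kv, n_rep=4)   # n_rep = num_attention_heads // num_key_value_heads
assert expanded.shape == (1, 8, 10, 64)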
683
+ class MistralAttention(nn.Module):
684
+ """
685
+ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
686
+ and "Generating Long Sequences with Sparse Transformers".
687
+ """
688
+
689
+ def __init__(self, config: VMistralConfig, qk_layer_norms: bool = False):
690
+ super().__init__()
691
+ self.config = config
692
+ self.hidden_size = config.hidden_size
693
+ self.num_heads = config.num_attention_heads
694
+ self.head_dim = self.hidden_size // self.num_heads
695
+ self.num_key_value_heads = config.num_key_value_heads
696
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
697
+ self.max_position_embeddings = config.max_position_embeddings
698
+ self.rope_theta = config.rope_theta
699
+ self.is_causal = True
700
+
701
+ if (self.head_dim * self.num_heads) != self.hidden_size:
702
+ raise ValueError(
703
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
704
+ f" and `num_heads`: {self.num_heads})."
705
+ )
706
+
707
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
708
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
709
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
710
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
711
+
712
+ self.qk_layer_norms = qk_layer_norms
713
+ if self.qk_layer_norms:
714
+ self.q_layer_norm = MistralRMSNorm(self.head_dim, eps=config.rms_norm_eps)
715
+ self.k_layer_norm = MistralRMSNorm(self.head_dim, eps=config.rms_norm_eps)
716
+
717
+ self.rotary_emb = MistralRotaryEmbedding(
718
+ self.head_dim,
719
+ max_position_embeddings=self.max_position_embeddings,
720
+ base=self.rope_theta,
721
+ )
722
+ self.attention_dropout = config.attention_dropout
723
+
724
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
725
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
726
+
727
+ def forward(
728
+ self,
729
+ hidden_states: torch.Tensor,
730
+ key_value_states: Optional[torch.Tensor] = None,
731
+ attention_mask: Optional[torch.Tensor] = None,
732
+ position_ids: Optional[torch.LongTensor] = None,
733
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
734
+ output_attentions: bool = False,
735
+ use_cache: bool = False,
736
+ **kwargs,
737
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
738
+ if "padding_mask" in kwargs:
739
+ warnings.warn(
740
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use"
741
+ " `attention_mask` instead.`"
742
+ )
743
+
744
+ bsz, q_len, _ = hidden_states.size()
745
+
746
+ query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
747
+ key_states = (
748
+ self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
749
+ )
750
+ value_states = (
751
+ self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
752
+ )
753
+
754
+ kv_seq_len = key_states.shape[-2]
755
+ if past_key_value is not None:
756
+ kv_seq_len += past_key_value[0].shape[-2]
757
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
758
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
759
+
760
+ if past_key_value is not None:
761
+ # reuse k, v, self_attention
762
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
763
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
764
+
765
+ past_key_value = (key_states, value_states) if use_cache else None
766
+
767
+ if self.qk_layer_norms:
768
+ query_states = self.q_layer_norm(query_states)
769
+ key_states = self.k_layer_norm(key_states)
770
+
771
+ # repeat k/v heads if n_kv_heads < n_heads
772
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
773
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
774
+
775
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
776
+
777
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
778
+ raise ValueError(
779
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
780
+ f" {attn_weights.size()}"
781
+ )
782
+
783
+ if attention_mask is not None:
784
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
785
+ raise ValueError(
786
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
787
+ )
788
+
789
+ attn_weights = attn_weights + attention_mask
790
+
791
+ # upcast attention to fp32
792
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
793
+ attn_output = torch.matmul(attn_weights, value_states)
794
+
795
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
796
+ raise ValueError(
797
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
798
+ f" {attn_output.size()}"
799
+ )
800
+
801
+ attn_output = attn_output.transpose(1, 2).contiguous()
802
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
803
+
804
+ attn_output = self.o_proj(attn_output)
805
+
806
+ if not output_attentions:
807
+ attn_weights = None
808
+
809
+ return attn_output, attn_weights, past_key_value
810
+
811
+
812
+ class MistralFlashAttention2(MistralAttention):
813
+ """
814
+ Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stay
815
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
816
+ flash attention and deal with padding tokens in case the input contains any of them.
817
+ """
818
+
819
+ def forward(
820
+ self,
821
+ hidden_states: torch.Tensor,
822
+ attention_mask: Optional[torch.Tensor] = None,
823
+ position_ids: Optional[torch.LongTensor] = None,
824
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
825
+ output_attentions: bool = False,
826
+ use_cache: bool = False,
827
+ **kwargs,
828
+ ):
829
+ if "padding_mask" in kwargs:
830
+ warnings.warn(
831
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use"
832
+ " `attention_mask` instead.`"
833
+ )
834
+
835
+ # overwrite attention_mask with padding_mask
836
+ attention_mask = kwargs.pop("padding_mask")
837
+ bsz, q_len, _ = hidden_states.size()
838
+
839
+ query_states = self.q_proj(hidden_states)
840
+ key_states = self.k_proj(hidden_states)
841
+ value_states = self.v_proj(hidden_states)
842
+
843
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
844
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
845
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
846
+
847
+ kv_seq_len = key_states.shape[-2]
848
+ if past_key_value is not None:
849
+ kv_seq_len += past_key_value[0].shape[-2]
850
+
851
+ # Because the input can be padded, the absolute sequence length depends on the max position id.
852
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
853
+ cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
854
+
855
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
856
+
857
+ use_sliding_windows = False
858
+ # use_sliding_windows = (
859
+ # _flash_supports_window_size
860
+ # and hasattr(self.config, "sliding_window") is not None
861
+ # and kv_seq_len > self.config.sliding_window
862
+ # )
863
+ _flash_supports_window_size = None
864
+
865
+ if not _flash_supports_window_size:
866
+ logger.warning_once(
867
+ "The current flash attention version does not support sliding window attention, for a more memory"
868
+ " efficient implementation make sure to upgrade flash-attn library."
869
+ )
870
+
871
+ if past_key_value is not None:
872
+ # Activate cache slicing only if the config has a `sliding_window` attribute
873
+ if hasattr(self.config, "sliding_window") and kv_seq_len > self.config.sliding_window:
874
+ slicing_tokens = kv_seq_len - self.config.sliding_window
875
+
876
+ past_key = past_key_value[0]
877
+ past_value = past_key_value[1]
878
+
879
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
880
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
881
+
882
+ if past_key.shape[-2] != self.config.sliding_window - 1:
883
+ raise ValueError(
884
+ "past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1,"
885
+ f" head_dim`), got {past_key.shape}"
886
+ )
887
+
888
+ past_key_value = (past_key, past_value)
889
+
890
+ if attention_mask is not None:
891
+ attention_mask = attention_mask[:, slicing_tokens:]
892
+ attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
893
+
894
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
895
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
896
+
897
+ past_key_value = (key_states, value_states) if use_cache else None
898
+
899
+ # repeat k/v heads if n_kv_heads < n_heads
900
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
901
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
902
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
903
+
904
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
905
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
906
+ # cast them back to the expected dtype just to be sure everything works as expected.
907
+ input_dtype = query_states.dtype
908
+ if input_dtype == torch.float32:
909
+ # Handle the case where the model is quantized
910
+ if hasattr(self.config, "_pre_quantization_dtype"):
911
+ target_dtype = self.config._pre_quantization_dtype
912
+ else:
913
+ target_dtype = self.q_proj.weight.dtype
914
+
915
+ logger.warning_once(
916
+ "The input hidden states seems to be silently casted in float32, this might be related to the fact"
917
+ " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
918
+ f" {target_dtype}."
919
+ )
920
+
921
+ query_states = query_states.to(target_dtype)
922
+ key_states = key_states.to(target_dtype)
923
+ value_states = value_states.to(target_dtype)
924
+
925
+ # Reshape to the expected shape for Flash Attention
926
+ query_states = query_states.transpose(1, 2)
927
+ key_states = key_states.transpose(1, 2)
928
+ value_states = value_states.transpose(1, 2)
929
+
930
+ attn_output = self._flash_attention_forward(
931
+ query_states,
932
+ key_states,
933
+ value_states,
934
+ attention_mask,
935
+ q_len,
936
+ dropout=dropout_rate,
937
+ use_sliding_windows=use_sliding_windows,
938
+ )
939
+
940
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
941
+ attn_output = self.o_proj(attn_output)
942
+
943
+ if not output_attentions:
944
+ attn_weights = None
945
+
946
+ return attn_output, attn_weights, past_key_value
947
+
948
+ def _flash_attention_forward(
949
+ self,
950
+ query_states,
951
+ key_states,
952
+ value_states,
953
+ attention_mask,
954
+ query_length,
955
+ dropout=0.0,
956
+ softmax_scale=None,
957
+ use_sliding_windows=False,
958
+ ):
959
+ """
960
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
961
+ first unpad the input, then compute the attention scores and pad the final attention scores back.
962
+
963
+ Args:
964
+ query_states (`torch.Tensor`):
965
+ Input query states to be passed to Flash Attention API
966
+ key_states (`torch.Tensor`):
967
+ Input key states to be passed to Flash Attention API
968
+ value_states (`torch.Tensor`):
969
+ Input value states to be passed to Flash Attention API
970
+ attention_mask (`torch.Tensor`):
971
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
972
+ position of padding tokens and 1 for the position of non-padding tokens.
973
+ dropout (`float`, *optional*):
974
+ Attention dropout
975
+ softmax_scale (`float`, *optional*):
976
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
977
+ use_sliding_windows (`bool`, *optional*):
978
+ Whether to activate sliding window attention.
979
+ """
980
+ # Contains at least one padding token in the sequence
981
+ if attention_mask is not None:
982
+ batch_size = query_states.shape[0]
983
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
984
+ query_states, key_states, value_states, attention_mask, query_length
985
+ )
986
+
987
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
988
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
989
+
990
+ if not use_sliding_windows:
991
+ attn_output_unpad = flash_attn_varlen_func(
992
+ query_states,
993
+ key_states,
994
+ value_states,
995
+ cu_seqlens_q=cu_seqlens_q,
996
+ cu_seqlens_k=cu_seqlens_k,
997
+ max_seqlen_q=max_seqlen_in_batch_q,
998
+ max_seqlen_k=max_seqlen_in_batch_k,
999
+ dropout_p=dropout,
1000
+ softmax_scale=softmax_scale,
1001
+ causal=self.is_causal,
1002
+ )
1003
+ else:
1004
+ attn_output_unpad = flash_attn_varlen_func(
1005
+ query_states,
1006
+ key_states,
1007
+ value_states,
1008
+ cu_seqlens_q=cu_seqlens_q,
1009
+ cu_seqlens_k=cu_seqlens_k,
1010
+ max_seqlen_q=max_seqlen_in_batch_q,
1011
+ max_seqlen_k=max_seqlen_in_batch_k,
1012
+ dropout_p=dropout,
1013
+ softmax_scale=softmax_scale,
1014
+ causal=self.is_causal,
1015
+ window_size=(self.config.sliding_window, self.config.sliding_window),
1016
+ )
1017
+
1018
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
1019
+ else:
1020
+ if not use_sliding_windows:
1021
+ attn_output = flash_attn_func(
1022
+ query_states,
1023
+ key_states,
1024
+ value_states,
1025
+ dropout,
1026
+ softmax_scale=softmax_scale,
1027
+ causal=self.is_causal,
1028
+ )
1029
+ else:
1030
+ attn_output = flash_attn_func(
1031
+ query_states,
1032
+ key_states,
1033
+ value_states,
1034
+ dropout,
1035
+ softmax_scale=softmax_scale,
1036
+ causal=self.is_causal,
1037
+ window_size=(self.config.sliding_window, self.config.sliding_window),
1038
+ )
1039
+
1040
+ return attn_output
1041
+
1042
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
1043
+ batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
1044
+
1045
+ # On the first iteration we need to properly re-create the padding mask
1046
+ # by slicing it on the proper place
1047
+ if kv_seq_len != attention_mask.shape[-1]:
1048
+ attention_mask_num_tokens = attention_mask.shape[-1]
1049
+ attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
1050
+
1051
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
1052
+
1053
+ key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
1054
+ value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
1055
+
1056
+ if query_length == kv_seq_len:
1057
+ query_layer = index_first_axis(
1058
+ query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
1059
+ )
1060
+ cu_seqlens_q = cu_seqlens_k
1061
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
1062
+ indices_q = indices_k
1063
+ elif query_length == 1:
1064
+ max_seqlen_in_batch_q = 1
1065
+ cu_seqlens_q = torch.arange(
1066
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
1067
+ ) # There is a memcpy here, that is very bad.
1068
+ indices_q = cu_seqlens_q[:-1]
1069
+ query_layer = query_layer.squeeze(1)
1070
+ else:
1071
+ # The -q_len: slice assumes left padding.
1072
+ attention_mask = attention_mask[:, -query_length:]
1073
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
1074
+
1075
+ return (
1076
+ query_layer,
1077
+ key_layer,
1078
+ value_layer,
1079
+ indices_q,
1080
+ (cu_seqlens_q, cu_seqlens_k),
1081
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
1082
+ )
1083
+
1084
+
1085
+ class MistralDecoderLayer(nn.Module):
1086
+ def __init__(self, config: VMistralConfig):
1087
+ super().__init__()
1088
+ self.hidden_size = config.hidden_size
1089
+ self.self_attn = (
1090
+ MistralAttention(config=config)
1091
+ )
1092
+ self.mlp = MistralMLP(config)
1093
+ self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1094
+ self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1095
+
1096
+ def forward(
1097
+ self,
1098
+ hidden_states: torch.Tensor,
1099
+ attention_mask: Optional[torch.Tensor] = None,
1100
+ position_ids: Optional[torch.LongTensor] = None,
1101
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1102
+ output_attentions: Optional[bool] = False,
1103
+ use_cache: Optional[bool] = False,
1104
+ **kwargs,
1105
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1106
+ if "padding_mask" in kwargs:
1107
+ warnings.warn(
1108
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use"
1109
+ " `attention_mask` instead.`"
1110
+ )
1111
+ """
1112
+ Args:
1113
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1114
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
1115
+ `(batch, sequence_length)` where padding elements are indicated by 0.
1116
+ output_attentions (`bool`, *optional*):
1117
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1118
+ returned tensors for more detail.
1119
+ use_cache (`bool`, *optional*):
1120
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1121
+ (see `past_key_values`).
1122
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
1123
+ """
1124
+
1125
+ residual = hidden_states
1126
+
1127
+ hidden_states = self.input_layernorm(hidden_states)
1128
+
1129
+ # Self Attention
1130
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
1131
+ hidden_states=hidden_states,
1132
+ attention_mask=attention_mask,
1133
+ position_ids=position_ids,
1134
+ past_key_value=past_key_value,
1135
+ output_attentions=output_attentions,
1136
+ use_cache=use_cache,
1137
+ )
1138
+ hidden_states = residual + hidden_states
1139
+
1140
+ # Fully Connected
1141
+ residual = hidden_states
1142
+ hidden_states = self.post_attention_layernorm(hidden_states)
1143
+ hidden_states = self.mlp(hidden_states)
1144
+ hidden_states = residual + hidden_states
1145
+
1146
+ outputs = (hidden_states,)
1147
+
1148
+ if output_attentions:
1149
+ outputs += (self_attn_weights,)
1150
+
1151
+ if use_cache:
1152
+ outputs += (present_key_value,)
1153
+
1154
+ return outputs
1155
+
1156
+
1157
+ MISTRAL_START_DOCSTRING = r"""
1158
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1159
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
1160
+ etc.)
1161
+
1162
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1163
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
1164
+ and behavior.
1165
+
1166
+ Parameters:
1167
+ config ([`VMistralConfig`]):
1168
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
1169
+ load the weights associated with the model, only the configuration. Check out the
1170
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1171
+ """
1172
+
1173
+
1174
+ @add_start_docstrings(
1175
+ "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
1176
+ MISTRAL_START_DOCSTRING,
1177
+ )
1178
+ class VMistralPreTrainedModel(PreTrainedModel):
1179
+ config_class = VMistralConfig
1180
+ base_model_prefix = "model"
1181
+ supports_gradient_checkpointing = True
1182
+ _no_split_modules = ["MistralDecoderLayer"]
1183
+ _skip_keys_device_placement = "past_key_values"
1184
+ _supports_sdpa = False
1185
+
1186
+ def _init_weights(self, module):
1187
+ # important: this ported version of the model isn't meant for training from scratch - only
1188
+ # inference and fine-tuning - so the proper init weights code has been removed - the m4 code
1189
+ # base should be used for training from scratch and it contains the correct code.
1190
+ std = self.config.initializer_range
1191
+ if isinstance(module, nn.Linear):
1192
+ module.weight.data.normal_(mean=0.0, std=std)
1193
+ if module.bias is not None:
1194
+ module.bias.data.zero_()
1195
+ elif isinstance(module, nn.Embedding):
1196
+ module.weight.data.normal_(mean=0.0, std=std)
1197
+ if module.padding_idx is not None:
1198
+ module.weight.data[module.padding_idx].zero_()
1199
+
1200
+ # @classmethod
1201
+ # def override_vision_model_wrapper(cls, model, config, vision_model_name, vision_model_params, torch_dtype):
1202
+ # # this can be called via from_pretrained from a class w/ head or w/o head so we extract the beheaded model version
1203
+ # beheaded_model = model.model if hasattr(model, "model") else model
1204
+ # cls.override_vision_model(beheaded_model, vision_model_name, vision_model_params, torch_dtype)
1205
+ # beheaded_model.freeze_relevant_params(config)
1206
+
1207
+
1208
+ MISTRAL_INPUTS_DOCSTRING = r"""
1209
+ Args:
1210
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1211
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1212
+ it.
1213
+
1214
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1215
+ [`PreTrainedTokenizer.__call__`] for details.
1216
+
1217
+ [What are input IDs?](../glossary#input-ids)
1218
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1219
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1220
+
1221
+ - 1 for tokens that are **not masked**,
1222
+ - 0 for tokens that are **masked**.
1223
+
1224
+ [What are attention masks?](../glossary#attention-mask)
1225
+
1226
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1227
+ [`PreTrainedTokenizer.__call__`] for details.
1228
+
1229
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
1230
+ `past_key_values`).
1231
+
1232
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1233
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1234
+ information on the default strategy.
1235
+
1236
+ - 1 indicates the head is **not masked**,
1237
+ - 0 indicates the head is **masked**.
1238
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1239
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1240
+ config.n_positions - 1]`.
1241
+
1242
+ [What are position IDs?](../glossary#position-ids)
1243
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1244
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
1245
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
1246
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
1247
+
1248
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1249
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
1250
+
1251
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1252
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1253
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1254
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1255
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1256
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1257
+ model's internal embedding lookup matrix.
1258
+ use_cache (`bool`, *optional*):
1259
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1260
+ `past_key_values`).
1261
+ output_attentions (`bool`, *optional*):
1262
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1263
+ tensors for more detail.
1264
+ output_hidden_states (`bool`, *optional*):
1265
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1266
+ more detail.
1267
+ return_dict (`bool`, *optional*):
1268
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1269
+ """
1270
+
1271
+
1272
+ @add_start_docstrings(
1273
+ "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
1274
+ MISTRAL_START_DOCSTRING,
1275
+ )
1276
+ class VMistralModel(VMistralPreTrainedModel):
1277
+ """
1278
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
1279
+
1280
+ Args:
1281
+ config: VMistralConfig
1282
+ """
1283
+
1284
+ def __init__(self, config: VMistralConfig, vision_model=None):
1285
+ super().__init__(config)
1286
+ self.config = config
1287
+ self.padding_idx = config.pad_token_id
1288
+ self.vocab_size = config.vocab_size
1289
+
1290
+ self.sliding_window = config.sliding_window
1291
+
1292
+ self.embed_tokens = DecoupledEmbedding(
1293
+ num_embeddings=config.vocab_size,
1294
+ num_additional_embeddings=config.additional_vocab_size,
1295
+ embedding_dim=config.hidden_size,
1296
+ partially_freeze=config.freeze_text_layers,
1297
+ padding_idx=self.padding_idx,
1298
+ )
1299
+
1300
+ # Load an uninitialized vision model here; `from_pretrained` will later load the pre-trained weights -
1301
+ # this avoids losing those weights when `from_pretrained` is called on the main model
1302
+ self.vision_model = SiglipVisionModel(config.vision_config)
1303
+
1304
+ # Dim projection - projecting from the vision dim to the text dim
1305
+ self.modality_projection = ModalityProjection(
1306
+ embed_dim_in=self.config.vision_config.hidden_size, embed_dim_out=self.config.hidden_size
1307
+ )
1308
+
1309
+ # Perceiver Resampler
1310
+ if config.use_resampler:
1311
+ self.perceiver_resampler = PerceiverResampler(
1312
+ config.hidden_size,
1313
+ config.perceiver_config.resampler_depth,
1314
+ config.perceiver_config.resampler_n_heads,
1315
+ config.perceiver_config.resampler_head_dim,
1316
+ config.perceiver_config.resampler_n_latents,
1317
+ config.perceiver_config.qk_layer_norms_perceiver,
1318
+ )
1319
+
1320
+ if config.use_resampler:
1321
+ self.image_seq_len = config.perceiver_config.resampler_n_latents
1322
+ else:
1323
+ self.image_seq_len = (
1324
+ config.vision_config.image_size // config.vision_config.patch_size
1325
+ ) ** 2 # TODO: pretty sure that does not work for CLIP models since there is the CLS token
1326
+ self.image_token_id = self.config.image_token_id
1327
+
1328
+ self.layers = nn.ModuleList([MistralDecoderLayer(config) for _ in range(config.num_hidden_layers)])
1329
+
1330
+ self.gradient_checkpointing = False
1331
+
1332
+ self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1333
+
1334
+ # Initialize weights and apply final processing
1335
+ self.post_init()
1336
+
1337
+ self.freeze_relevant_params(config)
1338
+
1339
+ def freeze_relevant_params(self, config=None):
1340
+ if config is None:
1341
+ config = self.config
1342
+
1343
+ if config.freeze_text_layers:
1344
+ self.freeze_text_layers(config.freeze_text_module_exceptions)
1345
+
1346
+ if config.freeze_vision_layers:
1347
+ freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)
1348
+
1349
+ def freeze_text_layers(self, module_exceptions):
1350
+ for module in [self.layers, self.norm]:
1351
+ freeze_model(module, module_exceptions=module_exceptions)
1352
+
1353
+ def get_input_embeddings(self):
1354
+ return self.embed_tokens
1355
+
1356
+ def set_input_embeddings(self, value):
1357
+ self.embed_tokens = value
1358
+
1359
+ def inputs_merger(
1360
+ self,
1361
+ input_ids: torch.LongTensor = None,
1362
+ inputs_embeds: Optional[torch.Tensor] = None,
1363
+ image_hidden_states: Optional[torch.Tensor] = None,
1364
+ ):
1365
+ """
1366
+ This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
1367
+ The merging happens as follows:
1368
+ - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
1369
+ - We get the image hidden states for the image through the vision encoder (and potentially the perceiver), and that hidden state is then projected into the text embedding space.
1370
+ We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
1371
+ - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_tok_around_image vector_tok_4`. That sequence is fed to the LM.
1372
+ - To fit the format of that sequence, `input_ids`, `inputs_embeds`, and `attention_mask` are all adapted to insert the image hidden states.
1373
+ """
1374
+ batch_size = input_ids.size(0)
1375
+
1376
+ if inputs_embeds is not None:
1377
+ new_inputs_embeds = inputs_embeds.clone()
1378
+
1379
+ if image_hidden_states is not None:
1380
+ vision_pipeline_output_seq_len = image_hidden_states.shape[1]
1381
+ vision_hidden_size = image_hidden_states.shape[2]
1382
+ # Get the number of images for each example
1383
+ num_images = (input_ids == self.image_token_id).sum(dim=-1) // self.image_seq_len
1384
+ cum_num_images = num_images.cumsum(dim=-1)
1385
+ for batch_idx in range(batch_size):
1386
+ # Get the number of images for this particular example
1387
+ example_num_images = num_images[batch_idx]
1388
+ # Get the image_hidden_states corresponding to True images for the example, so get rid of the padding images.
1389
+ start = 0 if batch_idx == 0 else cum_num_images[batch_idx - 1]
1390
+ end = cum_num_images[batch_idx]
1391
+ example_true_image_hidden_states = image_hidden_states[start:end]
1392
+ if (
1393
+ new_inputs_embeds[batch_idx][input_ids[batch_idx] == self.image_token_id].shape[0]
1394
+ != example_num_images * vision_pipeline_output_seq_len
1395
+ ):
1396
+ raise ValueError(
1397
+ "new_inputs_embeds to replace has shape[0]:"
1398
+ f" {new_inputs_embeds[batch_idx][input_ids[batch_idx] == self.image_token_id].shape[0]} but"
1399
+ " should have shape[0]:"
1400
+ f" {example_num_images}*{vision_pipeline_output_seq_len}={example_num_images * vision_pipeline_output_seq_len} "
1401
+ )
1402
+ # Insert the image_hidden_states
1403
+ new_inputs_embeds[batch_idx][input_ids[batch_idx] == self.image_token_id] = (
1404
+ example_true_image_hidden_states.view(
1405
+ example_num_images * vision_pipeline_output_seq_len,
1406
+ vision_hidden_size,
1407
+ )
1408
+ )
1409
+
1410
+ return_dict = {}
1411
+ if inputs_embeds is not None:
1412
+ return_dict["inputs_embeds"] = new_inputs_embeds
1413
+
1414
+ return return_dict
1415
+
1416
+ @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
1417
+ def forward(
1418
+ self,
1419
+ input_ids: torch.LongTensor = None,
1420
+ attention_mask: Optional[torch.Tensor] = None,
1421
+ position_ids: Optional[torch.LongTensor] = None,
1422
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1423
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1424
+ pixel_values: Optional[torch.FloatTensor] = None,
1425
+ image_hidden_states: Optional[torch.FloatTensor] = None,
1426
+ use_cache: Optional[bool] = None,
1427
+ output_attentions: Optional[bool] = None,
1428
+ output_hidden_states: Optional[bool] = None,
1429
+ return_dict: Optional[bool] = None,
1430
+ ) -> Union[Tuple, VMistralBaseModelOutputWithPast]:
1431
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1432
+
1433
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1434
+ output_hidden_states = (
1435
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1436
+ )
1437
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1438
+
1439
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1440
+
1441
+ # retrieve input_ids and inputs_embeds
1442
+ if input_ids is not None and inputs_embeds is not None:
1443
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
1444
+ elif input_ids is not None:
1445
+ batch_size, seq_length = input_ids.shape
1446
+ elif inputs_embeds is not None:
1447
+ batch_size, seq_length, _ = inputs_embeds.shape
1448
+ else:
1449
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
1450
+
1451
+ seq_length_with_past = seq_length
1452
+ past_key_values_length = 0
1453
+
1454
+ if past_key_values is not None:
1455
+ past_key_values_length = past_key_values[0][0].shape[2]
1456
+ seq_length_with_past = seq_length_with_past + past_key_values_length
1457
+
1458
+ if position_ids is None:
1459
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1460
+ position_ids = torch.arange(
1461
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
1462
+ )
1463
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
1464
+ else:
1465
+ position_ids = position_ids.view(-1, seq_length).long()
1466
+
1467
+ if inputs_embeds is None:
1468
+ inputs_embeds = self.embed_tokens(input_ids)
1469
+
1470
+ # START VISUAL INPUTS INTEGRATION
1471
+ if pixel_values is not None and image_hidden_states is not None:
1472
+ raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
1473
+ elif pixel_values is not None:
1474
+ pixel_values = pixel_values.to(dtype=self.dtype, device=input_ids.device) # fp16 compatibility
1475
+ batch_size, num_images = pixel_values.size(0), pixel_values.size(1)
1476
+ pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
1477
+ # Remove padding images - padding images are full 0.
1478
+ real_images_inds = pixel_values.sum(dim=(-1, -2, -3)) != 0.0
1479
+ pixel_values = pixel_values[real_images_inds]
1480
+ # Get sequence from the vision encoder
1481
+ image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
1482
+
1483
+ # Modality projection
1484
+ image_hidden_states = self.modality_projection(image_hidden_states)
1485
+
1486
+ if self.config.use_resampler:
1487
+ image_hidden_states = self.perceiver_resampler(image_hidden_states)
1488
+ elif image_hidden_states is not None:
1489
+ image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
1490
+
1491
+ if past_key_values is None:
1492
+ # When we generate, we don't want to replace the potential image_token_id that we generated by images
1493
+ # that simply don't exist
1494
+ new_inp = self.inputs_merger(
1495
+ input_ids=input_ids,
1496
+ inputs_embeds=inputs_embeds,
1497
+ image_hidden_states=image_hidden_states,
1498
+ )
1499
+ inputs_embeds = new_inp["inputs_embeds"]
1500
+
1501
+ # Could add token type embeddings here (image token vs text token),
1502
+ # something like inputs_embeds += self.token_types(token_types)
1503
+
1504
+ # embed positions
1505
+ if (
1506
+ attention_mask is not None
1507
+ and hasattr(self.config, "_flash_attn_2_enabled")
1508
+ and self.config._flash_attn_2_enabled
1509
+ and past_key_values is not None
1510
+ ):
1511
+ is_padding_right = attention_mask[:, -1].sum().item() != batch_size
1512
+ if is_padding_right:
1513
+ raise ValueError(
1514
+ "You are attempting to perform batched generation with padding_side='right'"
1515
+ " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
1516
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
1517
+ )
1518
+
1519
+ if getattr(self.config, "_flash_attn_2_enabled", False):
1520
+ # 2d mask is passed through the layers
1521
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1522
+ else:
1523
+ # 4d mask is passed through the layers
1524
+ attention_mask = _prepare_4d_causal_attention_mask(
1525
+ attention_mask,
1526
+ (batch_size, seq_length),
1527
+ inputs_embeds,
1528
+ past_key_values_length,
1529
+ sliding_window=self.config.sliding_window,
1530
+ )
1531
+ attention_mask[attention_mask == -float("inf")] = torch.finfo(self.dtype).min
1532
+
1533
+ hidden_states = inputs_embeds
1534
+
1535
+ if self.gradient_checkpointing and self.training:
1536
+ if use_cache:
1537
+ logger.warning_once(
1538
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1539
+ )
1540
+ use_cache = False
1541
+
1542
+ # decoder layers
1543
+ all_hidden_states = () if output_hidden_states else None
1544
+ all_self_attns = () if output_attentions else None
1545
+ next_decoder_cache = () if use_cache else None
1546
+
1547
+ for idx, decoder_layer in enumerate(self.layers):
1548
+ if output_hidden_states:
1549
+ all_hidden_states += (hidden_states,)
1550
+
1551
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
1552
+
1553
+ if self.gradient_checkpointing and self.training:
1554
+ layer_outputs = self._gradient_checkpointing_func(
1555
+ decoder_layer.__call__,
1556
+ hidden_states,
1557
+ attention_mask,
1558
+ position_ids,
1559
+ past_key_value,
1560
+ output_attentions,
1561
+ use_cache,
1562
+ )
1563
+ else:
1564
+ layer_outputs = decoder_layer(
1565
+ hidden_states,
1566
+ attention_mask=attention_mask,
1567
+ position_ids=position_ids,
1568
+ past_key_value=past_key_value,
1569
+ output_attentions=output_attentions,
1570
+ use_cache=use_cache,
1571
+ )
1572
+
1573
+ hidden_states = layer_outputs[0]
1574
+
1575
+ if use_cache:
1576
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
1577
+
1578
+ if output_attentions:
1579
+ all_self_attns += (layer_outputs[1],)
1580
+
1581
+ hidden_states = self.norm(hidden_states)
1582
+
1583
+ # add hidden states from the last decoder layer
1584
+ if output_hidden_states:
1585
+ all_hidden_states += (hidden_states,)
1586
+
1587
+ next_cache = next_decoder_cache if use_cache else None
1588
+ if not return_dict:
1589
+ return tuple(
1590
+ v
1591
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, image_hidden_states]
1592
+ if v is not None
1593
+ )
1594
+ return VMistralBaseModelOutputWithPast(
1595
+ last_hidden_state=hidden_states,
1596
+ past_key_values=next_cache,
1597
+ hidden_states=all_hidden_states,
1598
+ attentions=all_self_attns,
1599
+ image_hidden_states=image_hidden_states,
1600
+ )
1601
+
1602
+
1603
+ class VMistralForVisionText2Text(VMistralPreTrainedModel):
1604
+ _tied_weights_keys = ["lm_head.weight"]
1605
+
1606
+ def __init__(self, config, vision_model=None):
1607
+ super().__init__(config)
1608
+ self.model = VMistralModel(config, vision_model=vision_model)
1609
+ self.image_token_id = self.config.image_token_id
1610
+ self.lm_head = DecoupledLinear(
1611
+ in_features=config.hidden_size,
1612
+ out_features=config.vocab_size,
1613
+ out_additional_features=config.additional_vocab_size,
1614
+ bias=False,
1615
+ partially_freeze=config.freeze_lm_head,
1616
+ )
1617
+
1618
+ # Initialize weights and apply final processing
1619
+ self.post_init()
1620
+
1621
+ def get_input_embeddings(self):
1622
+ return self.model.embed_tokens
1623
+
1624
+ def set_input_embeddings(self, value):
1625
+ self.model.embed_tokens = value
1626
+
1627
+ def get_output_embeddings(self):
1628
+ return self.lm_head
1629
+
1630
+ def set_output_embeddings(self, new_embeddings):
1631
+ self.lm_head = new_embeddings
1632
+
1633
+ def set_decoder(self, decoder):
1634
+ self.model = decoder
1635
+
1636
+ def get_decoder(self):
1637
+ return self.model
1638
+
1639
+ def tie_weights(self):
1640
+ """
1641
+ Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
1642
+ """
1643
+ output_embeddings = self.get_output_embeddings()
1644
+ input_embeddings = self.get_input_embeddings()
1645
+
1646
+ if getattr(self.config, "tie_word_embeddings", True):
1647
+ output_embeddings.weight = input_embeddings.weight
1648
+ if input_embeddings.num_additional_embeddings > 0:
1649
+ assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
1650
+ output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
1651
+
1652
+ if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
1653
+ output_embeddings.out_features = input_embeddings.num_embeddings
1654
+ if hasattr(output_embeddings, "out_additional_features") and hasattr(
1655
+ input_embeddings, "num_additional_embeddings"
1656
+ ):
1657
+ output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
1658
+
1659
+ @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
1660
+ @replace_return_docstrings(output_type=VMistralCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1661
+ def forward(
1662
+ self,
1663
+ input_ids: torch.LongTensor = None,
1664
+ attention_mask: Optional[torch.Tensor] = None,
1665
+ position_ids: Optional[torch.LongTensor] = None,
1666
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1667
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1668
+ pixel_values: Optional[torch.FloatTensor] = None,
1669
+ image_hidden_states: Optional[torch.FloatTensor] = None,
1670
+ labels: Optional[torch.LongTensor] = None,
1671
+ use_cache: Optional[bool] = None,
1672
+ output_attentions: Optional[bool] = None,
1673
+ output_hidden_states: Optional[bool] = None,
1674
+ return_dict: Optional[bool] = None,
1675
+ ) -> Union[Tuple, VMistralCausalLMOutputWithPast]:
1676
+ r"""
1677
+ Args:
1678
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1679
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1680
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1681
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1682
+
1683
+ Returns:
1684
+
1685
+ """
1686
+
1687
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1688
+ output_hidden_states = (
1689
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1690
+ )
1691
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1692
+
1693
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
1694
+ outputs = self.model(
1695
+ input_ids=input_ids,
1696
+ attention_mask=attention_mask,
1697
+ position_ids=position_ids,
1698
+ past_key_values=past_key_values,
1699
+ inputs_embeds=inputs_embeds,
1700
+ pixel_values=pixel_values,
1701
+ image_hidden_states=image_hidden_states,
1702
+ use_cache=use_cache,
1703
+ output_attentions=output_attentions,
1704
+ output_hidden_states=output_hidden_states,
1705
+ return_dict=return_dict,
1706
+ )
1707
+
1708
+ hidden_states = outputs[0]
1709
+ logits = self.lm_head(hidden_states)
1710
+ logits = logits.float()
1711
+
1712
+ loss = None
1713
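+ # Standard next-token loss: logits and labels are shifted by one position, padded positions
+ # are dropped via the attention mask, and image placeholder tokens are excluded through
+ # `ignore_index=self.image_token_id`.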
+ if labels is not None:
1714
+ labels = labels.to(logits.device)
1715
+ # Shift so that tokens < n predict n
1716
+ if attention_mask is not None:
1717
+ shift_attention_mask = attention_mask[..., 1:].to(logits.device)
1718
+ shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
1719
+ shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
1720
+ else:
1721
+ shift_logits = logits[..., :-1, :].contiguous()
1722
+ shift_labels = labels[..., 1:].contiguous()
1723
+ # Flatten the tokens
1724
+ loss_fct = CrossEntropyLoss(ignore_index=self.image_token_id)
1725
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
1726
+
1727
+ if not return_dict:
1728
+ output = (logits,) + outputs[1:]
1729
+ return (loss,) + output if loss is not None else output
1730
+
1731
+ return VMistralCausalLMOutputWithPast(
1732
+ loss=loss,
1733
+ logits=logits,
1734
+ past_key_values=outputs.past_key_values,
1735
+ hidden_states=outputs.hidden_states,
1736
+ attentions=outputs.attentions,
1737
+ image_hidden_states=outputs.image_hidden_states,
1738
+ )
1739
+
1740
+ def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
1741
+ image_hidden_states = kwargs.pop("image_hidden_states", None)
1742
+ if image_hidden_states is not None:
1743
+ kwargs["pixel_values"] = None
1744
+ inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
1745
+ unwanted_kwargs = ["token_type_ids"]
1746
+ for kwarg in unwanted_kwargs:
1747
+ inputs.pop(kwarg, None)
1748
+ return inputs
1749
+
1750
+ @staticmethod
1751
+ def _expand_inputs_for_generation(
1752
+ *args,
1753
+ **model_kwargs,
1754
+ ):
1755
+ return expand_inputs_for_generation(*args, **model_kwargs)
1756
+
1757
+ @staticmethod
1758
+ def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder):
1759
+ return update_model_kwargs_for_generation(outputs, model_kwargs)
1760
+
1761
+ @staticmethod
1762
+ def _reorder_cache(past, beam_idx):
1763
+ reordered_past = ()
1764
+ for layer_past in past:
1765
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
1766
+ return reordered_past
modeling_web.py ADDED
@@ -0,0 +1,681 @@
1
+ from dataclasses import dataclass
2
+ import inspect
3
+ import warnings
4
+ from typing import List, Optional, Tuple, Union
5
+ import sys
6
+ import os
7
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torch.utils.checkpoint
11
+ from torch import nn
12
+ from torch.nn import CrossEntropyLoss
13
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
14
+ from transformers.utils import (
15
+ is_flash_attn_2_available
16
+ )
17
+ from transformers import PreTrainedModel
18
+ from transformers.modeling_outputs import ModelOutput
19
+
20
+ from .configuration_vmistral import VMistralConfig
21
+ from .vision import SiglipVisionModel
22
+ from .modeling_vmistral import *
23
+ from .generation_utils import TreeBuilder, WebGenerationMixin
24
+ import time
25
+
26
+
27
+ if is_flash_attn_2_available():
28
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
29
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
30
+
31
+ _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
32
+
33
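+ # Same fields as the usual causal-LM output, plus `html_tree`, which carries the HTML
+ # structure state (and its growing web attention mask) across generation steps.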
+ @dataclass
34
+ class WebLMOutputWithPast(ModelOutput):
35
+ loss: Optional[torch.FloatTensor] = None
36
+ logits: torch.FloatTensor = None
37
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
38
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
39
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
40
+ image_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
41
+ html_tree: TreeBuilder = None
42
+
43
+
44
+ class WebAttention(nn.Module):
45
+ """
46
+ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
47
+ and "Generating Long Sequences with Sparse Transformers".
48
+ """
49
+
50
+ def __init__(self, config: VMistralConfig, qk_layer_norms: bool = False):
51
+ super().__init__()
52
+ self.config = config
53
+ self.hidden_size = config.hidden_size
54
+ self.num_heads = config.num_attention_heads
55
+ self.head_dim = self.hidden_size // self.num_heads
56
+ self.num_key_value_heads = config.num_key_value_heads
57
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
58
+ self.max_position_embeddings = config.max_position_embeddings
59
+ self.rope_theta = config.rope_theta
60
+ self.is_causal = True
61
+
62
+ if (self.head_dim * self.num_heads) != self.hidden_size:
63
+ raise ValueError(
64
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
65
+ f" and `num_heads`: {self.num_heads})."
66
+ )
67
+
68
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
69
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
70
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
71
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
72
+
73
+ self.qk_layer_norms = qk_layer_norms
74
+ if self.qk_layer_norms:
75
+ self.q_layer_norm = MistralRMSNorm(self.head_dim, eps=config.rms_norm_eps)
76
+ self.k_layer_norm = MistralRMSNorm(self.head_dim, eps=config.rms_norm_eps)
77
+
78
+ self.rotary_emb = MistralRotaryEmbedding(
79
+ self.head_dim,
80
+ max_position_embeddings=self.max_position_embeddings,
81
+ base=self.rope_theta,
82
+ )
83
+ self.attention_dropout = config.attention_dropout
84
+
85
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
86
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
87
+
88
+ def forward(
89
+ self,
90
+ hidden_states: torch.Tensor,
91
+ key_value_states: Optional[torch.Tensor] = None,
92
+ attention_mask: Optional[torch.Tensor] = None,
93
+ web_attention_mask: Optional[torch.Tensor] = None,
94
+ position_ids: Optional[torch.LongTensor] = None,
95
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
96
+ output_attentions: bool = False,
97
+ use_cache: bool = False,
98
+ **kwargs,
99
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
100
+ if "padding_mask" in kwargs:
101
+ warnings.warn(
102
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use"
103
+ " `attention_mask` instead.`"
104
+ )
105
+
106
+ bsz, q_len, _ = hidden_states.size()
107
+
108
+ query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
109
+ key_states = (
110
+ self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
111
+ )
112
+ value_states = (
113
+ self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
114
+ )
115
+
116
+ kv_seq_len = key_states.shape[-2]
117
+ if past_key_value is not None:
118
+ kv_seq_len += past_key_value[0].shape[-2]
119
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
120
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
121
+
122
+ if past_key_value is not None:
123
+ # reuse k, v, self_attention
124
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
125
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
126
+
127
+ past_key_value = (key_states, value_states) if use_cache else None
128
+
129
+ if self.qk_layer_norms:
130
+ query_states = self.q_layer_norm(query_states)
131
+ key_states = self.k_layer_norm(key_states)
132
+
133
+ # repeat k/v heads if n_kv_heads < n_heads
134
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
135
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
136
+ web_attention_range = self.config.web_attention_range
137
+
138
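+ # The attention heads are treated as 8 groups: `web_attention_range` of them attend through
+ # the HTML-structure ("web") mask, the remaining groups through the ordinary causal mask.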
+ def split_tensor(tensor):
139
+ if int(web_attention_range) == 8:
140
+ return
141
+ fraction = float(web_attention_range) / 8
142
+ split_size_2 = int(self.num_heads * fraction)
143
+ split_size_1 = self.num_heads - split_size_2
144
+ return torch.split(tensor, [split_size_1, split_size_2], dim=1)
145
+
146
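+ # Scaled-dot-product attention below is pinned to the math backend (flash / mem-efficient
+ # disabled), presumably because explicit, pre-built attention masks are passed in.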
+ if int(web_attention_range) != 8:
147
+ query_states_1, query_states_2 = split_tensor(query_states)
148
+ key_states_1, key_states_2 = split_tensor(key_states)
149
+ value_states_1, value_states_2 = split_tensor(value_states)
150
+
151
+ with torch.backends.cuda.sdp_kernel(
152
+ enable_flash=False, enable_math=True, enable_mem_efficient=False
153
+ ):
154
+ attn_output_1 = F.scaled_dot_product_attention(query_states_1, key_states_1, value_states_1, attn_mask=attention_mask)
155
+
156
+ attn_output_2 = F.scaled_dot_product_attention(query_states_2, key_states_2, value_states_2, attn_mask=web_attention_mask)
157
+ attn_output = torch.cat([attn_output_1, attn_output_2], dim=1)
158
+ else:
159
+ with torch.backends.cuda.sdp_kernel(
160
+ enable_flash=False, enable_math=True, enable_mem_efficient=False
161
+ ):
162
+ attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask=web_attention_mask)
163
+
164
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
165
+ raise ValueError(
166
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
167
+ f" {attn_output.size()}"
168
+ )
169
+
170
+ attn_output = attn_output.transpose(1, 2).contiguous()
171
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
172
+
173
+ attn_output = self.o_proj(attn_output)
174
+
175
+ if not output_attentions:
176
+ attn_weights = None
177
+
178
+ return attn_output, attn_weights, past_key_value
179
+
180
+
181
+ class WebFlashAttention2(WebAttention):
182
+ """
183
+ Web flash attention module. This module inherits from `WebAttention` as the weights of the module stay
184
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
185
+ flash attention and deal with padding tokens in case the input contains any of them.
186
+ """
187
+
188
+ class WebDecoderLayer(nn.Module):
189
+ def __init__(self, config: VMistralConfig):
190
+ super().__init__()
191
+ self.hidden_size = config.hidden_size
192
+ self.self_attn = (
193
+ WebAttention(config=config)
194
+ if not getattr(config, "_flash_attn_2_enabled", False)
195
+ else WebFlashAttention2(config)
196
+ )
197
+ self.mlp = MistralMLP(config)
198
+ self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
199
+ self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
200
+
201
+ def forward(
202
+ self,
203
+ hidden_states: torch.Tensor,
204
+ attention_mask: Optional[torch.Tensor] = None,
205
+ web_attention_mask: Optional[torch.Tensor] = None,
206
+ position_ids: Optional[torch.LongTensor] = None,
207
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
208
+ output_attentions: Optional[bool] = False,
209
+ use_cache: Optional[bool] = False,
210
+ **kwargs,
211
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
212
+ if "padding_mask" in kwargs:
213
+ warnings.warn(
214
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use"
215
+ " `attention_mask` instead.`"
216
+ )
217
+ """
218
+ Args:
219
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
220
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
221
+ `(batch, sequence_length)` where padding elements are indicated by 0.
222
+ output_attentions (`bool`, *optional*):
223
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
224
+ returned tensors for more detail.
225
+ use_cache (`bool`, *optional*):
226
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
227
+ (see `past_key_values`).
228
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
229
+ """
230
+
231
+ residual = hidden_states
232
+
233
+ hidden_states = self.input_layernorm(hidden_states)
234
+ # Self Attention
235
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
236
+ hidden_states=hidden_states,
237
+ attention_mask=attention_mask,
238
+ web_attention_mask=web_attention_mask,
239
+ position_ids=position_ids,
240
+ past_key_value=past_key_value,
241
+ output_attentions=output_attentions,
242
+ use_cache=use_cache,
243
+ )
244
+ hidden_states = residual + hidden_states
245
+
246
+ # Fully Connected
247
+ residual = hidden_states
248
+ hidden_states = self.post_attention_layernorm(hidden_states)
249
+ hidden_states = self.mlp(hidden_states)
250
+ hidden_states = residual + hidden_states
251
+
252
+ outputs = (hidden_states,)
253
+
254
+ if output_attentions:
255
+ outputs += (self_attn_weights,)
256
+
257
+ if use_cache:
258
+ outputs += (present_key_value,)
259
+
260
+ return outputs
261
+
262
+ class WebPreTrainedModel(PreTrainedModel):
263
+ config_class = VMistralConfig
264
+ base_model_prefix = "model"
265
+ supports_gradient_checkpointing = True
266
+ _no_split_modules = ["WebDecoderLayer"]
267
+ _skip_keys_device_placement = "past_key_values"
268
+ _supports_sdpa = False
269
+
270
+
271
+ class WebModel(WebPreTrainedModel, VMistralModel):
272
+ """
273
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
274
+
275
+ Args:
276
+ config: VMistralConfig
277
+ """
278
+
279
+ def __init__(self, config: VMistralConfig, vision_model=None):
280
+ super().__init__(config)
281
+ self.config = config
282
+ self.padding_idx = config.pad_token_id
283
+ self.vocab_size = config.vocab_size
284
+
285
+ self.sliding_window = config.sliding_window
286
+
287
+ self.embed_tokens = DecoupledEmbedding(
288
+ num_embeddings=config.vocab_size,
289
+ num_additional_embeddings=config.additional_vocab_size,
290
+ embedding_dim=config.hidden_size,
291
+ partially_freeze=config.freeze_text_layers,
292
+ padding_idx=self.padding_idx,
293
+ )
294
+
295
+ # Load an uninitialized vision model here; `from_pretrained` will later load the pre-trained weights -
296
+ # this avoids losing those weights when `from_pretrained` is called on the main model
297
+ self.vision_model = SiglipVisionModel(config.vision_config)
298
+
299
+ # Dim projection - projecting from the vision dim to the text dim
300
+ self.modality_projection = ModalityProjection(
301
+ embed_dim_in=self.config.vision_config.hidden_size, embed_dim_out=self.config.hidden_size
302
+ )
303
+
304
+ # Perceiver Resampler
305
+ if config.use_resampler:
306
+ self.perceiver_resampler = PerceiverResampler(
307
+ config.hidden_size,
308
+ config.perceiver_config.resampler_depth,
309
+ config.perceiver_config.resampler_n_heads,
310
+ config.perceiver_config.resampler_head_dim,
311
+ config.perceiver_config.resampler_n_latents,
312
+ config.perceiver_config.qk_layer_norms_perceiver,
313
+ )
314
+
315
+ if config.use_resampler:
316
+ self.image_seq_len = config.perceiver_config.resampler_n_latents
317
+ else:
318
+ self.image_seq_len = (
319
+ config.vision_config.image_size // config.vision_config.patch_size
320
+ ) ** 2 # TODO: pretty sure that does not work for CLIP models since there is the CLS token
321
+ self.image_token_id = self.config.image_token_id
322
+
323
+ self.layers = nn.ModuleList([WebDecoderLayer(config) for _ in range(config.num_hidden_layers)])
324
+
325
+ self.gradient_checkpointing = False
326
+
327
+ self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
328
+
329
+ # Initialize weights and apply final processing
330
+ self.post_init()
331
+
332
+ self.freeze_relevant_params(config)
333
+
334
+ def forward(
335
+ self,
336
+ input_ids: torch.LongTensor = None,
337
+ attention_mask: Optional[torch.Tensor] = None,
338
+ web_attention_mask: Optional[torch.Tensor] = None,
339
+ position_ids: Optional[torch.LongTensor] = None,
340
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
341
+ inputs_embeds: Optional[torch.FloatTensor] = None,
342
+ pixel_values: Optional[torch.FloatTensor] = None,
343
+ image_hidden_states: Optional[torch.FloatTensor] = None,
344
+ use_cache: Optional[bool] = None,
345
+ output_attentions: Optional[bool] = None,
346
+ output_hidden_states: Optional[bool] = None,
347
+ return_dict: Optional[bool] = None,
348
+ ) -> Union[Tuple, VMistralBaseModelOutputWithPast]:
349
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
350
+
351
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
352
+ output_hidden_states = (
353
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
354
+ )
355
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
356
+
357
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
358
+
359
+ # retrieve input_ids and inputs_embeds
360
+ if input_ids is not None and inputs_embeds is not None:
361
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
362
+ elif input_ids is not None:
363
+ batch_size, seq_length = input_ids.shape
364
+ elif inputs_embeds is not None:
365
+ batch_size, seq_length, _ = inputs_embeds.shape
366
+ else:
367
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
368
+
369
+ seq_length_with_past = seq_length
370
+ past_key_values_length = 0
371
+
372
+ if past_key_values is not None:
373
+ past_key_values_length = past_key_values[0][0].shape[2]
374
+ seq_length_with_past = seq_length_with_past + past_key_values_length
375
+
376
+ if position_ids is None:
377
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
378
+ position_ids = torch.arange(
379
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
380
+ )
381
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
382
+ else:
383
+ position_ids = position_ids.view(-1, seq_length).long()
384
+
385
+ if inputs_embeds is None:
386
+ inputs_embeds = self.embed_tokens(input_ids)
387
+
388
+ # START VISUAL INPUTS INTEGRATION
389
+ if pixel_values is not None and image_hidden_states is not None:
390
+ raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
391
+ elif pixel_values is not None:
392
+ pixel_values = pixel_values.to(dtype=self.dtype, device=input_ids.device) # fp16 compatibility
393
+ batch_size, num_images = pixel_values.size(0), pixel_values.size(1)
394
+
395
+ # this change allows multiple images in a single batch
396
+ pixel_values = pixel_values.contiguous().view(batch_size, num_images, *pixel_values.shape[2:])
397
+ # # Remove padding images - padding images are full 0.
398
+ # real_images_inds = pixel_values.sum(dim=(-1, -2, -3)) != 0.0
399
400
+ # pixel_values = pixel_values[real_images_inds]
401
+ # # Get sequence from the vision encoder
402
+ # print("shape_pixel", pixel_values.shape)
403
+ image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
404
+
405
+ # Modality projection
406
+ image_hidden_states = self.modality_projection(image_hidden_states)
407
+
408
+ if self.config.use_resampler:
409
+ image_hidden_states = self.perceiver_resampler(image_hidden_states)
410
+ elif image_hidden_states is not None:
411
+ image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
412
+
413
+ if past_key_values is None:
414
+ # When we generate, we don't want to replace the potential image_token_id that we generated by images
415
+ # that simply don't exist
416
+ new_inp = self.inputs_merger(
417
+ input_ids=input_ids,
418
+ inputs_embeds=inputs_embeds,
419
+ image_hidden_states=image_hidden_states,
420
+ )
421
+ inputs_embeds = new_inp["inputs_embeds"]
422
+
423
+ # Could add token type embeddings here (image token vs text token),
424
+ # something like inputs_embeds += self.token_types(token_types)
425
+
426
+ # embed positions
427
+ if (
428
+ attention_mask is not None
429
+ and hasattr(self.config, "_flash_attn_2_enabled")
430
+ and self.config._flash_attn_2_enabled
431
+ and past_key_values is not None
432
+ ):
433
+ is_padding_right = attention_mask[:, -1].sum().item() != batch_size
434
+ if is_padding_right:
435
+ raise ValueError(
436
+ "You are attempting to perform batched generation with padding_side='right'"
437
+ " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
438
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
439
+ )
440
+ # We did not implement our model using Flash attn 2
441
+ self.config._flash_attn_2_enabled = False
442
+ if not getattr(self.config, "_flash_attn_2_enabled", False):
443
+ # 2d mask is passed through the layers
444
+ # attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
445
+ attention_mask = _prepare_4d_causal_attention_mask(
446
+ attention_mask,
447
+ (batch_size, seq_length),
448
+ inputs_embeds,
449
+ past_key_values_length,
450
+ )
451
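+ # Turn the 0/1 web attention mask into an additive mask (0 where attending, a large negative
+ # value where blocked) and keep only the rows belonging to the current query tokens.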
+ web_attention_mask = web_attention_mask.unsqueeze(1)
452
+ inverted_mask = 1.0 - web_attention_mask.to(inputs_embeds.dtype)
453
+ web_attention_mask = inverted_mask.masked_fill(
454
+ inverted_mask.to(torch.bool), -1.e32
455
+ )
456
+ if input_ids is not None:
457
+ bsz, L = input_ids.size()[:2]
458
+ web_attention_mask = web_attention_mask[:, :, -L:, :]
459
+ else:
460
+ print("Exiting, wrong branch")
461
+ exit()
462
+ # 4d mask is passed through the layers
463
+ attention_mask = _prepare_4d_causal_attention_mask(
464
+ attention_mask,
465
+ (batch_size, seq_length),
466
+ inputs_embeds,
467
+ past_key_values_length,
468
+ sliding_window=self.config.sliding_window,
469
+ )
470
+ attention_mask[attention_mask == -float("inf")] = torch.finfo(self.dtype).min
471
+
472
+ hidden_states = inputs_embeds
473
+
474
+ if self.gradient_checkpointing and self.training:
475
+ if use_cache:
476
+ logger.warning_once(
477
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
478
+ )
479
+ use_cache = False
480
+
481
+ # decoder layers
482
+ all_hidden_states = () if output_hidden_states else None
483
+ all_self_attns = () if output_attentions else None
484
+ next_decoder_cache = () if use_cache else None
485
+
486
+ for idx, decoder_layer in enumerate(self.layers):
487
+ if output_hidden_states:
488
+ all_hidden_states += (hidden_states,)
489
+
490
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
491
+
492
+ if self.gradient_checkpointing and self.training:
493
+ layer_outputs = self._gradient_checkpointing_func(
494
+ decoder_layer.__call__,
495
+ hidden_states,
496
+ attention_mask,
497
+ web_attention_mask,
498
+ position_ids,
499
+ past_key_value,
500
+ output_attentions,
501
+ use_cache,
502
+ )
503
+ else:
504
+ layer_outputs = decoder_layer(
505
+ hidden_states,
506
+ attention_mask=attention_mask,
507
+ web_attention_mask=web_attention_mask,
508
+ position_ids=position_ids,
509
+ past_key_value=past_key_value,
510
+ output_attentions=output_attentions,
511
+ use_cache=use_cache,
512
+ )
513
+
514
+ hidden_states = layer_outputs[0]
515
+
516
+ if use_cache:
517
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
518
+
519
+ if output_attentions:
520
+ all_self_attns += (layer_outputs[1],)
521
+
522
+ hidden_states = self.norm(hidden_states)
523
+
524
+ # add hidden states from the last decoder layer
525
+ if output_hidden_states:
526
+ all_hidden_states += (hidden_states,)
527
+
528
+ next_cache = next_decoder_cache if use_cache else None
529
+ if not return_dict:
530
+ return tuple(
531
+ v
532
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, image_hidden_states]
533
+ if v is not None
534
+ )
535
+ return VMistralBaseModelOutputWithPast(
536
+ last_hidden_state=hidden_states,
537
+ past_key_values=next_cache,
538
+ hidden_states=all_hidden_states,
539
+ attentions=all_self_attns,
540
+ image_hidden_states=image_hidden_states,
541
+ )
542
+
543
+ class WebForVisionText2Text(WebPreTrainedModel, WebGenerationMixin):
544
+ _tied_weights_keys = ["lm_head.weight"]
545
+
546
+ def __init__(self, config, vision_model=None):
547
+ super().__init__(config)
548
+ self.model = WebModel(config, vision_model=vision_model)
549
+ self.image_token_id = self.config.image_token_id
550
+ self.lm_head = DecoupledLinear(
551
+ in_features=config.hidden_size,
552
+ out_features=config.vocab_size,
553
+ out_additional_features=config.additional_vocab_size,
554
+ bias=False,
555
+ partially_freeze=config.freeze_lm_head,
556
+ )
557
+
558
+ # Initialize weights and apply final processing
559
+ self.post_init()
560
+
561
+ def forward(
562
+ self,
563
+ input_ids: torch.LongTensor = None,
564
+ attention_mask: Optional[torch.Tensor] = None,
565
+ web_attention_mask: Optional[torch.Tensor] = None,
566
+ position_ids: Optional[torch.LongTensor] = None,
567
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
568
+ inputs_embeds: Optional[torch.FloatTensor] = None,
569
+ pixel_values: Optional[torch.FloatTensor] = None,
570
+ image_hidden_states: Optional[torch.FloatTensor] = None,
571
+ labels: Optional[torch.LongTensor] = None,
572
+ use_cache: Optional[bool] = None,
573
+ output_attentions: Optional[bool] = None,
574
+ output_hidden_states: Optional[bool] = None,
575
+ return_dict: Optional[bool] = None,
576
+ html_tree=None,
577
+ ) -> Union[Tuple, WebLMOutputWithPast]:
578
+ r"""
579
+ Args:
580
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
581
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
582
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
583
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
584
+
585
+ Returns:
586
+
587
+ """
588
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
589
+ output_hidden_states = (
590
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
591
+ )
592
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
593
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
594
+ outputs = self.model(
595
+ input_ids=input_ids,
596
+ attention_mask=attention_mask,
597
+ web_attention_mask=web_attention_mask,
598
+ position_ids=position_ids,
599
+ past_key_values=past_key_values,
600
+ inputs_embeds=inputs_embeds,
601
+ pixel_values=pixel_values,
602
+ image_hidden_states=image_hidden_states,
603
+ use_cache=use_cache,
604
+ output_attentions=output_attentions,
605
+ output_hidden_states=output_hidden_states,
606
+ return_dict=return_dict,
607
+ )
608
+
609
+ hidden_states = outputs[0]
610
+ logits = self.lm_head(hidden_states)
611
+ logits = logits.float()
612
+
613
+ loss = None
614
+ if labels is not None:
615
+ labels = labels.to(logits.device)
616
+ # Shift so that tokens < n predict n
617
+ if attention_mask is not None:
618
+ shift_attention_mask = attention_mask[..., 1:].to(logits.device)
619
+ shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
620
+ shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
621
+ else:
622
+ shift_logits = logits[..., :-1, :].contiguous()
623
+ shift_labels = labels[..., 1:].contiguous()
624
+ # Flatten the tokens
625
+ loss_fct = CrossEntropyLoss(ignore_index=-100)
626
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
627
+
628
+ if not return_dict:
629
+ output = (logits,) + outputs[1:]
630
+ return (loss,) + output if loss is not None else output
631
+ # print(f"forward takes: {time.time()-start_time}")
632
+
633
+ return WebLMOutputWithPast(
634
+ loss=loss,
635
+ logits=logits,
636
+ past_key_values=outputs.past_key_values,
637
+ hidden_states=outputs.hidden_states,
638
+ attentions=outputs.attentions,
639
+ image_hidden_states=outputs.image_hidden_states,
640
+ html_tree=html_tree
641
+ )
642
+
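A small worked example of the label shifting in the loss above: logits and labels are offset by one position so that tokens before position n predict token n, positions where the shifted attention mask is 0 are dropped before the cross-entropy, and labels of -100 are additionally ignored by the loss itself. All tensors below are made up for illustration.

import torch
from torch.nn import CrossEntropyLoss

vocab_size, seq_len = 11, 5
logits = torch.randn(1, seq_len, vocab_size)
labels = torch.tensor([[7, 3, 9, -100, 2]])
attention_mask = torch.tensor([[1, 1, 1, 1, 0]])                 # last position is padding

shift_attention_mask = attention_mask[..., 1:]
shift_logits = logits[..., :-1, :][shift_attention_mask != 0]    # (3, vocab_size)
shift_labels = labels[..., 1:][shift_attention_mask != 0]        # tensor([3, 9, -100])

loss_fct = CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
print(loss.item())  # averaged over the 2 positions whose label is not -100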
643
+ def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs
644
+ ):
645
+ image_hidden_states = kwargs.pop("image_hidden_states", None)
646
+ if image_hidden_states is not None:
647
+ kwargs["pixel_values"] = None
648
+
649
+ inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
650
+ web_attention_mask, html_tree = None, kwargs.get("html_tree")
651
+
652
+ if html_tree.web_attention_mask is None:
653
+ attention_mask = inputs["attention_mask"]
654
+ web_attention_mask = torch.tril(torch.ones((attention_mask.shape[-1], attention_mask.shape[-1]), dtype=attention_mask.dtype)).unsqueeze(0)
655
+ html_tree.web_attention_mask = web_attention_mask
656
+ else:
657
+ html_tree = kwargs.get("html_tree")
658
+ input_ids = inputs["input_ids"]
659
+ tokenizer = html_tree.tokenizer
660
+ cur_decoded_token = tokenizer.convert_tokens_to_string([" "] + tokenizer.convert_ids_to_tokens(input_ids[:, -1]))
661
+ web_attn_range = html_tree.update_buffer([cur_decoded_token])
662
+ bsz, L = html_tree.web_attention_mask.size()[:2]
663
+ web_attention_mask = torch.zeros((bsz, L + 1, L + 1)).type_as(html_tree.web_attention_mask)
664
+ web_attention_mask[:, :L, :L] = html_tree.web_attention_mask
665
+ web_attn_range = torch.tensor(list(range(67)) + [i + 67 for i in web_attn_range], dtype=web_attention_mask.dtype)
666
+ web_attention_mask[:, -1, web_attn_range] = 1
667
+ html_tree.web_attention_mask = web_attention_mask
668
+ if html_tree.input_ids is None:
669
+ html_tree.input_ids = input_ids
670
+ else:
671
+ html_tree.input_ids = torch.cat((html_tree.input_ids, input_ids), dim=1)
672
+
673
+ unwanted_kwargs = ["token_type_ids"]
674
+ inputs.update({
675
+ "web_attention_mask": web_attention_mask.to(inputs['attention_mask'].device),
676
+ "html_tree": html_tree,
677
+ })
678
+ for kwarg in unwanted_kwargs:
679
+ inputs.pop(kwarg, None)
680
+
681
+ return inputs
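A minimal sketch of how the web attention mask evolves during generation, mirroring `prepare_inputs_for_generation` above: on the first step a causal (lower-triangular) mask is built over the prompt, and on every later step one row and one column are appended, with the new row allowed to attend to a fixed window of 67 leading positions (hard-coded above) plus the offsets returned by `html_tree.update_buffer`. The helper name `extend_web_attention_mask` is illustrative only and not part of the repository.

import torch

def extend_web_attention_mask(web_attention_mask: torch.Tensor, web_attn_range) -> torch.Tensor:
    # web_attention_mask: (bsz, L, L) mask built so far.
    # web_attn_range: offsets (relative to position 67) the newly generated token may attend to.
    bsz, L = web_attention_mask.size()[:2]
    new_mask = torch.zeros((bsz, L + 1, L + 1)).type_as(web_attention_mask)
    new_mask[:, :L, :L] = web_attention_mask
    allowed = torch.tensor(list(range(67)) + [i + 67 for i in web_attn_range], dtype=torch.long)
    new_mask[:, -1, allowed] = 1
    return new_mask

prompt_len = 70
mask = torch.tril(torch.ones((prompt_len, prompt_len), dtype=torch.long)).unsqueeze(0)  # first step
mask = extend_web_attention_mask(mask, [0, 1, 2])                                       # one decoding step
print(mask.shape)  # torch.Size([1, 71, 71])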
preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "IdeficsImageProcessor",
4
+ "AutoProcessor": "IdeficsProcessor"
5
+ },
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_num_channels": 3,
12
+ "image_processor_type": "IdeficsImageProcessor",
13
+ "image_size": 960,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "IdeficsProcessor"
20
+ }
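A rough sketch of the preprocessing these values describe: the image is resized to 960 x 960, scaled to [0, 1], and normalized channel-wise with mean 0.5 and std 0.5, which maps pixel values into [-1, 1]. This illustrates the config above rather than calling `IdeficsImageProcessor` itself, whose exact resizing behavior is not reproduced here.

import torch
import torch.nn.functional as F

IMAGE_SIZE = 960
IMAGE_MEAN = torch.tensor([0.5, 0.5, 0.5]).view(3, 1, 1)
IMAGE_STD = torch.tensor([0.5, 0.5, 0.5]).view(3, 1, 1)

def preprocess(image_uint8: torch.Tensor) -> torch.Tensor:
    # image_uint8: (3, H, W) uint8 tensor with values in [0, 255].
    x = image_uint8.float().unsqueeze(0)                                   # (1, 3, H, W)
    x = F.interpolate(x, size=(IMAGE_SIZE, IMAGE_SIZE), mode="bilinear", align_corners=False)
    x = x.squeeze(0) / 255.0                                               # scale to [0, 1]
    return (x - IMAGE_MEAN) / IMAGE_STD                                    # normalize to [-1, 1]

pixel_values = preprocess(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))
print(pixel_values.shape)  # torch.Size([3, 960, 960])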
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "32000": {
30
+ "content": "<fake_token_around_image>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<image>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ }
45
+ },
46
+ "additional_special_tokens": [],
47
+ "bos_token": "<s>",
48
+ "clean_up_tokenization_spaces": false,
49
+ "eos_token": "</s>",
50
+ "legacy": false,
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_token": "<unk>",
53
+ "processor_class": "IdeficsProcessor",
54
+ "sp_model_kwargs": {},
55
+ "spaces_between_special_tokens": false,
56
+ "tokenizer_class": "LlamaTokenizer",
57
+ "unk_token": "<unk>",
58
+ "use_default_system_prompt": true
59
+ }
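The two added tokens above line up with the model config: `<fake_token_around_image>` (id 32000) and `<image>` (id 32001) sit on top of the base `vocab_size` of 32000, which is what `additional_vocab_size: 2` and `image_token_id: 32001` in `config.json` refer to. A hedged loading sketch; the repository id is a placeholder, and `trust_remote_code=True` is assumed to be needed for the custom model classes:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<repo-id>", trust_remote_code=True)  # placeholder repo id

print(tokenizer.convert_tokens_to_ids("<fake_token_around_image>"))  # expected: 32000
print(tokenizer.convert_tokens_to_ids("<image>"))                    # expected: 32001
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token) # <s> </s> <unk>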
vision.py ADDED
@@ -0,0 +1,653 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Google AI and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ A simplified copy of https://huggingface.co/HuggingFaceM4/siglip-so400m-14-384-flash-attn2 """
16
+
17
+
18
+ from dataclasses import dataclass
19
+ from typing import Any, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from transformers.activations import ACT2FN
26
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
27
+ from transformers.utils import (
28
+ ModelOutput,
29
+ is_flash_attn_2_available,
30
+ logging,)
31
+
32
+ from .configuration_vmistral import VMistralVisionConfig
33
+
34
+
35
+ logger = logging.get_logger(__name__)
36
+
37
+
38
+ if is_flash_attn_2_available():
39
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
40
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
41
+
42
+
43
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
44
+ def _get_unpad_data(attention_mask):
45
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
46
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
47
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
48
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
49
+ return (
50
+ indices,
51
+ cu_seqlens,
52
+ max_seqlen_in_batch,
53
+ )
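A quick usage example for `_get_unpad_data` (with the function above in scope), using a concrete right-padded mask so the layout of the cumulative sequence lengths passed to the varlen flash-attention call is visible. The mask values are made up for illustration.

import torch

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])   # batch of 2, lengths 3 and 2, padded to 4
indices, cu_seqlens, max_seqlen = _get_unpad_data(attention_mask)
print(indices)     # tensor([0, 1, 2, 4, 5]) -- flat positions of the non-padding tokens
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32)
print(max_seqlen)  # 3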
54
+
55
+
56
+ @dataclass
57
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
58
+ class SiglipVisionModelOutput(ModelOutput):
59
+ """
60
+ Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
61
+
62
+ Args:
63
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
64
+ The image embeddings obtained by applying the projection layer to the pooler_output.
65
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
66
+ Sequence of hidden-states at the output of the last layer of the model.
67
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
68
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
69
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
70
+
71
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
72
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
73
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
74
+ sequence_length)`.
75
+
76
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
77
+ heads.
78
+ """
79
+
80
+ image_embeds: Optional[torch.FloatTensor] = None
81
+ last_hidden_state: torch.FloatTensor = None
82
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
83
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
84
+
85
+
86
+ class SiglipVisionEmbeddings(nn.Module):
87
+ def __init__(self, config: VMistralVisionConfig):
88
+ super().__init__()
89
+ self.config = config
90
+ self.embed_dim = config.hidden_size
91
+ self.image_size = config.image_size
92
+ self.patch_size = config.patch_size
93
+
94
+ self.patch_embedding = nn.Conv2d(
95
+ in_channels=config.num_channels,
96
+ out_channels=self.embed_dim,
97
+ kernel_size=self.patch_size,
98
+ stride=self.patch_size,
99
+ padding="valid",
100
+ )
101
+
102
+ self.num_patches = (self.image_size // self.patch_size) ** 2
103
+ self.num_positions = self.num_patches
104
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
105
+ self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
106
+
107
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
108
+ # print(self.patch_embedding)
109
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
110
+ embeddings = patch_embeds.flatten(2).transpose(1, 2)
111
+
112
+ embeddings = embeddings + self.position_embedding(self.position_ids)
113
+ return embeddings
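With the vision settings from `config.json` (`image_size: 960`, `patch_size: 14`, `hidden_size: 1152`), the patch arithmetic above works out as follows; this is plain shape bookkeeping, not a run of the module.

image_size, patch_size, hidden_size = 960, 14, 1152

patches_per_side = image_size // patch_size   # 68 (the conv uses kernel 14, stride 14, "valid" padding)
num_patches = patches_per_side ** 2           # 4624, which is also num_positions
print(patches_per_side, num_patches)          # 68 4624

# A (B, 3, 960, 960) pixel tensor therefore becomes a (B, 4624, 1152) patch sequence
# after patch_embedding -> flatten(2) -> transpose(1, 2).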
114
+
115
+
116
+ # Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->Siglip
117
+ class SiglipAttention(nn.Module):
118
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
119
+
120
+ def __init__(self, config):
121
+ super().__init__()
122
+ self.config = config
123
+ self.embed_dim = config.hidden_size
124
+ self.num_heads = config.num_attention_heads
125
+ self.head_dim = self.embed_dim // self.num_heads
126
+ if self.head_dim * self.num_heads != self.embed_dim:
127
+ raise ValueError(
128
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
129
+ f" {self.num_heads})."
130
+ )
131
+ self.scale = self.head_dim**-0.5
132
+ self.dropout = config.attention_dropout
133
+
134
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
135
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
136
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
137
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
138
+
139
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
140
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
141
+
142
+ def forward(
143
+ self,
144
+ hidden_states: torch.Tensor,
145
+ attention_mask: Optional[torch.Tensor] = None,
146
+ causal_attention_mask: Optional[torch.Tensor] = None,
147
+ output_attentions: Optional[bool] = False,
148
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
149
+ """Input shape: Batch x Time x Channel"""
150
+
151
+ bsz, tgt_len, embed_dim = hidden_states.size()
152
+
153
+ # get query proj
154
+ query_states = self.q_proj(hidden_states) * self.scale
155
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
156
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
157
+
158
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
159
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
160
+ key_states = key_states.view(*proj_shape)
161
+ value_states = value_states.view(*proj_shape)
162
+
163
+ src_len = key_states.size(1)
164
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
165
+
166
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
167
+ raise ValueError(
168
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
169
+ f" {attn_weights.size()}"
170
+ )
171
+
172
+ # apply the causal_attention_mask first
173
+ if causal_attention_mask is not None:
174
+ if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
175
+ raise ValueError(
176
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
177
+ f" {causal_attention_mask.size()}"
178
+ )
179
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
180
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
181
+
182
+ if attention_mask is not None:
183
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
184
+ raise ValueError(
185
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
186
+ )
187
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
188
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
189
+
190
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
191
+
192
+ if output_attentions:
193
+ # this operation is a bit awkward, but it's required to
194
+ # make sure that attn_weights keeps its gradient.
195
+ # In order to do so, attn_weights have to be reshaped
196
+ # twice and have to be reused in the following
197
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
198
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
199
+ else:
200
+ attn_weights_reshaped = None
201
+
202
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
203
+
204
+ attn_output = torch.bmm(attn_probs, value_states)
205
+
206
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
207
+ raise ValueError(
208
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
209
+ f" {attn_output.size()}"
210
+ )
211
+
212
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
213
+ attn_output = attn_output.transpose(1, 2)
214
+ attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
215
+
216
+ attn_output = self.out_proj(attn_output)
217
+
218
+ return attn_output, attn_weights_reshaped
219
+
220
+
221
+ class SiglipFlashAttention2(SiglipAttention):
222
+ """
223
+ Siglip flash attention module. This module inherits from `SiglipAttention`, as the weights of the module stay
224
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
225
+ flash attention and deal with padding tokens in case the input contains any of them.
226
+ """
227
+
228
+ def __init__(self, *args, **kwargs):
229
+ super().__init__(*args, **kwargs)
230
+ self.is_causal = False # Hack to make sure we don't use a causal mask
231
+
232
+ def forward(
233
+ self,
234
+ hidden_states: torch.Tensor,
235
+ attention_mask: Optional[torch.LongTensor] = None,
236
+ position_ids: Optional[torch.LongTensor] = None,
237
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
238
+ output_attentions: bool = False,
239
+ use_cache: bool = False,
240
+ **kwargs,
241
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
242
+ output_attentions = False
243
+
244
+ bsz, q_len, _ = hidden_states.size()
245
+
246
+ query_states = self.q_proj(hidden_states)
247
+ key_states = self.k_proj(hidden_states)
248
+ value_states = self.v_proj(hidden_states)
249
+
250
+ # Flash attention requires the input to have the shape
251
+ # batch_size x seq_length x num_heads x head_dim
252
+ # therefore we just need to keep the original shape
253
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
254
+ key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
255
+ value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
256
+
257
+ kv_seq_len = key_states.shape[-2]
258
+ if past_key_value is not None:
259
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
260
+ # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
261
+ # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
262
+
263
+ # if past_key_value is not None:
264
+ # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
265
+ # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
266
+
267
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
268
+ # to be able to avoid many of these transpose/reshape/view.
269
+ query_states = query_states.transpose(1, 2)
270
+ key_states = key_states.transpose(1, 2)
271
+ value_states = value_states.transpose(1, 2)
272
+
273
+ dropout_rate = self.dropout if self.training else 0.0
274
+
275
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
276
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
277
+ # cast them back to the correct dtype just to be sure everything works as expected.
278
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
279
+ # in fp32. (LlamaRMSNorm handles it correctly)
280
+
281
+ input_dtype = query_states.dtype
282
+ if input_dtype == torch.float32:
283
+ if torch.is_autocast_enabled():
284
+ target_dtype = torch.get_autocast_gpu_dtype()
285
+ # Handle the case where the model is quantized
286
+ elif hasattr(self.config, "_pre_quantization_dtype"):
287
+ target_dtype = self.config._pre_quantization_dtype
288
+ else:
289
+ target_dtype = self.q_proj.weight.dtype
290
+
291
+ logger.warning_once(
292
+ "The input hidden states seems to be silently casted in float32, this might be related to the fact"
293
+ " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
294
+ f" {target_dtype}."
295
+ )
296
+
297
+ query_states = query_states.to(target_dtype)
298
+ key_states = key_states.to(target_dtype)
299
+ value_states = value_states.to(target_dtype)
300
+
301
+ attn_output = self._flash_attention_forward(
302
+ query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
303
+ )
304
+
305
+ attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
306
+ attn_output = self.out_proj(attn_output)
307
+
308
+ if not output_attentions:
309
+ attn_weights = None
310
+
311
+ return attn_output, attn_weights
312
+
313
+ def _flash_attention_forward(
314
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
315
+ ):
316
+ """
317
+ Calls the forward method of Flash Attention. If the input hidden states contain at least one padding token,
318
+ it first unpads the input, then computes the attention scores, and finally re-pads the output.
319
+
320
+ Args:
321
+ query_states (`torch.Tensor`):
322
+ Input query states to be passed to Flash Attention API
323
+ key_states (`torch.Tensor`):
324
+ Input key states to be passed to Flash Attention API
325
+ value_states (`torch.Tensor`):
326
+ Input value states to be passed to Flash Attention API
327
+ attention_mask (`torch.Tensor`):
328
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
329
+ position of padding tokens and 1 for the position of non-padding tokens.
330
+ dropout (`float`, *optional*):
331
+ Attention dropout
332
+ softmax_scale (`float`, *optional*):
333
+ The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
334
+ """
335
+
336
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
337
+ causal = self.is_causal and query_length != 1
338
+
339
+ # Contains at least one padding token in the sequence
340
+ if attention_mask is not None:
341
+ batch_size = query_states.shape[0]
342
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
343
+ query_states, key_states, value_states, attention_mask, query_length
344
+ )
345
+
346
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
347
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
348
+
349
+ attn_output_unpad = flash_attn_varlen_func(
350
+ query_states,
351
+ key_states,
352
+ value_states,
353
+ cu_seqlens_q=cu_seqlens_q,
354
+ cu_seqlens_k=cu_seqlens_k,
355
+ max_seqlen_q=max_seqlen_in_batch_q,
356
+ max_seqlen_k=max_seqlen_in_batch_k,
357
+ dropout_p=dropout,
358
+ softmax_scale=softmax_scale,
359
+ causal=causal,
360
+ )
361
+
362
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
363
+ else:
364
+ attn_output = flash_attn_func(
365
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
366
+ )
367
+
368
+ return attn_output
369
+
370
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
371
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
372
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
373
+
374
+ key_layer = index_first_axis(
375
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
376
+ )
377
+ value_layer = index_first_axis(
378
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
379
+ )
380
+ if query_length == kv_seq_len:
381
+ query_layer = index_first_axis(
382
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
383
+ )
384
+ cu_seqlens_q = cu_seqlens_k
385
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
386
+ indices_q = indices_k
387
+ elif query_length == 1:
388
+ max_seqlen_in_batch_q = 1
389
+ cu_seqlens_q = torch.arange(
390
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
391
+ ) # There is a memcpy here, that is very bad.
392
+ indices_q = cu_seqlens_q[:-1]
393
+ query_layer = query_layer.squeeze(1)
394
+ else:
395
+ # The -q_len: slice assumes left padding.
396
+ attention_mask = attention_mask[:, -query_length:]
397
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
398
+
399
+ return (
400
+ query_layer,
401
+ key_layer,
402
+ value_layer,
403
+ indices_q,
404
+ (cu_seqlens_q, cu_seqlens_k),
405
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
406
+ )
407
+
408
+
409
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
410
+ class SiglipMLP(nn.Module):
411
+ def __init__(self, config):
412
+ super().__init__()
413
+ self.config = config
414
+ self.activation_fn = ACT2FN[config.hidden_act]
415
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
416
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
417
+
418
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
419
+ hidden_states = self.fc1(hidden_states)
420
+ hidden_states = self.activation_fn(hidden_states)
421
+ hidden_states = self.fc2(hidden_states)
422
+ return hidden_states
423
+
424
+
425
+ # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
426
+ class SiglipEncoderLayer(nn.Module):
427
+ def __init__(self, config: VMistralVisionConfig):
428
+ super().__init__()
429
+ self.embed_dim = config.hidden_size
430
+ self.self_attn = (
431
+ SiglipAttention(config)
432
+ # if not getattr(config, "_flash_attn_2_enabled", False)
433
+ # else SiglipFlashAttention2(config)
434
+ )
435
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
436
+ self.mlp = SiglipMLP(config)
437
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
438
+
439
+ def forward(
440
+ self,
441
+ hidden_states: torch.Tensor,
442
+ attention_mask: torch.Tensor,
443
+ causal_attention_mask: torch.Tensor,
444
+ output_attentions: Optional[bool] = False,
445
+ ) -> Tuple[torch.FloatTensor]:
446
+ """
447
+ Args:
448
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
449
+ attention_mask (`torch.FloatTensor`): attention mask of size
450
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
451
+
452
+ output_attentions (`bool`, *optional*):
453
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
454
+ returned tensors for more detail.
455
+ """
456
+ residual = hidden_states
457
+
458
+ hidden_states = self.layer_norm1(hidden_states)
459
+ hidden_states, attn_weights = self.self_attn(
460
+ hidden_states=hidden_states,
461
+ attention_mask=attention_mask,
462
+ causal_attention_mask=causal_attention_mask,
463
+ output_attentions=output_attentions,
464
+ )
465
+ hidden_states = residual + hidden_states
466
+
467
+ residual = hidden_states
468
+ hidden_states = self.layer_norm2(hidden_states)
469
+ hidden_states = self.mlp(hidden_states)
470
+ hidden_states = residual + hidden_states
471
+
472
+ outputs = (hidden_states,)
473
+
474
+ if output_attentions:
475
+ outputs += (attn_weights,)
476
+
477
+ return outputs
478
+
479
+
480
+ # Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
481
+ class SiglipEncoder(nn.Module):
482
+ """
483
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
484
+ [`SiglipEncoderLayer`].
485
+
486
+ Args:
487
+ config: SiglipConfig
488
+ """
489
+
490
+ def __init__(self, config):
491
+ super().__init__()
492
+ self.config = config
493
+ self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
494
+ self.gradient_checkpointing = False
495
+
496
+ def forward(
497
+ self,
498
+ inputs_embeds,
499
+ attention_mask: Optional[torch.Tensor] = None,
500
+ causal_attention_mask: Optional[torch.Tensor] = None,
501
+ output_attentions: Optional[bool] = None,
502
+ output_hidden_states: Optional[bool] = None,
503
+ return_dict: Optional[bool] = None,
504
+ ) -> Union[Tuple, BaseModelOutput]:
505
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
506
+ output_hidden_states = (
507
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
508
+ )
509
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
510
+
511
+ encoder_states = () if output_hidden_states else None
512
+ all_attentions = () if output_attentions else None
513
+
514
+ hidden_states = inputs_embeds
515
+ for idx, encoder_layer in enumerate(self.layers):
516
+ if output_hidden_states:
517
+ encoder_states = encoder_states + (hidden_states,)
518
+ if self.gradient_checkpointing and self.training:
519
+
520
+ def create_custom_forward(module):
521
+ def custom_forward(*inputs):
522
+ return module(*inputs, output_attentions)
523
+
524
+ return custom_forward
525
+
526
+ layer_outputs = torch.utils.checkpoint.checkpoint(
527
+ create_custom_forward(encoder_layer),
528
+ hidden_states,
529
+ attention_mask,
530
+ causal_attention_mask,
531
+ )
532
+ else:
533
+ layer_outputs = encoder_layer(
534
+ hidden_states,
535
+ attention_mask,
536
+ causal_attention_mask,
537
+ output_attentions=output_attentions,
538
+ )
539
+
540
+ hidden_states = layer_outputs[0]
541
+
542
+ if output_attentions:
543
+ all_attentions = all_attentions + (layer_outputs[1],)
544
+
545
+ if output_hidden_states:
546
+ encoder_states = encoder_states + (hidden_states,)
547
+
548
+ if not return_dict:
549
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
550
+ return BaseModelOutput(
551
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
552
+ )
553
+
554
+
555
+ class SiglipVisionTransformer(nn.Module):
556
+ def __init__(self, config: VMistralVisionConfig):
557
+ super().__init__()
558
+ self.config = config
559
+ embed_dim = config.hidden_size
560
+
561
+ self.embeddings = SiglipVisionEmbeddings(config)
562
+ self.encoder = SiglipEncoder(config)
563
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
564
+ self.head = SiglipMultiheadAttentionPoolingHead(config)
565
+
566
+ def forward(
567
+ self,
568
+ pixel_values,
569
+ output_attentions: Optional[bool] = None,
570
+ output_hidden_states: Optional[bool] = None,
571
+ return_dict: Optional[bool] = None,
572
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
573
+ r"""
574
+ Returns:
575
+
576
+ """
577
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
578
+ output_hidden_states = (
579
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
580
+ )
581
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
582
+
583
+ hidden_states = self.embeddings(pixel_values)
584
+ # print("hidden_states", hidden_states.shape)
585
+ encoder_outputs = self.encoder(
586
+ inputs_embeds=hidden_states,
587
+ output_attentions=output_attentions,
588
+ output_hidden_states=output_hidden_states,
589
+ return_dict=return_dict,
590
+ )
591
+
592
+ last_hidden_state = encoder_outputs[0]
593
+ last_hidden_state = self.post_layernorm(last_hidden_state)
594
+
595
+ pooled_output = self.head(last_hidden_state)
596
+
597
+ if not return_dict:
598
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
599
+
600
+ return BaseModelOutputWithPooling(
601
+ last_hidden_state=last_hidden_state,
602
+ pooler_output=pooled_output,
603
+ hidden_states=encoder_outputs.hidden_states,
604
+ attentions=encoder_outputs.attentions,
605
+ )
606
+
607
+
608
+ class SiglipMultiheadAttentionPoolingHead(nn.Module):
609
+ """Multihead Attention Pooling."""
610
+
611
+ def __init__(self, config: VMistralVisionConfig):
612
+ super().__init__()
613
+
614
+ self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
615
+ self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
616
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
617
+ self.mlp = SiglipMLP(config)
618
+
619
+ def forward(self, hidden_state):
620
+ batch_size = hidden_state.shape[0]
621
+ probe = self.probe.repeat(batch_size, 1, 1)
622
+
623
+ hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
624
+
625
+ residual = hidden_state
626
+ hidden_state = self.layernorm(hidden_state)
627
+ hidden_state = residual + self.mlp(hidden_state)
628
+
629
+ return hidden_state[:, 0]
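A minimal shape check of the probe-based pooling above, re-creating the same `batch_first` `nn.MultiheadAttention` call with freshly initialized weights (illustrative only): the learned (1, 1, D) probe is broadcast over the batch and used as the query, so one pooled vector comes out per image. The sizes follow the vision settings in `config.json`.

import torch
from torch import nn

hidden_size, num_heads, batch_size, seq_len = 1152, 16, 2, 4624

probe = nn.Parameter(torch.randn(1, 1, hidden_size))
attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

hidden_state = torch.randn(batch_size, seq_len, hidden_size)
query = probe.repeat(batch_size, 1, 1)                    # (2, 1, 1152)
pooled = attention(query, hidden_state, hidden_state)[0]  # (2, 1, 1152)
print(pooled[:, 0].shape)                                 # torch.Size([2, 1152])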
630
+
631
+
632
+ class SiglipVisionModel(nn.Module):
633
+ def __init__(self, config: VMistralVisionConfig):
634
+ super().__init__()
635
+
636
+ self.config = config
637
+ self.vision_model = SiglipVisionTransformer(config)
638
+
639
+ def forward(
640
+ self,
641
+ pixel_values,
642
+ output_attentions: Optional[bool] = None,
643
+ output_hidden_states: Optional[bool] = None,
644
+ return_dict: Optional[bool] = None,
645
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
646
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
647
+
648
+ return self.vision_model(
649
+ pixel_values=pixel_values,
650
+ output_attentions=output_attentions,
651
+ output_hidden_states=output_hidden_states,
652
+ return_dict=return_dict,
653
+ )
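Finally, a scaled-down smoke test of the `SiglipVisionModel` defined above. The real checkpoint uses `hidden_size` 1152, 27 layers, `image_size` 960 and `patch_size` 14 (per `config.json`); here a tiny configuration keeps the forward pass cheap, and a plain `SimpleNamespace` stands in for `VMistralVisionConfig`, whose exact constructor is not shown in this diff, so the attribute names below are assumptions taken from how the modules read their config.

from types import SimpleNamespace
import torch

config = SimpleNamespace(
    hidden_size=64, intermediate_size=128, num_attention_heads=4,
    num_hidden_layers=2, num_channels=3, image_size=56, patch_size=14,
    attention_dropout=0.0, hidden_act="gelu", layer_norm_eps=1e-6,
    output_attentions=False, output_hidden_states=False, use_return_dict=True,
)

model = SiglipVisionModel(config).eval()
with torch.no_grad():
    out = model(pixel_values=torch.randn(2, 3, 56, 56))
print(out.last_hidden_state.shape)  # torch.Size([2, 16, 64]) -- (56 // 14) ** 2 = 16 patches
print(out.pooler_output.shape)      # torch.Size([2, 64])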