VedantPadwal commited on May 25

Commit

6b8a59c

•

1 Parent(s): 5fdf30c

Upload 17 files

Browse files

Files changed (17) hide show

attn_mask.py +269 -0
config.json +148 -0
configuration_phi3_v.py +115 -0
gen.py +126 -0
image_embedding_phi3_v.py +301 -0
image_processing_phi3_v.py +274 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +599 -0
modeling_phi3_v.py +780 -0
preprocessor_config.json +20 -0
processing_phi3_v.py +217 -0
sample_inference.py +129 -0
special_tokens_map.json +36 -0
tokenizer.json +0 -0
tokenizer_config.json +408 -0
utils.py +255 -0

attn_mask.py ADDED Viewed

	@@ -0,0 +1,269 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+from utils import FloatTensor
+import mlx.core as mx
+# Custom function to mimic torch.finfo
+def get_finfo_min(dtype: mx.Dtype):
+    dtype_str = str(dtype)
+    if dtype_str == 'float32':
+        return -3.4028235e+38  # Minimum value for float32
+    elif dtype_str == 'float64':
+        return -1.7976931348623157e+308  # Minimum value for float64
+    elif dtype_str == 'float16':
+        return -65504.0  # Minimum value for float16
+    else:
+        raise ValueError(f"Unsupported data type: {dtype_str}")
+@dataclass
+class AttentionMaskConverter:
+    is_causal: bool
+    sliding_window: Optional[int]
+    def __init__(self, is_causal: bool, sliding_window: Optional[int] = None):
+        self.is_causal = is_causal
+        self.sliding_window = sliding_window
+        if self.sliding_window is not None and self.sliding_window <= 0:
+            raise ValueError(
+                f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`"
+            )
+    def to_causal_4d(
+        self,
+        batch_size: int,
+        query_length: int,
+        key_value_length: int,
+        dtype: mx.Dtype,
+        device: Union[mx.Device, "str"] = "cpu",
+    ) -> Optional[mx.array]:
+        """
+        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
+        bias to upper right hand triangular matrix (causal mask).
+        """
+        if not self.is_causal:
+            raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.")
+        # If shape is not cached, create a new causal mask and cache it
+        input_shape = (batch_size, query_length)
+        past_key_values_length = key_value_length - query_length
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        causal_4d_mask = None
+        if input_shape[-1] > 1 or self.sliding_window is not None:
+            causal_4d_mask = self._make_causal_mask(
+                input_shape,
+                dtype,
+                device=device,
+                past_key_values_length=past_key_values_length,
+                sliding_window=self.sliding_window,
+            )
+        return causal_4d_mask
+    def to_4d(
+        self,
+        attention_mask_2d: mx.array,
+        query_length: int,
+        dtype: mx.Dtype,
+        key_value_length: Optional[int] = None,
+    ) -> mx.array:
+        """
+        Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
+        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
+        causal, a causal mask will be added.
+        """
+        input_shape = (attention_mask_2d.shape[0], query_length)
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        causal_4d_mask = None
+        if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
+            if key_value_length is None:
+                raise ValueError(
+                    "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask."
+                )
+            past_key_values_length = key_value_length - query_length
+            causal_4d_mask = self._make_causal_mask(
+                input_shape,
+                dtype,
+                device=attention_mask_2d.device,
+                past_key_values_length=past_key_values_length,
+                sliding_window=self.sliding_window,
+            )
+        elif self.sliding_window is not None:
+            raise NotImplementedError("Sliding window is currently only implemented for causal masking")
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to(
+            attention_mask_2d.device
+        )
+        if causal_4d_mask is not None:
+            expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), get_finfo_min(dtype))
+        # expanded_attn_mask + causal_4d_mask can cause some overflow
+        expanded_4d_mask = expanded_attn_mask
+        return expanded_4d_mask
+    @staticmethod
+    def _make_causal_mask(
+        input_ids_shape: Tuple[int, int],
+        dtype: mx.Dtype,
+        device: mx.Device,
+        past_key_values_length: int = 0,
+        sliding_window: Optional[int] = None,
+    ):
+        """
+        Make causal mask used for bi-directional self-attention.
+        """
+        bsz, tgt_len = input_ids_shape
+        mask = mx.full((tgt_len, tgt_len), get_finfo_min(dtype), device=device)
+        mask_cond = mx.arange(tgt_len, device=device)
+        mask = mask * (mask_cond[:, None] >= mask_cond[None, :])
+        mask = mask.astype(dtype)
+        if past_key_values_length > 0:
+            past_mask = mx.zeros((tgt_len, past_key_values_length), dtype=dtype, device=device)
+            mask = mx.concatenate([past_mask, mask], dim=-1)
+        # add lower triangular sliding window mask if necessary
+        if sliding_window is not None:
+            diagonal = past_key_values_length - sliding_window - 1
+            context_mask = mx.tril(mx.ones_like(mask, dtype=mx.bool_), k=diagonal)
+            mask = mask * (1 - context_mask.astype(dtype)) + context_mask.astype(dtype) * get_finfo_min(dtype)
+        return mask.expand_dims(axis=0).expand_dims(axis=0).broadcast_to((bsz, 1, tgt_len, tgt_len + past_key_values_length))
+    @staticmethod
+    def _expand_mask(mask: mx.array, dtype: mx.Dtype, tgt_len: Optional[int] = None):
+        """
+        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+        """
+        bsz, src_len = mask.size()
+        tgt_len = tgt_len if tgt_len is not None else src_len
+        expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+        inverted_mask = 1.0 - expanded_mask
+        return inverted_mask.masked_fill(inverted_mask.to(mx.bool_), get_finfo_min(dtype))
+    @staticmethod
+    def _unmask_unattended(
+        expanded_mask: FloatTensor,
+        min_dtype: float,
+    ):
+        # fmt: off
+        """
+        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
+        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+        Details: https://github.com/pytorch/pytorch/issues/110213
+        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
+        `attention_mask` is [bsz, src_seq_len].
+        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.
+        For example, if `expanded_mask` is (e.g. here left-padding case)
+        ```
+        [[[[0, 0, 0],
+           [0, 0, 0],
+           [0, 0, 1]]],
+         [[[1, 0, 0],
+           [1, 1, 0],
+           [1, 1, 1]]],
+         [[[0, 0, 0],
+           [0, 1, 0],
+           [0, 1, 1]]]]
+        ```
+        then the modified `expanded_mask` will be
+        ```
+        [[[[1, 1, 1],   <-- modified
+           [1, 1, 1],   <-- modified
+           [0, 0, 1]]],
+         [[[1, 0, 0],
+           [1, 1, 0],
+           [1, 1, 1]]],
+         [[[1, 1, 1],   <-- modified
+           [0, 1, 0],
+           [0, 1, 1]]]]
+        ```
+        """
+        # fmt: on
+        if expanded_mask.dtype == mx.bool_:
+            raise ValueError(
+                "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor."
+            )
+        return expanded_mask.mul(~mx.all(expanded_mask == min_dtype, dim=-1, keepdim=True))
+def _prepare_4d_causal_attention_mask(
+    attention_mask: Optional[mx.array],
+    input_shape: Union[mx.array, Tuple, List],
+    inputs_embeds: mx.array,
+    past_key_values_length: int,
+    sliding_window: Optional[int] = None,
+):
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`
+    Args:
+        attention_mask (`mx.array` or `None`):
+            A 2D attention mask of shape `(batch_size, key_value_length)`
+        input_shape (`tuple(int)` or `list(int)`):
+            The input shape should be a tuple that defines `(batch_size, query_length)`.
+        inputs_embeds (`mx.array`):
+            The embedded inputs as a torch Tensor.
+        past_key_values_length (`int`):
+            The length of the key value cache.
+        sliding_window (`int`, *optional*):
+            If the model uses windowed attention, a sliding window should be passed.
+    """
+    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)
+    key_value_length = input_shape[-1] + past_key_values_length
+    # 4d mask is passed through the layers
+    if attention_mask is not None and len(attention_mask.shape) == 2:
+        attention_mask = attn_mask_converter.to_4d(
+            attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype
+        )
+    elif attention_mask is not None and len(attention_mask.shape) == 4:
+        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
+        if tuple(attention_mask.shape) != expected_shape:
+            raise ValueError(
+                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
+            )
+        else:
+            # if the 4D mask has correct shape - invert it and fill with negative infinity
+            inverted_mask = 1.0 - attention_mask
+            attention_mask = inverted_mask.masked_fill(
+                inverted_mask.to(mx.bool_), get_finfo_min(inputs_embeds.dtype)
+            )
+    else:
+        attention_mask = attn_mask_converter.to_causal_4d(
+            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+    return attention_mask

config.json ADDED Viewed

	@@ -0,0 +1,148 @@

+{
+  "_name_or_path": "Phi-3-vision-128k-instruct",
+  "architectures": [
+    "Phi3VForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3_v.Phi3VConfig",
+    "AutoModelForCausalLM": "modeling_phi3_v.Phi3VForCausalLM"
+  },
+  "bos_token_id": 1,
+  "embd_layer": {
+    "embedding_cls": "image",
+    "hd_transform_order": "sub_glb",
+    "projection_cls": "mlp",
+    "use_hd_transform": true,
+    "with_learnable_separator": true
+  },
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "img_processor": {
+    "image_dim_out": 1024,
+    "model_name": "openai/clip-vit-large-patch14-336",
+    "name": "clip_vision_model",
+    "num_img_tokens": 144
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "model_type": "phi3_v",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "original_max_position_embeddings": 4096,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "long_factor": [
+      1.0299999713897705,
+      1.0499999523162842,
+      1.0499999523162842,
+      1.0799999237060547,
+      1.2299998998641968,
+      1.2299998998641968,
+      1.2999999523162842,
+      1.4499999284744263,
+      1.5999999046325684,
+      1.6499998569488525,
+      1.8999998569488525,
+      2.859999895095825,
+      3.68999981880188,
+      5.419999599456787,
+      5.489999771118164,
+      5.489999771118164,
+      9.09000015258789,
+      11.579999923706055,
+      15.65999984741211,
+      15.769999504089355,
+      15.789999961853027,
+      18.360000610351562,
+      21.989999771118164,
+      23.079999923706055,
+      30.009998321533203,
+      32.35000228881836,
+      32.590003967285156,
+      35.56000518798828,
+      39.95000457763672,
+      53.840003967285156,
+      56.20000457763672,
+      57.95000457763672,
+      59.29000473022461,
+      59.77000427246094,
+      59.920005798339844,
+      61.190006256103516,
+      61.96000671386719,
+      62.50000762939453,
+      63.3700065612793,
+      63.48000717163086,
+      63.48000717163086,
+      63.66000747680664,
+      63.850006103515625,
+      64.08000946044922,
+      64.760009765625,
+      64.80001068115234,
+      64.81001281738281,
+      64.81001281738281
+    ],
+    "short_factor": [
+      1.05,
+      1.05,
+      1.05,
+      1.1,
+      1.1,
+      1.1,
+      1.2500000000000002,
+      1.2500000000000002,
+      1.4000000000000004,
+      1.4500000000000004,
+      1.5500000000000005,
+      1.8500000000000008,
+      1.9000000000000008,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.000000000000001,
+      2.1000000000000005,
+      2.1000000000000005,
+      2.2,
+      2.3499999999999996,
+      2.3499999999999996,
+      2.3499999999999996,
+      2.3499999999999996,
+      2.3999999999999995,
+      2.3999999999999995,
+      2.6499999999999986,
+      2.6999999999999984,
+      2.8999999999999977,
+      2.9499999999999975,
+      3.049999999999997,
+      3.049999999999997,
+      3.049999999999997
+    ],
+    "type": "su"
+  },
+  "rope_theta": 10000.0,
+  "sliding_window": 131072,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.1",
+  "use_cache": true,
+  "vocab_size": 32064,
+  "_attn_implementation": "eager"
+}

configuration_phi3_v.py ADDED Viewed

	@@ -0,0 +1,115 @@

+from dataclasses import dataclass, field
+import inspect
+import logging
+from typing import Optional, List, Union, Dict, Tuple, Any
+from transformers.configuration_utils import PretrainedConfig
+import mlx.core as mx
+PHI3V_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "microsoft/Phi-3-vision-128k-instruct": "https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/resolve/main/config.json",
+}
+class Phi3VConfig(PretrainedConfig):
+    model_type = "phi3_v"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=32064,
+        hidden_size=3072,
+        intermediate_size=8192,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attention_dropout=0.0,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        original_max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        bos_token_id=1,
+        eos_token_id=32000,
+        pad_token_id=32000,
+        sliding_window=None,
+        embd_layer: str = "default",
+        **kwargs,
+        ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attention_dropout = attention_dropout
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.sliding_window = sliding_window
+        self.embd_layer = embd_layer
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
+            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
+        if not (
+            isinstance(rope_scaling_short_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+            )
+        if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
+            )
+        if not (
+            isinstance(rope_scaling_long_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+            )
+        if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
+            )

gen.py ADDED Viewed

	@@ -0,0 +1,126 @@

+from PIL import Image
+import requests
+from transformers import AutoProcessor
+from modeling_phi3_v import Phi3VModel
+model_path = "./"
+# kwargs = {}
+# kwargs['torch_dtype'] = torch.bfloat16
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+model = Phi3VModel.from_pretrained(model_path)
+user_prompt = '<|user|>\n'
+assistant_prompt = '<|assistant|>\n'
+prompt_suffix = "<|end|>\n"
+#################################################### text-only ####################################################
+# single-image prompt
+prompt = f"{user_prompt}what is the answer for 1+1? Explain it.{prompt_suffix}{assistant_prompt}"
+print(f">>> Prompt\n{prompt}")
+inputs = processor(prompt, images=None, return_tensors="pt")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids,
+                                  skip_special_tokens=True,
+                                  clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+#################################################### text-only 2 ####################################################
+# single-image prompt
+prompt = f"{user_prompt}Give me the code for sloving two-sum problem.{prompt_suffix}{assistant_prompt}"
+print(f">>> Prompt\n{prompt}")
+inputs = processor(prompt, images=None, return_tensors="pt").to("cuda:0")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids,
+                                  skip_special_tokens=True,
+                                  clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+#################################################### EXAMPLE 1 ####################################################
+# single-image prompt
+prompt = f"{user_prompt}<|image_1|>\nWhat is shown in this image?{prompt_suffix}{assistant_prompt}"
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+print(f">>> Prompt\n{prompt}")
+image = Image.open(requests.get(url, stream=True).raw)
+inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids,
+                                  skip_special_tokens=True,
+                                  clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+#################################################### EXAMPLE 2 ####################################################
+# multiple image prompt
+# Note: image tokens must start from <|image_1|>
+prompt = f"{user_prompt}<|image_1|>\n<|image_2|>\n What is shown in this two images?{prompt_suffix}{assistant_prompt}"
+print(f">>> Prompt\n{prompt}")
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image_1 = Image.open(requests.get(url, stream=True).raw)
+url = "https://img.freepik.com/free-photo/painting-mountain-lake-with-mountain-background_188544-9126.jpg?w=2000"
+image_2 = Image.open(requests.get(url, stream=True).raw)
+images = [image_1, image_2]
+inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+#################################################### EXAMPLE 3 ####################################################
+# chat template
+chat = [
+    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
+    {"role": "assistant", "content": "The image depicts a street scene with a prominent red stop sign in the foreground. The background showcases a building with traditional Chinese architecture, characterized by its red roof and ornate decorations. There are also several statues of lions, which are common in Chinese culture, positioned in front of the building. The street is lined with various shops and businesses, and there's a car passing by."},
+    {"role": "user", "content": "What is so special about this image"}
+]
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+# need to remove last <|endoftext|> if it is there, which is used for training, not inference. For training, make sure to add <|endoftext|> in the end.
+if prompt.endswith("<|endoftext|>"):
+    prompt = prompt.rstrip("<|endoftext|>")
+print(f">>> Prompt\n{prompt}")
+inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+############################# to markdown #############################
+# single-image prompt
+prompt = f"{user_prompt}<|image_1|>\nCan you convert the table to markdown format?{prompt_suffix}{assistant_prompt}"
+url = "https://support.content.office.net/en-us/media/3dd2b79b-9160-403d-9967-af893d17b580.png"
+image = Image.open(requests.get(url, stream=True).raw)
+inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
+print(f">>> Prompt\n{prompt}")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids,
+                                  skip_special_tokens=False,
+                                  clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')

image_embedding_phi3_v.py ADDED Viewed

	@@ -0,0 +1,301 @@

+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import torch
+import torch.nn as nn
+from transformers import CLIPVisionModel, PretrainedConfig
+from transformers import CLIPVisionConfig
+from transformers.utils import logging
+from datetime import datetime
+logger = logging.get_logger(__name__)
+CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
+  attention_dropout=0.0,
+  dropout=0.0,
+  hidden_act="quick_gelu",
+  hidden_size=1024,
+  image_size=336,
+  initializer_factor=1.0,
+  initializer_range=0.02,
+  intermediate_size=4096,
+  layer_norm_eps=1e-05,
+  num_attention_heads=16,
+  num_channels=3,
+  num_hidden_layers=24,
+  patch_size=14,
+  projection_dim=768
+)
+class Phi3ImageEmbedding(nn.Module):
+    """Phi3 Image embedding."""
+    def __init__(self, config: PretrainedConfig, wte=None, **kwargs) -> None:
+        super().__init__()
+        # n_embed or hidden_size
+        hidden_size = config.n_embd if hasattr(config, 'n_embd') else config.hidden_size
+        if hasattr(config, 'embd_pdrop') or hasattr(config, 'embed_pdrop'):
+            embd_drop = config.embd_pdrop if hasattr(config, 'embd_pdrop') else config.embed_pdrop
+            self.drop = nn.Dropout(embd_drop)
+        else:
+            self.drop = None
+        self.wte = wte
+        if isinstance(config.img_processor, dict) and config.img_processor.get('name', None) == 'clip_vision_model':
+            assert 'model_name' in config.img_processor, 'model_name must be provided for CLIPVisionModel'
+            assert 'image_dim_out' in config.img_processor, 'image_dim_out must be provided for CLIPVisionModel'
+            assert 'num_img_tokens' in config.img_processor, 'num_img_tokens must be provided for CLIPVisionModel'
+            assert config.img_processor['model_name'] == 'openai/clip-vit-large-patch14-336'
+            clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
+            self.img_processor = CLIPVisionModel(clip_config)
+            image_dim_out = config.img_processor['image_dim_out']
+            self.num_img_tokens = config.img_processor['num_img_tokens']
+        else:
+            raise NotImplementedError(f'img_processor = {config.img_processor}, not implemented')
+        self.image_dim_out = image_dim_out
+        self.img_sizes = None
+        # global_gn and sub_gn for hd transform, serves as line separator
+        self.use_hd_transform = kwargs.get('use_hd_transform', False)
+        self.with_learnable_separator = kwargs.get('with_learnable_separator', False)
+        self.hd_transform_order = kwargs.get('hd_transform_order', 'glb_sub')
+        # with_hd_transform and with_learnable_separator should have same value
+        assert self.use_hd_transform == self.with_learnable_separator, 'use_hd_transform and with_learnable_separator should have same value'
+        if self.with_learnable_separator:
+            assert self.use_hd_transform, 'learnable separator is only for hd transform'
+            # 1024 * 4, merge spatial to channel dimension
+            self.glb_GN = nn.Parameter(torch.zeros([1, 1, self.image_dim_out * 4]))
+            self.sub_GN = nn.Parameter(torch.zeros([1, 1, 1, self.image_dim_out * 4]))
+            logger.info(f'learnable separator enabled for hd transform, hd_transform_order = {self.hd_transform_order}')
+        projection_cls = kwargs.get('projection_cls', 'linear')
+        if projection_cls == 'linear':
+            self.img_projection = nn.Linear(image_dim_out, hidden_size)
+        elif projection_cls == 'mlp' and self.use_hd_transform:
+            dim_projection = hidden_size
+            depth = 2
+            layers = [nn.Linear(image_dim_out * 4, dim_projection)]
+            for _ in range(1, depth):
+                layers.extend([nn.GELU(),
+                                nn.Linear(dim_projection, dim_projection)])
+            self.img_projection = nn.Sequential(*layers)
+        elif projection_cls == 'mlp':
+            dim_projection = hidden_size
+            depth = 2
+            layers = [nn.Linear(image_dim_out, dim_projection)]
+            for _ in range(1, depth):
+                layers.extend([nn.GELU(),
+                                nn.Linear(dim_projection, dim_projection)])
+            self.img_projection = nn.Sequential(*layers)
+        else:
+            raise NotImplementedError(f'projection_cls = {projection_cls}, not implemented')
+        self.vocab_size = config.vocab_size
+        self.img_features = None
+        if isinstance(config.img_processor, dict):
+            self.layer_idx = config.img_processor.get('layer_idx', -2)
+            self.type_feature = config.img_processor.get('type_feature', 'patch')
+        else:
+            self.layer_idx = -2
+            self.type_feature = 'patch'
+    def set_img_features(self, img_features: torch.FloatTensor) -> None:
+        self.img_features = img_features
+    def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
+        self.img_sizes = img_sizes
+    def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
+        LAYER_IDX = self.layer_idx
+        TYPE_FEATURE = self.type_feature
+        img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)
+        img_feature = img_processor_output.hidden_states[LAYER_IDX]
+        if TYPE_FEATURE == "patch":
+            patch_feature = img_feature[:, 1:]
+            return patch_feature
+        if TYPE_FEATURE == "cls_patch":
+            return img_feature
+        raise NotImplementedError
+    def forward(self, input_ids: torch.LongTensor, pixel_values: torch.FloatTensor, image_sizes=None) -> torch.FloatTensor:
+        MAX_INPUT_ID = int(1e9)
+        img_embeds = pixel_values
+        img_sizes = image_sizes
+        if self.img_features is not None:
+            img_embeds = self.img_features.clone()
+            self.img_features = None
+        if self.img_sizes is not None:
+            img_sizes = self.img_sizes
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+        with torch.no_grad():
+            positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=False)
+        select = False
+        if isinstance(self.img_projection, nn.Sequential):
+            target_device = self.img_projection[0].bias.device
+            target_dtype = self.img_projection[0].bias.dtype
+        else:  # It's a single nn.Linear layer
+            target_device = self.img_projection.bias.device
+            target_dtype = self.img_projection.bias.dtype
+        if len(positions.tolist()) > 0:
+            with torch.no_grad():
+                g_values = abs(input_ids[positions[:, 0], positions[:, 1]])
+            if self.use_hd_transform and img_sizes is not None and len(img_sizes):
+                hd_transform = True
+                assert img_embeds.ndim == 5, f'img_embeds size: {img_embeds.size()}, expect 5D tensor for hd transform'
+                # img_embeds: (num_images, max_num_crops, 3, H, W)
+                # img_sizes: (num_images, 2).view(1, -1)
+                start_time = datetime.now()
+                bs = img_embeds.shape[0]
+                # Nx(HW)xC
+                img_features = self.get_img_features(img_embeds.flatten(0, 1))
+                base_feat_height = base_feat_width = int(img_features.shape[1] ** 0.5)
+                assert base_feat_height == 24 and base_feat_width == 24, f'base_feat_height: {base_feat_height}, base_feat_width: {base_feat_width}, expect 24x24 features for hd transform'
+                # bs x max_num_crops x (24x24) x C
+                img_features = img_features.view(bs, -1, base_feat_height * base_feat_width, self.image_dim_out)
+                C = self.image_dim_out
+                H = base_feat_height
+                output_imgs = []
+                output_len = []
+                # training is tensor, inference is list
+                if isinstance(img_sizes, torch.Tensor):
+                    img_sizes = img_sizes.view(-1, 2)
+                for _bs in range(bs):
+                    h, w = img_sizes[_bs]
+                    h = h // 336
+                    w = w // 336
+                    B_ = h * w
+                    # 1 x (24x24) x 1024
+                    global_img_feature = img_features[_bs, :1]
+                    # 1 x 12 x 12 x 4096
+                    glb_img = global_img_feature.reshape(1,H,H,C).reshape(1,H//2,2,H//2,2,C).contiguous().permute(0,1,3,2,4,5).reshape(1,H//2,H//2,4*C).contiguous()
+                    temp_glb_GN = self.sub_GN.repeat(1, H//2, 1, 1)
+                    # 1 x 156 x 4096
+                    glb_img = torch.cat([glb_img, temp_glb_GN], dim=2).reshape(1,-1,4*C)
+                    # (max_num_crops-1) x (12x12) x C
+                    sub_img = img_features[_bs, 1:]
+                    # 16x574x1024
+                    # get rid of padding sub_img
+                    sub_img = sub_img[:B_]
+                    # (num_crops, 12, 2, 12, 2, 1024) -> (num_crops, 12, 12, 2, 2, 1024) -> (num_crops, 12*12, 4*1024)
+                    sub_img = sub_img.reshape(B_,H,H,C).reshape(B_,H//2,2,H//2,2,C).contiguous().permute(0,1,3,2,4,5).reshape(B_,-1,4*C).contiguous()
+                    sub_img = sub_img.reshape(1, h, w, 12, 12, -1).permute(0,1,3,2,4,5).reshape(1,h*12,w*12,4*C)
+                    temp_sub_GN = self.sub_GN.repeat(1, h*12, 1, 1)
+                    sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(1,-1,4*C)
+                    # (1, num_img_tokens, 1024*4)
+                    # glb + sub
+                    if self.hd_transform_order == 'glb_sub':
+                        output_imgs.append(torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
+                    elif self.hd_transform_order == 'sub_glb':
+                        output_imgs.append(torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
+                    else:
+                        raise NotImplementedError(f'hd_transform_order = {self.hd_transform_order}, not implemented')
+                    temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
+                    assert temp_len == output_imgs[-1].shape[1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}'
+                    output_len.append(temp_len)
+                num_img_tokens = output_len
+                img_set_tensor = []
+                for _output_img in output_imgs:
+                    img_feature_proj = self.img_projection(_output_img.to(target_device).to(target_dtype))
+                    img_set_tensor.append(img_feature_proj)
+                logger.info(f'img_embeds size: {img_embeds.size()}, image sizes: {img_sizes} loading time {datetime.now() - start_time}')
+            elif img_embeds.ndim == 4:
+                selected_g_values = g_values[::self.num_img_tokens]
+                assert len(img_embeds) == len(selected_g_values), f'img_embeds size: {img_embeds.size()}, selected_g_values size: {len(selected_g_values)}, selected_g_value {selected_g_values}'
+                start_time = datetime.now()
+                tt = (
+                    self.get_img_features(img_embeds)
+                    .to(target_device)
+                    .to(target_dtype)
+                    .reshape(-1, self.image_dim_out)
+                )
+                logger.info(f'img_embeds size: {img_embeds.size()}, loading time {datetime.now() - start_time}')
+                img_set_tensor = self.img_projection(tt)  # adapted visual features.
+            elif img_embeds.ndim == 3:
+                selected_g_values = g_values[::self.num_img_tokens]
+                assert len(img_embeds) == len(selected_g_values), f'img_embeds size: {img_embeds.size()}, selected_g_values size: {len(selected_g_values)}, selected_g_value {selected_g_values}'
+                tt = (
+                    img_embeds
+                    .to(target_device)
+                    .to(target_dtype)
+                    .view(-1, self.image_dim_out)
+                )
+                img_set_tensor = self.img_projection(tt)  # adapted visual features.
+            else:
+                raise NotImplementedError
+            select = True
+        with torch.no_grad():
+            input_ids.clamp_min_(0).clamp_max_(self.vocab_size)
+        hidden_states = self.wte(input_ids)
+        if select:
+            if hd_transform:
+                idx = 0
+                for i, cnt in enumerate(num_img_tokens):
+                    hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = (
+                        img_set_tensor[i]
+                        .to(hidden_states.dtype)
+                        .to(hidden_states.device)
+                        )
+                    idx += cnt
+            else:
+                idx = 0
+                assert len(selected_g_values) * self.num_img_tokens == len(img_set_tensor), f'len(selected_g_values) * self.num_img_tokens = {len(selected_g_values) * self.num_img_tokens}, len(img_set_tensor) = {len(img_set_tensor)}'
+                for i, g in enumerate(selected_g_values):
+                    cnt = self.num_img_tokens
+                    hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = (
+                        img_set_tensor[i * cnt : (i + 1) * cnt]
+                        .to(hidden_states.dtype)
+                        .to(hidden_states.device)
+                        )
+                    idx += cnt
+        if self.drop is not None:
+            hidden_states = self.drop(hidden_states)
+        return hidden_states

image_processing_phi3_v.py ADDED Viewed

	@@ -0,0 +1,274 @@

+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Phi3-V."""
+from typing import List, Optional, Union
+import numpy as np
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from transformers.image_transforms import (
+    convert_to_rgb,
+)
+from transformers.image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ImageInput,
+    make_list_of_images,
+    valid_images,
+)
+from transformers.utils import TensorType, is_vision_available, logging
+from transformers import AutoImageProcessor
+logger = logging.get_logger(__name__)
+if is_vision_available():
+    from PIL import Image
+import torch
+import torchvision
+def padding_336(b):
+    width, height = b.size
+    tar = int(np.ceil(height / 336) * 336)
+    top_padding = int((tar - height)/2)
+    bottom_padding = tar - height - top_padding
+    left_padding = 0
+    right_padding = 0
+    b = torchvision.transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255])
+    return b
+def calc_padded_size(width, height, padding_unit=336):
+    target_height = int(np.ceil(height / padding_unit) * padding_unit)
+    top_padding = int((target_height - height) / 2)
+    bottom_padding = target_height - height - top_padding
+    left_padding = 0
+    right_padding = 0
+    padded_width = width + left_padding + right_padding
+    padded_height = height + top_padding + bottom_padding
+    return padded_width, padded_height
+def HD_transform(img, hd_num=16):
+    width, height = img.size
+    trans = False
+    if width < height:
+        img = img.transpose(Image.TRANSPOSE)
+        trans = True
+        width, height = img.size
+    ratio = (width/ height)
+    scale = 1
+    while scale*np.ceil(scale/ratio) <= hd_num:
+        scale += 1
+    scale -= 1
+    new_w = int(scale * 336)
+    new_h = int(new_w / ratio)
+    img = torchvision.transforms.functional.resize(img, [new_h, new_w],)
+    img = padding_336(img)
+    width, height = img.size
+    if trans:
+        img = img.transpose(Image.TRANSPOSE)
+    return img
+def calc_hd_transform_size(width, height, hd_num=16):
+    transposed = False
+    if width < height:
+        width, height = height, width
+        transposed = True
+    ratio = width / height
+    scale = 1
+    while scale * np.ceil(scale / ratio) <= hd_num:
+        scale += 1
+    scale -= 1
+    new_width = int(scale * 336)
+    new_height = int(new_width / ratio)
+    padded_width, padded_height = calc_padded_size(new_width, new_height)
+    if transposed:
+        padded_width, padded_height = padded_height, padded_width
+    return padded_width, padded_height
+def pad_to_max_num_crops_tensor(images, max_crops=5):
+    """
+    images: B x 3 x H x W, B<=max_crops
+    """
+    B, _, H, W = images.shape
+    if B < max_crops:
+        pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
+        images = torch.cat([images, pad], dim=0)
+    return images
+class Phi3VImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Phi3 image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques
+    for processing high resolution images as explained in the [InternLM-XComposer2-4KHD](https://arxiv.org/abs/2401.16420)
+    Args:
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+    model_input_names = ["pixel_values"]
+    def __init__(
+        self,
+        num_crops: int = 1,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.num_crops = num_crops
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+    def calc_num_image_tokens(
+            self,
+            images: ImageInput
+    ):
+        """ Calculate the number of image tokens for each image.
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+        """
+        images = make_list_of_images(images)
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        images = [image.convert('RGB') for image in images]
+        # (H, W, C)
+        elems = [HD_transform(im, hd_num = self.num_crops) for im in images]
+        shapes = [[im.size[1], im.size[0]] for im in elems]
+        num_img_tokens = [int((h//336*w//336+1)*144 + 1 + (h//336+1)*12) for h, w in shapes]
+        return num_img_tokens
+    def calc_num_image_tokens_from_image_size(self, width, height):
+        """
+        Calculate the number of image tokens for a given image size.
+        Args:
+            width (`int`): Width of the image.
+            height (`int`): Height of the image.
+        """
+        new_width, new_height = calc_hd_transform_size(width, height, hd_num=self.num_crops)
+        num_img_tokens = int((new_height // 336 * new_width // 336 + 1) * 144 + 1 + (new_height // 336 + 1) * 12)
+        return num_img_tokens
+    def preprocess(
+        self,
+        images: ImageInput,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ):
+        """
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+        """
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        images = make_list_of_images(images)
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+        image_sizes = []
+        img_processor = torchvision.transforms.Compose([
+            torchvision.transforms.ToTensor(),
+            torchvision.transforms.Normalize(image_mean, image_std)
+        ])
+        # PIL images
+        # HD_transform pad images to size of multiiply of 336, 336
+        # convert to RGB first
+        images = [image.convert('RGB') for image in images]
+        elems = [HD_transform(im, hd_num = self.num_crops) for im in images]
+        # tensor transform and normalize
+        hd_images = [img_processor(im) for im in elems]
+        # create global image
+        global_image = [torch.nn.functional.interpolate(im.unsqueeze(0).float(), size=(336, 336), mode='bicubic',).to(im.dtype) for im in hd_images]
+        # [(3, h, w)], where h, w is multiple of 336
+        shapes = [[im.size(1), im.size(2)] for im in hd_images]
+        num_img_tokens = [int((h//336*w//336+1)*144 + 1 + (h//336+1)*12) for h, w in shapes]
+        # reshape to channel dimension -> (num_images, num_crops, 3, 336, 336)
+        # (1, 3, h//336, 336, w//336, 336) -> (1, h//336, w//336, 3, 336, 336) -> (h//336*w//336, 3, 336, 336)
+        hd_images_reshape = [im.reshape(1, 3, h//336, 336, w//336, 336).permute(0,2,4,1,3,5).reshape(-1, 3, 336, 336).contiguous() for im, (h, w) in zip(hd_images, shapes)]
+        # concat global image and local image
+        hd_images_reshape = [torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)]
+        # pad to max_num_crops
+        image_transformed = [pad_to_max_num_crops_tensor(im, self.num_crops+1) for im in hd_images_reshape]
+        image_transformed = torch.stack(image_transformed, dim=0)
+        image_sizes = [torch.LongTensor(_shapes) for _shapes in shapes]
+        padded_images = image_transformed
+        image_sizes = shapes
+        data = {"pixel_values": padded_images,
+                "image_sizes": image_sizes,
+                "num_img_tokens": num_img_tokens
+                }
+        return BatchFeature(data=data, tensor_type=return_tensors)
+AutoImageProcessor.register("Phi3VImageProcessor", Phi3VImageProcessor)

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:175b2fe918dd8bd2549e3441615ee0c6d7b1f6d638c0104a614546f55c273482
+size 4944122112

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e61ece5a8f0c9663afa06cc22799056f5cc084fb993518bf036dc8e268fd4c94
+size 3349208776

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,599 @@

+{
+  "metadata": {
+    "total_size": 8293242880
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors",
+    "model.vision_embed_tokens.glb_GN": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.embeddings.class_embedding": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.pre_layrnorm.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_processor.vision_model.pre_layrnorm.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_projection.0.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_projection.0.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_projection.2.bias": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.img_projection.2.weight": "model-00001-of-00002.safetensors",
+    "model.vision_embed_tokens.sub_GN": "model-00001-of-00002.safetensors"
+  }
+}

modeling_phi3_v.py ADDED Viewed

	@@ -0,0 +1,780 @@

+from dataclasses import dataclass
+import glob
+import json
+from pathlib import Path
+from typing import Dict, Optional, List, Tuple, Union
+import math
+import warnings
+import mlx.core as mx
+import mlx.nn as nn
+import logging
+# from llms.mlx_lm.models.base import BaseModelArgs
+from configuration_phi3_v import Phi3VConfig
+from utils import BaseModelOutputWithPast, FloatTensor, LongTensor, Cache, DynamicCache, CausalLMOutputWithPast
+from image_embedding_phi3_v import Phi3ImageEmbedding
+from attn_mask import _prepare_4d_causal_attention_mask
+from huggingface_hub import snapshot_download
+class Phi3RotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+    def __call__(self, x, position_ids, seq_len=None):
+        if self.inv_freq is None:
+            self.inv_freq = 1.0 / (
+                self.base ** (mx.arange(0, self.dim, 2, Dtype=mx.int64, device=x.device).float() / self.dim)
+            )
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+        emb = mx.concatenate((freqs, freqs), dim=-1)
+        cos = emb.cos()
+        sin = emb.sin()
+        return cos.to(Dtype=x.Dtype), sin.to(Dtype=x.Dtype)
+class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
+    def __init__(self, dim, config):
+        super().__init__(dim, config.max_position_embeddings, config.rope_theta)
+        self.short_factor = config.rope_scaling["short_factor"]
+        self.long_factor = config.rope_scaling["long_factor"]
+        self.original_max_position_embeddings = config.original_max_position_embeddings
+    def __call__(self, x, position_ids, seq_len=None):
+        seq_len = mx.max(position_ids) + 1
+        if seq_len > self.original_max_position_embeddings:
+            ext_factors = mx.array(self.long_factor, Dtype=mx.float32)
+        else:
+            ext_factors = mx.array(self.short_factor, Dtype=mx.float32)
+        inv_freq_shape = mx.arange(0, self.dim, 2, Dtype=mx.int64).float() / self.dim
+        self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+        emb = mx.concatenate((freqs, freqs), dim=-1)
+        scale = self.max_position_embeddings / self.original_max_position_embeddings
+        if scale <= 1.0:
+            scaling_factor = 1.0
+        else:
+            scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
+        cos = emb.cos() * scaling_factor
+        sin = emb.sin() * scaling_factor
+        return cos.to(Dtype=x.Dtype), sin.to(Dtype=x.Dtype)
+class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
+    def __init__(self, dim, config):
+        super().__init__(dim, config.max_position_embeddings, config.rope_theta)
+        self.short_factor = config.rope_scaling["short_factor"]
+        self.long_factor = config.rope_scaling["long_factor"]
+        self.original_max_position_embeddings = config.original_max_position_embeddings
+    def __call__(self, x, position_ids, seq_len=None):
+        seq_len = mx.max(position_ids) + 1
+        if seq_len > self.original_max_position_embeddings:
+            ext_factors = mx.array(self.long_factor, Dtype=mx.float32)
+        else:
+            ext_factors = mx.array(self.short_factor, Dtype=mx.float32)
+        inv_freq_shape = mx.arange(0, self.dim, 2, Dtype=mx.int64).float() / self.dim
+        self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+        emb = mx.concatenate((freqs, freqs), dim=-1)
+        scale = self.max_position_embeddings / self.original_max_position_embeddings
+        if scale <= 1.0:
+            scaling_factor = 1.0
+        else:
+            scaling_factor = 0.1 * math.log(scale) + 1.0
+        cos = emb.cos() * scaling_factor
+        sin = emb.sin() * scaling_factor
+        return cos.to(Dtype=x.Dtype), sin.to(Dtype=x.Dtype)
+def rotate_half(x):
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return mx.concatenate((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class Phi3MLP(nn.Module):
+    def __init__(self, config: Phi3VConfig):
+        super().__init__()
+        self.config = config
+        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+    def __call__(self, x) -> mx.array:
+        x = self.gate_up_proj(x)
+        gate, x = mx.split(x, 2, axis=-1)
+        return self.down_proj(nn.silu(gate) * x)
+def repeat_kv(hidden_states: mx.array, n_rep: int) -> mx.array:
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class Phi3Attention(nn.Module):
+    def __init__(self, config: Phi3VConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logging.warning(
+                "Instantiating %s without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class.",
+                self.__class__.__name__,
+            )
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.original_max_position_embeddings = config.original_max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.rope_scaling = config.rope_scaling
+        self.is_causal = True
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
+        self._init_rope()
+    def _init_rope(self):
+        if self.rope_scaling is None:
+            self.rotary_emb = Phi3RotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            if scaling_type == "su":
+                self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config)
+            elif scaling_type == "yarn":
+                self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config)
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        attention_mask: Optional[mx.array] = None,
+        position_ids: Optional[LongTensor] = None,
+        past_key_value: Optional[Tuple[mx.array, mx.array]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[mx.array, Optional[mx.array], Optional[Tuple[mx.array]]]:
+        logging.warning("You are not running the flash-attention implementation, expect numerical differences.")
+        bsz, q_len, _ = hidden_states.size()
+        qkv = self.qkv_proj(hidden_states)
+        query_pos = self.num_heads * self.head_dim
+        query_states = qkv[..., :query_pos]
+        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        attn_weights = mx.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+        attn_weights = mx.softmax(attn_weights, dim=-1, Dtype=mx.float32).to(value_states.Dtype)
+        attn_weights = mx.Dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = mx.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class Phi3SdpaAttention(Phi3Attention):
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        attention_mask: Optional[mx.array] = None,
+        position_ids: Optional[LongTensor] = None,
+        past_key_value: Optional[Tuple[mx.array, mx.array]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[mx.array, Optional[mx.array], Optional[Tuple[mx.array]]]:
+        if output_attentions:
+            logging.warning(
+                "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().__call__(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+        bsz, q_len, _ = hidden_states.size()
+        qkv = self.qkv_proj(hidden_states)
+        query_pos = self.num_heads * self.head_dim
+        query_states = qkv[..., :query_pos]
+        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+        attn_output = mx.fast.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None, past_key_value
+PHI3_ATTENTION_CLASSES = {
+    "eager": Phi3Attention,
+    "sdpa": Phi3SdpaAttention,
+}
+class Phi3DecoderLayer(nn.Module):
+    def __init__(self, config: Phi3VConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
+        self.mlp = Phi3MLP(config)
+        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
+        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
+        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        attention_mask: Optional[mx.array] = None,
+        position_ids: Optional[LongTensor] = None,
+        past_key_value: Optional[Tuple[mx.array]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[mx.array, Optional[Tuple[FloatTensor, FloatTensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        attn_outputs, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + self.resid_attn_dropout(attn_outputs)
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + self.resid_mlp_dropout(hidden_states)
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+class Phi3VPreTrainedModel(nn.Module):
+    config_class = Phi3VConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Phi3DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = False
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _version = "0.0.5"
+    def __init__(self, config):
+        super(Phi3VPreTrainedModel, self).__init__()
+        self.config = config
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+class Phi3VModel(Phi3VPreTrainedModel):
+    def __init__(self, config: Phi3VConfig):
+        super(Phi3VModel, self).__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.embed_dropout = nn.Dropout(config.embd_pdrop)
+        # Vision embedding integration
+        if isinstance(config.embd_layer, dict) and config.embd_layer.get('embedding_cls') == 'image':
+            self.vision_embed_tokens = Phi3ImageEmbedding(config)
+        else:
+            self.vision_embed_tokens = None
+        self.layers = [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.apply(self._init_weights)
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    def __call__(
+        self,
+        input_ids: LongTensor = None,
+        attention_mask: Optional[mx.array] = None,
+        position_ids: Optional[LongTensor] = None,
+        past_key_values: Optional[List[FloatTensor]] = None,
+        inputs_embeds: Optional[FloatTensor] = None,
+        pixel_values: Optional[FloatTensor] = None,
+        image_sizes: Optional[LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape[:2]
+        elif inputs_embeds is not None:
+            batch_size, seq_length = inputs_embeds.shape[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        past_key_values_length = 0
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logging.warning(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = mx.arange(
+                past_key_values_length, seq_length + past_key_values_length, Dtype=mx.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        if inputs_embeds is None:
+            if pixel_values is not None and image_sizes is not None:
+                assert self.vision_embed_tokens is not None, "Vision embedding layer is not defined"
+                inputs_embeds = self.vision_embed_tokens(input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
+            else:
+                inputs_embeds = self.embed_tokens(input_ids)
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to "
+                    " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+                )
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
+        hidden_states = inputs_embeds
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+    @staticmethod
+    def from_pretrained(path_or_hf_repo: str):
+        path = Path(path_or_hf_repo)
+        if not path.exists():
+            path = Path(
+                snapshot_download(
+                    repo_id=path_or_hf_repo,
+                    allow_patterns=[
+                        "*.json",
+                        "*.safetensors",
+                        "*.py",
+                        "tokenizer.model",
+                        "*.tiktoken",
+                    ],
+                )
+            )
+        with open(path / "config.json", "r") as f:
+            model_config = json.load(f)
+        model = Phi3VModel(Phi3VConfig.from_dict(model_config))
+        weight_files = list(glob.glob(f"{path}/*.safetensors"))
+        assert len(weight_files) > 0, f"No safetensors weight files found: {weight_files}"
+        # Load weights from all files
+        weights = {}
+        for wf in weight_files:
+            weights.update(mx.load(wf))
+        # Ensure all weights are converted to lists if necessary
+        for k, v in weights.items():
+            if hasattr(v, 'tolist'):
+                weights[k] = v.tolist()
+        # Load weights
+        model.load_weights(list(weights.items()))
+        return model
+class Phi3VForCausalLM(Phi3VPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Phi3VModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    def __call__(
+        self,
+        input_ids: LongTensor = None,
+        attention_mask: Optional[mx.array] = None,
+        position_ids: Optional[LongTensor] = None,
+        past_key_values: Optional[List[FloatTensor]] = None,
+        inputs_embeds: Optional[FloatTensor] = None,
+        pixel_values: Optional[FloatTensor] = None,
+        image_sizes: Optional[LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()
+        loss = None
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, pixel_values=None, image_sizes=None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "pixel_values": pixel_values,
+                "image_sizes": image_sizes,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+    "auto_map": {
+        "AutoProcessor": "processing_phi3_v.Phi3VProcessor",
+        "AutoImageProcessor": "image_processing_phi3_v.Phi3VImageProcessor"
+    },
+    "num_crops": 16,
+    "image_mean": [
+      0.48145466,
+      0.4578275,
+      0.40821073
+    ],
+    "image_processor_type": "Phi3VImageProcessor",
+    "image_std": [
+      0.26862954,
+      0.26130258,
+      0.27577711
+    ],
+    "processor_class": "Phi3VProcessor",
+    "num_img_tokens": 144
+  }

processing_phi3_v.py ADDED Viewed

	@@ -0,0 +1,217 @@

+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Phi3-V.
+"""
+import re
+from typing import List, Optional, Union
+import torch
+import transformers
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
+from transformers.utils import TensorType
+from .image_processing_phi3_v import Phi3VImageProcessor
+transformers.Phi3VImageProcessor = Phi3VImageProcessor
+class Phi3VProcessor(ProcessorMixin):
+    r"""
+    Constructs a Phi3-V processor which wraps a Phi3-V image processor and a LLaMa tokenizer into a single processor.
+    [`Phi3VProcessor`] offers all the functionalities of [`Phi3VImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~Phi3VProcessor.__call__`] and [`~Phi3VProcessor.decode`] for more information.
+    Args:
+        image_processor ([`Phi3VImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "Phi3VImageProcessor"
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    special_image_token = "<|image|>"
+    def __init__(self, image_processor, tokenizer):
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+        self.num_img_tokens = image_processor.num_img_tokens
+        self.img_tokens = [f"<|image_{i+1}|>" for i in range(1000000)]
+    def __call__(
+        self,
+        text: Union[TextInput, List[TextInput]],
+        images: ImageInput = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length=None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        Phi3ImageProcessor's [`~Phi3ImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if images is not None:
+            image_inputs = self.image_processor(images, return_tensors=return_tensors)
+        else:
+            image_inputs = {}
+        inputs = self._convert_images_texts_to_inputs(image_inputs, text, padding=padding, truncation=truncation, max_length=max_length, return_tensors=return_tensors)
+        return inputs
+    def calc_num_image_tokens(self, images: ImageInput):
+        """ Calculate the number of image tokens for each image.
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+        """
+        return self.image_processor.calc_num_image_tokens(images)
+    def calc_num_image_tokens_from_image_size(self, width, height):
+        """ Calculate the number of image token for an image with given width and height.
+        Args:
+            width (`int`):
+                Width of the image.
+            height (`int`):
+                Height of the image.
+        """
+        return self.image_processor.calc_num_image_tokens_from_image_size(width, height)
+    @property
+    def special_image_token_id(self):
+        return self.tokenizer.convert_tokens_to_ids(self.special_image_token)
+    def get_special_image_token_id(self):
+        return self.tokenizer.convert_tokens_to_ids(self.special_image_token)
+    def _convert_images_texts_to_inputs(self, images, texts, padding=False, truncation=None, max_length=None, return_tensors=None):
+        if not len(images):
+            model_inputs = self.tokenizer(texts, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length)
+            return BatchFeature(data={**model_inputs})
+        pattern = r"<\|image_\d+\|>"
+        prompt_chunks = [self.tokenizer(chunk).input_ids for chunk in re.split(pattern, texts)]
+        if 'num_img_tokens' in images:
+            num_img_tokens = images['num_img_tokens']
+        else:
+            assert 'num_crops' in images, 'num_crops must be provided in images if num_img_tokens is not provided'
+            num_crops = images['num_crops']
+            num_img_tokens = [_num_crops * self.num_img_tokens for _num_crops in num_crops]
+        images, image_sizes = images['pixel_values'], images['image_sizes']
+        # image_tags needs to start from 1 to n
+        image_tags = re.findall(pattern, texts)
+        # image_ids = [int(s.split("|")[1].split("_")[-1]) * -1 for s in image_tags]
+        # image_ids_pad = [[iid]*num_img_tokens[i] for i, iid in enumerate(image_ids)]
+        image_ids = [int(s.split("|")[1].split("_")[-1]) for s in image_tags]
+        unique_image_ids = sorted(list(set(image_ids)))
+        # image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be [1, 4, 5]
+        # check the condition
+        assert unique_image_ids == list(range(1, len(unique_image_ids)+1)), f"image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be {unique_image_ids}"
+        # total images must be the same as the number of image tags
+        assert len(unique_image_ids) == len(images), f"total images must be the same as the number of image tags, got {len(unique_image_ids)} image tags and {len(images)} images"
+        image_ids_pad = [[-iid]*num_img_tokens[iid-1] for iid in image_ids]
+        def insert_separator(X, sep_list):
+            if len(X) > len(sep_list):
+                sep_list.append([])
+            return [ele for sublist in zip(X, sep_list) for ele in sublist]
+        input_ids = []
+        offset = 0
+        for x in insert_separator(prompt_chunks, image_ids_pad):
+            input_ids.extend(x[offset:])
+        input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)
+        attention_mask = (input_ids > -1000000).to(torch.long)
+        return BatchFeature(data={"input_ids": input_ids,
+                                  "attention_mask": attention_mask,
+                                  "pixel_values": images,
+                                  "image_sizes": image_sizes})
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

sample_inference.py ADDED Viewed

	@@ -0,0 +1,129 @@

+from PIL import Image
+import requests
+import torch
+from transformers import AutoModelForCausalLM
+from transformers import AutoProcessor
+model_path = "./"
+kwargs = {}
+kwargs['torch_dtype'] = torch.bfloat16
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype="auto").cuda()
+user_prompt = '<|user|>\n'
+assistant_prompt = '<|assistant|>\n'
+prompt_suffix = "<|end|>\n"
+#################################################### text-only ####################################################
+# single-image prompt
+prompt = f"{user_prompt}what is the answer for 1+1? Explain it.{prompt_suffix}{assistant_prompt}"
+print(f">>> Prompt\n{prompt}")
+inputs = processor(prompt, images=None, return_tensors="pt").to("cuda:0")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids,
+                                  skip_special_tokens=True,
+                                  clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+#################################################### text-only 2 ####################################################
+# single-image prompt
+prompt = f"{user_prompt}Give me the code for sloving two-sum problem.{prompt_suffix}{assistant_prompt}"
+print(f">>> Prompt\n{prompt}")
+inputs = processor(prompt, images=None, return_tensors="pt").to("cuda:0")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids,
+                                  skip_special_tokens=True,
+                                  clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+#################################################### EXAMPLE 1 ####################################################
+# single-image prompt
+prompt = f"{user_prompt}<|image_1|>\nWhat is shown in this image?{prompt_suffix}{assistant_prompt}"
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+print(f">>> Prompt\n{prompt}")
+image = Image.open(requests.get(url, stream=True).raw)
+inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids,
+                                  skip_special_tokens=True,
+                                  clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+#################################################### EXAMPLE 2 ####################################################
+# multiple image prompt
+# Note: image tokens must start from <|image_1|>
+prompt = f"{user_prompt}<|image_1|>\n<|image_2|>\n What is shown in this two images?{prompt_suffix}{assistant_prompt}"
+print(f">>> Prompt\n{prompt}")
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image_1 = Image.open(requests.get(url, stream=True).raw)
+url = "https://img.freepik.com/free-photo/painting-mountain-lake-with-mountain-background_188544-9126.jpg?w=2000"
+image_2 = Image.open(requests.get(url, stream=True).raw)
+images = [image_1, image_2]
+inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+#################################################### EXAMPLE 3 ####################################################
+# chat template
+chat = [
+    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
+    {"role": "assistant", "content": "The image depicts a street scene with a prominent red stop sign in the foreground. The background showcases a building with traditional Chinese architecture, characterized by its red roof and ornate decorations. There are also several statues of lions, which are common in Chinese culture, positioned in front of the building. The street is lined with various shops and businesses, and there's a car passing by."},
+    {"role": "user", "content": "What is so special about this image"}
+]
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+# need to remove last <|endoftext|> if it is there, which is used for training, not inference. For training, make sure to add <|endoftext|> in the end.
+if prompt.endswith("<|endoftext|>"):
+    prompt = prompt.rstrip("<|endoftext|>")
+print(f">>> Prompt\n{prompt}")
+inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')
+############################# to markdown #############################
+# single-image prompt
+prompt = f"{user_prompt}<|image_1|>\nCan you convert the table to markdown format?{prompt_suffix}{assistant_prompt}"
+url = "https://support.content.office.net/en-us/media/3dd2b79b-9160-403d-9967-af893d17b580.png"
+image = Image.open(requests.get(url, stream=True).raw)
+inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
+print(f">>> Prompt\n{prompt}")
+generate_ids = model.generate(**inputs,
+                              max_new_tokens=1000,
+                              eos_token_id=processor.tokenizer.eos_token_id,
+                              )
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids,
+                                  skip_special_tokens=False,
+                                  clean_up_tokenization_spaces=False)[0]
+print(f'>>> Response\n{response}')

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "additional_special_tokens": [
+    "<|system|>",
+    "<|end|>",
+    "<|user|>",
+    "<|end|>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,408 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32011": {
+      "content": "<|placeholder7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32012": {
+      "content": "<|placeholder8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32013": {
+      "content": "<|placeholder9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32014": {
+      "content": "<|placeholder10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32015": {
+      "content": "<|placeholder11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "<|placeholder12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32017": {
+      "content": "<|placeholder13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32018": {
+      "content": "<|placeholder14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32019": {
+      "content": "<|placeholder15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32020": {
+      "content": "<|placeholder16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32021": {
+      "content": "<|placeholder17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32022": {
+      "content": "<|placeholder18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32023": {
+      "content": "<|placeholder19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32024": {
+      "content": "<|placeholder20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32025": {
+      "content": "<|placeholder21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32026": {
+      "content": "<|placeholder22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32027": {
+      "content": "<|placeholder23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32028": {
+      "content": "<|placeholder24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32029": {
+      "content": "<|placeholder25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32030": {
+      "content": "<|placeholder26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32031": {
+      "content": "<|placeholder27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32032": {
+      "content": "<|placeholder28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32033": {
+      "content": "<|placeholder29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32034": {
+      "content": "<|placeholder30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32035": {
+      "content": "<|placeholder31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32036": {
+      "content": "<|placeholder32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32037": {
+      "content": "<|placeholder33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32038": {
+      "content": "<|placeholder34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32039": {
+      "content": "<|placeholder35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32040": {
+      "content": "<|placeholder36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32041": {
+      "content": "<|placeholder37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32042": {
+      "content": "<|placeholder38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32043": {
+      "content": "<|placeholder39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32044": {
+      "content": "<|image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|system|>",
+    "<|end|>",
+    "<|user|>",
+    "<|end|>"
+  ],
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "legacy": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

utils.py ADDED Viewed

	@@ -0,0 +1,255 @@

+from dataclasses import dataclass, field
+import inspect
+import logging
+from typing import Optional, List, Union, Dict, Tuple, Any
+from transformers.configuration_utils import PretrainedConfig
+import mlx.core as mx
+# Define a custom float tensor type using the provided data type
+class FloatTensor:
+    def __init__(self, data):
+        if data is not None:
+            self.tensor = mx.array(data, dtype=mx.float32)
+        else:
+            self.tensor = None
+    def __repr__(self):
+        return repr(self.tensor)
+# Define a custom LongTensor class
+class LongTensor:
+    def __init__(self, data=None):
+        if data is not None:
+            self.tensor = mx.array(data, dtype=mx.int64)
+        else:
+            self.tensor = None
+    def assign(self, data):
+        self.tensor = mx.array(data, dtype=mx.int64)
+    def __repr__(self):
+        return repr(self.tensor)
+@dataclass
+class BaseModelOutputWithPast:
+    """
+    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+            encoder_sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+            input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+    last_hidden_state: FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[FloatTensor]]] = None
+    hidden_states: Optional[Tuple[FloatTensor, ...]] = None
+    attentions: Optional[Tuple[FloatTensor, ...]] = None
+@dataclass
+class Cache:
+    """
+    Base, abstract class for all caches. The actual data structure is specific to each subclass.
+    """
+    def update(
+        self,
+        key_states: mx.array,
+        value_states: mx.array,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[mx.array, mx.array]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+        Parameters:
+            key_states (`mx.array`):
+                The new key states to cache.
+            value_states (`mx.array`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. These are specific to each subclass and allow new types of
+                cache to be created.
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        raise NotImplementedError("Make sure to implement `update` in a subclass.")
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # TODO: deprecate this function in favor of `cache_position`
+        raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.")
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states, if there is any."""
+        raise NotImplementedError("Make sure to implement `get_max_length` in a subclass.")
+    def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
+        """Given the sequence length of the new inputs, returns the usable length of the cache."""
+        # Cache without size limit -> all cache is usable
+        # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache
+        #   length, we will need to evict part of the cache (and thus not all cache is usable)
+        max_length = self.get_max_length()
+        previous_seq_length = self.get_seq_length(layer_idx)
+        if max_length is not None and previous_seq_length + new_seq_length > max_length:
+            return max_length - new_seq_length
+        return previous_seq_length
+    # def reorder_cache(self, beam_idx: LongTensor):
+    #     """Reorders the cache for beam search, given the selected beam indices."""
+    #     for layer_idx in range(len(self.key_cache)):
+    #         device = self.key_cache[layer_idx].device
+    #         self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+    #         device = self.value_cache[layer_idx].device
+    #         self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+    @property
+    def seen_tokens(self):
+        logging.warning(
+            "The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` "
+            "model input instead."
+        )
+        if hasattr(self, "_seen_tokens"):
+            return self._seen_tokens
+        else:
+            return None
+class DynamicCache(Cache):
+    """
+    A cache that grows dynamically as more tokens are generated. This is the default for generative models.
+    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
+    `[batch_size, num_heads, seq_len, head_dim]`.
+    """
+    def __init__(self) -> None:
+        self.key_cache: List[mx.array] = []
+        self.value_cache: List[mx.array] = []
+        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
+    def __getitem__(self, layer_idx: int) -> List[Tuple[mx.array]]:
+        """
+        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
+        sequence length.
+        """
+        if layer_idx < len(self):
+            return (self.key_cache[layer_idx], self.value_cache[layer_idx])
+        else:
+            raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
+    def __iter__(self):
+        """
+        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
+        keys and values
+        """
+        for layer_idx in range(len(self)):
+            yield (self.key_cache[layer_idx], self.value_cache[layer_idx])
+    def __len__(self):
+        """
+        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
+        to the number of layers in the model.
+        """
+        return len(self.key_cache)
+    def update(
+        self,
+        key_states: mx.array,
+        value_states: mx.array,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[mx.array, mx.array]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+        Parameters:
+            key_states (`mx.array`):
+                The new key states to cache.
+            value_states (`mx.array`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # Update the number of seen tokens
+        if layer_idx == 0:
+            self._seen_tokens += key_states.shape[-2]
+        # Update the cache
+        if len(self.key_cache) <= layer_idx:
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+        else:
+            self.key_cache[layer_idx] = mx.concatenate([self.key_cache[layer_idx], key_states], dim=-2)
+            self.value_cache[layer_idx] = mx.concatenate([self.value_cache[layer_idx], value_states], dim=-2)
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # TODO: deprecate this function in favor of `cache_position`
+        if len(self.key_cache) <= layer_idx:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
+        return None
+    def to_legacy_cache(self) -> Tuple[Tuple[mx.array], Tuple[mx.array]]:
+        """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format."""
+        legacy_cache = ()
+        for layer_idx in range(len(self)):
+            legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),)
+        return legacy_cache
+    @classmethod
+    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[FloatTensor]]] = None) -> "DynamicCache":
+        """Converts a cache in the legacy cache format into an equivalent `DynamicCache`."""
+        cache = cls()
+        if past_key_values is not None:
+            for layer_idx in range(len(past_key_values)):
+                key_states, value_states = past_key_values[layer_idx]
+                cache.update(key_states, value_states, layer_idx)
+        return cache
+@dataclass
+class CausalLMOutputWithPast():
+    loss: Optional[FloatTensor] = None
+    logits: FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[FloatTensor]]] = None
+    hidden_states: Optional[Tuple[FloatTensor, ...]] = None
+    attentions: Optional[Tuple[FloatTensor, ...]] = None