ohashi56225 committed f188f75 (parent: 71c9844)

Upload LlavaForConditionalGeneration

Files changed:
- config.json +242 -0
- configuration_llava.py +131 -0
- generation_config.json +6 -0
- model.safetensors +3 -0
- modeling_llava.py +345 -0
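Since `config.json` registers the custom classes under `auto_map`, the checkpoint can be loaded with the stock auto classes once remote code is trusted. A minimal loading sketch; the repository id below is a placeholder for wherever this commit lives:

```python
from transformers import AutoConfig, AutoModelForVision2Seq

repo_id = "<namespace>/<repo>"  # placeholder: the Hub repository containing this commit

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(repo_id, trust_remote_code=True)

# The composed config exposes the three sub-configs defined in configuration_llava.py.
print(config.vision_config.model_type)      # "clip_vision_model"
print(config.mlp_config.num_hidden_layers)  # 2
print(config.text_config.model_type)        # "gpt_neox"
```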
config.json
ADDED
@@ -0,0 +1,242 @@
{
  "_name_or_path": "output/jp-llava-small-sfcoco-bs8-lr5e5/checkpoints",
  "architectures": [
    "LlavaForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_llava.LlavaConfig",
    "AutoModelForVision2Seq": "modeling_llava.LlavaForConditionalGeneration"
  },
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "mlp_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "llava_mlp",
    "no_repeat_ngram_size": 0,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 2,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false
  },
  "model_type": "llava",
  "text_config": {
    "_name_or_path": "rinna/japanese-gpt-neox-small",
    "add_cross_attention": false,
    "architectures": [
      "GPTNeoXForCausalLM"
    ],
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 2,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": 0.1,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 3,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout": 0.0,
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 2048,
    "min_length": 0,
    "model_type": "gpt_neox",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "rope_scaling": null,
    "rotary_emb_base": 10000,
    "rotary_pct": 1.0,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": false,
    "tokenizer_class": "T5Tokenizer",
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": "float32",
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_cache": true,
    "use_parallel_residual": false,
    "vocab_size": 44416
  },
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "use_decoder_only_language_model": true,
  "vision_config": {
    "_name_or_path": "openai/clip-vit-large-patch14",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 1024,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_size": 224,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "clip_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 16,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_channels": 3,
    "num_hidden_layers": 24,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 14,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 768,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false
  },
  "vision_select_feature": "patch",
  "vision_select_layer": -2
}
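Taken together, the sub-configs describe a CLIP ViT-L/14 vision tower (224-pixel input, hidden size 1024), a 2-layer MLP projector, and the rinna japanese-gpt-neox-small language model (hidden size 768). A small sketch, using only the values above, of the image prefix this implies:

```python
# Values copied from vision_config / text_config above.
image_size, patch_size = 224, 14
vision_hidden, text_hidden = 1024, 768

num_patches = (image_size // patch_size) ** 2  # 16 x 16 = 256 patch tokens
# vision_select_feature == "patch" drops the CLS token, so the language model is
# prepended with 256 image embeddings, each projected 1024 -> 768 by the MLP.
print(num_patches, vision_hidden, "->", text_hidden)
```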
configuration_llava.py
ADDED
@@ -0,0 +1,131 @@
# Copyright 2023 Stability AI team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Union

from transformers import PretrainedConfig, CLIPVisionConfig
from transformers.models.auto import CONFIG_MAPPING
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers.utils import logging


logger = logging.get_logger(__name__)


class LlavaMlpConfig(PretrainedConfig):
    model_type = "llava_mlp"

    def __init__(
        self,
        num_hidden_layers=2,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.num_hidden_layers = num_hidden_layers

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        # get the qformer config dict if we are loading from InstructBlipConfig
        if config_dict.get("model_type") == "llava":
            config_dict = config_dict["mlp_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class LlavaConfig(PretrainedConfig):
    model_type = "llava"
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        mlp_config=None,
        text_config=None,
        vision_select_layer=-2,
        vision_select_feature="patch",
        **kwargs,
    ):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info(
                "vision_config is None. initializing the CLIPVisionConfig with default values."
            )

        if mlp_config is None:
            mlp_config = {}
            logger.info(
                "mlp_config is None. Initializing the LlavaMlpConfig with default values."
            )

        if text_config is None:
            text_config = {}
            logger.info(
                "text_config is None. Initializing the text config with default values (`OPTConfig`)."
            )

        self.vision_config = CLIPVisionConfig(**vision_config)
        self.mlp_config = LlavaMlpConfig(**mlp_config)
        text_model_type = text_config["model_type"]
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        self.tie_word_embeddings = self.text_config.tie_word_embeddings
        self.is_encoder_decoder = self.text_config.is_encoder_decoder

        self.use_decoder_only_language_model = (
            self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        )
        self.vision_select_layer = vision_select_layer
        assert vision_select_feature in [
            "cls_patch",
            "patch",
        ], f"Unexpected select feature: {vision_select_feature}"
        self.vision_select_feature = vision_select_feature
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_mlp_text_configs(
        cls,
        vision_config: CLIPVisionConfig,
        mlp_config: LlavaMlpConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        return cls(
            vision_config=vision_config.to_dict(),
            mlp_config=mlp_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )
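For reference, the composed configuration stored in `config.json` can also be built programmatically via `from_vision_mlp_text_configs`. A sketch assuming `configuration_llava.py` is importable locally and using the source checkpoints named in the config (they are not part of this commit):

```python
from transformers import AutoConfig, CLIPVisionConfig

from configuration_llava import LlavaConfig, LlavaMlpConfig

vision_config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
mlp_config = LlavaMlpConfig(num_hidden_layers=2)
text_config = AutoConfig.from_pretrained("rinna/japanese-gpt-neox-small")

config = LlavaConfig.from_vision_mlp_text_configs(
    vision_config=vision_config,
    mlp_config=mlp_config,
    text_config=text_config,
    vision_select_layer=-2,
    vision_select_feature="patch",
)
# gpt_neox is a causal-LM architecture, so this flag is derived as True.
print(config.use_decoder_only_language_model)
```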
generation_config.json
ADDED
@@ -0,0 +1,6 @@
{
  "_from_model_config": true,
  "bos_token_id": 2,
  "eos_token_id": 3,
  "transformers_version": "4.35.2"
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dca99e6a876047698b4bde214395e27b6babfa133584f341efe7c03006134a5f
size 1831419000
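The entry above is a Git LFS pointer rather than the weights themselves: the roughly 1.8 GB safetensors file is addressed by its SHA-256. A minimal sketch for checking a downloaded copy against the pointer's `oid`:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file so the ~1.8 GB checkpoint is never held in memory at once.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "dca99e6a876047698b4bde214395e27b6babfa133584f341efe7c03006134a5f"
print(sha256_of("model.safetensors") == expected)
```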
modeling_llava.py
ADDED
@@ -0,0 +1,345 @@
# Copyright 2023 Stability AI team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple, Union, Any
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    PreTrainedModel,
    CLIPVisionModel,
)

from transformers.utils import logging, ModelOutput
from .configuration_llava import LlavaConfig


logger = logging.get_logger(__name__)


@dataclass
class LlavaForConditionalGenerationModelOutput(ModelOutput):
    loss: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    vision_outputs: Optional[torch.FloatTensor] = None
    language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k]
            if k not in ["vision_outputs", "language_model_outputs"]
            else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class LlavaPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = LlavaConfig
    base_model_prefix = "llava"

    # Copied from transformers.models.blip_2.modeling_blip_2.Blip2PreTrainedModel._init_weights with Blip2->InstructBlip
    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_range
        if (
            isinstance(module, nn.Conv2d)
            or isinstance(module, nn.Embedding)
            or isinstance(module, nn.Linear)
        ):
            module.weight.data.normal_(mean=0.0, std=factor)
            if hasattr(module, "bias") and module.bias is not None:
                module.bias.data.zero_()

        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class LlavaForConditionalGeneration(LlavaPreTrainedModel):
    config_class = LlavaConfig
    main_input_name = "pixel_values"
    _no_split_modules = []

    def __init__(self, config: LlavaConfig):
        super().__init__(config)

        self.vision_model = CLIPVisionModel(config.vision_config)
        if config.use_decoder_only_language_model:
            language_model = AutoModelForCausalLM.from_config(config.text_config)
        else:
            language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)

        if language_model._no_split_modules is not None:
            self._no_split_modules.extend(language_model._no_split_modules)

        if language_model._keep_in_fp32_modules is not None:
            self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules)

        self.language_model = language_model

        modules = [
            nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size)
        ]
        for _ in range(1, config.mlp_config.num_hidden_layers):
            modules.append(nn.GELU())
            modules.append(
                nn.Linear(
                    config.text_config.hidden_size, config.text_config.hidden_size
                )
            )
        self.mlp = nn.Sequential(*modules)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _tie_weights(self):
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if (
            len(hf_device_map) > 1
            and "language_model" not in hf_device_map
            and torch.cuda.device_count() > 1
        ):
            # warn users about unexpected behavior when using multi-GPU + this model + `accelerate`.
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models.",
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = (
                True  # For `generate` compatibility
            )

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, LlavaForConditionalGenerationModelOutput]:
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # step 1: forward the images through the vision encoder
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            return_dict=return_dict,
            output_hidden_states=True,
        )
        # (bsz, seq len, hidden_size)
        image_embeds = vision_outputs.hidden_states[self.config.vision_select_layer]
        if self.config.vision_select_feature == "patch":
            image_embeds = image_embeds[:, 1:]
        elif self.config.vision_select_feature == "cls_patch":
            image_embeds = image_embeds
        else:
            raise ValueError(
                f"Unexpected select feature: {self.config.vision_select_feature}"
            )

        # step 2: forward the image embeddings through the mlp
        image_embeds = self.mlp(image_embeds)
        image_attention_mask = torch.ones(
            image_embeds.size()[:-1], device=image_embeds.device
        )
        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        # step 3: concatenate
        inputs_embeds = torch.cat(
            [image_embeds, inputs_embeds.to(image_embeds.device)],
            dim=1,
        )

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, device=input_ids.device)

        attention_mask = torch.cat(
            [image_attention_mask.to(attention_mask.device), attention_mask],
            dim=1,
        )

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            logits = outputs.logits if return_dict else outputs[0]
            loss = None
            # we compute the loss here since we need to take into account the sequence length of the query embeds
            if labels is not None:
                labels = labels.to(logits.device)
                logits = logits[:, -labels.size(1) :, :]
                # Shift so that tokens < n predict n
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous().to(logits.device)

                # Flatten the tokens
                loss_fct = CrossEntropyLoss(reduction="mean")

                loss = loss_fct(
                    shift_logits.view(-1, self.config.text_config.vocab_size),
                    shift_labels.view(-1),
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                labels=labels,
            )
            loss = outputs.loss if return_dict else outputs[0]
            logits = outputs.logits if return_dict else outputs[1]

        if not return_dict:
            output = (logits, vision_outputs, outputs)
            return ((loss,) + output) if loss is not None else output

        return LlavaForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            language_model_outputs=outputs,
        )

    def get_image_embeds(self, pixel_values: torch.FloatTensor):
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_hidden_states=True,
        )
        image_embeds = vision_outputs.hidden_states[self.config.vision_select_layer]
        if self.config.vision_select_feature == "patch":
            image_embeds = image_embeds[:, 1:]
        elif self.config.vision_select_feature == "cls_patch":
            image_embeds = image_embeds
        else:
            raise ValueError(
                f"Unexpected select feature: {self.config.vision_select_feature}"
            )

        image_embeds = self.mlp(image_embeds)
        image_attention_mask = torch.ones(
            image_embeds.size()[:-1], device=image_embeds.device
        )
        return dict(
            image_embeds=image_embeds,
            image_attention_mask=image_attention_mask,
        )

    def prepare_for_lm_generation(
        self,
        pixel_values: torch.FloatTensor,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        batch_size = pixel_values.shape[0]
        vision_outputs = self.get_image_embeds(pixel_values)
        image_embeds = vision_outputs["image_embeds"]
        image_attention_mask = vision_outputs["image_attention_mask"]

        if input_ids is None:
            input_ids = (
                torch.LongTensor([[self.config.text_config.bos_token_id]])
                .repeat(batch_size, 1)
                .to(image_embeds.device)
            )
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        attention_mask = torch.cat(
            [
                image_attention_mask,
                attention_mask.to(image_attention_mask.device),
            ],
            dim=1,
        )

        # concatenate query embeddings with prompt embeddings
        inputs_embeds = self.get_input_embeddings()(input_ids)
        inputs_embeds = torch.cat(
            [image_embeds, inputs_embeds.to(image_embeds.device)],
            dim=1,
        )
        return dict(inputs_embeds=inputs_embeds, attention_mask=attention_mask)

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        **generate_kwargs,
    ) -> torch.LongTensor:
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()
        encodings = self.prepare_for_lm_generation(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        outputs = self.language_model.generate(
            **encodings,
            **generate_kwargs,
        )
        return outputs
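`generate` prepends the projected image embeddings to the prompt embeddings and then delegates to the language model, so inference also needs a CLIP image processor and the GPT-NeoX tokenizer. A usage sketch assuming the processor and tokenizer are taken from the source checkpoints named in the config (neither ships in this commit); the repository id, image path, and prompt are placeholders:

```python
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoTokenizer, CLIPImageProcessor

repo_id = "<namespace>/<repo>"  # placeholder: the Hub repository containing this commit
model = AutoModelForVision2Seq.from_pretrained(repo_id, trust_remote_code=True)

image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
tokenizer = AutoTokenizer.from_pretrained("rinna/japanese-gpt-neox-small", use_fast=False)

image = Image.open("example.jpg").convert("RGB")
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
input_ids = tokenizer("これは", return_tensors="pt").input_ids  # illustrative prompt

# prepare_for_lm_generation builds inputs_embeds = [image tokens; prompt tokens],
# so language_model.generate returns only newly generated token ids here.
output_ids = model.generate(
    pixel_values=pixel_values,
    input_ids=input_ids,
    max_new_tokens=32,
)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])
```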