MC-LLaVA-3b / configuration_llava.py
visheratin's picture
Upload folder using huggingface_hub
1ceb68b
raw
history blame
1.34 kB
# coding=utf-8
from transformers.configuration_utils import PretrainedConfig
from open_clip import get_model_config
from configuration_phi import PhiConfig
class LlavaConfig(PretrainedConfig):
    """Configuration for a LLaVA-style multimodal model.

    Combines a text backbone (``PhiConfig``) with an OpenCLIP vision tower
    (looked up by name via ``open_clip.get_model_config``) joined by a
    multimodal projector.
    """

    model_type = "llava"
    is_composition = False

    def __init__(
        self,
        text_config=None,
        vision_tower_name="ViT-SO400M-14-SigLIP-384",
        ignore_index=-100,
        image_token_index=50297,
        projector_hidden_act="gelu",
        projector_tokens_num=1,
        vocab_size=51200,
        **kwargs,
    ):
        """
        Args:
            text_config: Configuration for the text backbone. When given as a
                dict it is converted to a ``PhiConfig``; may also be ``None``
                or an already-built config object.
            vision_tower_name: OpenCLIP model name used to look up the vision
                tower configuration.
            ignore_index: Label value ignored by the loss computation.
            image_token_index: Vocabulary id of the image placeholder token.
            projector_hidden_act: Activation function of the projector.
            projector_tokens_num: Number of tokens emitted by the projector.
            vocab_size: Fallback vocabulary size; overridden by
                ``text_config.vocab_size`` when ``text_config`` is a dict.
            **kwargs: Forwarded to ``PretrainedConfig``.
        """
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
        self.projector_tokens_num = projector_tokens_num
        self.vocab_size = vocab_size
        self.vision_tower_name = vision_tower_name

        # Resolve the vision tower's embedding width from open_clip's model
        # registry. NOTE(review): get_model_config returns None for unknown
        # names, which would raise a TypeError here — confirm the name is
        # always valid at call sites.
        vision_config = get_model_config(vision_tower_name)
        self.vision_embed_dim = vision_config["embed_dim"]

        self.text_config = text_config
        if isinstance(self.text_config, dict):
            # NOTE(review): the default model_type is "llama" although the
            # dict is then fed to PhiConfig; kept as-is for backward
            # compatibility with existing serialized configs.
            text_config.setdefault("model_type", "llama")
            self.text_config = PhiConfig(**text_config)
            # The text backbone's vocabulary size is authoritative.
            self.vocab_size = self.text_config.vocab_size

        super().__init__(**kwargs)