# coding=utf-8
from transformers.configuration_utils import PretrainedConfig
from open_clip import get_model_config

from configuration_phi import PhiConfig


class LlavaConfig(PretrainedConfig):
    model_type = "llava"
    is_composition = False

    def __init__(
        self,
        text_config=None,
        vision_tower_name="ViT-SO400M-14-SigLIP-384",
        ignore_index=-100,
        image_token_index=50297,
        projector_hidden_act="gelu",
        projector_tokens_num=1,
        vocab_size=51200,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
        self.projector_tokens_num = projector_tokens_num
        self.vocab_size = vocab_size

        # Look up the vision tower's embedding width from the open_clip model config.
        self.vision_tower_name = vision_tower_name
        vision_config = get_model_config(vision_tower_name)
        self.vision_embed_dim = vision_config["embed_dim"]

        # If the text config arrives as a plain dict, materialize it as a PhiConfig
        # and take the vocabulary size from the resolved config.
        self.text_config = text_config
        if isinstance(self.text_config, dict):
            text_config["model_type"] = text_config.get("model_type", "llama")
            self.text_config = PhiConfig(**text_config)
            self.vocab_size = self.text_config.vocab_size

        super().__init__(**kwargs)
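

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original file): shows how LlavaConfig
    # resolves its derived fields. It assumes `open_clip` is installed with the
    # "ViT-SO400M-14-SigLIP-384" model config available and that
    # `configuration_phi.py` is importable from this directory; the dict values
    # below are illustrative.
    demo_text_config = {"model_type": "phi", "vocab_size": 51200}

    config = LlavaConfig(text_config=demo_text_config)
    print(config.vision_embed_dim)  # embed_dim read from the open_clip model config
    print(config.vocab_size)        # taken from the resolved PhiConfig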