benchang1110
committed on
Upload config
Browse files- config.json +3 -0
- configuration_taivisionlm.py +110 -0
config.json
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
{
|
|
|
|
|
|
|
2 |
"hidden_size": 2048,
|
3 |
"ignore_index": -100,
|
4 |
"image_token_index": 32000,
|
|
|
1 |
{
|
2 |
+
"auto_map": {
|
3 |
+
"AutoConfig": "configuration_taivisionlm.TaiVisionLMConfig"
|
4 |
+
},
|
5 |
"hidden_size": 2048,
|
6 |
"ignore_index": -100,
|
7 |
"image_token_index": 32000,
|
configuration_taivisionlm.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""TraVisionLM configuration"""
|
2 |
+
|
3 |
+
from transformers import PretrainedConfig
|
4 |
+
from transformers import logging, CONFIG_MAPPING
|
5 |
+
import warnings
|
6 |
+
import transformers
|
7 |
+
|
8 |
+
logger = logging.get_logger(__name__)
|
9 |
+
|
10 |
+
class TaiVisionLMConfig(PretrainedConfig):
    """Configuration class for the TaiVisionLM vision-language model.

    Bundles a SigLIP-style vision encoder configuration and a causal-LM text
    configuration (defaulting to a TinyLlama-sized ``llama`` config) together
    with the projection / image-token bookkeeping used to splice image
    embeddings into the text sequence.

    Args:
        vision_config (`dict` or config object, *optional*):
            Vision tower config. A dict is resolved through ``CONFIG_MAPPING``
            (defaulting to ``"siglip_vision_model"`` when no ``model_type`` key
            is present); ``None`` builds a default SigLIP-base config.
        text_config (`dict` or config object, *optional*):
            Language model config. A dict is resolved through
            ``CONFIG_MAPPING`` (defaulting to ``"gpt2"`` when no ``model_type``
            key is present); ``None`` builds a default TinyLlama-style config.
        ignore_index (`int`, *optional*, defaults to -100):
            Label value ignored by the loss.
        image_token_idx (`int`, *optional*, defaults to 32000):
            Token id that marks the image placeholder in the input sequence.
        vocab_size (`int`, *optional*, defaults to 32001):
            Deprecated; kept for backward compatibility (see the
            ``vocab_size`` property). Use ``text_config.vocab_size`` instead.
        projection_dim (`int`, *optional*, defaults to 768):
            Dimension of the multimodal projection space; also written onto
            ``vision_config.projection_dim``.
        hidden_size (`int`, *optional*, defaults to 2048):
            Hidden size of the language model.
    """

    model_type = "taivisionlm"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        image_token_idx=32000,
        vocab_size=32001,
        projection_dim=768,
        hidden_size=2048,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_idx
        # Backing field for the deprecated `vocab_size` property below.
        self._vocab_size = vocab_size
        self.projection_dim = projection_dim
        self.hidden_size = hidden_size
        self.is_encoder_decoder = False

        # Resolve the vision tower config without mutating a caller-supplied
        # dict (the original wrote `model_type` back into it).
        if isinstance(vision_config, dict):
            vision_model_type = vision_config.get("model_type", "siglip_vision_model")
            self.vision_config = CONFIG_MAPPING[vision_model_type](**vision_config)
        elif vision_config is None:
            # Default: SigLIP-base, 224px images with 16px patches.
            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](
                attention_dropout=0.0,
                hidden_act="gelu_pytorch_tanh",
                hidden_size=768,
                image_size=224,
                intermediate_size=3072,
                layer_norm_eps=1e-06,
                num_attention_heads=12,
                num_channels=3,
                num_hidden_layers=12,
                patch_size=16,
            )
        else:
            # Already a config object; use it as-is.
            self.vision_config = vision_config

        # Resolve the language model config, again without mutating the input.
        if isinstance(text_config, dict):
            text_model_type = text_config.get("model_type", "gpt2")
            self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
        elif text_config is None:
            # Default: TinyLlama-1.1B-style hyperparameters.
            self.text_config = CONFIG_MAPPING["llama"](
                # Fixed typo: was `architecture=`, which HF does not recognize —
                # the standard attribute is `architectures`.
                architectures=["LlamaForCausalLM"],
                hidden_act="silu",
                attention_bias=False,
                attention_dropout=0.0,
                bos_token_id=1,
                eos_token_id=2,
                hidden_size=2048,
                initializer_range=0.02,
                intermediate_size=5632,
                max_position_embeddings=2048,
                model_type="llama",
                num_attention_heads=32,
                num_hidden_layers=22,
                num_key_value_heads=4,
                pretraining_tp=1,
                rms_norm_eps=1e-05,
                rope_scaling=None,
                rope_theta=10000.0,
                tie_word_embeddings=False,
                torch_dtype="bfloat16",
                transformers_version="4.40.2",
                use_cache=True,
                vocab_size=32000,
            )
        else:
            # Already a config object; use it as-is.
            self.text_config = text_config

        # One image token per vision patch (square grid).
        self.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2
        self.pad_token_id = self.text_config.pad_token_id
        self.vision_config.projection_dim = projection_dim
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Deprecated alias for ``text_config.vocab_size``; warns on access."""
        warnings.warn(
            "The `vocab_size` attribute is deprecated and will be removed in v4.44, Please use `text_config.vocab_size` instead.",
            FutureWarning,
        )
        return self._vocab_size

    @vocab_size.setter
    def vocab_size(self, value):
        self._vocab_size = value

    def to_dict(self):
        """Serialize the config, dropping the private ``_vocab_size`` field."""
        output = super().to_dict()
        output.pop("_vocab_size", None)
        return output
105 |
+
|
106 |
+
if __name__ == "__main__":
    # Build the default config, register the class so AutoConfig can resolve
    # it from the Hub, then publish it and keep a local copy alongside this
    # file.
    default_config = TaiVisionLMConfig()
    TaiVisionLMConfig.register_for_auto_class()
    default_config.push_to_hub("benchang1110/TaiVision-base")
    default_config.save_pretrained("./")
|