benchang1110 committed

Upload TaiVisionForCausalLM

Files changed:
- config.json (+6 -1)
- model.safetensors (+1 -1)
- modeling_taivisionlm.py (+9 -40)
config.json
CHANGED
@@ -1,6 +1,10 @@
 {
+  "architectures": [
+    "TaiVisionForCausalLM"
+  ],
   "auto_map": {
-    "AutoConfig": "configuration_taivisionlm.TaiVisionLMConfig"
+    "AutoConfig": "configuration_taivisionlm.TaiVisionLMConfig",
+    "AutoModelForCausalLM": "modeling_taivisionlm.TaiVisionForCausalLM"
   },
   "hidden_size": 2048,
   "ignore_index": -100,
@@ -21,6 +25,7 @@
     "torch_dtype": "bfloat16",
     "vocab_size": 32001
   },
+  "torch_dtype": "float32",
   "transformers_version": "4.44.0",
   "vision_config": {
     "model_type": "siglip_vision_model",
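With the new "auto_map" entry, the custom model class can be resolved through the standard Auto API instead of being imported by hand. A minimal sketch of what this enables, assuming the repo id `benchang1110/TaiVision-base` taken from the test block removed further down (trust_remote_code=True is required because both classes live in the repo, not in the transformers library):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# "AutoConfig" in auto_map points at configuration_taivisionlm.TaiVisionLMConfig,
# "AutoModelForCausalLM" at modeling_taivisionlm.TaiVisionForCausalLM.
config = AutoConfig.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
```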
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:11d50e45bc0203fb3be9a06add95e21a024690098db67cd7b97f29ae03c2bb57
 size 4806424752
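For reference, the `oid sha256:` line in a git-LFS pointer is the SHA-256 digest of the actual weights file, so a local download can be checked against the hash recorded in this commit. An illustrative sketch (not part of the commit; the local path is an assumption):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Hash the file in chunks so a ~4.8 GB safetensors file fits in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "11d50e45bc0203fb3be9a06add95e21a024690098db67cd7b97f29ae03c2bb57"
assert sha256_of("model.safetensors") == expected  # assumed local path
```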
modeling_taivisionlm.py
CHANGED
@@ -156,18 +156,17 @@ class TaiVisionForCausalLM(TaiVisionPreTrainedModel):
         self.language_model = language_model
         self.post_init()
 
-    def 
-        '''
-        load the pretrained weights for language model and vision model
-        '''
-        import transformers
-        language_model = AutoModelForCausalLM.from_pretrained("benchang1110/Taiwan-tinyllama-v1.0-chat")
+    def load_language_model(self, model_id = "benchang1110/Taiwan-tinyllama-v1.0-chat"):
+        language_model = AutoModelForCausalLM.from_pretrained(model_id)
         if language_model.vocab_size != self.vocab_size:
             print("vocab size mismatch, resize the token embeddings for the pretained language model")
             language_model.resize_token_embeddings(self.vocab_size)
-        self.language_model
-
-
+        self.language_model.load_state_dict(language_model.state_dict(),strict=True)
+
+    def load_vision_model(self,model_id = "google/siglip-base-patch16-224"):
+        import transformers
+        vision_model = transformers.SiglipVisionModel.from_pretrained(model_id)
+        self.vision_tower.load_state_dict(vision_model.state_dict(),strict=True)
 
     # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.get_input_embeddings with PaliGemma->TaiVisionLM
     def get_input_embeddings(self):
@@ -439,34 +438,4 @@ class TaiVisionForCausalLM(TaiVisionPreTrainedModel):
         if cache_position[0] == 0:
             model_inputs["pixel_values"] = pixel_values
 
-        return model_inputs
-
-
-
-if __name__ == '__main__':
-    import transformers
-    config = transformers.AutoConfig.from_pretrained("benchang1110/TaiVision-base",trust_remote_code=True)
-    model = TaiVisionForCausalLM(config).to("cuda")
-    print(model)
-    model.save_pretrained
-    # Test forward
-    import torch
-    from PIL import Image
-    import requests
-    # Initialize processor
-    processor = transformers.AutoProcessor.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
-
-    # Load image
-    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
-    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
-
-    # Define prompt and label
-    prompt = "What is the color of the car?"
-    label = "I am fine, thank you."
-
-    # Process inputs
-    inputs = processor(prompts=prompt,images=image, return_tensors="pt", padding=False, max_length=512).to('cuda')
-
-    outputs = model.generate(**inputs, max_length=512, do_sample=True, pad_token_id=processor.tokenizer.pad_token_id)
-    print(processor.decode(outputs[0], skip_special_tokens=True))
-
+        return model_inputs
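The two new methods replace the removed docstring-only loader: each instantiates a pretrained checkpoint and copies its weights into the already-constructed submodule with load_state_dict. A hedged sketch of how they would be used together, reusing the repo id from the removed __main__ block (the exact ids are assumptions, and the default model_id arguments come straight from the diff):

```python
import transformers

# Build the composite model from its config, then fill in pretrained weights.
config = transformers.AutoConfig.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
model = TaiVisionForCausalLM(config)

model.load_language_model()  # copies benchang1110/Taiwan-tinyllama-v1.0-chat into self.language_model
model.load_vision_model()    # copies google/siglip-base-patch16-224 into self.vision_tower
```

Loading via state_dict with strict=True means both submodules must match the donor checkpoints exactly, which is why load_language_model resizes the token embeddings first when the vocab sizes differ.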