DAMO-NLP-SG
/

VL3-SigLIP-NaViT

Image Feature Extraction

videollama3_vision_encoder

feature-extraction

multi-modal-large-language-model

Model card · Files and versions · Community

Cyril666 committed about 7 hours ago

Commit

21cb29e

·

verified ·

1 Parent(s): 9b85062

Update README.md

Files changed (1) hide show

README.md +10 -21

README.md CHANGED Viewed

@@ -46,38 +46,27 @@ Before training, the model parameters and architecture are initialized from [Sig
 ## 🤖 Quick Start
 ```python
 import torch
-from transformers import AutoModelForCausalLM, AutoProcessor, AutoModel, AutoImageProcessor
 model_name = "DAMO-NLP-SG/VL3-SigLIP-NaViT"
-model = AutoModelForCausalLM.from_pretrained(
     model_name,
     trust_remote_code=True,
     device_map="auto",
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
 )
-processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
-# Video conversation
-conversation = [
-    {"role": "system", "content": "You are a helpful assistant."},
-    {
-        "role": "user",
-        "content": [
-            {"type": "video", "data": {"video_path": "https://github.com/DAMO-NLP-SG/VideoLLaMA3/raw/refs/heads/main/assets/cat_and_chicken.mp4", "fps": 1, "max_frames": 128}},
-            {"type": "text", "data": "What is the cat doing?"},
-        ]
-    },
-]
-inputs = processor(conversation=conversation, return_tensors="pt")
-inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
 if "pixel_values" in inputs:
     inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
-output_ids = model.generate(**inputs, max_new_tokens=128)
-response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
-print(response)
 ```

 ## 🤖 Quick Start
 ```python
 import torch
+from transformers import AutoModel, AutoImageProcessor
+from transformers.image_utils import load_image
 model_name = "DAMO-NLP-SG/VL3-SigLIP-NaViT"
+image_path = "https://github.com/DAMO-NLP-SG/VideoLLaMA3/blob/main/assets/sora.png?raw=true"
+images = load_image(image_path)
+model = AutoModel.from_pretrained(
     model_name,
     trust_remote_code=True,
     device_map="auto",
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
 )
+processor = AutoImageProcessor.from_pretrained(model_name, trust_remote_code=True)
+inputs = processor(images=images, merge_size=1)
+inputs = {k: torch.tensor(v).cuda() for k, v in inputs.items()}
 if "pixel_values" in inputs:
     inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
+image_features = model(**inputs)
 ```