Update README.md
Browse files
README.md
CHANGED
@@ -46,38 +46,27 @@ Before training, the model parameters and architecture are initialized from [Sig
|
|
46 |
## 🤖 Quick Start
|
47 |
```python
|
48 |
import torch
|
49 |
-
from transformers import AutoModelForCausalLM, AutoProcessor
|
|
|
50 |
|
51 |
model_name = "DAMO-NLP-SG/VL3-SigLIP-NaViT"
|
|
|
|
|
52 |
|
53 |
-
model = AutoModelForCausalLM.from_pretrained(
|
54 |
model_name,
|
55 |
trust_remote_code=True,
|
56 |
device_map="auto",
|
57 |
torch_dtype=torch.bfloat16,
|
58 |
attn_implementation="flash_attention_2",
|
59 |
)
|
60 |
-
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
|
61 |
-
|
62 |
-
|
63 |
-
conversation = [
|
64 |
-
{"role": "system", "content": "You are a helpful assistant."},
|
65 |
-
{
|
66 |
-
"role": "user",
|
67 |
-
"content": [
|
68 |
-
{"type": "video", "data": {"video_path": "https://github.com/DAMO-NLP-SG/VideoLLaMA3/raw/refs/heads/main/assets/cat_and_chicken.mp4", "fps": 1, "max_frames": 128}},
|
69 |
-
{"type": "text", "data": "What is the cat doing?"},
|
70 |
-
]
|
71 |
-
},
|
72 |
-
]
|
73 |
-
|
74 |
-
inputs = processor(conversation=conversation, return_tensors="pt")
|
75 |
-
inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
|
76 |
if "pixel_values" in inputs:
|
77 |
inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
|
78 |
-
|
79 |
-
response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
|
80 |
-
print(response)
|
81 |
```
|
82 |
|
83 |
|
|
|
46 |
## 🤖 Quick Start
|
47 |
```python
|
48 |
import torch
|
49 |
+
from transformers import AutoModel, AutoImageProcessor
|
50 |
+
from transformers.image_utils import load_image
|
51 |
|
52 |
model_name = "DAMO-NLP-SG/VL3-SigLIP-NaViT"
|
53 |
+
image_path = "https://github.com/DAMO-NLP-SG/VideoLLaMA3/blob/main/assets/sora.png?raw=true"
|
54 |
+
images = load_image(image_path)
|
55 |
|
56 |
+
model = AutoModel.from_pretrained(
|
57 |
model_name,
|
58 |
trust_remote_code=True,
|
59 |
device_map="auto",
|
60 |
torch_dtype=torch.bfloat16,
|
61 |
attn_implementation="flash_attention_2",
|
62 |
)
|
63 |
+
processor = AutoImageProcessor.from_pretrained(model_name, trust_remote_code=True)
|
64 |
+
|
65 |
+
inputs = processor(images=images, merge_size=1)
|
66 |
+
inputs = {k: torch.tensor(v).cuda() for k, v in inputs.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
if "pixel_values" in inputs:
|
68 |
inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
|
69 |
+
image_features = model(**inputs)
|
|
|
|
|
70 |
```
|
71 |
|
72 |
|