Cyril666 committed on
Commit
21cb29e
·
verified ·
1 Parent(s): 9b85062

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +10 -21
README.md CHANGED
@@ -46,38 +46,27 @@ Before training, the model parameters and architecture are initialized from [Sig
46
  ## 🤖 Quick Start
47
  ```python
48
  import torch
49
- from transformers import AutoModelForCausalLM, AutoProcessor, AutoModel, AutoImageProcessor
 
50
 
51
  model_name = "DAMO-NLP-SG/VL3-SigLIP-NaViT"
 
 
52
 
53
- model = AutoModelForCausalLM.from_pretrained(
54
  model_name,
55
  trust_remote_code=True,
56
  device_map="auto",
57
  torch_dtype=torch.bfloat16,
58
  attn_implementation="flash_attention_2",
59
  )
60
- processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
61
-
62
- # Video conversation
63
- conversation = [
64
- {"role": "system", "content": "You are a helpful assistant."},
65
- {
66
- "role": "user",
67
- "content": [
68
- {"type": "video", "data": {"video_path": "https://github.com/DAMO-NLP-SG/VideoLLaMA3/raw/refs/heads/main/assets/cat_and_chicken.mp4", "fps": 1, "max_frames": 128}},
69
- {"type": "text", "data": "What is the cat doing?"},
70
- ]
71
- },
72
- ]
73
-
74
- inputs = processor(conversation=conversation, return_tensors="pt")
75
- inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
76
  if "pixel_values" in inputs:
77
  inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
78
- output_ids = model.generate(**inputs, max_new_tokens=128)
79
- response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
80
- print(response)
81
  ```
82
 
83
 
 
46
  ## 🤖 Quick Start
47
  ```python
48
  import torch
49
+ from transformers import AutoModel, AutoImageProcessor
50
+ from transformers.image_utils import load_image
51
 
52
  model_name = "DAMO-NLP-SG/VL3-SigLIP-NaViT"
53
+ image_path = "https://github.com/DAMO-NLP-SG/VideoLLaMA3/blob/main/assets/sora.png?raw=true"
54
+ images = load_image(image_path)
55
 
56
+ model = AutoModel.from_pretrained(
57
  model_name,
58
  trust_remote_code=True,
59
  device_map="auto",
60
  torch_dtype=torch.bfloat16,
61
  attn_implementation="flash_attention_2",
62
  )
63
+ processor = AutoImageProcessor.from_pretrained(model_name, trust_remote_code=True)
64
+
65
+ inputs = processor(images=images, merge_size=1)
66
+ inputs = {k: torch.tensor(v).cuda() for k, v in inputs.items()}
 
 
 
 
 
 
 
 
 
 
 
 
67
  if "pixel_values" in inputs:
68
  inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
69
+ image_features = model(**inputs)
 
 
70
  ```
71
 
72