Rename xgen-mm-vid-inference-script.py to xgen-mm-vid-inference-script_hf.py

Browse files

Files changed (1) hide show

xgen-mm-vid-inference-script.py → xgen-mm-vid-inference-script_hf.py +17 -37

xgen-mm-vid-inference-script.py → xgen-mm-vid-inference-script_hf.py RENAMED Viewed

@@ -1,26 +1,13 @@
-# %%
-from modeling_xgenmm import *
-# %%
-cfg = XGenMMConfig()
-model = XGenMMModelForConditionalGeneration(cfg)
-model = model.cuda()
-model = model.half()
-# %%
-from transformers import AutoTokenizer, AutoImageProcessor
-xgenmm_path = "Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"
-tokenizer = AutoTokenizer.from_pretrained(
-    xgenmm_path, trust_remote_code=True, use_fast=False, legacy=False
-)
-image_processor = AutoImageProcessor.from_pretrained(
-    xgenmm_path, trust_remote_code=True
-)
 tokenizer = model.update_special_tokens(tokenizer)
-# model = model.to("cuda")
 model.eval()
 tokenizer.padding_side = "left"
 tokenizer.eos_token = "<|end|>"
@@ -34,9 +21,8 @@ import torchvision.io
 import math
 def sample_frames(vframes, num_frames):
-    frame_indice = np.linspace(0, len(vframes) - 1, num_frames, dtype=int)
     video = vframes[frame_indice]
     video_list = []
     for i in range(len(video)):
@@ -49,8 +35,7 @@ def generate(messages, images):
     # images = [Image.open(BytesIO(img_bytes)) for img_bytes in img_bytes_list]
     image_sizes = [image.size for image in images]
     # Similar operation in model_worker.py
-    image_tensor = [image_processor([img])["pixel_values"].to(model.device, dtype=torch.float16) for img in images]
     image_tensor = torch.stack(image_tensor, dim=1)
     image_tensor = image_tensor.squeeze(2)
@@ -101,23 +86,18 @@ def predict(video_file, num_frames=8):
     prompt = ""
     prompt = prompt + "<image>\n"
-    prompt = prompt + "Describe this video."
     messages = [{"role": "user", "content": prompt}]
     return generate(messages, images)
 # %%
-import torch
-your_checkpoint_path = ""
-sd = torch.load(your_checkpoint_path)
-model.load_state_dict(sd)
-# %%
-your_video_path = ""
 print(
     predict(
-        your_video_path,
-        num_frames = 16
     )
 )

+from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, LogitsProcessor
+import torch
+model_name_or_path = "Salesforce/xgen-mm-vid-phi3-mini-r-v1.5-128tokens-16frames"
+model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=False, legacy=False)
+image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
 tokenizer = model.update_special_tokens(tokenizer)
+model = model.to('cuda')
 model.eval()
 tokenizer.padding_side = "left"
 tokenizer.eos_token = "<|end|>"
 import math
 def sample_frames(vframes, num_frames):
+    frame_indice = np.linspace(int(num_frames/2), len(vframes) - int(num_frames/2), num_frames, dtype=int)
     video = vframes[frame_indice]
     video_list = []
     for i in range(len(video)):
     # images = [Image.open(BytesIO(img_bytes)) for img_bytes in img_bytes_list]
     image_sizes = [image.size for image in images]
     # Similar operation in model_worker.py
+    image_tensor = [image_processor([img])["pixel_values"].to(model.device, dtype=torch.float32) for img in images]
     image_tensor = torch.stack(image_tensor, dim=1)
     image_tensor = image_tensor.squeeze(2)
     prompt = ""
     prompt = prompt + "<image>\n"
+    # prompt = prompt + "What's the main gist of the video ?"
+    prompt = prompt + "Please describe the primary object or subject in the video, capturing their attributes, actions, positions, and movements."
     messages = [{"role": "user", "content": prompt}]
     return generate(messages, images)
 # %%
+video_path = ""
 print(
     predict(
+        video_path,
+        num_frames = 8
     )
 )
+# %%