THUdyh committed
Commit 223aac8
1 Parent(s): 0e70eb4

Update app.py

Files changed (1)
  1. app.py +92 -38
app.py CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
 import torch
 import re
+import os
 from decord import VideoReader, cpu
 from PIL import Image
 import numpy as np
@@ -12,7 +13,6 @@ import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

 import sys
-# sys.path.append('/mnt/lzy/oryx-demo')
 from oryx.conversation import conv_templates, SeparatorStyle
 from oryx.model.builder import load_pretrained_model
 from oryx.utils import disable_torch_init
@@ -83,14 +83,23 @@ def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_im
     return input_ids

 @spaces.GPU(duration=120)
-def oryx_inference(video, text):
-    vr = VideoReader(video, ctx=cpu(0))
-    total_frame_num = len(vr)
-    fps = round(vr.get_avg_fps())
-    uniform_sampled_frames = np.linspace(0, total_frame_num - 1, 64, dtype=int)
-    frame_idx = uniform_sampled_frames.tolist()
-    spare_frames = vr.get_batch(frame_idx).asnumpy()
-    video = [Image.fromarray(frame) for frame in spare_frames]
+def oryx_inference(multimodal):
+    visual, text = multimodal["files"][0], multimodal["text"]
+    if visual.endswith(".mp4"):
+        modality = "video"
+    else:
+        modality = "image"
+    if modality == "video":
+        vr = VideoReader(visual, ctx=cpu(0))
+        total_frame_num = len(vr)
+        fps = round(vr.get_avg_fps())
+        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, 64, dtype=int)
+        frame_idx = uniform_sampled_frames.tolist()
+        spare_frames = vr.get_batch(frame_idx).asnumpy()
+        video = [Image.fromarray(frame) for frame in spare_frames]
+    else:
+        image = [Image.open(visual)]
+        image_sizes = [image[0].size]

     conv_mode = "qwen_1_5"

@@ -104,39 +113,73 @@ def oryx_inference(video, text):

     input_ids = preprocess_qwen([{'from': 'human','value': question},{'from': 'gpt','value': None}], tokenizer, has_image=True).to(device)

-    video_processed = []
-    for idx, frame in enumerate(video):
+    if modality == "video":
+        video_processed = []
+        for idx, frame in enumerate(video):
+            image_processor.do_resize = False
+            image_processor.do_center_crop = False
+            frame = process_anyres_video_genli(frame, image_processor)
+
+            if frame_idx is not None and idx in frame_idx:
+                video_processed.append(frame.unsqueeze(0))
+            elif frame_idx is None:
+                video_processed.append(frame.unsqueeze(0))
+
+        if frame_idx is None:
+            frame_idx = np.arange(0, len(video_processed), dtype=int).tolist()
+
+        video_processed = torch.cat(video_processed, dim=0).bfloat16().to(device)
+        video_processed = (video_processed, video_processed)
+
+        video_data = (video_processed, (384, 384), "video")
+    else:
         image_processor.do_resize = False
         image_processor.do_center_crop = False
-        frame = process_anyres_video_genli(frame, image_processor)
-
-        if frame_idx is not None and idx in frame_idx:
-            video_processed.append(frame.unsqueeze(0))
-        elif frame_idx is None:
-            video_processed.append(frame.unsqueeze(0))
-
-    if frame_idx is None:
-        frame_idx = np.arange(0, len(video_processed), dtype=int).tolist()
-
-    video_processed = torch.cat(video_processed, dim=0).bfloat16().to(device)
-    video_processed = (video_processed, video_processed)
-
-    video_data = (video_processed, (384, 384), "video")
+        image_tensor, image_highres_tensor = [], []
+        for visual in image:
+            image_tensor_, image_highres_tensor_ = process_anyres_highres_image_genli(visual, image_processor)
+            image_tensor.append(image_tensor_)
+            image_highres_tensor.append(image_highres_tensor_)
+        if all(x.shape == image_tensor[0].shape for x in image_tensor):
+            image_tensor = torch.stack(image_tensor, dim=0)
+        if all(x.shape == image_highres_tensor[0].shape for x in image_highres_tensor):
+            image_highres_tensor = torch.stack(image_highres_tensor, dim=0)
+        if type(image_tensor) is list:
+            image_tensor = [_image.bfloat16().to(device) for _image in image_tensor]
+        else:
+            image_tensor = image_tensor.bfloat16().to(device)
+        if type(image_highres_tensor) is list:
+            image_highres_tensor = [_image.bfloat16().to(device) for _image in image_highres_tensor]
+        else:
+            image_highres_tensor = image_highres_tensor.bfloat16().to(device)

     stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
     keywords = [stop_str]

     with torch.inference_mode():
-        output_ids = model.generate(
-            inputs=input_ids,
-            images=video_data[0][0],
-            images_highres=video_data[0][1],
-            modalities=video_data[2],
-            do_sample=False,
-            temperature=0,
-            max_new_tokens=1024,
-            use_cache=True,
-        )
+        if modality == "video":
+            output_ids = model.generate(
+                inputs=input_ids,
+                images=video_data[0][0],
+                images_highres=video_data[0][1],
+                modalities=video_data[2],
+                do_sample=False,
+                temperature=0,
+                max_new_tokens=1024,
+                use_cache=True,
+            )
+        else:
+            output_ids = model.generate(
+                inputs=input_ids,
+                images=image_tensor,
+                images_highres=image_highres_tensor,
+                image_sizes=image_sizes,
+                modalities=['image'],
+                do_sample=False,
+                temperature=0,
+                max_new_tokens=1024,
+                use_cache=True,
+            )


     outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
@@ -147,12 +190,23 @@ def oryx_inference(video, text):
     return outputs

 # Define input and output for the Gradio interface
+cur_dir = os.path.dirname(os.path.abspath(__file__))
 demo = gr.Interface(
     fn=oryx_inference,
-    inputs=[gr.Video(label="Input Video"), gr.Textbox(label="Input Text")],
+    inputs=gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload file..."),
     outputs="text",
-    title="Oryx Inference",
-    description="This is a demo for Oryx inference."
+    examples=[
+        {
+            "files":[f"{cur_dir}/case/case1.mp4"],
+            "text":"Describe what is happening in this video in detail.",
+        },
+        {
+            "files":[f"{cur_dir}/case/image.png"],
+            "text":"Describe this icon.",
+        },
+    ],
+    title="Oryx Demo",
+    description="A huggingface space for Oryx-7B."
 )

 # Launch the Gradio app
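For context on the change: gr.MultimodalTextbox hands the wrapped function a single dict with "text" and "files" keys, and the new oryx_inference routes on the uploaded file's extension. The sketch below shows only that wiring, with the model call stubbed out (describe_visual is a hypothetical stand-in, not part of this commit):

import gradio as gr

def describe_visual(multimodal: dict) -> str:
    # Hypothetical stand-in for oryx_inference: MultimodalTextbox passes
    # {"text": str, "files": [file paths]} to the handler.
    visual, text = multimodal["files"][0], multimodal["text"]  # assumes one file was uploaded
    modality = "video" if visual.endswith(".mp4") else "image"
    # The real handler preprocesses the file and calls model.generate();
    # here we only echo how the request was routed.
    return f"modality={modality}, prompt={text!r}, file={visual}"

demo = gr.Interface(
    fn=describe_visual,
    inputs=gr.MultimodalTextbox(file_types=[".mp4", "image"],
                                placeholder="Enter message or upload file..."),
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()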
 
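The video branch keeps the original sampling strategy: 64 indices spread evenly across the clip with np.linspace, decoded in a single decord batch and converted to PIL images for the processor. A self-contained sketch of just that step, assuming only decord, NumPy, and Pillow (the function name, path, and frame count are illustrative):

import numpy as np
from decord import VideoReader, cpu
from PIL import Image

def sample_frames(video_path: str, num_frames: int = 64):
    # Decode on CPU and sample num_frames indices uniformly from first to last frame.
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    frame_idx = np.linspace(0, total_frame_num - 1, num_frames, dtype=int).tolist()
    frames = vr.get_batch(frame_idx).asnumpy()   # (num_frames, H, W, 3) uint8 array
    return [Image.fromarray(f) for f in frames]  # PIL images for the image processor

# Example call (path is a placeholder): frames = sample_frames("case/case1.mp4")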