import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import subprocess
# Install flash-attn at startup; FLASH_ATTENTION_SKIP_CUDA_BUILD skips
# compiling the CUDA kernels during the pip build.
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
# Model and Processor Loading (Done once at startup)
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to("cuda").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
DESCRIPTION = "[Qwen2-VL-7B Demo](https://huggingface.co./Qwen/Qwen2-VL-7B-Instruct)"
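# On ZeroGPU Spaces, @spaces.GPU attaches a GPU only for the duration of each call.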
@spaces.GPU
def qwen_inference(media_path, text_input=None):
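    # Decide image vs. video from the file extension; PIL's registered
    # extensions cover images, and a fixed allowlist covers video containers.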
    image_extensions = Image.registered_extensions()
    if media_path.endswith(tuple(image_extensions)):
        media_type = "image"
    elif media_path.endswith(
        (".avi", ".mp4", ".mov", ".mkv", ".flv", ".wmv", ".mjpeg", ".gif", ".webm", ".m4v", ".3gp")
    ):
        media_type = "video"
    else:
        raise ValueError("Unsupported media type. Please upload an image or video.")
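    # Build the chat-format message expected by Qwen2-VL; for videos, fps
    # tells qwen_vl_utils how densely to sample frames.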
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": media_type,
                    media_type: media_path,
                    **({"fps": 8.0} if media_type == "video" else {}),
                },
                {"type": "text", "text": text_input},
            ],
        }
    ]
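    # Render the chat template into a prompt string, extract the vision
    # inputs, and pack everything into GPU tensors.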
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")
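    # Generate, then strip the prompt tokens from each sequence so only the
    # newly generated reply is decoded.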
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return output_text
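# Scrollable, bordered panel for the model output.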
css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Image/Video Input"):
        with gr.Row():
            with gr.Column():
                input_media = gr.File(label="Upload Image or Video", type="filepath")
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
        submit_btn.click(qwen_inference, [input_media, text_input], [output_text])
demo.launch(debug=True)