import gradio as gr
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
from transformers.image_utils import load_image
from threading import Thread
import time
import torch
import spaces
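# `spaces` is the Hugging Face Spaces helper package; its @spaces.GPU decorator
# requests GPU time for the decorated function on ZeroGPU hardware.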

MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to("cuda").eval()
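# Note: torch.bfloat16 is an alternative dtype on Ampere-or-newer GPUs and is
# generally more numerically stable than float16 for this model family; this is
# an optional tweak, not a requirement of the demo.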

@spaces.GPU
def model_inference(input_dict, history):
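    """Stream a Qwen2.5-VL response for one multimodal chat turn.

    `input_dict` comes from gr.MultimodalTextbox as {"text": str, "files": [paths]}.
    `history` is supplied by gr.ChatInterface but is unused here, so each turn is
    answered without prior conversational context.
    """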
    text = input_dict["text"]
    files = input_dict["files"]

    # Load any attached images (`files` is a list of local file paths)
    images = [load_image(path) for path in files]

    # Validate input (gr.Error must be raised, not merely constructed,
    # for the message to surface in the UI)
    if text == "" and not images:
        raise gr.Error("Please input a query and optionally image(s).")
    if text == "" and images:
        raise gr.Error("Please input a text query along with the image(s).")

    # Prepare messages for the model
    messages = [
        {
            "role": "user",
            "content": [
                *[{"type": "image", "image": image} for image in images],
                {"type": "text", "text": text},
            ],
        }
    ]

    # Apply chat template and process inputs
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt],
        images=images if images else None,
        return_tensors="pt",
        padding=True,
    ).to("cuda")

    # Set up streamer for real-time output
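    # Passing the processor works because HF processors forward decode()/batch_decode()
    # to their underlying tokenizer; processor.tokenizer would be equivalent here.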
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

    # Start generation in a separate thread
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
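    # generate() blocks until completion, so it runs on the worker thread while the
    # loop below drains the streamer; iteration ends once generation finishes.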

    # Stream the output
    buffer = ""
    yield "..."
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer


# Example inputs
examples = [
    [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
    [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
    [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
    [{"text": "What art era do these artpieces belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
    [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
    [{"text": "What does this say?", "files": ["example_images/math.jpg"]}],
    [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
    [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
]

demo = gr.ChatInterface(
    fn=model_inference,
    title="# **Qwen2.5-VL-3B-Instruct**",
    examples=examples,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
)

demo.launch(debug=True)
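# Run locally with `python app.py` (a CUDA GPU is required by the .to("cuda") calls);
# debug=True keeps the process in the foreground and prints server errors to the console.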