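# Hugging Face Space: qwen-vl / app.py
# Author: artificialguybr -- commit 789b1d4 ("Update app.py"), 2.66 kB
# (The "raw" / "history blame" page links from the web view are not part of the program.)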
import os
import re
import tempfile
from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
# Checkpoint used for both the tokenizer and the model.
#
# The request handler in this file calls tokenizer.from_list_format() and
# tokenizer.draw_bbox_on_latest_picture(); those helpers come from the remote
# code shipped with the Qwen-VL checkpoints, so a text-only Qwen checkpoint
# cannot serve this demo.
# NOTE(review): the Int4 chat variant is chosen to keep the quantized memory
# footprint of the previous setting; "Qwen/Qwen-VL" is the unquantized base
# alternative -- confirm which one the Space hardware should run.
MODEL_ID = "Qwen/Qwen-VL-Chat-Int4"

# trust_remote_code=True is required because the checkpoint ships its own
# tokenizer/model classes.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# device_map="auto" lets the loader place the weights; .eval() switches the
# model to inference mode.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    trust_remote_code=True,
).eval()
def generate_predictions(image_input, text_input, with_grounding):
    """Run the vision-language model on one image/prompt pair.

    Args:
        image_input: The uploaded image as a NumPy array, or ``None`` when the
            user submitted the form without an image. Integer arrays are taken
            as 0-255 pixel values; floating-point arrays are assumed to be in
            the 0.0-1.0 range (TODO confirm against the Gradio version in use).
        text_input: The prompt typed by the user.
        with_grounding: ``"Yes"`` appends ``" with grounding"`` to the prompt;
            any other value leaves the prompt unchanged.

    Returns:
        A ``(image, text)`` tuple: the image with bounding boxes drawn on it
        when the response contains box coordinates (otherwise the uploaded
        image), and the model response with markup, picture labels, the image
        path and the prompt stripped out. When no image was supplied the tuple
        is ``(None, <hint message>)``.
    """
    if image_input is None:
        # Nothing to describe; answer with a hint instead of crashing.
        return None, "Please upload an image before submitting."

    # Convert to 8-bit pixels without altering their values. The previous
    # `255 - (x * 255)` arithmetic wrapped around for uint8 input (pure black
    # turned white, every other value shifted by one) and inverted float input.
    if image_input.dtype.kind == "f":
        pixels = (image_input.clip(0.0, 1.0) * 255).round().astype("uint8")
    else:
        pixels = image_input.astype("uint8", copy=False)
    # JPEG cannot store an alpha channel, so normalise the mode before saving.
    original_image = Image.fromarray(pixels).convert("RGB")

    if with_grounding == "Yes":
        text_input += " with grounding"

    # A per-request scratch directory keeps concurrent requests from
    # overwriting each other's files and is removed when the block exits.
    # The image file has to exist while the tokenizer/model work with the
    # query, because the query only carries its path.
    with tempfile.TemporaryDirectory() as work_dir:
        user_image_path = os.path.join(work_dir, "user_input.jpg")
        original_image.save(user_image_path)

        query = tokenizer.from_list_format([
            {'image': user_image_path},
            {'text': text_input},
        ])
        inputs = tokenizer(query, return_tensors='pt').to(model.device)
        pred = model.generate(**inputs)
        full_response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

        # Drop "Picture N:" labels and <...> markup, then the exact image path
        # and the prompt, so only the model's own text is shown to the user.
        without_markup = re.sub(r'Picture \d+:|<.*?>', '', full_response)
        frontend_response = (
            without_markup
            .replace(user_image_path, '')
            .replace(text_input, '')
            .strip()
        )
        print("Generated Caption:", frontend_response)  # Debugging line

        # Default to the uploaded image; only swap in the annotated picture
        # when the response carries "(x1,y1),(x2,y2)" box coordinates.
        image_with_boxes = original_image
        if re.search(r'\(\d+,\d+\),\(\d+,\d+\)', frontend_response):
            annotated = tokenizer.draw_bbox_on_latest_picture(full_response)
            if annotated is not None:
                boxed_path = os.path.join(work_dir, "image_with_boxes.jpg")
                annotated.save(boxed_path)
                # copy() forces the pixels into memory so the file can be
                # closed and deleted together with the scratch directory.
                with Image.open(boxed_path) as boxed:
                    image_with_boxes = boxed.copy()

    return image_with_boxes, frontend_response
# Gradio UI: an image, a prompt and a grounding switch go in; the (possibly
# annotated) image and the generated text come out.
# Components are taken from the top-level gradio namespace with `value=` for
# defaults; the older `gr.inputs.*` / `gr.outputs.*` namespaces and their
# `default=` keyword are deprecated and absent from newer Gradio releases.
iface = gr.Interface(
    fn=generate_predictions,
    inputs=[
        # type="numpy" hands the handler a NumPy array (None if left empty).
        gr.Image(type="numpy", label="Image Input"),
        gr.Textbox(value="Generate a caption for that image:", label="Prompt"),
        gr.Radio(["No", "Yes"], value="No", label="With Grounding"),
    ],
    outputs=[
        gr.Image(type="pil", label="Image"),
        gr.Textbox(label="Generated"),
    ],
    title="Qwen-VL Demonstration",
    description="""
## Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud
**Space by [@Artificialguybr](https://twitter.com/artificialguybr)**
### Key Features:
- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.
- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.
- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.
""",
)

# Start the web server; this call runs as soon as the module is executed.
iface.launch()