|
import gradio as gr |
|
from gradio_client import Client, handle_file |
|
from huggingface_hub import InferenceClient |
|
from PIL import Image |
|
from threading import Thread |
|
import time |
|
|
|
|
|
# Gradio client for the "vikhyatk/moondream2" Space; the functions below call
# its "/answer_question" endpoint to get text about an image.
moondream_client = Client(src="vikhyatk/moondream2")

# Hugging Face inference client bound to the "Qwen/QwQ-32B-Preview" model; used
# for text chat completions.
qwq_client = InferenceClient(model="Qwen/QwQ-32B-Preview")
|
|
|
|
|
def describe_image(image, user_message):
    """Answer a question with QwQ, using a Moondream image description as context.

    When an image is supplied, ``moondream_client`` is asked to describe it via
    the Space's ``/answer_question`` endpoint and that description is placed on
    the line before the user's question. The combined text is then sent to
    ``qwq_client`` as a single user message.

    Args:
        image: Path of the image file to describe, or a falsy value (``None``,
            ``""``) when no image was supplied.
        user_message: The user's question. ``None`` is treated as an empty
            string.

    Returns:
        The text content of the first choice of the chat completion, or ``""``
        when neither an image nor a non-blank question was supplied.
    """
    question = user_message or ""

    # Nothing to describe and nothing to answer: avoid two pointless remote calls.
    if not image and not question.strip():
        return ""

    prompt_parts = []
    if image:
        # The endpoint's result is assumed to be a plain string — TODO confirm
        # against the Space's API page.
        description = moondream_client.predict(
            img=handle_file(image),
            prompt="Describe this image.",
            api_name="/answer_question",
        )
        prompt_parts.append(description)
    prompt_parts.append(question)

    qwq_result = qwq_client.chat_completion(
        messages=[{"role": "user", "content": "\n".join(prompt_parts)}],
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
    return qwq_result.choices[0].message.content
|
|
|
|
|
def _unpack_request(message, history):
    """Normalize the supported call shapes to ``(question, image_paths, past_turns)``."""
    if isinstance(message, dict):
        # Multimodal chat message: {"text": str, "files": [path | {"path": path}, ...]}
        question = message.get("text") or ""
        image_paths = [
            entry if isinstance(entry, str) else entry["path"]
            for entry in message.get("files") or []
        ]
        return question, image_paths, history
    if isinstance(history, str):
        # Positional (image_filepath, question_text) pair, as produced by a
        # gr.Interface with an Image(type="filepath") and a Textbox input.
        return history, ([message] if message else []), []
    # Plain-text chat: ``message`` is the text itself.
    return message or "", [], history


def _history_to_chat_messages(history):
    """Convert Gradio chat history into text-only chat-completion messages."""
    chat_messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages"-style entry: {"role": ..., "content": ...}
            role, content = turn.get("role"), turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                chat_messages.append({"role": role, "content": content})
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            # Pair-style entry: (user, assistant). File uploads carry a
            # non-string user part and are skipped, since only text is
            # forwarded to the chat model.
            user_part, assistant_part = turn
            if isinstance(user_part, str) and user_part:
                chat_messages.append({"role": "user", "content": user_part})
            if isinstance(assistant_part, str) and assistant_part:
                chat_messages.append({"role": "assistant", "content": assistant_part})
    return chat_messages


def chat_or_image(message, history, max_new_tokens=250):
    """Stream a QwQ reply to a text and/or image message.

    Every attached image is first described by ``moondream_client`` (the
    Space's ``/answer_question`` endpoint); the descriptions are placed on the
    lines before the user's text, and the conversation is streamed through
    ``qwq_client.chat_completion(stream=True)``.

    Accepted call shapes:
        * ``message`` is a dict with ``"text"`` and ``"files"`` keys and
          ``history`` is the chat history (multimodal chat).
        * ``message`` is an image file path (or ``None``) and ``history`` is
          the question string — the positional pair passed by a
          ``gr.Interface`` with image and textbox inputs.
        * ``message`` is a plain string and ``history`` is the chat history.

    Args:
        message: The incoming user input, in one of the shapes above.
        history: Previous turns, as ``(user, assistant)`` pairs or as
            ``{"role", "content"}`` dicts; non-text entries are ignored.
        max_new_tokens: Upper bound on generated tokens, forwarded as
            ``max_tokens``.

    Yields:
        The reply accumulated so far, once per streamed chunk that carries
        text. A single ``""`` is yielded when there is neither a non-blank
        question nor an image.
    """
    question, image_paths, past_turns = _unpack_request(message, history)

    if not question.strip() and not image_paths:
        # Nothing to answer; skip the remote calls entirely.
        yield ""
        return

    # Each result is assumed to be a plain string — TODO confirm against the
    # Space's API page.
    prompt_parts = [
        moondream_client.predict(
            img=handle_file(path),
            prompt="Describe this image.",
            api_name="/answer_question",
        )
        for path in image_paths
    ]
    if question:
        prompt_parts.append(question)

    chat_messages = _history_to_chat_messages(past_turns)
    chat_messages.append({"role": "user", "content": "\n".join(prompt_parts)})

    reply = ""
    stream = qwq_client.chat_completion(
        messages=chat_messages,
        max_tokens=max_new_tokens,
        stream=True,
    )
    for chunk in stream:
        # Some chunks carry no choices or no text delta; skip those.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            reply += piece
            yield reply
|
|
|
|
|
# Gradio UI definition. The two input components are handed to ``fn``
# positionally, in list order: the uploaded image's file path, then the textbox
# text.
# NOTE(review): chat_or_image's parameters are named (message, history) —
# confirm it accepts the (image path, text) pair passed here.
# NOTE(review): live=True is expected to re-run ``fn`` whenever an input
# changes, so every edit may trigger the remote model calls — confirm this is
# intended.
demo = gr.Interface(
    title="Multimodal Llama Chatbot",
    description="Interact with the Llama chatbot. Upload an image, ask a question, or both!",
    fn=chat_or_image,
    inputs=[
        gr.Image(label="Upload image (Optional)", type="filepath"),
        gr.Textbox(lines=2, placeholder="Ask...", label="Ask anything"),
    ],
    outputs="text",
    live=True,
)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(show_error=True)
|
|