|
import gradio as gr |
|
from gradio_client import Client, handle_file |
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
# Gradio Space client for the Moondream2 vision-language model,
# used to answer questions about uploaded images.
moondream_client = Client("vikhyatk/moondream2")


# Hugging Face Inference API client for the conversational text model.
# NOTE(review): the variable is named "llama" but actually points at
# Qwen/QwQ-32B-Preview — consider renaming for clarity.
llama_client = InferenceClient("Qwen/QwQ-32B-Preview")


# Module-level conversation history.
# NOTE(review): appears unused — bot_streaming creates its own history
# when called with history=None; confirm before removing.
history = []
|
|
|
|
|
def describe_image(image, user_message, history):
    """Ask the Moondream2 Space about *image* and record the exchange.

    Args:
        image: Path or URL of the image (anything accepted by
            ``gradio_client.handle_file``); ``None`` short-circuits.
        user_message: The user's question about the image. BUG FIX: this
            was previously ignored — the prompt was hard-coded to
            "Describe this image." even when the user asked something
            specific. It is now forwarded to the model, with the old
            generic prompt kept as the fallback for empty input.
        history: Mutable list of ``{"role", "content"}`` dicts; the user
            turn and assistant reply are appended in place.

    Returns:
        Tuple of ``(description, history)`` where *description* is the
        model's text reply.
    """
    if image is None:
        return "No image provided", history

    # Use the user's actual question when one was typed; otherwise fall
    # back to the original generic description prompt.
    prompt = user_message.strip() if user_message and user_message.strip() else "Describe this image."

    description = moondream_client.predict(
        img=handle_file(image),
        prompt=prompt,
        api_name="/answer_question",
    )

    history.append({"role": "user", "content": user_message})
    history.append({"role": "assistant", "content": description})

    return description, history
|
|
|
|
|
def chat_with_text(user_message, history, max_new_tokens=250):
    """Send the running conversation plus *user_message* to the chat model.

    Appends the user turn and the assistant reply to *history* in place.

    Args:
        user_message: Latest text from the user.
        history: Mutable list of ``{"role", "content"}`` message dicts.
        max_new_tokens: Generation budget passed as ``max_tokens``.

    Returns:
        Tuple of ``(assistant_reply, history)``.
    """
    history.append({"role": "user", "content": user_message})

    # Re-shape history so only role/content pairs reach the API,
    # dropping any extra keys a turn might carry.
    messages = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in history
    ]

    completion = llama_client.chat_completion(
        messages=messages,
        max_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.95,
    )

    reply = completion["choices"][0]["message"]["content"]
    history.append({"role": "assistant", "content": reply})

    return reply, history
|
|
|
|
|
def bot_streaming(message, history=None, max_new_tokens=250):
    """Route one chat turn to the image pipeline or the text pipeline.

    Args:
        message: Multimodal payload. ``gr.ChatInterface(multimodal=True)``
            sends ``{"text": str, "files": [path, ...]}``; a legacy
            ``"image"`` key is also honored for direct callers.
        history: Running ``{"role", "content"}`` message list; a fresh
            list is created when ``None``.
        max_new_tokens: Token budget forwarded to the text model.

    Returns:
        Tuple of ``(response, history)``.
        NOTE(review): ChatInterface normally expects just the reply from
        ``fn`` — confirm the tuple return renders as intended.
    """
    if history is None:
        history = []

    user_message = message.get("text", "")

    # BUG FIX: multimodal ChatInterface delivers uploads under "files",
    # not "image", so the old lookup never saw an uploaded image. Check
    # the legacy key first, then fall back to the first uploaded file.
    image = message.get("image")
    if image is None:
        files = message.get("files") or []
        image = files[0] if files else None

    if image:
        response, history = describe_image(image, user_message, history)
    else:
        response, history = chat_with_text(user_message, history, max_new_tokens)

    return response, history
|
|
|
|
|
# Multimodal chat UI: text box plus file upload, wired to bot_streaming.
# The slider is passed to bot_streaming as its max_new_tokens argument
# (third parameter, after message and history).
demo = gr.ChatInterface(

    fn=bot_streaming,

    title="Multimodal Chat Assistant",

    additional_inputs=[

        gr.Slider(

            minimum=10,

            maximum=500,

            value=250,

            step=10,

            label="Maximum number of new tokens to generate",

        )

    ],

    description=(

        "This demo combines text and image understanding using Moondream2 for visual "

        "tasks and Qwen/QwQ-32B-Preview for conversational AI. Upload an image, ask questions, "

        "or just chat!"

    ),

    stop_btn="Stop Generation",

    fill_height=True,

    # multimodal=True makes the fn receive {"text": ..., "files": [...]}
    # dicts instead of plain strings.
    multimodal=True,

)
|
|
|
# Launch the Gradio app when run as a script (debug=True enables
# verbose error output in the UI and console).
if __name__ == "__main__":

    demo.launch(debug=True)
|
|