# NOTE: removed non-code scrape artifacts (file-size header, git commit hashes,
# and file-viewer gutter line numbers) that were accidentally captured above the code.
import gradio as gr
from gradio_client import Client, handle_file
from huggingface_hub import InferenceClient
from PIL import Image
from threading import Thread
import time

# Initialize clients for Moondream and QwQ.
# NOTE: both lines have network side effects at import time — the gradio_client
# handshakes with the remote Space, and InferenceClient targets the HF Inference API.
moondream_client = Client("vikhyatk/moondream2")   # remote Space used for image captioning
qwq_client = InferenceClient("Qwen/QwQ-32B-Preview")  # chat model used for text generation

# Caption an image with Moondream, then discuss it with QwQ.
def describe_image(image, user_message):
    """Caption *image* via the Moondream Space, prepend the caption to
    *user_message*, and return QwQ's single-turn reply as a string."""
    # Ask the remote Moondream Space for a caption of the uploaded image.
    caption = moondream_client.predict(
        img=handle_file(image),
        prompt="Describe this image.",
        api_name="/answer_question",
    )

    # The caption becomes context for the user's actual question.
    combined_prompt = caption + "\n" + user_message

    # One-shot chat completion against QwQ with the combined prompt.
    reply = qwq_client.chat_completion(
        messages=[{"role": "user", "content": combined_prompt}],
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
    return reply['choices'][0]['message']['content']

# Generator handler for a multimodal chat turn (text, optionally with an image).
def chat_or_image(message, history, max_new_tokens=250):
    """Stream a chat reply for one turn.

    Args:
        message: multimodal message dict with keys ``"text"`` and ``"files"``
            (the format gr.ChatInterface(multimodal=True) supplies).
        history: prior turns as ``[user, assistant]`` pairs.
        max_new_tokens: generation budget passed to the chat model.

    Yields:
        The accumulated reply text, growing as tokens stream in.

    BUG FIXED: the original body called transformers-only APIs
    (``apply_chat_template``, ``TextIteratorStreamer`` — never imported,
    ``.to("cuda")``) on a ``gradio_client.Client``, which has none of them,
    so the function raised NameError/AttributeError on every call. It now
    uses the two clients this file actually defines: Moondream captions any
    attached image, and QwQ streams the conversational reply.
    """
    txt = message["text"]
    files = message.get("files") or []

    # If an image is attached, turn it into text context via Moondream,
    # since QwQ itself is text-only.
    if files:
        first = files[0]
        # Example rows pass a plain path string; live uploads pass a dict.
        path = first if isinstance(first, str) else first["path"]
        caption = moondream_client.predict(
            img=handle_file(path),
            prompt="Describe this image.",
            api_name="/answer_question",
        )
        txt = caption + "\n" + txt

    # Rebuild the chat transcript in the messages format QwQ expects.
    messages = []
    for user_turn, assistant_turn in history:
        # Image turns appear as tuples in history; their content was already
        # folded into the text at the time, so only string turns are replayed.
        if isinstance(user_turn, str):
            messages.append({"role": "user", "content": user_turn})
            if assistant_turn:
                messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": txt})

    # Stream tokens from QwQ, yielding the growing buffer so the UI updates live.
    buffer = ""
    for chunk in qwq_client.chat_completion(
        messages=messages,
        max_tokens=max_new_tokens,
        stream=True,
    ):
        delta = chunk.choices[0].delta.content or ""
        buffer += delta
        yield buffer

# Gradio UI setup.
# BUG FIXED: the original used gr.Interface with [Image, Textbox] inputs, which
# calls fn(image, text) — but chat_or_image expects (message_dict, history, ...).
# gr.ChatInterface(multimodal=True) supplies exactly that signature: a
# {"text": ..., "files": [...]} message plus the chat history.
demo = gr.ChatInterface(
    fn=chat_or_image,
    multimodal=True,
    title="Multimodal Llama Chatbot",
    description="Interact with the Llama chatbot. Upload an image, ask a question, or both!",
)

if __name__ == "__main__":
    # show_error surfaces server-side exceptions in the browser during development.
    demo.launch(show_error=True)