|
from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer |
|
from PIL import Image |
|
import requests |
|
import torch |
|
from threading import Thread |
|
import gradio as gr |
|
from gradio import FileData |
|
import time |
|
import spaces |
|
import os |
|
|
|
|
|
# Hugging Face access token, taken from the HF_AUTH_TOKEN environment
# variable. Start-up is aborted right away when the variable is unset or
# empty, instead of failing later with a less obvious error.
hf_token = os.environ.get("HF_AUTH_TOKEN")
if hf_token is None or hf_token == "":
    raise ValueError("Hugging Face token not found. Set HF_AUTH_TOKEN in your Space settings.")
|
|
|
|
|
# Hub identifier of the checkpoint; the model and the processor are both
# loaded from it.
ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the weights in bfloat16 first, then move the model to the CUDA device.
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt,
    token=hf_token,
    torch_dtype=torch.bfloat16,
)
model = model.to("cuda")

# Processor belonging to the same checkpoint (used later for the chat
# template, input preparation and decoding of streamed tokens).
processor = AutoProcessor.from_pretrained(ckpt, token=hf_token)
|
|
|
|
|
def _load_rgb_image(file_ref):
    """Open an uploaded file and return it as an RGB PIL image.

    Args:
        file_ref: Either a filesystem path (``str``) or a mapping that holds
            the path under the ``"path"`` key.

    Returns:
        A new ``PIL.Image.Image`` in RGB mode.
    """
    path = file_ref if isinstance(file_ref, str) else file_ref["path"]
    # convert() returns an independent copy, so the underlying file can be
    # closed as soon as the conversion is done.
    with Image.open(path) as img:
        return img.convert("RGB")


def _history_to_messages(history):
    """Rebuild chat-template messages and the image list from earlier turns.

    Each history entry is read as ``entry[0]`` (user part) and ``entry[1]``
    (assistant part). An entry whose user part is a tuple carries an uploaded
    file path in its first element and belongs to the next text entry.

    Args:
        history: Sequence of ``[user_part, assistant_part]`` entries.

    Returns:
        ``(messages, images)`` where ``messages`` alternates user/assistant
        dicts and ``images`` has exactly one image per ``{"type": "image"}``
        placeholder in ``messages``.
    """
    messages = []
    images = []
    pending_paths = []  # file entries seen since the last text entry

    for entry in history:
        user_part = entry[0]
        if isinstance(user_part, tuple):
            # File entry: remember it and attach it to the text turn that
            # follows, instead of peeking at history[i + 1] / history[i - 1]
            # (which wrapped around at i == 0 and overran at the end).
            pending_paths.append(user_part[0])
            continue

        # A turn sent without text may be stored as None; the chat template
        # needs a string.
        content = [{"type": "text", "text": user_part or ""}]
        # Mirror the rule used for the current message below: an image is
        # only part of the prompt when exactly one file was attached.
        if len(pending_paths) == 1:
            images.append(_load_rgb_image(pending_paths[0]))
            content.append({"type": "image"})
        pending_paths = []

        messages.append({"role": "user", "content": content})
        messages.append(
            {"role": "assistant", "content": [{"type": "text", "text": entry[1] or ""}]}
        )

    # File entries that were never followed by a text entry are dropped so
    # that images and placeholders stay in sync.
    return messages, images


@spaces.GPU
def bot_streaming(message, history, max_new_tokens=4500):
    """Stream the model's reply to a (possibly multimodal) chat message.

    Args:
        message: Mapping with the user's ``"text"`` and a ``"files"`` list.
            An image is used only when exactly one file is attached; the
            file may be a path string or a mapping with a ``"path"`` key.
        history: Earlier turns as ``[user_part, assistant_part]`` entries;
            see ``_history_to_messages`` for how file entries are handled.
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        The reply accumulated so far, growing with every streamed chunk.
    """
    txt = message["text"]
    messages, images = _history_to_messages(history)

    # Current turn: text plus (optionally) a single image.
    files = message["files"]
    if len(files) == 1:
        images.append(_load_rgb_image(files[0]))
        messages.append(
            {"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]}
        )
    else:
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    if images:
        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
    else:
        inputs = processor(text=texts, return_tensors="pt").to("cuda")

    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        # NOTE(review): the UI slider may deliver a float -- int() keeps the
        # value usable as a token count either way.
        max_new_tokens=int(max_new_tokens),
    )

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator relays the chunks the streamer receives.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer
|
|
|
|
|
|
|
# Chat UI around bot_streaming: a multimodal textbox for text + image input
# and one extra slider that is passed to the function as max_new_tokens.
demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Multimodal Llama",
    examples=[],
    textbox=gr.MultimodalTextbox(),
    additional_inputs=[
        gr.Slider(
            minimum=10,
            maximum=5000,
            value=250,
            step=10,
            label="Maximum number of new tokens to generate",
        ),
    ],
    cache_examples=False,
    # The description no longer points at "examples below" (the examples list
    # is empty), and the blog link had a stray dot after the host name.
    description=(
        "Try Multimodal Llama by Meta with transformers in this demo. "
        "Upload an image, and start chatting about it. "
        "To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32)."
    ),
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True,
)

demo.launch(debug=True)
|
|