import gradio as gr
from inference import inference_and_run
import spaces
import os
import shutil
from PIL import Image
from gradio_image_prompter import ImagePrompter
# Display name of the model; interpolated into the page header markup.
model_name = 'Ferret-UI'
# Absolute path of the directory that contains this script.
cur_dir = os.path.split(os.path.abspath(__file__))[0]
@spaces.GPU()
def inference_with_gradio(
    chatbot,
    image_data,
    prompt,
    model_path,
    temperature=0.2,
    top_p=0.7,
    max_new_tokens=512,
):
    """Run one inference turn on the uploaded image and extend the chat history.

    Args:
        chatbot: Current chat history as a list of ``(prompt, response)``
            pairs; a falsy value is treated as an empty history.
        image_data: Mapping with an ``"image"`` entry and an optional
            ``"points"`` entry (presumably the ImagePrompter payload, with
            the image as a numpy array -- confirm against the UI wiring).
        prompt: User question passed to the model.
        model_path: Model identifier; if it contains "gemma"
            (case-insensitive) the gemma conversation template is used,
            otherwise the llama-3 one.
        temperature: Sampling temperature forwarded to ``inference_and_run``.
        top_p: Nucleus-sampling value forwarded to ``inference_and_run``.
        max_new_tokens: Generation length limit forwarded to
            ``inference_and_run``.

    Returns:
        A new history list: a copy of ``chatbot`` with ``(prompt, answer)``
        appended. The input list is not mutated.

    Raises:
        gr.Error: If no image was provided, or the first drawn prompt does
            not carry enough coordinates to form a box.
    """
    missing_image_msg = "Please upload an image and draw a bounding box if needed."
    if image_data is None:
        raise gr.Error(missing_image_msg)

    # The payload can exist while holding no image (e.g. the prompter was
    # cleared); surface that as a user-facing error rather than letting
    # Image.fromarray fail with an opaque AttributeError.
    image = image_data.get("image")
    if image is None:
        raise gr.Error(missing_image_msg)

    # Only the first drawn prompt is used. Indices 0, 1, 3, 4 are read as
    # x1, y1, x2, y2; indices 2 and 5 are skipped (presumably prompt-type
    # markers emitted by ImagePrompter -- confirm against its docs).
    box = None
    drawn_prompts = image_data.get("points")
    if drawn_prompts:
        points = drawn_prompts[0]
        if len(points) < 5:
            raise gr.Error("The drawn bounding box is malformed; please redraw it.")
        box = f"{points[0]}, {points[1]}, {points[3]}, {points[4]}"

    # Convert the array to a PIL image and persist it so the inference
    # routine can load it from disk.
    # NOTE(review): the file name is fixed, so concurrent requests would
    # overwrite each other's image; left unchanged because the path
    # expectations of inference_and_run are not visible here.
    pil_image = Image.fromarray(image)
    filename = "temp_image.png"
    dir_path = "./"
    image_path = os.path.join(dir_path, filename)
    pil_image.save(image_path)

    # Pick the conversation template that matches the model family.
    if "gemma" in model_path.lower():
        conv_mode = "ferret_gemma_instruct"
    else:
        conv_mode = "ferret_llama_3"

    print("the box: ", box)

    # inference_and_run receives the bare file name plus its directory.
    inference_text = inference_and_run(
        image_path=filename,
        image_dir=dir_path,
        prompt=prompt,
        model_path=model_path,
        conv_mode=conv_mode,
        temperature=temperature,
        top_p=top_p,
        box=box,
        max_new_tokens=max_new_tokens,
    )

    # The inference routine may return a sequence; keep its first element.
    if isinstance(inference_text, (list, tuple)):
        inference_text = str(inference_text[0])

    # Build a fresh history instead of mutating the caller's list.
    new_history = chatbot.copy() if chatbot else []
    new_history.append((prompt, inference_text))
    return new_history
def submit_chat(chatbot, text_input):
    """Return the chat history untouched, paired with an empty string.

    ``text_input`` is accepted but not used; the empty string in the result
    is presumably what clears the input box after submission (confirm
    against the event wiring).
    """
    cleared_text = ''
    return (chatbot, cleared_text)
def clear_chat():
    """Return the reset values for the chat interface.

    The tuple holds, in order: a fresh empty history list, ``None``, an
    empty string, and the numbers 0.2, 0.7 and 512 -- the same values used
    as defaults for ``temperature``, ``top_p`` and ``max_new_tokens`` in
    ``inference_with_gradio``. Which UI components receive them depends on
    the event wiring elsewhere in the file.
    """
    empty_history = []
    no_image = None
    blank_prompt = ""
    default_temperature = 0.2
    default_top_p = 0.7
    default_max_new_tokens = 512
    return (
        empty_history,
        no_image,
        blank_prompt,
        default_temperature,
        default_top_p,
        default_max_new_tokens,
    )
html = f"""
{model_name}
📱 Grounded Mobile UI Understanding with Multimodal LLMs.
A new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities.
We release two Ferret-UI checkpoints, built on gemma-2b and Llama-3-8B models respectively, for public exploration. 🚀