import os

import gradio as gr
import spaces
from PIL import Image
from gradio_image_prompter import ImagePrompter

from inference import inference_and_run

model_name = 'Ferret-UI'
cur_dir = os.path.dirname(os.path.abspath(__file__))


@spaces.GPU()
def inference_with_gradio(chatbot, image_data, prompt, model_path,
                          temperature=0.2, top_p=0.7, max_new_tokens=512):
    if image_data is None:
        raise gr.Error("Please upload an image and draw a bounding box if needed.")

    # Unpack the ImagePrompter payload: a numpy image plus any drawn boxes.
    image = image_data["image"]
    box = None
    if image_data.get("points"):
        # Each drawn box is encoded as [x1, y1, label, x2, y2, label], so
        # indices 0, 1, 3, 4 give the two corners of the first box.
        points = image_data["points"][0]
        box = f"{points[0]}, {points[1]}, {points[3]}, {points[4]}"

    # Convert the numpy array to a PIL Image and save it to disk so the
    # inference pipeline can read it back by path.
    pil_image = Image.fromarray(image)
    filename = "temp_image.png"
    dir_path = "./"
    image_path = os.path.join(dir_path, filename)
    pil_image.save(image_path)

    # Pick the conversation template that matches the checkpoint family.
    if "gemma" in model_path.lower():
        conv_mode = "ferret_gemma_instruct"
    else:
        conv_mode = "ferret_llama_3"

    print("box:", box)

    # Call the main inference function with the image, prompt, and optional box.
    inference_text = inference_and_run(
        image_path=filename,
        image_dir=dir_path,
        prompt=prompt,
        model_path=model_path,
        conv_mode=conv_mode,
        temperature=temperature,
        top_p=top_p,
        box=box,
        max_new_tokens=max_new_tokens,
    )

    if isinstance(inference_text, (list, tuple)):
        inference_text = str(inference_text[0])

    # Append the new (prompt, response) pair to the chat history.
    new_history = chatbot.copy() if chatbot else []
    new_history.append((prompt, inference_text))
    return new_history


def submit_chat(chatbot, text_input):
    # Keep the history as-is and clear the prompt box.
    return chatbot, ''


def clear_chat():
    # Reset history, image, prompt, and the sliders to their defaults.
    return [], None, "", 0.2, 0.7, 512
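
# Sketch of the value ImagePrompter delivers to inference_with_gradio, as
# assumed above (based on the gradio_image_prompter component's documented
# value format; not verified against a specific version):
#   {
#       "image":  <numpy uint8 array, H x W x 3>,
#       "points": [[x1, y1, 2.0, x2, y2, 3.0], ...],  # one 6-tuple per drawn box
#   }
# Only the first box is forwarded to the model, serialized as "x1, y1, x2, y2".
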
html = f"""
<div style="text-align: center;">
    <h1>{model_name}</h1>
    <p>📱 Grounded Mobile UI Understanding with Multimodal LLMs.</p>
    <p>A new MLLM tailored for enhanced understanding of mobile UI screens,
       equipped with referring, grounding, and reasoning capabilities.</p>
    <p>We release two Ferret-UI checkpoints, built on <b>gemma-2b</b> and
       <b>Llama-3-8B</b> models respectively, for public exploration. 🚀</p>
</div>
""" latex_delimiters_set = [{ "left": "\\(", "right": "\\)", "display": False }, { "left": "\\begin{equation}", "right": "\\end{equation}", "display": True }, { "left": "\\begin{align}", "right": "\\end{align}", "display": True }] with gr.Blocks(title=model_name) as demo: gr.HTML(html) with gr.Row(): with gr.Column(scale=3): # Replace image_input with ImagePrompter image_input = ImagePrompter(label="Upload Image & Draw Bounding Box") text_input = gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt") model_dropdown = gr.Dropdown( choices=[ "jadechoghari/Ferret-UI-Gemma2b", "jadechoghari/Ferret-UI-Llama8b", ], label="Model Path", value="jadechoghari/Ferret-UI-Gemma2b" ) temperature_input = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.2, label="Temperature") top_p_input = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.7, label="Top P") max_new_tokens_input = gr.Slider(minimum=1, maximum=1024, step=1, value=512, label="Max New Tokens") gr.Examples( examples=[ [{"image": "appstore_reminders.png"}, "Describe the contents inside the box"], [{"image": "appstore_reminders.png"}, "What is the text shown inside the highlighted area"] ], inputs=[image_input, text_input], label="Try these examples" ) with gr.Column(scale=7): chatbot = gr.Chatbot( label="Chat with Ferret-UI", height=400, show_copy_button=True, latex_delimiters=latex_delimiters_set, type="tuples" ) with gr.Row(): send_btn = gr.Button("Send", variant="primary") clear_btn = gr.Button("Clear", variant="secondary") send_click_event = send_btn.click( inference_with_gradio, [chatbot, image_input, text_input, model_dropdown, temperature_input, top_p_input, max_new_tokens_input], chatbot ).then( submit_chat, [chatbot, text_input], [chatbot, text_input] ) submit_event = text_input.submit( inference_with_gradio, [chatbot, image_input, text_input, model_dropdown, temperature_input, top_p_input, max_new_tokens_input], chatbot ).then( submit_chat, [chatbot, text_input], [chatbot, text_input] ) clear_btn.click( clear_chat, outputs=[chatbot, image_input, text_input, temperature_input, top_p_input, max_new_tokens_input] ) demo.launch()