Spaces:

jadechoghari
/

ferret-demo

Running on Zero

App Files Files Community

ferret-demo / appv1.py

jadechoghari

add bbox support

4e961a2 13 days ago

raw

history blame contribute delete

7.06 kB

	import gradio as gr
	from inference import inference_and_run
	import spaces
	import os
	import re
	import shutil

	# Display name interpolated into the header banner and used as the page title.
	model_name = "Ferret-UI"
	# Absolute path of the directory that contains this script.
	cur_dir = os.path.split(os.path.abspath(__file__))[0]

	@spaces.GPU()
	def inference_with_gradio(chatbot, image, prompt, model_path, box=None, temperature=0.2, top_p=0.7, max_new_tokens=512):
	    """Run one inference request and return the chat history with the new turn.

	    Args:
	        chatbot: Existing history as a list of ``(prompt, reply)`` pairs; a
	            falsy value is treated as an empty history.
	        image: Path of the input image on disk.
	        prompt: User prompt, forwarded to ``inference_and_run``.
	        model_path: Model identifier forwarded to ``inference_and_run``. A
	            value containing "gemma" (case-insensitive) selects the
	            ``ferret_gemma_instruct`` conversation mode, anything else
	            ``ferret_llama_3``.
	        box: Optional bounding box, forwarded as given except that a blank
	            string is passed on as ``None``.
	        temperature: Forwarded to ``inference_and_run``.
	        top_p: Forwarded to ``inference_and_run``.
	        max_new_tokens: Forwarded to ``inference_and_run``.

	    Returns:
	        A new list holding the previous history plus ``(prompt, reply)``.

	    Raises:
	        gr.Error: If no image path was supplied.
	    """
	    if not image:
	        # An empty image widget yields no path; report it in the UI instead
	        # of failing inside os.path.basename with an opaque TypeError.
	        raise gr.Error("Please upload an image before sending a prompt.")

	    # inference_and_run takes the image as (directory, file name), so a copy
	    # is staged in the current working directory.
	    # NOTE(review): the upload's base name is reused, so an existing file with
	    # the same name in the working directory is overwritten.
	    filename = os.path.basename(image)
	    dir_path = "./"
	    image_path = os.path.join(dir_path, filename)
	    try:
	        shutil.copy(image, image_path)
	    except shutil.SameFileError:
	        # The image already sits at the destination; nothing to copy.
	        pass
	    print("filename path: ", filename)

	    if "gemma" in model_path.lower():
	        conv_mode = "ferret_gemma_instruct"
	    else:
	        conv_mode = "ferret_llama_3"

	    # An empty bounding-box textbox arrives as "" rather than None.
	    if isinstance(box, str):
	        box = box.strip() or None

	    inference_text = inference_and_run(
	        image_path=filename,
	        image_dir=dir_path,
	        prompt=prompt,
	        # Use the selected model so it matches the conversation mode chosen
	        # above (previously a fixed Gemma checkpoint was always passed).
	        model_path=model_path,
	        conv_mode=conv_mode,
	        temperature=temperature,
	        top_p=top_p,
	        box=box,
	        max_new_tokens=max_new_tokens,
	    )
	    if isinstance(inference_text, (list, tuple)):
	        # Keep the first entry only; an empty result becomes an empty reply.
	        inference_text = str(inference_text[0]) if inference_text else ""

	    # Build a fresh history list rather than mutating the caller's.
	    new_history = chatbot.copy() if chatbot else []
	    new_history.append((prompt, inference_text))
	    return new_history

	def submit_chat(chatbot, text_input):
	    """Return the chat history unchanged together with an empty prompt string.

	    ``text_input`` is accepted but not used; the second element of the
	    returned pair is always the empty string.
	    """
	    cleared_prompt = ''
	    return chatbot, cleared_prompt

	def clear_chat():
	    """Return reset values for the seven outputs wired to the Clear button.

	    Order: chat history, image, prompt, bounding box, temperature, top_p,
	    max_new_tokens.
	    """
	    history, image, prompt, box = [], None, "", ""
	    temperature, top_p, max_new_tokens = 0.2, 0.7, 512
	    return history, image, prompt, box, temperature, top_p, max_new_tokens


	# Header banner markup shown at the top of the page. The only interpolated
	# value is {model_name}; the link target uses the host "huggingface.co"
	# (a stray trailing dot after ".co" was removed).
	html = f"""
	<div style="text-align: center; padding: 20px;">
	<div style="display: inline-block; background-color: #f5f5f7; padding: 20px; border-radius: 20px; box-shadow: 0px 6px 20px rgba(0, 0, 0, 0.1);">
	<div style="display: flex; align-items: center;">
	<img src='https://github.com/apple/ml-ferret/blob/main/ferretui/figs/ferretui_icon.png?raw=true' alt='Ferret-UI'
	style='width: 80px; height: 80px; border-radius: 20px; box-shadow: 0px 8px 16px rgba(0, 0, 0, 0.2);'/>
	<div style="margin-left: 15px;">
	<h1 style="font-size: 2.8em; font-family: -apple-system, BlinkMacSystemFont, sans-serif; color: #1D1D1F;
	font-weight: bold; margin-bottom: 0;"> {model_name}</h1>
	<p style="font-size: 1.2em; color: #6e6e73; font-family: -apple-system, BlinkMacSystemFont, sans-serif; margin-top: 5px;">
	📱 Grounded Mobile UI Understanding with Multimodal LLMs.<br>
	A new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities.
	</p>
	<a href='https://huggingface.co/jadechoghari/Ferret-UI-Gemma2b' style='text-decoration: none;'>
	<button style="background-color: #007aff; color: white; font-size: 1.2em; padding: 10px 20px; border-radius: 10px; border: none; margin-top: 10px; box-shadow: 0px 4px 12px rgba(0, 122, 255, 0.4); cursor: pointer;">
	🤗 Try on Hugging Face
	</button>
	</a>
	</div>
	</div>
	</div>
	<p style="font-size: 1.2em; color: #86868B; font-family: -apple-system, BlinkMacSystemFont, sans-serif; margin-top: 30px;">
	We release two Ferret-UI checkpoints, built on gemma-2b and Llama-3-8B models respectively, for public exploration. 🚀
	</p>
	</div>
	"""

	# LaTeX delimiter specs handed to the chatbot: one inline pair and two
	# display-mode environments, each as a left/right/display mapping.
	latex_delimiters_set = [
	    {"left": left, "right": right, "display": display}
	    for left, right, display in (
	        ("\\(", "\\)", False),
	        ("\\begin{equation}", "\\end{equation}", True),
	        ("\\begin{align}", "\\end{align}", True),
	    )
	]

	# Input widgets are constructed here and placed into the page later via
	# their .render() calls inside the Blocks layout.
	image_input = gr.Image(
	    label="Upload Image",
	    type="filepath",
	    height=350,
	)
	text_input = gr.Textbox(
	    lines=2,
	    placeholder="Enter your prompt here...",
	    label="Prompt",
	)
	model_dropdown = gr.Dropdown(
	    choices=[
	        "jadechoghari/Ferret-UI-Gemma2b",
	        "jadechoghari/Ferret-UI-Llama8b",
	    ],
	    label="Model Path",
	    value="jadechoghari/Ferret-UI-Gemma2b",
	)

	bounding_box_input = gr.Textbox(
	    placeholder="Optional bounding box (x1, y1, x2, y2)",
	    label="Bounding Box (optional)",
	)

	# Generation settings exposed as sliders.
	temperature_input = gr.Slider(
	    minimum=0.1,
	    maximum=2.0,
	    step=0.1,
	    value=0.2,
	    label="Temperature",
	)
	top_p_input = gr.Slider(
	    minimum=0.0,
	    maximum=1.0,
	    step=0.05,
	    value=0.7,
	    label="Top P",
	)
	max_new_tokens_input = gr.Slider(
	    minimum=1,
	    maximum=1024,
	    step=1,
	    value=512,
	    label="Max New Tokens",
	)

	# Conversation panel; history entries use the "tuples" format.
	chatbot = gr.Chatbot(
	    label="Chat with Ferret-UI",
	    height=400,
	    show_copy_button=True,
	    latex_delimiters=latex_delimiters_set,
	    type="tuples",
	)

	with gr.Blocks(title=model_name, theme=gr.themes.Ocean()) as demo:
	    gr.HTML(html)
	    with gr.Row():
	        # Narrow column: every input widget, then the clickable examples.
	        with gr.Column(scale=3):
	            image_input.render()
	            text_input.render()
	            model_dropdown.render()
	            bounding_box_input.render()
	            temperature_input.render()
	            top_p_input.render()
	            max_new_tokens_input.render()

	            example_image = "appstore_reminders.png"
	            example_model = "jadechoghari/Ferret-UI-Gemma2b"
	            gr.Examples(
	                examples=[
	                    [example_image, "Describe the image in details", example_model, None],
	                    [example_image, "What's inside the selected region?", example_model, "189, 906, 404, 970"],
	                    [example_image, "Where is the Game Tab?", example_model, None],
	                ],
	                inputs=[image_input, text_input, model_dropdown, bounding_box_input],
	            )
	        # Wide column: the conversation and its action buttons.
	        with gr.Column(scale=7):
	            chatbot.render()
	            with gr.Row():
	                send_btn = gr.Button("Send", variant="primary")
	                clear_btn = gr.Button("Clear", variant="secondary")

	    # Both triggers (Send button, Enter in the prompt box) run inference and
	    # then submit_chat. The component sequences are kept as tuples and turned
	    # into a fresh list for every call.
	    inference_inputs = (
	        chatbot,
	        image_input,
	        text_input,
	        model_dropdown,
	        bounding_box_input,
	        temperature_input,
	        top_p_input,
	        max_new_tokens_input,
	    )
	    followup_io = (chatbot, text_input)

	    send_click_event = send_btn.click(
	        inference_with_gradio, list(inference_inputs), chatbot
	    ).then(submit_chat, list(followup_io), list(followup_io))
	    submit_event = text_input.submit(
	        inference_with_gradio, list(inference_inputs), chatbot
	    ).then(submit_chat, list(followup_io), list(followup_io))

	    # Clear resets the conversation, the inputs and the three sliders.
	    clear_btn.click(
	        clear_chat,
	        outputs=[
	            chatbot,
	            image_input,
	            text_input,
	            bounding_box_input,
	            temperature_input,
	            top_p_input,
	            max_new_tokens_input,
	        ],
	    )

	demo.launch()