taesiri's picture
update
2eb115d
raw
history blame contribute delete
No virus
3.78 kB
import gradio as gr
import pandas as pd
from datasets import load_dataset
from PIL import Image, ImageOps
# Data sources for the review UI, named here so they are easy to spot and change.
RESULTS_PICKLE_PATH = "./df_final.pkl"
DATASET_REPO_ID = "XAI/vlmsareblind"

# Per-sample evaluation results; the rest of the file reads the columns
# "task", "custom_id", "content_raw", "gt" and "is_correct" from this frame.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df_final = pd.read_pickle(RESULTS_PICKLE_PATH)

# Source dataset; its "valid" split is indexed by custom_id to get the image
# and prompt that belong to a result row.
dataset = load_dataset(DATASET_REPO_ID)
def show_row(row_index, selected_task):
    """Collect everything the UI displays for one result row of a task.

    Args:
        row_index: Zero-based position of the row among the rows of
            ``df_final`` whose ``task`` equals ``selected_task``. It is
            passed through ``int()`` before being used as a positional index.
        selected_task: Task name used to filter ``df_final``.

    Returns:
        A 6-tuple ``(image_with_padding, prompt, model_output, ground_truth,
        task, is_correct)`` in the order the UI outputs are wired.

    Raises:
        IndexError: If ``row_index`` is out of bounds for the filtered rows
            (raised by ``DataFrame.iloc``).
    """
    task_df = df_final[df_final["task"] == selected_task]
    row = task_df.iloc[int(row_index)]

    # Fetch the dataset sample once and reuse it for both fields. Indexing the
    # split a second time would materialize the whole row again (presumably
    # including another image decode -- confirm against the datasets docs).
    custom_id = int(row["custom_id"])
    sample = dataset["valid"][custom_id]
    image = sample["image"]
    prompt = sample["prompt"]

    # Add white padding to the image: half the original width on the left and
    # right, half the original height on the top and bottom, so the padded
    # canvas is about twice the original size in each dimension.
    width, height = image.size
    padding = (width // 2, height // 2)
    image_with_padding = ImageOps.expand(image, border=padding, fill="white")

    model_output = row["content_raw"]
    ground_truth = row["gt"]
    task = row["task"]
    is_correct = row["is_correct"]
    return image_with_padding, prompt, model_output, ground_truth, task, is_correct
def update_slider(selected_task):
    """Return a row-index slider sized to the rows of ``selected_task``.

    Args:
        selected_task: Task name used to filter ``df_final``.

    Returns:
        A ``gr.Slider`` ranging from 0 to the last row index of the selected
        task, with step 1, reset to 0, and a label that shows the valid range.
    """
    matching_rows = df_final[df_final["task"] == selected_task]
    last_index = len(matching_rows) - 1
    range_label = f"Select Row Index (0-{last_index})"
    return gr.Slider(
        minimum=0,
        maximum=last_index,
        step=1,
        label=range_label,
        value=0,
    )
# Build the per-task accuracy table shown at the bottom of the UI.
# Step 1: mean of "is_correct" for every task.
accuracy_breakdown = df_final.groupby("task")["is_correct"].mean()
# Step 2: best task first, expressed as a percentage.
accuracy_breakdown = accuracy_breakdown.sort_values(ascending=False).mul(100)
# Step 3: format as strings with two decimals (sorting must happen before
# this, while the values are still numeric), then turn the task index into
# a regular column.
accuracy_breakdown = accuracy_breakdown.apply(
    lambda pct: f"{pct:.2f}"
).reset_index()
accuracy_breakdown.columns = ["Task", "Accuracy (%)"]
# Create the Gradio interface: a task dropdown and a row slider select one
# result row; the row's image, prompt, model output, ground truth, task and
# correctness flag are shown below, followed by the per-task accuracy table.
with gr.Blocks() as app:
    gr.Markdown("# BlindTest Results Review (GPT-4o mini)")
    gr.HTML(
        """
        <p style="text-align: center;">
        This is a review of results from the GPT-4o mini model on the VLMs Are Blind dataset.
        <br>
        <a href="https://vlmsareblind.github.io/" target="_blank">Project Website</a> |
        <a href="https://arxiv.org/abs/2407.06581" target="_blank">arXiv Paper</a>
        </p>
        """
    )

    # Compute the task list and the initial slider range once instead of
    # re-running unique()/filtering for every widget argument.
    task_names = df_final["task"].unique().tolist()
    initial_task = task_names[0]
    initial_last_index = len(df_final[df_final["task"] == initial_task]) - 1

    with gr.Row():
        task_dropdown = gr.Dropdown(
            choices=task_names,
            label="Select Task",
            value=initial_task,
        )
        row_selector = gr.Slider(
            minimum=0,
            maximum=initial_last_index,
            step=1,
            label=f"Select Row Index (0-{initial_last_index})",
            value=0,
        )

    with gr.Row():
        with gr.Column(scale=2):
            image_output = gr.Image(label="Image", type="pil")
        with gr.Column(scale=3):
            prompt_output = gr.Textbox(label="Prompt", lines=3)
            model_output = gr.Textbox(label="Model Output", lines=2)
            ground_truth = gr.Textbox(label="Ground Truth", lines=2)
            task = gr.Textbox(label="Task")
            is_correct = gr.Checkbox(label="Is Correct")

    gr.Markdown("## Accuracy Breakdown by Task")
    gr.DataFrame(accuracy_breakdown)

    # Components filled by show_row, in the order of its return tuple.
    row_outputs = [
        image_output,
        prompt_output,
        model_output,
        ground_truth,
        task,
        is_correct,
    ]

    # Changing the task resizes/resets the slider ...
    task_dropdown.change(update_slider, inputs=task_dropdown, outputs=row_selector)
    # ... and shows the first row of the new task. This is wired explicitly
    # because the slider's own change event is not guaranteed to fire when it
    # was already at 0. A small wrapper supplies the constant index, so no
    # hidden dummy slider component is needed as an input.
    task_dropdown.change(
        lambda selected_task: show_row(0, selected_task),
        inputs=task_dropdown,
        outputs=row_outputs,
    )
    # Moving the slider shows the chosen row of the current task.
    row_selector.change(
        show_row,
        inputs=[row_selector, task_dropdown],
        outputs=row_outputs,
    )

# Launch the app
app.launch()