"""Gradio app for reviewing per-row GPT-4o mini results on the VLMs Are Blind dataset.

Reads the evaluated rows from ``./df_final.pkl`` (columns used here: ``task``,
``custom_id``, ``content_raw``, ``gt``, ``is_correct``) and pairs each row with
the image and prompt stored in the ``valid`` split of ``XAI/vlmsareblind``.
"""

import gradio as gr
import pandas as pd
from datasets import load_dataset
from PIL import Image, ImageOps

# NOTE(review): unpickling can execute arbitrary code, so df_final.pkl must
# only ever come from a trusted source.
df_final = pd.read_pickle("./df_final.pkl")
dataset = load_dataset("XAI/vlmsareblind")


def show_row(row_index, selected_task):
    """Collect everything the UI displays for one result row.

    Args:
        row_index: Zero-based position of the row among the rows of
            ``selected_task``; it is converted with ``int()`` before use.
        selected_task: Value of the ``task`` column used to filter ``df_final``.

    Returns:
        A tuple ``(image, prompt, model_output, ground_truth, task,
        is_correct)`` in the order expected by the output components; the
        image is the dataset image surrounded by a white border.

    Raises:
        IndexError: If ``row_index`` is outside the rows available for
            ``selected_task``.
    """
    task_df = df_final[df_final["task"] == selected_task]
    row = task_df.iloc[int(row_index)]

    # Look the record up once instead of indexing the dataset separately for
    # the image and for the prompt.
    sample = dataset["valid"][int(row["custom_id"])]
    image = sample["image"]

    # Add white padding to the image: half the width on the left and right,
    # half the height on the top and bottom, i.e. a canvas about twice as big.
    width, height = image.size
    image_with_padding = ImageOps.expand(
        image, border=(width // 2, height // 2), fill="white"
    )

    return (
        image_with_padding,
        sample["prompt"],
        row["content_raw"],
        row["gt"],
        row["task"],
        row["is_correct"],
    )


def show_first_row(selected_task):
    """Return the ``show_row`` outputs for the first row of ``selected_task``."""
    return show_row(0, selected_task)


def _slider_config(selected_task):
    """Return the slider settings that span every row of ``selected_task``."""
    last_index = len(df_final[df_final["task"] == selected_task]) - 1
    return {
        "minimum": 0,
        "maximum": last_index,
        "step": 1,
        "label": f"Select Row Index (0-{last_index})",
        "value": 0,
    }


def update_slider(selected_task):
    """Build the row slider for a newly selected task.

    Args:
        selected_task: Value of the ``task`` column chosen in the dropdown.

    Returns:
        A ``gr.Slider`` reset to index 0 whose maximum and label reflect the
        number of rows available for ``selected_task``.
    """
    return gr.Slider(**_slider_config(selected_task))


# Create accuracy breakdown dataframe: mean of ``is_correct`` per task, as a
# percentage string with two decimals, best task first.
accuracy_breakdown = (
    df_final.groupby("task")["is_correct"]
    .mean()
    .sort_values(ascending=False)
    .mul(100)
    .apply(lambda x: f"{x:.2f}")
    .reset_index()
)
accuracy_breakdown.columns = ["Task", "Accuracy (%)"]

# Computed once and shared by the dropdown and the initial slider.
task_choices = df_final["task"].unique().tolist()
first_task = task_choices[0]

# Create the Gradio interface
with gr.Blocks() as app:
    gr.Markdown("# BlindTest Results Review (GPT-4o mini)")
    gr.HTML(
        """

This is a review of results from the GPT-4o mini model on the VLMs Are Blind dataset.
Project Website | arXiv Paper

"""
    )

    with gr.Row():
        task_dropdown = gr.Dropdown(
            choices=task_choices,
            label="Select Task",
            value=first_task,
        )
        row_selector = gr.Slider(**_slider_config(first_task))

    with gr.Row():
        with gr.Column(scale=2):
            image_output = gr.Image(label="Image", type="pil")
        with gr.Column(scale=3):
            prompt_output = gr.Textbox(label="Prompt", lines=3)
            model_output = gr.Textbox(label="Model Output", lines=2)
            ground_truth = gr.Textbox(label="Ground Truth", lines=2)
            task = gr.Textbox(label="Task")
            is_correct = gr.Checkbox(label="Is Correct")

    gr.Markdown("## Accuracy Breakdown by Task")
    gr.DataFrame(accuracy_breakdown)

    # Same order as the tuple returned by show_row.
    row_outputs = [
        image_output,
        prompt_output,
        model_output,
        ground_truth,
        task,
        is_correct,
    ]

    # Changing the task resizes the slider and shows the task's first row.
    task_dropdown.change(update_slider, inputs=task_dropdown, outputs=row_selector)
    task_dropdown.change(show_first_row, inputs=task_dropdown, outputs=row_outputs)
    row_selector.change(
        show_row,
        inputs=[row_selector, task_dropdown],
        outputs=row_outputs,
    )

    # Change listeners alone leave the outputs blank until a control is moved,
    # so also fill them for the initial selection when the page opens.
    app.load(
        show_row,
        inputs=[row_selector, task_dropdown],
        outputs=row_outputs,
    )

# Launch the app
app.launch()