import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter

from data_reviewer import create_data_viewer

# Define constants and enums
TITLE = "<h1>VL-RewardBench Leaderboard</h1>"
INTRODUCTION_TEXT = "https://vl-rewardbench.github.io/"
GOOGLE_SHEET_URL = (
    "https://docs.google.com/spreadsheets/d/1fPqZLF1FQFyy4n9I6GNk7MeDSGlJDVVes9yEBqN8RwU/export?gid=0&format=csv"
)
ABOUT_TEXT = """Welcome to VLRewardBench! | |
We introduce a novel benchmark VL-RewardBench, designed to expose limitations of vision-language reward models across visual perception, hallucination detection, and reasoning tasks. | |
Our evaluation reveals including that models primarily fail at basic visual perception rather than reasoning, and that performance on our benchmark strongly correlates (r>0.9) with downstream vision-language tasks. | |
The splits are: | |
- General (VLFeedback + WildVision | |
- Hallucination (POVID, RLAIF, RLHF-V) | |
- Reasoning (MMMU-Pro, MathVerse)""" | |


class AutoEvalColumn:
    model = {"name": "Model", "type": "markdown", "displayed_by_default": True, "never_hidden": True}
    general = {"name": "General", "type": "float", "displayed_by_default": True, "never_hidden": False}
    hallucination = {"name": "Hallucination", "type": "float", "displayed_by_default": True, "never_hidden": False}
    reasoning = {"name": "Reasoning", "type": "float", "displayed_by_default": True, "never_hidden": False}
    overall = {"name": "Overall Consistency", "type": "float", "displayed_by_default": True, "never_hidden": False}
    macro = {"name": "Macro Average", "type": "float", "displayed_by_default": True, "never_hidden": False}
    model_size = {"name": "Model Size", "type": "str", "displayed_by_default": False, "never_hidden": False}
    opensource = {"name": "Open Source?", "type": "str", "displayed_by_default": False, "never_hidden": False}


def get_result_data():
    return pd.read_csv(GOOGLE_SHEET_URL)
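

# Optional sketch (not part of the original app): the CSV above is re-downloaded
# every time get_result_data() runs. If you refresh the leaderboard often during
# local testing, a small TTL cache avoids repeated downloads. The helper name,
# cache layout, and 300-second TTL are illustrative assumptions, not app code.
import time

_RESULTS_CACHE = {"df": None, "fetched_at": 0.0}


def get_result_data_cached(ttl_seconds=300):
    """Like get_result_data(), but re-fetches the sheet at most every ttl_seconds."""
    now = time.time()
    if _RESULTS_CACHE["df"] is None or now - _RESULTS_CACHE["fetched_at"] > ttl_seconds:
        _RESULTS_CACHE["df"] = pd.read_csv(GOOGLE_SHEET_URL)
        _RESULTS_CACHE["fetched_at"] = now
    return _RESULTS_CACHE["df"].copy()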


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[col["type"] for col in AutoEvalColumn.__dict__.values() if isinstance(col, dict)],
        select_columns=SelectColumns(
            default_selection=[
                col["name"]
                for col in AutoEvalColumn.__dict__.values()
                if isinstance(col, dict) and col["displayed_by_default"]
            ],
            cant_deselect=[
                col["name"]
                for col in AutoEvalColumn.__dict__.values()
                if isinstance(col, dict) and col.get("never_hidden", False)
            ],
            label="Select Columns to Display:",
        ),
        search_columns=["Model"],
        filter_columns=[
            ColumnFilter("Open Source?", type="checkboxgroup", label="Open Source?"),
            ColumnFilter("Model Size", type="checkboxgroup", label="Model Size"),
        ],
        interactive=False,
    )
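# Note (assumption about the sheet's schema): ColumnFilter filters on DataFrame
# column headers, so the sheet must contain columns named exactly
# "Open Source?" and "Model Size" for the checkbox filters above to populate.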


def format_model_link(row):
    """Format the model name as an HTML link if a URL is available."""
    model_name = row["Model"]
    url = row.get("URL", "")
    if pd.notna(url) and url.strip():
        return f'<a href="{url}" target="_blank">{model_name}</a>'
    return model_name
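

# Hypothetical usage (the row values are made up for illustration):
#   format_model_link(pd.Series({"Model": "Some-VLM", "URL": "https://example.com"}))
#   -> '<a href="https://example.com" target="_blank">Some-VLM</a>'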


# Initialize the Gradio interface
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT)

    with gr.Tabs() as tabs:
        with gr.TabItem("Leaderboard"):
            # Load the latest results from the Google Sheet
            df = get_result_data()
            df["Model"] = df.apply(format_model_link, axis=1)
            del df["URL"]
            df = df.sort_values("Overall Consistency", ascending=False)
            leaderboard = init_leaderboard(df)
        with gr.TabItem("Data Viewer"):
            dataset_split, sample_idx = create_data_viewer()
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(ABOUT_TEXT)

demo.launch()
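
# Deployment note (assumptions about the surrounding repo, not code from above):
# on a Hugging Face Space this file would typically be app.py, with gradio,
# pandas, and gradio_leaderboard pinned in requirements.txt and data_reviewer.py
# (providing create_data_viewer) checked in alongside it.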