Spaces:
Running
Running
import gradio as gr | |
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter | |
import config | |
from envs import RESULTS_REPO_ID, REPO_ID, API | |
from pathlib import Path | |
import pandas as pd | |
import os | |
from utils import parse_json_files, create_scatter_plot | |
from huggingface_hub import snapshot_download | |
def restart_space():
    """Restart the Space identified by ``REPO_ID``.

    Delegates to ``restart_space`` on the ``API`` object imported from
    ``envs``; nothing is returned.
    """
    space_repo = REPO_ID
    API.restart_space(repo_id=space_repo)
# Directory that contains this file; bundled assets and results live beside it.
abs_path = Path(__file__).parent
# Local folder the evaluation results are downloaded into and parsed from.
evals_dir = abs_path / "evals"

if __name__ == "__main__":
    # Download the results from the Hugging Face Hub *before* the UI is built.
    # The leaderboard dataframe is parsed from ``evals_dir`` while the Blocks
    # below are constructed, so downloading afterwards would make the first
    # launch render stale (or missing) results. The guard keeps a plain
    # ``import`` of this module free of network access, as before.
    snapshot_download(
        RESULTS_REPO_ID,
        local_dir=evals_dir,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        max_workers=4,
    )


def _render_benchmark_tab(tab_label: str, results_df, on_load_columns, search_columns) -> None:
    """Add one benchmark tab: a cost/accuracy scatter plot next to a leaderboard.

    Intended to be called while a ``gr.Tabs()`` context is open so the new tab
    attaches to it.

    Args:
        tab_label: Text shown on the tab.
        results_df: Parsed results passed both to ``create_scatter_plot`` and
            to the ``Leaderboard`` component. It must provide the
            ``agent_name``, ``results_accuracy`` and ``results_total_cost``
            columns referenced below.
        on_load_columns: Columns selected by default in the leaderboard.
        search_columns: Columns the leaderboard search box looks at.
    """
    with gr.Tab(tab_label):
        with gr.Row():
            with gr.Column(scale=1):
                gr.Plot(
                    create_scatter_plot(
                        results_df,
                        "results_total_cost",
                        "results_accuracy",
                        "Cost",
                        "Accuracy",
                        ["agent_name"],
                    )
                )
            with gr.Column(scale=1):
                Leaderboard(
                    value=results_df,
                    select_columns=SelectColumns(
                        default_selection=on_load_columns,
                        cant_deselect=["agent_name"],
                        label="Select Columns to Display:",
                    ),
                    search_columns=search_columns,
                    column_widths={
                        "agent_name": 40,
                        "results_accuracy": 20,
                        "results_total_cost": 20,
                    },
                )


with gr.Blocks() as demo:
    gr.Markdown("""
    # 🥇 Agent Leaderboard
    """)

    # ``parse_json_files`` receives a plain string path, exactly as the
    # previous ``os.path.join`` call produced.
    df = parse_json_files(str(evals_dir))

    with gr.Tabs():
        _render_benchmark_tab(
            "SWE-Bench",
            df,
            config.SWEBENCH_ON_LOAD_COLUMNS,
            config.SWEBENCH_SEARCH_COLUMNS,
        )
        # NOTE(review): the USACO tab shows the same dataframe and the same
        # SWE-bench column configuration as the tab above. This looks like a
        # placeholder -- confirm whether USACO needs its own data and config.
        _render_benchmark_tab(
            "USACO",
            df,
            config.SWEBENCH_ON_LOAD_COLUMNS,
            config.SWEBENCH_SEARCH_COLUMNS,
        )
        with gr.Tab("About"):
            # Explicit encoding so the page renders the same regardless of the
            # host locale.
            gr.Markdown((abs_path / "about.md").read_text(encoding="utf-8"))

if __name__ == "__main__":
    demo.launch()