"""Gradio leaderboard app for agent benchmarks.

Builds a tabbed UI (USACO, SWE-Bench Verified, SWE-Bench Lite, MLAgentBench),
each tab showing a leaderboard, a cost/accuracy scatter plot, a failure
report, a per-task agent monitor and a raw-prediction browser. All data is
served from a TracePreprocessor database; an hourly scheduler restarts the
Space and refreshes results from the Hugging Face Hub.
"""

import asyncio
import json
import os
import re
from datetime import datetime
from pathlib import Path

import gradio as gr
import markdown
import pandas as pd
import weave
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from huggingface_hub import snapshot_download

import config
from envs import API, HF_TOKEN, REPO_ID, RESULTS_REPO_ID
from utils.db import TracePreprocessor
from utils.processing import check_and_process_uploads
from utils.viz import create_bar_chart, create_flow_chart, create_scatter_plot

# Single shared preprocessor instance; all data access goes through it.
preprocessor = TracePreprocessor()

abs_path = Path(__file__).parent


def restart_space():
    """Restart the hosting Hugging Face Space (run hourly by the scheduler)."""
    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)


def download_latest_results():
    """Download the latest results dataset snapshot into ``evals_upload``."""
    print("Downloading latest results...")
    snapshot_download(
        RESULTS_REPO_ID,
        local_dir="evals_upload",
        repo_type='dataset',
        tqdm_class=None,
        etag_timeout=30,
        max_workers=4,
    )
    print("Download complete.")


def get_analyzed_traces(agent_name, benchmark_name):
    """Return the analyzed traces (task_id -> analysis dict) for an agent."""
    return preprocessor.get_analyzed_traces(agent_name, benchmark_name)


def get_failure_report(agent_name, benchmark_name):
    """Return the failure report dict for an agent (falsy if none exists)."""
    return preprocessor.get_failure_report(agent_name, benchmark_name)


def parse_json_files(folder_path, benchmark_name):
    """Return the parsed results DataFrame for ``benchmark_name``.

    ``folder_path`` is unused but kept for interface compatibility with
    existing call sites; results now come from the preprocessor database.
    """
    return preprocessor.get_parsed_results(benchmark_name)


def get_best_agent(benchmark_name, metric):
    """Return the name of the agent with the highest ``metric`` value."""
    df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
    return df.loc[df[metric].idxmax()]['Agent Name']


def update_agent_dropdown(benchmark_name, metric):
    """Build an agent dropdown listing all agents, preselecting the best one."""
    df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
    agents = df['Agent Name'].tolist()
    best_agent = get_best_agent(benchmark_name, metric)
    return gr.Dropdown(choices=agents, value=best_agent, label="Select Agent")


def update_task_analysis(benchmark_name, agent_name):
    """Refresh the agent-monitor section when an agent is selected.

    Returns ``(overview_markdown, flow_chart, task_dropdown, status_text)``.
    On error paths the dropdown slot is ``None`` and a message is shown.
    """
    if not agent_name:
        return "Please select an agent.", None, None, ""

    analyzed_traces = get_analyzed_traces(agent_name, benchmark_name)
    if not analyzed_traces:
        return f"No analysis available for agent: {agent_name}", None, None, ""

    task_ids = list(analyzed_traces.keys())
    # Show the first task by default.
    overview, flow_chart, _ = update_task_details(benchmark_name, agent_name, task_ids[0])
    return (overview,
            flow_chart,
            gr.Dropdown(choices=task_ids, value=task_ids[0], label="Select Task"),
            "")


def update_task_details(benchmark_name, agent_name, task_id):
    """Return ``(overview_markdown, flow_chart_or_None, status_text)`` for a task."""
    if not task_id:
        return "Please select a task.", None, ""

    analyzed_traces = get_analyzed_traces(agent_name, benchmark_name)
    if not analyzed_traces or task_id not in analyzed_traces:
        return f"No analysis available for task: {task_id}", None, ""

    analysis = analyzed_traces[task_id]
    summary = analysis.get('task_analysis', {})

    overview = f"## Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
    # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
    # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
    # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"

    # Only draw the flow chart when a real analysis exists for this task.
    if summary.get('overview', 'No overview available.') != "Not available":
        flow_chart = create_flow_chart(analysis['steps'])
    else:
        flow_chart = None

    return overview, flow_chart, ""


def format_call_info(step, step_index):
    """Render one trace step (headline, inputs, outputs, usage) as HTML.

    NOTE(review): the original HTML markup of this function was lost when the
    file was extracted (tags were stripped out of the f-strings); the tags
    below are a reconstruction from the surviving text fragments — verify the
    styling against the deployed app.
    """
    call_data = step['call_data']
    analysis = step['analysis']

    def format_json(obj):
        # if isinstance(obj, dict) and 'choices' in obj:
        #     # Special handling for message content
        #     return format_message_content(obj['choices'][0])
        json_str = json.dumps(obj, indent=2)
        # Preserve indentation and line breaks when rendered as HTML.
        json_str = json_str.replace(' ', '&nbsp;')
        json_str = json_str.replace('\n', '<br>')
        return f'<div style="white-space: pre-wrap; word-wrap: break-word;">{json_str}</div>'

    # Currently not used but we can enable it to format message content
    def format_message_content(content):
        # Convert Markdown to HTML
        html_content = markdown.markdown(content)
        # Replace ```python code blocks with styled pre blocks
        html_content = re.sub(
            r'```python\n(.*?)```',
            lambda m: f'<pre><code class="language-python">{m.group(1)}</code></pre>',
            html_content,
            flags=re.DOTALL,
        )
        return html_content

    formatted_info = f"""
    <div>
      <h3>Step {step_index + 1}: {analysis.get('headline', '')}</h3>
      <h4>Call Metadata</h4>
      <h4>Inputs</h4>
      {format_json(call_data['inputs'])}
      <h4>Outputs</h4>
      {format_json(call_data['outputs'])}
      <h4>Usage</h4>
      {format_json(call_data['summary'])}
      <h4>Analysis</h4>
    </div>
    """
    return formatted_info


def update_failure_report(agent_name, benchmark_name):
    """Return ``(categories_overview_markdown, bar_chart)`` for an agent.

    Counts how many tasks fall into each failure category and charts the
    distribution. Category ids in ``task_classifications`` are assumed to be
    the 1-based string index of the category list.
    """
    failure_report = get_failure_report(agent_name, benchmark_name)
    if not failure_report:
        return "No failure report available for this agent.", None

    # Create overview of failure categories
    categories_overview = "## Failure Categories Overview\n\n"
    for category in failure_report['failure_categories']:
        categories_overview += f"### {category['category_name']}\n"
        categories_overview += f"{category['description']}\n\n"

    # Count tasks affected by each category
    category_counts = {}
    for task, classification in failure_report['task_classifications'].items():
        category_id = classification['category_id']
        category_counts[category_id] = category_counts.get(category_id, 0) + 1

    # Prepare data for bar chart
    categories = [cat['category_name'] for cat in failure_report['failure_categories']]
    counts = [category_counts.get(str(i + 1), 0) for i in range(len(categories))]

    # Create bar chart
    chart = create_bar_chart(categories, counts, "Failure Categories",
                             "Number of Affected Tasks", "Failure Categories Distribution")

    return categories_overview, chart


def _add_benchmark_tab(tab_title, benchmark_name, metric,
                       on_load_columns, hide_columns, search_columns,
                       task_label="Select Task"):
    """Build one benchmark tab (leaderboard, scatter plot, failure report,
    agent monitor, raw-prediction browser).

    Replaces four near-identical copy-pasted tab sections; all event handlers
    close over ``benchmark_name``/``metric``. Must be called inside the
    ``with gr.Blocks() as demo`` context so ``demo.load`` registrations work.
    """
    # Hidden textboxes are how constant values are fed into gradio handlers.
    def benchmark_box():
        return gr.Textbox(value=benchmark_name, visible=False)

    def metric_box():
        return gr.Textbox(value=metric, visible=False)

    with gr.Tab(tab_title):
        with gr.Row():
            with gr.Column(scale=2):
                Leaderboard(
                    value=parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name),
                    select_columns=SelectColumns(
                        default_selection=on_load_columns,
                        cant_deselect=["Agent Name"],
                        label="Select Columns to Display:",
                    ),
                    hide_columns=hide_columns,
                    search_columns=search_columns,
                    column_widths={"Agent Name": 40, metric: 20, "Total Cost": 20},
                )
        with gr.Row():
            gr.Plot(create_scatter_plot(
                parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name),
                "Total Cost", metric, "Total Cost (in USD)", metric, ["Agent Name"],
            ))

        gr.Markdown("# Failure Report")
        with gr.Row():
            with gr.Column(scale=1):
                failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
        with gr.Row():
            with gr.Column(scale=1):
                failure_categories_overview = gr.Markdown()
            with gr.Column(scale=1):
                failure_categories_chart = gr.Plot()

        # Initialize the failure report agent dropdown with all agents
        demo.load(update_agent_dropdown,
                  inputs=[benchmark_box(), metric_box()],
                  outputs=[failure_report_agent_dropdown])
        # Update failure report when agent is selected
        failure_report_agent_dropdown.change(
            update_failure_report,
            inputs=[failure_report_agent_dropdown, benchmark_box()],
            outputs=[failure_categories_overview, failure_categories_chart])

        gr.Markdown("# Agent Monitor")
        with gr.Row():
            with gr.Column(scale=1):
                agent_dropdown = gr.Dropdown(label="Select Agent")
            with gr.Column(scale=1):
                task_dropdown = gr.Dropdown(label=task_label)
        with gr.Row():
            task_overview = gr.Markdown()
        with gr.Row():
            flow_chart = gr.Plot(label="Task Flow")

        # Initialize the agent dropdown with the best agent
        demo.load(update_agent_dropdown,
                  inputs=[benchmark_box(), metric_box()],
                  outputs=[agent_dropdown])
        demo.load(update_task_analysis,
                  inputs=[benchmark_box(), agent_dropdown],
                  outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])

        agent_dropdown.change(
            update_task_analysis,
            inputs=[benchmark_box(), agent_dropdown],
            outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
        task_dropdown.change(
            update_task_details,
            inputs=[benchmark_box(), agent_dropdown, task_dropdown],
            outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])

        gr.Markdown("# Raw Predictions")
        with gr.Row():
            with gr.Column(scale=1):
                raw_agent_dropdown = gr.Dropdown(label="Select Agent")
            with gr.Column(scale=1):
                raw_task_dropdown = gr.Dropdown(label="Select Task")
            with gr.Column(scale=1):
                raw_step_dropdown = gr.Dropdown(label="Select Step")
        with gr.Row():
            raw_call_details = gr.HTML()

        def update_raw_task_dropdown(agent_name):
            # Populate task and step dropdowns for the selected agent and
            # render the first step's call details.
            analyzed_traces = get_analyzed_traces(agent_name, benchmark_name)
            if not analyzed_traces:
                return (gr.Dropdown(choices=[], label="Select Task"),
                        gr.Dropdown(choices=[], label="Select Step"),
                        f"No raw predictions data available for agent: {agent_name}.")
            task_ids = list(analyzed_traces.keys())
            steps = analyzed_traces[task_ids[0]]['steps']
            return (gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]),
                    gr.Dropdown(choices=[(f"Step {i + 1}", i) for i in range(len(steps))],
                                label="Select Step", value=0),
                    update_raw_call_details(agent_name, task_ids[0], 0))

        def update_raw_step_dropdown(agent_name, task_id):
            # Populate the step dropdown when a task is selected.
            analyzed_traces = get_analyzed_traces(agent_name, benchmark_name)
            if not analyzed_traces or task_id not in analyzed_traces:
                return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
            steps = analyzed_traces[task_id]['steps']
            return (gr.Dropdown(choices=[(f"Step {i + 1}", i) for i in range(len(steps))],
                                label="Select Step", value=0),
                    format_call_info(steps[0], 0))

        def update_raw_call_details(agent_name, task_id, step_index):
            # Render the call-details HTML for the selected step.
            analyzed_traces = get_analyzed_traces(agent_name, benchmark_name)
            if not analyzed_traces or task_id not in analyzed_traces:
                return "No data available for this selection."
            steps = analyzed_traces[task_id]['steps']
            if step_index is None:
                return "Invalid step selection."
            step = steps[step_index]
            return format_call_info(step, step_index)

        # Initialize the raw agent dropdown with all agents
        demo.load(update_agent_dropdown,
                  inputs=[benchmark_box(), metric_box()],
                  outputs=[raw_agent_dropdown])
        demo.load(update_raw_task_dropdown,
                  inputs=[raw_agent_dropdown],
                  outputs=[raw_task_dropdown, raw_step_dropdown])
        demo.load(update_raw_call_details,
                  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                  outputs=[raw_call_details])

        raw_agent_dropdown.change(
            update_raw_task_dropdown,
            inputs=[raw_agent_dropdown],
            outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
        raw_task_dropdown.change(
            update_raw_step_dropdown,
            inputs=[raw_agent_dropdown, raw_task_dropdown],
            outputs=[raw_step_dropdown, raw_call_details])
        raw_step_dropdown.change(
            update_raw_call_details,
            inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
            outputs=[raw_call_details])


with gr.Blocks() as demo:
    gr.Markdown("""
    # 🥇 Agent Leaderboard
    """)
    with gr.Tabs():
        _add_benchmark_tab("USACO", "usaco", "Accuracy",
                           config.USACO_ON_LOAD_COLUMNS,
                           config.USACO_HIDE_COLUMNS,
                           config.USACO_SEARCH_COLUMNS,
                           task_label="Select USACO Task")
        _add_benchmark_tab("SWE-Bench Verified", "swebench_verified", "Accuracy",
                           config.SWEBENCH_ON_LOAD_COLUMNS,
                           config.SWEBENCH_HIDE_COLUMNS,
                           config.SWEBENCH_SEARCH_COLUMNS,
                           task_label="Select SWE-Bench Task")
        _add_benchmark_tab("SWE-Bench Lite", "swebench_lite", "Accuracy",
                           config.SWEBENCH_ON_LOAD_COLUMNS,
                           config.SWEBENCH_HIDE_COLUMNS,
                           config.SWEBENCH_SEARCH_COLUMNS,
                           task_label="Select SWE-Bench Task")
        _add_benchmark_tab("MLAgentBench", "mlagentbench", "Overall Score",
                           config.MLAGENTBENCH_ON_LOAD_COLUMNS,
                           config.MLAGENTBENCH_HIDE_COLUMNS,
                           config.MLAGENTBENCH_SEARCH_COLUMNS,
                           task_label="Select Task")
        with gr.Tab("About"):
            gr.Markdown((Path(__file__).parent / "about.md").read_text())


async def main():
    """Start the hourly maintenance scheduler and launch the app."""
    # Preprocess traces
    # preprocessor.preprocess_traces('evals_live')
    # # Download the results from the Hugging Face Hub
    # await asyncio.to_thread(download_latest_results)
    # # Check for new uploads and process them
    # await check_and_process_uploads()

    scheduler = AsyncIOScheduler()
    scheduler.add_job(restart_space, "interval", hours=1)
    scheduler.add_job(download_latest_results, "interval", hours=1)
    # scheduler.add_job(check_and_process_uploads, "interval", hours=1)
    scheduler.start()

    await demo.launch()


if __name__ == "__main__":
    weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
    asyncio.run(main())