Spaces:
Running
Running
Commit
·
0b3117f
1
Parent(s):
3fa7903
added initial version of visibility feature and fixed automatic update of results every hour
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ from envs import RESULTS_REPO_ID, REPO_ID, API, HF_TOKEN
|
|
5 |
from pathlib import Path
|
6 |
import pandas as pd
|
7 |
import os
|
|
|
8 |
from utils import parse_json_files, create_scatter_plot
|
9 |
from huggingface_hub import snapshot_download
|
10 |
from apscheduler.schedulers.background import BackgroundScheduler
|
@@ -12,8 +13,84 @@ from apscheduler.schedulers.background import BackgroundScheduler
|
|
12 |
def restart_space():
|
13 |
API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
abs_path = Path(__file__).parent
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
with gr.Blocks() as demo:
|
18 |
gr.Markdown("""
|
19 |
# 🥇 Agent Leaderboard
|
@@ -54,20 +131,27 @@ with gr.Blocks() as demo:
|
|
54 |
"results_accuracy": 20,
|
55 |
"results_total_cost": 20},
|
56 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
with gr.Tab("About"):
|
58 |
gr.Markdown((Path(__file__).parent / "about.md").read_text())
|
59 |
|
60 |
if __name__ == "__main__":
|
61 |
# Download the results from the Hugging Face Hub
|
62 |
-
|
63 |
-
local_dir=abs_path / "evals",
|
64 |
-
repo_type='dataset',
|
65 |
-
tqdm_class=None,
|
66 |
-
etag_timeout=30,
|
67 |
-
max_workers=4,
|
68 |
-
)
|
69 |
|
70 |
scheduler = BackgroundScheduler()
|
71 |
scheduler.add_job(restart_space, "interval", hours=1) # restarted every 1h
|
|
|
72 |
scheduler.start()
|
73 |
demo.launch()
|
|
|
5 |
from pathlib import Path
|
6 |
import pandas as pd
|
7 |
import os
|
8 |
+
import json
|
9 |
from utils import parse_json_files, create_scatter_plot
|
10 |
from huggingface_hub import snapshot_download
|
11 |
from apscheduler.schedulers.background import BackgroundScheduler
|
|
|
13 |
def restart_space():
    """Ask the Hub API to restart the Space identified by ``REPO_ID``.

    Authenticates the request with ``HF_TOKEN``.
    """
    API.restart_space(token=HF_TOKEN, repo_id=REPO_ID)
|
15 |
|
16 |
+
# New function to download results
|
17 |
+
def download_latest_results():
    """Fetch a snapshot of the results dataset into the local ``evals`` folder.

    Downloads the ``RESULTS_REPO_ID`` dataset repository with
    ``snapshot_download`` into ``abs_path / "evals"`` and prints a progress
    message before and after the transfer.
    """
    print("Downloading latest results...")
    evals_dir = abs_path / "evals"
    snapshot_download(
        RESULTS_REPO_ID,
        repo_type='dataset',
        local_dir=evals_dir,
        max_workers=4,
        etag_timeout=30,
        tqdm_class=None,
    )
    print("Download complete.")
|
27 |
+
|
28 |
abs_path = Path(__file__).parent
|
29 |
|
30 |
+
|
31 |
+
# Load the per-task trace analyses from evals/usaco_traces/task_analyses.json.
# This runs at import time, i.e. before the download performed in the
# __main__ block, so fetch the results first when the file is not on disk yet
# instead of failing with FileNotFoundError.
_task_analyses_path = abs_path / "evals" / "usaco_traces" / "task_analyses.json"
if not _task_analyses_path.exists():
    download_latest_results()

# JSON is read as UTF-8 explicitly rather than with the locale default.
with open(_task_analyses_path, "r", encoding="utf-8") as f:
    analyzed_traces = json.load(f)
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
def update_task_analysis(task_id):
    """Build the overview text and step choices for the selected task.

    Args:
        task_id: Key into the module-level ``analyzed_traces`` mapping.

    Returns:
        A 3-tuple: the overview Markdown (or an error message), the value
        for the "Agent Steps" dropdown (a ``gr.Dropdown`` carrying the step
        choices on success, ``[]`` otherwise), and an empty string for the
        step-details output.
    """
    if task_id not in analyzed_traces:
        return "No analysis available for this task.", [], ""

    analysis = analyzed_traces[task_id]
    summary = analysis['summary']

    # The summary may be stored as a JSON-encoded string; decode it first.
    if isinstance(summary, str):
        try:
            summary = json.loads(summary)
        except json.JSONDecodeError:
            return "Error: Unable to parse summary data.", [], ""

    # Checked after decoding as well: a JSON string can decode to a list,
    # number, etc., and the .get() calls below require a dict.
    if not isinstance(summary, dict):
        return "Error: Summary data is in an unexpected format.", [], ""

    overview = (
        f"# Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
        f"## Successes\n{summary.get('successes', 'No successes listed.')}\n\n"
        f"## Challenges\n{summary.get('challenges', 'No challenges listed.')}\n\n"
    )

    # Each choice is a (label, value) pair: one-based label, zero-based index.
    steps = [(f"Step {i + 1}", i) for i, _ in enumerate(analysis['steps'])]

    return overview, gr.Dropdown(choices=steps, label="Agent Steps"), ""
|
59 |
+
|
60 |
+
def update_step_details(task_id, step_index):
    """Render the Markdown details for one step of an analyzed task.

    Args:
        task_id: Key into the module-level ``analyzed_traces`` mapping.
        step_index: The selected step. Accepted forms are a zero-based int,
            a ``(label, index)`` tuple, a label string ending in a one-based
            step number (e.g. ``"Step 3"``), or ``None`` when nothing is
            selected.

    Returns:
        A Markdown string with the step's description and assessment, or a
        short message explaining why no details can be shown.
    """
    if task_id not in analyzed_traces:
        return "No analysis available for this task."

    if step_index is None:
        return "Please select a step to view details."

    steps = analyzed_traces[task_id]['steps']

    # Normalize the accepted selection forms to a zero-based int.
    if isinstance(step_index, tuple):
        step_index = step_index[1]
    elif isinstance(step_index, str):
        try:
            step_index = int(step_index.split()[-1]) - 1
        except (ValueError, IndexError):
            # Empty label or no trailing number: report it instead of raising.
            return f"Invalid step index: {step_index}"

    if not 0 <= step_index < len(steps):
        return f"Invalid step index: {step_index}"

    step = steps[step_index]
    analysis = step['analysis']

    # The analysis may be stored as a JSON-encoded string; decode it first.
    if isinstance(analysis, str):
        try:
            analysis = json.loads(analysis)
        except json.JSONDecodeError:
            return "Error: Unable to parse step analysis data."

    # Checked after decoding as well: a JSON string can decode to a list,
    # number, etc., and the .get() calls below require a dict.
    if not isinstance(analysis, dict):
        return "Error: Step analysis data is in an unexpected format."

    return (
        f"# Step {step_index + 1} Details\n\n"
        f"## Description\n{analysis.get('description', 'No description available.')}\n\n"
        f"## Assessment\n{analysis.get('assessment', 'No assessment available.')}\n\n"
    )
|
93 |
+
|
94 |
with gr.Blocks() as demo:
|
95 |
gr.Markdown("""
|
96 |
# 🥇 Agent Leaderboard
|
|
|
131 |
"results_accuracy": 20,
|
132 |
"results_total_cost": 20},
|
133 |
)
|
134 |
+
gr.Markdown("## USACO Task Trace Explorer")
|
135 |
+
with gr.Row():
|
136 |
+
with gr.Column(scale=1):
|
137 |
+
task_dropdown = gr.Dropdown(choices=list(analyzed_traces.keys()), label="Select USACO Task")
|
138 |
+
task_overview = gr.Markdown()
|
139 |
+
with gr.Column(scale=1):
|
140 |
+
steps_dropdown = gr.Dropdown(label="Agent Steps")
|
141 |
+
step_details = gr.Markdown()
|
142 |
+
|
143 |
+
task_dropdown.change(update_task_analysis, inputs=[task_dropdown], outputs=[task_overview, steps_dropdown, step_details])
|
144 |
+
steps_dropdown.change(update_step_details, inputs=[task_dropdown, steps_dropdown], outputs=[step_details])
|
145 |
+
|
146 |
with gr.Tab("About"):
|
147 |
gr.Markdown((Path(__file__).parent / "about.md").read_text())
|
148 |
|
149 |
if __name__ == "__main__":
    # Pull the results from the Hugging Face Hub once before serving.
    download_latest_results()

    # Register both hourly jobs: the Space restart first, then the
    # results download.
    scheduler = BackgroundScheduler()
    for hourly_job in (restart_space, download_latest_results):
        scheduler.add_job(hourly_job, "interval", hours=1)
    scheduler.start()

    demo.launch()
|