benediktstroebl committed on
Commit
0b3117f
·
1 Parent(s): 3fa7903

added initial version of visibility feature and fixed automatic update of results every hour

Browse files
Files changed (1) hide show
  1. app.py +91 -7
app.py CHANGED
@@ -5,6 +5,7 @@ from envs import RESULTS_REPO_ID, REPO_ID, API, HF_TOKEN
5
  from pathlib import Path
6
  import pandas as pd
7
  import os
 
8
  from utils import parse_json_files, create_scatter_plot
9
  from huggingface_hub import snapshot_download
10
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -12,8 +13,84 @@ from apscheduler.schedulers.background import BackgroundScheduler
12
  def restart_space():
13
  API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
14
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  abs_path = Path(__file__).parent
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  with gr.Blocks() as demo:
18
  gr.Markdown("""
19
  # 🥇 Agent Leaderboard
@@ -54,20 +131,27 @@ with gr.Blocks() as demo:
54
  "results_accuracy": 20,
55
  "results_total_cost": 20},
56
  )
 
 
 
 
 
 
 
 
 
 
 
 
57
  with gr.Tab("About"):
58
  gr.Markdown((Path(__file__).parent / "about.md").read_text())
59
 
60
  if __name__ == "__main__":
61
  # Download the results from the Hugging Face Hub
62
- snapshot_download(RESULTS_REPO_ID,
63
- local_dir=abs_path / "evals",
64
- repo_type='dataset',
65
- tqdm_class=None,
66
- etag_timeout=30,
67
- max_workers=4,
68
- )
69
 
70
  scheduler = BackgroundScheduler()
71
  scheduler.add_job(restart_space, "interval", hours=1) # restarted every 1h
 
72
  scheduler.start()
73
  demo.launch()
 
5
  from pathlib import Path
6
  import pandas as pd
7
  import os
8
+ import json
9
  from utils import parse_json_files, create_scatter_plot
10
  from huggingface_hub import snapshot_download
11
  from apscheduler.schedulers.background import BackgroundScheduler
 
13
def restart_space():
    """Restart this Hugging Face Space via the Hub API (used as a scheduled job)."""
    API.restart_space(token=HF_TOKEN, repo_id=REPO_ID)
15
 
16
# Scheduled job: refresh the local copy of the results dataset.
def download_latest_results():
    """Download the latest results dataset snapshot from the Hugging Face Hub
    into the local ``evals`` directory."""
    print("Downloading latest results...")
    snapshot_download(
        RESULTS_REPO_ID,
        repo_type='dataset',
        local_dir=abs_path / "evals",
        max_workers=4,
        etag_timeout=30,
        tqdm_class=None,
    )
    print("Download complete.")
27
+
28
  abs_path = Path(__file__).parent
29
 
30
+
31
# Load the pre-computed per-task trace analyses from evals/usaco_traces.
# Use pathlib throughout instead of mixing os.path.join with a Path object.
# NOTE(review): assumes evals/usaco_traces/task_analyses.json already exists
# locally (see download_latest_results); module import fails otherwise.
with (abs_path / "evals" / "usaco_traces" / "task_analyses.json").open("r") as f:
    analyzed_traces = json.load(f)
34
+
35
+
36
+
37
def update_task_analysis(task_id):
    """Build the overview markdown and step dropdown for a selected USACO task.

    Returns a ``(overview_markdown, steps_dropdown, step_details)`` triple;
    the last element is always empty to clear any previously shown step.
    """
    if task_id not in analyzed_traces:
        return "No analysis available for this task.", [], ""

    analysis = analyzed_traces[task_id]
    summary = analysis['summary']

    # The stored summary may be a raw JSON string; normalize it to a dict.
    if isinstance(summary, str):
        try:
            summary = json.loads(summary)
        except json.JSONDecodeError:
            return "Error: Unable to parse summary data.", [], ""
    elif not isinstance(summary, dict):
        return "Error: Summary data is in an unexpected format.", [], ""

    sections = [
        f"# Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n",
        f"## Successes\n{summary.get('successes', 'No successes listed.')}\n\n",
        f"## Challenges\n{summary.get('challenges', 'No challenges listed.')}\n\n",
    ]
    overview = "".join(sections)

    # One (label, index) choice per recorded agent step.
    step_choices = [(f"Step {idx + 1}", idx) for idx, _ in enumerate(analysis['steps'])]

    return overview, gr.Dropdown(choices=step_choices, label="Agent Steps"), ""
59
+
60
def update_step_details(task_id, step_index):
    """Render the markdown detail view for one agent step of a USACO task.

    ``step_index`` may arrive from the dropdown as an int, a ``(label, index)``
    tuple, or a ``"Step N"`` label string; all three forms are normalized.
    """
    if task_id not in analyzed_traces:
        return "No analysis available for this task."
    if step_index is None:
        return "Please select a step to view details."

    steps = analyzed_traces[task_id]['steps']

    # Normalize the dropdown value to a zero-based integer index.
    if isinstance(step_index, tuple):
        step_index = step_index[1]
    elif isinstance(step_index, str):
        step_index = int(step_index.split()[-1]) - 1

    if not (0 <= step_index < len(steps)):
        return f"Invalid step index: {step_index}"

    analysis = steps[step_index]['analysis']

    # The stored analysis may be a raw JSON string; normalize it to a dict.
    if isinstance(analysis, str):
        try:
            analysis = json.loads(analysis)
        except json.JSONDecodeError:
            return "Error: Unable to parse step analysis data."
    elif not isinstance(analysis, dict):
        return "Error: Step analysis data is in an unexpected format."

    parts = [
        f"# Step {step_index + 1} Details\n\n",
        f"## Description\n{analysis.get('description', 'No description available.')}\n\n",
        f"## Assessment\n{analysis.get('assessment', 'No assessment available.')}\n\n",
    ]
    return "".join(parts)
93
+
94
  with gr.Blocks() as demo:
95
  gr.Markdown("""
96
  # 🥇 Agent Leaderboard
 
131
  "results_accuracy": 20,
132
  "results_total_cost": 20},
133
  )
134
+ gr.Markdown("## USACO Task Trace Explorer")
135
+ with gr.Row():
136
+ with gr.Column(scale=1):
137
+ task_dropdown = gr.Dropdown(choices=list(analyzed_traces.keys()), label="Select USACO Task")
138
+ task_overview = gr.Markdown()
139
+ with gr.Column(scale=1):
140
+ steps_dropdown = gr.Dropdown(label="Agent Steps")
141
+ step_details = gr.Markdown()
142
+
143
+ task_dropdown.change(update_task_analysis, inputs=[task_dropdown], outputs=[task_overview, steps_dropdown, step_details])
144
+ steps_dropdown.change(update_step_details, inputs=[task_dropdown, steps_dropdown], outputs=[step_details])
145
+
146
  with gr.Tab("About"):
147
  gr.Markdown((Path(__file__).parent / "about.md").read_text())
148
 
149
  if __name__ == "__main__":
150
  # Download the results from the Hugging Face Hub
151
+ download_latest_results()
 
 
 
 
 
 
152
 
153
  scheduler = BackgroundScheduler()
154
  scheduler.add_job(restart_space, "interval", hours=1) # restarted every 1h
155
+ scheduler.add_job(download_latest_results, "interval", hours=1) # download latest results every 1h
156
  scheduler.start()
157
  demo.launch()