benediktstroebl committed on
Commit
b7d1f08
·
1 Parent(s): 9d2915b

added verified agents management and column and fixed widths

Browse files
Files changed (4) hide show
  1. about.md +0 -3
  2. app.py +6 -18
  3. utils/db.py +18 -0
  4. verified_agents.yaml +31 -0
about.md CHANGED
@@ -1,4 +1 @@
1
-
2
- # Agent leaderboard
3
-
4
  Coming soon...
 
 
 
 
1
  Coming soon...
app.py CHANGED
@@ -227,15 +227,12 @@ with gr.Blocks() as demo:
227
  Leaderboard(
228
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'),
229
  select_columns=SelectColumns(
230
- default_selection=config.USACO_ON_LOAD_COLUMNS,
231
  cant_deselect=["Agent Name"],
232
  label="Select Columns to Display:",
233
  ),
234
  hide_columns=config.USACO_HIDE_COLUMNS,
235
  # search_columns=config.USACO_SEARCH_COLUMNS,
236
- column_widths={"Agent Name": 40,
237
- "Accuracy": 20,
238
- "Total Cost": 20},
239
  )
240
  with gr.Row():
241
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
@@ -359,15 +356,12 @@ with gr.Blocks() as demo:
359
  Leaderboard(
360
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
361
  select_columns=SelectColumns(
362
- default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
363
  cant_deselect=["Agent Name"],
364
  label="Select Columns to Display:",
365
  ),
366
  hide_columns=config.SWEBENCH_HIDE_COLUMNS,
367
- # search_columns=config.SWEBENCH_SEARCH_COLUMNS,
368
- column_widths={"Agent Name": 40,
369
- "Accuracy": 20,
370
- "Total Cost": 20},
371
  )
372
  with gr.Row():
373
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
@@ -490,15 +484,12 @@ with gr.Blocks() as demo:
490
  Leaderboard(
491
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
492
  select_columns=SelectColumns(
493
- default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
494
  cant_deselect=["Agent Name"],
495
  label="Select Columns to Display:",
496
  ),
497
  # search_columns=config.SWEBENCH_SEARCH_COLUMNS,
498
- hide_columns=config.SWEBENCH_HIDE_COLUMNS,
499
- column_widths={"Agent Name": 40,
500
- "Accuracy": 20,
501
- "Total Cost": 20},
502
  )
503
  with gr.Row():
504
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
@@ -621,15 +612,12 @@ with gr.Blocks() as demo:
621
  Leaderboard(
622
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'),
623
  select_columns=SelectColumns(
624
- default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS,
625
  cant_deselect=["Agent Name"],
626
  label="Select Columns to Display:",
627
  ),
628
  # search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
629
  hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
630
- column_widths={"Agent Name": 40,
631
- "Overall Score": 20,
632
- "Total Cost": 20},
633
  )
634
  with gr.Row():
635
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
 
227
  Leaderboard(
228
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'),
229
  select_columns=SelectColumns(
230
+ default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
231
  cant_deselect=["Agent Name"],
232
  label="Select Columns to Display:",
233
  ),
234
  hide_columns=config.USACO_HIDE_COLUMNS,
235
  # search_columns=config.USACO_SEARCH_COLUMNS,
 
 
 
236
  )
237
  with gr.Row():
238
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
356
  Leaderboard(
357
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
358
  select_columns=SelectColumns(
359
+ default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
360
  cant_deselect=["Agent Name"],
361
  label="Select Columns to Display:",
362
  ),
363
  hide_columns=config.SWEBENCH_HIDE_COLUMNS,
364
+ # search_columns=config.SWEBENCH_SEARCH_COLUMNS
 
 
 
365
  )
366
  with gr.Row():
367
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
484
  Leaderboard(
485
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
486
  select_columns=SelectColumns(
487
+ default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
488
  cant_deselect=["Agent Name"],
489
  label="Select Columns to Display:",
490
  ),
491
  # search_columns=config.SWEBENCH_SEARCH_COLUMNS,
492
+ hide_columns=config.SWEBENCH_HIDE_COLUMNS
 
 
 
493
  )
494
  with gr.Row():
495
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
612
  Leaderboard(
613
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'),
614
  select_columns=SelectColumns(
615
+ default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"],
616
  cant_deselect=["Agent Name"],
617
  label="Select Columns to Display:",
618
  ),
619
  # search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
620
  hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
 
 
 
621
  )
622
  with gr.Row():
623
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
utils/db.py CHANGED
@@ -6,6 +6,7 @@ from functools import lru_cache
6
  import threading
7
  import pandas as pd
8
  import ast
 
9
 
10
  class TracePreprocessor:
11
  def __init__(self, db_path='preprocessed_traces.db'):
@@ -160,6 +161,12 @@ class TracePreprocessor:
160
  '''
161
  df = pd.read_sql_query(query, conn, params=(benchmark_name,))
162
 
 
 
 
 
 
 
163
  # Round float columns to 3 decimal places
164
  float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score']
165
  for column in float_columns:
@@ -229,6 +236,17 @@ class TracePreprocessor:
229
  })
230
 
231
  return df
 
 
 
 
 
 
 
 
 
 
 
232
 
233
  if __name__ == '__main__':
234
  preprocessor = TracePreprocessor()
 
6
  import threading
7
  import pandas as pd
8
  import ast
9
+ import yaml
10
 
11
  class TracePreprocessor:
12
  def __init__(self, db_path='preprocessed_traces.db'):
 
161
  '''
162
  df = pd.read_sql_query(query, conn, params=(benchmark_name,))
163
 
164
+ # Load verified agents
165
+ verified_agents = self.load_verified_agents()
166
+
167
+ # Add 'Verified' column
168
+ df['Verified'] = df.apply(lambda row: '✓' if (benchmark_name, row['agent_name']) in verified_agents else '', axis=1)
169
+
170
  # Round float columns to 3 decimal places
171
  float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score']
172
  for column in float_columns:
 
236
  })
237
 
238
  return df
239
+
240
def load_verified_agents(self, file_path='verified_agents.yaml'):
    """Return the set of verified (benchmark, agent_name) pairs.

    The YAML file is expected to map each benchmark name to a list of
    entries, each of which carries an ``agent_name`` key.

    Args:
        file_path: Path to the YAML file. A relative path is resolved
            against the current working directory by ``open``.

    Returns:
        A set of ``(benchmark, agent_name)`` tuples. The set is empty when
        the file contains no document or no entries.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If an entry lacks the ``agent_name`` key.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        # safe_load yields None for an empty (or comment-only) document.
        verified_data = yaml.safe_load(f) or {}

    verified_agents = set()
    for benchmark, agents in verified_data.items():
        # A benchmark key with no entries parses as None; treat it as empty.
        verified_agents.update(
            (benchmark, agent['agent_name']) for agent in agents or ()
        )

    return verified_agents
250
 
251
  if __name__ == '__main__':
252
  preprocessor = TracePreprocessor()
verified_agents.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file contains information about verified agent results for different benchmarks.
2
+ # Format:
3
+ # benchmark_name:
4
+ # - agent_name: "Name of the agent"
5
+ # verification_date: YYYY-MM-DD
6
+
7
+ usaco:
8
+ - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
9
+ verification_date: 2024-08-20
10
+ - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
11
+ verification_date: 2024-08-20
12
+ - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
13
+ verification_date: 2024-08-20
14
+ - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
15
+ verification_date: 2024-08-12
16
+ - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
17
+ verification_date: 2024-08-20
18
+ - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
19
+ verification_date: 2024-08-11
20
+ - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
21
+ verification_date: 2024-08-12
22
+
23
+ swebench_verified:
24
+ - agent_name: "Agentless (gpt-4o-mini-2024-07-18) (50 Instances)"
25
+ verification_date: 2024-08-17
26
+ - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1) (50 Instances)"
27
+ verification_date: 2024-08-19
28
+
29
+ mlagentbench:
30
+ - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
31
+ verification_date: 2024-08-19