benediktstroebl committed on
Commit
1783518
·
1 Parent(s): 356b0eb

added automatic download for results

Browse files
app.py CHANGED
@@ -1,13 +1,24 @@
1
  import gradio as gr
2
  from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter
3
  import config
 
4
  from pathlib import Path
5
  import pandas as pd
6
  import os
7
  from utils import parse_json_files, create_scatter_plot
 
8
 
9
  abs_path = Path(__file__).parent
10
 
 
 
 
 
 
 
 
 
 
11
  df = parse_json_files(os.path.join(abs_path, "evals"))
12
 
13
  with gr.Blocks() as demo:
@@ -19,7 +30,7 @@ with gr.Blocks() as demo:
19
  with gr.Tab("SWE-Bench"):
20
  with gr.Row():
21
  with gr.Column(scale=1):
22
- scatter_plot = gr.Plot(create_scatter_plot(df, "results_cost", "results_accuracy", "Cost", "Accuracy", ["agent_name"]))
23
  with gr.Column(scale=1):
24
  Leaderboard(
25
  value=df,
@@ -31,12 +42,12 @@ with gr.Blocks() as demo:
31
  search_columns=config.SWEBENCH_SEARCH_COLUMNS,
32
  column_widths={"agent_name": 40,
33
  "results_accuracy": 20,
34
- "results_cost": 20},
35
  )
36
  with gr.Tab("USACO"):
37
  with gr.Row():
38
  with gr.Column(scale=1):
39
- scatter_plot = gr.Plot(create_scatter_plot(df, "results_cost", "results_accuracy", "Cost", "Accuracy", ["agent_name"]))
40
  with gr.Column(scale=1):
41
  Leaderboard(
42
  value=df,
@@ -48,7 +59,7 @@ with gr.Blocks() as demo:
48
  search_columns=config.SWEBENCH_SEARCH_COLUMNS,
49
  column_widths={"agent_name": 40,
50
  "results_accuracy": 20,
51
- "results_cost": 20},
52
  )
53
  with gr.Tab("About"):
54
  gr.Markdown((Path(__file__).parent / "about.md").read_text())
 
1
  import gradio as gr
2
  from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter
3
  import config
4
+ from envs import RESULTS_REPO_ID
5
  from pathlib import Path
6
  import pandas as pd
7
  import os
8
  from utils import parse_json_files, create_scatter_plot
9
+ from huggingface_hub import snapshot_download
10
 
11
  abs_path = Path(__file__).parent
12
 
13
+ # Download the results from the Hugging Face Hub
14
+ snapshot_download(RESULTS_REPO_ID,
15
+ local_dir=abs_path / "evals",
16
+ repo_type='dataset',
17
+ tqdm_class=None,
18
+ etag_timeout=30,
19
+ max_workers=4,
20
+ )
21
+
22
  df = parse_json_files(os.path.join(abs_path, "evals"))
23
 
24
  with gr.Blocks() as demo:
 
30
  with gr.Tab("SWE-Bench"):
31
  with gr.Row():
32
  with gr.Column(scale=1):
33
+ scatter_plot = gr.Plot(create_scatter_plot(df, "results_total_cost", "results_accuracy", "Cost", "Accuracy", ["agent_name"]))
34
  with gr.Column(scale=1):
35
  Leaderboard(
36
  value=df,
 
42
  search_columns=config.SWEBENCH_SEARCH_COLUMNS,
43
  column_widths={"agent_name": 40,
44
  "results_accuracy": 20,
45
+ "results_total_cost": 20},
46
  )
47
  with gr.Tab("USACO"):
48
  with gr.Row():
49
  with gr.Column(scale=1):
50
+ scatter_plot = gr.Plot(create_scatter_plot(df, "results_total_cost", "results_accuracy", "Cost", "Accuracy", ["agent_name"]))
51
  with gr.Column(scale=1):
52
  Leaderboard(
53
  value=df,
 
59
  search_columns=config.SWEBENCH_SEARCH_COLUMNS,
60
  column_widths={"agent_name": 40,
61
  "results_accuracy": 20,
62
+ "results_total_cost": 20},
63
  )
64
  with gr.Tab("About"):
65
  gr.Markdown((Path(__file__).parent / "about.md").read_text())
config.py CHANGED
@@ -9,9 +9,9 @@ TYPES = [
9
  SWEBENCH_ON_LOAD_COLUMNS = [
10
  "agent_name",
11
  "results_accuracy",
12
- "results_cost",
13
  ]
14
- SWEBENCH_SEARCH_COLUMNS = ['results_cost']
15
 
16
 
17
  NUMERIC_INTERVALS = {
 
9
  SWEBENCH_ON_LOAD_COLUMNS = [
10
  "agent_name",
11
  "results_accuracy",
12
+ "results_total_cost",
13
  ]
14
+ SWEBENCH_SEARCH_COLUMNS = ['results_total_cost']
15
 
16
 
17
  NUMERIC_INTERVALS = {
envs.py ADDED
@@ -0,0 +1 @@
 
 
1
+ RESULTS_REPO_ID = 'agent-evals/results'
evals/example_agent_1.json CHANGED
@@ -6,6 +6,6 @@
6
  },
7
  "results": {
8
  "accuracy": 12,
9
- "cost": 34
10
  }
11
  }
 
6
  },
7
  "results": {
8
  "accuracy": 12,
9
+ "total_cost": 34
10
  }
11
  }
evals/example_agent_2.json CHANGED
@@ -6,6 +6,6 @@
6
  },
7
  "results": {
8
  "accuracy": 34,
9
- "cost": 50
10
  }
11
  }
 
6
  },
7
  "results": {
8
  "accuracy": 34,
9
+ "total_cost": 50
10
  }
11
  }
evals/example_agent_3.json CHANGED
@@ -6,6 +6,6 @@
6
  },
7
  "results": {
8
  "accuracy": 60,
9
- "cost": 55
10
  }
11
  }
 
6
  },
7
  "results": {
8
  "accuracy": 60,
9
+ "total_cost": 55
10
  }
11
  }
evals/swebench_lite_example_agent_1722587866.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"config": {"agent_name": "example_agent", "benchmark_name": "swebench_lite", "date": "2024-10-10", "swe_bench": {"dataset_name": "princeton-nlp/SWE-bench_Lite", "max_workers": 1}, "swe_bench_dataset": "princeton-nlp/SWE-bench_Lite", "swe_bench_max_workers": 1}, "results": {"accuracy": 0.0, "total_cost": 0}, "raw_eval_results": {"total_instances": 300, "submitted_instances": 1, "completed_instances": 0, "resolved_instances": 0, "unresolved_instances": 0, "empty_patch_instances": 0, "error_instances": 1, "unstopped_instances": 0, "completed_ids": [], "incomplete_ids": ["astropy__astropy-14182", "astropy__astropy-14365", "astropy__astropy-14995", "astropy__astropy-6938", "astropy__astropy-7746", "django__django-10914", "django__django-10924", "django__django-11001", "django__django-11019", "django__django-11039", "django__django-11049", "django__django-11099", "django__django-11133", "django__django-11179", "django__django-11283", "django__django-11422", "django__django-11564", "django__django-11583", "django__django-11620", "django__django-11630", "django__django-11742", "django__django-11797", "django__django-11815", "django__django-11848", "django__django-11905", "django__django-11910", "django__django-11964", "django__django-11999", "django__django-12113", "django__django-12125", "django__django-12184", "django__django-12284", "django__django-12286", "django__django-12308", "django__django-12453", "django__django-12470", "django__django-12497", "django__django-12589", "django__django-12700", "django__django-12708", "django__django-12747", "django__django-12856", "django__django-12908", "django__django-12915", "django__django-12983", "django__django-13028", "django__django-13033", "django__django-13158", "django__django-13220", "django__django-13230", "django__django-13265", "django__django-13315", "django__django-13321", "django__django-13401", "django__django-13447", "django__django-13448", "django__django-13551", "django__django-13590", 
"django__django-13658", "django__django-13660", "django__django-13710", "django__django-13757", "django__django-13768", "django__django-13925", "django__django-13933", "django__django-13964", "django__django-14016", "django__django-14017", "django__django-14155", "django__django-14238", "django__django-14382", "django__django-14411", "django__django-14534", "django__django-14580", "django__django-14608", "django__django-14667", "django__django-14672", "django__django-14730", "django__django-14752", "django__django-14787", "django__django-14855", "django__django-14915", "django__django-14997", "django__django-14999", "django__django-15061", "django__django-15202", "django__django-15213", "django__django-15252", "django__django-15320", "django__django-15347", "django__django-15388", "django__django-15400", "django__django-15498", "django__django-15695", "django__django-15738", "django__django-15781", "django__django-15789", "django__django-15790", "django__django-15814", "django__django-15819", "django__django-15851", "django__django-15902", "django__django-15996", "django__django-16041", "django__django-16046", "django__django-16139", "django__django-16229", "django__django-16255", "django__django-16379", "django__django-16400", "django__django-16408", "django__django-16527", "django__django-16595", "django__django-16816", "django__django-16820", "django__django-16873", "django__django-16910", "django__django-17051", "django__django-17087", "matplotlib__matplotlib-18869", "matplotlib__matplotlib-22711", "matplotlib__matplotlib-22835", "matplotlib__matplotlib-23299", "matplotlib__matplotlib-23314", "matplotlib__matplotlib-23476", "matplotlib__matplotlib-23562", "matplotlib__matplotlib-23563", "matplotlib__matplotlib-23913", "matplotlib__matplotlib-23964", "matplotlib__matplotlib-23987", "matplotlib__matplotlib-24149", "matplotlib__matplotlib-24265", "matplotlib__matplotlib-24334", "matplotlib__matplotlib-24970", "matplotlib__matplotlib-25079", 
"matplotlib__matplotlib-25311", "matplotlib__matplotlib-25332", "matplotlib__matplotlib-25433", "matplotlib__matplotlib-25442", "matplotlib__matplotlib-25498", "matplotlib__matplotlib-26011", "matplotlib__matplotlib-26020", "mwaskom__seaborn-2848", "mwaskom__seaborn-3010", "mwaskom__seaborn-3190", "mwaskom__seaborn-3407", "pallets__flask-4045", "pallets__flask-4992", "pallets__flask-5063", "psf__requests-1963", "psf__requests-2148", "psf__requests-2317", "psf__requests-2674", "psf__requests-3362", "psf__requests-863", "pydata__xarray-3364", "pydata__xarray-4094", "pydata__xarray-4248", "pydata__xarray-4493", "pydata__xarray-5131", "pylint-dev__pylint-5859", "pylint-dev__pylint-6506", "pylint-dev__pylint-7080", "pylint-dev__pylint-7114", "pylint-dev__pylint-7228", "pylint-dev__pylint-7993", "pytest-dev__pytest-11143", "pytest-dev__pytest-11148", "pytest-dev__pytest-5103", "pytest-dev__pytest-5221", "pytest-dev__pytest-5227", "pytest-dev__pytest-5413", "pytest-dev__pytest-5495", "pytest-dev__pytest-5692", "pytest-dev__pytest-6116", "pytest-dev__pytest-7168", "pytest-dev__pytest-7220", "pytest-dev__pytest-7373", "pytest-dev__pytest-7432", "pytest-dev__pytest-7490", "pytest-dev__pytest-8365", "pytest-dev__pytest-8906", "pytest-dev__pytest-9359", "scikit-learn__scikit-learn-10297", "scikit-learn__scikit-learn-10508", "scikit-learn__scikit-learn-10949", "scikit-learn__scikit-learn-11040", "scikit-learn__scikit-learn-11281", "scikit-learn__scikit-learn-12471", "scikit-learn__scikit-learn-13142", "scikit-learn__scikit-learn-13241", "scikit-learn__scikit-learn-13439", "scikit-learn__scikit-learn-13496", "scikit-learn__scikit-learn-13497", "scikit-learn__scikit-learn-13584", "scikit-learn__scikit-learn-13779", "scikit-learn__scikit-learn-14087", "scikit-learn__scikit-learn-14092", "scikit-learn__scikit-learn-14894", "scikit-learn__scikit-learn-14983", "scikit-learn__scikit-learn-15512", "scikit-learn__scikit-learn-15535", "scikit-learn__scikit-learn-25500", 
"scikit-learn__scikit-learn-25570", "scikit-learn__scikit-learn-25638", "scikit-learn__scikit-learn-25747", "sphinx-doc__sphinx-10325", "sphinx-doc__sphinx-10451", "sphinx-doc__sphinx-11445", "sphinx-doc__sphinx-7686", "sphinx-doc__sphinx-7738", "sphinx-doc__sphinx-7975", "sphinx-doc__sphinx-8273", "sphinx-doc__sphinx-8282", "sphinx-doc__sphinx-8435", "sphinx-doc__sphinx-8474", "sphinx-doc__sphinx-8506", "sphinx-doc__sphinx-8595", "sphinx-doc__sphinx-8627", "sphinx-doc__sphinx-8713", "sphinx-doc__sphinx-8721", "sphinx-doc__sphinx-8801", "sympy__sympy-11400", "sympy__sympy-11870", "sympy__sympy-11897", "sympy__sympy-12171", "sympy__sympy-12236", "sympy__sympy-12419", "sympy__sympy-12454", "sympy__sympy-12481", "sympy__sympy-13031", "sympy__sympy-13043", "sympy__sympy-13146", "sympy__sympy-13177", "sympy__sympy-13437", "sympy__sympy-13471", "sympy__sympy-13480", "sympy__sympy-13647", "sympy__sympy-13773", "sympy__sympy-13895", "sympy__sympy-13915", "sympy__sympy-13971", "sympy__sympy-14024", "sympy__sympy-14308", "sympy__sympy-14317", "sympy__sympy-14396", "sympy__sympy-14774", "sympy__sympy-14817", "sympy__sympy-15011", "sympy__sympy-15308", "sympy__sympy-15345", "sympy__sympy-15346", "sympy__sympy-15609", "sympy__sympy-15678", "sympy__sympy-16106", "sympy__sympy-16281", "sympy__sympy-16503", "sympy__sympy-16792", "sympy__sympy-16988", "sympy__sympy-17022", "sympy__sympy-17139", "sympy__sympy-17630", "sympy__sympy-17655", "sympy__sympy-18057", "sympy__sympy-18087", "sympy__sympy-18189", "sympy__sympy-18199", "sympy__sympy-18532", "sympy__sympy-18621", "sympy__sympy-18698", "sympy__sympy-18835", "sympy__sympy-19007", "sympy__sympy-19254", "sympy__sympy-19487", "sympy__sympy-20049", "sympy__sympy-20154", "sympy__sympy-20212", "sympy__sympy-20322", "sympy__sympy-20442", "sympy__sympy-20590", "sympy__sympy-20639", "sympy__sympy-21055", "sympy__sympy-21171", "sympy__sympy-21379", "sympy__sympy-21612", "sympy__sympy-21614", "sympy__sympy-21627", "sympy__sympy-21847", 
"sympy__sympy-22005", "sympy__sympy-22714", "sympy__sympy-22840", "sympy__sympy-23117", "sympy__sympy-23191", "sympy__sympy-23262", "sympy__sympy-24066", "sympy__sympy-24102", "sympy__sympy-24152", "sympy__sympy-24213", "sympy__sympy-24909"], "empty_patch_ids": [], "submitted_ids": ["astropy__astropy-12907"], "resolved_ids": [], "unresolved_ids": [], "error_ids": ["astropy__astropy-12907"], "unstopped_containers": [], "unremoved_images": [], "schema_version": 2}, "raw_logging_results": {"total_cost": 0}}
utils.py CHANGED
@@ -13,25 +13,28 @@ def parse_json_files(folder_path):
13
 
14
  # Iterate through all JSON files in the folder
15
  for json_file in folder.glob('*.json'):
16
- with open(json_file, 'r') as file:
17
- data = json.load(file)
18
-
19
- # Extract config and results
20
- config = data['config']
21
- results = data['results']
22
-
23
- # Combine config and results into a single dictionary
24
- combined_data = {
25
- 'agent_name': config['agent_name'],
26
- 'benchmark_name': config['benchmark_name'],
27
- 'date': config['date']
28
- }
29
-
30
- # Add results with 'results_' prefix
31
- for key, value in results.items():
32
- combined_data[f'results_{key}'] = value
33
-
34
- data_list.append(combined_data)
 
 
 
35
 
36
  # Create DataFrame from the list of dictionaries
37
  df = pd.DataFrame(data_list)
 
13
 
14
  # Iterate through all JSON files in the folder
15
  for json_file in folder.glob('*.json'):
16
+ try:
17
+ with open(json_file, 'r') as file:
18
+ data = json.load(file)
19
+
20
+ # Extract config and results
21
+ config = data['config']
22
+ results = data['results']
23
+
24
+ # Combine config and results into a single dictionary
25
+ combined_data = {
26
+ 'agent_name': config['agent_name'],
27
+ 'benchmark_name': config['benchmark_name'],
28
+ 'date': config['date']
29
+ }
30
+
31
+ # Add results with 'results_' prefix
32
+ for key, value in results.items():
33
+ combined_data[f'results_{key}'] = value
34
+
35
+ data_list.append(combined_data)
36
+ except Exception as e:
37
+ print(f"Error processing {json_file}: {e}. Skipping!")
38
 
39
  # Create DataFrame from the list of dictionaries
40
  df = pd.DataFrame(data_list)