benediktstroebl committed on
Commit
066588c
·
1 Parent(s): 5cbaf0e

layout update

Browse files
app.py CHANGED
@@ -17,6 +17,11 @@ import re
17
  import markdown
18
  import asyncio
19
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
 
 
 
 
 
20
 
21
 
22
  def restart_space():
@@ -26,7 +31,7 @@ def restart_space():
26
  def download_latest_results():
27
  print("Downloading latest results...")
28
  snapshot_download(RESULTS_REPO_ID,
29
- local_dir=abs_path / "evals_live",
30
  repo_type='dataset',
31
  tqdm_class=None,
32
  etag_timeout=30,
@@ -63,13 +68,13 @@ def get_analyzed_traces(agent_name, benchmark_name):
63
 
64
  def update_agent_dropdown(benchmark_name, metric):
65
  df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
66
- agents = df['agent_name'].tolist()
67
  best_agent = get_best_agent(benchmark_name, metric)
68
  return gr.Dropdown(choices=agents, value=best_agent, label="Select Agent")
69
 
70
  def get_best_agent(benchmark_name, metric):
71
  df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
72
- return df.loc[df[metric].idxmax()]['agent_name']
73
 
74
  def update_task_analysis(benchmark_name, agent_name):
75
  if not agent_name:
@@ -80,8 +85,10 @@ def update_task_analysis(benchmark_name, agent_name):
80
  return f"No analysis available for agent: {agent_name}", None, None, ""
81
 
82
  task_ids = list(analyzed_traces.keys())
 
 
83
 
84
- return "", None, gr.Dropdown(choices=task_ids, value=task_ids[0], label="Select Task"), ""
85
 
86
  def update_task_details(benchmark_name, agent_name, task_id):
87
  if not task_id:
@@ -92,11 +99,12 @@ def update_task_details(benchmark_name, agent_name, task_id):
92
  return f"No analysis available for task: {task_id}", None, ""
93
 
94
  analysis = analyzed_traces[task_id]
95
- summary = analysis.get('summary', {})
96
 
97
- overview = f"# Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
98
- overview += f"## Successes\n{summary.get('successes', 'No successes listed.')}\n\n"
99
- overview += f"## Challenges\n{summary.get('challenges', 'No challenges listed.')}\n\n"
 
100
 
101
  flow_chart = create_flow_chart(analysis['steps'])
102
 
@@ -161,7 +169,7 @@ def format_call_info(step, step_index):
161
  }}
162
  </style>
163
 
164
- <h2>Step {step_index + 1}: {analysis.get('step_outline', 'N/A')}</h2>
165
 
166
  <h3>Call Metadata</h3>
167
  <ul>
@@ -200,29 +208,29 @@ with gr.Blocks() as demo:
200
  with gr.Tabs():
201
  with gr.Tab("USACO"):
202
  with gr.Row():
203
- with gr.Column(scale=1):
204
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "results_total_cost", "results_accuracy", "Cost", "Accuracy", ["agent_name"]))
205
- with gr.Column(scale=1):
206
  Leaderboard(
207
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'),
208
  select_columns=SelectColumns(
209
  default_selection=config.USACO_ON_LOAD_COLUMNS,
210
- cant_deselect=["agent_name"],
211
  label="Select Columns to Display:",
212
  ),
213
  search_columns=config.USACO_SEARCH_COLUMNS,
214
- column_widths={"agent_name": 40,
215
- "results_accuracy": 20,
216
- "results_total_cost": 20},
217
  )
 
 
218
  gr.Markdown("## Agent Monitor")
219
  with gr.Row():
220
- with gr.Column(scale=1):
221
- task_dropdown = gr.Dropdown(label="Select USACO Task")
222
- task_overview = gr.Markdown()
223
  with gr.Column(scale=1):
224
  agent_dropdown = gr.Dropdown(label="Select Agent")
225
- step_details = gr.Markdown()
 
 
 
226
  with gr.Row():
227
  flow_chart = gr.Plot(label="Task Flow")
228
 
@@ -298,19 +306,19 @@ with gr.Blocks() as demo:
298
  with gr.Tab("SWE-Bench"):
299
  with gr.Row():
300
  with gr.Column(scale=1):
301
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "results_total_cost", "results_accuracy", "Cost (in USD)", "Accuracy", ["agent_name"]))
302
  with gr.Column(scale=1):
303
  Leaderboard(
304
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
305
  select_columns=SelectColumns(
306
  default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
307
- cant_deselect=["agent_name"],
308
  label="Select Columns to Display:",
309
  ),
310
  search_columns=config.SWEBENCH_SEARCH_COLUMNS,
311
- column_widths={"agent_name": 40,
312
- "results_accuracy": 20,
313
- "results_total_cost": 20},
314
  )
315
 
316
  with gr.Tab("About"):
 
17
  import markdown
18
  import asyncio
19
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
20
+ import weave
21
+
22
+
23
+ from datetime import datetime
24
+ weave.init(f'leaderboard_testing_{datetime.now().strftime("%Y%m%d%H%M%S")}')
25
 
26
 
27
  def restart_space():
 
31
  def download_latest_results():
32
  print("Downloading latest results...")
33
  snapshot_download(RESULTS_REPO_ID,
34
+ local_dir=abs_path / "evals_upload",
35
  repo_type='dataset',
36
  tqdm_class=None,
37
  etag_timeout=30,
 
68
 
69
  def update_agent_dropdown(benchmark_name, metric):
70
  df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
71
+ agents = df['Agent Name'].tolist()
72
  best_agent = get_best_agent(benchmark_name, metric)
73
  return gr.Dropdown(choices=agents, value=best_agent, label="Select Agent")
74
 
75
  def get_best_agent(benchmark_name, metric):
76
  df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
77
+ return df.loc[df[metric].idxmax()]['Agent Name']
78
 
79
  def update_task_analysis(benchmark_name, agent_name):
80
  if not agent_name:
 
85
  return f"No analysis available for agent: {agent_name}", None, None, ""
86
 
87
  task_ids = list(analyzed_traces.keys())
88
+
89
+ overview, flow_chart, _ = update_task_details(benchmark_name, agent_name, task_ids[0])
90
 
91
+ return overview, flow_chart, gr.Dropdown(choices=task_ids, value=task_ids[0], label="Select Task"), ""
92
 
93
  def update_task_details(benchmark_name, agent_name, task_id):
94
  if not task_id:
 
99
  return f"No analysis available for task: {task_id}", None, ""
100
 
101
  analysis = analyzed_traces[task_id]
102
+ summary = analysis.get('task_analysis', {})
103
 
104
+ overview = f"## Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
105
+ overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
106
+ overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
107
+ overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
108
 
109
  flow_chart = create_flow_chart(analysis['steps'])
110
 
 
169
  }}
170
  </style>
171
 
172
+ <h2>Step {step_index + 1}: {analysis.get('headline', 'N/A')}</h2>
173
 
174
  <h3>Call Metadata</h3>
175
  <ul>
 
208
  with gr.Tabs():
209
  with gr.Tab("USACO"):
210
  with gr.Row():
211
+ with gr.Column(scale=2):
 
 
212
  Leaderboard(
213
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'),
214
  select_columns=SelectColumns(
215
  default_selection=config.USACO_ON_LOAD_COLUMNS,
216
+ cant_deselect=["Agent Name"],
217
  label="Select Columns to Display:",
218
  ),
219
  search_columns=config.USACO_SEARCH_COLUMNS,
220
+ column_widths={"Agent Name": 40,
221
+ "Accuracy": 20,
222
+ "Total Cost": 20},
223
  )
224
+ with gr.Row():
225
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
226
  gr.Markdown("## Agent Monitor")
227
  with gr.Row():
 
 
 
228
  with gr.Column(scale=1):
229
  agent_dropdown = gr.Dropdown(label="Select Agent")
230
+ with gr.Column(scale=1):
231
+ task_dropdown = gr.Dropdown(label="Select USACO Task")
232
+ with gr.Row():
233
+ task_overview = gr.Markdown()
234
  with gr.Row():
235
  flow_chart = gr.Plot(label="Task Flow")
236
 
 
306
  with gr.Tab("SWE-Bench"):
307
  with gr.Row():
308
  with gr.Column(scale=1):
309
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
310
  with gr.Column(scale=1):
311
  Leaderboard(
312
  value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
313
  select_columns=SelectColumns(
314
  default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
315
+ cant_deselect=["Agent Name"],
316
  label="Select Columns to Display:",
317
  ),
318
  search_columns=config.SWEBENCH_SEARCH_COLUMNS,
319
+ column_widths={"Agent Name": 40,
320
+ "Accuracy": 20,
321
+ "Total Cost": 20},
322
  )
323
 
324
  with gr.Tab("About"):
config.py CHANGED
@@ -7,18 +7,18 @@ TYPES = [
7
  ]
8
 
9
  SWEBENCH_ON_LOAD_COLUMNS = [
10
- "agent_name",
11
- "results_accuracy",
12
- "results_total_cost",
13
  ]
14
- SWEBENCH_SEARCH_COLUMNS = ['results_total_cost']
15
 
16
  USACO_ON_LOAD_COLUMNS = [
17
- "agent_name",
18
- "results_accuracy",
19
- "results_total_cost",
20
  ]
21
- USACO_SEARCH_COLUMNS = ['results_total_cost']
22
 
23
 
24
  NUMERIC_INTERVALS = {
 
7
  ]
8
 
9
  SWEBENCH_ON_LOAD_COLUMNS = [
10
+ "Agent Name",
11
+ "Accuracy",
12
+ "Total Cost",
13
  ]
14
+ SWEBENCH_SEARCH_COLUMNS = ['Total Cost']
15
 
16
  USACO_ON_LOAD_COLUMNS = [
17
+ "Agent Name",
18
+ "Accuracy",
19
+ "Total Cost",
20
  ]
21
+ USACO_SEARCH_COLUMNS = ['Total Cost']
22
 
23
 
24
  NUMERIC_INTERVALS = {
evals_live/swebench_lite_example_agent_1722587866.json CHANGED
@@ -1 +1 @@
1
- {"config": {"agent_name": "example_agent_1", "benchmark_name": "swebench_lite", "date": "2024-10-10", "swe_bench": {"dataset_name": "princeton-nlp/SWE-bench_Lite", "max_workers": 1}, "swe_bench_dataset": "princeton-nlp/SWE-bench_Lite", "swe_bench_max_workers": 1}, "results": {"accuracy": 13.0, "total_cost": 3.12}, "raw_eval_results": {"total_instances": 300, "submitted_instances": 1, "completed_instances": 0, "resolved_instances": 0, "unresolved_instances": 0, "empty_patch_instances": 0, "error_instances": 1, "unstopped_instances": 0, "completed_ids": [], "incomplete_ids": ["astropy__astropy-14182", "astropy__astropy-14365", "astropy__astropy-14995", "astropy__astropy-6938", "astropy__astropy-7746", "django__django-10914", "django__django-10924", "django__django-11001", "django__django-11019", "django__django-11039", "django__django-11049", "django__django-11099", "django__django-11133", "django__django-11179", "django__django-11283", "django__django-11422", "django__django-11564", "django__django-11583", "django__django-11620", "django__django-11630", "django__django-11742", "django__django-11797", "django__django-11815", "django__django-11848", "django__django-11905", "django__django-11910", "django__django-11964", "django__django-11999", "django__django-12113", "django__django-12125", "django__django-12184", "django__django-12284", "django__django-12286", "django__django-12308", "django__django-12453", "django__django-12470", "django__django-12497", "django__django-12589", "django__django-12700", "django__django-12708", "django__django-12747", "django__django-12856", "django__django-12908", "django__django-12915", "django__django-12983", "django__django-13028", "django__django-13033", "django__django-13158", "django__django-13220", "django__django-13230", "django__django-13265", "django__django-13315", "django__django-13321", "django__django-13401", "django__django-13447", "django__django-13448", "django__django-13551", "django__django-13590", 
"django__django-13658", "django__django-13660", "django__django-13710", "django__django-13757", "django__django-13768", "django__django-13925", "django__django-13933", "django__django-13964", "django__django-14016", "django__django-14017", "django__django-14155", "django__django-14238", "django__django-14382", "django__django-14411", "django__django-14534", "django__django-14580", "django__django-14608", "django__django-14667", "django__django-14672", "django__django-14730", "django__django-14752", "django__django-14787", "django__django-14855", "django__django-14915", "django__django-14997", "django__django-14999", "django__django-15061", "django__django-15202", "django__django-15213", "django__django-15252", "django__django-15320", "django__django-15347", "django__django-15388", "django__django-15400", "django__django-15498", "django__django-15695", "django__django-15738", "django__django-15781", "django__django-15789", "django__django-15790", "django__django-15814", "django__django-15819", "django__django-15851", "django__django-15902", "django__django-15996", "django__django-16041", "django__django-16046", "django__django-16139", "django__django-16229", "django__django-16255", "django__django-16379", "django__django-16400", "django__django-16408", "django__django-16527", "django__django-16595", "django__django-16816", "django__django-16820", "django__django-16873", "django__django-16910", "django__django-17051", "django__django-17087", "matplotlib__matplotlib-18869", "matplotlib__matplotlib-22711", "matplotlib__matplotlib-22835", "matplotlib__matplotlib-23299", "matplotlib__matplotlib-23314", "matplotlib__matplotlib-23476", "matplotlib__matplotlib-23562", "matplotlib__matplotlib-23563", "matplotlib__matplotlib-23913", "matplotlib__matplotlib-23964", "matplotlib__matplotlib-23987", "matplotlib__matplotlib-24149", "matplotlib__matplotlib-24265", "matplotlib__matplotlib-24334", "matplotlib__matplotlib-24970", "matplotlib__matplotlib-25079", 
"matplotlib__matplotlib-25311", "matplotlib__matplotlib-25332", "matplotlib__matplotlib-25433", "matplotlib__matplotlib-25442", "matplotlib__matplotlib-25498", "matplotlib__matplotlib-26011", "matplotlib__matplotlib-26020", "mwaskom__seaborn-2848", "mwaskom__seaborn-3010", "mwaskom__seaborn-3190", "mwaskom__seaborn-3407", "pallets__flask-4045", "pallets__flask-4992", "pallets__flask-5063", "psf__requests-1963", "psf__requests-2148", "psf__requests-2317", "psf__requests-2674", "psf__requests-3362", "psf__requests-863", "pydata__xarray-3364", "pydata__xarray-4094", "pydata__xarray-4248", "pydata__xarray-4493", "pydata__xarray-5131", "pylint-dev__pylint-5859", "pylint-dev__pylint-6506", "pylint-dev__pylint-7080", "pylint-dev__pylint-7114", "pylint-dev__pylint-7228", "pylint-dev__pylint-7993", "pytest-dev__pytest-11143", "pytest-dev__pytest-11148", "pytest-dev__pytest-5103", "pytest-dev__pytest-5221", "pytest-dev__pytest-5227", "pytest-dev__pytest-5413", "pytest-dev__pytest-5495", "pytest-dev__pytest-5692", "pytest-dev__pytest-6116", "pytest-dev__pytest-7168", "pytest-dev__pytest-7220", "pytest-dev__pytest-7373", "pytest-dev__pytest-7432", "pytest-dev__pytest-7490", "pytest-dev__pytest-8365", "pytest-dev__pytest-8906", "pytest-dev__pytest-9359", "scikit-learn__scikit-learn-10297", "scikit-learn__scikit-learn-10508", "scikit-learn__scikit-learn-10949", "scikit-learn__scikit-learn-11040", "scikit-learn__scikit-learn-11281", "scikit-learn__scikit-learn-12471", "scikit-learn__scikit-learn-13142", "scikit-learn__scikit-learn-13241", "scikit-learn__scikit-learn-13439", "scikit-learn__scikit-learn-13496", "scikit-learn__scikit-learn-13497", "scikit-learn__scikit-learn-13584", "scikit-learn__scikit-learn-13779", "scikit-learn__scikit-learn-14087", "scikit-learn__scikit-learn-14092", "scikit-learn__scikit-learn-14894", "scikit-learn__scikit-learn-14983", "scikit-learn__scikit-learn-15512", "scikit-learn__scikit-learn-15535", "scikit-learn__scikit-learn-25500", 
"scikit-learn__scikit-learn-25570", "scikit-learn__scikit-learn-25638", "scikit-learn__scikit-learn-25747", "sphinx-doc__sphinx-10325", "sphinx-doc__sphinx-10451", "sphinx-doc__sphinx-11445", "sphinx-doc__sphinx-7686", "sphinx-doc__sphinx-7738", "sphinx-doc__sphinx-7975", "sphinx-doc__sphinx-8273", "sphinx-doc__sphinx-8282", "sphinx-doc__sphinx-8435", "sphinx-doc__sphinx-8474", "sphinx-doc__sphinx-8506", "sphinx-doc__sphinx-8595", "sphinx-doc__sphinx-8627", "sphinx-doc__sphinx-8713", "sphinx-doc__sphinx-8721", "sphinx-doc__sphinx-8801", "sympy__sympy-11400", "sympy__sympy-11870", "sympy__sympy-11897", "sympy__sympy-12171", "sympy__sympy-12236", "sympy__sympy-12419", "sympy__sympy-12454", "sympy__sympy-12481", "sympy__sympy-13031", "sympy__sympy-13043", "sympy__sympy-13146", "sympy__sympy-13177", "sympy__sympy-13437", "sympy__sympy-13471", "sympy__sympy-13480", "sympy__sympy-13647", "sympy__sympy-13773", "sympy__sympy-13895", "sympy__sympy-13915", "sympy__sympy-13971", "sympy__sympy-14024", "sympy__sympy-14308", "sympy__sympy-14317", "sympy__sympy-14396", "sympy__sympy-14774", "sympy__sympy-14817", "sympy__sympy-15011", "sympy__sympy-15308", "sympy__sympy-15345", "sympy__sympy-15346", "sympy__sympy-15609", "sympy__sympy-15678", "sympy__sympy-16106", "sympy__sympy-16281", "sympy__sympy-16503", "sympy__sympy-16792", "sympy__sympy-16988", "sympy__sympy-17022", "sympy__sympy-17139", "sympy__sympy-17630", "sympy__sympy-17655", "sympy__sympy-18057", "sympy__sympy-18087", "sympy__sympy-18189", "sympy__sympy-18199", "sympy__sympy-18532", "sympy__sympy-18621", "sympy__sympy-18698", "sympy__sympy-18835", "sympy__sympy-19007", "sympy__sympy-19254", "sympy__sympy-19487", "sympy__sympy-20049", "sympy__sympy-20154", "sympy__sympy-20212", "sympy__sympy-20322", "sympy__sympy-20442", "sympy__sympy-20590", "sympy__sympy-20639", "sympy__sympy-21055", "sympy__sympy-21171", "sympy__sympy-21379", "sympy__sympy-21612", "sympy__sympy-21614", "sympy__sympy-21627", "sympy__sympy-21847", 
"sympy__sympy-22005", "sympy__sympy-22714", "sympy__sympy-22840", "sympy__sympy-23117", "sympy__sympy-23191", "sympy__sympy-23262", "sympy__sympy-24066", "sympy__sympy-24102", "sympy__sympy-24152", "sympy__sympy-24213", "sympy__sympy-24909"], "empty_patch_ids": [], "submitted_ids": ["astropy__astropy-12907"], "resolved_ids": [], "unresolved_ids": [], "error_ids": ["astropy__astropy-12907"], "unstopped_containers": [], "unremoved_images": [], "schema_version": 2}, "raw_logging_results": {"total_cost": 0}}
 
1
+ {"config": {"agent_name": "example_agent_1", "benchmark_name": "swebench_lite", "date": "2024-10-10", "swe_bench": {"dataset_name": "princeton-nlp/SWE-bench_Lite", "max_workers": 1}, "swe_bench_dataset": "princeton-nlp/SWE-bench_Lite", "swe_bench_max_workers": 1}, "results": {"accuracy": 0.130, "total_cost": 3.12}, "raw_eval_results": {"total_instances": 300, "submitted_instances": 1, "completed_instances": 0, "resolved_instances": 0, "unresolved_instances": 0, "empty_patch_instances": 0, "error_instances": 1, "unstopped_instances": 0, "completed_ids": [], "incomplete_ids": ["astropy__astropy-14182", "astropy__astropy-14365", "astropy__astropy-14995", "astropy__astropy-6938", "astropy__astropy-7746", "django__django-10914", "django__django-10924", "django__django-11001", "django__django-11019", "django__django-11039", "django__django-11049", "django__django-11099", "django__django-11133", "django__django-11179", "django__django-11283", "django__django-11422", "django__django-11564", "django__django-11583", "django__django-11620", "django__django-11630", "django__django-11742", "django__django-11797", "django__django-11815", "django__django-11848", "django__django-11905", "django__django-11910", "django__django-11964", "django__django-11999", "django__django-12113", "django__django-12125", "django__django-12184", "django__django-12284", "django__django-12286", "django__django-12308", "django__django-12453", "django__django-12470", "django__django-12497", "django__django-12589", "django__django-12700", "django__django-12708", "django__django-12747", "django__django-12856", "django__django-12908", "django__django-12915", "django__django-12983", "django__django-13028", "django__django-13033", "django__django-13158", "django__django-13220", "django__django-13230", "django__django-13265", "django__django-13315", "django__django-13321", "django__django-13401", "django__django-13447", "django__django-13448", "django__django-13551", "django__django-13590", 
"django__django-13658", "django__django-13660", "django__django-13710", "django__django-13757", "django__django-13768", "django__django-13925", "django__django-13933", "django__django-13964", "django__django-14016", "django__django-14017", "django__django-14155", "django__django-14238", "django__django-14382", "django__django-14411", "django__django-14534", "django__django-14580", "django__django-14608", "django__django-14667", "django__django-14672", "django__django-14730", "django__django-14752", "django__django-14787", "django__django-14855", "django__django-14915", "django__django-14997", "django__django-14999", "django__django-15061", "django__django-15202", "django__django-15213", "django__django-15252", "django__django-15320", "django__django-15347", "django__django-15388", "django__django-15400", "django__django-15498", "django__django-15695", "django__django-15738", "django__django-15781", "django__django-15789", "django__django-15790", "django__django-15814", "django__django-15819", "django__django-15851", "django__django-15902", "django__django-15996", "django__django-16041", "django__django-16046", "django__django-16139", "django__django-16229", "django__django-16255", "django__django-16379", "django__django-16400", "django__django-16408", "django__django-16527", "django__django-16595", "django__django-16816", "django__django-16820", "django__django-16873", "django__django-16910", "django__django-17051", "django__django-17087", "matplotlib__matplotlib-18869", "matplotlib__matplotlib-22711", "matplotlib__matplotlib-22835", "matplotlib__matplotlib-23299", "matplotlib__matplotlib-23314", "matplotlib__matplotlib-23476", "matplotlib__matplotlib-23562", "matplotlib__matplotlib-23563", "matplotlib__matplotlib-23913", "matplotlib__matplotlib-23964", "matplotlib__matplotlib-23987", "matplotlib__matplotlib-24149", "matplotlib__matplotlib-24265", "matplotlib__matplotlib-24334", "matplotlib__matplotlib-24970", "matplotlib__matplotlib-25079", 
"matplotlib__matplotlib-25311", "matplotlib__matplotlib-25332", "matplotlib__matplotlib-25433", "matplotlib__matplotlib-25442", "matplotlib__matplotlib-25498", "matplotlib__matplotlib-26011", "matplotlib__matplotlib-26020", "mwaskom__seaborn-2848", "mwaskom__seaborn-3010", "mwaskom__seaborn-3190", "mwaskom__seaborn-3407", "pallets__flask-4045", "pallets__flask-4992", "pallets__flask-5063", "psf__requests-1963", "psf__requests-2148", "psf__requests-2317", "psf__requests-2674", "psf__requests-3362", "psf__requests-863", "pydata__xarray-3364", "pydata__xarray-4094", "pydata__xarray-4248", "pydata__xarray-4493", "pydata__xarray-5131", "pylint-dev__pylint-5859", "pylint-dev__pylint-6506", "pylint-dev__pylint-7080", "pylint-dev__pylint-7114", "pylint-dev__pylint-7228", "pylint-dev__pylint-7993", "pytest-dev__pytest-11143", "pytest-dev__pytest-11148", "pytest-dev__pytest-5103", "pytest-dev__pytest-5221", "pytest-dev__pytest-5227", "pytest-dev__pytest-5413", "pytest-dev__pytest-5495", "pytest-dev__pytest-5692", "pytest-dev__pytest-6116", "pytest-dev__pytest-7168", "pytest-dev__pytest-7220", "pytest-dev__pytest-7373", "pytest-dev__pytest-7432", "pytest-dev__pytest-7490", "pytest-dev__pytest-8365", "pytest-dev__pytest-8906", "pytest-dev__pytest-9359", "scikit-learn__scikit-learn-10297", "scikit-learn__scikit-learn-10508", "scikit-learn__scikit-learn-10949", "scikit-learn__scikit-learn-11040", "scikit-learn__scikit-learn-11281", "scikit-learn__scikit-learn-12471", "scikit-learn__scikit-learn-13142", "scikit-learn__scikit-learn-13241", "scikit-learn__scikit-learn-13439", "scikit-learn__scikit-learn-13496", "scikit-learn__scikit-learn-13497", "scikit-learn__scikit-learn-13584", "scikit-learn__scikit-learn-13779", "scikit-learn__scikit-learn-14087", "scikit-learn__scikit-learn-14092", "scikit-learn__scikit-learn-14894", "scikit-learn__scikit-learn-14983", "scikit-learn__scikit-learn-15512", "scikit-learn__scikit-learn-15535", "scikit-learn__scikit-learn-25500", 
"scikit-learn__scikit-learn-25570", "scikit-learn__scikit-learn-25638", "scikit-learn__scikit-learn-25747", "sphinx-doc__sphinx-10325", "sphinx-doc__sphinx-10451", "sphinx-doc__sphinx-11445", "sphinx-doc__sphinx-7686", "sphinx-doc__sphinx-7738", "sphinx-doc__sphinx-7975", "sphinx-doc__sphinx-8273", "sphinx-doc__sphinx-8282", "sphinx-doc__sphinx-8435", "sphinx-doc__sphinx-8474", "sphinx-doc__sphinx-8506", "sphinx-doc__sphinx-8595", "sphinx-doc__sphinx-8627", "sphinx-doc__sphinx-8713", "sphinx-doc__sphinx-8721", "sphinx-doc__sphinx-8801", "sympy__sympy-11400", "sympy__sympy-11870", "sympy__sympy-11897", "sympy__sympy-12171", "sympy__sympy-12236", "sympy__sympy-12419", "sympy__sympy-12454", "sympy__sympy-12481", "sympy__sympy-13031", "sympy__sympy-13043", "sympy__sympy-13146", "sympy__sympy-13177", "sympy__sympy-13437", "sympy__sympy-13471", "sympy__sympy-13480", "sympy__sympy-13647", "sympy__sympy-13773", "sympy__sympy-13895", "sympy__sympy-13915", "sympy__sympy-13971", "sympy__sympy-14024", "sympy__sympy-14308", "sympy__sympy-14317", "sympy__sympy-14396", "sympy__sympy-14774", "sympy__sympy-14817", "sympy__sympy-15011", "sympy__sympy-15308", "sympy__sympy-15345", "sympy__sympy-15346", "sympy__sympy-15609", "sympy__sympy-15678", "sympy__sympy-16106", "sympy__sympy-16281", "sympy__sympy-16503", "sympy__sympy-16792", "sympy__sympy-16988", "sympy__sympy-17022", "sympy__sympy-17139", "sympy__sympy-17630", "sympy__sympy-17655", "sympy__sympy-18057", "sympy__sympy-18087", "sympy__sympy-18189", "sympy__sympy-18199", "sympy__sympy-18532", "sympy__sympy-18621", "sympy__sympy-18698", "sympy__sympy-18835", "sympy__sympy-19007", "sympy__sympy-19254", "sympy__sympy-19487", "sympy__sympy-20049", "sympy__sympy-20154", "sympy__sympy-20212", "sympy__sympy-20322", "sympy__sympy-20442", "sympy__sympy-20590", "sympy__sympy-20639", "sympy__sympy-21055", "sympy__sympy-21171", "sympy__sympy-21379", "sympy__sympy-21612", "sympy__sympy-21614", "sympy__sympy-21627", "sympy__sympy-21847", 
"sympy__sympy-22005", "sympy__sympy-22714", "sympy__sympy-22840", "sympy__sympy-23117", "sympy__sympy-23191", "sympy__sympy-23262", "sympy__sympy-24066", "sympy__sympy-24102", "sympy__sympy-24152", "sympy__sympy-24213", "sympy__sympy-24909"], "empty_patch_ids": [], "submitted_ids": ["astropy__astropy-12907"], "resolved_ids": [], "unresolved_ids": [], "error_ids": ["astropy__astropy-12907"], "unstopped_containers": [], "unremoved_images": [], "schema_version": 2}, "raw_logging_results": {"total_cost": 0}}
evals_live/swebench_lite_example_agent_17227906123.json CHANGED
@@ -1 +1 @@
1
- {"config": {"agent_name": "example_agent_3", "benchmark_name": "swebench_lite", "date": "2024-08-04", "run_id": "swebench_lite_example_agent_1722790656", "dataset_name": "princeton-nlp/SWE-bench_Lite", "max_workers": 1}, "results": {"accuracy": 21.0, "total_cost": 4.14}, "raw_eval_results": {"total_instances": 300, "submitted_instances": 1, "completed_instances": 0, "resolved_instances": 0, "unresolved_instances": 0, "empty_patch_instances": 0, "error_instances": 1, "unstopped_instances": 0, "completed_ids": [], "incomplete_ids": ["astropy__astropy-14182", "astropy__astropy-14365", "astropy__astropy-14995", "astropy__astropy-6938", "astropy__astropy-7746", "django__django-10914", "django__django-10924", "django__django-11001", "django__django-11019", "django__django-11039", "django__django-11049", "django__django-11099", "django__django-11133", "django__django-11179", "django__django-11283", "django__django-11422", "django__django-11564", "django__django-11583", "django__django-11620", "django__django-11630", "django__django-11742", "django__django-11797", "django__django-11815", "django__django-11848", "django__django-11905", "django__django-11910", "django__django-11964", "django__django-11999", "django__django-12113", "django__django-12125", "django__django-12184", "django__django-12284", "django__django-12286", "django__django-12308", "django__django-12453", "django__django-12470", "django__django-12497", "django__django-12589", "django__django-12700", "django__django-12708", "django__django-12747", "django__django-12856", "django__django-12908", "django__django-12915", "django__django-12983", "django__django-13028", "django__django-13033", "django__django-13158", "django__django-13220", "django__django-13230", "django__django-13265", "django__django-13315", "django__django-13321", "django__django-13401", "django__django-13447", "django__django-13448", "django__django-13551", "django__django-13590", "django__django-13658", "django__django-13660", 
"django__django-13710", "django__django-13757", "django__django-13768", "django__django-13925", "django__django-13933", "django__django-13964", "django__django-14016", "django__django-14017", "django__django-14155", "django__django-14238", "django__django-14382", "django__django-14411", "django__django-14534", "django__django-14580", "django__django-14608", "django__django-14667", "django__django-14672", "django__django-14730", "django__django-14752", "django__django-14787", "django__django-14855", "django__django-14915", "django__django-14997", "django__django-14999", "django__django-15061", "django__django-15202", "django__django-15213", "django__django-15252", "django__django-15320", "django__django-15347", "django__django-15388", "django__django-15400", "django__django-15498", "django__django-15695", "django__django-15738", "django__django-15781", "django__django-15789", "django__django-15790", "django__django-15814", "django__django-15819", "django__django-15851", "django__django-15902", "django__django-15996", "django__django-16041", "django__django-16046", "django__django-16139", "django__django-16229", "django__django-16255", "django__django-16379", "django__django-16400", "django__django-16408", "django__django-16527", "django__django-16595", "django__django-16816", "django__django-16820", "django__django-16873", "django__django-16910", "django__django-17051", "django__django-17087", "matplotlib__matplotlib-18869", "matplotlib__matplotlib-22711", "matplotlib__matplotlib-22835", "matplotlib__matplotlib-23299", "matplotlib__matplotlib-23314", "matplotlib__matplotlib-23476", "matplotlib__matplotlib-23562", "matplotlib__matplotlib-23563", "matplotlib__matplotlib-23913", "matplotlib__matplotlib-23964", "matplotlib__matplotlib-23987", "matplotlib__matplotlib-24149", "matplotlib__matplotlib-24265", "matplotlib__matplotlib-24334", "matplotlib__matplotlib-24970", "matplotlib__matplotlib-25079", "matplotlib__matplotlib-25311", "matplotlib__matplotlib-25332", 
"matplotlib__matplotlib-25433", "matplotlib__matplotlib-25442", "matplotlib__matplotlib-25498", "matplotlib__matplotlib-26011", "matplotlib__matplotlib-26020", "mwaskom__seaborn-2848", "mwaskom__seaborn-3010", "mwaskom__seaborn-3190", "mwaskom__seaborn-3407", "pallets__flask-4045", "pallets__flask-4992", "pallets__flask-5063", "psf__requests-1963", "psf__requests-2148", "psf__requests-2317", "psf__requests-2674", "psf__requests-3362", "psf__requests-863", "pydata__xarray-3364", "pydata__xarray-4094", "pydata__xarray-4248", "pydata__xarray-4493", "pydata__xarray-5131", "pylint-dev__pylint-5859", "pylint-dev__pylint-6506", "pylint-dev__pylint-7080", "pylint-dev__pylint-7114", "pylint-dev__pylint-7228", "pylint-dev__pylint-7993", "pytest-dev__pytest-11143", "pytest-dev__pytest-11148", "pytest-dev__pytest-5103", "pytest-dev__pytest-5221", "pytest-dev__pytest-5227", "pytest-dev__pytest-5413", "pytest-dev__pytest-5495", "pytest-dev__pytest-5692", "pytest-dev__pytest-6116", "pytest-dev__pytest-7168", "pytest-dev__pytest-7220", "pytest-dev__pytest-7373", "pytest-dev__pytest-7432", "pytest-dev__pytest-7490", "pytest-dev__pytest-8365", "pytest-dev__pytest-8906", "pytest-dev__pytest-9359", "scikit-learn__scikit-learn-10297", "scikit-learn__scikit-learn-10508", "scikit-learn__scikit-learn-10949", "scikit-learn__scikit-learn-11040", "scikit-learn__scikit-learn-11281", "scikit-learn__scikit-learn-12471", "scikit-learn__scikit-learn-13142", "scikit-learn__scikit-learn-13241", "scikit-learn__scikit-learn-13439", "scikit-learn__scikit-learn-13496", "scikit-learn__scikit-learn-13497", "scikit-learn__scikit-learn-13584", "scikit-learn__scikit-learn-13779", "scikit-learn__scikit-learn-14087", "scikit-learn__scikit-learn-14092", "scikit-learn__scikit-learn-14894", "scikit-learn__scikit-learn-14983", "scikit-learn__scikit-learn-15512", "scikit-learn__scikit-learn-15535", "scikit-learn__scikit-learn-25500", "scikit-learn__scikit-learn-25570", "scikit-learn__scikit-learn-25638", 
"scikit-learn__scikit-learn-25747", "sphinx-doc__sphinx-10325", "sphinx-doc__sphinx-10451", "sphinx-doc__sphinx-11445", "sphinx-doc__sphinx-7686", "sphinx-doc__sphinx-7738", "sphinx-doc__sphinx-7975", "sphinx-doc__sphinx-8273", "sphinx-doc__sphinx-8282", "sphinx-doc__sphinx-8435", "sphinx-doc__sphinx-8474", "sphinx-doc__sphinx-8506", "sphinx-doc__sphinx-8595", "sphinx-doc__sphinx-8627", "sphinx-doc__sphinx-8713", "sphinx-doc__sphinx-8721", "sphinx-doc__sphinx-8801", "sympy__sympy-11400", "sympy__sympy-11870", "sympy__sympy-11897", "sympy__sympy-12171", "sympy__sympy-12236", "sympy__sympy-12419", "sympy__sympy-12454", "sympy__sympy-12481", "sympy__sympy-13031", "sympy__sympy-13043", "sympy__sympy-13146", "sympy__sympy-13177", "sympy__sympy-13437", "sympy__sympy-13471", "sympy__sympy-13480", "sympy__sympy-13647", "sympy__sympy-13773", "sympy__sympy-13895", "sympy__sympy-13915", "sympy__sympy-13971", "sympy__sympy-14024", "sympy__sympy-14308", "sympy__sympy-14317", "sympy__sympy-14396", "sympy__sympy-14774", "sympy__sympy-14817", "sympy__sympy-15011", "sympy__sympy-15308", "sympy__sympy-15345", "sympy__sympy-15346", "sympy__sympy-15609", "sympy__sympy-15678", "sympy__sympy-16106", "sympy__sympy-16281", "sympy__sympy-16503", "sympy__sympy-16792", "sympy__sympy-16988", "sympy__sympy-17022", "sympy__sympy-17139", "sympy__sympy-17630", "sympy__sympy-17655", "sympy__sympy-18057", "sympy__sympy-18087", "sympy__sympy-18189", "sympy__sympy-18199", "sympy__sympy-18532", "sympy__sympy-18621", "sympy__sympy-18698", "sympy__sympy-18835", "sympy__sympy-19007", "sympy__sympy-19254", "sympy__sympy-19487", "sympy__sympy-20049", "sympy__sympy-20154", "sympy__sympy-20212", "sympy__sympy-20322", "sympy__sympy-20442", "sympy__sympy-20590", "sympy__sympy-20639", "sympy__sympy-21055", "sympy__sympy-21171", "sympy__sympy-21379", "sympy__sympy-21612", "sympy__sympy-21614", "sympy__sympy-21627", "sympy__sympy-21847", "sympy__sympy-22005", "sympy__sympy-22714", "sympy__sympy-22840", 
"sympy__sympy-23117", "sympy__sympy-23191", "sympy__sympy-23262", "sympy__sympy-24066", "sympy__sympy-24102", "sympy__sympy-24152", "sympy__sympy-24213", "sympy__sympy-24909"], "empty_patch_ids": [], "submitted_ids": ["astropy__astropy-12907"], "resolved_ids": [], "unresolved_ids": [], "error_ids": ["astropy__astropy-12907"], "unstopped_containers": [], "unremoved_images": [], "schema_version": 2}, "raw_logging_results": [{"task_id": "math-operations-001", "trace_id": "b91453e4-2871-457c-9490-63784fd77dfc", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e0199fc690>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "4350fef0-22a0-4dca-a71b-00649a1c057b", "outputs": ["It looks like you're testing the response. How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 15, "prompt_tokens": 8, "total_tokens": 23}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}, {"task_id": "math-operations-001", "trace_id": "3e38e470-583e-4db1-8ba0-5e9c72ec1432", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e01988b410>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "ba229a30-d4fa-473f-9671-3fbd5b31f0b0", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}]}
 
1
+ {"config": {"agent_name": "example_agent_3", "benchmark_name": "swebench_lite", "date": "2024-08-04", "run_id": "swebench_lite_example_agent_1722790656", "dataset_name": "princeton-nlp/SWE-bench_Lite", "max_workers": 1}, "results": {"accuracy": 0.210, "total_cost": 4.14}, "raw_eval_results": {"total_instances": 300, "submitted_instances": 1, "completed_instances": 0, "resolved_instances": 0, "unresolved_instances": 0, "empty_patch_instances": 0, "error_instances": 1, "unstopped_instances": 0, "completed_ids": [], "incomplete_ids": ["astropy__astropy-14182", "astropy__astropy-14365", "astropy__astropy-14995", "astropy__astropy-6938", "astropy__astropy-7746", "django__django-10914", "django__django-10924", "django__django-11001", "django__django-11019", "django__django-11039", "django__django-11049", "django__django-11099", "django__django-11133", "django__django-11179", "django__django-11283", "django__django-11422", "django__django-11564", "django__django-11583", "django__django-11620", "django__django-11630", "django__django-11742", "django__django-11797", "django__django-11815", "django__django-11848", "django__django-11905", "django__django-11910", "django__django-11964", "django__django-11999", "django__django-12113", "django__django-12125", "django__django-12184", "django__django-12284", "django__django-12286", "django__django-12308", "django__django-12453", "django__django-12470", "django__django-12497", "django__django-12589", "django__django-12700", "django__django-12708", "django__django-12747", "django__django-12856", "django__django-12908", "django__django-12915", "django__django-12983", "django__django-13028", "django__django-13033", "django__django-13158", "django__django-13220", "django__django-13230", "django__django-13265", "django__django-13315", "django__django-13321", "django__django-13401", "django__django-13447", "django__django-13448", "django__django-13551", "django__django-13590", "django__django-13658", "django__django-13660", 
"django__django-13710", "django__django-13757", "django__django-13768", "django__django-13925", "django__django-13933", "django__django-13964", "django__django-14016", "django__django-14017", "django__django-14155", "django__django-14238", "django__django-14382", "django__django-14411", "django__django-14534", "django__django-14580", "django__django-14608", "django__django-14667", "django__django-14672", "django__django-14730", "django__django-14752", "django__django-14787", "django__django-14855", "django__django-14915", "django__django-14997", "django__django-14999", "django__django-15061", "django__django-15202", "django__django-15213", "django__django-15252", "django__django-15320", "django__django-15347", "django__django-15388", "django__django-15400", "django__django-15498", "django__django-15695", "django__django-15738", "django__django-15781", "django__django-15789", "django__django-15790", "django__django-15814", "django__django-15819", "django__django-15851", "django__django-15902", "django__django-15996", "django__django-16041", "django__django-16046", "django__django-16139", "django__django-16229", "django__django-16255", "django__django-16379", "django__django-16400", "django__django-16408", "django__django-16527", "django__django-16595", "django__django-16816", "django__django-16820", "django__django-16873", "django__django-16910", "django__django-17051", "django__django-17087", "matplotlib__matplotlib-18869", "matplotlib__matplotlib-22711", "matplotlib__matplotlib-22835", "matplotlib__matplotlib-23299", "matplotlib__matplotlib-23314", "matplotlib__matplotlib-23476", "matplotlib__matplotlib-23562", "matplotlib__matplotlib-23563", "matplotlib__matplotlib-23913", "matplotlib__matplotlib-23964", "matplotlib__matplotlib-23987", "matplotlib__matplotlib-24149", "matplotlib__matplotlib-24265", "matplotlib__matplotlib-24334", "matplotlib__matplotlib-24970", "matplotlib__matplotlib-25079", "matplotlib__matplotlib-25311", "matplotlib__matplotlib-25332", 
"matplotlib__matplotlib-25433", "matplotlib__matplotlib-25442", "matplotlib__matplotlib-25498", "matplotlib__matplotlib-26011", "matplotlib__matplotlib-26020", "mwaskom__seaborn-2848", "mwaskom__seaborn-3010", "mwaskom__seaborn-3190", "mwaskom__seaborn-3407", "pallets__flask-4045", "pallets__flask-4992", "pallets__flask-5063", "psf__requests-1963", "psf__requests-2148", "psf__requests-2317", "psf__requests-2674", "psf__requests-3362", "psf__requests-863", "pydata__xarray-3364", "pydata__xarray-4094", "pydata__xarray-4248", "pydata__xarray-4493", "pydata__xarray-5131", "pylint-dev__pylint-5859", "pylint-dev__pylint-6506", "pylint-dev__pylint-7080", "pylint-dev__pylint-7114", "pylint-dev__pylint-7228", "pylint-dev__pylint-7993", "pytest-dev__pytest-11143", "pytest-dev__pytest-11148", "pytest-dev__pytest-5103", "pytest-dev__pytest-5221", "pytest-dev__pytest-5227", "pytest-dev__pytest-5413", "pytest-dev__pytest-5495", "pytest-dev__pytest-5692", "pytest-dev__pytest-6116", "pytest-dev__pytest-7168", "pytest-dev__pytest-7220", "pytest-dev__pytest-7373", "pytest-dev__pytest-7432", "pytest-dev__pytest-7490", "pytest-dev__pytest-8365", "pytest-dev__pytest-8906", "pytest-dev__pytest-9359", "scikit-learn__scikit-learn-10297", "scikit-learn__scikit-learn-10508", "scikit-learn__scikit-learn-10949", "scikit-learn__scikit-learn-11040", "scikit-learn__scikit-learn-11281", "scikit-learn__scikit-learn-12471", "scikit-learn__scikit-learn-13142", "scikit-learn__scikit-learn-13241", "scikit-learn__scikit-learn-13439", "scikit-learn__scikit-learn-13496", "scikit-learn__scikit-learn-13497", "scikit-learn__scikit-learn-13584", "scikit-learn__scikit-learn-13779", "scikit-learn__scikit-learn-14087", "scikit-learn__scikit-learn-14092", "scikit-learn__scikit-learn-14894", "scikit-learn__scikit-learn-14983", "scikit-learn__scikit-learn-15512", "scikit-learn__scikit-learn-15535", "scikit-learn__scikit-learn-25500", "scikit-learn__scikit-learn-25570", "scikit-learn__scikit-learn-25638", 
"scikit-learn__scikit-learn-25747", "sphinx-doc__sphinx-10325", "sphinx-doc__sphinx-10451", "sphinx-doc__sphinx-11445", "sphinx-doc__sphinx-7686", "sphinx-doc__sphinx-7738", "sphinx-doc__sphinx-7975", "sphinx-doc__sphinx-8273", "sphinx-doc__sphinx-8282", "sphinx-doc__sphinx-8435", "sphinx-doc__sphinx-8474", "sphinx-doc__sphinx-8506", "sphinx-doc__sphinx-8595", "sphinx-doc__sphinx-8627", "sphinx-doc__sphinx-8713", "sphinx-doc__sphinx-8721", "sphinx-doc__sphinx-8801", "sympy__sympy-11400", "sympy__sympy-11870", "sympy__sympy-11897", "sympy__sympy-12171", "sympy__sympy-12236", "sympy__sympy-12419", "sympy__sympy-12454", "sympy__sympy-12481", "sympy__sympy-13031", "sympy__sympy-13043", "sympy__sympy-13146", "sympy__sympy-13177", "sympy__sympy-13437", "sympy__sympy-13471", "sympy__sympy-13480", "sympy__sympy-13647", "sympy__sympy-13773", "sympy__sympy-13895", "sympy__sympy-13915", "sympy__sympy-13971", "sympy__sympy-14024", "sympy__sympy-14308", "sympy__sympy-14317", "sympy__sympy-14396", "sympy__sympy-14774", "sympy__sympy-14817", "sympy__sympy-15011", "sympy__sympy-15308", "sympy__sympy-15345", "sympy__sympy-15346", "sympy__sympy-15609", "sympy__sympy-15678", "sympy__sympy-16106", "sympy__sympy-16281", "sympy__sympy-16503", "sympy__sympy-16792", "sympy__sympy-16988", "sympy__sympy-17022", "sympy__sympy-17139", "sympy__sympy-17630", "sympy__sympy-17655", "sympy__sympy-18057", "sympy__sympy-18087", "sympy__sympy-18189", "sympy__sympy-18199", "sympy__sympy-18532", "sympy__sympy-18621", "sympy__sympy-18698", "sympy__sympy-18835", "sympy__sympy-19007", "sympy__sympy-19254", "sympy__sympy-19487", "sympy__sympy-20049", "sympy__sympy-20154", "sympy__sympy-20212", "sympy__sympy-20322", "sympy__sympy-20442", "sympy__sympy-20590", "sympy__sympy-20639", "sympy__sympy-21055", "sympy__sympy-21171", "sympy__sympy-21379", "sympy__sympy-21612", "sympy__sympy-21614", "sympy__sympy-21627", "sympy__sympy-21847", "sympy__sympy-22005", "sympy__sympy-22714", "sympy__sympy-22840", 
"sympy__sympy-23117", "sympy__sympy-23191", "sympy__sympy-23262", "sympy__sympy-24066", "sympy__sympy-24102", "sympy__sympy-24152", "sympy__sympy-24213", "sympy__sympy-24909"], "empty_patch_ids": [], "submitted_ids": ["astropy__astropy-12907"], "resolved_ids": [], "unresolved_ids": [], "error_ids": ["astropy__astropy-12907"], "unstopped_containers": [], "unremoved_images": [], "schema_version": 2}, "raw_logging_results": [{"task_id": "math-operations-001", "trace_id": "b91453e4-2871-457c-9490-63784fd77dfc", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e0199fc690>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "4350fef0-22a0-4dca-a71b-00649a1c057b", "outputs": ["It looks like you're testing the response. How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 15, "prompt_tokens": 8, "total_tokens": 23}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}, {"task_id": "math-operations-001", "trace_id": "3e38e470-583e-4db1-8ba0-5e9c72ec1432", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e01988b410>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "ba229a30-d4fa-473f-9671-3fbd5b31f0b0", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}]}
evals_live/swebench_lite_example_agent_1722790656.json CHANGED
@@ -1 +1 @@
1
- {"config": {"agent_name": "example_agent_2", "benchmark_name": "swebench_lite", "date": "2024-08-04", "run_id": "swebench_lite_example_agent_1722790656", "dataset_name": "princeton-nlp/SWE-bench_Lite", "max_workers": 1}, "results": {"accuracy": 32.0, "total_cost": 4.84}, "raw_eval_results": {"total_instances": 300, "submitted_instances": 1, "completed_instances": 0, "resolved_instances": 0, "unresolved_instances": 0, "empty_patch_instances": 0, "error_instances": 1, "unstopped_instances": 0, "completed_ids": [], "incomplete_ids": ["astropy__astropy-14182", "astropy__astropy-14365", "astropy__astropy-14995", "astropy__astropy-6938", "astropy__astropy-7746", "django__django-10914", "django__django-10924", "django__django-11001", "django__django-11019", "django__django-11039", "django__django-11049", "django__django-11099", "django__django-11133", "django__django-11179", "django__django-11283", "django__django-11422", "django__django-11564", "django__django-11583", "django__django-11620", "django__django-11630", "django__django-11742", "django__django-11797", "django__django-11815", "django__django-11848", "django__django-11905", "django__django-11910", "django__django-11964", "django__django-11999", "django__django-12113", "django__django-12125", "django__django-12184", "django__django-12284", "django__django-12286", "django__django-12308", "django__django-12453", "django__django-12470", "django__django-12497", "django__django-12589", "django__django-12700", "django__django-12708", "django__django-12747", "django__django-12856", "django__django-12908", "django__django-12915", "django__django-12983", "django__django-13028", "django__django-13033", "django__django-13158", "django__django-13220", "django__django-13230", "django__django-13265", "django__django-13315", "django__django-13321", "django__django-13401", "django__django-13447", "django__django-13448", "django__django-13551", "django__django-13590", "django__django-13658", "django__django-13660", 
"django__django-13710", "django__django-13757", "django__django-13768", "django__django-13925", "django__django-13933", "django__django-13964", "django__django-14016", "django__django-14017", "django__django-14155", "django__django-14238", "django__django-14382", "django__django-14411", "django__django-14534", "django__django-14580", "django__django-14608", "django__django-14667", "django__django-14672", "django__django-14730", "django__django-14752", "django__django-14787", "django__django-14855", "django__django-14915", "django__django-14997", "django__django-14999", "django__django-15061", "django__django-15202", "django__django-15213", "django__django-15252", "django__django-15320", "django__django-15347", "django__django-15388", "django__django-15400", "django__django-15498", "django__django-15695", "django__django-15738", "django__django-15781", "django__django-15789", "django__django-15790", "django__django-15814", "django__django-15819", "django__django-15851", "django__django-15902", "django__django-15996", "django__django-16041", "django__django-16046", "django__django-16139", "django__django-16229", "django__django-16255", "django__django-16379", "django__django-16400", "django__django-16408", "django__django-16527", "django__django-16595", "django__django-16816", "django__django-16820", "django__django-16873", "django__django-16910", "django__django-17051", "django__django-17087", "matplotlib__matplotlib-18869", "matplotlib__matplotlib-22711", "matplotlib__matplotlib-22835", "matplotlib__matplotlib-23299", "matplotlib__matplotlib-23314", "matplotlib__matplotlib-23476", "matplotlib__matplotlib-23562", "matplotlib__matplotlib-23563", "matplotlib__matplotlib-23913", "matplotlib__matplotlib-23964", "matplotlib__matplotlib-23987", "matplotlib__matplotlib-24149", "matplotlib__matplotlib-24265", "matplotlib__matplotlib-24334", "matplotlib__matplotlib-24970", "matplotlib__matplotlib-25079", "matplotlib__matplotlib-25311", "matplotlib__matplotlib-25332", 
"matplotlib__matplotlib-25433", "matplotlib__matplotlib-25442", "matplotlib__matplotlib-25498", "matplotlib__matplotlib-26011", "matplotlib__matplotlib-26020", "mwaskom__seaborn-2848", "mwaskom__seaborn-3010", "mwaskom__seaborn-3190", "mwaskom__seaborn-3407", "pallets__flask-4045", "pallets__flask-4992", "pallets__flask-5063", "psf__requests-1963", "psf__requests-2148", "psf__requests-2317", "psf__requests-2674", "psf__requests-3362", "psf__requests-863", "pydata__xarray-3364", "pydata__xarray-4094", "pydata__xarray-4248", "pydata__xarray-4493", "pydata__xarray-5131", "pylint-dev__pylint-5859", "pylint-dev__pylint-6506", "pylint-dev__pylint-7080", "pylint-dev__pylint-7114", "pylint-dev__pylint-7228", "pylint-dev__pylint-7993", "pytest-dev__pytest-11143", "pytest-dev__pytest-11148", "pytest-dev__pytest-5103", "pytest-dev__pytest-5221", "pytest-dev__pytest-5227", "pytest-dev__pytest-5413", "pytest-dev__pytest-5495", "pytest-dev__pytest-5692", "pytest-dev__pytest-6116", "pytest-dev__pytest-7168", "pytest-dev__pytest-7220", "pytest-dev__pytest-7373", "pytest-dev__pytest-7432", "pytest-dev__pytest-7490", "pytest-dev__pytest-8365", "pytest-dev__pytest-8906", "pytest-dev__pytest-9359", "scikit-learn__scikit-learn-10297", "scikit-learn__scikit-learn-10508", "scikit-learn__scikit-learn-10949", "scikit-learn__scikit-learn-11040", "scikit-learn__scikit-learn-11281", "scikit-learn__scikit-learn-12471", "scikit-learn__scikit-learn-13142", "scikit-learn__scikit-learn-13241", "scikit-learn__scikit-learn-13439", "scikit-learn__scikit-learn-13496", "scikit-learn__scikit-learn-13497", "scikit-learn__scikit-learn-13584", "scikit-learn__scikit-learn-13779", "scikit-learn__scikit-learn-14087", "scikit-learn__scikit-learn-14092", "scikit-learn__scikit-learn-14894", "scikit-learn__scikit-learn-14983", "scikit-learn__scikit-learn-15512", "scikit-learn__scikit-learn-15535", "scikit-learn__scikit-learn-25500", "scikit-learn__scikit-learn-25570", "scikit-learn__scikit-learn-25638", 
"scikit-learn__scikit-learn-25747", "sphinx-doc__sphinx-10325", "sphinx-doc__sphinx-10451", "sphinx-doc__sphinx-11445", "sphinx-doc__sphinx-7686", "sphinx-doc__sphinx-7738", "sphinx-doc__sphinx-7975", "sphinx-doc__sphinx-8273", "sphinx-doc__sphinx-8282", "sphinx-doc__sphinx-8435", "sphinx-doc__sphinx-8474", "sphinx-doc__sphinx-8506", "sphinx-doc__sphinx-8595", "sphinx-doc__sphinx-8627", "sphinx-doc__sphinx-8713", "sphinx-doc__sphinx-8721", "sphinx-doc__sphinx-8801", "sympy__sympy-11400", "sympy__sympy-11870", "sympy__sympy-11897", "sympy__sympy-12171", "sympy__sympy-12236", "sympy__sympy-12419", "sympy__sympy-12454", "sympy__sympy-12481", "sympy__sympy-13031", "sympy__sympy-13043", "sympy__sympy-13146", "sympy__sympy-13177", "sympy__sympy-13437", "sympy__sympy-13471", "sympy__sympy-13480", "sympy__sympy-13647", "sympy__sympy-13773", "sympy__sympy-13895", "sympy__sympy-13915", "sympy__sympy-13971", "sympy__sympy-14024", "sympy__sympy-14308", "sympy__sympy-14317", "sympy__sympy-14396", "sympy__sympy-14774", "sympy__sympy-14817", "sympy__sympy-15011", "sympy__sympy-15308", "sympy__sympy-15345", "sympy__sympy-15346", "sympy__sympy-15609", "sympy__sympy-15678", "sympy__sympy-16106", "sympy__sympy-16281", "sympy__sympy-16503", "sympy__sympy-16792", "sympy__sympy-16988", "sympy__sympy-17022", "sympy__sympy-17139", "sympy__sympy-17630", "sympy__sympy-17655", "sympy__sympy-18057", "sympy__sympy-18087", "sympy__sympy-18189", "sympy__sympy-18199", "sympy__sympy-18532", "sympy__sympy-18621", "sympy__sympy-18698", "sympy__sympy-18835", "sympy__sympy-19007", "sympy__sympy-19254", "sympy__sympy-19487", "sympy__sympy-20049", "sympy__sympy-20154", "sympy__sympy-20212", "sympy__sympy-20322", "sympy__sympy-20442", "sympy__sympy-20590", "sympy__sympy-20639", "sympy__sympy-21055", "sympy__sympy-21171", "sympy__sympy-21379", "sympy__sympy-21612", "sympy__sympy-21614", "sympy__sympy-21627", "sympy__sympy-21847", "sympy__sympy-22005", "sympy__sympy-22714", "sympy__sympy-22840", 
"sympy__sympy-23117", "sympy__sympy-23191", "sympy__sympy-23262", "sympy__sympy-24066", "sympy__sympy-24102", "sympy__sympy-24152", "sympy__sympy-24213", "sympy__sympy-24909"], "empty_patch_ids": [], "submitted_ids": ["astropy__astropy-12907"], "resolved_ids": [], "unresolved_ids": [], "error_ids": ["astropy__astropy-12907"], "unstopped_containers": [], "unremoved_images": [], "schema_version": 2}, "raw_logging_results": [{"task_id": "math-operations-001", "trace_id": "b91453e4-2871-457c-9490-63784fd77dfc", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e0199fc690>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "4350fef0-22a0-4dca-a71b-00649a1c057b", "outputs": ["It looks like you're testing the response. How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 15, "prompt_tokens": 8, "total_tokens": 23}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}, {"task_id": "math-operations-001", "trace_id": "3e38e470-583e-4db1-8ba0-5e9c72ec1432", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e01988b410>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "ba229a30-d4fa-473f-9671-3fbd5b31f0b0", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}]}
 
1
+ {"config": {"agent_name": "example_agent_2", "benchmark_name": "swebench_lite", "date": "2024-08-04", "run_id": "swebench_lite_example_agent_1722790656", "dataset_name": "princeton-nlp/SWE-bench_Lite", "max_workers": 1}, "results": {"accuracy": 0.320, "total_cost": 4.84}, "raw_eval_results": {"total_instances": 300, "submitted_instances": 1, "completed_instances": 0, "resolved_instances": 0, "unresolved_instances": 0, "empty_patch_instances": 0, "error_instances": 1, "unstopped_instances": 0, "completed_ids": [], "incomplete_ids": ["astropy__astropy-14182", "astropy__astropy-14365", "astropy__astropy-14995", "astropy__astropy-6938", "astropy__astropy-7746", "django__django-10914", "django__django-10924", "django__django-11001", "django__django-11019", "django__django-11039", "django__django-11049", "django__django-11099", "django__django-11133", "django__django-11179", "django__django-11283", "django__django-11422", "django__django-11564", "django__django-11583", "django__django-11620", "django__django-11630", "django__django-11742", "django__django-11797", "django__django-11815", "django__django-11848", "django__django-11905", "django__django-11910", "django__django-11964", "django__django-11999", "django__django-12113", "django__django-12125", "django__django-12184", "django__django-12284", "django__django-12286", "django__django-12308", "django__django-12453", "django__django-12470", "django__django-12497", "django__django-12589", "django__django-12700", "django__django-12708", "django__django-12747", "django__django-12856", "django__django-12908", "django__django-12915", "django__django-12983", "django__django-13028", "django__django-13033", "django__django-13158", "django__django-13220", "django__django-13230", "django__django-13265", "django__django-13315", "django__django-13321", "django__django-13401", "django__django-13447", "django__django-13448", "django__django-13551", "django__django-13590", "django__django-13658", "django__django-13660", 
"django__django-13710", "django__django-13757", "django__django-13768", "django__django-13925", "django__django-13933", "django__django-13964", "django__django-14016", "django__django-14017", "django__django-14155", "django__django-14238", "django__django-14382", "django__django-14411", "django__django-14534", "django__django-14580", "django__django-14608", "django__django-14667", "django__django-14672", "django__django-14730", "django__django-14752", "django__django-14787", "django__django-14855", "django__django-14915", "django__django-14997", "django__django-14999", "django__django-15061", "django__django-15202", "django__django-15213", "django__django-15252", "django__django-15320", "django__django-15347", "django__django-15388", "django__django-15400", "django__django-15498", "django__django-15695", "django__django-15738", "django__django-15781", "django__django-15789", "django__django-15790", "django__django-15814", "django__django-15819", "django__django-15851", "django__django-15902", "django__django-15996", "django__django-16041", "django__django-16046", "django__django-16139", "django__django-16229", "django__django-16255", "django__django-16379", "django__django-16400", "django__django-16408", "django__django-16527", "django__django-16595", "django__django-16816", "django__django-16820", "django__django-16873", "django__django-16910", "django__django-17051", "django__django-17087", "matplotlib__matplotlib-18869", "matplotlib__matplotlib-22711", "matplotlib__matplotlib-22835", "matplotlib__matplotlib-23299", "matplotlib__matplotlib-23314", "matplotlib__matplotlib-23476", "matplotlib__matplotlib-23562", "matplotlib__matplotlib-23563", "matplotlib__matplotlib-23913", "matplotlib__matplotlib-23964", "matplotlib__matplotlib-23987", "matplotlib__matplotlib-24149", "matplotlib__matplotlib-24265", "matplotlib__matplotlib-24334", "matplotlib__matplotlib-24970", "matplotlib__matplotlib-25079", "matplotlib__matplotlib-25311", "matplotlib__matplotlib-25332", 
"matplotlib__matplotlib-25433", "matplotlib__matplotlib-25442", "matplotlib__matplotlib-25498", "matplotlib__matplotlib-26011", "matplotlib__matplotlib-26020", "mwaskom__seaborn-2848", "mwaskom__seaborn-3010", "mwaskom__seaborn-3190", "mwaskom__seaborn-3407", "pallets__flask-4045", "pallets__flask-4992", "pallets__flask-5063", "psf__requests-1963", "psf__requests-2148", "psf__requests-2317", "psf__requests-2674", "psf__requests-3362", "psf__requests-863", "pydata__xarray-3364", "pydata__xarray-4094", "pydata__xarray-4248", "pydata__xarray-4493", "pydata__xarray-5131", "pylint-dev__pylint-5859", "pylint-dev__pylint-6506", "pylint-dev__pylint-7080", "pylint-dev__pylint-7114", "pylint-dev__pylint-7228", "pylint-dev__pylint-7993", "pytest-dev__pytest-11143", "pytest-dev__pytest-11148", "pytest-dev__pytest-5103", "pytest-dev__pytest-5221", "pytest-dev__pytest-5227", "pytest-dev__pytest-5413", "pytest-dev__pytest-5495", "pytest-dev__pytest-5692", "pytest-dev__pytest-6116", "pytest-dev__pytest-7168", "pytest-dev__pytest-7220", "pytest-dev__pytest-7373", "pytest-dev__pytest-7432", "pytest-dev__pytest-7490", "pytest-dev__pytest-8365", "pytest-dev__pytest-8906", "pytest-dev__pytest-9359", "scikit-learn__scikit-learn-10297", "scikit-learn__scikit-learn-10508", "scikit-learn__scikit-learn-10949", "scikit-learn__scikit-learn-11040", "scikit-learn__scikit-learn-11281", "scikit-learn__scikit-learn-12471", "scikit-learn__scikit-learn-13142", "scikit-learn__scikit-learn-13241", "scikit-learn__scikit-learn-13439", "scikit-learn__scikit-learn-13496", "scikit-learn__scikit-learn-13497", "scikit-learn__scikit-learn-13584", "scikit-learn__scikit-learn-13779", "scikit-learn__scikit-learn-14087", "scikit-learn__scikit-learn-14092", "scikit-learn__scikit-learn-14894", "scikit-learn__scikit-learn-14983", "scikit-learn__scikit-learn-15512", "scikit-learn__scikit-learn-15535", "scikit-learn__scikit-learn-25500", "scikit-learn__scikit-learn-25570", "scikit-learn__scikit-learn-25638", 
"scikit-learn__scikit-learn-25747", "sphinx-doc__sphinx-10325", "sphinx-doc__sphinx-10451", "sphinx-doc__sphinx-11445", "sphinx-doc__sphinx-7686", "sphinx-doc__sphinx-7738", "sphinx-doc__sphinx-7975", "sphinx-doc__sphinx-8273", "sphinx-doc__sphinx-8282", "sphinx-doc__sphinx-8435", "sphinx-doc__sphinx-8474", "sphinx-doc__sphinx-8506", "sphinx-doc__sphinx-8595", "sphinx-doc__sphinx-8627", "sphinx-doc__sphinx-8713", "sphinx-doc__sphinx-8721", "sphinx-doc__sphinx-8801", "sympy__sympy-11400", "sympy__sympy-11870", "sympy__sympy-11897", "sympy__sympy-12171", "sympy__sympy-12236", "sympy__sympy-12419", "sympy__sympy-12454", "sympy__sympy-12481", "sympy__sympy-13031", "sympy__sympy-13043", "sympy__sympy-13146", "sympy__sympy-13177", "sympy__sympy-13437", "sympy__sympy-13471", "sympy__sympy-13480", "sympy__sympy-13647", "sympy__sympy-13773", "sympy__sympy-13895", "sympy__sympy-13915", "sympy__sympy-13971", "sympy__sympy-14024", "sympy__sympy-14308", "sympy__sympy-14317", "sympy__sympy-14396", "sympy__sympy-14774", "sympy__sympy-14817", "sympy__sympy-15011", "sympy__sympy-15308", "sympy__sympy-15345", "sympy__sympy-15346", "sympy__sympy-15609", "sympy__sympy-15678", "sympy__sympy-16106", "sympy__sympy-16281", "sympy__sympy-16503", "sympy__sympy-16792", "sympy__sympy-16988", "sympy__sympy-17022", "sympy__sympy-17139", "sympy__sympy-17630", "sympy__sympy-17655", "sympy__sympy-18057", "sympy__sympy-18087", "sympy__sympy-18189", "sympy__sympy-18199", "sympy__sympy-18532", "sympy__sympy-18621", "sympy__sympy-18698", "sympy__sympy-18835", "sympy__sympy-19007", "sympy__sympy-19254", "sympy__sympy-19487", "sympy__sympy-20049", "sympy__sympy-20154", "sympy__sympy-20212", "sympy__sympy-20322", "sympy__sympy-20442", "sympy__sympy-20590", "sympy__sympy-20639", "sympy__sympy-21055", "sympy__sympy-21171", "sympy__sympy-21379", "sympy__sympy-21612", "sympy__sympy-21614", "sympy__sympy-21627", "sympy__sympy-21847", "sympy__sympy-22005", "sympy__sympy-22714", "sympy__sympy-22840", 
"sympy__sympy-23117", "sympy__sympy-23191", "sympy__sympy-23262", "sympy__sympy-24066", "sympy__sympy-24102", "sympy__sympy-24152", "sympy__sympy-24213", "sympy__sympy-24909"], "empty_patch_ids": [], "submitted_ids": ["astropy__astropy-12907"], "resolved_ids": [], "unresolved_ids": [], "error_ids": ["astropy__astropy-12907"], "unstopped_containers": [], "unremoved_images": [], "schema_version": 2}, "raw_logging_results": [{"task_id": "math-operations-001", "trace_id": "b91453e4-2871-457c-9490-63784fd77dfc", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e0199fc690>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "4350fef0-22a0-4dca-a71b-00649a1c057b", "outputs": ["It looks like you're testing the response. How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 15, "prompt_tokens": 8, "total_tokens": 23}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}, {"task_id": "math-operations-001", "trace_id": "3e38e470-583e-4db1-8ba0-5e9c72ec1432", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e01988b410>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "ba229a30-d4fa-473f-9671-3fbd5b31f0b0", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}]}
evals_live/usaco_USACO_Zero-shot_gpt-4o-mini-2024-07-18_1723149367.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:47da40649a27795be0ee4c80b1c56ef250aece1ae132c06c181e344c377b82e9
3
- size 692302610
 
 
 
 
evals_live/usaco_usaco_example_agent_1722871.json CHANGED
@@ -1 +1,127 @@
1
- {"config": {"agent_name": "usaco_example_agent_1", "benchmark_name": "usaco", "date": "2024-08-05", "run_id": "usaco_usaco_example_agent_1722871527"}, "results": {"accuracy": 32.0, "total_cost": 1.12}, "raw_eval_results": {"rdict": {"1333_platinum_good_bitstrings": [{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]}, "sdict": {"1333_platinum_good_bitstrings": [{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]}, "rs": [[{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]], "ss": [[{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]]}, "raw_logging_results": [{"task_id": "1333_platinum_good_bitstrings", "trace_id": "3aaa346a-30ee-4cb6-9b6d-e59930656d45", "project_id": "citp_agent_eval/usaco_1722871516", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x75aea89672e0>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "9a995abc-5d34-478e-86b2-46ea71a55a96", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.13", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.9.19 (main, May 6 2024, 19:43:03) \n[GCC 11.2.0]"}, "task_id": "1333_platinum_good_bitstrings"}, "_children": [], "_feedback": null}]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "agent_name": "usaco_example_agent_1",
4
+ "benchmark_name": "usaco",
5
+ "date": "2024-08-05",
6
+ "run_id": "usaco_usaco_example_agent_1722871527"
7
+ },
8
+ "results": {
9
+ "accuracy": 0.320,
10
+ "total_cost": 1.12
11
+ },
12
+ "raw_eval_results": {
13
+ "rdict": {
14
+ "1333_platinum_good_bitstrings": [
15
+ {
16
+ "result_type": 7,
17
+ "status": "No submission, error during judging",
18
+ "judge_output": "No submission, error during judging",
19
+ "num_passed": 0,
20
+ "fraction_passed": 0,
21
+ "result_list": null,
22
+ "num_tests": 10,
23
+ "problem_id": "1333_platinum_good_bitstrings"
24
+ }
25
+ ]
26
+ },
27
+ "sdict": {
28
+ "1333_platinum_good_bitstrings": [
29
+ {
30
+ "solution_code": "test",
31
+ "result": {
32
+ "result_type": 7,
33
+ "status": "No submission, error during judging",
34
+ "judge_output": "No submission, error during judging",
35
+ "num_passed": 0,
36
+ "fraction_passed": 0,
37
+ "result_list": null,
38
+ "num_tests": 10,
39
+ "problem_id": "1333_platinum_good_bitstrings"
40
+ },
41
+ "problem_id": "1333_platinum_good_bitstrings"
42
+ }
43
+ ]
44
+ },
45
+ "rs": [
46
+ [
47
+ {
48
+ "result_type": 7,
49
+ "status": "No submission, error during judging",
50
+ "judge_output": "No submission, error during judging",
51
+ "num_passed": 0,
52
+ "fraction_passed": 0,
53
+ "result_list": null,
54
+ "num_tests": 10,
55
+ "problem_id": "1333_platinum_good_bitstrings"
56
+ }
57
+ ]
58
+ ],
59
+ "ss": [
60
+ [
61
+ {
62
+ "solution_code": "test",
63
+ "result": {
64
+ "result_type": 7,
65
+ "status": "No submission, error during judging",
66
+ "judge_output": "No submission, error during judging",
67
+ "num_passed": 0,
68
+ "fraction_passed": 0,
69
+ "result_list": null,
70
+ "num_tests": 10,
71
+ "problem_id": "1333_platinum_good_bitstrings"
72
+ },
73
+ "problem_id": "1333_platinum_good_bitstrings"
74
+ }
75
+ ]
76
+ ]
77
+ },
78
+ "raw_logging_results": [
79
+ {
80
+ "task_id": "1333_platinum_good_bitstrings",
81
+ "trace_id": "3aaa346a-30ee-4cb6-9b6d-e59930656d45",
82
+ "project_id": "citp_agent_eval/usaco_1722871516",
83
+ "inputs": {
84
+ "self": "<openai.resources.chat.completions.Completions object at 0x75aea89672e0>",
85
+ "messages": [
86
+ {
87
+ "role": "user",
88
+ "content": "test"
89
+ }
90
+ ],
91
+ "model": "gpt-4o-mini-2024-07-18",
92
+ "max_tokens": 2000,
93
+ "n": 1,
94
+ "temperature": 1
95
+ },
96
+ "id": "9a995abc-5d34-478e-86b2-46ea71a55a96",
97
+ "outputs": [
98
+ "Test received! How can I assist you today?"
99
+ ],
100
+ "exception": null,
101
+ "summary": {
102
+ "usage": {
103
+ "gpt-4o-mini-2024-07-18": {
104
+ "requests": 1,
105
+ "completion_tokens": 10,
106
+ "prompt_tokens": 8,
107
+ "total_tokens": 18
108
+ }
109
+ }
110
+ },
111
+ "display_name": null,
112
+ "attributes": {
113
+ "weave": {
114
+ "client_version": "0.50.13",
115
+ "source": "python-sdk",
116
+ "os_name": "Linux",
117
+ "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024",
118
+ "os_release": "6.8.0-1010-azure",
119
+ "sys_version": "3.9.19 (main, May 6 2024, 19:43:03) \n[GCC 11.2.0]"
120
+ },
121
+ "task_id": "1333_platinum_good_bitstrings"
122
+ },
123
+ "_children": [],
124
+ "_feedback": null
125
+ }
126
+ ]
127
+ }
evals_live/usaco_usaco_example_agent_1722871527.json CHANGED
@@ -6,7 +6,7 @@
6
  "run_id": "usaco_usaco_example_agent_1722871527"
7
  },
8
  "results": {
9
- "accuracy": 42.0,
10
  "total_cost": 1.42
11
  },
12
  "raw_eval_results": {
 
6
  "run_id": "usaco_usaco_example_agent_1722871527"
7
  },
8
  "results": {
9
+ "accuracy": 0.420,
10
  "total_cost": 1.42
11
  },
12
  "raw_eval_results": {
evals_live/usaco_usaco_test_172306727812321123.json CHANGED
@@ -5,7 +5,7 @@
5
  "date": "2024-08-07",
6
  "run_id": "usaco_usaco_test_1723067278"
7
  },
8
- "results": {"accuracy": 44.0, "total_cost": 2.56},
9
  "raw_eval_results": {
10
  "rdict": {
11
  "1333_platinum_good_bitstrings": [
 
5
  "date": "2024-08-07",
6
  "run_id": "usaco_usaco_test_1723067278"
7
  },
8
+ "results": {"accuracy": 0.440, "total_cost": 2.56},
9
  "raw_eval_results": {
10
  "rdict": {
11
  "1333_platinum_good_bitstrings": [