File size: 1,257 Bytes
b7d1f08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# This file lists the agent results that have been verified, grouped by benchmark.
# Format:
#   benchmark_name:
#     - agent_name: "Name of the agent"
#       verification_date: YYYY-MM-DD

# Verified agents for the `usaco` benchmark, one list entry per agent.
# NOTE(review): verification_date values are plain (unquoted) scalars; YAML 1.1
# loaders resolve such values to dates rather than strings. Confirm what the
# consumer expects before quoting them.
usaco:
  - agent_name: 'USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)'
    verification_date: 2024-08-20
  - agent_name: 'USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)'
    verification_date: 2024-08-20
  - agent_name: 'USACO Reflexion (gpt-4o-mini-2024-07-18)'
    verification_date: 2024-08-20
  - agent_name: 'USACO Episodic (gpt-4o-mini-2024-07-18)'
    verification_date: 2024-08-12
  - agent_name: 'USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)'
    verification_date: 2024-08-20
  - agent_name: 'USACO Zero-shot (gpt-4o-mini-2024-07-18)'
    verification_date: 2024-08-11
  - agent_name: 'USACO Semantic (gpt-4o-mini-2024-07-18)'
    verification_date: 2024-08-12

# Verified agents for the `swebench_verified` benchmark, one list entry per agent.
swebench_verified:
  - agent_name: 'Agentless (gpt-4o-mini-2024-07-18) (50 Instances)'
    verification_date: 2024-08-17
  # This name contains ": ", so it must stay quoted to parse as a single string.
  - agent_name: 'SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1) (50 Instances)'
    verification_date: 2024-08-19

# Verified agents for the `mlagentbench` benchmark, one list entry per agent.
mlagentbench:
  - agent_name: 'MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)'
    verification_date: 2024-08-19