---
# This file contains information about verified agent results for different benchmarks.
# Format:
#   benchmark_name:
#     - agent_name: "Name of the agent"
#       verification_date: YYYY-MM-DD
# Verified agent results for the `usaco` benchmark.
# Each list item is one mapping holding an agent's display name and the date
# its result was verified; `verification_date` must be indented under its
# `agent_name` so both keys belong to the same item.
# NOTE(review): dates are unquoted ISO-8601, so YAML 1.1 loaders (e.g. PyYAML)
# yield date objects rather than strings — confirm the consumer expects that.
usaco:
  - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-12
  - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-11
  - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-12
# Verified agent results for the `swebench_verified` benchmark.
# Each list item is one mapping: the agent's display name plus the date its
# result was verified (unquoted ISO-8601 date).
swebench_verified:
  - agent_name: "Agentless (gpt-4o-mini-2024-07-18) (50 Instances)"
    verification_date: 2024-08-17
  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1) (50 Instances)"
    verification_date: 2024-08-19
# Verified agent results for the `mlagentbench` benchmark.
# Each list item is one mapping: the agent's display name plus the date its
# result was verified (unquoted ISO-8601 date).
mlagentbench:
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-19