"""
Module for running benchmarks.

This module defines functions to run benchmarks using a given agent and to print
the results of the benchmark tasks.

Functions
---------
run : function
    Runs the benchmark tasks using the provided agent and returns a list of TaskResult objects.

print_results : function
    Prints the results of the benchmark tasks to the console.
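
Example
-------
A rough usage sketch of the two functions defined here. Constructing a concrete
agent and loading a benchmark are outside this module's scope, so ``agent`` and
``benchmark`` below are assumed to already exist::

    results = run(agent, benchmark, verbose=False)
    print_results(results)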
"""

import time

from typing import List

import yaml

from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult
from gpt_engineer.core.base_agent import BaseAgent
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv


def run(
    agent: BaseAgent,
    benchmark: Benchmark,
    verbose=False,
) -> List[TaskResult]:
    """
    Runs the benchmark tasks using the provided agent and returns a list of TaskResult objects.

    Parameters
    ----------
    agent : BaseAgent
        The agent to use for running the benchmark tasks.
    benchmark : Benchmark
        The benchmark containing the tasks to run.
    verbose : bool, default=False
        A flag to indicate whether to print verbose output during the benchmark.

    Returns
    -------
    List[TaskResult]
        A list of TaskResult objects representing the results of the benchmark tasks.
    """
    task_results = []
    for task in benchmark.tasks:
        print(f"--> Running task: {task.name}\n")

        t0 = time.time()
        files_dict = agent.improve(task.initial_code, task.prompt)
        t1 = time.time()

        env = DiskExecutionEnv()
        env.upload(files_dict)

        if task.command:
            p = env.popen(task.command)
            # Pass the timeout by keyword: communicate()'s first positional
            # parameter is stdin input, not the timeout.
            stdout, stderr = p.communicate(timeout=benchmark.timeout)
            stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8")
        else:
            p, stdout, stderr = None, None, None

        exec_result = Assertable(
            files=files_dict,
            env=env,
            process=p,
            stdout=stdout,
            stderr=stderr,
        )

        task_results.append(
            TaskResult(
                task_name=task.name,
                assertion_results={
                    assertion_name: assertion(exec_result)
                    for assertion_name, assertion in task.assertions.items()
                },
                duration=t1 - t0,
            )
        )

    if verbose:
        print_results(task_results)
    return task_results


def print_results(results: list[TaskResult]):
    """
    Prints the results of the benchmark tasks to the console.

    Parameters
    ----------
    results : list[TaskResult]
        A list of TaskResult objects representing the results of the benchmark tasks.

    Returns
    -------
    None
    """
    for task_result in results:
        print(f"\n--- Results for {task_result.task_name} ---")
        print(f"{task_result.task_name} ({task_result.duration:.2f}s)")
        for assertion_name, assertion_result in task_result.assertion_results.items():
            checkmark = "✅" if assertion_result else "❌"
            print(f"  {checkmark} {assertion_name}")
        print()

    success_rates = [task_result.success_rate for task_result in results]
    avg_success_rate = sum(success_rates) / len(results)

    total_time = sum(task_result.duration for task_result in results)

    correct_assertions = sum(
        sum(
            assertion_result
            for assertion_result in task_result.assertion_results.values()
        )
        for task_result in results
    )
    total_assertions = sum(
        len(task_result.assertion_results) for task_result in results
    )
    correct_tasks = [
        task_result for task_result in results if task_result.success_rate == 1
    ]

    print("--- Results ---")
    print(f"Total time: {total_time:.2f}s")
    print(f"Completely correct tasks: {len(correct_tasks)}/{len(results)}")
    print(f"Total correct assertions: {correct_assertions}/{total_assertions}")
    print(f"Average success rate: {avg_success_rate * 100}% on {len(results)} tasks")
    print("--- Results ---")
    print()


def export_yaml_results(yaml_path, complete_results, config):
    """
    Writes benchmark results to a YAML file.

    For each entry in ``complete_results``, the fraction of tasks whose
    ``"solved"`` value equals 1.0 is computed from the ``"detailed"`` task list
    and stored under ``"fully_solved"``. The run ``config`` is attached under
    the ``"config"`` key before everything is dumped to ``yaml_path``.
    """
    for results in complete_results.values():
        correct_tasks = [
            task_result
            for task_result in results["detailed"]
            if task_result["solved"] == 1.0
        ]
        fraction_correct = len(correct_tasks) / len(results["detailed"])
        results["fully_solved"] = fraction_correct
    complete_results["config"] = config
    with open(yaml_path, "w") as f:
        yaml.dump(complete_results, f, indent=4)
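

# Illustrative only: a rough sketch of the ``complete_results`` shape consumed by
# export_yaml_results above. The benchmark key, the task fields other than
# "solved", and the config contents are made-up placeholders.
#
#     complete_results = {
#         "example_benchmark": {
#             "detailed": [
#                 {"task_name": "hello_world", "solved": 1.0},
#                 {"task_name": "file_editor", "solved": 0.5},
#             ],
#         },
#     }
#     export_yaml_results("results.yaml", complete_results, {"agent": "default"})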