"""
Module for running benchmarks.

This module defines functions to run benchmarks using a given agent and to print
the results of the benchmark tasks.

Functions
---------
run : function
    Runs the benchmark tasks using the provided agent and returns a list of TaskResult objects.

print_results : function
    Prints the results of the benchmark tasks to the console.

export_yaml_results : function
    Exports the complete benchmark results, together with the run
    configuration, to a YAML file.
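
Examples
--------
A minimal usage sketch; how ``agent`` and ``benchmark`` are constructed is an
assumption about the calling code, not part of this module::

    from gpt_engineer.benchmark.run import print_results, run

    # `agent` must implement BaseAgent.improve(); `benchmark` is a Benchmark
    # instance obtained from the calling code (not shown here).
    results = run(agent, benchmark, verbose=True)
    print_results(results)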
"""
import time

from typing import List

import yaml

from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult
from gpt_engineer.core.base_agent import BaseAgent
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv


def run(
    agent: BaseAgent,
    benchmark: Benchmark,
    verbose: bool = False,
) -> List[TaskResult]:
    """
    Runs the benchmark tasks using the provided agent and returns a list of TaskResult objects.

    Parameters
    ----------
    agent : BaseAgent
        The agent to use for running the benchmark tasks.
    benchmark : Benchmark
        The benchmark containing the tasks to run.
    verbose : bool, default=False
        A flag to indicate whether to print verbose output during the benchmark.

    Returns
    -------
    List[TaskResult]
        A list of TaskResult objects representing the results of the benchmark tasks.
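
    Notes
    -----
    Each value in ``task.assertions`` is expected to be a callable that takes
    the ``Assertable`` built for the task and returns a truthy or falsy
    result. An illustrative assertion (hypothetical, not one shipped with any
    benchmark) could look like::

        lambda assertable: "hello" in (assertable.stdout or "")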
    """
    task_results = []
    for task in benchmark.tasks:
        print(f"--> Running task: {task.name}\n")

        t0 = time.time()
        files_dict = agent.improve(task.initial_code, task.prompt)
        t1 = time.time()

        env = DiskExecutionEnv()
        env.upload(files_dict)

        if task.command:
            p = env.popen(task.command)
            stdout, stderr = p.communicate(timeout=benchmark.timeout)
            stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8")
        else:
            p, stdout, stderr = None, None, None

        exec_result = Assertable(
            files=files_dict,
            env=env,
            process=p,
            stdout=stdout,
            stderr=stderr,
        )

        task_results.append(
            TaskResult(
                task_name=task.name,
                assertion_results={
                    assertion_name: assertion(exec_result)
                    for assertion_name, assertion in task.assertions.items()
                },
                duration=t1 - t0,
            )
        )

        if verbose:
            print_results(task_results)
    return task_results


def print_results(results: List[TaskResult]) -> None:
    """
    Prints the results of the benchmark tasks to the console.

    Parameters
    ----------
    results : list[TaskResult]
        A list of TaskResult objects representing the results of the benchmark tasks.

    Returns
    -------
    None
    """
    for task_result in results:
        print(f"\n--- Results for {task_result.task_name} ---")
        print(f"{task_result.task_name} ({task_result.duration:.2f}s)")
        for assertion_name, assertion_result in task_result.assertion_results.items():
            checkmark = "✅" if assertion_result else "❌"
            print(f"  {checkmark} {assertion_name}")
        print()

    success_rates = [task_result.success_rate for task_result in results]
    avg_success_rate = sum(success_rates) / len(results)

    total_time = sum(task_result.duration for task_result in results)

    correct_assertions = sum(
        sum(
            assertion_result
            for assertion_result in task_result.assertion_results.values()
        )
        for task_result in results
    )
    total_assertions = sum(
        len(task_result.assertion_results) for task_result in results
    )
    correct_tasks = [
        task_result for task_result in results if task_result.success_rate == 1
    ]

    print("--- Results ---")
    print(f"Total time: {total_time:.2f}s")
    print(f"Completely correct tasks: {len(correct_tasks)}/{len(results)}")
    print(f"Total correct assertions: {correct_assertions}/{total_assertions}")
    print(f"Average success rate: {avg_success_rate * 100}% on {len(results)} tasks")
    print("--- Results ---")
    print()


def export_yaml_results(yaml_path, complete_results, config):
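    """
    Export the complete benchmark results, together with the run
    configuration, to a YAML file.

    Parameters
    ----------
    yaml_path : str or Path
        Path of the YAML file to write.
    complete_results : dict
        Mapping of benchmark names to result dictionaries. Each result
        dictionary is expected to contain a "detailed" list whose entries have
        a numeric "solved" field (1.0 meaning fully solved); a "fully_solved"
        fraction is added to each dictionary in place. This structure is
        inferred from how the function reads the data, not from a documented
        schema.
    config : dict
        The configuration of the benchmark run, stored under the "config" key
        of the exported file.

    Returns
    -------
    None

    Examples
    --------
    Illustrative call with placeholder data::

        export_yaml_results(
            "results.yaml",
            {"example_benchmark": {"detailed": [{"solved": 1.0}, {"solved": 0.5}]}},
            {"agent": "example"},
        )
    """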
    for results in complete_results.values():
        correct_tasks = [
            task_result
            for task_result in results["detailed"]
            if task_result["solved"] == 1.0
        ]
        fraction_correct = len(correct_tasks) / len(results["detailed"])
        results["fully_solved"] = fraction_correct
    complete_results["config"] = config
    with open(yaml_path, "w") as f:
        yaml.dump(complete_results, f, indent=4)