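"""End-to-end workflow test against a locally running benchmark and agent.

The test creates a task through the benchmark API, verifies the agent has
registered it, runs a single step, and then requests an evaluation, checking
the reported test name, success flag, and benchmark start time.
"""
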
import datetime
import time

import pytest
import requests

URL_BENCHMARK = "http://localhost:8080/ap/v1"
URL_AGENT = "http://localhost:8000/ap/v1"

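# Skip the whole module up front if no agent is reachable at URL_AGENT.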
try:
    response = requests.get(f"{URL_AGENT}/agent/tasks")
except requests.exceptions.ConnectionError:
    pytest.skip("No agent available to test against", allow_module_level=True)


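# Each case: benchmark eval_id, task input text, number of artifacts expected
# on the newly created task as seen by the agent, the benchmark's test name,
# and whether the evaluation is expected to report success.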
@pytest.mark.parametrize(
    "eval_id, input_text, expected_artifact_length, test_name, should_be_successful",
    [
        (
            "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
            "Write the word 'Washington' to a .txt file",
            0,
            "WriteFile",
            True,
        ),
        (
            "f219f3d3-a41b-45a9-a3d0-389832086ee8",
            "Read the file called file_to_read.txt "
            "and write its content to a file called output.txt",
            1,
            "ReadFile",
            False,
        ),
    ],
)
def test_entire_workflow(
    eval_id: str,
    input_text: str,
    expected_artifact_length: int,
    test_name: str,
    should_be_successful: bool,
):
    task_request = {"eval_id": eval_id, "input": input_text}
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_before = response.json()["pagination"]["total_items"]
    # Create the task through the benchmark endpoint
    task_response_benchmark = requests.post(
        f"{URL_BENCHMARK}/agent/tasks", json=task_request
    )
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_after = response.json()["pagination"]["total_items"]
    assert task_count_after == task_count_before + 1

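    # Reference timestamp taken after task creation; the evaluation's reported
    # benchmark_start_time should fall before it (checked at the end).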
    timestamp_after_task_eval_created = datetime.datetime.now(datetime.timezone.utc)
    time.sleep(1.1)  # Ensure the two timestamps being compared are not identical
    assert task_response_benchmark.status_code == 200
    task_response_benchmark = task_response_benchmark.json()
    assert task_response_benchmark["input"] == input_text

    task_response_benchmark_id = task_response_benchmark["task_id"]

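    # Fetch the same task directly from the agent to confirm it was forwarded
    # and carries the expected number of artifacts.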
    response_task_agent = requests.get(
        f"{URL_AGENT}/agent/tasks/{task_response_benchmark_id}"
    )
    assert response_task_agent.status_code == 200
    response_task_agent = response_task_agent.json()
    assert len(response_task_agent["artifacts"]) == expected_artifact_length

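    # Execute a single step for the task through the benchmark endpoint.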
    step_request = {"input": input_text}

    step_response = requests.post(
        f"{URL_BENCHMARK}/agent/tasks/{task_response_benchmark_id}/steps",
        json=step_request,
    )
    assert step_response.status_code == 200
    step_response = step_response.json()
    assert step_response["is_last"] is True  # Assuming is_last is always True

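    # Request an evaluation of the completed task and verify the reported
    # test name, success flag, and start time.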
    eval_response = requests.post(
        f"{URL_BENCHMARK}/agent/tasks/{task_response_benchmark_id}/evaluations",
        json={},
    )
    assert eval_response.status_code == 200
    eval_response = eval_response.json()
    print("eval_response")
    print(eval_response)
    assert eval_response["run_details"]["test_name"] == test_name
    assert eval_response["metrics"]["success"] == should_be_successful
    benchmark_start_time = datetime.datetime.fromisoformat(
        eval_response["run_details"]["benchmark_start_time"]
    )

    assert benchmark_start_time < timestamp_after_task_eval_created