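"""End-to-end test of the Agent Protocol workflow.

Creates a task through the benchmark endpoint, verifies the agent registered it,
executes a single step, and checks the evaluation result. Assumes the benchmark
is listening on localhost:8080 and the agent on localhost:8000 (see the URL
constants below).
"""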
import datetime
import time

import pytest
import requests

URL_BENCHMARK = "http://localhost:8080/ap/v1"
URL_AGENT = "http://localhost:8000/ap/v1"
try:
    # Skip the whole module if no agent is reachable at URL_AGENT.
    response = requests.get(f"{URL_AGENT}/agent/tasks")
except requests.exceptions.ConnectionError:
    pytest.skip("No agent available to test against", allow_module_level=True)

@pytest.mark.parametrize(
    "eval_id, input_text, expected_artifact_length, test_name, should_be_successful",
    [
        (
            "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
            "Write the word 'Washington' to a .txt file",
            0,
            "WriteFile",
            True,
        ),
        (
            "f219f3d3-a41b-45a9-a3d0-389832086ee8",
            "Read the file called file_to_read.txt "
            "and write its content to a file called output.txt",
            1,
            "ReadFile",
            False,
        ),
    ],
)
def test_entire_workflow(
    eval_id: str,
    input_text: str,
    expected_artifact_length: int,
    test_name: str,
    should_be_successful: bool,
):
    task_request = {"eval_id": eval_id, "input": input_text}
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_before = response.json()["pagination"]["total_items"]

    # Create the task via the benchmark; the agent's task count should grow by one
    task_response_benchmark = requests.post(
        URL_BENCHMARK + "/agent/tasks", json=task_request
    )
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_after = response.json()["pagination"]["total_items"]
    assert task_count_after == task_count_before + 1
    timestamp_after_task_eval_created = datetime.datetime.now(datetime.timezone.utc)
    time.sleep(1.1)  # Ensure the two timestamps being compared are distinct

    assert task_response_benchmark.status_code == 200
    task_response_benchmark = task_response_benchmark.json()
    assert task_response_benchmark["input"] == input_text
    task_response_benchmark_id = task_response_benchmark["task_id"]
    # The task created via the benchmark should be retrievable from the agent directly
    response_task_agent = requests.get(
        f"{URL_AGENT}/agent/tasks/{task_response_benchmark_id}"
    )
    assert response_task_agent.status_code == 200
    response_task_agent = response_task_agent.json()
    assert len(response_task_agent["artifacts"]) == expected_artifact_length

    # Execute a single step for the task
    step_request = {"input": input_text}
    step_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/steps",
        json=step_request,
    )
    assert step_response.status_code == 200
    step_response = step_response.json()
    assert step_response["is_last"] is True  # Assuming is_last is always True
    # Trigger the evaluation and check the reported results
    eval_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/evaluations",
        json={},
    )
    assert eval_response.status_code == 200
    eval_response = eval_response.json()
    print("eval_response:", eval_response)
    assert eval_response["run_details"]["test_name"] == test_name
    assert eval_response["metrics"]["success"] == should_be_successful

    benchmark_start_time = datetime.datetime.fromisoformat(
        eval_response["run_details"]["benchmark_start_time"]
    )
    assert benchmark_start_time < timestamp_after_task_eval_created