import datetime
import time

import pytest
import requests

URL_BENCHMARK = "http://localhost:8080/ap/v1"
URL_AGENT = "http://localhost:8000/ap/v1"

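# Probe the agent's task list endpoint before collecting tests; if no agent
# is reachable, skip the whole module instead of failing every test with
# connection errors.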
try:
    response = requests.get(f"{URL_AGENT}/agent/tasks")
except requests.exceptions.ConnectionError:
    pytest.skip("No agent available to test against", allow_module_level=True)


@pytest.mark.parametrize(
    "eval_id, input_text, expected_artifact_length, test_name, should_be_successful",
    [
        (
            "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
            "Write the word 'Washington' to a .txt file",
            0,
            "WriteFile",
            True,
        ),
        (
            "f219f3d3-a41b-45a9-a3d0-389832086ee8",
            "Read the file called file_to_read.txt "
            "and write its content to a file called output.txt",
            1,
            "ReadFile",
            False,
        ),
    ],
)
def test_entire_workflow(
    eval_id: str,
    input_text: str,
    expected_artifact_length: int,
    test_name: str,
    should_be_successful: bool,
):
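    # Build the task payload and record how many tasks the agent lists
    # before the benchmark creates a new one.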
    task_request = {"eval_id": eval_id, "input": input_text}
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_before = response.json()["pagination"]["total_items"]

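    # Create the task through the benchmark and verify that exactly one new
    # task appears on the agent side.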
    task_response_benchmark = requests.post(
        URL_BENCHMARK + "/agent/tasks", json=task_request
    )
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_after = response.json()["pagination"]["total_items"]
    assert task_count_after == task_count_before + 1

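    # Capture a UTC timestamp just after task creation; it is compared
    # against the benchmark's reported start time at the end of the test.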
    timestamp_after_task_eval_created = datetime.datetime.now(datetime.timezone.utc)
    time.sleep(1.1)
    assert task_response_benchmark.status_code == 200
    task_response_benchmark = task_response_benchmark.json()
    assert task_response_benchmark["input"] == input_text

    task_response_benchmark_id = task_response_benchmark["task_id"]

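    # The task should also be retrievable directly from the agent, with the
    # expected number of attached artifacts.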
    response_task_agent = requests.get(
        f"{URL_AGENT}/agent/tasks/{task_response_benchmark_id}"
    )
    assert response_task_agent.status_code == 200
    response_task_agent = response_task_agent.json()
    assert len(response_task_agent["artifacts"]) == expected_artifact_length

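    # Execute a single step through the benchmark; the agent is expected to
    # finish the task in that one step.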
    step_request = {"input": input_text}

    step_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/steps",
        json=step_request,
    )
    assert step_response.status_code == 200
    step_response = step_response.json()
    assert step_response["is_last"] is True

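    # Request an evaluation of the completed task and check the reported
    # test name and success flag.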
    eval_response = requests.post(
        URL_BENCHMARK + "/agent/tasks/" + task_response_benchmark_id + "/evaluations",
        json={},
    )
    assert eval_response.status_code == 200
    eval_response = eval_response.json()
    print("eval_response")
    print(eval_response)
    assert eval_response["run_details"]["test_name"] == test_name
    assert eval_response["metrics"]["success"] == should_be_successful
    benchmark_start_time = datetime.datetime.fromisoformat(
        eval_response["run_details"]["benchmark_start_time"]
    )

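    # The benchmark's reported start time must predate the moment the task
    # was created above.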
    assert benchmark_start_time < timestamp_after_task_eval_created