import contextlib
import json
import logging
import os
import shutil
import threading
import time
from pathlib import Path
from typing import Generator
import pytest
from agbenchmark.challenges import OPTIONAL_CATEGORIES, BaseChallenge
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.report_types import Test
from agbenchmark.reports.ReportManager import RegressionTestsTracker
from agbenchmark.reports.reports import (
add_test_result_to_report,
make_empty_test_report,
session_finish,
)
from agbenchmark.utils.data_types import Category
GLOBAL_TIMEOUT = (
1500 # The tests will stop after 25 minutes so we can send the reports.
)
agbenchmark_config = AgentBenchmarkConfig.load()
logger = logging.getLogger(__name__)
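# Load the plugin that implements test dependency management (used for the `depends` markers below).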
pytest_plugins = ["agbenchmark.utils.dependencies"]
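# Keep pytest from collecting test files from the challenges directory directly.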
collect_ignore = ["challenges"]
@pytest.fixture(scope="module")
def config() -> AgentBenchmarkConfig:
return agbenchmark_config
@pytest.fixture(autouse=True)
def temp_folder() -> Generator[Path, None, None]:
"""
Pytest fixture that sets up and tears down the temporary folder for each test.
It is automatically used in every test due to the 'autouse=True' parameter.
"""
    # create output directory if it doesn't exist
    os.makedirs(agbenchmark_config.temp_folder, exist_ok=True)
yield agbenchmark_config.temp_folder
# teardown after test function completes
if not os.getenv("KEEP_TEMP_FOLDER_FILES"):
for filename in os.listdir(agbenchmark_config.temp_folder):
file_path = os.path.join(agbenchmark_config.temp_folder, filename)
try:
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
elif os.path.isdir(file_path):
shutil.rmtree(file_path)
except Exception as e:
logger.warning(f"Failed to delete {file_path}. Reason: {e}")
def pytest_addoption(parser: pytest.Parser) -> None:
"""
Pytest hook that adds command-line options to the `pytest` command.
The added options are specific to agbenchmark and control its behavior:
* `--mock` is used to run the tests in mock mode.
* `--host` is used to specify the host for the tests.
* `--category` is used to run only tests of a specific category.
* `--nc` is used to run the tests without caching.
* `--cutoff` is used to specify a cutoff time for the tests.
* `--improve` is used to run only the tests that are marked for improvement.
* `--maintain` is used to run only the tests that are marked for maintenance.
* `--explore` is used to run the tests in exploration mode.
* `--test` is used to run a specific test.
* `--no-dep` is used to run the tests without dependencies.
    * `--keep-answers` is used to keep the answers of the tests.
    * `-N`, `--attempts` is used to set the number of times each test is run.
Args:
parser: The Pytest CLI parser to which the command-line options are added.
"""
parser.addoption("-N", "--attempts", action="store")
parser.addoption("--no-dep", action="store_true")
parser.addoption("--mock", action="store_true")
parser.addoption("--host", default=None)
parser.addoption("--nc", action="store_true")
parser.addoption("--cutoff", action="store")
parser.addoption("--category", action="append")
parser.addoption("--test", action="append")
parser.addoption("--improve", action="store_true")
parser.addoption("--maintain", action="store_true")
parser.addoption("--explore", action="store_true")
parser.addoption("--keep-answers", action="store_true")
def pytest_configure(config: pytest.Config) -> None:
# Register category markers to prevent "unknown marker" warnings
for category in Category:
config.addinivalue_line("markers", f"{category.value}: {category}")
@pytest.fixture(autouse=True)
def check_regression(request: pytest.FixtureRequest) -> None:
"""
    Fixture that checks, for every test, whether it is a regression test and skips it
    accordingly when `--improve` or `--maintain` is used.
The test name is retrieved from the `request` object. Regression reports are loaded
from the path specified in the benchmark configuration.
Effect:
* If the `--improve` option is used and the current test is considered a regression
test, it is skipped.
* If the `--maintain` option is used and the current test is not considered a
regression test, it is also skipped.
Args:
request: The request object from which the test name and the benchmark
configuration are retrieved.
"""
with contextlib.suppress(FileNotFoundError):
rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)
assert isinstance(request.node, pytest.Function)
assert isinstance(request.node.parent, pytest.Class)
test_name = request.node.parent.name
challenge_location = getattr(request.node.cls, "CHALLENGE_LOCATION", "")
skip_string = f"Skipping {test_name} at {challenge_location}"
# Check if the test name exists in the regression tests
is_regression_test = rt_tracker.has_regression_test(test_name)
if request.config.getoption("--improve") and is_regression_test:
pytest.skip(f"{skip_string} because it's a regression test")
elif request.config.getoption("--maintain") and not is_regression_test:
pytest.skip(f"{skip_string} because it's not a regression test")
@pytest.fixture(autouse=True, scope="session")
def mock(request: pytest.FixtureRequest) -> bool:
"""
Pytest fixture that retrieves the value of the `--mock` command-line option.
The `--mock` option is used to run the tests in mock mode.
Args:
request: The `pytest.FixtureRequest` from which the `--mock` option value
is retrieved.
Returns:
bool: Whether `--mock` is set for this session.
"""
mock = request.config.getoption("--mock")
assert isinstance(mock, bool)
return mock
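# Maps each challenge's eval_id to its test report for the current session.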
test_reports: dict[str, Test] = {}
def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
"""
Pytest hook that is called when a test report is being generated.
It is used to generate and finalize reports for each test.
Args:
item: The test item for which the report is being generated.
call: The call object from which the test result is retrieved.
"""
challenge: type[BaseChallenge] = item.cls # type: ignore
challenge_id = challenge.info.eval_id
if challenge_id not in test_reports:
test_reports[challenge_id] = make_empty_test_report(challenge.info)
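    # "setup" phase: remember the test's name; "call" phase: record the attempt's outcome.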
if call.when == "setup":
test_name = item.nodeid.split("::")[1]
item.user_properties.append(("test_name", test_name))
if call.when == "call":
add_test_result_to_report(
test_reports[challenge_id], item, call, agbenchmark_config
)
def timeout_monitor(start_time: float) -> None:
"""
Function that limits the total execution time of the test suite.
This function is supposed to be run in a separate thread and calls `pytest.exit`
if the total execution time has exceeded the global timeout.
Args:
        start_time (float): The start time of the test suite.
"""
while time.time() - start_time < GLOBAL_TIMEOUT:
time.sleep(1) # check every second
pytest.exit("Test suite exceeded the global timeout", returncode=1)
def pytest_sessionstart(session: pytest.Session) -> None:
"""
Pytest hook that is called at the start of a test session.
Sets up and runs a `timeout_monitor` in a separate thread.
"""
start_time = time.time()
t = threading.Thread(target=timeout_monitor, args=(start_time,))
t.daemon = True # Daemon threads are abruptly stopped at shutdown
t.start()
def pytest_sessionfinish(session: pytest.Session) -> None:
"""
Pytest hook that is called at the end of a test session.
Finalizes and saves the test reports.
"""
session_finish(agbenchmark_config)
def pytest_generate_tests(metafunc: pytest.Metafunc):
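    """Parametrize `i_attempt` so each test runs as many times as `-N`/`--attempts` specifies."""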
n = metafunc.config.getoption("-N")
    metafunc.parametrize("i_attempt", range(int(n)) if isinstance(n, str) else [0])
def pytest_collection_modifyitems(
items: list[pytest.Function], config: pytest.Config
) -> None:
"""
Pytest hook that is called after initial test collection has been performed.
Modifies the collected test items based on the agent benchmark configuration,
adding the dependency marker and category markers.
Args:
items: The collected test items to be modified.
config: The active pytest configuration.
"""
rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)
try:
challenges_beaten_in_the_past = json.loads(
agbenchmark_config.challenges_already_beaten_file.read_bytes()
)
except FileNotFoundError:
challenges_beaten_in_the_past = {}
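    # Maps challenge names to whether they have ever been beaten (see has_been_passed below).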
selected_tests: tuple[str] = config.getoption("--test") # type: ignore
selected_categories: tuple[str] = config.getoption("--category") # type: ignore
# Can't use a for-loop to remove items in-place
i = 0
while i < len(items):
item = items[i]
        challenge = item.cls
        # Warn and skip non-challenge items instead of asserting, so the warning is reachable.
        if not (challenge and issubclass(challenge, BaseChallenge)):
            item.warn(
                pytest.PytestCollectionWarning(
                    f"Non-challenge item collected: {challenge}"
                )
            )
            i += 1
            continue
        challenge_name = challenge.info.name
# --test: remove the test from the set if it's not specifically selected
if selected_tests and challenge.info.name not in selected_tests:
items.remove(item)
continue
# Filter challenges for --maintain, --improve, and --explore:
# --maintain -> only challenges expected to be passed (= regression tests)
# --improve -> only challenges that so far are not passed (reliably)
# --explore -> only challenges that have never been passed
is_regression_test = rt_tracker.has_regression_test(challenge.info.name)
has_been_passed = challenges_beaten_in_the_past.get(challenge.info.name, False)
if (
(config.getoption("--maintain") and not is_regression_test)
or (config.getoption("--improve") and is_regression_test)
or (config.getoption("--explore") and has_been_passed)
):
items.remove(item)
continue
dependencies = challenge.info.dependencies
if (
config.getoption("--test")
or config.getoption("--no-dep")
or config.getoption("--maintain")
):
# Ignore dependencies:
# --test -> user selected specific tests to run, don't care about deps
# --no-dep -> ignore dependency relations regardless of test selection
# --maintain -> all "regression" tests must pass, so run all of them
dependencies = []
elif config.getoption("--improve"):
# Filter dependencies, keep only deps that are not "regression" tests
dependencies = [
d for d in dependencies if not rt_tracker.has_regression_test(d)
]
# Set category markers
challenge_categories = set(c.value for c in challenge.info.category)
for category in challenge_categories:
item.add_marker(category)
# Enforce category selection
if selected_categories:
if not challenge_categories.intersection(set(selected_categories)):
items.remove(item)
continue
# # Filter dependencies, keep only deps from selected categories
# dependencies = [
# d for d in dependencies
# if not set(d.categories).intersection(set(selected_categories))
# ]
# Skip items in optional categories that are not selected for the subject agent
challenge_optional_categories = challenge_categories & set(OPTIONAL_CATEGORIES)
if challenge_optional_categories and not (
agbenchmark_config.categories
and challenge_optional_categories.issubset(
set(agbenchmark_config.categories)
)
):
logger.debug(
f"Skipping {challenge_name}: "
f"category {' and '.join(challenge_optional_categories)} is optional, "
"and not explicitly selected in the benchmark config."
)
items.remove(item)
continue
# Add marker for the DependencyManager
item.add_marker(pytest.mark.depends(on=dependencies, name=challenge_name))
i += 1