import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import AsyncIterator, Awaitable, ClassVar, Optional

import pytest
from agent_protocol_client import AgentApi, Step
from colorama import Fore, Style
from pydantic import BaseModel, Field

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult

logger = logging.getLogger(__name__)


class ChallengeInfo(BaseModel):
    eval_id: str = ""
    name: str
    task: str
    task_artifacts_dir: Optional[Path] = None
    category: list[Category]
    difficulty: Optional[DifficultyLevel] = None
    description: Optional[str] = None
    dependencies: list[str] = Field(default_factory=list)
    reference_answer: Optional[str]

    source_uri: str
    """Internal reference indicating the source of the challenge specification"""

    available: bool = True
    unavailable_reason: str = ""


class BaseChallenge(ABC):
    """
    The base class and shared interface for all specific challenge implementations.
    """

    info: ClassVar[ChallengeInfo]

    @classmethod
    @abstractmethod
    def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
        """
        Construct an individual challenge subclass from a suitable `source_uri`
        (as in `ChallengeInfo.source_uri`).
        """
        ...

    @abstractmethod
    def test_method(
        self,
        config: AgentBenchmarkConfig,
        request: pytest.FixtureRequest,
        i_attempt: int,
    ) -> None | Awaitable[None]:
        """
        Test method for use by Pytest-based benchmark sessions. Should return normally
        if the challenge passes, and raise a (preferably descriptive) error otherwise.
        """
        ...

    @classmethod
    async def run_challenge(
        cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
    ) -> AsyncIterator[Step]:
        """
        Runs the challenge on the subject agent with the specified timeout.
        Also prints basic challenge and status info to STDOUT.

        Params:
            config: The subject agent's benchmark config.
            timeout: Timeout (seconds) after which to stop the run if not finished.

        Yields:
            Step: The steps generated by the agent for the challenge task.
        """
        # avoid circular import
        from agbenchmark.agent_api_interface import run_api_agent

        print()
        print(
            f"{Fore.MAGENTA + Style.BRIGHT}{'='*24} "
            f"Starting {cls.info.name} challenge"
            f" {'='*24}{Style.RESET_ALL}"
        )
        print(f"{Fore.CYAN}Timeout:{Fore.RESET} {timeout} seconds")
        print(f"{Fore.CYAN}Task:{Fore.RESET} {cls.info.task}")
        print()

        logger.debug(f"Starting {cls.info.name} challenge run")
        i = 0
        async for step in run_api_agent(
            cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
        ):
            i += 1
            print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
            yield step
        logger.debug(f"Finished {cls.info.name} challenge run")

    @classmethod
    @abstractmethod
    async def evaluate_task_state(
        cls, agent: AgentApi, task_id: str
    ) -> list[EvalResult]:
        ...
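

# ---------------------------------------------------------------------------
# Illustrative sketch (commented out; not part of the benchmark suite): a
# minimal concrete subclass showing how the interface above might be filled
# in. The class name, task text, source URI scheme, and the Category member
# used here are hypothetical examples, not real challenges or guaranteed
# enum values.
# ---------------------------------------------------------------------------
#
# class ExampleEchoChallenge(BaseChallenge):
#     info = ChallengeInfo(
#         name="ExampleEcho",
#         task="Write the word 'hello' to a file named output.txt.",
#         category=[Category.GENERALIST],  # assumed member name
#         reference_answer="hello",
#         source_uri="example://echo",  # hypothetical source URI scheme
#     )
#
#     @classmethod
#     def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
#         # A real implementation would parse the URI and build a subclass
#         # dynamically; this sketch simply returns itself.
#         return cls
#
#     async def test_method(
#         self,
#         config: AgentBenchmarkConfig,
#         request: pytest.FixtureRequest,
#         i_attempt: int,
#     ) -> None:
#         # Consume the agent's steps; a real test would then call
#         # evaluate_task_state and assert on the returned EvalResults.
#         async for _step in self.run_challenge(config, timeout=60):
#             pass
#
#     @classmethod
#     async def evaluate_task_state(
#         cls, agent: AgentApi, task_id: str
#     ) -> list[EvalResult]:
#         # A real evaluator would inspect the task's output artifacts via
#         # `agent`; this sketch returns no results.
#         return []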