import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import AsyncIterator, Awaitable, ClassVar, Optional

import pytest
from agent_protocol_client import AgentApi, Step
from colorama import Fore, Style
from pydantic import BaseModel, Field

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult

logger = logging.getLogger(__name__)


class ChallengeInfo(BaseModel):
    eval_id: str = ""
    name: str
    task: str
    task_artifacts_dir: Optional[Path] = None
    category: list[Category]
    difficulty: Optional[DifficultyLevel] = None
    description: Optional[str] = None
    dependencies: list[str] = Field(default_factory=list)
    reference_answer: Optional[str]

    source_uri: str
    """Internal reference indicating the source of the challenge specification"""

    available: bool = True
    unavailable_reason: str = ""
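
    # Example (illustrative only): a ChallengeInfo for a hypothetical file-writing
    # challenge. All field values below are made up, and the Category /
    # DifficultyLevel member names are assumptions about those enums.
    #
    #     info = ChallengeInfo(
    #         name="WriteFile",
    #         task="Write the word 'Washington' to a .txt file",
    #         category=[Category.GENERALIST],
    #         difficulty=DifficultyLevel.interface,
    #         reference_answer="A .txt file containing the word 'Washington'",
    #         source_uri="challenges/abilities/write_file/data.json",  # hypothetical
    #     )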


class BaseChallenge(ABC):
    """
    The base class and shared interface for all specific challenge implementations.
    """

    info: ClassVar[ChallengeInfo]

    @classmethod
    @abstractmethod
    def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
        """
        Construct an individual challenge subclass from a suitable `source_uri` (as in
        `ChallengeInfo.source_uri`).
        """
        ...
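
    # A concrete subclass is expected to resolve the URI to its challenge spec and
    # build a challenge class from it. A minimal sketch, assuming a hypothetical
    # "__BUILTIN__/<path to data.json>" URI scheme and a `from_challenge_spec`
    # factory method on the subclass:
    #
    #     @classmethod
    #     def from_source_uri(cls, source_uri: str) -> type["BuiltinChallenge"]:
    #         if not source_uri.startswith("__BUILTIN__/"):
    #             raise ValueError(f"Invalid source_uri: {source_uri}")
    #         spec_path = source_uri.removeprefix("__BUILTIN__/")
    #         return cls.from_challenge_spec(Path(spec_path))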

    @abstractmethod
    def test_method(
        self,
        config: AgentBenchmarkConfig,
        request: pytest.FixtureRequest,
        i_attempt: int,
    ) -> None | Awaitable[None]:
        """
        Test method for use by Pytest-based benchmark sessions. Should return normally
        if the challenge passes, and raise a (preferably descriptive) error otherwise.
        """
        ...
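
    # In a concrete challenge, test_method typically drives run_challenge and then
    # asserts on the evaluation results. Rough sketch (the `get_agent_api` helper
    # and the 60-second timeout are hypothetical placeholders):
    #
    #     async def test_method(self, config, request, i_attempt) -> None:
    #         task_id = ""
    #         async for step in self.run_challenge(config, timeout=60):
    #             task_id = step.task_id
    #         eval_results = await self.evaluate_task_state(get_agent_api(config), task_id)
    #         assert any(r.passed for r in eval_results), \
    #             f"No passing evaluation for {self.info.name}"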

    @classmethod
    async def run_challenge(
        cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
    ) -> AsyncIterator[Step]:
        """
        Runs the challenge on the subject agent with the specified timeout.
        Also prints basic challenge and status info to STDOUT.

        Params:
            config: The subject agent's benchmark config.
            timeout: Timeout (seconds) after which to stop the run if not finished.
            mock: Whether to run in mock mode; passed through to the agent API
                interface.

        Yields:
            Step: The steps generated by the agent for the challenge task.
        """
        # Local import to avoid a circular dependency at module import time
        from agbenchmark.agent_api_interface import run_api_agent

        print()
        print(
            f"{Fore.MAGENTA + Style.BRIGHT}{'='*24} "
            f"Starting {cls.info.name} challenge"
            f" {'='*24}{Style.RESET_ALL}"
        )
        print(f"{Fore.CYAN}Timeout:{Fore.RESET} {timeout} seconds")
        print(f"{Fore.CYAN}Task:{Fore.RESET} {cls.info.task}")

        print()
        logger.debug(f"Starting {cls.info.name} challenge run")
        i = 0
        async for step in run_api_agent(
            cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
        ):
            i += 1
            print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
            yield step
        logger.debug(f"Finished {cls.info.name} challenge run")

    @classmethod
    @abstractmethod
    async def evaluate_task_state(
        cls, agent: AgentApi, task_id: str
    ) -> list[EvalResult]:
        """
        Evaluate the given task's final state on the agent and return the results.
        """
        ...
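
    # A typical implementation inspects the task's output through the AgentApi
    # (e.g. listing and downloading the artifacts it produced) and scores it
    # against `cls.info.reference_answer`. Rough sketch; `score_output` is a
    # hypothetical helper and the exact AgentApi method names may differ:
    #
    #     @classmethod
    #     async def evaluate_task_state(cls, agent: AgentApi, task_id: str) -> list[EvalResult]:
    #         artifacts = await agent.list_agent_task_artifacts(task_id=task_id)
    #         results = []
    #         for artifact in artifacts.artifacts:
    #             output = await agent.download_agent_task_artifact(task_id, artifact.artifact_id)
    #             results.append(score_output(output, cls.info.reference_answer))
    #         return results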