Spaces:

kakumusic
/

autogpt

Configuration error

App Files Files Community

autogpt / benchmark /agbenchmark /challenges /builtin.py

kakumusic

Upload folder using huggingface_hub

b225a21 verified 3 months ago

raw

history blame

17 kB

	import glob
	import json
	import logging
	import os
	import subprocess
	import sys
	import tempfile
	from collections import deque
	from pathlib import Path
	from typing import Annotated, Any, ClassVar, Iterator, Literal, Optional

	import pytest
	from agent_protocol_client import AgentApi, ApiClient
	from agent_protocol_client import Configuration as ClientConfig
	from agent_protocol_client import Step
	from colorama import Fore, Style
	from openai import _load_client as get_openai_client
	from pydantic import (
	BaseModel,
	Field,
	StringConstraints,
	ValidationInfo,
	field_validator,
	)

	from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
	from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
	from agbenchmark.config import AgentBenchmarkConfig
	from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
	from agbenchmark.utils.prompts import (
	END_PROMPT,
	FEW_SHOT_EXAMPLES,
	PROMPT_MAP,
	SCORING_MAP,
	)

	from .base import BaseChallenge, ChallengeInfo

	logger = logging.getLogger(__name__)

	with open(Path(__file__).parent / "optional_categories.json") as f:
	OPTIONAL_CATEGORIES: list[str] = json.load(f)["optional_categories"]


	class BuiltinChallengeSpec(BaseModel):
	eval_id: str = ""
	name: str
	task: str
	category: list[Category]
	dependencies: list[str]
	cutoff: int

	class Info(BaseModel):
	difficulty: DifficultyLevel
	description: Annotated[
	str, StringConstraints(pattern=r"^Tests if the agent can.*")
	]
	side_effects: list[str] = Field(default_factory=list)

	info: Info

	class Ground(BaseModel):
	answer: str
	should_contain: Optional[list[str]] = None
	should_not_contain: Optional[list[str]] = None
	files: list[str]
	case_sensitive: Optional[bool] = True

	class Eval(BaseModel):
	type: str
	scoring: Optional[Literal["percentage", "scale", "binary"]] = None
	template: Optional[
	Literal["rubric", "reference", "question", "custom"]
	] = None
	examples: Optional[str] = None

	@field_validator("scoring", "template")
	def validate_eval_fields(cls, value, info: ValidationInfo):
	field_name = info.field_name
	if "type" in info.data and info.data["type"] == "llm":
	if value is None:
	raise ValueError(
	f"{field_name} must be provided when eval type is 'llm'"
	)
	else:
	if value is not None:
	raise ValueError(
	f"{field_name} should only exist when eval type is 'llm'"
	)
	return value

	eval: Eval

	ground: Ground

	metadata: Optional[dict[str, Any]] = None
	spec_file: Path \| None = Field(None, exclude=True)


	class BuiltinChallenge(BaseChallenge):
	"""
	Base class for AGBenchmark's built-in challenges (challenges/*/.json).

	All of the logic is present in this class. Individual challenges are created as
	subclasses of `BuiltinChallenge` with challenge-specific values assigned to the
	ClassVars `_spec` etc.

	Dynamically constructing subclasses rather than class instances for the individual
	challenges makes them suitable for collection by Pytest, which will run their
	`test_method` like any regular test item.
	"""

	_spec: ClassVar[BuiltinChallengeSpec]
	CHALLENGE_LOCATION: ClassVar[str]
	ARTIFACTS_LOCATION: ClassVar[str]

	SOURCE_URI_PREFIX = "__BUILTIN__"

	@classmethod
	def from_challenge_spec(
	cls, spec: BuiltinChallengeSpec
	) -> type["BuiltinChallenge"]:
	if not spec.spec_file:
	raise ValueError("spec.spec_file not defined")

	challenge_info = ChallengeInfo(
	eval_id=spec.eval_id,
	name=spec.name,
	task=spec.task,
	task_artifacts_dir=spec.spec_file.parent,
	category=spec.category,
	difficulty=spec.info.difficulty,
	description=spec.info.description,
	dependencies=spec.dependencies,
	reference_answer=spec.ground.answer,
	source_uri=(
	f"__BUILTIN__/{spec.spec_file.relative_to(Path(__file__).parent)}"
	),
	)

	challenge_class_name = f"Test{challenge_info.name}"
	logger.debug(f"Creating {challenge_class_name} from spec: {spec.spec_file}")
	return type(
	challenge_class_name,
	(BuiltinChallenge,),
	{
	"info": challenge_info,
	"_spec": spec,
	"CHALLENGE_LOCATION": str(spec.spec_file),
	"ARTIFACTS_LOCATION": str(spec.spec_file.resolve().parent),
	},
	)

	@classmethod
	def from_challenge_spec_file(cls, spec_file: Path) -> type["BuiltinChallenge"]:
	challenge_spec = BuiltinChallengeSpec.model_validate_json(spec_file.read_text())
	challenge_spec.spec_file = spec_file
	return cls.from_challenge_spec(challenge_spec)

	@classmethod
	def from_source_uri(cls, source_uri: str) -> type["BuiltinChallenge"]:
	if not source_uri.startswith(cls.SOURCE_URI_PREFIX):
	raise ValueError(f"Invalid source_uri for BuiltinChallenge: {source_uri}")

	path = source_uri.split("/", 1)[1]
	spec_file = Path(__file__).parent / path
	return cls.from_challenge_spec_file(spec_file)

	@pytest.mark.asyncio
	async def test_method(
	self,
	config: AgentBenchmarkConfig,
	request: pytest.FixtureRequest,
	i_attempt: int,
	) -> None:
	# if os.environ.get("HELICONE_API_KEY"):
	# from helicone.lock import HeliconeLockManager

	# HeliconeLockManager.write_custom_property("challenge", self.info.name)

	timeout = self._spec.cutoff or 60

	if request.config.getoption("--nc"):
	timeout = 100000
	elif cutoff := request.config.getoption("--cutoff"):
	timeout = int(cutoff) # type: ignore

	task_id = ""
	n_steps = 0
	timed_out = None
	agent_task_cost = None
	steps: list[Step] = []
	try:
	async for step in self.run_challenge(
	config, timeout, mock=bool(request.config.getoption("--mock"))
	):
	if not task_id:
	task_id = step.task_id

	n_steps += 1
	steps.append(step.model_copy())
	if step.additional_output:
	agent_task_cost = step.additional_output.get(
	"task_total_cost",
	step.additional_output.get("task_cumulative_cost"),
	)
	timed_out = False
	except TimeoutError:
	timed_out = True

	assert isinstance(request.node, pytest.Item)
	request.node.user_properties.append(("steps", steps))
	request.node.user_properties.append(("n_steps", n_steps))
	request.node.user_properties.append(("timed_out", timed_out))
	request.node.user_properties.append(("agent_task_cost", agent_task_cost))

	agent_client_config = ClientConfig(host=config.host)
	async with ApiClient(agent_client_config) as api_client:
	api_instance = AgentApi(api_client)
	eval_results = await self.evaluate_task_state(api_instance, task_id)

	if not eval_results:
	if timed_out:
	raise TimeoutError("Timed out, no results to evaluate")
	else:
	raise ValueError("No results to evaluate")

	request.node.user_properties.append(
	(
	"answers",
	[r.result for r in eval_results]
	if request.config.getoption("--keep-answers")
	else None,
	)
	)
	request.node.user_properties.append(("scores", [r.score for r in eval_results]))

	# FIXME: this allows partial failure
	assert any(r.passed for r in eval_results), (
	f"No passed evals: {eval_results}"
	if not timed_out
	else f"Timed out; no passed evals: {eval_results}"
	)

	@classmethod
	async def evaluate_task_state(
	cls, agent: AgentApi, task_id: str
	) -> list[EvalResult]:
	with tempfile.TemporaryDirectory() as workspace:
	workspace = Path(workspace)
	await download_agent_artifacts_into_folder(agent, task_id, workspace)
	if cls.info.task_artifacts_dir:
	copy_challenge_artifacts_into_workspace(
	cls.info.task_artifacts_dir, "custom_python", workspace
	)

	return list(cls.evaluate_workspace_content(workspace))

	@classmethod
	def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
	result_ground = cls._spec.ground
	outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)

	if result_ground.should_contain or result_ground.should_not_contain:
	for source, content in outputs_for_eval:
	score = cls.score_result(content, result_ground)
	if score is not None:
	print(f"{Fore.GREEN}Your score is:{Style.RESET_ALL}", score)
	yield EvalResult(
	result=content,
	result_source=str(source),
	score=score,
	passed=score > 0.9, # FIXME: arbitrary threshold
	)

	if result_ground.eval.type in ("python", "pytest"):
	for py_file, output in outputs_for_eval:
	yield EvalResult(
	result=output,
	result_source=str(py_file),
	score=float(not output.startswith("Error:")),
	passed=not output.startswith("Error:"),
	)

	if result_ground.eval.type == "llm":
	combined_results = "\n".join(output[1] for output in outputs_for_eval)
	llm_eval = cls.score_result_with_llm(combined_results, result_ground)
	print(f"{Fore.GREEN}Your score is:{Style.RESET_ALL}", llm_eval)
	if result_ground.eval.scoring == "percentage":
	score = llm_eval / 100
	elif result_ground.eval.scoring == "scale":
	score = llm_eval / 10
	else:
	score = llm_eval

	yield EvalResult(
	result=combined_results,
	result_source=", ".join(str(res[0]) for res in outputs_for_eval),
	score=score,
	passed=score > 0.9, # FIXME: arbitrary threshold
	)

	@staticmethod
	def get_outputs_for_eval(
	workspace: str \| Path \| dict[str, str], ground: BuiltinChallengeSpec.Ground
	) -> Iterator[tuple[str \| Path, str]]:
	if isinstance(workspace, dict):
	workspace = workspace["output"]

	script_dir = workspace

	for file_pattern in ground.files:
	# Check if it is a file extension
	if file_pattern.startswith("."):
	# Find all files with the given extension in the workspace
	matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern))
	else:
	# Otherwise, it is a specific file
	matching_files = [os.path.join(script_dir, file_pattern)]

	logger.debug(
	f"Files to evaluate for pattern `{file_pattern}`: {matching_files}"
	)

	for file_path in matching_files:
	relative_file_path = Path(file_path).relative_to(workspace)
	logger.debug(
	f"Evaluating {relative_file_path} "
	f"(eval type: {ground.eval.type})..."
	)
	if ground.eval.type == "python":
	result = subprocess.run(
	[sys.executable, file_path],
	cwd=os.path.abspath(workspace),
	capture_output=True,
	text=True,
	)
	if "error" in result.stderr or result.returncode != 0:
	yield relative_file_path, f"Error: {result.stderr}\n"
	else:
	yield relative_file_path, f"Output: {result.stdout}\n"
	else:
	with open(file_path, "r") as f:
	yield relative_file_path, f.read()
	else:
	if ground.eval.type == "pytest":
	result = subprocess.run(
	[sys.executable, "-m", "pytest"],
	cwd=os.path.abspath(workspace),
	capture_output=True,
	text=True,
	)
	logger.debug(f"EXIT CODE: {result.returncode}")
	logger.debug(f"STDOUT: {result.stdout}")
	logger.debug(f"STDERR: {result.stderr}")
	if "error" in result.stderr or result.returncode != 0:
	yield "pytest", f"Error: {result.stderr.strip() or result.stdout}\n"
	else:
	yield "pytest", f"Output: {result.stdout}\n"

	@staticmethod
	def score_result(content: str, ground: BuiltinChallengeSpec.Ground) -> float \| None:
	print(f"{Fore.BLUE}Scoring content:{Style.RESET_ALL}", content)
	if ground.should_contain:
	for should_contain_word in ground.should_contain:
	if not ground.case_sensitive:
	should_contain_word = should_contain_word.lower()
	content = content.lower()
	print_content = (
	f"{Fore.BLUE}Word that should exist{Style.RESET_ALL}"
	f" - {should_contain_word}:"
	)
	if should_contain_word not in content:
	print(print_content, "False")
	return 0.0
	else:
	print(print_content, "True")
	return 1.0

	if ground.should_not_contain:
	for should_not_contain_word in ground.should_not_contain:
	if not ground.case_sensitive:
	should_not_contain_word = should_not_contain_word.lower()
	content = content.lower()
	print_content = (
	f"{Fore.BLUE}Word that should not exist{Style.RESET_ALL}"
	f" - {should_not_contain_word}:"
	)
	if should_not_contain_word in content:
	print(print_content, "False")
	return 0.0
	else:
	print(print_content, "True")
	return 1.0

	@classmethod
	def score_result_with_llm(
	cls, content: str, ground: BuiltinChallengeSpec.Ground, *, mock: bool = False
	) -> float:
	if mock:
	return 1.0

	# the validation for this is done in the Eval BaseModel
	scoring = SCORING_MAP[ground.eval.scoring] # type: ignore
	prompt = PROMPT_MAP[ground.eval.template].format( # type: ignore
	task=cls._spec.task, scoring=scoring, answer=ground.answer, response=content
	)

	if ground.eval.examples:
	prompt += FEW_SHOT_EXAMPLES.format(examples=ground.eval.examples)

	prompt += END_PROMPT

	answer = get_openai_client().chat.completions.create(
	model="gpt-4",
	messages=[
	{"role": "system", "content": prompt},
	],
	)

	return float(answer.choices[0].message.content) # type: ignore


	def load_builtin_challenges() -> Iterator[type[BuiltinChallenge]]:
	logger.info("Loading built-in challenges...")

	challenges_path = Path(__file__).parent
	logger.debug(f"Looking for challenge spec files in {challenges_path}...")

	json_files = deque(challenges_path.rglob("data.json"))

	logger.debug(f"Found {len(json_files)} built-in challenges.")

	loaded, ignored = 0, 0
	while json_files:
	# Take and remove the first element from json_files
	json_file = json_files.popleft()
	if _challenge_should_be_ignored(json_file):
	ignored += 1
	continue

	challenge = BuiltinChallenge.from_challenge_spec_file(json_file)
	logger.debug(f"Generated test for {challenge.info.name}")
	yield challenge

	loaded += 1

	logger.info(
	f"Loading built-in challenges complete: loaded {loaded}, ignored {ignored}."
	)


	def _challenge_should_be_ignored(json_file_path: Path):
	return (
	"challenges/deprecated" in json_file_path.as_posix()
	or "challenges/library" in json_file_path.as_posix()
	)