from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import json

import requests
from bs4 import BeautifulSoup
from openai import OpenAI

"""
EXAMPLE OUTPUT:

What is the current population for the city where Einstein was born?

Step 1
----------------------------------------
Executing: fetch_wiki_content
Arguments: {'title': 'Albert Einstein'}

Step 2
----------------------------------------
Executing: deliver_answer
Arguments: {'fields': ['Ulm, German Empire']}
ANSWER FROM THE ASSISTANT: ['Ulm, German Empire']

Step 3
----------------------------------------
Executing: fetch_wiki_content
Arguments: {'title': 'Ulm'}

Step 4
----------------------------------------
Executing: deliver_answer
Arguments: {'fields': ['128,928']}
ANSWER FROM THE ASSISTANT: ['128,928']

Step 5
----------------------------------------
Extraction Complete

Why was Einstein famous?

Step 1
----------------------------------------
Executing: fetch_wiki_content
Arguments: {'title': 'Albert Einstein'}

Step 2
----------------------------------------
Executing: deliver_answer
Arguments: {'fields': ['Best known for developing the theory of relativity, Einstein also made important contributions to quantum mechanics.', 'His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world\'s most famous equation."', 'He received the 1921 Nobel Prize in Physics.']}
ANSWER FROM THE ASSISTANT: ['Best known for developing the theory of relativity, Einstein also made important contributions to quantum mechanics.', 'His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world\'s most famous equation."', 'He received the 1921 Nobel Prize in Physics.']

Step 3
----------------------------------------
Extraction Complete
"""


@dataclass
class WikiConfig:
    """Configuration for OpenAI and Wikipedia settings"""
    api_key: str = "sk-123"
    api_base: str = "{info}/v1"
    model: Optional[str] = None
    max_steps: int = 5
    wikipedia_base_url: str = "https://en.wikipedia.org/wiki/"


class WikiTools:
    """Collection of Wikipedia and extraction tools"""

    def __init__(self, base_url: str):
        self.base_url = base_url

    def fetch_wiki_content(self, title: str, section: Optional[str] = None) -> str:
        """Fetch and clean Wikipedia article content, optionally from a specific section"""
        url = f"{self.base_url}{title.replace(' ', '_')}"
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove unwanted sections
        for unwanted in soup.find_all(['script', 'style', 'footer', 'header']):
            unwanted.decompose()

        if section:
            # Find specific section if requested
            section_tag = soup.find('span', {'id': section})
            if section_tag:
                content = section_tag.parent.find_next_siblings()
                text = ' '.join(tag.get_text() for tag in content)
            else:
                return "Section not found"
        else:
            # Get main content
            content = soup.find(id='mw-content-text')
            if content:
                text = content.get_text()
            else:
                return "Content not found"

        # Clean and normalize text
        text = ' '.join(text.split())
        return text[:8000]  # Truncate to avoid token limits

    @staticmethod
    def deliver_answer(fields: List[str]) -> Dict[str, Any]:
        """Deliver extracted text spans to the user"""
        print(f"ANSWER FROM THE ASSISTANT: {fields}")
        return {
            "extracted_fields": "Provided fields were delivered to the user successfully."
        }


class ToolRegistry:
    """Registry of available tools and their schemas"""

    def __init__(self, wiki_tools: WikiTools):
        self.wiki_tools = wiki_tools

    @property
    def available_functions(self) -> Dict[str, callable]:
        return {
            "fetch_wiki_content": self.wiki_tools.fetch_wiki_content,
            "deliver_answer": self.wiki_tools.deliver_answer
        }

    @property
    def tool_schemas(self) -> List[Dict[str, Any]]:
        return [
            {
                "type": "function",
                "function": {
                    "name": "fetch_wiki_content",
                    "description": "Fetch content from a Wikipedia article",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "title": {
                                "type": "string",
                                "description": "The title of the Wikipedia article"
                            },
                            "section": {
                                "type": "string",
                                "description": "Optional: Specific section ID to fetch",
                                "optional": True
                            }
                        },
                        "required": ["title"]
                    }
                }
            },
            {
                "type": "function",
                "function": {
                    "name": "deliver_answer",
                    "description": "Extract specific information from the fetched text",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "fields": {
                                "type": "array",
                                "items": {"type": "string"},
                                "description": "List of text spans from the article that are relevant to the query"
                            }
                        },
                        "required": ["fields"]
                    }
                }
            }
        ]


class WikiExtractionAgent:
    """Main agent class that handles the extraction process"""

    def __init__(self, config: WikiConfig):
        self.config = config
        self.client = OpenAI(api_key=config.api_key, base_url=config.api_base)
        self.wiki_tools = WikiTools(config.wikipedia_base_url)
        self.tools = ToolRegistry(self.wiki_tools)
        # System prompt that steers the model toward tool use instead of parametric answers
        self.system_message = {
            "role": "system",
            "content": (
                "1. First fetch any wikipedia pages you might need to answer the user query. "
                "Do not answer from parametric knowledge.\n\n"
                "2. Then, provide the answer to the user using the deliver_answer from the "
                "retrieved wikipedia page.\n\n"
                "3. You may need to issue multiple calls to wikipedia after extracting answers "
                "if there are nested dependencies for information."
            )
        }
        self.messages = [self.system_message]

        # Default to the first model served by the endpoint if none was configured
        if not config.model:
            models = self.client.models.list()
            self.config.model = models.data[0].id

    def _serialize_tool_call(self, tool_call) -> Dict[str, Any]:
        """Convert tool call to serializable format"""
        return {
            "id": tool_call.id,
            "type": tool_call.type,
            "function": {
                "name": tool_call.function.name,
                "arguments": tool_call.function.arguments
            }
        }

    def process_tool_calls(self, message) -> List[Dict[str, Any]]:
        """Process and execute tool calls from assistant"""
        results = []
        for tool_call in message.tool_calls:
            function_name = tool_call.function.name
            function_args = json.loads(tool_call.function.arguments)

            print(f"\nExecuting: {function_name}")
            print(f"Arguments: {function_args}")

            function_response = self.tools.available_functions[function_name](**function_args)
            results.append({
                "tool": function_name,
                "args": function_args,
                "response": function_response
            })

            self.messages.append({
                "role": "tool",
                "content": json.dumps(function_response),
                "tool_call_id": tool_call.id,
                "name": function_name
            })

        return results

    def extract_information(self, query: str) -> List[Dict[str, Any]]:
        """Main method to handle the extraction process"""
        # Start a fresh conversation for each query, keeping the system prompt
        self.messages = [
            self.system_message,
            {
                "role": "user",
                "content": f"""Extract information from Wikipedia to answer this query: {query}

You can use these tools:
1. fetch_wiki_content: Get article content
2. deliver_answer: deliver relevant information

Please fetch content first, and iterate as needed to get to the webpage with the correct answer and then deliver the relevant information."""
            }
        ]

        all_results = []
        for step in range(self.config.max_steps):
            print(f"\nStep {step + 1}")
            print("-" * 40)

            response = self.client.chat.completions.create(
                messages=self.messages,
                model=self.config.model,
                tools=self.tools.tool_schemas,
                temperature=0.0,
            )
            message = response.choices[0].message

            if not message.tool_calls:
                print("Extraction Complete")
                break

            # Record the assistant turn (including its tool calls) before appending tool results
            self.messages.append({
                "role": "assistant",
                "content": message.content or "",
                "tool_calls": [self._serialize_tool_call(tc) for tc in message.tool_calls]
            })

            results = self.process_tool_calls(message)
            all_results.extend(results)

        return all_results


def main():
    # Example usage
    config = WikiConfig()
    agent = WikiExtractionAgent(config)

    # Multi-step query example
    # The model should first issue a call to Wikipedia for Einstein, extract the span of the
    # article that says where he was born, and then use the value from that extraction (which
    # could contain the city name) to fetch the Wikipedia article for that city and pull the
    # population from it.
    # See the EXAMPLE OUTPUT docstring at the top of this file for the full trace of this
    # actual query as issued by Athene-V2-Agent.
    results = agent.extract_information(
        query="""What is the current population for the city where Einstein was born?"""
    )

    # Single query example
    # Here, the model should just issue a call to Einstein's Wikipedia page and extract the
    # parts regarding his accomplishments.
    results = agent.extract_information(
        query="Why was Einstein famous?"
    )


if __name__ == "__main__":
    main()
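
# Note: the WikiConfig defaults above ("sk-123" and "{info}/v1") are placeholders.
# A minimal usage sketch, assuming an OpenAI-compatible endpoint is already serving the
# model locally; the URL below is an assumption for illustration, not part of the example:
#
#   config = WikiConfig(api_key="sk-123", api_base="http://localhost:8000/v1")
#   agent = WikiExtractionAgent(config)
#   agent.extract_information(query="Why was Einstein famous?")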