# NOTE: removed non-Python extraction artifacts that preceded the imports
# (a "File size" banner, git blame hashes, and a line-number gutter).
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import json
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
"""
EXAMPLE OUTPUT:
What is the current population for the city where Einstein was born?
Step 1
----------------------------------------
Executing: fetch_wiki_content
Arguments: {'title': 'Albert Einstein'}
Step 2
----------------------------------------
Executing: deliver_answer
Arguments: {'fields': ['Ulm, German Empire']}
ANSWER FROM THE ASSISTANT: ['Ulm, German Empire']
Step 3
----------------------------------------
Executing: fetch_wiki_content
Arguments: {'title': 'Ulm'}
Step 4
----------------------------------------
Executing: deliver_answer
Arguments: {'fields': ['128,928']}
ANSWER FROM THE ASSISTANT: ['128,928']
Step 5
----------------------------------------
Extraction Complete
Why was Einstein famous?
Step 1
----------------------------------------
Executing: fetch_wiki_content
Arguments: {'title': 'Albert Einstein'}
Step 2
----------------------------------------
Executing: deliver_answer
Arguments: {'fields': ['Best known for developing the theory of relativity, Einstein also made important contributions to quantum mechanics.', 'His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world\'s most famous equation."', 'He received the 1921 Nobel Prize in Physics.']}
ANSWER FROM THE ASSISTANT: ['Best known for developing the theory of relativity, Einstein also made important contributions to quantum mechanics.', 'His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world\'s most famous equation."', 'He received the 1921 Nobel Prize in Physics.']
Step 3
----------------------------------------
Extraction Complete
"""
@dataclass
class WikiConfig:
    """Configuration for OpenAI and Wikipedia settings"""
    # NOTE(review): "sk-123" is a dummy key — presumably the target is a
    # local OpenAI-compatible server that ignores auth; confirm before use.
    api_key: str = "sk-123"
    # NOTE(review): "{info}" looks like an unexpanded template placeholder
    # for the server address (e.g. "http://localhost:8000"); callers must
    # override this default — TODO confirm intended value.
    api_base: str = "{info}/v1"
    # When None, WikiExtractionAgent.__init__ picks the first model the
    # server advertises via client.models.list().
    model: Optional[str] = None
    # Maximum number of chat/tool-call rounds per extract_information() call.
    max_steps: int = 5
    # Prefix that article titles are appended to when fetching pages.
    wikipedia_base_url: str = "https://en.wikipedia.org/wiki/"
class WikiTools:
    """Collection of Wikipedia fetching and answer-delivery tools."""

    # Fix: bound HTTP requests — the original requests.get() had no timeout
    # and could hang the agent indefinitely on a stalled connection.
    REQUEST_TIMEOUT: float = 15.0
    # Truncation limit applied to article text to avoid blowing token limits.
    MAX_CHARS: int = 8000

    def __init__(self, base_url: str):
        # Base URL that article titles are appended to,
        # e.g. "https://en.wikipedia.org/wiki/".
        self.base_url = base_url

    def fetch_wiki_content(self, title: str, section: Optional[str] = None) -> str:
        """Fetch and clean Wikipedia article content.

        Args:
            title: Title of the Wikipedia article (spaces become underscores).
            section: Optional HTML span id of a specific section to extract.

        Returns:
            Whitespace-normalized article (or section) text truncated to
            MAX_CHARS characters, or an error string when the page or the
            requested section has no usable content.
        """
        url = f"{self.base_url}{title.replace(' ', '_')}"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop boilerplate elements that would pollute the extracted text.
        for unwanted in soup.find_all(['script', 'style', 'footer', 'header']):
            unwanted.decompose()

        if section:
            # Section headings carry the id on a <span> inside the heading;
            # the section body consists of the heading's following siblings.
            section_tag = soup.find('span', {'id': section})
            if not section_tag:
                return "Section not found"
            siblings = section_tag.parent.find_next_siblings()
            text = ' '.join(tag.get_text() for tag in siblings)
        else:
            # Whole article: MediaWiki puts the body under this element id.
            body = soup.find(id='mw-content-text')
            if not body:
                return "Content not found"
            text = body.get_text()

        # Collapse all runs of whitespace into single spaces.
        text = ' '.join(text.split())
        return text[:self.MAX_CHARS]

    @staticmethod
    def deliver_answer(fields: List[str]) -> Dict[str, Any]:
        """Surface extracted text spans to the user and acknowledge delivery.

        Args:
            fields: Text spans relevant to the user's query.

        Returns:
            A status payload echoed back to the model as the tool result.
        """
        print(f"ANSWER FROM THE ASSISTANT: {fields}")
        return {
            "extracted_fields": "Provided fields was delivered to the user successfully."
        }
class ToolRegistry:
    """Registry of available tools and their OpenAI function-calling schemas."""

    def __init__(self, wiki_tools: "WikiTools"):
        # WikiTools instance whose bound methods are dispatched by name.
        self.wiki_tools = wiki_tools

    @property
    def available_functions(self) -> Dict[str, callable]:
        """Map tool names (as emitted by the model) to the callables to run."""
        return {
            "fetch_wiki_content": self.wiki_tools.fetch_wiki_content,
            "deliver_answer": self.wiki_tools.deliver_answer
        }

    @property
    def tool_schemas(self) -> List[Dict[str, Any]]:
        """Tool definitions passed to chat.completions.create().

        Fix: dropped the non-standard '"optional": True' key from the
        'section' property — JSON Schema has no such keyword; optionality
        is already expressed by omitting 'section' from 'required'.
        """
        return [
            {
                "type": "function",
                "function": {
                    "name": "fetch_wiki_content",
                    "description": "Fetch content from a Wikipedia article",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "title": {
                                "type": "string",
                                "description": "The title of the Wikipedia article"
                            },
                            "section": {
                                "type": "string",
                                "description": "Optional: Specific section ID to fetch"
                            }
                        },
                        "required": ["title"]
                    }
                }
            },
            {
                "type": "function",
                "function": {
                    "name": "deliver_answer",
                    "description": "Extract specific information from the fetched text",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "fields": {
                                "type": "array",
                                "items": {"type": "string"},
                                "description": "List of text spans from the article that are relevant to the query"
                            }
                        },
                        "required": ["fields"]
                    }
                }
            }
        ]
class WikiExtractionAgent:
    """Agent that answers user queries by iteratively calling Wikipedia
    tools through an OpenAI-compatible chat-completions endpoint."""

    # System prompt steering the model toward tool use.  Bug fix: the
    # original stored this as {"system": ...} — not a valid chat message
    # (missing "role"/"content" keys) — and then discarded it anyway when
    # extract_information() reset the history, so it was never sent.
    SYSTEM_PROMPT = (
        "1. First fetch any wikipedia pages you might need to answer the user"
        " query. Do not answer from parametric knowledge.\n\n"
        "2.Then, provide the answer to the user using the deliver_answer from"
        " the retrieved wikipedia page.\n\n"
        "3. You may need to issue multiple calls to wikipedia after extracting"
        " answers if there are nested dependencies for information."
    )

    def __init__(self, config: "WikiConfig"):
        """Create the client and tool registry and seed the chat history.

        Args:
            config: Connection settings and the per-query step limit.
        """
        self.config = config
        self.client = OpenAI(api_key=config.api_key, base_url=config.api_base)
        self.wiki_tools = WikiTools(config.wikipedia_base_url)
        self.tools = ToolRegistry(self.wiki_tools)
        # Bug fix: seed with a well-formed system message (see SYSTEM_PROMPT).
        self.messages: List[Dict[str, Any]] = [
            {"role": "system", "content": self.SYSTEM_PROMPT}
        ]
        if not config.model:
            # No model configured: use the first one the server advertises.
            models = self.client.models.list()
            self.config.model = models.data[0].id

    def _serialize_tool_call(self, tool_call) -> Dict[str, Any]:
        """Convert an SDK tool-call object into a plain JSON-safe dict."""
        return {
            "id": tool_call.id,
            "type": tool_call.type,
            "function": {
                "name": tool_call.function.name,
                "arguments": tool_call.function.arguments
            }
        }

    def process_tool_calls(self, message) -> List[Dict[str, Any]]:
        """Execute every tool call in an assistant message.

        Appends one "tool"-role message per call to the chat history so the
        model sees each result on the next round.

        Args:
            message: Assistant message whose .tool_calls is non-empty.

        Returns:
            A list of {"tool", "args", "response"} records, in call order.
        """
        results = []
        for tool_call in message.tool_calls:
            function_name = tool_call.function.name
            function_args = json.loads(tool_call.function.arguments)

            print(f"\nExecuting: {function_name}")
            print(f"Arguments: {function_args}")

            function_response = self.tools.available_functions[function_name](**function_args)
            results.append({
                "tool": function_name,
                "args": function_args,
                "response": function_response
            })
            self.messages.append({
                "role": "tool",
                "content": json.dumps(function_response),
                "tool_call_id": tool_call.id,
                "name": function_name
            })
        return results

    def extract_information(self, query: str) -> List[Dict[str, Any]]:
        """Run the fetch/extract loop for one query.

        Args:
            query: Natural-language question to answer from Wikipedia.

        Returns:
            Records of every tool call executed across all steps.
        """
        # Bug fix: keep the system prompt when starting a fresh conversation
        # (the original reset dropped it entirely).
        self.messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"""Extract information from Wikipedia to answer this query: {query}
You can use these tools:
1. fetch_wiki_content: Get article content
2. deliver_answer: deliver relevant information
Please fetch content first, and iterate as needed to get to the webpage with the correct answer and then deliver the relevant information."""
            },
        ]
        all_results = []

        for step in range(self.config.max_steps):
            print(f"\nStep {step + 1}")
            print("-" * 40)

            response = self.client.chat.completions.create(
                messages=self.messages,
                model=self.config.model,
                tools=self.tools.tool_schemas,
                temperature=0.0,
            )
            message = response.choices[0].message

            # No tool calls means the model considers the query answered.
            if not message.tool_calls:
                print("Extraction Complete")
                break

            # Bug fix: store the assistant text as-is; the original wrapped
            # it in json.dumps(), turning None into the string "null" and
            # double-quoting real text.
            self.messages.append({
                "role": "assistant",
                "content": message.content or "",
                "tool_calls": [self._serialize_tool_call(tc) for tc in message.tool_calls]
            })
            results = self.process_tool_calls(message)
            all_results.extend(results)

        return all_results
def main():
    """Demonstrate the agent on one multi-hop and one single-hop query."""
    agent = WikiExtractionAgent(WikiConfig())

    # Multi-hop example: the model must first fetch Einstein's article and
    # extract his birth city, then use that extracted value to fetch the
    # city's article and pull its population.  The EXAMPLE OUTPUT string at
    # the top of this file shows the actual trace Athene-V2-Agent produced
    # for this exact query.
    agent.extract_information(
        query="""What is the current population for the city where Einstein was born?"""
    )

    # Single-hop example: one fetch of Einstein's article, then extraction
    # of the passages describing his accomplishments.
    agent.extract_information(
        query="Why was Einstein famous?"
    )


if __name__ == "__main__":
    main()
|