File size: 10,537 Bytes
2ed01e6
 
 
 
 
 
 
 
 
 
 
 
aa7753a
2ed01e6
 
 
 
 
aa7753a
2ed01e6
 
 
 
 
 
aa7753a
2ed01e6
 
 
 
 
aa7753a
2ed01e6
 
 
 
 
 
aa7753a
2ed01e6
 
 
 
 
 
aa7753a
2ed01e6
 
 
 
 
aa7753a
2ed01e6
 
 
 
 
 
aa7753a
2ed01e6
 
 
 
 
 
 
 
 
 
aa7753a
2ed01e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa7753a
 
2ed01e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa7753a
 
 
 
2ed01e6
 
 
aa7753a
2ed01e6
aa7753a
 
2ed01e6
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import json
import requests
from bs4 import BeautifulSoup
from openai import OpenAI

"""
EXAMPLE OUTPUT:

What is the current population for the city  where Einstein was born?

Step 1
----------------------------------------

Executing: fetch_wiki_content
Arguments: {'title': 'Albert Einstein'}

Step 2
----------------------------------------

Executing: deliver_answer
Arguments: {'fields': ['Ulm, German Empire']}
ANSWER FROM THE ASSISTANT: ['Ulm, German Empire']

Step 3
----------------------------------------

Executing: fetch_wiki_content
Arguments: {'title': 'Ulm'}

Step 4
----------------------------------------

Executing: deliver_answer
Arguments: {'fields': ['128,928']}
ANSWER FROM THE ASSISTANT: ['128,928']

Step 5
----------------------------------------
Extraction Complete


Why was Einstein famous?

Step 1
----------------------------------------

Executing: fetch_wiki_content
Arguments: {'title': 'Albert Einstein'}

Step 2
----------------------------------------

Executing: deliver_answer
Arguments: {'fields': ['Best known for developing the theory of relativity, Einstein also made important contributions to quantum mechanics.', 'His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world\'s most famous equation."', 'He received the 1921 Nobel Prize in Physics.']}
ANSWER FROM THE ASSISTANT: ['Best known for developing the theory of relativity, Einstein also made important contributions to quantum mechanics.', 'His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world\'s most famous equation."', 'He received the 1921 Nobel Prize in Physics.']

Step 3
----------------------------------------
Extraction Complete
"""

@dataclass
class WikiConfig:
    """Settings for the OpenAI-compatible client and for Wikipedia access.

    Every field has a default, so ``WikiConfig()`` can be built without
    arguments; override fields by keyword as needed.
    """
    # API key passed to the OpenAI client. The default is presumably a dummy
    # value for local/self-hosted servers that ignore the key -- confirm.
    api_key: str = "sk-123"
    # Base URL passed to the OpenAI client.
    # NOTE(review): "{info}" looks like an unfilled template placeholder; it
    # is never formatted in this file, so it likely has to be overridden with
    # a real host before use -- confirm.
    api_base: str = "{info}/v1"
    # Model id for chat completions. When left as None, WikiExtractionAgent
    # asks the server for its model list and uses the first entry.
    model: Optional[str] = None
    # Maximum number of chat-completion rounds run for a single query.
    max_steps: int = 5
    # URL prefix to which the (underscore-joined) article title is appended.
    wikipedia_base_url: str = "https://en.wikipedia.org/wiki/"

class WikiTools:
    """Wikipedia fetching and answer-delivery tools exposed to the model.

    Args:
        base_url: URL prefix to which an article title is appended.
        timeout: Seconds to wait for Wikipedia before giving up on a request.
    """

    def __init__(self, base_url: str, timeout: float = 10.0):
        self.base_url = base_url
        self.timeout = timeout

    def fetch_wiki_content(self, title: str, section: Optional[str] = None) -> str:
        """Fetch a Wikipedia article and return its text, whitespace-normalized.

        Args:
            title: Article title; spaces are converted to underscores and the
                result is percent-encoded before being appended to ``base_url``.
            section: Optional HTML id of a section heading. When given, only
                the elements following that heading are returned.

        Returns:
            At most 8000 characters of article text, or a short error string
            ("Section not found", "Content not found", "Failed to fetch
            article: ...") that is handed back to the model instead of raising.
        """
        from urllib.parse import quote

        # Percent-encode so characters such as '?' or '#' inside a title are
        # not interpreted as the start of a query string or fragment.
        slug = quote(title.replace(' ', '_'), safe='/:')
        url = f"{self.base_url}{slug}"

        # A timeout keeps a stalled connection from hanging the agent loop;
        # failures are reported as text so the model can react to them.
        try:
            response = requests.get(url, timeout=self.timeout)
        except requests.RequestException as exc:
            return f"Failed to fetch article: {exc}"
        if not response.ok:
            return f"Failed to fetch article: HTTP {response.status_code}"

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove unwanted sections
        for unwanted in soup.find_all(['script', 'style', 'footer', 'header']):
            unwanted.decompose()

        if section:
            # Legacy MediaWiki markup puts the id on a <span> inside the
            # heading; current markup puts it on the <h2>/<h3> itself, which
            # is wrapped in a div. In both cases the parent's following
            # siblings hold the section content.
            section_tag = soup.find('span', {'id': section}) or soup.find(id=section)
            if section_tag is None or section_tag.parent is None:
                return "Section not found"
            # NOTE: this takes every following sibling (later sections too);
            # the truncation below bounds the amount returned.
            content = section_tag.parent.find_next_siblings()
            text = ' '.join(tag.get_text() for tag in content)
        else:
            # Get main content
            content = soup.find(id='mw-content-text')
            if not content:
                return "Content not found"
            text = content.get_text()

        # Clean and normalize text
        text = ' '.join(text.split())
        return text[:8000]  # Truncate to avoid token limits

    @staticmethod
    def deliver_answer(fields: List[str]) -> Dict[str, Any]:
        """Print the answer spans chosen by the model and acknowledge delivery.

        Args:
            fields: Text spans the model selected as the answer.

        Returns:
            A dict with a single ``extracted_fields`` confirmation message,
            which is sent back to the model as the tool result.
        """
        print (f"ANSWER FROM THE ASSISTANT: {fields}")
        return {
            "extracted_fields": "Provided fields was delivered to the user successfully."
        }

class ToolRegistry:
    """Maps tool names to callables and publishes their function-call schemas."""

    def __init__(self, wiki_tools: WikiTools):
        self.wiki_tools = wiki_tools

    @property
    def available_functions(self) -> Dict[str, callable]:
        """Return a name -> callable mapping for every tool the model may call."""
        tools = self.wiki_tools
        return dict(
            fetch_wiki_content=tools.fetch_wiki_content,
            deliver_answer=tools.deliver_answer,
        )

    @property
    def tool_schemas(self) -> List[Dict[str, Any]]:
        """Return the tool descriptions passed as ``tools`` to chat completions."""
        # Parameter schema for fetch_wiki_content: title is mandatory,
        # section is left out of "required".
        fetch_parameters = {
            "type": "object",
            "properties": {
                "title": {
                    "type": "string",
                    "description": "The title of the Wikipedia article",
                },
                "section": {
                    "type": "string",
                    "description": "Optional: Specific section ID to fetch",
                    "optional": True,
                },
            },
            "required": ["title"],
        }

        # Parameter schema for deliver_answer: a list of string spans.
        deliver_parameters = {
            "type": "object",
            "properties": {
                "fields": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of text spans from the article that are relevant to the query",
                },
            },
            "required": ["fields"],
        }

        fetch_schema = {
            "type": "function",
            "function": {
                "name": "fetch_wiki_content",
                "description": "Fetch content from a Wikipedia article",
                "parameters": fetch_parameters,
            },
        }
        deliver_schema = {
            "type": "function",
            "function": {
                "name": "deliver_answer",
                "description": "Extract specific information from the fetched text",
                "parameters": deliver_parameters,
            },
        }
        return [fetch_schema, deliver_schema]

class WikiExtractionAgent:
    """Drives the model through fetch/deliver tool calls to answer a query.

    The agent keeps the running conversation in ``self.messages``, asks the
    model for the next step, executes any tool calls it requests and feeds
    the results back, for at most ``config.max_steps`` rounds.
    """

    # Instructions sent as the system message at the start of every query.
    SYSTEM_PROMPT = (
        "1. First fetch any wikipedia pages you might need to answer the user query. "
        "Do not answer from parametric knowledge.\n\n"
        "2.Then, provide the answer to the user using the deliver_answer from the "
        "retrieved wikipedia page.\n\n"
        "3. You may need to issue multiple calls to wikipedia after extracting answers "
        "if there are nested dependencies for information."
    )

    def __init__(self, config: "WikiConfig"):
        """Create the client and tools; resolve the model id if none was set.

        Args:
            config: Connection and behavior settings. When ``config.model`` is
                empty it is filled in with the first model the server lists.
        """
        self.config = config
        self.client = OpenAI(api_key=config.api_key, base_url=config.api_base)
        self.wiki_tools = WikiTools(config.wikipedia_base_url)
        self.tools = ToolRegistry(self.wiki_tools)
        # A chat message needs "role" and "content" keys to be valid.
        self.messages = [{"role": "system", "content": self.SYSTEM_PROMPT}]

        if not config.model:
            models = self.client.models.list()
            self.config.model = models.data[0].id

    def _serialize_tool_call(self, tool_call) -> Dict[str, Any]:
        """Convert a tool call object into a plain dict for the message history."""
        return {
            "id": tool_call.id,
            "type": tool_call.type,
            "function": {
                "name": tool_call.function.name,
                "arguments": tool_call.function.arguments
            }
        }

    def process_tool_calls(self, message) -> List[Dict[str, Any]]:
        """Execute every tool call in ``message`` and record the results.

        Each result is appended to ``self.messages`` as a ``tool`` message so
        the model sees it on the next round. Malformed arguments or an unknown
        tool name are reported back to the model as an ``"Error: ..."`` result
        rather than raising, so one bad call does not abort the run.

        Args:
            message: Assistant message whose ``tool_calls`` should be run.

        Returns:
            One dict per call with keys ``tool``, ``args`` and ``response``.
        """
        results = []

        for tool_call in message.tool_calls:
            function_name = tool_call.function.name
            raw_args = tool_call.function.arguments

            error = None
            try:
                # Some servers send an empty string for "no arguments".
                function_args = json.loads(raw_args) if raw_args else {}
                if not isinstance(function_args, dict):
                    error = "arguments must be a JSON object"
            except json.JSONDecodeError as exc:
                function_args = raw_args
                error = f"invalid JSON arguments: {exc}"

            print(f"\nExecuting: {function_name}")
            print(f"Arguments: {function_args}")

            tool = self.tools.available_functions.get(function_name)
            if error is None and tool is None:
                error = f"unknown tool '{function_name}'"

            if error is None:
                function_response = tool(**function_args)
            else:
                function_response = f"Error: {error}"

            results.append({
                "tool": function_name,
                "args": function_args,
                "response": function_response
            })

            # Plain text goes through untouched: JSON-encoding it would wrap
            # it in quotes and escape every non-ASCII character of the article.
            if isinstance(function_response, str):
                content = function_response
            else:
                content = json.dumps(function_response, ensure_ascii=False)

            self.messages.append({
                "role": "tool",
                "content": content,
                "tool_call_id": tool_call.id,
                "name": function_name
            })

        return results

    def extract_information(self, query: str) -> List[Dict[str, Any]]:
        """Run the tool-calling loop for ``query``.

        The conversation is reset for each call and starts with the system
        prompt followed by the user request. The loop ends when the model
        replies without tool calls or after ``config.max_steps`` rounds.

        Args:
            query: Natural-language question to answer from Wikipedia.

        Returns:
            The results of all executed tool calls, in execution order.
        """
        self.messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"""Extract information from Wikipedia to answer this query: {query}

            You can use these tools:
            1. fetch_wiki_content: Get article content
            2. deliver_answer: deliver relevant information

            Please fetch content first, and iterate as needed to get to the webpage with the correct answer and then deliver the relevant information."""
            },
        ]

        all_results = []

        for step in range(self.config.max_steps):
            print(f"\nStep {step + 1}")
            print("-" * 40)

            response = self.client.chat.completions.create(
                messages=self.messages,
                model=self.config.model,
                tools=self.tools.tool_schemas,
                temperature=0.0,
            )

            message = response.choices[0].message

            if not message.tool_calls:
                print("Extraction Complete")
                break

            self.messages.append({
                "role": "assistant",
                # Content is usually None alongside tool calls; send it as
                # plain text (never JSON-encoded) so it round-trips cleanly.
                "content": message.content or "",
                "tool_calls": [self._serialize_tool_call(tc) for tc in message.tool_calls]
            })

            results = self.process_tool_calls(message)
            all_results.extend(results)

        return all_results

def main():
    """Run the two demonstration queries against a default-configured agent."""
    agent = WikiExtractionAgent(WikiConfig())

    demo_queries = (
        # Multi-step example: the model is expected to fetch Einstein's
        # article, extract his birthplace, then fetch the article for that
        # city and pull the population from it. The EXAMPLE OUTPUT string at
        # the top of this module shows the trace Athene-V2-Agent produces.
        """What is the current population for the city  where Einstein was born?""",
        # Single-step example: one fetch of Einstein's article followed by
        # extraction of the passages about his accomplishments.
        "Why was Einstein famous?",
    )

    for demo_query in demo_queries:
        agent.extract_information(query=demo_query)


if __name__ == "__main__":
    main()