Spaces:
Running
Running
Update tools/visit_webpage.py
Browse files- tools/visit_webpage.py +21 -9
tools/visit_webpage.py
CHANGED
@@ -16,29 +16,41 @@ class VisitWebpageTool(Tool):
|
|
16 |
import requests
|
17 |
from markdownify import markdownify
|
18 |
from requests.exceptions import RequestException
|
19 |
-
|
20 |
from smolagents.utils import truncate_content
|
21 |
except ImportError as e:
|
22 |
raise ImportError(
|
23 |
-
"You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
|
24 |
) from e
|
25 |
try:
|
26 |
-
# Add user agent to avoid some blocking
|
27 |
headers = {
|
28 |
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
29 |
}
|
30 |
|
31 |
-
# Send a GET request to the URL with a 20-second timeout
|
32 |
response = requests.get(url, timeout=20, headers=headers)
|
33 |
response.raise_for_status()
|
34 |
|
35 |
-
#
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
-
# Remove multiple line breaks
|
39 |
markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
|
40 |
-
|
41 |
-
|
|
|
42 |
|
43 |
except requests.exceptions.Timeout:
|
44 |
return "The request timed out. Please try again later or check the URL."
|
|
|
16 |
import requests
|
17 |
from markdownify import markdownify
|
18 |
from requests.exceptions import RequestException
|
19 |
+
from bs4 import BeautifulSoup
|
20 |
from smolagents.utils import truncate_content
|
21 |
except ImportError as e:
|
22 |
raise ImportError(
|
23 |
+
"You must install packages `markdownify`, `requests`, and `beautifulsoup4` to run this tool: run `pip install markdownify requests beautifulsoup4`."
|
24 |
) from e
|
25 |
try:
|
|
|
26 |
headers = {
|
27 |
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
28 |
}
|
29 |
|
|
|
30 |
response = requests.get(url, timeout=20, headers=headers)
|
31 |
response.raise_for_status()
|
32 |
|
33 |
+
# Parse HTML and extract main content
|
34 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
35 |
+
|
36 |
+
# Remove unwanted elements
|
37 |
+
for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'header']):
|
38 |
+
element.decompose()
|
39 |
+
|
40 |
+
# Get main content (prioritize article or main tags)
|
41 |
+
main_content = soup.find('article') or soup.find('main') or soup.find('body')
|
42 |
+
|
43 |
+
if main_content:
|
44 |
+
# Convert only the main content to markdown
|
45 |
+
markdown_content = markdownify(str(main_content)).strip()
|
46 |
+
else:
|
47 |
+
markdown_content = markdownify(response.text).strip()
|
48 |
|
49 |
+
# Remove multiple line breaks and limit content
|
50 |
markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
|
51 |
+
|
52 |
+
# Limit content to ~4000 characters (truncate_content's max_length counts characters, not words or tokens)
|
53 |
+
return truncate_content(markdown_content, 4000)
|
54 |
|
55 |
except requests.exceptions.Timeout:
|
56 |
return "The request timed out. Please try again later or check the URL."
|