fdaudens HF staff committed on
Commit
e993dbb
·
verified ·
1 Parent(s): 799b253

Update tools/visit_webpage.py

Browse files
Files changed (1) hide show
  1. tools/visit_webpage.py +21 -9
tools/visit_webpage.py CHANGED
@@ -16,29 +16,41 @@ class VisitWebpageTool(Tool):
16
  import requests
17
  from markdownify import markdownify
18
  from requests.exceptions import RequestException
19
-
20
  from smolagents.utils import truncate_content
21
  except ImportError as e:
22
  raise ImportError(
23
- "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
24
  ) from e
25
  try:
26
- # Add user agent to avoid some blocking
27
  headers = {
28
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
29
  }
30
 
31
- # Send a GET request to the URL with a 20-second timeout
32
  response = requests.get(url, timeout=20, headers=headers)
33
  response.raise_for_status()
34
 
35
- # Convert the HTML content to Markdown
36
- markdown_content = markdownify(response.text).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- # Remove multiple line breaks
39
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
40
-
41
- return truncate_content(markdown_content, 10000)
 
42
 
43
  except requests.exceptions.Timeout:
44
  return "The request timed out. Please try again later or check the URL."
 
16
  import requests
17
  from markdownify import markdownify
18
  from requests.exceptions import RequestException
19
+ from bs4 import BeautifulSoup
20
  from smolagents.utils import truncate_content
21
  except ImportError as e:
22
  raise ImportError(
23
+ "You must install packages `markdownify`, `requests`, and `beautifulsoup4` to run this tool: run `pip install markdownify requests beautifulsoup4`."
24
  ) from e
25
  try:
 
26
  headers = {
27
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
28
  }
29
 
 
30
  response = requests.get(url, timeout=20, headers=headers)
31
  response.raise_for_status()
32
 
33
+ # Parse HTML and extract main content
34
+ soup = BeautifulSoup(response.text, 'html.parser')
35
+
36
+ # Remove unwanted elements
37
+ for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'header']):
38
+ element.decompose()
39
+
40
+ # Get main content (prioritize article or main tags)
41
+ main_content = soup.find('article') or soup.find('main') or soup.find('body')
42
+
43
+ if main_content:
44
+ # Convert only the main content to markdown
45
+ markdown_content = markdownify(str(main_content)).strip()
46
+ else:
47
+ markdown_content = markdownify(response.text).strip()
48
 
49
+ # Remove multiple line breaks and limit content
50
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
51
+
52
+ # Limit content to ~4000 characters (truncate_content's limit is a character count, not words or tokens)
53
+ return truncate_content(markdown_content, 4000)
54
 
55
  except requests.exceptions.Timeout:
56
  return "The request timed out. Please try again later or check the URL."