my-news-agent

Running

App Files Community

fdaudens (HF staff) committed 11 days ago

Commit

e993dbb

verified ·

1 Parent(s): 799b253

Update tools/visit_webpage.py

Browse files

Files changed (1) hide show

tools/visit_webpage.py +21 -9

tools/visit_webpage.py CHANGED Viewed

@@ -16,29 +16,41 @@ class VisitWebpageTool(Tool):
             import requests
             from markdownify import markdownify
             from requests.exceptions import RequestException
             from smolagents.utils import truncate_content
         except ImportError as e:
             raise ImportError(
-                "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
             ) from e
         try:
-            # Add user agent to avoid some blocking
             headers = {
                 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
             }
-            # Send a GET request to the URL with a 20-second timeout
             response = requests.get(url, timeout=20, headers=headers)
             response.raise_for_status()
-            # Convert the HTML content to Markdown
-            markdown_content = markdownify(response.text).strip()
-            # Remove multiple line breaks
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
-            return truncate_content(markdown_content, 10000)
         except requests.exceptions.Timeout:
             return "The request timed out. Please try again later or check the URL."

             import requests
             from markdownify import markdownify
             from requests.exceptions import RequestException
+            from bs4 import BeautifulSoup
             from smolagents.utils import truncate_content
         except ImportError as e:
             raise ImportError(
+                "You must install packages `markdownify`, `requests`, and `beautifulsoup4` to run this tool: run `pip install markdownify requests beautifulsoup4`."
             ) from e
         try:
             headers = {
                 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
             }
             response = requests.get(url, timeout=20, headers=headers)
             response.raise_for_status()
+            # Parse HTML and extract main content
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'header']):
+                element.decompose()
+            # Get main content (prioritize article or main tags)
+            main_content = soup.find('article') or soup.find('main') or soup.find('body')
+            if main_content:
+                # Convert only the main content to markdown
+                markdown_content = markdownify(str(main_content)).strip()
+            else:
+                markdown_content = markdownify(response.text).strip()
+            # Remove multiple line breaks and limit content
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
+            # Limit content to ~4000 words (roughly 5000 tokens)
+            return truncate_content(markdown_content, 4000)
         except requests.exceptions.Timeout:
             return "The request timed out. Please try again later or check the URL."