my-news-agent

Running

App Files Files Community

my-news-agent / tools /visit_webpage.py

fdaudens HF staff

Update tools/visit_webpage.py

e993dbb verified 11 days ago

raw

history blame contribute delete

2.72 kB

	from typing import Any, Optional
	from smolagents.tools import Tool
	import requests
	import markdownify
	import smolagents
	import re # Add re import here

	class VisitWebpageTool(Tool):
	    """Tool that downloads a web page and returns its main content as markdown.

	    The page is fetched with ``requests``, stripped of non-content elements
	    (scripts, styles, navigation, header/footer, iframes) with BeautifulSoup,
	    converted to markdown with ``markdownify`` and finally shortened with
	    ``smolagents.utils.truncate_content``.
	    """

	    name = "visit_webpage"
	    description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
	    inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
	    output_type = "string"

	    def forward(self, url: str) -> str:
	        """Fetch ``url`` and return its main content as a markdown string.

	        Args:
	            url: Address of the page to download.

	        Returns:
	            The truncated markdown rendering of the page, or a human-readable
	            error message if the request or the conversion failed. Network
	            and parsing errors are reported in the return value rather than
	            raised.

	        Raises:
	            ImportError: If one of the optional dependencies is not installed.
	        """
	        # Imported lazily so that a missing optional dependency yields an
	        # actionable installation hint instead of failing at module import.
	        try:
	            import requests
	            from markdownify import markdownify
	            from requests.exceptions import RequestException
	            from bs4 import BeautifulSoup
	            from smolagents.utils import truncate_content
	        except ImportError as e:
	            raise ImportError(
	                "You must install packages `markdownify`, `requests`, and `beautifulsoup4` to run this tool: run `pip install markdownify requests beautifulsoup4`."
	            ) from e

	        try:
	            # A browser-like User-Agent is sent with the request.
	            headers = {
	                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	            }

	            response = requests.get(url, timeout=20, headers=headers)
	            response.raise_for_status()

	            soup = BeautifulSoup(response.text, 'html.parser')

	            # Drop elements that carry no article content.
	            for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'header']):
	                element.decompose()

	            # Prefer the most specific content container. If none exists
	            # (e.g. an HTML fragment without <body>), fall back to the whole
	            # cleaned document so the removed elements stay removed.
	            main_content = (
	                soup.find('article') or soup.find('main') or soup.find('body') or soup
	            )
	            markdown_content = markdownify(str(main_content)).strip()

	            # Collapse runs of three or more newlines into one blank line.
	            markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)

	            # truncate_content's limit is a character count, so this caps
	            # the result at roughly 4000 characters (not words).
	            return truncate_content(markdown_content, 4000)

	        except requests.exceptions.Timeout:
	            return "The request timed out. Please try again later or check the URL."
	        except RequestException as e:
	            return f"Error fetching the webpage: {str(e)}"
	        except Exception as e:
	            # Deliberate catch-all: the tool reports failures as text
	            # instead of raising.
	            return f"An unexpected error occurred: {str(e)}"

	    def __init__(self, *args, **kwargs):
	        """Accept and ignore any constructor arguments.

	        Only sets ``is_initialized`` to ``False``; the base-class
	        initializer is not invoked here.
	        """
	        self.is_initialized = False