AsyncWebCrawler
The AsyncWebCrawler class is the main interface for web crawling operations. It provides asynchronous web crawling capabilities with extensive configuration options.
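A minimal end-to-end sketch (the import path and the markdown field on the returned CrawlResult are assumed here; arun() and its options are documented below):
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Create a crawler, fetch one page, and print the extracted markdown
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)  # assumed CrawlResult field

asyncio.run(main())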
Constructor
AsyncWebCrawler(
    # Browser Settings
    browser_type: str = "chromium",          # Options: "chromium", "firefox", "webkit"
    headless: bool = True,                   # Run browser in headless mode
    verbose: bool = False,                   # Enable verbose logging
    # Cache Settings
    always_by_pass_cache: bool = False,      # Always bypass cache
    base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),  # Base directory for cache
    # Network Settings
    proxy: str = None,                       # Simple proxy URL
    proxy_config: Dict = None,               # Advanced proxy configuration
    # Browser Behavior
    sleep_on_close: bool = False,            # Wait before closing browser
    # Custom Settings
    user_agent: str = None,                  # Custom user agent
    headers: Dict[str, str] = {},            # Custom HTTP headers
    js_code: Union[str, List[str]] = None,   # Default JavaScript to execute
)
Parameters in Detail
Browser Settings
browser_type (str, optional)
- Default: "chromium"
- Options: "chromium", "firefox", "webkit"
- Controls which browser engine to use
# Example: Using Firefox
crawler = AsyncWebCrawler(browser_type="firefox")
headless (bool, optional)
- Default: True
- When True, browser runs without GUI
- Set to False for debugging
# Visible browser for debugging
crawler = AsyncWebCrawler(headless=False)
verbose (bool, optional)
- Default: False
- Enables detailed logging
# Enable detailed logging
crawler = AsyncWebCrawler(verbose=True)
Cache Settings
always_by_pass_cache (bool, optional)
- Default: False
- When True, always fetches fresh content
# Always fetch fresh content
crawler = AsyncWebCrawler(always_by_pass_cache=True)
base_directory (str, optional)
- Default: User's home directory
- Base path for cache storage
# Custom cache directory
crawler = AsyncWebCrawler(base_directory="/path/to/cache")
Network Settings
proxy (str, optional)
- Simple proxy URL
# Using simple proxy
crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080")
proxy_config (Dict, optional)
- Advanced proxy configuration with authentication
# Advanced proxy with auth
crawler = AsyncWebCrawler(proxy_config={
    "server": "http://proxy.example.com:8080",
    "username": "user",
    "password": "pass"
})
Browser Behavior
sleep_on_close (bool, optional)
- Default: False
- Adds delay before closing browser
# Wait before closing
crawler = AsyncWebCrawler(sleep_on_close=True)
Custom Settings
user_agent (str, optional)
- Custom user agent string
# Custom user agent
crawler = AsyncWebCrawler(
    user_agent="Mozilla/5.0 (Custom Agent) Chrome/90.0"
)
headers (Dict[str, str], optional)
- Custom HTTP headers
# Custom headers
crawler = AsyncWebCrawler(
    headers={
        "Accept-Language": "en-US",
        "Custom-Header": "Value"
    }
)
js_code (Union[str, List[str]], optional)
- Default JavaScript to execute on each page
# Default JavaScript
crawler = AsyncWebCrawler(
    js_code=[
        "window.scrollTo(0, document.body.scrollHeight);",
        "document.querySelector('.load-more').click();"
    ]
)
Methods
arun()
The primary method for crawling web pages.
async def arun(
    # Required
    url: str,                                   # URL to crawl
    # Content Selection
    css_selector: str = None,                   # CSS selector for content
    word_count_threshold: int = 10,             # Minimum words per block
    # Cache Control
    bypass_cache: bool = False,                 # Bypass cache for this request
    # Session Management
    session_id: str = None,                     # Session identifier
    # Screenshot Options
    screenshot: bool = False,                   # Take screenshot
    screenshot_wait_for: float = None,          # Wait before screenshot
    # Content Processing
    process_iframes: bool = False,              # Process iframe content
    remove_overlay_elements: bool = False,      # Remove popups/modals
    # Anti-Bot Settings
    simulate_user: bool = False,                # Simulate human behavior
    override_navigator: bool = False,           # Override navigator properties
    magic: bool = False,                        # Enable all anti-detection
    # Content Filtering
    excluded_tags: List[str] = None,            # HTML tags to exclude
    exclude_external_links: bool = False,       # Remove external links
    exclude_social_media_links: bool = False,   # Remove social media links
    # JavaScript Handling
    js_code: Union[str, List[str]] = None,      # JavaScript to execute
    wait_for: str = None,                       # Wait condition
    # Page Loading
    page_timeout: int = 60000,                  # Page load timeout (ms)
    delay_before_return_html: float = None,     # Wait before return
    # Extraction
    extraction_strategy: ExtractionStrategy = None  # Extraction strategy
) -> CrawlResult:
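The extraction_strategy parameter accepts a structured-extraction strategy object. Below is a hedged sketch using a CSS-based JSON strategy; the JsonCssExtractionStrategy import path, the schema keys, and the extracted_content result field are assumptions and may differ between Crawl4AI versions:
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy  # assumed import path

# Hypothetical schema: pull title/link pairs out of article cards
schema = {
    "name": "Articles",
    "baseSelector": "article.card",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"},
    ],
}

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com/blog",
        extraction_strategy=JsonCssExtractionStrategy(schema),
    )
    print(result.extracted_content)  # JSON string produced by the strategy (assumed field)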
Usage Examples
Basic Crawling
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
Advanced Crawling
async with AsyncWebCrawler(
browser_type="firefox",
verbose=True,
headers={"Custom-Header": "Value"}
) as crawler:
result = await crawler.arun(
url="https://example.com",
css_selector=".main-content",
word_count_threshold=20,
process_iframes=True,
magic=True,
wait_for="css:.dynamic-content",
screenshot=True
)
Session Management
async with AsyncWebCrawler() as crawler:
    # First request
    result1 = await crawler.arun(
        url="https://example.com/login",
        session_id="my_session"
    )
    # Subsequent request using same session
    result2 = await crawler.arun(
        url="https://example.com/protected",
        session_id="my_session"
    )
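If a named session is no longer needed, it can be released explicitly. The kill_session call on the underlying crawler strategy shown below is an assumption about Crawl4AI internals; verify it exists in your version before relying on it:
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com/dashboard",
        session_id="my_session"
    )
    # Release the browser session when done (assumed API; may differ by version)
    await crawler.crawler_strategy.kill_session("my_session")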
Context Manager
AsyncWebCrawler implements the async context manager protocol:
async def __aenter__(self) -> 'AsyncWebCrawler':
    # Initialize browser and resources
    return self

async def __aexit__(self, *args):
    # Cleanup resources
    pass
Always use AsyncWebCrawler with async context manager:
async with AsyncWebCrawler() as crawler:
    # Your crawling code here
    pass
Best Practices
- Resource Management
# Always use context manager
async with AsyncWebCrawler() as crawler:
    # Crawler will be properly cleaned up
    pass
- Error Handling
try:
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        if not result.success:
            print(f"Crawl failed: {result.error_message}")
except Exception as e:
    print(f"Error: {str(e)}")
- Performance Optimization
# Enable caching for better performance
crawler = AsyncWebCrawler(
    always_by_pass_cache=False,
    verbose=True
)
- Anti-Detection
# Maximum stealth
crawler = AsyncWebCrawler(
    headless=True,
    user_agent="Mozilla/5.0...",
    headers={"Accept-Language": "en-US"}
)
result = await crawler.arun(
url="https://example.com",
magic=True,
simulate_user=True
)
Note on Browser Types
Each browser type has its own characteristics:
- chromium: Best overall compatibility
- firefox: Uses the Gecko engine; useful when a site renders or behaves differently under Chromium
- webkit: Lighter weight, good for basic crawling
Choose based on your specific needs:
# High compatibility
crawler = AsyncWebCrawler(browser_type="chromium")
# Memory efficient
crawler = AsyncWebCrawler(browser_type="webkit")