import os, sys import time import warnings from enum import Enum from colorama import init, Fore, Back, Style from pathlib import Path from typing import Optional, List, Union import json import asyncio # from contextlib import nullcontext, asynccontextmanager from contextlib import asynccontextmanager from .models import CrawlResult, MarkdownGenerationResult from .async_database import async_db_manager from .chunking_strategy import * from .content_filter_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy from .content_scraping_strategy import WebScrapingStrategy from .async_logger import AsyncLogger from .async_configs import BrowserConfig, CrawlerRunConfig from .config import ( MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, URL_LOG_SHORTEN_LENGTH ) from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, format_html, fast_format_html, create_box_message ) from urllib.parse import urlparse import random from .__version__ import __version__ as crawl4ai_version class AsyncWebCrawler: """ Asynchronous web crawler with flexible caching capabilities. There are two ways to use the crawler: 1. Using context manager (recommended for simple cases): ```python async with AsyncWebCrawler() as crawler: result = await crawler.arun(url="https://example.com") ``` 2. Using explicit lifecycle management (recommended for long-running applications): ```python crawler = AsyncWebCrawler() await crawler.start() # Use the crawler multiple times result1 = await crawler.arun(url="https://example.com") result2 = await crawler.arun(url="https://another.com") await crawler.close() ``` Migration Guide: Old way (deprecated): crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True) New way (recommended): browser_config = BrowserConfig(browser_type="chromium", headless=True) crawler = AsyncWebCrawler(config=browser_config) Attributes: browser_config (BrowserConfig): Configuration object for browser settings. crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages. logger (AsyncLogger): Logger instance for recording events and errors. always_bypass_cache (bool): Whether to always bypass cache. crawl4ai_folder (str): Directory for storing cache. base_directory (str): Base directory for storing cache. ready (bool): Whether the crawler is ready for use. Methods: start(): Start the crawler explicitly without using context manager. close(): Close the crawler explicitly without using context manager. arun(): Run the crawler for a single source: URL (web, local file, or raw HTML). awarmup(): Perform warmup sequence. arun_many(): Run the crawler for multiple sources. aprocess_html(): Process HTML content. Typical Usage: async with AsyncWebCrawler() as crawler: result = await crawler.arun(url="https://example.com") print(result.markdown) Using configuration: browser_config = BrowserConfig(browser_type="chromium", headless=True) async with AsyncWebCrawler(config=browser_config) as crawler: crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS ) result = await crawler.arun(url="https://example.com", config=crawler_config) print(result.markdown) """ _domain_last_hit = {} def __init__( self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, config: Optional[BrowserConfig] = None, always_bypass_cache: bool = False, always_by_pass_cache: Optional[bool] = None, # Deprecated parameter base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), thread_safe: bool = False, **kwargs, ): """ Initialize the AsyncWebCrawler. Args: crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy config: Configuration object for browser settings. If None, will be created from kwargs always_bypass_cache: Whether to always bypass cache (new parameter) always_by_pass_cache: Deprecated, use always_bypass_cache instead base_directory: Base directory for storing cache thread_safe: Whether to use thread-safe operations **kwargs: Additional arguments for backwards compatibility """ # Handle browser configuration browser_config = config if browser_config is not None: if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]): self.logger.warning( message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.", tag="WARNING" ) else: # Create browser config from kwargs for backwards compatibility browser_config = BrowserConfig.from_kwargs(kwargs) self.browser_config = browser_config # Initialize logger first since other components may need it self.logger = AsyncLogger( log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), verbose=self.browser_config.verbose, tag_width=10 ) # Initialize crawler strategy params = { k:v for k, v in kwargs.items() if k in ['browser_congig', 'logger'] } self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( browser_config=browser_config, logger=self.logger, **params # Pass remaining kwargs for backwards compatibility ) # If craweler strategy doesnt have logger, use crawler logger if not self.crawler_strategy.logger: self.crawler_strategy.logger = self.logger # Handle deprecated cache parameter if always_by_pass_cache is not None: if kwargs.get("warning", True): warnings.warn( "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. " "Use 'always_bypass_cache' instead. " "Pass warning=False to suppress this warning.", DeprecationWarning, stacklevel=2 ) self.always_bypass_cache = always_by_pass_cache else: self.always_bypass_cache = always_bypass_cache # Thread safety setup self._lock = asyncio.Lock() if thread_safe else None # Initialize directories self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) self.ready = False async def start(self): """ Start the crawler explicitly without using context manager. This is equivalent to using 'async with' but gives more control over the lifecycle. This method will: 1. Initialize the browser and context 2. Perform warmup sequence 3. Return the crawler instance for method chaining Returns: AsyncWebCrawler: The initialized crawler instance """ await self.crawler_strategy.__aenter__() await self.awarmup() return self async def close(self): """ Close the crawler explicitly without using context manager. This should be called when you're done with the crawler if you used start(). This method will: 1. Clean up browser resources 2. Close any open pages and contexts """ await self.crawler_strategy.__aexit__(None, None, None) async def __aenter__(self): return await self.start() async def __aexit__(self, exc_type, exc_val, exc_tb): await self.close() async def awarmup(self): """ Initialize the crawler with warm-up sequence. This method: 1. Logs initialization info 2. Sets up browser configuration 3. Marks the crawler as ready """ self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") self.ready = True @asynccontextmanager async def nullcontext(self): """异步空上下文管理器""" yield async def arun( self, url: str, config: Optional[CrawlerRunConfig] = None, # Legacy parameters maintained for backwards compatibility word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), content_filter: RelevantContentFilter = None, cache_mode: Optional[CacheMode] = None, # Deprecated cache parameters bypass_cache: bool = False, disable_cache: bool = False, no_cache_read: bool = False, no_cache_write: bool = False, # Other legacy parameters css_selector: str = None, screenshot: bool = False, pdf: bool = False, user_agent: str = None, verbose=True, **kwargs, ) -> CrawlResult: """ Runs the crawler for a single source: URL (web, local file, or raw HTML). Migration Guide: Old way (deprecated): result = await crawler.arun( url="https://example.com", word_count_threshold=200, screenshot=True, ... ) New way (recommended): config = CrawlerRunConfig( word_count_threshold=200, screenshot=True, ... ) result = await crawler.arun(url="https://example.com", crawler_config=config) Args: url: The URL to crawl (http://, https://, file://, or raw:) crawler_config: Configuration object controlling crawl behavior [other parameters maintained for backwards compatibility] Returns: CrawlResult: The result of crawling and processing """ crawler_config = config if not isinstance(url, str) or not url: raise ValueError("Invalid URL, make sure the URL is a non-empty string") async with self._lock or self.nullcontext(): try: # Handle configuration if crawler_config is not None: # if any(param is not None for param in [ # word_count_threshold, extraction_strategy, chunking_strategy, # content_filter, cache_mode, css_selector, screenshot, pdf # ]): # self.logger.warning( # message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", # tag="WARNING" # ) config = crawler_config else: # Merge all parameters into a single kwargs dict for config creation config_kwargs = { "word_count_threshold": word_count_threshold, "extraction_strategy": extraction_strategy, "chunking_strategy": chunking_strategy, "content_filter": content_filter, "cache_mode": cache_mode, "bypass_cache": bypass_cache, "disable_cache": disable_cache, "no_cache_read": no_cache_read, "no_cache_write": no_cache_write, "css_selector": css_selector, "screenshot": screenshot, "pdf": pdf, "verbose": verbose, **kwargs } config = CrawlerRunConfig.from_kwargs(config_kwargs) # Handle deprecated cache parameters if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): if kwargs.get("warning", True): warnings.warn( "Cache control boolean flags are deprecated and will be removed in version 0.5.0. " "Use 'cache_mode' parameter instead.", DeprecationWarning, stacklevel=2 ) # Convert legacy parameters if cache_mode not provided if config.cache_mode is None: config.cache_mode = _legacy_to_cache_mode( disable_cache=disable_cache, bypass_cache=bypass_cache, no_cache_read=no_cache_read, no_cache_write=no_cache_write ) # Default to ENABLED if no cache mode specified if config.cache_mode is None: config.cache_mode = CacheMode.ENABLED # Create cache context cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache) # Initialize processing variables async_response: AsyncCrawlResponse = None cached_result: CrawlResult = None screenshot_data = None pdf_data = None extracted_content = None start_time = time.perf_counter() # Try to get cached result if appropriate if cache_context.should_read(): cached_result = await async_db_manager.aget_cached_url(url) if cached_result: html = sanitize_input_encode(cached_result.html) extracted_content = sanitize_input_encode(cached_result.extracted_content or "") extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content # If screenshot is requested but its not in cache, then set cache_result to None screenshot_data = cached_result.screenshot pdf_data = cached_result.pdf if config.screenshot and not screenshot or config.pdf and not pdf: cached_result = None self.logger.url_status( url=cache_context.display_url, success=bool(html), timing=time.perf_counter() - start_time, tag="FETCH" ) # Fetch fresh content if needed if not cached_result or not html: t1 = time.perf_counter() if user_agent: self.crawler_strategy.update_user_agent(user_agent) # Pass config to crawl method async_response = await self.crawler_strategy.crawl( url, config=config # Pass the entire config object ) html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot pdf_data = async_response.pdf_data t2 = time.perf_counter() self.logger.url_status( url=cache_context.display_url, success=bool(html), timing=t2 - t1, tag="FETCH" ) # Process the HTML content crawl_result = await self.aprocess_html( url=url, html=html, extracted_content=extracted_content, config=config, # Pass the config object instead of individual parameters screenshot=screenshot_data, pdf_data=pdf_data, verbose=config.verbose, is_raw_html = True if url.startswith("raw:") else False, **kwargs ) crawl_result.status_code = async_response.status_code crawl_result.response_headers = async_response.response_headers crawl_result.downloaded_files = async_response.downloaded_files crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate # # Check and set values from async_response to crawl_result # try: # for key in vars(async_response): # if hasattr(crawl_result, key): # value = getattr(async_response, key, None) # current_value = getattr(crawl_result, key, None) # if value is not None and not current_value: # try: # setattr(crawl_result, key, value) # except Exception as e: # self.logger.warning( # message=f"Failed to set attribute {key}: {str(e)}", # tag="WARNING" # ) # except Exception as e: # self.logger.warning( # message=f"Error copying response attributes: {str(e)}", # tag="WARNING" # ) crawl_result.success = bool(html) crawl_result.session_id = getattr(config, 'session_id', None) self.logger.success( message="{url:.50}... | Status: {status} | Total: {timing}", tag="COMPLETE", params={ "url": cache_context.display_url, "status": crawl_result.success, "timing": f"{time.perf_counter() - start_time:.2f}s" }, colors={ "status": Fore.GREEN if crawl_result.success else Fore.RED, "timing": Fore.YELLOW } ) # Update cache if appropriate if cache_context.should_write() and not bool(cached_result): await async_db_manager.acache_url(crawl_result) return crawl_result else: self.logger.success( message="{url:.50}... | Status: {status} | Total: {timing}", tag="COMPLETE", params={ "url": cache_context.display_url, "status": True, "timing": f"{time.perf_counter() - start_time:.2f}s" }, colors={ "status": Fore.GREEN, "timing": Fore.YELLOW } ) cached_result.success = bool(html) cached_result.session_id = getattr(config, 'session_id', None) return cached_result except Exception as e: error_context = get_error_context(sys.exc_info()) error_message = ( f"Unexpected error in _crawl_web at line {error_context['line_no']} " f"in {error_context['function']} ({error_context['filename']}):\n" f"Error: {str(e)}\n\n" f"Code context:\n{error_context['code_context']}" ) # if not hasattr(e, "msg"): # e.msg = str(e) self.logger.error_status( url=url, error=create_box_message(error_message, type="error"), tag="ERROR" ) return CrawlResult( url=url, html="", success=False, error_message=error_message ) async def aprocess_html( self, url: str, html: str, extracted_content: str, config: CrawlerRunConfig, screenshot: str, pdf_data: str, verbose: bool, **kwargs, ) -> CrawlResult: """ Process HTML content using the provided configuration. Args: url: The URL being processed html: Raw HTML content extracted_content: Previously extracted content (if any) config: Configuration object controlling processing behavior screenshot: Screenshot data (if any) pdf_data: PDF data (if any) verbose: Whether to enable verbose logging **kwargs: Additional parameters for backwards compatibility Returns: CrawlResult: Processed result containing extracted and formatted content """ try: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" t1 = time.perf_counter() # Initialize scraping strategy scrapping_strategy = WebScrapingStrategy(logger=self.logger) # Process HTML content params = {k:v for k, v in config.to_dict().items() if k not in ["url"]} # add keys from kwargs to params that doesn't exist in params params.update({k:v for k, v in kwargs.items() if k not in params.keys()}) result = scrapping_strategy.scrap( url, html, **params, # word_count_threshold=config.word_count_threshold, # css_selector=config.css_selector, # only_text=config.only_text, # image_description_min_word_threshold=config.image_description_min_word_threshold, # content_filter=config.content_filter, # **kwargs ) if result is None: raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") except InvalidCSSSelectorError as e: raise ValueError(str(e)) except Exception as e: raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") # Extract results cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) fit_html = sanitize_input_encode(result.get("fit_html", "")) media = result.get("media", []) links = result.get("links", []) metadata = result.get("metadata", {}) # Markdown Generation markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator() # Uncomment if by default we want to use PruningContentFilter # if not config.content_filter and not markdown_generator.content_filter: # markdown_generator.content_filter = PruningContentFilter() markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, base_url=url, # html2text_options=kwargs.get('html2text', {}) ) markdown_v2 = markdown_result markdown = sanitize_input_encode(markdown_result.raw_markdown) # Log processing completion self.logger.info( message="Processed {url:.50}... | Time: {timing}ms", tag="SCRAPE", params={ "url": _url, "timing": int((time.perf_counter() - t1) * 1000) } ) # Handle content extraction if needed if (extracted_content is None and config.extraction_strategy and config.chunking_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy)): t1 = time.perf_counter() # Choose content based on input_format content_format = config.extraction_strategy.input_format if content_format == "fit_markdown" and not markdown_result.fit_markdown: self.logger.warning( message="Fit markdown requested but not available. Falling back to raw markdown.", tag="EXTRACT", params={"url": _url} ) content_format = "markdown" content = { "markdown": markdown, "html": html, "fit_markdown": markdown_result.raw_markdown }.get(content_format, markdown) # Use IdentityChunking for HTML input, otherwise use provided chunking strategy chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy sections = chunking.chunk(content) extracted_content = config.extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) # Log extraction completion self.logger.info( message="Completed for {url:.50}... | Time: {timing}s", tag="EXTRACT", params={ "url": _url, "timing": time.perf_counter() - t1 } ) # Handle screenshot and PDF data screenshot_data = None if not screenshot else screenshot pdf_data = None if not pdf_data else pdf_data # Apply HTML formatting if requested if config.prettiify: cleaned_html = fast_format_html(cleaned_html) # Return complete crawl result return CrawlResult( url=url, html=html, cleaned_html=cleaned_html, markdown_v2=markdown_v2, markdown=markdown, fit_markdown=fit_markdown, fit_html=fit_html, media=media, links=links, metadata=metadata, screenshot=screenshot_data, pdf=pdf_data, extracted_content=extracted_content, success=True, error_message="", ) async def arun_many( self, urls: List[str], config: Optional[CrawlerRunConfig] = None, # Legacy parameters maintained for backwards compatibility word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), content_filter: RelevantContentFilter = None, cache_mode: Optional[CacheMode] = None, bypass_cache: bool = False, css_selector: str = None, screenshot: bool = False, pdf: bool = False, user_agent: str = None, verbose=True, **kwargs, ) -> List[CrawlResult]: """ Runs the crawler for multiple URLs concurrently. Migration Guide: Old way (deprecated): results = await crawler.arun_many( urls, word_count_threshold=200, screenshot=True, ... ) New way (recommended): config = CrawlerRunConfig( word_count_threshold=200, screenshot=True, ... ) results = await crawler.arun_many(urls, crawler_config=config) Args: urls: List of URLs to crawl crawler_config: Configuration object controlling crawl behavior for all URLs [other parameters maintained for backwards compatibility] Returns: List[CrawlResult]: Results for each URL """ crawler_config = config # Handle configuration if crawler_config is not None: if any(param is not None for param in [ word_count_threshold, extraction_strategy, chunking_strategy, content_filter, cache_mode, css_selector, screenshot, pdf ]): self.logger.warning( message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", tag="WARNING" ) config = crawler_config else: # Merge all parameters into a single kwargs dict for config creation config_kwargs = { "word_count_threshold": word_count_threshold, "extraction_strategy": extraction_strategy, "chunking_strategy": chunking_strategy, "content_filter": content_filter, "cache_mode": cache_mode, "bypass_cache": bypass_cache, "css_selector": css_selector, "screenshot": screenshot, "pdf": pdf, "verbose": verbose, **kwargs } config = CrawlerRunConfig.from_kwargs(config_kwargs) if bypass_cache: if kwargs.get("warning", True): warnings.warn( "'bypass_cache' is deprecated and will be removed in version 0.5.0. " "Use 'cache_mode=CacheMode.BYPASS' instead. " "Pass warning=False to suppress this warning.", DeprecationWarning, stacklevel=2 ) if config.cache_mode is None: config.cache_mode = CacheMode.BYPASS semaphore_count = config.semaphore_count or 5 semaphore = asyncio.Semaphore(semaphore_count) async def crawl_with_semaphore(url): # Handle rate limiting per domain domain = urlparse(url).netloc current_time = time.time() self.logger.debug( message="Started task for {url:.50}...", tag="PARALLEL", params={"url": url} ) # Get delay settings from config mean_delay = config.mean_delay max_range = config.max_range # Apply rate limiting if domain in self._domain_last_hit: time_since_last = current_time - self._domain_last_hit[domain] if time_since_last < mean_delay: delay = mean_delay + random.uniform(0, max_range) await asyncio.sleep(delay) self._domain_last_hit[domain] = current_time async with semaphore: return await self.arun( url, crawler_config=config, # Pass the entire config object user_agent=user_agent # Maintain user_agent override capability ) # Log start of concurrent crawling self.logger.info( message="Starting concurrent crawling for {count} URLs...", tag="INIT", params={"count": len(urls)} ) # Execute concurrent crawls start_time = time.perf_counter() tasks = [crawl_with_semaphore(url) for url in urls] results = await asyncio.gather(*tasks, return_exceptions=True) end_time = time.perf_counter() # Log completion self.logger.success( message="Concurrent crawling completed for {count} URLs | Total time: {timing}", tag="COMPLETE", params={ "count": len(urls), "timing": f"{end_time - start_time:.2f}s" }, colors={ "timing": Fore.YELLOW } ) return [result if not isinstance(result, Exception) else str(result) for result in results] async def aclear_cache(self): """Clear the cache database.""" await async_db_manager.cleanup() async def aflush_cache(self): """Flush the cache database.""" await async_db_manager.aflush_db() async def aget_cache_size(self): """Get the total number of cached items.""" return await async_db_manager.aget_total_count()