Spaces:

re-mind
/

Crawl4AI

Running

File size: 35,896 Bytes

03c0888

import os, sys
import time
import warnings
from enum import Enum
from colorama import init, Fore, Back, Style
from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
from .config import (
    MIN_WORD_THRESHOLD, 
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    URL_LOG_SHORTEN_LENGTH
)
from .utils import (
    sanitize_input_encode,
    InvalidCSSSelectorError,
    format_html,
    fast_format_html,
    create_box_message
)

from urllib.parse import urlparse
import random
from .__version__ import __version__ as crawl4ai_version


class AsyncWebCrawler:
    """
    Asynchronous web crawler with flexible caching capabilities.
    
    There are two ways to use the crawler:

    1. Using context manager (recommended for simple cases):
        ```python
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
        ```

    2. Using explicit lifecycle management (recommended for long-running applications):
        ```python
        crawler = AsyncWebCrawler()
        await crawler.start()
        
        # Use the crawler multiple times
        result1 = await crawler.arun(url="https://example.com")
        result2 = await crawler.arun(url="https://another.com")
        
        await crawler.close()
        ```
    
    Migration Guide:
    Old way (deprecated):
        crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)
    
    New way (recommended):
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        crawler = AsyncWebCrawler(config=browser_config)
    
    
    Attributes:
        browser_config (BrowserConfig): Configuration object for browser settings.
        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
        logger (AsyncLogger): Logger instance for recording events and errors.
        always_bypass_cache (bool): Whether to always bypass cache.
        crawl4ai_folder (str): Directory for storing cache.
        base_directory (str): Base directory for storing cache.
        ready (bool): Whether the crawler is ready for use.
        
        Methods:
            start(): Start the crawler explicitly without using context manager.
            close(): Close the crawler explicitly without using context manager.
            arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
            awarmup(): Perform warmup sequence.
            arun_many(): Run the crawler for multiple sources.
            aprocess_html(): Process HTML content.
    
    Typical Usage:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
            print(result.markdown)
            
        Using configuration:
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS                
            )
            result = await crawler.arun(url="https://example.com", config=crawler_config)
            print(result.markdown)
    """
    _domain_last_hit = {}

    def __init__(
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        config: Optional[BrowserConfig] = None,
        always_bypass_cache: bool = False,
        always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
        thread_safe: bool = False,
        **kwargs,
    ):
        """
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. If None, will be created from kwargs
            always_bypass_cache: Whether to always bypass cache (new parameter)
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            base_directory: Base directory for storing cache
            thread_safe: Whether to use thread-safe operations
            **kwargs: Additional arguments for backwards compatibility
        """  
        # Handle browser configuration
        browser_config = config
        if browser_config is not None:
            if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]):
                self.logger.warning(
                    message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
                    tag="WARNING"
                )
        else:
            # Create browser config from kwargs for backwards compatibility
            browser_config = BrowserConfig.from_kwargs(kwargs)

        self.browser_config = browser_config
        
        # Initialize logger first since other components may need it
        self.logger = AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
            verbose=self.browser_config.verbose,    
            tag_width=10
        )

        
        # Initialize crawler strategy
        params = {
            k:v for k, v in kwargs.items() if k in ['browser_congig', 'logger']
        }
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            browser_config=browser_config,
            logger=self.logger,
            **params  # Pass remaining kwargs for backwards compatibility
        )
        
        # If craweler strategy doesnt have logger, use crawler logger
        if not self.crawler_strategy.logger:
            self.crawler_strategy.logger = self.logger
        
        # Handle deprecated cache parameter
        if always_by_pass_cache is not None:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'always_bypass_cache' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2
                )
            self.always_bypass_cache = always_by_pass_cache
        else:
            self.always_bypass_cache = always_bypass_cache

        # Thread safety setup
        self._lock = asyncio.Lock() if thread_safe else None
        
        # Initialize directories
        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
        
        self.ready = False

    async def start(self):
        """
        Start the crawler explicitly without using context manager.
        This is equivalent to using 'async with' but gives more control over the lifecycle.
        
        This method will:
        1. Initialize the browser and context
        2. Perform warmup sequence
        3. Return the crawler instance for method chaining
        
        Returns:
            AsyncWebCrawler: The initialized crawler instance
        """
        await self.crawler_strategy.__aenter__()
        await self.awarmup()
        return self

    async def close(self):
        """
        Close the crawler explicitly without using context manager.
        This should be called when you're done with the crawler if you used start().
        
        This method will:
        1. Clean up browser resources
        2. Close any open pages and contexts
        """
        await self.crawler_strategy.__aexit__(None, None, None)

    async def __aenter__(self):
        return await self.start()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()
    
    async def awarmup(self):
        """
        Initialize the crawler with warm-up sequence.
        
        This method:
        1. Logs initialization info
        2. Sets up browser configuration
        3. Marks the crawler as ready
        """
        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
        self.ready = True

    @asynccontextmanager
    async def nullcontext(self):
        """异步空上下文管理器"""
        yield

    async def arun(
            self,
            url: str,
            config: Optional[CrawlerRunConfig] = None,
            # Legacy parameters maintained for backwards compatibility
            word_count_threshold=MIN_WORD_THRESHOLD,
            extraction_strategy: ExtractionStrategy = None,
            chunking_strategy: ChunkingStrategy = RegexChunking(),
            content_filter: RelevantContentFilter = None,
            cache_mode: Optional[CacheMode] = None,
            # Deprecated cache parameters
            bypass_cache: bool = False,
            disable_cache: bool = False,
            no_cache_read: bool = False,
            no_cache_write: bool = False,
            # Other legacy parameters
            css_selector: str = None,
            screenshot: bool = False,
            pdf: bool = False,
            user_agent: str = None,
            verbose=True,
            **kwargs,
        ) -> CrawlResult:
            """
            Runs the crawler for a single source: URL (web, local file, or raw HTML).

            Migration Guide:
            Old way (deprecated):
                result = await crawler.arun(
                    url="https://example.com",
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                result = await crawler.arun(url="https://example.com", crawler_config=config)

            Args:
                url: The URL to crawl (http://, https://, file://, or raw:)
                crawler_config: Configuration object controlling crawl behavior
                [other parameters maintained for backwards compatibility]

            Returns:
                CrawlResult: The result of crawling and processing
            """
            crawler_config = config
            if not isinstance(url, str) or not url:
                raise ValueError("Invalid URL, make sure the URL is a non-empty string")
            
            async with self._lock or self.nullcontext():
                try:
                    # Handle configuration
                    if crawler_config is not None:
                        # if any(param is not None for param in [
                        #     word_count_threshold, extraction_strategy, chunking_strategy,
                        #     content_filter, cache_mode, css_selector, screenshot, pdf
                        # ]):
                        #     self.logger.warning(
                        #         message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
                        #         tag="WARNING"
                        #     )
                        config = crawler_config
                    else:
                        # Merge all parameters into a single kwargs dict for config creation
                        config_kwargs = {
                            "word_count_threshold": word_count_threshold,
                            "extraction_strategy": extraction_strategy,
                            "chunking_strategy": chunking_strategy,
                            "content_filter": content_filter,
                            "cache_mode": cache_mode,
                            "bypass_cache": bypass_cache,
                            "disable_cache": disable_cache,
                            "no_cache_read": no_cache_read,
                            "no_cache_write": no_cache_write,
                            "css_selector": css_selector,
                            "screenshot": screenshot,
                            "pdf": pdf,
                            "verbose": verbose,
                            **kwargs
                        }
                        config = CrawlerRunConfig.from_kwargs(config_kwargs)

                    # Handle deprecated cache parameters
                    if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
                        if kwargs.get("warning", True):
                            warnings.warn(
                                "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
                                "Use 'cache_mode' parameter instead.",
                                DeprecationWarning,
                                stacklevel=2
                            )
                        
                        # Convert legacy parameters if cache_mode not provided
                        if config.cache_mode is None:
                            config.cache_mode = _legacy_to_cache_mode(
                                disable_cache=disable_cache,
                                bypass_cache=bypass_cache,
                                no_cache_read=no_cache_read,
                                no_cache_write=no_cache_write
                            )
                    
                    # Default to ENABLED if no cache mode specified
                    if config.cache_mode is None:
                        config.cache_mode = CacheMode.ENABLED

                    # Create cache context
                    cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)

                    # Initialize processing variables
                    async_response: AsyncCrawlResponse = None
                    cached_result: CrawlResult = None
                    screenshot_data = None
                    pdf_data = None
                    extracted_content = None
                    start_time = time.perf_counter()

                    # Try to get cached result if appropriate
                    if cache_context.should_read():
                        cached_result = await async_db_manager.aget_cached_url(url)

                    if cached_result:
                        html = sanitize_input_encode(cached_result.html)
                        extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                        extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
                        # If screenshot is requested but its not in cache, then set cache_result to None
                        screenshot_data = cached_result.screenshot
                        pdf_data = cached_result.pdf
                        if config.screenshot and not screenshot or config.pdf and not pdf:
                            cached_result = None

                        self.logger.url_status(
                            url=cache_context.display_url,
                            success=bool(html),
                            timing=time.perf_counter() - start_time,
                            tag="FETCH"
                        )

                    # Fetch fresh content if needed
                    if not cached_result or not html:
                        t1 = time.perf_counter()
                        
                        if user_agent:
                            self.crawler_strategy.update_user_agent(user_agent)
                        
                        # Pass config to crawl method
                        async_response = await self.crawler_strategy.crawl(
                            url,
                            config=config  # Pass the entire config object
                        )
                        
                        html = sanitize_input_encode(async_response.html)
                        screenshot_data = async_response.screenshot
                        pdf_data = async_response.pdf_data
                        
                        t2 = time.perf_counter()
                        self.logger.url_status(
                            url=cache_context.display_url,
                            success=bool(html),
                            timing=t2 - t1,
                            tag="FETCH"
                        )

                        # Process the HTML content
                        crawl_result = await self.aprocess_html(
                            url=url,
                            html=html,
                            extracted_content=extracted_content,
                            config=config,  # Pass the config object instead of individual parameters
                            screenshot=screenshot_data,
                            pdf_data=pdf_data,
                            verbose=config.verbose,
                            is_raw_html = True if url.startswith("raw:") else False,
                            **kwargs
                        )

                        crawl_result.status_code = async_response.status_code
                        crawl_result.response_headers = async_response.response_headers
                        crawl_result.downloaded_files = async_response.downloaded_files
                        crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate

                        # # Check and set values from async_response to crawl_result
                        # try:
                        #     for key in vars(async_response):
                        #         if hasattr(crawl_result, key):
                        #             value = getattr(async_response, key, None)
                        #             current_value = getattr(crawl_result, key, None)
                        #             if value is not None and not current_value:
                        #                 try:
                        #                     setattr(crawl_result, key, value)
                        #                 except Exception as e:
                        #                     self.logger.warning(
                        #                         message=f"Failed to set attribute {key}: {str(e)}",
                        #                         tag="WARNING"
                        #                     )
                        # except Exception as e:
                        #     self.logger.warning(
                        #         message=f"Error copying response attributes: {str(e)}",
                        #         tag="WARNING"
                        #     )

                        crawl_result.success = bool(html)
                        crawl_result.session_id = getattr(config, 'session_id', None)

                        self.logger.success(
                            message="{url:.50}... | Status: {status} | Total: {timing}",
                            tag="COMPLETE",
                            params={
                                "url": cache_context.display_url,
                                "status": crawl_result.success,
                                "timing": f"{time.perf_counter() - start_time:.2f}s"
                            },
                            colors={
                                "status": Fore.GREEN if crawl_result.success else Fore.RED,
                                "timing": Fore.YELLOW
                            }
                        )

                        # Update cache if appropriate
                        if cache_context.should_write() and not bool(cached_result):
                            await async_db_manager.acache_url(crawl_result)

                        return crawl_result

                    else:
                        self.logger.success(
                            message="{url:.50}... | Status: {status} | Total: {timing}",
                            tag="COMPLETE",
                            params={
                                "url": cache_context.display_url,
                                "status": True,
                                "timing": f"{time.perf_counter() - start_time:.2f}s"
                            },
                            colors={
                                "status": Fore.GREEN,
                                "timing": Fore.YELLOW
                            }
                        )

                        cached_result.success = bool(html)
                        cached_result.session_id = getattr(config, 'session_id', None)
                        return cached_result

                except Exception as e:
                    error_context = get_error_context(sys.exc_info())
                
                    error_message = (
                        f"Unexpected error in _crawl_web at line {error_context['line_no']} "
                        f"in {error_context['function']} ({error_context['filename']}):\n"
                        f"Error: {str(e)}\n\n"
                        f"Code context:\n{error_context['code_context']}"
                    )
                    # if not hasattr(e, "msg"):
                    #     e.msg = str(e)
                    
                    self.logger.error_status(
                        url=url,
                        error=create_box_message(error_message, type="error"),
                        tag="ERROR"
                    )
                    
                    return CrawlResult(
                        url=url,
                        html="",
                        success=False,
                        error_message=error_message
                    )

    async def aprocess_html(
            self,
            url: str,
            html: str,
            extracted_content: str,
            config: CrawlerRunConfig,
            screenshot: str,
            pdf_data: str,
            verbose: bool,
            **kwargs,
        ) -> CrawlResult:
            """
            Process HTML content using the provided configuration.
            
            Args:
                url: The URL being processed
                html: Raw HTML content
                extracted_content: Previously extracted content (if any)
                config: Configuration object controlling processing behavior
                screenshot: Screenshot data (if any)
                pdf_data: PDF data (if any)
                verbose: Whether to enable verbose logging
                **kwargs: Additional parameters for backwards compatibility
            
            Returns:
                CrawlResult: Processed result containing extracted and formatted content
            """
            try:
                _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
                t1 = time.perf_counter()

                # Initialize scraping strategy
                scrapping_strategy = WebScrapingStrategy(logger=self.logger)

                # Process HTML content
                params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
                # add keys from kwargs to params that doesn't exist in params
                params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
                
                result = scrapping_strategy.scrap(
                    url,
                    html,
                    **params,
                    # word_count_threshold=config.word_count_threshold,
                    # css_selector=config.css_selector,
                    # only_text=config.only_text,
                    # image_description_min_word_threshold=config.image_description_min_word_threshold,
                    # content_filter=config.content_filter,
                    # **kwargs
                )

                if result is None:
                    raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

            except InvalidCSSSelectorError as e:
                raise ValueError(str(e))
            except Exception as e:
                raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")

       

            # Extract results
            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
            fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
            fit_html = sanitize_input_encode(result.get("fit_html", ""))
            media = result.get("media", [])
            links = result.get("links", [])
            metadata = result.get("metadata", {})

            # Markdown Generation
            markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
            
            # Uncomment if by default we want to use PruningContentFilter
            # if not config.content_filter and not markdown_generator.content_filter:
            #     markdown_generator.content_filter = PruningContentFilter()
            
            markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
                cleaned_html=cleaned_html,
                base_url=url,
                # html2text_options=kwargs.get('html2text', {})
            )
            markdown_v2 = markdown_result
            markdown = sanitize_input_encode(markdown_result.raw_markdown)

            # Log processing completion
            self.logger.info(
                message="Processed {url:.50}... | Time: {timing}ms",
                tag="SCRAPE",
                params={
                    "url": _url,
                    "timing": int((time.perf_counter() - t1) * 1000)
                }
            )

            # Handle content extraction if needed
            if (extracted_content is None and 
                config.extraction_strategy and 
                config.chunking_strategy and 
                not isinstance(config.extraction_strategy, NoExtractionStrategy)):
                
                t1 = time.perf_counter()
                
                # Choose content based on input_format
                content_format = config.extraction_strategy.input_format
                if content_format == "fit_markdown" and not markdown_result.fit_markdown:
                    self.logger.warning(
                        message="Fit markdown requested but not available. Falling back to raw markdown.",
                        tag="EXTRACT",
                        params={"url": _url}
                    )
                    content_format = "markdown"

                content = {
                    "markdown": markdown,
                    "html": html,
                    "fit_markdown": markdown_result.raw_markdown
                }.get(content_format, markdown)
                
                # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
                chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
                sections = chunking.chunk(content)
                extracted_content = config.extraction_strategy.run(url, sections)
                extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

                # Log extraction completion
                self.logger.info(
                    message="Completed for {url:.50}... | Time: {timing}s",
                    tag="EXTRACT",
                    params={
                        "url": _url,
                        "timing": time.perf_counter() - t1
                    }
                )

            # Handle screenshot and PDF data
            screenshot_data = None if not screenshot else screenshot
            pdf_data = None if not pdf_data else pdf_data

            # Apply HTML formatting if requested
            if config.prettiify:
                cleaned_html = fast_format_html(cleaned_html)

            # Return complete crawl result
            return CrawlResult(
                url=url,
                html=html,
                cleaned_html=cleaned_html,
                markdown_v2=markdown_v2,
                markdown=markdown,
                fit_markdown=fit_markdown,
                fit_html=fit_html,
                media=media,
                links=links,
                metadata=metadata,
                screenshot=screenshot_data,
                pdf=pdf_data,
                extracted_content=extracted_content,
                success=True,
                error_message="",
            )    

    async def arun_many(
            self,
            urls: List[str],
            config: Optional[CrawlerRunConfig] = None,
            # Legacy parameters maintained for backwards compatibility
            word_count_threshold=MIN_WORD_THRESHOLD,
            extraction_strategy: ExtractionStrategy = None,
            chunking_strategy: ChunkingStrategy = RegexChunking(),
            content_filter: RelevantContentFilter = None,
            cache_mode: Optional[CacheMode] = None,
            bypass_cache: bool = False,
            css_selector: str = None,
            screenshot: bool = False,
            pdf: bool = False,
            user_agent: str = None,
            verbose=True,
            **kwargs,
        ) -> List[CrawlResult]:
            """
            Runs the crawler for multiple URLs concurrently.

            Migration Guide:
            Old way (deprecated):
                results = await crawler.arun_many(
                    urls,
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
            
            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                results = await crawler.arun_many(urls, crawler_config=config)

            Args:
                urls: List of URLs to crawl
                crawler_config: Configuration object controlling crawl behavior for all URLs
                [other parameters maintained for backwards compatibility]
            
            Returns:
                List[CrawlResult]: Results for each URL
            """
            crawler_config = config
            # Handle configuration
            if crawler_config is not None:
                if any(param is not None for param in [
                    word_count_threshold, extraction_strategy, chunking_strategy,
                    content_filter, cache_mode, css_selector, screenshot, pdf
                ]):
                    self.logger.warning(
                        message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
                        tag="WARNING"
                    )
                config = crawler_config
            else:
                # Merge all parameters into a single kwargs dict for config creation
                config_kwargs = {
                    "word_count_threshold": word_count_threshold,
                    "extraction_strategy": extraction_strategy,
                    "chunking_strategy": chunking_strategy,
                    "content_filter": content_filter,
                    "cache_mode": cache_mode,
                    "bypass_cache": bypass_cache,
                    "css_selector": css_selector,
                    "screenshot": screenshot,
                    "pdf": pdf,
                    "verbose": verbose,
                    **kwargs
                }
                config = CrawlerRunConfig.from_kwargs(config_kwargs)

            if bypass_cache:
                if kwargs.get("warning", True):
                    warnings.warn(
                        "'bypass_cache' is deprecated and will be removed in version 0.5.0. "
                        "Use 'cache_mode=CacheMode.BYPASS' instead. "
                        "Pass warning=False to suppress this warning.",
                        DeprecationWarning,
                        stacklevel=2
                    )
                if config.cache_mode is None:
                    config.cache_mode = CacheMode.BYPASS

            semaphore_count = config.semaphore_count or 5
            semaphore = asyncio.Semaphore(semaphore_count)

            async def crawl_with_semaphore(url):
                # Handle rate limiting per domain
                domain = urlparse(url).netloc
                current_time = time.time()
                
                self.logger.debug(
                    message="Started task for {url:.50}...",
                    tag="PARALLEL",
                    params={"url": url}
                )

                # Get delay settings from config
                mean_delay = config.mean_delay
                max_range = config.max_range
                
                # Apply rate limiting
                if domain in self._domain_last_hit:
                    time_since_last = current_time - self._domain_last_hit[domain]
                    if time_since_last < mean_delay:
                        delay = mean_delay + random.uniform(0, max_range)
                        await asyncio.sleep(delay)
                
                self._domain_last_hit[domain] = current_time

                async with semaphore:
                    return await self.arun(
                        url,
                        crawler_config=config,  # Pass the entire config object
                        user_agent=user_agent  # Maintain user_agent override capability
                    )

            # Log start of concurrent crawling
            self.logger.info(
                message="Starting concurrent crawling for {count} URLs...",
                tag="INIT",
                params={"count": len(urls)}
            )

            # Execute concurrent crawls
            start_time = time.perf_counter()
            tasks = [crawl_with_semaphore(url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)
            end_time = time.perf_counter()

            # Log completion
            self.logger.success(
                message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
                tag="COMPLETE",
                params={
                    "count": len(urls),
                    "timing": f"{end_time - start_time:.2f}s"
                },
                colors={
                    "timing": Fore.YELLOW
                }
            )

            return [result if not isinstance(result, Exception) else str(result) for result in results]

    async def aclear_cache(self):
        """Clear the cache database."""
        await async_db_manager.cleanup()

    async def aflush_cache(self):
        """Flush the cache database."""
        await async_db_manager.aflush_db()

    async def aget_cache_size(self):
        """Get the total number of cached items."""
        return await async_db_manager.aget_total_count()