File size: 35,896 Bytes
03c0888
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
import os, sys
import time
import warnings
from enum import Enum
from colorama import init, Fore, Back, Style
from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
from .config import (
    MIN_WORD_THRESHOLD, 
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    URL_LOG_SHORTEN_LENGTH
)
from .utils import (
    sanitize_input_encode,
    InvalidCSSSelectorError,
    format_html,
    fast_format_html,
    create_box_message
)

from urllib.parse import urlparse
import random
from .__version__ import __version__ as crawl4ai_version


class AsyncWebCrawler:
    """
    Asynchronous web crawler with flexible caching capabilities.
    
    There are two ways to use the crawler:

    1. Using context manager (recommended for simple cases):
        ```python
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
        ```

    2. Using explicit lifecycle management (recommended for long-running applications):
        ```python
        crawler = AsyncWebCrawler()
        await crawler.start()
        
        # Use the crawler multiple times
        result1 = await crawler.arun(url="https://example.com")
        result2 = await crawler.arun(url="https://another.com")
        
        await crawler.close()
        ```
    
    Migration Guide:
    Old way (deprecated):
        crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)
    
    New way (recommended):
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        crawler = AsyncWebCrawler(config=browser_config)
    
    
    Attributes:
        browser_config (BrowserConfig): Configuration object for browser settings.
        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
        logger (AsyncLogger): Logger instance for recording events and errors.
        always_bypass_cache (bool): Whether to always bypass cache.
        crawl4ai_folder (str): Directory for storing cache.
        base_directory (str): Base directory for storing cache.
        ready (bool): Whether the crawler is ready for use.
        
        Methods:
            start(): Start the crawler explicitly without using context manager.
            close(): Close the crawler explicitly without using context manager.
            arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
            awarmup(): Perform warmup sequence.
            arun_many(): Run the crawler for multiple sources.
            aprocess_html(): Process HTML content.
    
    Typical Usage:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
            print(result.markdown)
            
        Using configuration:
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS                
            )
            result = await crawler.arun(url="https://example.com", config=crawler_config)
            print(result.markdown)
    """
    _domain_last_hit = {}

    def __init__(
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        config: Optional[BrowserConfig] = None,
        always_bypass_cache: bool = False,
        always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
        thread_safe: bool = False,
        **kwargs,
    ):
        """
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. If None, will be created from kwargs
            always_bypass_cache: Whether to always bypass cache (new parameter)
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            base_directory: Base directory for storing cache
            thread_safe: Whether to use thread-safe operations
            **kwargs: Additional arguments for backwards compatibility
        """  
        # Handle browser configuration
        browser_config = config
        if browser_config is not None:
            if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]):
                self.logger.warning(
                    message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
                    tag="WARNING"
                )
        else:
            # Create browser config from kwargs for backwards compatibility
            browser_config = BrowserConfig.from_kwargs(kwargs)

        self.browser_config = browser_config
        
        # Initialize logger first since other components may need it
        self.logger = AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
            verbose=self.browser_config.verbose,    
            tag_width=10
        )

        
        # Initialize crawler strategy
        params = {
            k:v for k, v in kwargs.items() if k in ['browser_congig', 'logger']
        }
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            browser_config=browser_config,
            logger=self.logger,
            **params  # Pass remaining kwargs for backwards compatibility
        )
        
        # If craweler strategy doesnt have logger, use crawler logger
        if not self.crawler_strategy.logger:
            self.crawler_strategy.logger = self.logger
        
        # Handle deprecated cache parameter
        if always_by_pass_cache is not None:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'always_bypass_cache' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2
                )
            self.always_bypass_cache = always_by_pass_cache
        else:
            self.always_bypass_cache = always_bypass_cache

        # Thread safety setup
        self._lock = asyncio.Lock() if thread_safe else None
        
        # Initialize directories
        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
        
        self.ready = False

    async def start(self):
        """
        Start the crawler explicitly without using context manager.
        This is equivalent to using 'async with' but gives more control over the lifecycle.
        
        This method will:
        1. Initialize the browser and context
        2. Perform warmup sequence
        3. Return the crawler instance for method chaining
        
        Returns:
            AsyncWebCrawler: The initialized crawler instance
        """
        await self.crawler_strategy.__aenter__()
        await self.awarmup()
        return self

    async def close(self):
        """
        Close the crawler explicitly without using context manager.
        This should be called when you're done with the crawler if you used start().
        
        This method will:
        1. Clean up browser resources
        2. Close any open pages and contexts
        """
        await self.crawler_strategy.__aexit__(None, None, None)

    async def __aenter__(self):
        return await self.start()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()
    
    async def awarmup(self):
        """
        Initialize the crawler with warm-up sequence.
        
        This method:
        1. Logs initialization info
        2. Sets up browser configuration
        3. Marks the crawler as ready
        """
        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
        self.ready = True

    @asynccontextmanager
    async def nullcontext(self):
        """异步空上下文管理器"""
        yield

    async def arun(
            self,
            url: str,
            config: Optional[CrawlerRunConfig] = None,
            # Legacy parameters maintained for backwards compatibility
            word_count_threshold=MIN_WORD_THRESHOLD,
            extraction_strategy: ExtractionStrategy = None,
            chunking_strategy: ChunkingStrategy = RegexChunking(),
            content_filter: RelevantContentFilter = None,
            cache_mode: Optional[CacheMode] = None,
            # Deprecated cache parameters
            bypass_cache: bool = False,
            disable_cache: bool = False,
            no_cache_read: bool = False,
            no_cache_write: bool = False,
            # Other legacy parameters
            css_selector: str = None,
            screenshot: bool = False,
            pdf: bool = False,
            user_agent: str = None,
            verbose=True,
            **kwargs,
        ) -> CrawlResult:
            """
            Runs the crawler for a single source: URL (web, local file, or raw HTML).

            Migration Guide:
            Old way (deprecated):
                result = await crawler.arun(
                    url="https://example.com",
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                result = await crawler.arun(url="https://example.com", crawler_config=config)

            Args:
                url: The URL to crawl (http://, https://, file://, or raw:)
                crawler_config: Configuration object controlling crawl behavior
                [other parameters maintained for backwards compatibility]

            Returns:
                CrawlResult: The result of crawling and processing
            """
            crawler_config = config
            if not isinstance(url, str) or not url:
                raise ValueError("Invalid URL, make sure the URL is a non-empty string")
            
            async with self._lock or self.nullcontext():
                try:
                    # Handle configuration
                    if crawler_config is not None:
                        # if any(param is not None for param in [
                        #     word_count_threshold, extraction_strategy, chunking_strategy,
                        #     content_filter, cache_mode, css_selector, screenshot, pdf
                        # ]):
                        #     self.logger.warning(
                        #         message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
                        #         tag="WARNING"
                        #     )
                        config = crawler_config
                    else:
                        # Merge all parameters into a single kwargs dict for config creation
                        config_kwargs = {
                            "word_count_threshold": word_count_threshold,
                            "extraction_strategy": extraction_strategy,
                            "chunking_strategy": chunking_strategy,
                            "content_filter": content_filter,
                            "cache_mode": cache_mode,
                            "bypass_cache": bypass_cache,
                            "disable_cache": disable_cache,
                            "no_cache_read": no_cache_read,
                            "no_cache_write": no_cache_write,
                            "css_selector": css_selector,
                            "screenshot": screenshot,
                            "pdf": pdf,
                            "verbose": verbose,
                            **kwargs
                        }
                        config = CrawlerRunConfig.from_kwargs(config_kwargs)

                    # Handle deprecated cache parameters
                    if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
                        if kwargs.get("warning", True):
                            warnings.warn(
                                "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
                                "Use 'cache_mode' parameter instead.",
                                DeprecationWarning,
                                stacklevel=2
                            )
                        
                        # Convert legacy parameters if cache_mode not provided
                        if config.cache_mode is None:
                            config.cache_mode = _legacy_to_cache_mode(
                                disable_cache=disable_cache,
                                bypass_cache=bypass_cache,
                                no_cache_read=no_cache_read,
                                no_cache_write=no_cache_write
                            )
                    
                    # Default to ENABLED if no cache mode specified
                    if config.cache_mode is None:
                        config.cache_mode = CacheMode.ENABLED

                    # Create cache context
                    cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)

                    # Initialize processing variables
                    async_response: AsyncCrawlResponse = None
                    cached_result: CrawlResult = None
                    screenshot_data = None
                    pdf_data = None
                    extracted_content = None
                    start_time = time.perf_counter()

                    # Try to get cached result if appropriate
                    if cache_context.should_read():
                        cached_result = await async_db_manager.aget_cached_url(url)

                    if cached_result:
                        html = sanitize_input_encode(cached_result.html)
                        extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                        extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
                        # If screenshot is requested but its not in cache, then set cache_result to None
                        screenshot_data = cached_result.screenshot
                        pdf_data = cached_result.pdf
                        if config.screenshot and not screenshot or config.pdf and not pdf:
                            cached_result = None

                        self.logger.url_status(
                            url=cache_context.display_url,
                            success=bool(html),
                            timing=time.perf_counter() - start_time,
                            tag="FETCH"
                        )

                    # Fetch fresh content if needed
                    if not cached_result or not html:
                        t1 = time.perf_counter()
                        
                        if user_agent:
                            self.crawler_strategy.update_user_agent(user_agent)
                        
                        # Pass config to crawl method
                        async_response = await self.crawler_strategy.crawl(
                            url,
                            config=config  # Pass the entire config object
                        )
                        
                        html = sanitize_input_encode(async_response.html)
                        screenshot_data = async_response.screenshot
                        pdf_data = async_response.pdf_data
                        
                        t2 = time.perf_counter()
                        self.logger.url_status(
                            url=cache_context.display_url,
                            success=bool(html),
                            timing=t2 - t1,
                            tag="FETCH"
                        )

                        # Process the HTML content
                        crawl_result = await self.aprocess_html(
                            url=url,
                            html=html,
                            extracted_content=extracted_content,
                            config=config,  # Pass the config object instead of individual parameters
                            screenshot=screenshot_data,
                            pdf_data=pdf_data,
                            verbose=config.verbose,
                            is_raw_html = True if url.startswith("raw:") else False,
                            **kwargs
                        )

                        crawl_result.status_code = async_response.status_code
                        crawl_result.response_headers = async_response.response_headers
                        crawl_result.downloaded_files = async_response.downloaded_files
                        crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate

                        # # Check and set values from async_response to crawl_result
                        # try:
                        #     for key in vars(async_response):
                        #         if hasattr(crawl_result, key):
                        #             value = getattr(async_response, key, None)
                        #             current_value = getattr(crawl_result, key, None)
                        #             if value is not None and not current_value:
                        #                 try:
                        #                     setattr(crawl_result, key, value)
                        #                 except Exception as e:
                        #                     self.logger.warning(
                        #                         message=f"Failed to set attribute {key}: {str(e)}",
                        #                         tag="WARNING"
                        #                     )
                        # except Exception as e:
                        #     self.logger.warning(
                        #         message=f"Error copying response attributes: {str(e)}",
                        #         tag="WARNING"
                        #     )

                        crawl_result.success = bool(html)
                        crawl_result.session_id = getattr(config, 'session_id', None)

                        self.logger.success(
                            message="{url:.50}... | Status: {status} | Total: {timing}",
                            tag="COMPLETE",
                            params={
                                "url": cache_context.display_url,
                                "status": crawl_result.success,
                                "timing": f"{time.perf_counter() - start_time:.2f}s"
                            },
                            colors={
                                "status": Fore.GREEN if crawl_result.success else Fore.RED,
                                "timing": Fore.YELLOW
                            }
                        )

                        # Update cache if appropriate
                        if cache_context.should_write() and not bool(cached_result):
                            await async_db_manager.acache_url(crawl_result)

                        return crawl_result

                    else:
                        self.logger.success(
                            message="{url:.50}... | Status: {status} | Total: {timing}",
                            tag="COMPLETE",
                            params={
                                "url": cache_context.display_url,
                                "status": True,
                                "timing": f"{time.perf_counter() - start_time:.2f}s"
                            },
                            colors={
                                "status": Fore.GREEN,
                                "timing": Fore.YELLOW
                            }
                        )

                        cached_result.success = bool(html)
                        cached_result.session_id = getattr(config, 'session_id', None)
                        return cached_result

                except Exception as e:
                    error_context = get_error_context(sys.exc_info())
                
                    error_message = (
                        f"Unexpected error in _crawl_web at line {error_context['line_no']} "
                        f"in {error_context['function']} ({error_context['filename']}):\n"
                        f"Error: {str(e)}\n\n"
                        f"Code context:\n{error_context['code_context']}"
                    )
                    # if not hasattr(e, "msg"):
                    #     e.msg = str(e)
                    
                    self.logger.error_status(
                        url=url,
                        error=create_box_message(error_message, type="error"),
                        tag="ERROR"
                    )
                    
                    return CrawlResult(
                        url=url,
                        html="",
                        success=False,
                        error_message=error_message
                    )

    async def aprocess_html(
            self,
            url: str,
            html: str,
            extracted_content: str,
            config: CrawlerRunConfig,
            screenshot: str,
            pdf_data: str,
            verbose: bool,
            **kwargs,
        ) -> CrawlResult:
            """
            Process HTML content using the provided configuration.
            
            Args:
                url: The URL being processed
                html: Raw HTML content
                extracted_content: Previously extracted content (if any)
                config: Configuration object controlling processing behavior
                screenshot: Screenshot data (if any)
                pdf_data: PDF data (if any)
                verbose: Whether to enable verbose logging
                **kwargs: Additional parameters for backwards compatibility
            
            Returns:
                CrawlResult: Processed result containing extracted and formatted content
            """
            try:
                _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
                t1 = time.perf_counter()

                # Initialize scraping strategy
                scrapping_strategy = WebScrapingStrategy(logger=self.logger)

                # Process HTML content
                params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
                # add keys from kwargs to params that doesn't exist in params
                params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
                
                result = scrapping_strategy.scrap(
                    url,
                    html,
                    **params,
                    # word_count_threshold=config.word_count_threshold,
                    # css_selector=config.css_selector,
                    # only_text=config.only_text,
                    # image_description_min_word_threshold=config.image_description_min_word_threshold,
                    # content_filter=config.content_filter,
                    # **kwargs
                )

                if result is None:
                    raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

            except InvalidCSSSelectorError as e:
                raise ValueError(str(e))
            except Exception as e:
                raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")

       

            # Extract results
            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
            fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
            fit_html = sanitize_input_encode(result.get("fit_html", ""))
            media = result.get("media", [])
            links = result.get("links", [])
            metadata = result.get("metadata", {})

            # Markdown Generation
            markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
            
            # Uncomment if by default we want to use PruningContentFilter
            # if not config.content_filter and not markdown_generator.content_filter:
            #     markdown_generator.content_filter = PruningContentFilter()
            
            markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
                cleaned_html=cleaned_html,
                base_url=url,
                # html2text_options=kwargs.get('html2text', {})
            )
            markdown_v2 = markdown_result
            markdown = sanitize_input_encode(markdown_result.raw_markdown)

            # Log processing completion
            self.logger.info(
                message="Processed {url:.50}... | Time: {timing}ms",
                tag="SCRAPE",
                params={
                    "url": _url,
                    "timing": int((time.perf_counter() - t1) * 1000)
                }
            )

            # Handle content extraction if needed
            if (extracted_content is None and 
                config.extraction_strategy and 
                config.chunking_strategy and 
                not isinstance(config.extraction_strategy, NoExtractionStrategy)):
                
                t1 = time.perf_counter()
                
                # Choose content based on input_format
                content_format = config.extraction_strategy.input_format
                if content_format == "fit_markdown" and not markdown_result.fit_markdown:
                    self.logger.warning(
                        message="Fit markdown requested but not available. Falling back to raw markdown.",
                        tag="EXTRACT",
                        params={"url": _url}
                    )
                    content_format = "markdown"

                content = {
                    "markdown": markdown,
                    "html": html,
                    "fit_markdown": markdown_result.raw_markdown
                }.get(content_format, markdown)
                
                # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
                chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
                sections = chunking.chunk(content)
                extracted_content = config.extraction_strategy.run(url, sections)
                extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

                # Log extraction completion
                self.logger.info(
                    message="Completed for {url:.50}... | Time: {timing}s",
                    tag="EXTRACT",
                    params={
                        "url": _url,
                        "timing": time.perf_counter() - t1
                    }
                )

            # Handle screenshot and PDF data
            screenshot_data = None if not screenshot else screenshot
            pdf_data = None if not pdf_data else pdf_data

            # Apply HTML formatting if requested
            if config.prettiify:
                cleaned_html = fast_format_html(cleaned_html)

            # Return complete crawl result
            return CrawlResult(
                url=url,
                html=html,
                cleaned_html=cleaned_html,
                markdown_v2=markdown_v2,
                markdown=markdown,
                fit_markdown=fit_markdown,
                fit_html=fit_html,
                media=media,
                links=links,
                metadata=metadata,
                screenshot=screenshot_data,
                pdf=pdf_data,
                extracted_content=extracted_content,
                success=True,
                error_message="",
            )    

    async def arun_many(
            self,
            urls: List[str],
            config: Optional[CrawlerRunConfig] = None,
            # Legacy parameters maintained for backwards compatibility
            word_count_threshold=MIN_WORD_THRESHOLD,
            extraction_strategy: ExtractionStrategy = None,
            chunking_strategy: ChunkingStrategy = RegexChunking(),
            content_filter: RelevantContentFilter = None,
            cache_mode: Optional[CacheMode] = None,
            bypass_cache: bool = False,
            css_selector: str = None,
            screenshot: bool = False,
            pdf: bool = False,
            user_agent: str = None,
            verbose=True,
            **kwargs,
        ) -> List[CrawlResult]:
            """
            Runs the crawler for multiple URLs concurrently.

            Migration Guide:
            Old way (deprecated):
                results = await crawler.arun_many(
                    urls,
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
            
            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                results = await crawler.arun_many(urls, crawler_config=config)

            Args:
                urls: List of URLs to crawl
                crawler_config: Configuration object controlling crawl behavior for all URLs
                [other parameters maintained for backwards compatibility]
            
            Returns:
                List[CrawlResult]: Results for each URL
            """
            crawler_config = config
            # Handle configuration
            if crawler_config is not None:
                if any(param is not None for param in [
                    word_count_threshold, extraction_strategy, chunking_strategy,
                    content_filter, cache_mode, css_selector, screenshot, pdf
                ]):
                    self.logger.warning(
                        message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
                        tag="WARNING"
                    )
                config = crawler_config
            else:
                # Merge all parameters into a single kwargs dict for config creation
                config_kwargs = {
                    "word_count_threshold": word_count_threshold,
                    "extraction_strategy": extraction_strategy,
                    "chunking_strategy": chunking_strategy,
                    "content_filter": content_filter,
                    "cache_mode": cache_mode,
                    "bypass_cache": bypass_cache,
                    "css_selector": css_selector,
                    "screenshot": screenshot,
                    "pdf": pdf,
                    "verbose": verbose,
                    **kwargs
                }
                config = CrawlerRunConfig.from_kwargs(config_kwargs)

            if bypass_cache:
                if kwargs.get("warning", True):
                    warnings.warn(
                        "'bypass_cache' is deprecated and will be removed in version 0.5.0. "
                        "Use 'cache_mode=CacheMode.BYPASS' instead. "
                        "Pass warning=False to suppress this warning.",
                        DeprecationWarning,
                        stacklevel=2
                    )
                if config.cache_mode is None:
                    config.cache_mode = CacheMode.BYPASS

            semaphore_count = config.semaphore_count or 5
            semaphore = asyncio.Semaphore(semaphore_count)

            async def crawl_with_semaphore(url):
                # Handle rate limiting per domain
                domain = urlparse(url).netloc
                current_time = time.time()
                
                self.logger.debug(
                    message="Started task for {url:.50}...",
                    tag="PARALLEL",
                    params={"url": url}
                )

                # Get delay settings from config
                mean_delay = config.mean_delay
                max_range = config.max_range
                
                # Apply rate limiting
                if domain in self._domain_last_hit:
                    time_since_last = current_time - self._domain_last_hit[domain]
                    if time_since_last < mean_delay:
                        delay = mean_delay + random.uniform(0, max_range)
                        await asyncio.sleep(delay)
                
                self._domain_last_hit[domain] = current_time

                async with semaphore:
                    return await self.arun(
                        url,
                        crawler_config=config,  # Pass the entire config object
                        user_agent=user_agent  # Maintain user_agent override capability
                    )

            # Log start of concurrent crawling
            self.logger.info(
                message="Starting concurrent crawling for {count} URLs...",
                tag="INIT",
                params={"count": len(urls)}
            )

            # Execute concurrent crawls
            start_time = time.perf_counter()
            tasks = [crawl_with_semaphore(url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)
            end_time = time.perf_counter()

            # Log completion
            self.logger.success(
                message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
                tag="COMPLETE",
                params={
                    "count": len(urls),
                    "timing": f"{end_time - start_time:.2f}s"
                },
                colors={
                    "timing": Fore.YELLOW
                }
            )

            return [result if not isinstance(result, Exception) else str(result) for result in results]

    async def aclear_cache(self):
        """Clear the cache database."""
        await async_db_manager.cleanup()

    async def aflush_cache(self):
        """Flush the cache database."""
        await async_db_manager.aflush_db()

    async def aget_cache_size(self):
        """Get the total number of cached items."""
        return await async_db_manager.aget_total_count()