amaye15 committed
Commit 3cbf3d0 · Parent: 87d2ee3
Files changed (3):
  1. Dockerfile +2 -2
  2. README.md +1 -2
  3. main.py +70 -24
Dockerfile CHANGED
@@ -274,7 +274,7 @@ RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
     fi
 
 # Expose port
-EXPOSE 8000 11235 9222 8080
+EXPOSE 8000 11235 9222 8080 7860
 
 # Start the FastAPI server
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -6,9 +6,8 @@ colorTo: pink
 sdk: docker
 pinned: false
 license: mit
-port: 11235
 ---
-
+<!-- port: 11235 -->
 
 
 # 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.
main.py CHANGED
@@ -4,7 +4,7 @@ from fastapi.responses import JSONResponse
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import HTMLResponse, JSONResponse
 from fastapi.staticfiles import StaticFiles
-from fastapi.middleware.cors import CORSMiddleware
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.templating import Jinja2Templates
 from fastapi.exceptions import RequestValidationError
 from starlette.middleware.base import BaseHTTPMiddleware
@@ -38,30 +38,36 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 class TaskStatus(str, Enum):
     PENDING = "pending"
     PROCESSING = "processing"
     COMPLETED = "completed"
     FAILED = "failed"
 
+
 class CrawlerType(str, Enum):
     BASIC = "basic"
     LLM = "llm"
     COSINE = "cosine"
     JSON_CSS = "json_css"
 
+
 class ExtractionConfig(BaseModel):
     type: CrawlerType
     params: Dict[str, Any] = {}
 
+
 class ChunkingStrategy(BaseModel):
     type: str
     params: Dict[str, Any] = {}
 
+
 class ContentFilter(BaseModel):
     type: str = "bm25"
     params: Dict[str, Any] = {}
 
+
 class CrawlRequest(BaseModel):
     urls: Union[HttpUrl, List[HttpUrl]]
     word_count_threshold: int = MIN_WORD_THRESHOLD
@@ -77,9 +83,10 @@ class CrawlRequest(BaseModel):
     session_id: Optional[str] = None
     cache_mode: Optional[CacheMode] = CacheMode.ENABLED
     priority: int = Field(default=5, ge=1, le=10)
-    ttl: Optional[int] = 3600
+    ttl: Optional[int] = 3600
     crawler_params: Dict[str, Any] = {}
 
+
 @dataclass
 class TaskInfo:
     id: str
@@ -89,6 +96,7 @@ class TaskInfo:
     created_at: float = time.time()
     ttl: int = 3600
 
+
 class ResourceMonitor:
     def __init__(self, max_concurrent_tasks: int = 10):
         self.max_concurrent_tasks = max_concurrent_tasks
@@ -106,7 +114,9 @@ class ResourceMonitor:
         mem_usage = psutil.virtual_memory().percent / 100
         cpu_usage = psutil.cpu_percent() / 100
 
-        memory_factor = max(0, (self.memory_threshold - mem_usage) / self.memory_threshold)
+        memory_factor = max(
+            0, (self.memory_threshold - mem_usage) / self.memory_threshold
+        )
         cpu_factor = max(0, (self.cpu_threshold - cpu_usage) / self.cpu_threshold)
 
         self._last_available_slots = math.floor(
@@ -116,6 +126,7 @@ class ResourceMonitor:
 
         return self._last_available_slots
 
+
 class TaskManager:
     def __init__(self, cleanup_interval: int = 300):
         self.tasks: Dict[str, TaskInfo] = {}
@@ -149,12 +160,16 @@ class TaskManager:
         except asyncio.TimeoutError:
             try:
                 # Then try low priority
-                _, task_id = await asyncio.wait_for(self.low_priority.get(), timeout=0.1)
+                _, task_id = await asyncio.wait_for(
+                    self.low_priority.get(), timeout=0.1
+                )
                 return task_id
             except asyncio.TimeoutError:
                 return None
 
-    def update_task(self, task_id: str, status: TaskStatus, result: Any = None, error: str = None):
+    def update_task(
+        self, task_id: str, status: TaskStatus, result: Any = None, error: str = None
+    ):
         if task_id in self.tasks:
             task_info = self.tasks[task_id]
             task_info.status = status
@@ -180,6 +195,7 @@ class TaskManager:
             except Exception as e:
                 logger.error(f"Error in cleanup loop: {e}")
 
+
 class CrawlerPool:
     def __init__(self, max_size: int = 10):
         self.max_size = max_size
@@ -222,6 +238,7 @@ class CrawlerPool:
             await crawler.__aexit__(None, None, None)
         self.active_crawlers.clear()
 
+
 class CrawlerService:
     def __init__(self, max_concurrent_tasks: int = 10):
         self.resource_monitor = ResourceMonitor(max_concurrent_tasks)
@@ -258,10 +275,10 @@ class CrawlerService:
     async def submit_task(self, request: CrawlRequest) -> str:
         task_id = str(uuid.uuid4())
         await self.task_manager.add_task(task_id, request.priority, request.ttl or 3600)
-
+
         # Store request data with task
         self.task_manager.tasks[task_id].request = request
-
+
         return task_id
 
     async def _process_queue(self):
@@ -286,9 +303,11 @@ class CrawlerService:
 
                 try:
                     crawler = await self.crawler_pool.acquire(**request.crawler_params)
-
-                    extraction_strategy = self._create_extraction_strategy(request.extraction_config)
-
+
+                    extraction_strategy = self._create_extraction_strategy(
+                        request.extraction_config
+                    )
+
                     if isinstance(request.urls, list):
                         results = await crawler.arun_many(
                             urls=[str(url) for url in request.urls],
@@ -318,16 +337,21 @@ class CrawlerService:
                         )
 
                     await self.crawler_pool.release(crawler)
-                    self.task_manager.update_task(task_id, TaskStatus.COMPLETED, results)
+                    self.task_manager.update_task(
+                        task_id, TaskStatus.COMPLETED, results
+                    )
 
                 except Exception as e:
                     logger.error(f"Error processing task {task_id}: {str(e)}")
-                    self.task_manager.update_task(task_id, TaskStatus.FAILED, error=str(e))
+                    self.task_manager.update_task(
+                        task_id, TaskStatus.FAILED, error=str(e)
+                    )
 
             except Exception as e:
                 logger.error(f"Error in queue processing: {str(e)}")
                 await asyncio.sleep(1)
 
+
 app = FastAPI(title="Crawl4AI API")
 
 # CORS configuration
@@ -344,6 +368,7 @@ app.add_middleware(
 security = HTTPBearer()
 CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")
 
+
 async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
     if not CRAWL4AI_API_TOKEN:
         return credentials  # No token verification if CRAWL4AI_API_TOKEN is not set
@@ -351,10 +376,12 @@ async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
         raise HTTPException(status_code=401, detail="Invalid token")
     return credentials
 
+
 def secure_endpoint():
     """Returns security dependency only if CRAWL4AI_API_TOKEN is set"""
     return Depends(verify_token) if CRAWL4AI_API_TOKEN else None
 
+
 # Check if site directory exists
 if os.path.exists(__location__ + "/site"):
     # Mount the site directory as a static directory
@@ -364,14 +391,17 @@ site_templates = Jinja2Templates(directory=__location__ + "/site")
 
 crawler_service = CrawlerService()
 
+
 @app.on_event("startup")
 async def startup_event():
     await crawler_service.start()
 
+
 @app.on_event("shutdown")
 async def shutdown_event():
     await crawler_service.stop()
 
+
 @app.get("/")
 def read_root():
     if os.path.exists(__location__ + "/site"):
@@ -379,12 +409,16 @@ def read_root():
     # Return a json response
     return {"message": "Crawl4AI API service is running"}
 
+
 @app.post("/crawl", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else [])
 async def crawl(request: CrawlRequest) -> Dict[str, str]:
     task_id = await crawler_service.submit_task(request)
     return {"task_id": task_id}
 
-@app.get("/task/{task_id}", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else [])
+
+@app.get(
+    "/task/{task_id}", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []
+)
 async def get_task_status(task_id: str):
     task_info = crawler_service.task_manager.get_task(task_id)
     if not task_info:
@@ -406,36 +440,45 @@ async def get_task_status(task_id: str):
 
     return response
 
+
 @app.post("/crawl_sync", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else [])
 async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
     task_id = await crawler_service.submit_task(request)
-
+
     # Wait up to 60 seconds for task completion
     for _ in range(60):
         task_info = crawler_service.task_manager.get_task(task_id)
         if not task_info:
             raise HTTPException(status_code=404, detail="Task not found")
-
+
         if task_info.status == TaskStatus.COMPLETED:
             # Return same format as /task/{task_id} endpoint
             if isinstance(task_info.result, list):
-                return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
+                return {
+                    "status": task_info.status,
+                    "results": [result.dict() for result in task_info.result],
+                }
             return {"status": task_info.status, "result": task_info.result.dict()}
-
+
         if task_info.status == TaskStatus.FAILED:
             raise HTTPException(status_code=500, detail=task_info.error)
-
+
         await asyncio.sleep(1)
-
+
     # If we get here, task didn't complete within timeout
     raise HTTPException(status_code=408, detail="Task timed out")
 
-@app.post("/crawl_direct", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else [])
+
+@app.post(
+    "/crawl_direct", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []
+)
 async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
     try:
         crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
-        extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
-
+        extraction_strategy = crawler_service._create_extraction_strategy(
+            request.extraction_config
+        )
+
         try:
             if isinstance(request.urls, list):
                 results = await crawler.arun_many(
@@ -470,7 +513,8 @@ async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
     except Exception as e:
         logger.error(f"Error in direct crawl: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
-
+
+
 @app.get("/health")
 async def health_check():
     available_slots = await crawler_service.resource_monitor.get_available_slots()
@@ -482,6 +526,8 @@ async def health_check():
         "cpu_usage": psutil.cpu_percent(),
     }
 
+
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=11235)
+
+    uvicorn.run(app, host="0.0.0.0", port=11235)
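
The handlers above keep their shapes: POST /crawl returns a task_id, GET /task/{task_id} reports progress, and TaskStatus defines the terminal states. A rough stdlib-only client sketch against a Space serving on port 7860; the base URL, the optional bearer token, and the "status" key in the task response (inferred from the /crawl_sync handler) are assumptions, not part of this commit:

```python
import json
import time
import urllib.request

BASE_URL = "http://localhost:7860"  # assumed: the port set in the Dockerfile CMD above
TOKEN = None  # set this if CRAWL4AI_API_TOKEN is configured on the server


def call_api(path, payload=None):
    """Tiny helper (illustration only, not part of the repo): send and receive JSON."""
    data = json.dumps(payload).encode() if payload is not None else None
    request = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=data,  # POST when a payload is given, GET otherwise
        headers={"Content-Type": "application/json"},
    )
    if TOKEN:
        request.add_header("Authorization", f"Bearer {TOKEN}")
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.load(response)


# POST /crawl returns {"task_id": ...} (see the `crawl` handler in the diff).
task = call_api("/crawl", {"urls": "https://example.com", "priority": 5})

# Poll GET /task/{task_id} until it reaches a terminal TaskStatus value.
while True:
    info = call_api(f"/task/{task['task_id']}")
    if info.get("status") in ("completed", "failed"):
        break
    time.sleep(1)

print(info.get("status"))
```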