import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import sqlite3
from fake_useragent import UserAgent
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from concurrent.futures import ThreadPoolExecutor
import gradio as gr
from threading import Lock

# User agent initialization
ua = UserAgent()
google_bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

# Last access time per domain, used for rate-limiting
domain_access_times = {}

# Visited URLs set, guarded by a lock so it is safe to share across threads
visited = set()
visited_lock = Lock()

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)


# Append a line of data to a text file
def save_to_file(filename, data):
    with open(filename, "a") as file:
        file.write(data + "\n")


# Record a URL and its matched keywords in a SQLite database
def save_to_database(url, keywords_matched):
    conn = sqlite3.connect("crawler.db")
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS results (url TEXT, keywords TEXT)")
    cursor.execute("INSERT INTO results (url, keywords) VALUES (?, ?)",
                   (url, ",".join(keywords_matched)))
    conn.commit()
    conn.close()


# Compute how long to wait before hitting a domain again (at most one request per 5 seconds)
def get_delay(domain):
    now = time.time()
    if domain in domain_access_times:
        elapsed = now - domain_access_times[domain]
        delay = max(0, 5 - elapsed)
    else:
        delay = 0
    domain_access_times[domain] = now
    return delay


# Check whether robots.txt allows the given user agent to fetch the URL
def can_crawl(url, user_agent):
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        return True  # Assume crawlable if robots.txt cannot be fetched
    return rp.can_fetch(user_agent, url)


# Recursively crawl a URL, collecting visited URLs and keyword matches
def crawl(url, keywords, depth, found_urls, keywords_found):
    if depth <= 0:
        return found_urls, keywords_found  # Preserve results accumulated so far

    with visited_lock:
        if url in visited:
            return found_urls, keywords_found
        visited.add(url)

    domain = urlparse(url).netloc
    time.sleep(get_delay(domain))

    if not can_crawl(url, google_bot_ua):
        logging.warning(f"Blocked by robots.txt: {url}")
        return found_urls, keywords_found

    for attempt in range(3):  # Retry up to 3 times
        try:
            # Present as Googlebot about 20% of the time, otherwise a random browser UA
            user_agent = google_bot_ua if random.random() < 0.2 else ua.random
            headers = {"User-Agent": user_agent, "Referer": "https://www.google.com"}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            break
        except requests.exceptions.RequestException as e:
            if attempt == 2:
                logging.error(f"Failed after retries: {url} - {e}")
                return found_urls, keywords_found
            time.sleep(2 ** attempt)  # Exponential backoff

    save_to_file("found.txt", url)
    logging.info(f"Crawled: {url}")
    found_urls += f"{url}\n"

    soup = BeautifulSoup(response.text, "html.parser")

    # Check for keywords in the page text
    text = soup.get_text().lower()
    keywords_matched = [kw for kw in keywords if kw.lower() in text]
    if keywords_matched:
        save_to_file("keywords_found.txt", url)
        save_to_database(url, keywords_matched)
        logging.info(f"Keywords found in {url}: {keywords_matched}")
        keywords_found += f"{url} - Keywords: {', '.join(keywords_matched)}\n"

    # Find and crawl links on the page
    for link in soup.find_all("a", href=True):
        next_url = urljoin(url, link["href"])
        if next_url.startswith("http"):
            found_urls, keywords_found = crawl(next_url, keywords, depth - 1,
                                               found_urls, keywords_found)

    return found_urls, keywords_found


# Gradio interface function
def gradio_crawl(start_url, keywords, depth):
    keywords_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]
    with visited_lock:
        visited.clear()  # Reset shared state so repeated runs re-crawl
    found_urls, keywords_found = crawl(start_url, keywords_list, int(depth), "", "")
    return found_urls, keywords_found
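
# ThreadPoolExecutor is imported above but never used. The sketch below shows one way
# it could drive crawl() over several seed URLs in parallel; this is an optional
# variant, not wired into the Gradio flow, and crawl_many()/max_workers=4 are
# assumptions of this sketch, not part of the original script. Note that visited is
# protected by visited_lock, but domain_access_times is not, so per-domain delays are
# only approximate when running concurrently.
def crawl_many(seed_urls, keywords, depth, max_workers=4):
    found_all, keywords_all = "", ""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(crawl, u, keywords, depth, "", "") for u in seed_urls]
        for future in futures:
            found, kw = future.result()  # Each call returns its accumulated strings
            found_all += found
            keywords_all += kw
    return found_all, keywords_all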

# Gradio UI setup
iface = gr.Interface(
    fn=gradio_crawl,
    inputs=[
        gr.Textbox(label="Starting URL", placeholder="Enter the starting URL"),
        gr.Textbox(label="Keywords (comma-separated)", placeholder="Enter keywords to search for"),
        gr.Slider(label="Crawl Depth", minimum=1, maximum=5, step=1, value=3)
    ],
    outputs=[
        gr.Textbox(label="Crawled URLs", lines=10, placeholder="Found URLs will be shown here...", interactive=False),
        gr.Textbox(label="Keywords Found", lines=10, placeholder="URLs with matching keywords will be shown here...", interactive=False)
    ],
    live=True,
    title="WebCrawlPLUS",
    description="A webcrawler from SynAckNetwork.com"
)

# Launch the Gradio app
iface.launch()
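
# Typical usage (assuming this file is saved as webcrawlplus.py):
#   python webcrawlplus.py
# Gradio then serves the UI locally (port 7860 by default); passing share=True to
# iface.launch() would additionally create a temporary public link.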