Delete scraper.py
scraper.py +0 −116
DELETED
@@ -1,116 +0,0 @@
# scraper.py

import requests
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright


class Scraper:
    @staticmethod
    async def power_scrapper_2(url):
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            # Block images, media, stylesheets, fonts, and XHR to speed up loading
            await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())

            await page.goto(url)

            # Get the title
            title = await page.title()

            # Get all links
            page_url = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a')).map(a => a.href);
            }""")

            # Get page content (text from all elements in the body)
            page_content = await page.evaluate("""() => {
                let elements = Array.from(document.querySelectorAll('body *'));
                return elements.map(element => element.innerText).join('\\n');
            }""")

            await browser.close()
            return title, page_url, page_content

    @staticmethod
    async def power_scrapper(url):
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            # Allow only documents and scripts; block everything else to speed up loading
            await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())

            # Open the target website
            await page.goto(url, wait_until='domcontentloaded')

            # Wait briefly so dynamically injected content has a chance to load
            await page.wait_for_timeout(1000)

            # Extract all links
            links = await page.query_selector_all('a')
            page_url = []
            page_content = []
            for link in links:
                href = await link.get_attribute('href')
                page_url.append(href)

            # Extract all non-empty text content
            elements = await page.query_selector_all('body *')
            for element in elements:
                text_content = await element.text_content()
                if text_content and text_content.strip():
                    page_content.append(text_content.strip())

            await browser.close()
            return page_url, page_content

    @staticmethod
    def get_links(soup):
        links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            links.append(href)
        return links

    @staticmethod
    def get_text_content(soup):
        text_elements = []
        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
            elements = soup.find_all(tag)
            for element in elements:
                text_elements.append(element.get_text())
        return text_elements

    @staticmethod
    def get_title(soup):
        title = soup.find('title').get_text()
        return title

    @staticmethod
    async def scrape(url):
        try:
            # Try a plain HTTP fetch first; fall back to Playwright for JS-heavy pages
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            title = Scraper.get_title(soup)
            links = Scraper.get_links(soup)
            text_content = Scraper.get_text_content(soup)

            if not links:
                print("Running alternative scrapper")
                title, links, text_content = await Scraper.power_scrapper_2(url)

            return {"title": title, "URL": links, "Content": text_content}
        except Exception:
            title, links, text_content = await Scraper.power_scrapper_2(url)
            return {"title": title, "URL": links, "Content": text_content}