Arafath10 committed
Commit 8f2f190 · verified · 1 Parent(s): 089d488

Delete scraper.py

Files changed (1)
  1. scraper.py +0 -116
scraper.py DELETED
@@ -1,116 +0,0 @@
- # scraper.py
-
- import asyncio
- from playwright.async_api import async_playwright
- from bs4 import BeautifulSoup
- import requests
-
-
- class Scraper:
-     @staticmethod
-     async def power_scrapper_2(url):
-         async with async_playwright() as p:
-             browser = await p.chromium.launch(headless=True)
-             page = await browser.new_page()
-
-             # Block images, media, stylesheets, fonts, and XHR to speed up loading
-             await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())
-
-             await page.goto(url)
-
-             # Get the page title
-             title = await page.title()
-
-             # Get the href of every link on the page
-             page_url = await page.evaluate("""() => {
-                 return Array.from(document.querySelectorAll('a')).map(a => a.href);
-             }""")
-
-             # Get the visible text of every element in the body
-             page_content = await page.evaluate("""() => {
-                 let elements = Array.from(document.querySelectorAll('body *'));
-                 return elements.map(element => element.innerText).join('\\n');
-             }""")
-
-             await browser.close()
-             return title, page_url, page_content
-
-     @staticmethod
-     async def power_scrapper(url):
-         async with async_playwright() as p:
-             browser = await p.chromium.launch(headless=True)
-             page = await browser.new_page()
-
-             # Block everything except documents and scripts to speed up loading
-             await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())
-
-             # Open the target website
-             await page.goto(url, wait_until='domcontentloaded')
-
-             # Wait for a short time to ensure dynamic content is loaded
-             await page.wait_for_timeout(1000)
-
-             # Extract all links
-             links = await page.query_selector_all('a')
-             page_url = []
-             page_content = []
-             for link in links:
-                 href = await link.get_attribute('href')
-                 page_url.append(href)
-
-             # Extract all non-empty text content
-             elements = await page.query_selector_all('body *')
-             for element in elements:
-                 text_content = await element.text_content()
-                 if text_content and text_content.strip():
-                     page_content.append(text_content.strip())
-
-             await browser.close()
-             return page_url, page_content
-
-     @staticmethod
-     def get_links(soup):
-         links = []
-         for link in soup.find_all('a'):
-             href = link.get('href')
-             links.append(href)
-         return links
-
-     @staticmethod
-     def get_text_content(soup):
-         text_elements = []
-         for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
-             elements = soup.find_all(tag)
-             for element in elements:
-                 text_elements.append(element.get_text())
-         return text_elements
-
-     @staticmethod
-     def get_title(soup):
-         title = soup.find('title').get_text()
-         return title
-
-     @staticmethod
-     async def scrape(url):
-         try:
-             headers = {'User-Agent': 'Mozilla/5.0'}
-             response = requests.get(url, headers=headers)
-             soup = BeautifulSoup(response.content, 'html.parser')
-
-             title = Scraper.get_title(soup)
-             links = Scraper.get_links(soup)
-             text_content = Scraper.get_text_content(soup)
-
-             # Fall back to the Playwright scraper for pages that need JavaScript
-             if not links:
-                 print("Running alternative scraper")
-                 title, links, text_content = await Scraper.power_scrapper_2(url)
-
-             return {"title": title, "URL": links, "Content": text_content}
-         except Exception:
-             title, links, text_content = await Scraper.power_scrapper_2(url)
-             return {"title": title, "URL": links, "Content": text_content}