# Search-Engine/Scrapper.py
#import os # Module for interacting with the operating system
import time # Module for time-related operations
import ujson # Module for working with JSON data
from random import randint # Module for generating random numbers
from typing import Dict, List, Any # Type hinting imports
import requests # Library for making HTTP requests
from bs4 import BeautifulSoup # Library for parsing HTML data
from selenium import webdriver # Library for browser automation
from selenium.common.exceptions import NoSuchElementException # Exception for missing elements
from webdriver_manager.chrome import ChromeDriverManager # Driver manager for Chrome (a Chromium-based browser is used)
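# Note: the driver calls below use the Selenium 3-style find_element_by_* API and pass
# ChromeDriverManager().install() positionally; these were deprecated and later removed
# in Selenium 4, which expects driver.find_element(By.CSS_SELECTOR, ...) and a Service object.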
# Delete files if present
# try:
#     os.remove('Authors_URL.txt')
#     os.remove('scraper_results.json')
# except OSError:
#     pass
def write_authors(list1, file_name):
    # Function to write authors' URLs to a file, one URL per line
    with open(file_name, 'w', encoding='utf-8') as f:
        for url in list1:
            f.write(url + '\n')
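# Hypothetical usage example (illustration only, the profile URL below is made up):
# write_authors(['https://pureportal.coventry.ac.uk/en/persons/example-person'], 'Authors_URL.txt')
# would create Authors_URL.txt containing that single profile URL on its own line.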
def initCrawlerScraper(seed, max_profiles=500):
    # Initialize a headless Chrome driver
    webOpt = webdriver.ChromeOptions()
    webOpt.add_experimental_option('excludeSwitches', ['enable-logging'])
    webOpt.add_argument('--ignore-certificate-errors')
    webOpt.add_argument('--incognito')
    webOpt.headless = True
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=webOpt)
    driver.get(seed)  # Start with the seed link
    Links = []  # Pureportal profile URLs found by the crawler
    pub_data = []  # Publication information for each pureportal profile
    nextLink = driver.find_element_by_css_selector(".nextLink").is_enabled()  # Check if the next page link is enabled
    print("Crawler has begun...")
    while nextLink:
        page = driver.page_source
        # Parse the page source with BeautifulSoup (lxml parser)
        bs = BeautifulSoup(page, "lxml")
        # Extract each profile URL by splitting the anchor tag's markup on '"'
        for link in bs.findAll('a', class_='link person'):
            url = str(link)[str(link).find('https://pureportal.coventry.ac.uk/en/persons/'):].split('"')
            Links.append(url[0])
        # Click the "Next" button to visit the next page
        try:
            if driver.find_element_by_css_selector(".nextLink"):
                element = driver.find_element_by_css_selector(".nextLink")
                driver.execute_script("arguments[0].click();", element)
            else:
                nextLink = False
        except NoSuchElementException:
            break
        # Stop once the maximum number of profiles has been collected
        if len(Links) >= max_profiles:
            break
print("Crawler has found ", len(Links), " pureportal profiles")
write_authors(Links, 'Authors_URL.txt') # Write the authors' URLs to a file
print("Scraping publication data for ", len(Links), " pureportal profiles...")
count = 0
    for link in Links:
        # Visit each profile link to collect its publication data
        time.sleep(1)  # Delay of 1 second before hitting the next profile
        driver.get(link)
        try:
            if driver.find_elements_by_css_selector(".portal_link.btn-primary.btn-large"):
                element = driver.find_elements_by_css_selector(".portal_link.btn-primary.btn-large")
                for a in element:
                    if "research output" in a.text.lower():
                        driver.execute_script("arguments[0].click();", a)
                        driver.get(driver.current_url)
                        # Get the name of the author
                        name = driver.find_element_by_css_selector("div[class='header person-details']>h1")
                        r = requests.get(driver.current_url)
                        # Parse the research-output page via BeautifulSoup
                        soup = BeautifulSoup(r.content, 'lxml')
                        # Extract publication name, publication URL, date and CU author
                        table = soup.find('ul', attrs={'class': 'list-results'})
                        if table is not None:
                            for row in table.findAll('div', attrs={'class': 'result-container'}):
                                data = {}
                                data['name'] = row.h3.a.text
                                data['pub_url'] = row.h3.a['href']
                                date = row.find("span", class_="date")
                                data['cu_author'] = name.text
                                data['date'] = date.text if date else ''
                                print("Publication Name :", data['name'])
                                print("Publication URL :", data['pub_url'])
                                print("CU Author :", data['cu_author'])
                                print("Date :", data['date'])
                                print("\n")
                                pub_data.append(data)
            else:
                # No separate "Research output" page, so scrape publications from the profile page itself
                # Get the name of the author
                name = driver.find_element_by_css_selector("div[class='header person-details']>h1")
                r = requests.get(link)
                # Parse the profile page via BeautifulSoup
                soup = BeautifulSoup(r.content, 'lxml')
                # Extract publication name, publication URL, date and CU author
                table = soup.find('div', attrs={'class': 'relation-list relation-list-publications'})
                if table is not None:
                    for row in table.findAll('div', attrs={'class': 'result-container'}):
                        data = {}
                        data['name'] = row.h3.a.text
                        data['pub_url'] = row.h3.a['href']
                        date = row.find("span", class_="date")
                        data['cu_author'] = name.text
                        data['date'] = date.text if date else ''
                        print("Publication Name :", data['name'])
                        print("Publication URL :", data['pub_url'])
                        print("CU Author :", data['cu_author'])
                        print("Date :", data['date'])
                        print("\n")
                        pub_data.append(data)
        except Exception:
            # Skip profiles that fail to load or parse
            continue
print("Crawler has scrapped data for ", len(pub_data), " pureportal publications")
driver.quit()
# Writing all the scraped results in a file with JSON format
with open('scraper_results.json', 'w') as f:
ujson.dump(pub_data, f)
initCrawlerScraper('https://pureportal.coventry.ac.uk/en/organisations/coventry-university/persons/', max_profiles=500)
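# A minimal verification sketch (not part of the original workflow; kept commented out so
# the script's behaviour is unchanged). It assumes the run above completed and reloads
# scraper_results.json; the key names match the dictionaries built in initCrawlerScraper.
# with open('scraper_results.json', 'r', encoding='utf-8') as f:
#     results = ujson.load(f)
# print("Loaded", len(results), "publication records")
# if results:
#     print(results[0]['name'], '|', results[0]['date'], '|', results[0]['cu_author'])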