#import os  # Module for interacting with the operating system
import time  # Module for time-related operations
import ujson  # Module for working with JSON data
from random import randint  # Module for generating random numbers
from typing import Dict, List, Any  # Type hinting imports
import requests  # Library for making HTTP requests
from bs4 import BeautifulSoup  # Library for parsing HTML data
from selenium import webdriver  # Library for browser automation
from selenium.common.exceptions import NoSuchElementException  # Exception for missing elements
from webdriver_manager.chrome import ChromeDriverManager  # Driver manager for Chrome (a Chromium-based browser is used)
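
# NOTE (assumption): the calls below follow the Selenium 3.x-style API
# (find_element_by_css_selector, and ChromeDriverManager().install() passed
# positionally to webdriver.Chrome); Selenium 4 replaced these with
# driver.find_element(By.CSS_SELECTOR, ...) and a Service object.
# The "lxml" parser used by BeautifulSoup also requires the lxml package.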

# Delete files if present
# try:
#     os.remove('Authors_URL.txt')
#     os.remove('scraper_results.json')
# except OSError:
#     pass


def write_authors(list1, file_name):
    # Function to write authors' URLs to a file
    with open(file_name, 'w', encoding='utf-8') as f:
        for url in list1:
            f.write(url + '\n')
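
# Example usage (hypothetical URL):
# write_authors(['https://pureportal.coventry.ac.uk/en/persons/some-author'], 'Authors_URL.txt')
# writes one profile URL per line to Authors_URL.txt.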


def initCrawlerScraper(seed, max_profiles=500):
    # Initialize driver for Chrome
    webOpt = webdriver.ChromeOptions()
    webOpt.add_experimental_option('excludeSwitches', ['enable-logging'])
    webOpt.add_argument('--ignore-certificate-errors')
    webOpt.add_argument('--incognito')
    webOpt.headless = True
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=webOpt)
    driver.get(seed)  # Start with the original link

    Links = []  # List of pureportal profile URLs
    pub_data = []  # To store publication information for each pureportal profile
    nextLink = driver.find_element_by_css_selector(".nextLink").is_enabled()  # Check if the next page link is enabled
    print("Crawler has begun...")

    while nextLink:
        page = driver.page_source
        bs = BeautifulSoup(page, "lxml")  # Parse the page source with BeautifulSoup's lxml parser
        # Extract the exact profile URL by splitting each anchor's string into a list
        for link in bs.findAll('a', class_='link person'):
            url = str(link)[str(link).find('https://pureportal.coventry.ac.uk/en/persons/'):].split('"')
            Links.append(url[0])
        # Click on the Next button to visit the next page
        try:
            if driver.find_element_by_css_selector(".nextLink"):
                element = driver.find_element_by_css_selector(".nextLink")
                driver.execute_script("arguments[0].click();", element)
            else:
                nextLink = False
        except NoSuchElementException:
            break
        # Stop once the maximum number of profiles has been reached
        if len(Links) >= max_profiles:
            break
print("Crawler has found ", len(Links), " pureportal profiles") | |
write_authors(Links, 'Authors_URL.txt') # Write the authors' URLs to a file | |
print("Scraping publication data for ", len(Links), " pureportal profiles...") | |
count = 0 | |
for link in Links: | |
# Visit each link to get data | |
        time.sleep(1)  # Wait 1 second before requesting the next profile
        driver.get(link)
        try:
            if driver.find_elements_by_css_selector(".portal_link.btn-primary.btn-large"):
                element = driver.find_elements_by_css_selector(".portal_link.btn-primary.btn-large")
                for a in element:
                    if "research output".lower() in a.text.lower():
                        driver.execute_script("arguments[0].click();", a)
                        driver.get(driver.current_url)
                        # Get name of Author
                        name = driver.find_element_by_css_selector("div[class='header person-details']>h1")
                        r = requests.get(driver.current_url)
                        # Parse all the data via BeautifulSoup
                        soup = BeautifulSoup(r.content, 'lxml')
                        # Extract publication name, publication URL, date and CU authors
                        table = soup.find('ul', attrs={'class': 'list-results'})
                        if table is not None:
                            for row in table.findAll('div', attrs={'class': 'result-container'}):
                                data = {}
                                data['name'] = row.h3.a.text
                                data['pub_url'] = row.h3.a['href']
                                date = row.find("span", class_="date")
                                rowitem = row.find_all(['div'])
                                span = row.find_all(['span'])
                                data['cu_author'] = name.text
                                data['date'] = date.text
                                print("Publication Name :", row.h3.a.text)
                                print("Publication URL :", row.h3.a['href'])
                                print("CU Author :", name.text)
                                print("Date :", date.text)
                                print("\n")
                                pub_data.append(data)
            else:
                # Get name of Author
                name = driver.find_element_by_css_selector("div[class='header person-details']>h1")
                r = requests.get(link)
                # Parse all the data via BeautifulSoup
                soup = BeautifulSoup(r.content, 'lxml')
                # Extract publication name, publication URL, date and CU authors
                table = soup.find('div', attrs={'class': 'relation-list relation-list-publications'})
                if table is not None:
                    for row in table.findAll('div', attrs={'class': 'result-container'}):
                        data = {}
                        data["name"] = row.h3.a.text
                        data['pub_url'] = row.h3.a['href']
                        date = row.find("span", class_="date")
                        rowitem = row.find_all(['div'])
                        span = row.find_all(['span'])
                        data['cu_author'] = name.text
                        data['date'] = date.text
                        print("Publication Name :", row.h3.a.text)
                        print("Publication URL :", row.h3.a['href'])
                        print("CU Author :", name.text)
                        print("Date :", date.text)
                        print("\n")
                        pub_data.append(data)
        except Exception:
            continue
print("Crawler has scrapped data for ", len(pub_data), " pureportal publications") | |
    driver.quit()
    # Write all the scraped results to a file in JSON format
    with open('scraper_results.json', 'w') as f:
        ujson.dump(pub_data, f)


initCrawlerScraper('https://pureportal.coventry.ac.uk/en/organisations/coventry-university/persons/', max_profiles=500)
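
# Minimal sketch for inspecting the results afterwards (assumes the run above
# completed and 'scraper_results.json' was written):
# with open('scraper_results.json', 'r', encoding='utf-8') as f:
#     results = ujson.load(f)
# print(len(results), "publications loaded")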