# import os  # Module for interacting with the operating system (only needed if the cleanup block below is re-enabled)
import time  # Module for time-related operations
import ujson  # Module for working with JSON data
import requests  # Library for making HTTP requests
from bs4 import BeautifulSoup  # Library for parsing HTML data
from selenium import webdriver  # Library for browser automation
from selenium.webdriver.common.by import By  # Locator strategies (Selenium 4)
from selenium.webdriver.chrome.service import Service  # Chrome driver service wrapper (Selenium 4)
from selenium.common.exceptions import NoSuchElementException  # Exception for missing elements
from webdriver_manager.chrome import ChromeDriverManager  # Driver manager for Chrome/Chromium
# Delete output files from a previous run if present
# try:
#     os.remove('Authors_URL.txt')
#     os.remove('scraper_results.json')
# except OSError:
#     pass
def write_authors(list1, file_name):
    # Function to write authors' URLs to a file, one URL per line
    with open(file_name, 'w', encoding='utf-8') as f:
        for url in list1:
            f.write(url + '\n')
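# Illustrative usage of write_authors (the profile URL below is a made-up example, not real scraped data):
#
# write_authors(['https://pureportal.coventry.ac.uk/en/persons/jane-doe'], 'Authors_URL.txt')
# # Authors_URL.txt then contains one pureportal profile URL per line.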
def initCrawlerScraper(seed, max_profiles=500):
    # Initialize a headless Chrome/Chromium driver
    webOpt = webdriver.ChromeOptions()
    webOpt.add_experimental_option('excludeSwitches', ['enable-logging'])
    webOpt.add_argument('--ignore-certificate-errors')
    webOpt.add_argument('--incognito')
    webOpt.add_argument('--headless=new')  # Run the browser without a visible window
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=webOpt)
    driver.get(seed)  # Start with the seed link

    Links = []     # pureportal profile URLs found by the crawler
    pub_data = []  # Publication information for each pureportal profile
    # Check whether a "next page" link is present before entering the crawl loop
    nextLink = bool(driver.find_elements(By.CSS_SELECTOR, ".nextLink"))
    print("Crawler has begun...")
    while nextLink:
        page = driver.page_source
        # Parse the page source with BeautifulSoup (lxml parser)
        bs = BeautifulSoup(page, "lxml")
        # Extract each profile URL by splitting the anchor tag string
        for link in bs.findAll('a', class_='link person'):
            url = str(link)[str(link).find('https://pureportal.coventry.ac.uk/en/persons/'):].split('"')
            Links.append(url[0])
        # Click on the "Next" button to visit the next page of results
        try:
            element = driver.find_element(By.CSS_SELECTOR, ".nextLink")
            driver.execute_script("arguments[0].click();", element)
        except NoSuchElementException:
            break
        # Stop once the maximum number of profiles has been collected
        if len(Links) >= max_profiles:
            break
print("Crawler has found ", len(Links), " pureportal profiles")
write_authors(Links, 'Authors_URL.txt') # Write the authors' URLs to a file
print("Scraping publication data for ", len(Links), " pureportal profiles...")
count = 0
for link in Links:
# Visit each link to get data
time.sleep(1) # Delay of 1 second to hit next data
driver.get(link)
        try:
            # If the profile has a dedicated "Research Output" page, open it first
            buttons = driver.find_elements(By.CSS_SELECTOR, ".portal_link.btn-primary.btn-large")
            if buttons:
                for a in buttons:
                    if "research output" in a.text.lower():
                        driver.execute_script("arguments[0].click();", a)
                        driver.get(driver.current_url)
                # Get the name of the author
                name = driver.find_element(By.CSS_SELECTOR, "div[class='header person-details']>h1")
                r = requests.get(driver.current_url)
                # Parse the research-output page with BeautifulSoup
                soup = BeautifulSoup(r.content, 'lxml')
                # Extract publication name, publication URL, date and CU author
                table = soup.find('ul', attrs={'class': 'list-results'})
                if table is not None:
                    for row in table.findAll('div', attrs={'class': 'result-container'}):
                        data = {}
                        data['name'] = row.h3.a.text
                        data['pub_url'] = row.h3.a['href']
                        date = row.find("span", class_="date")
                        data['cu_author'] = name.text
                        data['date'] = date.text
                        print("Publication Name :", row.h3.a.text)
                        print("Publication URL :", row.h3.a['href'])
                        print("CU Author :", name.text)
                        print("Date :", date.text)
                        print("\n")
                        pub_data.append(data)
            else:
                # No separate research-output page; scrape publications from the profile page itself
                name = driver.find_element(By.CSS_SELECTOR, "div[class='header person-details']>h1")
                r = requests.get(link)
                # Parse the profile page with BeautifulSoup
                soup = BeautifulSoup(r.content, 'lxml')
                # Extract publication name, publication URL, date and CU author
                table = soup.find('div', attrs={'class': 'relation-list relation-list-publications'})
                if table is not None:
                    for row in table.findAll('div', attrs={'class': 'result-container'}):
                        data = {}
                        data['name'] = row.h3.a.text
                        data['pub_url'] = row.h3.a['href']
                        date = row.find("span", class_="date")
                        data['cu_author'] = name.text
                        data['date'] = date.text
                        print("Publication Name :", row.h3.a.text)
                        print("Publication URL :", row.h3.a['href'])
                        print("CU Author :", name.text)
                        print("Date :", date.text)
                        print("\n")
                        pub_data.append(data)
        except Exception:
            # Skip profiles that fail to load or parse and move on to the next one
            continue
print("Crawler has scrapped data for ", len(pub_data), " pureportal publications")
driver.quit()
# Writing all the scraped results in a file with JSON format
with open('scraper_results.json', 'w') as f:
ujson.dump(pub_data, f)
initCrawlerScraper('https://pureportal.coventry.ac.uk/en/organisations/coventry-university/persons/', max_profiles=500)
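# A minimal sketch (illustrative, not part of the scraper) of how the scraper_results.json
# file produced above could be loaded and inspected after a run; 'publications' is a
# made-up variable name, not something defined elsewhere in this script:
#
# with open('scraper_results.json', 'r') as f:
#     publications = ujson.load(f)
# print(len(publications), "publications loaded")
# for pub in publications[:3]:
#     print(pub['name'], '|', pub['cu_author'], '|', pub['date'])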