paper-matching / input_format.py
jskim's picture
update
963bf46
raw
history blame
2.57 kB
import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar
### Input Formatting Module
## Input formatting for the given paper
# Extracting text from a pdf or a link
def get_text_from_pdf(file_path):
    """
    Read a pdf and return its extracted text, one list entry per page.

    The entries follow the order of ``reader.pages`` and each one is
    whatever ``extract_text()`` gives back for that page.
    """
    pdf = PdfReader(file_path)
    return [page.extract_text() for page in pdf.pages]
def get_text_from_url(url, file_path='paper.pdf'):
    """
    Get text of the paper from a url

    Only arXiv links are accepted. An abstract-page link (``/abs/<id>``) is
    rewritten to the matching pdf link; a link whose path already contains
    ``pdf`` is downloaded exactly as given.

    Args:
        url: arXiv abstract-page or pdf link (must include the scheme,
            otherwise no host is parsed and the link is rejected).
        file_path: where the downloaded pdf is saved before it is read.

    Returns:
        Whatever ``get_text_from_pdf`` returns for the downloaded file.

    Raises:
        ValueError: if the url is not an arXiv abstract/pdf link, or if an
            abstract link carries no paper id. Errors raised by the
            download step propagate unchanged.
    """
    ## Check for different URL cases
    url_parts = urlparse(url)
    path = url_parts.path

    # arxiv is the only supported host
    if 'arxiv' not in url_parts.netloc:
        raise ValueError('invalid url')

    if 'abs' in path:
        # abstract page, change the url to pdf link
        if '/abs/' in path:
            # Keep everything after /abs/ so that old-style ids such as
            # "hep-th/9901001" are not cut down to their last segment.
            paper_id = path.partition('/abs/')[2]
        else:
            paper_id = path.split('/')[-1]
        # A trailing slash would otherwise leave an empty (or dangling) id.
        paper_id = paper_id.strip('/')
        if not paper_id:
            raise ValueError('invalid url')
        url = 'https://www.arxiv.org/pdf/%s.pdf'%(paper_id)
    elif 'pdf' not in path:
        # neither an abstract page nor a pdf link
        raise ValueError('invalid url')

    # download the file
    download_pdf(url, file_path)
    # get the text from the pdf file
    return get_text_from_pdf(file_path)
def download_pdf(url, file_name, timeout=30):
    """
    Download the pdf file from given url and save it as file_name

    Args:
        url: address fetched with a single HTTP GET.
        file_name: path the response body is written to, in binary mode.
        timeout: seconds handed to ``requests.get`` so a stalled server
            cannot block the call indefinitely.

    Raises:
        ValueError: if the response status is anything other than 200.
            Nothing is written to ``file_name`` in that case.
    """
    # Send GET request
    response = requests.get(url, timeout=timeout)

    if response.status_code == 404:
        raise ValueError('cannot download the file')
    if response.status_code != 200:
        # This case used to print the status and return normally, which let
        # the caller go on to read a missing or stale file.
        raise ValueError(
            'cannot download the file (HTTP status %d)' % response.status_code
        )

    # Save the PDF
    with open(file_name, "wb") as f:
        f.write(response.content)
## Input formatting for the given author (reviewer)
# Extracting text from a link
def get_text_from_author_id(author_id, max_count=100, timeout=30):
    """
    Fetch an author's name and papers from the Semantic Scholar Graph API.

    Args:
        author_id: Semantic Scholar author id; converted with ``str`` and
            stripped of surrounding whitespace before use.
        max_count: upper bound on the number of papers returned (the
            ``papers`` list of the response is sliced to this length).
        timeout: seconds handed to ``requests.get`` so a stalled server
            cannot block the call indefinitely.

    Returns:
        ``(name, papers)``: the ``name`` field of the response and the
        first ``max_count`` entries of its ``papers`` field. The request
        asks for ``papers.title`` and ``papers.abstract``, so each entry
        presumably carries those keys -- confirm against the API.

    Raises:
        ValueError: if ``author_id`` is None or blank, or the API answers
            404 for it.
        requests.HTTPError: for any other 4xx/5xx response (e.g. rate
            limiting), instead of failing later on a missing key.
    """
    if author_id is None:
        raise ValueError('Input valid author ID')
    author_id = str(author_id).strip()
    if not author_id:
        # A blank id would produce a malformed endpoint url.
        raise ValueError('Input valid author ID')

    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract"%author_id
    r = requests.get(url, timeout=timeout)
    if r.status_code == 404:
        raise ValueError('Input valid author ID')
    # Surface other error responses here rather than as a KeyError below.
    r.raise_for_status()

    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']
    return name, papers
## TODO Preprocess Extracted Texts from PDFs
# Get a portion of the text for actual task
def get_title(text):
    """
    Placeholder, judging by its name meant to pull the title out of
    extracted paper text.

    Not implemented yet: ``text`` is ignored and None is returned.
    """
    return None
def get_abstract(text):
    """
    Placeholder, judging by its name meant to pull the abstract out of
    extracted paper text.

    Not implemented yet: ``text`` is ignored and None is returned.
    """
    return None
def get_introduction(text):
    """
    Placeholder, judging by its name meant to pull the introduction out of
    extracted paper text.

    Not implemented yet: ``text`` is ignored and None is returned.
    """
    return None
def get_conclusion(text):
    """
    Placeholder, judging by its name meant to pull the conclusion out of
    extracted paper text.

    Not implemented yet: ``text`` is ignored and None is returned.
    """
    return None