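"""Collect newly listed arXiv paper IDs for each category.

Scrapes the /new and /recent listing pages for every arXiv category,
filters out IDs already recorded in arxiv.txt, and returns the selection
as JSON keyed by category name.
"""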
import json
import random
import tools
from bs4 import BeautifulSoup
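
# `tools` is a local helper module; its interface is assumed from the calls
# below: fetch_page(url) returns the page HTML, verify_simple_title(title)
# filters titles, check_data_in_file/write_data_to_file track already-used
# IDs in arxiv.txt, and download_datafile/upload_datafile sync that file.
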
def fetch_new_page(category):
    url = f'https://arxiv.org/list/{category}/new'
    return tools.fetch_page(url)


def fetch_recent_page(category):
    url = f'https://arxiv.org/list/{category}/recent'
    return tools.fetch_page(url)
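
# The parsers below assume the layout of arXiv listing pages: each <dl>
# block pairs <dt> entries (carrying the "arXiv:<id>" link) with <dd>
# entries (carrying the metadata, including a div with class "list-title").
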
def extract_new_data(category):
    paper_ids = []
    page_content = fetch_new_page(category)
    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in lists:
        papers = listing.find_all('dt')
        paper_contents = listing.find_all('dd')
        titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip()
                  for paper_content in paper_contents]
        for paper, title in zip(papers, titles):
            # Skip entries whose titles fail the simple-title check.
            if not tools.verify_simple_title(title):
                continue
            paper_link = paper.find('a', href=True)
            if paper_link:
                # The link text has the form "arXiv:<id>"; keep only the ID.
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids


def extract_recent_data(category):
    paper_ids = []
    page_content = fetch_recent_page(category)
    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in lists:
        papers = listing.find_all('dt')
        for paper in papers:
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids


def extract_data(category):
    sanitized_data = []
    new_data = extract_new_data(category)
    recent_data = extract_recent_data(category)
    data = list(set(new_data + recent_data))
    if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
        # High Energy Physics sub-categories contribute at most one new ID each.
        category_list = []
        for paper_id in data:
            if len(category_list) >= 1:
                break
            if tools.check_data_in_file(paper_id, 'arxiv.txt'):
                continue
            category_list.append(paper_id)
        for category_id in category_list:
            sanitized_data.append(category_id)
            tools.write_data_to_file(category_id, 'arxiv.txt')
    else:
        # Every other category contributes at most two new IDs.
        for paper_id in data:
            if len(sanitized_data) >= 2:
                break
            if tools.check_data_in_file(paper_id, 'arxiv.txt'):
                continue
            tools.write_data_to_file(paper_id, 'arxiv.txt')
            sanitized_data.append(paper_id)
    random.shuffle(sanitized_data)
    return sanitized_data


def extract_arxiv_data():
    if not tools.download_datafile('arxiv.txt'):
        raise Exception("Failed to download datafile")
    categories = {
        "Astrophysics": ["astro-ph"],
        "Condensed Matter": ["cond-mat"],
        "General Relativity and Quantum Cosmology": ["gr-qc"],
        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
        "Mathematical Physics": ["math-ph"],
        "Nonlinear Sciences": ["nlin"],
        "Nuclear Experiment": ["nucl-ex"],
        "Nuclear Theory": ["nucl-th"],
        "Physics": ["physics"],
        "Quantum Physics": ["quant-ph"],
        "Mathematics": ["math"],
        "Computer Science": ["cs"],
        "Quantitative Biology": ["q-bio"],
        "Quantitative Finance": ["q-fin"],
        "Statistics": ["stat"],
        "Electrical Engineering and Systems Science": ["eess"],
        "Economics": ["econ"]
    }
    data = {}
    for category, subcategories in categories.items():
        category_data = {}
        all_ids = []
        temp_id_storage = []
        for subcategory in subcategories:
            ids = extract_data(subcategory)
            if len(ids) == 2:
                # Hold back full pairs so they are appended after the singles.
                for paper_id in ids:
                    temp_id_storage.append(paper_id)
            else:
                for paper_id in ids:
                    all_ids.append(paper_id)
        for temp_id in temp_id_storage:
            all_ids.append(temp_id)
        random.shuffle(all_ids)
        if len(all_ids) > 2:
            print(f"Found more than 2 papers for {category}.")
        category_data['count'] = len(all_ids)
        category_data['ids'] = all_ids
        data[category] = category_data
    data = json.dumps(data, indent=4, ensure_ascii=False)
    if not tools.upload_datafile('arxiv.txt'):
        raise Exception("Failed to upload datafile")
    return data


if __name__ == '__main__':
    data = extract_arxiv_data()
    with open('arxiv_data.json', 'w') as f:
        f.write(data)