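"""Scrape the arXiv /new and /recent listing pages and pick a few fresh paper ids per category.

Selected ids are recorded in arxiv.txt (downloaded and re-uploaded through the local
`tools` module) so the same paper is not picked again on a later run.
"""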
import json
import random
import tools
from bs4 import BeautifulSoup
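
# The local `tools` module is not part of this file. From the way it is used
# below it is assumed to provide roughly:
#   fetch_page(url)                   -> HTML of the listing page as a string
#   verify_simple_title(title)        -> True if the title passes a simplicity check
#   check_data_in_file(id, path)      -> True if the id is already recorded in the file
#   write_data_to_file(id, path)      -> append the id to the file
#   download_datafile(path), upload_datafile(path) -> True on success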

def fetch_new_page(category):
    url = f'https://arxiv.org/list/{category}/new'
    return tools.fetch_page(url)

def fetch_recent_page(category):
    url = f'https://arxiv.org/list/{category}/recent'
    return tools.fetch_page(url)
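
# The listing pages are assumed to group announcements roughly as:
#   <dl>
#     <dt> ... <a href="/abs/2501.01234">arXiv:2501.01234</a> ... </dt>
#     <dd> ... <div class="list-title">Title: Some paper title</div> ... </dd>
#   </dl>
# (id and title here are made up). extract_new_data and extract_recent_data
# below walk this structure.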

def extract_new_data(category):
    """Return ids of papers on the /new listing whose titles pass the simple-title check."""
    paper_ids = []
    page_content = fetch_new_page(category)
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        papers = listing.find_all('dt')
        paper_contents = listing.find_all('dd')
        titles = [
            content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip()
            for content in paper_contents
        ]
        for paper, title in zip(papers, titles):
            if not tools.verify_simple_title(title):
                continue
            paper_link = paper.find('a', href=True)
            if paper_link:
                # The link text looks like "arXiv:2501.01234"; keep the part after the colon.
                paper_ids.append(paper_link.text.strip().split(':')[1])
    return paper_ids

def extract_recent_data(category):
    """Return ids of all papers listed on the /recent page for a category."""
    paper_ids = []
    page_content = fetch_recent_page(category)
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        for paper in listing.find_all('dt'):
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_ids.append(paper_link.text.strip().split(':')[1])
    return paper_ids

def extract_data(category):
    """Pick unseen paper ids for a subcategory: at most one for hep-* listings, at most two otherwise.

    Ids already recorded in arxiv.txt are skipped; newly selected ids are appended
    to arxiv.txt so they are not reused on later runs.
    """
    sanitized_data = []
    new_data = extract_new_data(category)
    recent_data = extract_recent_data(category)
    data = list(set(new_data + recent_data))
    if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
        category_list = []
        for paper_id in data:
            if len(category_list) >= 1:
                break
            if tools.check_data_in_file(paper_id, 'arxiv.txt'):
                continue
            category_list.append(paper_id)
        for category_id in category_list:
            sanitized_data.append(category_id)
            # Record the selected id so it is skipped on future runs.
            tools.write_data_to_file(category_id, 'arxiv.txt')
    else:
        for paper_id in data:
            if len(sanitized_data) >= 2:
                break
            if tools.check_data_in_file(paper_id, 'arxiv.txt'):
                continue
            tools.write_data_to_file(paper_id, 'arxiv.txt')
            sanitized_data.append(paper_id)
    random.shuffle(sanitized_data)
    return sanitized_data

def extract_arxiv_data():
    """Collect fresh paper ids for every arXiv category group and return them as a JSON string."""
    if not tools.download_datafile('arxiv.txt'):
        raise Exception("Failed to download datafile")
    categories = {
        "Astrophysics": ["astro-ph"],
        "Condensed Matter": ["cond-mat"],
        "General Relativity and Quantum Cosmology": ["gr-qc"],
        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
        "Mathematical Physics": ["math-ph"],
        "Nonlinear Sciences": ["nlin"],
        "Nuclear Experiment": ["nucl-ex"],
        "Nuclear Theory": ["nucl-th"],
        "Physics": ["physics"],
        "Quantum Physics": ["quant-ph"],
        "Mathematics": ["math"],
        "Computer Science": ["cs"],
        "Quantitative Biology": ["q-bio"],
        "Quantitative Finance": ["q-fin"],
        "Statistics": ["stat"],
        "Electrical Engineering and Systems Science": ["eess"],
        "Economics": ["econ"]
    }
    data = {}
    for category, subcategories in categories.items():
        category_data = {}
        all_ids = []
        temp_id_storage = []
        for subcategory in subcategories:
            ids = extract_data(subcategory)
            # Hold back subcategories that returned a full pair of ids so the
            # remaining ids are added first; the list is shuffled below anyway.
            if len(ids) == 2:
                temp_id_storage.extend(ids)
            else:
                all_ids.extend(ids)
        all_ids.extend(temp_id_storage)
        random.shuffle(all_ids)
        if len(all_ids) > 2:
            print(f"Found more than 2 papers for {category}.")
        category_data['count'] = len(all_ids)
        category_data['ids'] = all_ids
        data[category] = category_data
    data = json.dumps(data, indent=4, ensure_ascii=False)
    if not tools.upload_datafile('arxiv.txt'):
        raise Exception("Failed to upload datafile")
    return data

if __name__ == '__main__':
    data = extract_arxiv_data()
    with open('arxiv_data.json', 'w') as f:
        f.write(data)
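
# For reference, arxiv_data.json ends up shaped roughly like this
# (the ids are illustrative, not real output):
# {
#     "Astrophysics": {"count": 2, "ids": ["2501.01234", "2501.04321"]},
#     "High Energy Physics": {"count": 4, "ids": ["..."]},
#     ...
# }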