raannakasturi committed on
Commit 9688967 · verified · 1 Parent(s): 8f664d2

Update arvix.py

Files changed (1)
  1. arvix.py +129 -129
arvix.py CHANGED
@@ -1,130 +1,130 @@
-import json
-import random
-import tools
-from bs4 import BeautifulSoup
-
-def fetch_new_page(category):
-    url = f'https://arxiv.org/list/{category}/new'
-    return tools.fetch_page(url)
-
-def fetch_recent_page(category):
-    url = f'https://arxiv.org/list/{category}/recent'
-    return tools.fetch_page(url)
-
-def extract_new_data(category):
-    paper_ids = []
-    page_content = fetch_new_page(category)
-    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
-    for list in lists:
-        papers = list.find_all('dt')
-        paper_contents = list.find_all('dd')
-        titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip() for paper_content in paper_contents]
-        for paper, title in zip(papers, titles):
-            if not tools.verify_simple_title(title):
-                continue
-            else:
-                paper_link = paper.find('a', href=True)
-                if paper_link:
-                    paper_id = paper_link.text.strip().split(':')[1]
-                    paper_ids.append(paper_id)
-                else:
-                    continue
-    return paper_ids
-
-def extract_recent_data(category):
-    paper_ids = []
-    page_content = fetch_recent_page(category)
-    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
-    for list in lists:
-        papers = list.find_all('dt')
-        for paper in papers:
-            paper_link = paper.find('a', href=True)
-            if paper_link:
-                paper_id = paper_link.text.strip().split(':')[1]
-                paper_ids.append(paper_id)
-            else:
-                continue
-    return paper_ids
-
-def extract_data(category):
-    sanitized_data = []
-    new_data = extract_new_data(category)
-    recent_data = extract_recent_data(category)
-    data = list(set(new_data + recent_data))
-    if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
-        category_list = []
-        for id in data:
-            if len(category_list) >= 3:
-                break
-            if tools.check_data_in_file(id, 'arxiv.txt'):
-                continue
-            else:
-                category_list.append(id)
-        for category_id in category_list:
-            sanitized_data.append(category_id)
-            tools.write_data_to_file(id, 'arxiv.txt')
-    else:
-        for id in data:
-            if len(sanitized_data) >= 12:
-                break
-            if tools.check_data_in_file(id, 'arxiv.txt'):
-                continue
-            else:
-                tools.write_data_to_file(id, 'arxiv.txt')
-                sanitized_data.append(id)
-    random.shuffle(sanitized_data)
-    return sanitized_data
-
-def extract_arxiv_data():
-    if not tools.download_datafile('arxiv.txt'):
-        raise Exception("Failed to download datafile")
-    categories = {
-        "Astrophysics": ["astro-ph"],
-        "Condensed Matter": ["cond-mat"],
-        "General Relativity and Quantum Cosmology": ["gr-qc"],
-        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
-        "Mathematical Physics": ["math-ph"],
-        "Nonlinear Sciences": ["nlin"],
-        "Nuclear Experiment": ["nucl-ex"],
-        "Nuclear Theory": ["nucl-th"],
-        "Physics": ["physics"],
-        "Quantum Physics": ["quant-ph"],
-        "Mathematics": ["math"],
-        "Computer Science": ["cs"],
-        "Quantitative Biology": ["q-bio"],
-        "Quantitative Finance": ["q-fin"],
-        "Statistics": ["stat"],
-        "Electrical Engineering and Systems Science": ["eess"],
-        "Economics": ["econ"]
-    }
-    data = {}
-    for category, subcategories in categories.items():
-        category_data = {}
-        all_ids = []
-        temp_id_storage = []
-        for subcategory in subcategories:
-            ids = extract_data(subcategory)
-            if len(ids) == 3:
-                for id in ids:
-                    temp_id_storage.append(id)
-            else:
-                for id in ids:
-                    all_ids.append(id)
-        for temp_id in temp_id_storage:
-            all_ids.append(temp_id)
-        random.shuffle(all_ids)
-        if len(all_ids) > 12:
-            print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
-            all_ids = all_ids[:12]
-        category_data['count'] = len(all_ids)
-        category_data['ids'] = all_ids
-        data[category] = category_data
-    data = json.dumps(data, indent=4, ensure_ascii=False)
-    if not tools.upload_datafile('arxiv.txt'):
-        raise Exception("Failed to upload datafile")
-    return data
-
-if __name__ == '__main__':
-    data = extract_arxiv_data()
-    with open('arxiv_data.json', 'w') as f:
+import json
+import random
+import tools
+from bs4 import BeautifulSoup
+
+def fetch_new_page(category):
+    url = f'https://arxiv.org/list/{category}/new'
+    return tools.fetch_page(url)
+
+def fetch_recent_page(category):
+    url = f'https://arxiv.org/list/{category}/recent'
+    return tools.fetch_page(url)
+
+def extract_new_data(category):
+    paper_ids = []
+    page_content = fetch_new_page(category)
+    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
+    for list in lists:
+        papers = list.find_all('dt')
+        paper_contents = list.find_all('dd')
+        titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip() for paper_content in paper_contents]
+        for paper, title in zip(papers, titles):
+            if not tools.verify_simple_title(title):
+                continue
+            else:
+                paper_link = paper.find('a', href=True)
+                if paper_link:
+                    paper_id = paper_link.text.strip().split(':')[1]
+                    paper_ids.append(paper_id)
+                else:
+                    continue
+    return paper_ids
+
+def extract_recent_data(category):
+    paper_ids = []
+    page_content = fetch_recent_page(category)
+    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
+    for list in lists:
+        papers = list.find_all('dt')
+        for paper in papers:
+            paper_link = paper.find('a', href=True)
+            if paper_link:
+                paper_id = paper_link.text.strip().split(':')[1]
+                paper_ids.append(paper_id)
+            else:
+                continue
+    return paper_ids
+
+def extract_data(category):
+    sanitized_data = []
+    new_data = extract_new_data(category)
+    recent_data = extract_recent_data(category)
+    data = list(set(new_data + recent_data))
+    if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
+        category_list = []
+        for id in data:
+            if len(category_list) >= 1:
+                break
+            if tools.check_data_in_file(id, 'arxiv.txt'):
+                continue
+            else:
+                category_list.append(id)
+        for category_id in category_list:
+            sanitized_data.append(category_id)
+            tools.write_data_to_file(id, 'arxiv.txt')
+    else:
+        for id in data:
+            if len(sanitized_data) >= 3:
+                break
+            if tools.check_data_in_file(id, 'arxiv.txt'):
+                continue
+            else:
+                tools.write_data_to_file(id, 'arxiv.txt')
+                sanitized_data.append(id)
+    random.shuffle(sanitized_data)
+    return sanitized_data
+
+def extract_arxiv_data():
+    if not tools.download_datafile('arxiv.txt'):
+        raise Exception("Failed to download datafile")
+    categories = {
+        "Astrophysics": ["astro-ph"],
+        "Condensed Matter": ["cond-mat"],
+        "General Relativity and Quantum Cosmology": ["gr-qc"],
+        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
+        "Mathematical Physics": ["math-ph"],
+        "Nonlinear Sciences": ["nlin"],
+        "Nuclear Experiment": ["nucl-ex"],
+        "Nuclear Theory": ["nucl-th"],
+        "Physics": ["physics"],
+        "Quantum Physics": ["quant-ph"],
+        "Mathematics": ["math"],
+        "Computer Science": ["cs"],
+        "Quantitative Biology": ["q-bio"],
+        "Quantitative Finance": ["q-fin"],
+        "Statistics": ["stat"],
+        "Electrical Engineering and Systems Science": ["eess"],
+        "Economics": ["econ"]
+    }
+    data = {}
+    for category, subcategories in categories.items():
+        category_data = {}
+        all_ids = []
+        temp_id_storage = []
+        for subcategory in subcategories:
+            ids = extract_data(subcategory)
+            if len(ids) == 3:
+                for id in ids:
+                    temp_id_storage.append(id)
+            else:
+                for id in ids:
+                    all_ids.append(id)
+        for temp_id in temp_id_storage:
+            all_ids.append(temp_id)
+        random.shuffle(all_ids)
+        if len(all_ids) > 3:
+            print(f"Found more than 3 papers for {category}.")
+            all_ids = all_ids
+        category_data['count'] = len(all_ids)
+        category_data['ids'] = all_ids
+        data[category] = category_data
+    data = json.dumps(data, indent=4, ensure_ascii=False)
+    if not tools.upload_datafile('arxiv.txt'):
+        raise Exception("Failed to upload datafile")
+    return data
+
+if __name__ == '__main__':
+    data = extract_arxiv_data()
+    with open('arxiv_data.json', 'w') as f:
         f.write(data)
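
For context, arvix.py leans on a project-local `tools` module that is not part of this diff. Below is a minimal sketch of the helper signatures the script appears to assume, inferred only from the call sites above; the function names match the calls in the diff, but the bodies are placeholder guesses, not the project's actual implementation.

    # Hypothetical stubs for the `tools` module used by arvix.py.
    # Signatures are inferred from the call sites in the diff; bodies are placeholders.
    import urllib.request

    def fetch_page(url: str) -> str:
        """Return the HTML of `url` (arvix.py feeds this to BeautifulSoup)."""
        with urllib.request.urlopen(url) as resp:
            return resp.read().decode("utf-8")

    def verify_simple_title(title: str) -> bool:
        """Heuristic filter on paper titles; arvix.py skips titles that fail it."""
        return bool(title)

    def check_data_in_file(paper_id: str, filename: str) -> bool:
        """True if `paper_id` is already recorded in `filename` (assumed one id per line)."""
        try:
            with open(filename, encoding="utf-8") as f:
                return paper_id in {line.strip() for line in f}
        except FileNotFoundError:
            return False

    def write_data_to_file(paper_id: str, filename: str) -> None:
        """Append `paper_id` to `filename` so it is not selected again on later runs."""
        with open(filename, "a", encoding="utf-8") as f:
            f.write(paper_id + "\n")

    def download_datafile(filename: str) -> bool:
        """Fetch the shared id-history file from wherever it is stored; True on success."""
        return True  # placeholder

    def upload_datafile(filename: str) -> bool:
        """Push the updated id-history file back; True on success."""
        return True  # placeholder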