Commit
·
eeea145
1 Parent(s):
6018f49
Refactor ID extraction logic in arvix and improve DOI fetching in pmc
Browse files
arvix.py
CHANGED
@@ -113,7 +113,6 @@ def extract_arxiv_data():
|
|
113 |
for temp_id in temp_id_storage:
|
114 |
all_ids.append(temp_id)
|
115 |
random.shuffle(all_ids)
|
116 |
-
print(len(all_ids))
|
117 |
if len(all_ids) > 12:
|
118 |
print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
|
119 |
all_ids = all_ids[:12]
|
@@ -124,7 +123,6 @@ def extract_arxiv_data():
|
|
124 |
if not tools.upload_datafile('arxiv.txt'):
|
125 |
raise Exception("Failed to upload datafile")
|
126 |
return data
|
127 |
-
|
128 |
|
129 |
if __name__ == '__main__':
|
130 |
data = extract_arxiv_data()
|
|
|
113 |
for temp_id in temp_id_storage:
|
114 |
all_ids.append(temp_id)
|
115 |
random.shuffle(all_ids)
|
|
|
116 |
if len(all_ids) > 12:
|
117 |
print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
|
118 |
all_ids = all_ids[:12]
|
|
|
123 |
if not tools.upload_datafile('arxiv.txt'):
|
124 |
raise Exception("Failed to upload datafile")
|
125 |
return data
|
|
|
126 |
|
127 |
if __name__ == '__main__':
|
128 |
data = extract_arxiv_data()
|
pmc.py
CHANGED
@@ -32,11 +32,14 @@ def fetch_dois():
|
|
32 |
page_content = tools.fetch_page(link)
|
33 |
page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
|
34 |
for page_data in page_datas:
|
35 |
-
|
36 |
-
if doi.startswith('10.'):
|
37 |
-
doi_list.append(doi)
|
38 |
-
else:
|
39 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
doi_data[topic] = doi_list
|
41 |
data = json.dumps(doi_data, indent=4, ensure_ascii=False)
|
42 |
return data
|
@@ -48,6 +51,8 @@ def fetch_doi_data():
|
|
48 |
thread = threading.Thread(target=fetch_and_store)
|
49 |
thread.start()
|
50 |
thread.join()
|
|
|
|
|
51 |
return result[0]
|
52 |
|
53 |
def doi_to_pmc():
|
|
|
32 |
page_content = tools.fetch_page(link)
|
33 |
page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
|
34 |
for page_data in page_datas:
|
35 |
+
if not page_data.find("a", href=True):
|
|
|
|
|
|
|
36 |
continue
|
37 |
+
else:
|
38 |
+
doi = page_data.find("a", href=True).text
|
39 |
+
if doi.startswith('10.'):
|
40 |
+
doi_list.append(doi)
|
41 |
+
else:
|
42 |
+
continue
|
43 |
doi_data[topic] = doi_list
|
44 |
data = json.dumps(doi_data, indent=4, ensure_ascii=False)
|
45 |
return data
|
|
|
51 |
thread = threading.Thread(target=fetch_and_store)
|
52 |
thread.start()
|
53 |
thread.join()
|
54 |
+
if len(result) == 0 or not result or result[0] == None:
|
55 |
+
return []
|
56 |
return result[0]
|
57 |
|
58 |
def doi_to_pmc():
|