raannakasturi commited on
Commit
eeea145
·
1 Parent(s): 6018f49

Refactor ID extraction logic in arvix.py and improve DOI fetching in pmc.py

Browse files
Files changed (2) hide show
  1. arvix.py +0 -2
  2. pmc.py +9 -4
arvix.py CHANGED
@@ -113,7 +113,6 @@ def extract_arxiv_data():
113
  for temp_id in temp_id_storage:
114
  all_ids.append(temp_id)
115
  random.shuffle(all_ids)
116
- print(len(all_ids))
117
  if len(all_ids) > 12:
118
  print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
119
  all_ids = all_ids[:12]
@@ -124,7 +123,6 @@ def extract_arxiv_data():
124
  if not tools.upload_datafile('arxiv.txt'):
125
  raise Exception("Failed to upload datafile")
126
  return data
127
-
128
 
129
  if __name__ == '__main__':
130
  data = extract_arxiv_data()
 
113
  for temp_id in temp_id_storage:
114
  all_ids.append(temp_id)
115
  random.shuffle(all_ids)
 
116
  if len(all_ids) > 12:
117
  print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
118
  all_ids = all_ids[:12]
 
123
  if not tools.upload_datafile('arxiv.txt'):
124
  raise Exception("Failed to upload datafile")
125
  return data
 
126
 
127
  if __name__ == '__main__':
128
  data = extract_arxiv_data()
pmc.py CHANGED
@@ -32,11 +32,14 @@ def fetch_dois():
32
  page_content = tools.fetch_page(link)
33
  page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
34
  for page_data in page_datas:
35
- doi = page_data.find("a", href=True).text
36
- if doi.startswith('10.'):
37
- doi_list.append(doi)
38
- else:
39
  continue
 
 
 
 
 
 
40
  doi_data[topic] = doi_list
41
  data = json.dumps(doi_data, indent=4, ensure_ascii=False)
42
  return data
@@ -48,6 +51,8 @@ def fetch_doi_data():
48
  thread = threading.Thread(target=fetch_and_store)
49
  thread.start()
50
  thread.join()
 
 
51
  return result[0]
52
 
53
  def doi_to_pmc():
 
32
  page_content = tools.fetch_page(link)
33
  page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
34
  for page_data in page_datas:
35
+ if not page_data.find("a", href=True):
 
 
 
36
  continue
37
+ else:
38
+ doi = page_data.find("a", href=True).text
39
+ if doi.startswith('10.'):
40
+ doi_list.append(doi)
41
+ else:
42
+ continue
43
  doi_data[topic] = doi_list
44
  data = json.dumps(doi_data, indent=4, ensure_ascii=False)
45
  return data
 
51
  thread = threading.Thread(target=fetch_and_store)
52
  thread.start()
53
  thread.join()
54
+ if len(result) == 0 or not result or result[0] == None:
55
+ return []
56
  return result[0]
57
 
58
  def doi_to_pmc():