raannakasturi commited on
Commit
eeea145
·
1 Parent(s): 6018f49

Refactor ID extraction logic in arvix.py and improve DOI fetching in pmc.py

Browse files
Files changed (2) hide show
  1. arvix.py +0 -2
  2. pmc.py +9 -4
arvix.py CHANGED
@@ -113,7 +113,6 @@ def extract_arxiv_data():
113
  for temp_id in temp_id_storage:
114
  all_ids.append(temp_id)
115
  random.shuffle(all_ids)
116
- print(len(all_ids))
117
  if len(all_ids) > 12:
118
  print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
119
  all_ids = all_ids[:12]
@@ -124,7 +123,6 @@ def extract_arxiv_data():
124
  if not tools.upload_datafile('arxiv.txt'):
125
  raise Exception("Failed to upload datafile")
126
  return data
127
-
128
 
129
  if __name__ == '__main__':
130
  data = extract_arxiv_data()
 
113
  for temp_id in temp_id_storage:
114
  all_ids.append(temp_id)
115
  random.shuffle(all_ids)
 
116
  if len(all_ids) > 12:
117
  print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
118
  all_ids = all_ids[:12]
 
123
  if not tools.upload_datafile('arxiv.txt'):
124
  raise Exception("Failed to upload datafile")
125
  return data
 
126
 
127
  if __name__ == '__main__':
128
  data = extract_arxiv_data()
pmc.py CHANGED
@@ -32,11 +32,14 @@ def fetch_dois():
32
  page_content = tools.fetch_page(link)
33
  page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
34
  for page_data in page_datas:
35
- doi = page_data.find("a", href=True).text
36
- if doi.startswith('10.'):
37
- doi_list.append(doi)
38
- else:
39
  continue
 
 
 
 
 
 
40
  doi_data[topic] = doi_list
41
  data = json.dumps(doi_data, indent=4, ensure_ascii=False)
42
  return data
@@ -48,6 +51,8 @@ def fetch_doi_data():
48
  thread = threading.Thread(target=fetch_and_store)
49
  thread.start()
50
  thread.join()
 
 
51
  return result[0]
52
 
53
  def doi_to_pmc():
 
32
  page_content = tools.fetch_page(link)
33
  page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
34
  for page_data in page_datas:
35
+ if not page_data.find("a", href=True):
 
 
 
36
  continue
37
+ else:
38
+ doi = page_data.find("a", href=True).text
39
+ if doi.startswith('10.'):
40
+ doi_list.append(doi)
41
+ else:
42
+ continue
43
  doi_data[topic] = doi_list
44
  data = json.dumps(doi_data, indent=4, ensure_ascii=False)
45
  return data
 
51
  thread = threading.Thread(target=fetch_and_store)
52
  thread.start()
53
  thread.join()
54
+ if len(result) == 0 or not result or result[0] == None:
55
+ return []
56
  return result[0]
57
 
58
  def doi_to_pmc():