raannakasturi committed
Commit 11f88e0 · verified · 1 Parent(s): c1df272

Update pmc.py

Files changed (1)
  1. pmc.py +114 -112
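
The only functional change is in extract_pmc_data(): the per-topic loop now stops collecting once two new PMC IDs have been gathered, capping each topic at two entries per run; the rest of the file is carried over unchanged (see the sketch after the diff).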
pmc.py CHANGED
@@ -1,113 +1,115 @@
-import json
-import xml.etree.ElementTree as ET
-from bs4 import BeautifulSoup
-import requests
-import tools
-import threading
-
-def fetch_links(category):
-    links = []
-    xml_data = tools.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
-    items = ET.fromstring(xml_data).findall('channel/item')
-    for item in items:
-        link = item.find('link').text
-        links.append(link)
-    return links
-
-def fetch_all_links():
-    categories = ["Science", "Health", "Environment", "Technology", "Society"]
-    sd_links_data = {}
-    for category in categories:
-        links = fetch_links(category)
-        sd_links_data[category] = links
-    data = json.dumps(sd_links_data, indent=4, ensure_ascii=False)
-    return data
-
-def fetch_dois():
-    doi_data = {}
-    data = json.loads(fetch_all_links())
-    for topic, links in data.items():
-        doi_list = []
-        for link in links:
-            page_content = tools.fetch_page(link)
-            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
-            for page_data in page_datas:
-                if not page_data.find("a", href=True):
-                    continue
-                else:
-                    doi = page_data.find("a", href=True).text
-                    if doi.startswith('10.'):
-                        doi_list.append(doi)
-                    else:
-                        continue
-        doi_data[topic] = doi_list
-    data = json.dumps(doi_data, indent=4, ensure_ascii=False)
-    return data
-
-def fetch_doi_data():
-    result = []
-    def fetch_and_store():
-        result.append(fetch_dois())
-    thread = threading.Thread(target=fetch_and_store)
-    thread.start()
-    thread.join()
-    if len(result) == 0 or not result or result[0] == None:
-        return []
-    return result[0]
-
-def doi_to_pmc():
-    data = json.loads(fetch_doi_data())
-    pmc_data = {}
-    for topic, dois in data.items():
-        if len(dois) > 0:
-            doi_list = ""
-            for doi in dois:
-                doi_list += doi + ","
-            doi_list = doi_list.rstrip(',')
-            try:
-                url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/[email protected]&ids={doi_list}&format=json"
-                doi_pmc_data = requests.get(url).json()
-            except Exception as e:
-                print(f"Error: {str(e)}")
-            if doi_pmc_data['status'] == 'ok':
-                pmc_list = []
-                for record in doi_pmc_data['records']:
-                    if 'pmcid' in record:
-                        if 'live' in record and record['live'] == False:
-                            continue
-                        pmc_list.append(record['pmcid'])
-                    else:
-                        continue
-                pmc_data[topic] = pmc_list
-            else:
-                continue
-        else:
-            continue
-    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
-    return data
-
-def extract_pmc_data():
-    if not tools.download_datafile('pmc.txt'):
-        raise Exception("Failed to download datafile")
-    pmc_data ={}
-    pmcid_data = json.loads(doi_to_pmc())
-    for topic, pmcids in pmcid_data.items():
-        pmc_ids = []
-        for id in pmcids:
-            if tools.check_data_in_file(id, 'pmc.txt'):
-                continue
-            else:
-                tools.write_data_to_file(id, 'pmc.txt')
-                pmc_ids.append(id)
-        pmc_data[topic] = {}
-        pmc_data[topic]['count'] = len(pmc_ids)
-        pmc_data[topic]['ids'] = pmc_ids
-    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
-    if not tools.upload_datafile('pmc.txt'):
-        raise Exception("Failed to upload datafile")
-    return data
-
-if __name__ == "__main__":
-    data = extract_pmc_data()
-    with open('pmc_data.json', 'w') as f:
-        f.write(data)
+import json
+import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup
+import requests
+import tools
+import threading
+
+def fetch_links(category):
+    links = []
+    xml_data = tools.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
+    items = ET.fromstring(xml_data).findall('channel/item')
+    for item in items:
+        link = item.find('link').text
+        links.append(link)
+    return links
+
+def fetch_all_links():
+    categories = ["Science", "Health", "Environment", "Technology", "Society"]
+    sd_links_data = {}
+    for category in categories:
+        links = fetch_links(category)
+        sd_links_data[category] = links
+    data = json.dumps(sd_links_data, indent=4, ensure_ascii=False)
+    return data
+
+def fetch_dois():
+    doi_data = {}
+    data = json.loads(fetch_all_links())
+    for topic, links in data.items():
+        doi_list = []
+        for link in links:
+            page_content = tools.fetch_page(link)
+            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
+            for page_data in page_datas:
+                if not page_data.find("a", href=True):
+                    continue
+                else:
+                    doi = page_data.find("a", href=True).text
+                    if doi.startswith('10.'):
+                        doi_list.append(doi)
+                    else:
+                        continue
+        doi_data[topic] = doi_list
+    data = json.dumps(doi_data, indent=4, ensure_ascii=False)
+    return data
+
+def fetch_doi_data():
+    result = []
+    def fetch_and_store():
+        result.append(fetch_dois())
+    thread = threading.Thread(target=fetch_and_store)
+    thread.start()
+    thread.join()
+    if len(result) == 0 or not result or result[0] == None:
+        return []
+    return result[0]
+
+def doi_to_pmc():
+    data = json.loads(fetch_doi_data())
+    pmc_data = {}
+    for topic, dois in data.items():
+        if len(dois) > 0:
+            doi_list = ""
+            for doi in dois:
+                doi_list += doi + ","
+            doi_list = doi_list.rstrip(',')
+            try:
+                url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/[email protected]&ids={doi_list}&format=json"
+                doi_pmc_data = requests.get(url).json()
+            except Exception as e:
+                print(f"Error: {str(e)}")
+            if doi_pmc_data['status'] == 'ok':
+                pmc_list = []
+                for record in doi_pmc_data['records']:
+                    if 'pmcid' in record:
+                        if 'live' in record and record['live'] == False:
+                            continue
+                        pmc_list.append(record['pmcid'])
+                    else:
+                        continue
+                pmc_data[topic] = pmc_list
+            else:
+                continue
+        else:
+            continue
+    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
+    return data
+
+def extract_pmc_data():
+    if not tools.download_datafile('pmc.txt'):
+        raise Exception("Failed to download datafile")
+    pmc_data ={}
+    pmcid_data = json.loads(doi_to_pmc())
+    for topic, pmcids in pmcid_data.items():
+        pmc_ids = []
+        for id in pmcids:
+            if len(pmc_ids) >= 2:
+                continue
+            elif tools.check_data_in_file(id, 'pmc.txt'):
+                continue
+            else:
+                tools.write_data_to_file(id, 'pmc.txt')
+                pmc_ids.append(id)
+        pmc_data[topic] = {}
+        pmc_data[topic]['count'] = len(pmc_ids)
+        pmc_data[topic]['ids'] = pmc_ids
+    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
+    if not tools.upload_datafile('pmc.txt'):
+        raise Exception("Failed to upload datafile")
+    return data
+
+if __name__ == "__main__":
+    data = extract_pmc_data()
+    with open('pmc_data.json', 'w') as f:
+        f.write(data)
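
To make the behavioral change easier to see in isolation, here is a minimal sketch of the new selection rule, with seen() and remember() as hypothetical stand-ins for tools.check_data_in_file and tools.write_data_to_file (whose implementations are not part of this commit):

def select_new_ids(pmcids, seen, remember, cap=2):
    """Keep at most `cap` PMC IDs per topic, skipping any already recorded."""
    selected = []
    for pmcid in pmcids:
        if len(selected) >= cap:
            continue  # topic already has its quota of new IDs this run
        if seen(pmcid):
            continue  # ID was already processed in an earlier run
        remember(pmcid)  # record the ID so later runs skip it
        selected.append(pmcid)
    return selected

# Example: with an empty history, only the first two unseen IDs survive.
history = set()
print(select_new_ids(["PMC111", "PMC222", "PMC333"],
                     seen=lambda i: i in history,
                     remember=history.add))
# ['PMC111', 'PMC222']

As in the committed code, the cap check uses continue rather than break, so the loop still walks the remaining IDs without recording them; since nothing else happens in the loop body, the result is the same either way.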