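"""Collect ScienceDaily top-story links per category, scrape journal-reference
DOIs from each article page, convert those DOIs to PMC IDs via the NCBI ID
converter service, and persist up to three new PMC IDs per topic in pmc.txt
(downloaded and re-uploaded through the external `tools` helpers)."""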
import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import requests
import tools
import threading

def fetch_links(category):
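    """Return the article links from the ScienceDaily top-stories RSS feed for `category`."""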
    links = []
    xml_data = tools.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
    items = ET.fromstring(xml_data).findall('channel/item')
    for item in items:
        link = item.find('link')
        # Guard against items missing a <link> element.
        if link is not None and link.text:
            links.append(link.text)
    return links

def fetch_all_links():
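    """Fetch links for every category and return them as a pretty-printed JSON string."""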
    categories = ["Science", "Health", "Environment", "Technology", "Society"]
    sd_links_data = {}
    for category in categories:
        links = fetch_links(category)
        sd_links_data[category] = links
    data = json.dumps(sd_links_data, indent=4, ensure_ascii=False)
    return data

def fetch_dois():
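    """Scrape each article's journal-references block and collect its DOIs,
    returned per topic as a JSON string."""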
    doi_data = {}
    data = json.loads(fetch_all_links())
    for topic, links in data.items():
        doi_list = []
        for link in links:
            page_content = tools.fetch_page(link)
            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
            for page_data in page_datas:
                anchor = page_data.find("a", href=True)
                if anchor is None:
                    continue
                # Keep only reference links whose text is a DOI (prefix "10.").
                doi = anchor.text
                if doi.startswith('10.'):
                    doi_list.append(doi)
        doi_data[topic] = doi_list
    data = json.dumps(doi_data, indent=4, ensure_ascii=False)
    return data

def fetch_doi_data():
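    """Run fetch_dois in a worker thread and hand back its JSON result.

    The thread is started and immediately joined, so this is effectively a
    synchronous call that isolates fetch_dois from the caller's thread.
    """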
    result = []
    def fetch_and_store():
        result.append(fetch_dois())
    thread = threading.Thread(target=fetch_and_store)
    thread.start()
    thread.join()
    if not result or result[0] is None:
        # Return an empty JSON object so callers can still json.loads() the result.
        return "{}"
    return result[0]

def doi_to_pmc():
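    """Convert each topic's DOIs to PMC IDs with the NCBI ID converter API
    and return the mapping as a JSON string."""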
    data = json.loads(fetch_doi_data())
    pmc_data = {}
    for topic, dois in data.items():
        if not dois:
            continue
        doi_list = ",".join(dois)
        try:
            url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/[email protected]&ids={doi_list}&format=json"
            doi_pmc_data = requests.get(url).json()
        except Exception as e:
            # Skip this topic on a failed request; otherwise doi_pmc_data
            # would be referenced below while unbound.
            print(f"Error: {str(e)}")
            continue
        if doi_pmc_data['status'] != 'ok':
            continue
        pmc_list = []
        for record in doi_pmc_data['records']:
            # Skip records without a PMC ID or explicitly marked as not live.
            if 'pmcid' not in record:
                continue
            if record.get('live') is False:
                continue
            pmc_list.append(record['pmcid'])
        pmc_data[topic] = pmc_list
    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
    return data

def extract_pmc_data():
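    """Pick up to three unseen PMC IDs per topic, record them in pmc.txt,
    and return a JSON summary with per-topic counts and IDs."""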
    if not tools.download_datafile('pmc.txt'):
        raise Exception("Failed to download datafile")
    pmc_data = {}
    pmcid_data = json.loads(doi_to_pmc())
    for topic, pmcids in pmcid_data.items():
        pmc_ids = []
        for pmcid in pmcids:
            # Record at most three PMC IDs per topic, skipping any already seen.
            if len(pmc_ids) >= 3:
                break
            if tools.check_data_in_file(pmcid, 'pmc.txt'):
                continue
            tools.write_data_to_file(pmcid, 'pmc.txt')
            pmc_ids.append(pmcid)
        pmc_data[topic] = {'count': len(pmc_ids), 'ids': pmc_ids}
    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
    if not tools.upload_datafile('pmc.txt'):
        raise Exception("Failed to upload datafile")
    return data

if __name__ == "__main__":
    data = extract_pmc_data()
    with open('pmc_data.json', 'w', encoding='utf-8') as f:
        f.write(data)