File size: 3,658 Bytes
928f123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import re
import json
import requests
import datetime
from datetime import date
from datetime import datetime
import xml.etree.ElementTree as ET
from requests.exceptions import HTTPError

def _get_today():
    """Return today's local date as a ``YYYY-MM-DD`` string."""
    today = date.today()
    # date.__str__ is defined as isoformat(), so this matches str(today).
    return today.isoformat()

def _download_pdf_from_arxiv(filename, *, timeout=30):
    """Fetch a PDF from arxiv.org and return its raw bytes.

    Args:
        filename: Path component appended to ``https://arxiv.org/pdf/``
            (the caller in this file passes ``"<arxiv_id>.pdf"``).
        timeout: Seconds handed to ``requests.get``. requests applies no
            timeout unless one is given, so without it a stalled
            connection would block the caller indefinitely.

    Returns:
        The response body as ``bytes``.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: Propagated unchanged on
            connection errors or when the timeout expires.
    """
    url = f'https://arxiv.org/pdf/{filename}'
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download pdf for arXiv id {filename}")
    return response.content

def download_pdf_from_arxiv(arxiv_id):
    """Download the PDF for *arxiv_id* and store it in the working directory.

    The file is written as ``<arxiv_id>.pdf`` relative to the current
    directory, overwriting any existing file of that name.

    Returns:
        The name of the file that was written.
    """
    pdf_name = f"{arxiv_id}.pdf"
    payload = _download_pdf_from_arxiv(pdf_name)

    # Binary mode: the payload is the raw response body.
    with open(pdf_name, "wb") as pdf_file:
        pdf_file.write(payload)

    return pdf_name

def _get_papers_from_hf_daily_papers(target_date, *, timeout=30):
    """Fetch the raw daily-papers listing for one date.

    Args:
        target_date: Date string placed in the ``date`` query parameter.
            When ``None``, today's date from ``_get_today()`` is used.
        timeout: Seconds handed to ``requests.get``. requests applies no
            timeout unless one is given, so without it a stalled
            connection would block the caller indefinitely.

    Returns:
        A ``(target_date, body)`` tuple: the date actually queried and the
        response body as text (not yet parsed).

    Raises:
        HTTPError: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: Propagated unchanged on
            connection errors or when the timeout expires.
    """
    if target_date is None:
        target_date = _get_today()
        print(f"target_date is not set => scrap today's papers [{target_date}]")

    # NOTE(review): the host is spelled with a trailing dot ("huggingface.co.").
    # Left unchanged here - confirm whether that is intentional.
    url = f"https://huggingface.co./api/daily_papers?date={target_date}"

    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise HTTPError(f"Error fetching data. Status code: {response.status_code}")
    return target_date, response.text

def get_papers_from_hf_daily_papers(target_date):
    """Return the parsed daily-papers listing for *target_date*.

    The raw body from ``_get_papers_from_hf_daily_papers`` is decoded as
    JSON, and every element of the decoded result gets a ``"target_date"``
    key holding the date that was actually queried.

    Returns:
        A ``(target_date, papers)`` tuple.
    """
    resolved_date, raw_body = _get_papers_from_hf_daily_papers(target_date)
    papers = json.loads(raw_body)
    for paper in papers:
        # Tag each record with the date it was listed under.
        paper["target_date"] = resolved_date
    return resolved_date, papers


def _get_paper_xml_by_arxiv_id(arxiv_id, *, timeout=30):
    """Query the arXiv export API for a single paper.

    Args:
        arxiv_id: Identifier inserted into the ``search_query=id:...`` term.
        timeout: Seconds handed to ``requests.get``. requests applies no
            timeout unless one is given, so without it a stalled
            connection would block the caller indefinitely.

    Returns:
        The ``requests`` response object as received. The status code is
        not checked here; that is left to the caller.
    """
    url = f"http://export.arxiv.org/api/query?search_query=id:{arxiv_id}&start=0&max_results=1"
    return requests.get(url, timeout=timeout)

def _is_arxiv_id_valid(arxiv_id):
    """Return True when *arxiv_id* is shaped like a new-style arXiv identifier.

    Accepted forms are ``YYMM.NNNN`` (the scheme arXiv used from 2007-04
    through 2014-12) and ``YYMM.NNNNN`` (2015-01 onwards), written with
    ASCII digits only. Version suffixes such as ``v2`` and old-style
    ``archive/NNNNNNN`` identifiers are not accepted.

    Args:
        arxiv_id: Candidate identifier. Anything that is not a ``str``
            is reported as invalid rather than raising.

    Returns:
        ``True`` if the whole string matches, ``False`` otherwise.
    """
    if not isinstance(arxiv_id, str):
        return False
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "2401.12345\n" through.
    # [0-9] instead of \d: \d also matches non-ASCII Unicode digits.
    return re.fullmatch(r"[0-9]{4}\.[0-9]{4,5}", arxiv_id) is not None

def _get_paper_metadata_by_arxiv_id(response):
    """Extract paper metadata from an Atom feed response.

    Args:
        response: Object whose ``content`` attribute holds the Atom XML.

    Returns:
        A ``(title, authors, abstract, published)`` tuple, where ``authors``
        is a list of author names and the other three are the text of the
        entry's ``title``, ``summary`` and ``published`` elements.

    Raises:
        AttributeError: If one of the looked-up elements is absent, since
            the lookup then yields ``None``.
    """
    atom = {"atom": "http://www.w3.org/2005/Atom"}
    feed = ET.fromstring(response.content)

    def entry_text(tag):
        # Text of the first <entry>/<tag> element under the feed root.
        return feed.find(f"atom:entry/atom:{tag}", atom).text

    title = entry_text("title")
    author_names = [
        node.find("atom:name", atom).text
        for node in feed.findall("atom:entry/atom:author", atom)
    ]
    abstract = entry_text("summary")
    published = entry_text("published")

    return title, author_names, abstract, published

def get_papers_from_arxiv_ids(arxiv_ids):
    """Collect metadata records for a list of arXiv identifiers.

    Each identifier is printed, validated with ``_is_arxiv_id_valid`` and
    then looked up through the arXiv API. Identifiers that are invalid or
    whose lookup fails are reported on stdout and skipped; they never abort
    the whole batch.

    Args:
        arxiv_ids: Iterable of arXiv identifier strings.

    Returns:
        A list of dicts, one per successfully resolved paper, of the form
        ``{"title", "target_date", "paper": {"summary", "id", "authors"}}``
        where ``target_date`` is the publication date as ``YYYY-MM-DD``.
    """
    results = []

    for arxiv_id in arxiv_ids:
        print(arxiv_id)
        if not _is_arxiv_id_valid(arxiv_id):
            print(f"......not a valid arXiv ID[{arxiv_id}]")
            continue

        try:
            xml_data = _get_paper_xml_by_arxiv_id(arxiv_id)
            title, authors, abstract, target_date = _get_paper_metadata_by_arxiv_id(xml_data)

            # The feed timestamp looks like 2024-01-02T03:04:05Z; keep only the date.
            datetime_obj = datetime.strptime(target_date, "%Y-%m-%dT%H:%M:%SZ")
            formatted_date = datetime_obj.strftime("%Y-%m-%d")
        except Exception as err:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still stop the program. The failure is best-effort:
            # report it (with the actual cause) and move on to the next id.
            print("......something wrong happend when downloading metadata")
            print("......this usually happens when you try out the today's published paper")
            print(f"......[{arxiv_id}] {type(err).__name__}: {err}")
            continue

        results.append(
            {
                "title": title,
                "target_date": formatted_date,
                "paper": {
                    "summary": abstract,
                    "id": arxiv_id,
                    "authors": authors,
                },
            }
        )

    return results