update structure
assets/prompt/summarizing.xml
ADDED
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE body [
+<!ENTITY warning "Warning: Something bad happened... please refresh and try again.">
+]>
+<body>
+<query rank="0">
+<title>User Message</title>
+<text>
+Task Description:
+
+You are provided with the title, abstract, and introduction of a research paper. Your task is to generate a concise summary of what problem this paper aims to solve and what methods are proposed to address it. The summary should follow this format:
+The problem of [problem] can be addressed by [main idea/approach].
+
+Instructions:
+
+Title: Read the title to understand the general topic of the paper.
+Abstract: Read the abstract to get a concise summary of the research, including the problem addressed, the methods used, and the main findings.
+Introduction: Read the introduction to gain a deeper understanding of the background, significance, and specific problem the paper addresses, as well as the proposed approach or solution.
+Based on the provided information, generate a single sentence that captures the essence of the paper, following the format specified above.
+
+Your Turn:
+
+Given the following paper information:
+Title: {title}
+Abstract: {abstract}
+Introduction: {introduction}
+
+Output:
+The problem of [problem] can be addressed by [main idea/approach].
+</text>
+</query>
+<query rank="1">
+<title>User Message</title>
+<text>
+Please read the title, abstract, and introduction of the paper again, as well as the summary you provided. Complete the following two tasks:
+1. Briefly provide the two most critical motivations behind proposing these methods to address the problems.
+2. Briefly provide the three most critical or innovative details of the paper that were not mentioned in your summary (ideally, these should be the new methods or techniques adopted in this paper).
+
+Output:
+Motivations:1.[motivation1]. 2.[motivation2]. Details:1.[detail1]. 2.[detail2]. 3.[detail3].
+</text>
+</query>
+</body>
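The two query blocks above form a two-turn conversation: the first turn asks for the one-sentence problem/method summary, the second asks for motivations and extra details. A minimal sketch of how the template might be consumed (assumed usage, not code from this commit; the send_to_llm call is a hypothetical client):

import xml.etree.ElementTree as ET

# load the two user turns in "rank" order
root = ET.parse("./assets/prompt/summarizing.xml").getroot()
turns = [
    q.find("text").text
    for q in sorted(root.findall("query"), key=lambda q: int(q.get("rank")))
]

# fill the first turn's placeholders with paper metadata
paper = {"title": "...", "abstract": "...", "introduction": "..."}
first_message = turns[0].format(**paper)
# summary = send_to_llm(first_message)   # then send turns[1] in the same conversation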
configs/datasets.yaml
CHANGED
@@ -1,5 +1,5 @@
 DEFAULT:
-  pdf_cached: /
+  pdf_cached: ./assets/paper/pdf_cached
   ignore_paper_id_list: ./assets/data/ignore_paper_id_list.json
   log_level: "DEBUG"
   log_dir: ./log
@@ -7,7 +7,7 @@ DEFAULT:
   device: "cpu" # "cpu"

 ARTICLE:
-  summarizing_prompt: ./prompt/summarizing.xml
+  summarizing_prompt: ./assets/prompt/summarizing.xml

 RETRIEVE:
   cite_type: "all_cite_id_list"
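Both changed keys are paths that now point inside ./assets. A sketch of how they might be read at runtime (assumed, not part of the commit), using PyYAML:

import yaml

with open("configs/datasets.yaml") as f:
    cfg = yaml.safe_load(f)

pdf_cache_dir = cfg["DEFAULT"]["pdf_cached"]        # ./assets/paper/pdf_cached
prompt_path = cfg["ARTICLE"]["summarizing_prompt"]  # ./assets/prompt/summarizing.xml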
src/utils/scipdf/__init__.py
ADDED
@@ -0,0 +1,6 @@
+__version__ = "0.1.1"
+
+__all__ = ["pdf", "features"]
+
+from utils.scipdf.features.text_utils import *
+from utils.scipdf.pdf.parse_pdf import *
src/utils/scipdf/features/__init__.py
ADDED
@@ -0,0 +1,7 @@
+from .text_utils import compute_readability_stats, compute_text_stats, compute_journal_features
+
+__all__ = [
+    "compute_readability_stats",
+    "compute_text_stats",
+    "compute_journal_features",
+]
src/utils/scipdf/features/text_utils.py
ADDED
@@ -0,0 +1,218 @@
+import numpy as np
+import pandas as pd
+import textstat
+import spacy
+from collections import Counter
+from itertools import groupby
+
+
+nlp = spacy.load("en_core_web_sm")
+
+PRESENT_TENSE_VERB_LIST = ["VB", "VBP", "VBZ", "VBG"]
+VERB_LIST = ["VB", "VBP", "VBZ", "VBG", "VBN", "VBD"]
+NOUN_LIST = ["NNP", "NNPS"]
+
+
+SECTIONS_MAPS = {
+    "Authors": "Authors",
+    "AUTHORS": "AUTHORS",
+    "Abstract": "Abstract",
+    "ABSTRACT": "Abstract",
+    "Date": "Date",
+    "DATE": "DATE",
+    "INTRODUCTION": "Introduction",
+    "MATERIALS AND METHODS": "Methods",
+    "Materials and methods": "Methods",
+    "METHODS": "Methods",
+    "RESULTS": "Results",
+    "CONCLUSIONS": "Conclusions",
+    "CONCLUSIONS AND FUTURE APPLICATIONS": "Conclusions",
+    "DISCUSSION": "Discussion",
+    "ACKNOWLEDGMENTS": "Acknowledgement",
+    "TABLES": "Tables",
+    "Tabnles": "Tables",
+    "DISCLOSURE": "Disclosure",
+    "CONFLICT OF INTEREST": "Disclosure",
+    "Acknowledgement": "Acknowledgements",
+}
+
+
+def compute_readability_stats(text):
+    """
+    Compute reading statistics of the given text
+    Reference: https://github.com/shivam5992/textstat
+
+    Parameters
+    ==========
+    text: str, input section or abstract text
+    """
+    try:
+        readability_dict = {
+            "flesch_reading_ease": textstat.flesch_reading_ease(text),
+            "smog": textstat.smog_index(text),
+            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
+            "coleman_liau_index": textstat.coleman_liau_index(text),
+            "automated_readability_index": textstat.automated_readability_index(text),
+            "dale_chall": textstat.dale_chall_readability_score(text),
+            "difficult_words": textstat.difficult_words(text),
+            "linsear_write": textstat.linsear_write_formula(text),
+            "gunning_fog": textstat.gunning_fog(text),
+            "text_standard": textstat.text_standard(text),
+            "n_syllable": textstat.syllable_count(text),
+            "avg_letter_per_word": textstat.avg_letter_per_word(text),
+            "avg_sentence_length": textstat.avg_sentence_length(text),
+        }
+    except:
+        readability_dict = {
+            "flesch_reading_ease": None,
+            "smog": None,
+            "flesch_kincaid_grade": None,
+            "coleman_liau_index": None,
+            "automated_readability_index": None,
+            "dale_chall": None,
+            "difficult_words": None,
+            "linsear_write": None,
+            "gunning_fog": None,
+            "text_standard": None,
+            "n_syllable": None,
+            "avg_letter_per_word": None,
+            "avg_sentence_length": None,
+        }
+    return readability_dict
+
+
+def compute_text_stats(text):
+    """
+    Compute part of speech features from a given spacy wrapper of text
+
+    Parameters
+    ==========
+    text: spacy.tokens.doc.Doc, spacy wrapper of the section or abstract text
+
+    Output
+    ======
+    text_stat: dict, part of speech and text features extracted from the given text
+    """
+    try:
+        pos = dict(Counter([token.pos_ for token in text]))
+        pos_tag = dict(
+            Counter([token.tag_ for token in text])
+        )  # detailed part-of-speech
+
+        n_present_verb = sum(
+            [v for k, v in pos_tag.items() if k in PRESENT_TENSE_VERB_LIST]
+        )
+        n_verb = sum([v for k, v in pos_tag.items() if k in VERB_LIST])
+
+        word_shape = dict(Counter([token.shape_ for token in text]))  # word shape
+        n_word_per_sents = [len([token for token in sent]) for sent in text.sents]
+        n_digits = sum([token.is_digit or token.like_num for token in text])
+        n_word = sum(n_word_per_sents)
+        n_sents = len(n_word_per_sents)
+        text_stats_dict = {
+            "pos": pos,
+            "pos_tag": pos_tag,
+            "word_shape": word_shape,
+            "n_word": n_word,
+            "n_sents": n_sents,
+            "n_present_verb": n_present_verb,
+            "n_verb": n_verb,
+            "n_digits": n_digits,
+            "percent_digits": n_digits / n_word,
+            "n_word_per_sents": n_word_per_sents,
+            "avg_word_per_sents": np.mean(n_word_per_sents),
+        }
+    except:
+        text_stats_dict = {
+            "pos": None,
+            "pos_tag": None,
+            "word_shape": None,
+            "n_word": None,
+            "n_sents": None,
+            "n_present_verb": None,
+            "n_verb": None,
+            "n_digits": None,
+            "percent_digits": None,
+            "n_word_per_sents": None,
+            "avg_word_per_sents": None,
+        }
+    return text_stats_dict
+
+
+def compute_journal_features(article):
+    """
+    Parse features about journal references from a given dictionary of parsed article e.g.
+    number of references made, number of unique journals referred to, minimum year of references,
+    maximum year of references, ...
+
+    Parameters
+    ==========
+    article: dict, article dictionary parsed from GROBID and converted to dictionary
+        see ``pdf/parse_pdf.py`` for the detail of the output dictionary
+
+    Output
+    ======
+    reference_dict: dict, dictionary of reference-related features
+    """
+    try:
+        n_reference = len(article["references"])
+        n_unique_journals = len(
+            pd.unique([a["journal"] for a in article["references"]])
+        )
+        reference_years = []
+        for reference in article["references"]:
+            year = reference["year"]
+            if year.isdigit():
+                # filter outliers
+                if int(year) in range(1800, 2100):
+                    reference_years.append(int(year))
+        avg_ref_year = np.mean(reference_years)
+        median_ref_year = np.median(reference_years)
+        min_ref_year = np.min(reference_years)
+        max_ref_year = np.max(reference_years)
+        journal_features_dict = {
+            "n_reference": n_reference,
+            "n_unique_journals": n_unique_journals,
+            "avg_ref_year": avg_ref_year,
+            "median_ref_year": median_ref_year,
+            "min_ref_year": min_ref_year,
+            "max_ref_year": max_ref_year,
+        }
+    except:
+        journal_features_dict = {
+            "n_reference": None,
+            "n_unique_journals": None,
+            "avg_ref_year": None,
+            "median_ref_year": None,
+            "min_ref_year": None,
+            "max_ref_year": None,
+        }
+    return journal_features_dict
+
+
+def merge_section_list(section_list, section_maps=SECTIONS_MAPS, section_start=""):
+    """
+    Merge a list of sections into a normalized list of sections,
+    you can get the list of sections from parsed article JSON in ``parse_pdf.py`` e.g.
+
+    >> section_list = [s['heading'] for s in article_json['sections']]
+    >> section_list_merged = merge_section_list(section_list)
+
+    Parameters
+    ==========
+    section_list: list, list of sections
+
+    Output
+    ======
+    section_list_merged: list, sections
+    """
+    sect_map = section_start  # text for starting section e.g. ``Introduction``
+    section_list_merged = []
+    for section in section_list:
+        if any([(s.lower() in section.lower()) for s in section_maps.keys()]):
+            sect = [s for s in section_maps.keys() if s.lower() in section.lower()][0]
+            sect_map = section_maps.get(sect, "")
+            section_list_merged.append(sect_map)
+        else:
+            section_list_merged.append(sect_map)
+    return section_list_merged
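A short usage sketch for the helpers above (assumed usage, not code from this commit): compute_readability_stats takes a plain string, compute_text_stats expects a spaCy Doc built with the module-level nlp, and merge_section_list normalizes GROBID-style section headings via SECTIONS_MAPS.

from utils.scipdf.features.text_utils import (
    compute_readability_stats,
    compute_text_stats,
    merge_section_list,
    nlp,
)

abstract = "We propose a method for summarizing scientific papers."
readability = compute_readability_stats(abstract)  # e.g. readability["flesch_reading_ease"]
pos_stats = compute_text_stats(nlp(abstract))      # e.g. pos_stats["n_verb"]

# normalize raw section headings parsed from a PDF
headings = ["INTRODUCTION", "MATERIALS AND METHODS", "RESULTS"]
print(merge_section_list(headings))                # ['Introduction', 'Methods', 'Results']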
src/utils/scipdf/pdf/__init__.py
ADDED
@@ -0,0 +1,9 @@
+from .parse_pdf import *
+
+__all__ = [
+    "list_pdf_paths",
+    "parse_abstract",
+    "parse_figure_caption",
+    "parse_references",
+    "parse_pdf_to_dict",
+]
src/utils/scipdf/pdf/parse_pdf.py
ADDED
@@ -0,0 +1,471 @@
+import re
+import os
+import os.path as op
+from glob import glob
+import urllib
+from typing import Dict
+import subprocess
+import requests
+import fitz
+from PIL import Image
+import io
+from bs4 import BeautifulSoup, NavigableString
+from tqdm import tqdm, tqdm_notebook
+
+
+# GROBID_URL = "http://localhost:8070"
+GROBID_URL = "http://10.82.77.107:8070"
+DIR_PATH = op.dirname(op.abspath(__file__))
+PDF_FIGURES_JAR_PATH = op.join(
+    DIR_PATH, "pdffigures2", "pdffigures2-assembly-0.0.12-SNAPSHOT.jar"
+)
+
+
+def list_pdf_paths(pdf_folder: str):
+    """
+    list of pdf paths in pdf folder
+    """
+    return glob(op.join(pdf_folder, "*", "*", "*.pdf"))
+
+
+def validate_url(path: str):
+    """
+    Validate a given ``path`` if it is URL or not
+    """
+    regex = re.compile(
+        r"^(?:http|ftp)s?://"  # http:// or https://
+        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
+        r"localhost|"  # localhost...
+        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
+        r"(?::\d+)?"  # optional port
+        r"(?:/?|[/?]\S+)$",
+        re.IGNORECASE,
+    )
+    return re.match(regex, path) is not None
+
+
+def parse_pdf(
+    pdf_path: str,
+    fulltext: bool = True,
+    soup: bool = False,
+    return_coordinates: bool = False,
+    grobid_url: str = GROBID_URL,
+):
+    """
+    Function to parse PDF to XML or BeautifulSoup using GROBID tool
+
+    You can see http://grobid.readthedocs.io/en/latest/Install-Grobid/ on how to run GROBID locally
+    After loading GROBID zip file, you can run GROBID by using the following
+    >> ./gradlew run
+
+    Parameters
+    ==========
+    pdf_path: str or bytes, path or URL to publication or article or bytes string of PDF
+    fulltext: bool, option for parsing, if True, parse full text of the article
+        if False, parse only header
+    grobid_url: str, url to GROBID parser, default at 'http://localhost:8070'
+        This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service
+    soup: bool, if True, return BeautifulSoup of the article
+
+    Output
+    ======
+    parsed_article: if soup is False, return parsed XML in text format,
+        else return BeautifulSoup of the XML
+    Example
+    =======
+    >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True)
+    """
+    # GROBID URL
+    if fulltext:
+        url = "%s/api/processFulltextDocument" % grobid_url
+    else:
+        url = "%s/api/processHeaderDocument" % grobid_url
+
+    files = []
+    if return_coordinates:
+        files += [
+            ("teiCoordinates", (None, "persName")),
+            ("teiCoordinates", (None, "figure")),
+            ("teiCoordinates", (None, "ref")),
+            ("teiCoordinates", (None, "formula")),
+            ("teiCoordinates", (None, "biblStruct")),
+        ]
+
+    if isinstance(pdf_path, str):
+        if validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() != ".pdf":
+            print("The input URL has to end with ``.pdf``")
+            parsed_article = None
+        elif validate_url(pdf_path) and op.splitext(pdf_path)[-1] == ".pdf":
+            page = urllib.request.urlopen(pdf_path).read()
+            files += [("input", page)]
+            parsed_article = requests.post(url, files=files).text
+        elif op.exists(pdf_path):
+            files += [("input", (open(pdf_path, "rb")))]
+            parsed_article = requests.post(
+                url, files=files
+            ).text
+        else:
+            parsed_article = None
+    elif isinstance(pdf_path, bytes):
+        # assume that incoming is byte string
+        files += [("input", (pdf_path))]
+        parsed_article = requests.post(url, files=files).text
+    else:
+        parsed_article = None
+
+    if soup and parsed_article is not None:
+        parsed_article = BeautifulSoup(parsed_article, "lxml")
+
+    return parsed_article
+
+
+def parse_authors(article):
+    """
+    Parse authors from a given BeautifulSoup of an article
+    """
+    authors = []
+    try:
+        author_names = article.find("sourcedesc").findAll("persname")
+    except Exception:
+        return authors
+    for author in author_names:
+        firstname = author.find("forename", {"type": "first"})
+        firstname = firstname.text.strip() if firstname is not None else ""
+        middlename = author.find("forename", {"type": "middle"})
+        middlename = middlename.text.strip() if middlename is not None else ""
+        lastname = author.find("surname")
+        lastname = lastname.text.strip() if lastname is not None else ""
+        if middlename != "":
+            authors.append(firstname + " " + middlename + " " + lastname)
+        else:
+            authors.append(firstname + " " + lastname)
+    authors = "; ".join(authors)
+    return authors
+
+
+def parse_date(article):
+    """
+    Parse date from a given BeautifulSoup of an article
+    """
+    try:
+        pub_date = article.find("publicationstmt")
+        year = pub_date.find("date")
+    except Exception:
+        return ""
+    year = year.attrs.get("when") if year is not None else ""
+    return year
+
+
+def parse_abstract(article):
+    """
+    Parse abstract from a given BeautifulSoup of an article
+    """
+    div = article.find("abstract")
+    abstract = ""
+    for p in list(div.children):
+        if not isinstance(p, NavigableString) and len(list(p)) > 0:
+            abstract += " ".join(
+                [elem.text for elem in p if not isinstance(elem, NavigableString)]
+            )
+    return abstract
+
+
+def find_references(div):
+    """
+    For a given section, find references made in the section for publications, figures, tables
+    """
+    publication_ref = [ref.attrs.get("target").strip("#") for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr" and "target" in ref.attrs]
+    figure_ref = [ref.attrs.get("target").strip("#") for ref in div.find_all("ref") if ref.attrs.get("type") == "figure" and "target" in ref.attrs]
+    table_ref = [ref.attrs.get("target").strip("#") for ref in div.find_all("ref") if ref.attrs.get("type") == "table" and "target" in ref.attrs]
+    return {"publication_ref": publication_ref, "figure_ref": figure_ref, "table_ref": table_ref}
+
+
+def parse_sections(article, as_list: bool = False):
+    """
+    Parse list of sections from a given BeautifulSoup of an article
+
+    Parameters
+    ==========
+    as_list: bool, if True, output text as a list of paragraph instead
+        of joining it together as one single text
+    """
+    article_text = article.find("text")
+    divs = article_text.find_all("div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"})
+    sections = []
+    for div in divs:
+        div_list = list(div.children)
+        if len(div_list) == 0:
+            heading = ""
+            text = ""
+        elif len(div_list) == 1:
+            if isinstance(div_list[0], NavigableString):
+                heading = str(div_list[0])
+                text = ""
+            else:
+                heading = ""
+                text = div_list[0].text
+        else:
+            text = []
+            heading = div_list[0]
+            if isinstance(heading, NavigableString):
+                heading = str(heading)
+                p_all = list(div.children)[1:]
+            else:
+                heading = ""
+                p_all = list(div.children)
+            for p in p_all:
+                if p is not None:
+                    try:
+                        text.append(p.text)
+                    except:
+                        pass
+            if not as_list:
+                text = "\n".join(text)
+
+        if heading != "" or text != "":
+            ref_dict = find_references(div)
+            sections.append(
+                {
+                    "heading": heading,
+                    "text": text,
+                    "publication_ref": ref_dict["publication_ref"],
+                    "figure_ref": ref_dict["figure_ref"],
+                    "table_ref": ref_dict["table_ref"],
+                }
+            )
+    return sections
+
+
+def parse_references(article):
+    """
+    Parse list of references from a given BeautifulSoup of an article
+    """
+    reference_list = []
+    references = article.find("text").find("div", attrs={"type": "references"})
+    references = references.find_all("biblstruct") if references is not None else []
+    reference_list = []
+    for reference in references:
+        ref_id = reference.get('xml:id', "")
+        title = reference.find("title", attrs={"level": "a"})
+        if title is None:
+            title = reference.find("title", attrs={"level": "m"})
+        title = title.text if title is not None else ""
+        journal = reference.find("title", attrs={"level": "j"})
+        journal = journal.text if journal is not None else ""
+        if journal == "":
+            journal = reference.find("publisher")
+            journal = journal.text if journal is not None else ""
+        year = reference.find("date")
+        year = year.attrs.get("when") if year is not None else ""
+        authors = []
+        for author in reference.find_all("author"):
+            firstname = author.find("forename", {"type": "first"})
+            firstname = firstname.text.strip() if firstname is not None else ""
+            middlename = author.find("forename", {"type": "middle"})
+            middlename = middlename.text.strip() if middlename is not None else ""
+            lastname = author.find("surname")
+            lastname = lastname.text.strip() if lastname is not None else ""
+            if middlename != "":
+                authors.append(firstname + " " + middlename + " " + lastname)
+            else:
+                authors.append(firstname + " " + lastname)
+        authors = "; ".join(authors)
+        reference_list.append(
+            {"ref_id": ref_id, "title": title, "journal": journal, "year": year, "authors": authors}
+        )
+    return reference_list
+
+
+def parse_figure_caption(article):
+    """
+    Parse list of figures/tables from a given BeautifulSoup of an article
+    """
+    figures_list = []
+    figures = article.find_all("figure")
+    for figure in figures:
+        figure_type = figure.attrs.get("type") or "figure"
+        figure_id = figure.attrs.get("xml:id") or ""
+        label = figure.find("label").text
+        if figure_type == "table":
+            caption = figure.find("figdesc").text
+            data = figure.table.text
+        else:
+            caption = figure.text
+            data = ""
+        figures_list.append(
+            {
+                "figure_label": label,
+                "figure_type": figure_type,
+                "figure_id": figure_id,
+                "figure_caption": caption,
+                "figure_data": data,
+            }
+        )
+    return figures_list
+
+
+def parse_formulas(article):
+    """
+    Parse list of formulas from a given BeautifulSoup of an article
+    """
+    formulas_list = []
+    formulas = article.find_all("formula")
+    for formula in formulas:
+        formula_id = formula.attrs["xml:id"] or ""
+        formula_text = formula.text
+        formula_coordinates = formula.attrs.get("coords") or ""
+        if formula_coordinates != "":
+            formula_coordinates = [float(x) for x in formula_coordinates.split(",")]
+        formulas_list.append(
+            {
+                "formula_id": formula_id,
+                "formula_text": formula_text,
+                "formula_coordinates": formula_coordinates,
+            }
+        )
+    return formulas_list
+
+
+def convert_article_soup_to_dict(article, as_list: bool = False):
+    """
+    Function to convert BeautifulSoup to JSON format
+    similar to the output from https://github.com/allenai/science-parse/
+
+    Parameters
+    ==========
+    article: BeautifulSoup
+
+    Output
+    ======
+    article_json: dict, parsed dictionary of a given article in the following format
+        {
+            'title': ...,
+            'abstract': ...,
+            'sections': [
+                {'heading': ..., 'text': ...},
+                {'heading': ..., 'text': ...},
+                ...
+            ],
+            'references': [
+                {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
+                {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
+                ...
+            ],
+            'figures': [
+                {'figure_label': ..., 'figure_type': ..., 'figure_id': ..., 'figure_caption': ..., 'figure_data': ...},
+                ...
+            ]
+        }
+    """
+    article_dict = {}
+    if article is not None:
+        title = article.find("title", attrs={"type": "main"})
+        title = title.text.strip() if title is not None else ""
+
+        article_dict["title"] = title
+        article_dict["authors"] = parse_authors(article)
+        article_dict["pub_date"] = parse_date(article)
+        article_dict["abstract"] = parse_abstract(article)
+        article_dict["sections"] = parse_sections(article, as_list=as_list)
+        article_dict["references"] = parse_references(article)
+        article_dict["figures"] = parse_figure_caption(article)
+        article_dict["formulas"] = parse_formulas(article)
+
+        doi = article.find("idno", attrs={"type": "DOI"})
+        doi = doi.text if doi is not None else ""
+        article_dict["doi"] = doi
+
+        return article_dict
+    else:
+        return None
+
+
+def parse_pdf_to_dict(
+    pdf_path: str,
+    fulltext: bool = True,
+    soup: bool = True,
+    as_list: bool = False,
+    return_coordinates: bool = True,
+    grobid_url: str = GROBID_URL,
+    parse_figures: bool = True,
+):
+    """
+    Parse the given PDF and return dictionary of the parsed article
+
+    Parameters
+    ==========
+    pdf_path: str, path to publication or article
+    fulltext: bool, whether to extract fulltext or not
+    soup: bool, whether to return BeautifulSoup or not
+    as_list: bool, whether to return list of sections or not
+    grobid_url: str, url to grobid server, default is `GROBID_URL`
+        This could be changed to "https://kermitt2-grobid.hf.space" for the cloud service
+
+    Output
+    =====
+    article_dict: dict, dictionary of an article
+    """
+    parsed_article = parse_pdf(
+        pdf_path,
+        fulltext=fulltext,
+        soup=soup,
+        return_coordinates=return_coordinates,
+        grobid_url=grobid_url,
+    )
+    article_dict = convert_article_soup_to_dict(parsed_article, as_list=as_list)
+
+    return article_dict
+
+
+def parse_figures(
+    pdf_folder: str,
+    jar_path: str = PDF_FIGURES_JAR_PATH,
+    resolution: int = 300,
+    output_folder: str = "figures",
+):
+    """
+    Parse figures from the given scientific PDF using pdffigures2
+
+    Parameters
+    ==========
+    pdf_folder: str, path to a folder that contains PDF files. The folder must contain only PDF files
+    jar_path: str, default path to pdffigures2-assembly-0.0.12-SNAPSHOT.jar file
+    resolution: int, resolution of the output figures
+    output_folder: str, path to folder that we want to save parsed data (related to figures) and figures
+
+    Output
+    ======
+    folder: making a folder of output_folder/data and output_folder/figures of parsed data and figures respectively
+    """
+    if not op.isdir(output_folder):
+        os.makedirs(output_folder)
+
+    # create ``data`` and ``figures`` subfolder within ``output_folder``
+    data_path = op.join(output_folder, "data")
+    figure_path = op.join(output_folder, "figures")
+    if not op.exists(data_path):
+        os.makedirs(data_path)
+    if not op.exists(figure_path):
+        os.makedirs(figure_path)
+
+    if op.isdir(data_path) and op.isdir(figure_path):
+        args = [
+            "java",
+            "-jar",
+            jar_path,
+            pdf_folder,
+            "-i",
+            str(resolution),
+            "-d",
+            op.join(op.abspath(data_path), ""),
+            "-m",
+            op.join(op.abspath(figure_path), ""),  # end path with "/"
+        ]
+        _ = subprocess.run(
+            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20
+        )
+        print("Done parsing figures from PDFs!")
+    else:
+        print(
+            "You may have to check for ``data`` and ``figures`` in the output folder path."
+        )