lihuigu committed on
Commit
cee6a24
1 Parent(s): f48c751

update structure

assets/prompt/summarizing.xml ADDED
@@ -0,0 +1,43 @@
+ <?xml version="1.0" encoding="utf-8"?>
+ <!DOCTYPE body [
+ <!ENTITY warning "Warning: Something bad happened... please refresh and try again.">
+ ]>
+ <body>
+ <query rank="0">
+ <title>User Message</title>
+ <text>
+ Task Description:
+
+ You are provided with the title, abstract, and introduction of a research paper. Your task is to generate a concise summary of the problem this paper aims to solve and the methods proposed to address it. The summary should follow this format:
+ The problem of [problem] can be addressed by [main idea/approach].
+
+ Instructions:
+
+ Title: Read the title to understand the general topic of the paper.
+ Abstract: Read the abstract to get a concise summary of the research, including the problem addressed, the methods used, and the main findings.
+ Introduction: Read the introduction to gain a deeper understanding of the background, significance, and specific problem the paper addresses, as well as the proposed approach or solution.
+ Based on the provided information, generate a single sentence that captures the essence of the paper, following the format specified above.
+
+ Your Turn:
+
+ Given the following paper information:
+ Title: {title}
+ Abstract: {abstract}
+ Introduction: {introduction}
+
+ Output:
+ The problem of [problem] can be addressed by [main idea/approach].
+ </text>
+ </query>
+ <query rank="1">
+ <title>User Message</title>
+ <text>
+ Please read the title, abstract, and introduction of the paper again, as well as the summary you provided, then complete the following two tasks:
+ 1. Briefly provide the two most critical motivations behind proposing these methods to address the problem.
+ 2. Briefly provide the three most critical or innovative details of the paper that were not mentioned in your summary (ideally new methods or techniques adopted in this paper).
+
+ Output:
+ Motivations:1.[motivation1]. 2.[motivation2]. Details:1.[detail1]. 2.[detail2]. 3.[detail3].
+ </text>
+ </query>
+ </body>
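
For reference, a minimal sketch (not part of this commit) of how a two-turn prompt file laid out like the XML above might be loaded and filled in. The element names and the {title}/{abstract}/{introduction} placeholders come from the file; the loader function and the example values are assumptions.

import xml.etree.ElementTree as ET

def load_summarizing_queries(prompt_path):
    """Return the <text> body of each <query>, ordered by its rank attribute."""
    root = ET.parse(prompt_path).getroot()
    queries = sorted(root.findall("query"), key=lambda q: int(q.get("rank", "0")))
    return [q.find("text").text for q in queries]

queries = load_summarizing_queries("assets/prompt/summarizing.xml")
# First turn: fill in the paper fields; queries[1] is sent later as the follow-up message.
first_turn = queries[0].format(
    title="Example Title",
    abstract="Example abstract ...",
    introduction="Example introduction ...",
)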
configs/datasets.yaml CHANGED
@@ -1,5 +1,5 @@
  DEFAULT:
- pdf_cached: /data/llms/data/scipip-data/pdf_cached
+ pdf_cached: ./assets/paper/pdf_cached
  ignore_paper_id_list: ./assets/data/ignore_paper_id_list.json
  log_level: "DEBUG"
  log_dir: ./log
@@ -7,7 +7,7 @@ DEFAULT:
  device: "cpu" # "cpu"

  ARTICLE:
- summarizing_prompt: ./prompt/summarizing.xml
+ summarizing_prompt: ./assets/prompt/summarizing.xml

  RETRIEVE:
  cite_type: "all_cite_id_list"
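
As context for the path changes above, a minimal sketch (an assumption, not repository code) of reading the updated keys with PyYAML; it assumes the keys sit under their section headers (DEFAULT, ARTICLE), as the hunk context suggests, and that the relative paths are resolved from the repository root.

import yaml

with open("configs/datasets.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

pdf_cached_dir = config["DEFAULT"]["pdf_cached"]       # ./assets/paper/pdf_cached
prompt_path = config["ARTICLE"]["summarizing_prompt"]  # ./assets/prompt/summarizing.xml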
src/utils/scipdf/__init__.py ADDED
@@ -0,0 +1,6 @@
+ __version__ = "0.1.1"
+
+ __all__ = ["pdf", "features"]
+
+ from utils.scipdf.features.text_utils import *
+ from utils.scipdf.pdf.parse_pdf import *
src/utils/scipdf/features/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .text_utils import compute_readability_stats, compute_text_stats, compute_journal_features
+
+ __all__ = [
+     "compute_readability_stats",
+     "compute_text_stats",
+     "compute_journal_features",
+ ]
src/utils/scipdf/features/text_utils.py ADDED
@@ -0,0 +1,218 @@
+ import numpy as np
+ import pandas as pd
+ import textstat
+ import spacy
+ from collections import Counter
+ from itertools import groupby
+
+
+ nlp = spacy.load("en_core_web_sm")
+
+ PRESENT_TENSE_VERB_LIST = ["VB", "VBP", "VBZ", "VBG"]
+ VERB_LIST = ["VB", "VBP", "VBZ", "VBG", "VBN", "VBD"]
+ NOUN_LIST = ["NNP", "NNPS"]
+
+
+ SECTIONS_MAPS = {
+     "Authors": "Authors",
+     "AUTHORS": "AUTHORS",
+     "Abstract": "Abstract",
+     "ABSTRACT": "Abstract",
+     "Date": "Date",
+     "DATE": "DATE",
+     "INTRODUCTION": "Introduction",
+     "MATERIALS AND METHODS": "Methods",
+     "Materials and methods": "Methods",
+     "METHODS": "Methods",
+     "RESULTS": "Results",
+     "CONCLUSIONS": "Conclusions",
+     "CONCLUSIONS AND FUTURE APPLICATIONS": "Conclusions",
+     "DISCUSSION": "Discussion",
+     "ACKNOWLEDGMENTS": "Acknowledgement",
+     "TABLES": "Tables",
+     "Tabnles": "Tables",
+     "DISCLOSURE": "Disclosure",
+     "CONFLICT OF INTEREST": "Disclosure",
+     "Acknowledgement": "Acknowledgements",
+ }
+
+
+ def compute_readability_stats(text):
+     """
+     Compute reading statistics of the given text
+     Reference: https://github.com/shivam5992/textstat
+
+     Parameters
+     ==========
+     text: str, input section or abstract text
+     """
+     try:
+         readability_dict = {
+             "flesch_reading_ease": textstat.flesch_reading_ease(text),
+             "smog": textstat.smog_index(text),
+             "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
+             "coleman_liau_index": textstat.coleman_liau_index(text),
+             "automated_readability_index": textstat.automated_readability_index(text),
+             "dale_chall": textstat.dale_chall_readability_score(text),
+             "difficult_words": textstat.difficult_words(text),
+             "linsear_write": textstat.linsear_write_formula(text),
+             "gunning_fog": textstat.gunning_fog(text),
+             "text_standard": textstat.text_standard(text),
+             "n_syllable": textstat.syllable_count(text),
+             "avg_letter_per_word": textstat.avg_letter_per_word(text),
+             "avg_sentence_length": textstat.avg_sentence_length(text),
+         }
+     except:
+         readability_dict = {
+             "flesch_reading_ease": None,
+             "smog": None,
+             "flesch_kincaid_grade": None,
+             "coleman_liau_index": None,
+             "automated_readability_index": None,
+             "dale_chall": None,
+             "difficult_words": None,
+             "linsear_write": None,
+             "gunning_fog": None,
+             "text_standard": None,
+             "n_syllable": None,
+             "avg_letter_per_word": None,
+             "avg_sentence_length": None,
+         }
+     return readability_dict
+
+
+ def compute_text_stats(text):
+     """
+     Compute part-of-speech features from a given spacy wrapper of text
+
+     Parameters
+     ==========
+     text: spacy.tokens.doc.Doc, spacy wrapper of the section or abstract text
+
+     Output
+     ======
+     text_stat: dict, part-of-speech and text features extracted from the given text
+     """
+     try:
+         pos = dict(Counter([token.pos_ for token in text]))
+         pos_tag = dict(
+             Counter([token.tag_ for token in text])
+         )  # detailed part-of-speech
+
+         n_present_verb = sum(
+             [v for k, v in pos_tag.items() if k in PRESENT_TENSE_VERB_LIST]
+         )
+         n_verb = sum([v for k, v in pos_tag.items() if k in VERB_LIST])
+
+         word_shape = dict(Counter([token.shape_ for token in text]))  # word shape
+         n_word_per_sents = [len([token for token in sent]) for sent in text.sents]
+         n_digits = sum([token.is_digit or token.like_num for token in text])
+         n_word = sum(n_word_per_sents)
+         n_sents = len(n_word_per_sents)
+         text_stats_dict = {
+             "pos": pos,
+             "pos_tag": pos_tag,
+             "word_shape": word_shape,
+             "n_word": n_word,
+             "n_sents": n_sents,
+             "n_present_verb": n_present_verb,
+             "n_verb": n_verb,
+             "n_digits": n_digits,
+             "percent_digits": n_digits / n_word,
+             "n_word_per_sents": n_word_per_sents,
+             "avg_word_per_sents": np.mean(n_word_per_sents),
+         }
+     except:
+         text_stats_dict = {
+             "pos": None,
+             "pos_tag": None,
+             "word_shape": None,
+             "n_word": None,
+             "n_sents": None,
+             "n_present_verb": None,
+             "n_verb": None,
+             "n_digits": None,
+             "percent_digits": None,
+             "n_word_per_sents": None,
+             "avg_word_per_sents": None,
+         }
+     return text_stats_dict
+
+
+ def compute_journal_features(article):
+     """
+     Parse features about journal references from a given dictionary of a parsed article, e.g.
+     number of references made, number of unique journals referred to, minimum year of references,
+     maximum year of references, ...
+
+     Parameters
+     ==========
+     article: dict, article dictionary parsed from GROBID and converted to dictionary
+         see ``pdf/parse_pdf.py`` for the detail of the output dictionary
+
+     Output
+     ======
+     reference_dict: dict, dictionary of features computed from the references
+     """
+     try:
+         n_reference = len(article["references"])
+         n_unique_journals = len(
+             pd.unique([a["journal"] for a in article["references"]])
+         )
+         reference_years = []
+         for reference in article["references"]:
+             year = reference["year"]
+             if year.isdigit():
+                 # filter outliers
+                 if int(year) in range(1800, 2100):
+                     reference_years.append(int(year))
+         avg_ref_year = np.mean(reference_years)
+         median_ref_year = np.median(reference_years)
+         min_ref_year = np.min(reference_years)
+         max_ref_year = np.max(reference_years)
+         journal_features_dict = {
+             "n_reference": n_reference,
+             "n_unique_journals": n_unique_journals,
+             "avg_ref_year": avg_ref_year,
+             "median_ref_year": median_ref_year,
+             "min_ref_year": min_ref_year,
+             "max_ref_year": max_ref_year,
+         }
+     except:
+         journal_features_dict = {
+             "n_reference": None,
+             "n_unique_journals": None,
+             "avg_ref_year": None,
+             "median_ref_year": None,
+             "min_ref_year": None,
+             "max_ref_year": None,
+         }
+     return journal_features_dict
+
+
+ def merge_section_list(section_list, section_maps=SECTIONS_MAPS, section_start=""):
+     """
+     Merge a list of sections into a normalized list of sections;
+     you can get the list of sections from the parsed article JSON in ``parse_pdf.py`` e.g.
+
+     >> section_list = [s['heading'] for s in article_json['sections']]
+     >> section_list_merged = merge_section_list(section_list)
+
+     Parameters
+     ==========
+     section_list: list, list of sections
+
+     Output
+     ======
+     section_list_merged: list, normalized sections
+     """
+     sect_map = section_start  # text for starting section e.g. ``Introduction``
+     section_list_merged = []
+     for section in section_list:
+         if any([(s.lower() in section.lower()) for s in section_maps.keys()]):
+             sect = [s for s in section_maps.keys() if s.lower() in section.lower()][0]
+             sect_map = section_maps.get(sect, "")
+             section_list_merged.append(sect_map)
+         else:
+             section_list_merged.append(sect_map)
+     return section_list_merged
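
A minimal usage sketch (not part of the commit) for the helpers above; the import path assumes ``src`` is on ``PYTHONPATH`` and the abstract string and headings are placeholders.

import spacy
from utils.scipdf.features.text_utils import (
    compute_readability_stats,
    compute_text_stats,
    merge_section_list,
)

abstract = "We propose a method for extracting structured text from scientific PDFs."
readability = compute_readability_stats(abstract)   # plain string in, dict of textstat scores out
doc = spacy.load("en_core_web_sm")(abstract)        # compute_text_stats expects a spaCy Doc
text_stats = compute_text_stats(doc)

headings = ["I. INTRODUCTION", "2. MATERIALS AND METHODS", "RESULTS", "Unlabeled heading"]
print(merge_section_list(headings))
# ['Introduction', 'Methods', 'Results', 'Results']  # unmatched headings inherit the previous match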
src/utils/scipdf/pdf/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .parse_pdf import *
+
+ __all__ = [
+     "list_pdf_paths",
+     "parse_abstract",
+     "parse_figure_caption",
+     "parse_references",
+     "parse_pdf_to_dict",
+ ]
src/utils/scipdf/pdf/parse_pdf.py ADDED
@@ -0,0 +1,471 @@
+ import re
+ import os
+ import os.path as op
+ from glob import glob
+ import urllib
+ from typing import Dict
+ import subprocess
+ import requests
+ import fitz
+ from PIL import Image
+ import io
+ from bs4 import BeautifulSoup, NavigableString
+ from tqdm import tqdm, tqdm_notebook
+
+
+ # GROBID_URL = "http://localhost:8070"
+ GROBID_URL = "http://10.82.77.107:8070"
+ DIR_PATH = op.dirname(op.abspath(__file__))
+ PDF_FIGURES_JAR_PATH = op.join(
+     DIR_PATH, "pdffigures2", "pdffigures2-assembly-0.0.12-SNAPSHOT.jar"
+ )
+
+
+ def list_pdf_paths(pdf_folder: str):
+     """
+     List the PDF paths in a PDF folder
+     """
+     return glob(op.join(pdf_folder, "*", "*", "*.pdf"))
+
+
+ def validate_url(path: str):
+     """
+     Validate whether a given ``path`` is a URL or not
+     """
+     regex = re.compile(
+         r"^(?:http|ftp)s?://"  # http:// or https://
+         r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
+         r"localhost|"  # localhost...
+         r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
+         r"(?::\d+)?"  # optional port
+         r"(?:/?|[/?]\S+)$",
+         re.IGNORECASE,
+     )
+     return re.match(regex, path) is not None
+
+
+ def parse_pdf(
+     pdf_path: str,
+     fulltext: bool = True,
+     soup: bool = False,
+     return_coordinates: bool = False,
+     grobid_url: str = GROBID_URL,
+ ):
+     """
+     Function to parse a PDF to XML or BeautifulSoup using the GROBID tool
+
+     See http://grobid.readthedocs.io/en/latest/Install-Grobid/ for how to run GROBID locally
+     After downloading the GROBID zip file, you can run GROBID with the following
+     >> ./gradlew run
+
+     Parameters
+     ==========
+     pdf_path: str or bytes, path or URL to publication or article, or bytes string of PDF
+     fulltext: bool, option for parsing, if True, parse full text of the article
+         if False, parse only the header
+     grobid_url: str, url to GROBID parser, default at 'http://localhost:8070'
+         This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service
+     soup: bool, if True, return BeautifulSoup of the article
+
+     Output
+     ======
+     parsed_article: if soup is False, return parsed XML in text format,
+         else return BeautifulSoup of the XML
+     Example
+     =======
+     >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True)
+     """
+     # GROBID URL
+     if fulltext:
+         url = "%s/api/processFulltextDocument" % grobid_url
+     else:
+         url = "%s/api/processHeaderDocument" % grobid_url
+
+     files = []
+     if return_coordinates:
+         files += [
+             ("teiCoordinates", (None, "persName")),
+             ("teiCoordinates", (None, "figure")),
+             ("teiCoordinates", (None, "ref")),
+             ("teiCoordinates", (None, "formula")),
+             ("teiCoordinates", (None, "biblStruct")),
+         ]
+
+     if isinstance(pdf_path, str):
+         if validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() != ".pdf":
+             print("The input URL has to end with ``.pdf``")
+             parsed_article = None
+         elif validate_url(pdf_path) and op.splitext(pdf_path)[-1] == ".pdf":
+             page = urllib.request.urlopen(pdf_path).read()
+             files += [("input", page)]
+             parsed_article = requests.post(url, files=files).text
+         elif op.exists(pdf_path):
+             files += [("input", (open(pdf_path, "rb")))]
+             parsed_article = requests.post(
+                 url, files=files
+             ).text
+         else:
+             parsed_article = None
+     elif isinstance(pdf_path, bytes):
+         # assume that the incoming input is a byte string
+         files += [("input", (pdf_path))]
+         parsed_article = requests.post(url, files=files).text
+     else:
+         parsed_article = None
+
+     if soup and parsed_article is not None:
+         parsed_article = BeautifulSoup(parsed_article, "lxml")
+
+     return parsed_article
+
+
+ def parse_authors(article):
+     """
+     Parse authors from a given BeautifulSoup of an article
+     """
+     authors = []
+     try:
+         author_names = article.find("sourcedesc").findAll("persname")
+     except Exception:
+         return authors
+     for author in author_names:
+         firstname = author.find("forename", {"type": "first"})
+         firstname = firstname.text.strip() if firstname is not None else ""
+         middlename = author.find("forename", {"type": "middle"})
+         middlename = middlename.text.strip() if middlename is not None else ""
+         lastname = author.find("surname")
+         lastname = lastname.text.strip() if lastname is not None else ""
+         if middlename != "":
+             authors.append(firstname + " " + middlename + " " + lastname)
+         else:
+             authors.append(firstname + " " + lastname)
+     authors = "; ".join(authors)
+     return authors
+
+
+ def parse_date(article):
+     """
+     Parse the publication date from a given BeautifulSoup of an article
+     """
+     try:
+         pub_date = article.find("publicationstmt")
+         year = pub_date.find("date")
+     except Exception:
+         return ""
+     year = year.attrs.get("when") if year is not None else ""
+     return year
+
+
+ def parse_abstract(article):
+     """
+     Parse the abstract from a given BeautifulSoup of an article
+     """
+     div = article.find("abstract")
+     abstract = ""
+     for p in list(div.children):
+         if not isinstance(p, NavigableString) and len(list(p)) > 0:
+             abstract += " ".join(
+                 [elem.text for elem in p if not isinstance(elem, NavigableString)]
+             )
+     return abstract
+
+
+ def find_references(div):
+     """
+     For a given section, find references made in the section to publications, figures, and tables
+     """
+     publication_ref = [ref.attrs.get("target").strip("#") for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr" and "target" in ref.attrs]
+     figure_ref = [ref.attrs.get("target").strip("#") for ref in div.find_all("ref") if ref.attrs.get("type") == "figure" and "target" in ref.attrs]
+     table_ref = [ref.attrs.get("target").strip("#") for ref in div.find_all("ref") if ref.attrs.get("type") == "table" and "target" in ref.attrs]
+     return {"publication_ref": publication_ref, "figure_ref": figure_ref, "table_ref": table_ref}
+
+
+ def parse_sections(article, as_list: bool = False):
+     """
+     Parse the list of sections from a given BeautifulSoup of an article
+
+     Parameters
+     ==========
+     as_list: bool, if True, output text as a list of paragraphs instead
+         of joining it together as one single text
+     """
+     article_text = article.find("text")
+     divs = article_text.find_all("div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"})
+     sections = []
+     for div in divs:
+         div_list = list(div.children)
+         if len(div_list) == 0:
+             heading = ""
+             text = ""
+         elif len(div_list) == 1:
+             if isinstance(div_list[0], NavigableString):
+                 heading = str(div_list[0])
+                 text = ""
+             else:
+                 heading = ""
+                 text = div_list[0].text
+         else:
+             text = []
+             heading = div_list[0]
+             if isinstance(heading, NavigableString):
+                 heading = str(heading)
+                 p_all = list(div.children)[1:]
+             else:
+                 heading = ""
+                 p_all = list(div.children)
+             for p in p_all:
+                 if p is not None:
+                     try:
+                         text.append(p.text)
+                     except:
+                         pass
+             if not as_list:
+                 text = "\n".join(text)
+
+         if heading != "" or text != "":
+             ref_dict = find_references(div)
+             sections.append(
+                 {
+                     "heading": heading,
+                     "text": text,
+                     "publication_ref": ref_dict["publication_ref"],
+                     "figure_ref": ref_dict["figure_ref"],
+                     "table_ref": ref_dict["table_ref"],
+                 }
+             )
+     return sections
+
+
+ def parse_references(article):
+     """
+     Parse the list of references from a given BeautifulSoup of an article
+     """
+     reference_list = []
+     references = article.find("text").find("div", attrs={"type": "references"})
+     references = references.find_all("biblstruct") if references is not None else []
+     reference_list = []
+     for reference in references:
+         ref_id = reference.get('xml:id', "")
+         title = reference.find("title", attrs={"level": "a"})
+         if title is None:
+             title = reference.find("title", attrs={"level": "m"})
+         title = title.text if title is not None else ""
+         journal = reference.find("title", attrs={"level": "j"})
+         journal = journal.text if journal is not None else ""
+         if journal == "":
+             journal = reference.find("publisher")
+             journal = journal.text if journal is not None else ""
+         year = reference.find("date")
+         year = year.attrs.get("when") if year is not None else ""
+         authors = []
+         for author in reference.find_all("author"):
+             firstname = author.find("forename", {"type": "first"})
+             firstname = firstname.text.strip() if firstname is not None else ""
+             middlename = author.find("forename", {"type": "middle"})
+             middlename = middlename.text.strip() if middlename is not None else ""
+             lastname = author.find("surname")
+             lastname = lastname.text.strip() if lastname is not None else ""
+             if middlename != "":
+                 authors.append(firstname + " " + middlename + " " + lastname)
+             else:
+                 authors.append(firstname + " " + lastname)
+         authors = "; ".join(authors)
+         reference_list.append(
+             {"ref_id": ref_id, "title": title, "journal": journal, "year": year, "authors": authors}
+         )
+     return reference_list
+
+
+ def parse_figure_caption(article):
+     """
+     Parse the list of figures/tables from a given BeautifulSoup of an article
+     """
+     figures_list = []
+     figures = article.find_all("figure")
+     for figure in figures:
+         figure_type = figure.attrs.get("type") or "figure"
+         figure_id = figure.attrs.get("xml:id") or ""
+         label = figure.find("label").text
+         if figure_type == "table":
+             caption = figure.find("figdesc").text
+             data = figure.table.text
+         else:
+             caption = figure.text
+             data = ""
+         figures_list.append(
+             {
+                 "figure_label": label,
+                 "figure_type": figure_type,
+                 "figure_id": figure_id,
+                 "figure_caption": caption,
+                 "figure_data": data,
+             }
+         )
+     return figures_list
+
+
+ def parse_formulas(article):
+     """
+     Parse the list of formulas from a given BeautifulSoup of an article
+     """
+     formulas_list = []
+     formulas = article.find_all("formula")
+     for formula in formulas:
+         formula_id = formula.attrs["xml:id"] or ""
+         formula_text = formula.text
+         formula_coordinates = formula.attrs.get("coords") or ""
+         if formula_coordinates != "":
+             formula_coordinates = [float(x) for x in formula_coordinates.split(",")]
+         formulas_list.append(
+             {
+                 "formula_id": formula_id,
+                 "formula_text": formula_text,
+                 "formula_coordinates": formula_coordinates,
+             }
+         )
+     return formulas_list
+
+
+ def convert_article_soup_to_dict(article, as_list: bool = False):
+     """
+     Function to convert a BeautifulSoup article to JSON format
+     similar to the output from https://github.com/allenai/science-parse/
+
+     Parameters
+     ==========
+     article: BeautifulSoup
+
+     Output
+     ======
+     article_json: dict, parsed dictionary of a given article in the following format
+         {
+             'title': ...,
+             'abstract': ...,
+             'sections': [
+                 {'heading': ..., 'text': ...},
+                 {'heading': ..., 'text': ...},
+                 ...
+             ],
+             'references': [
+                 {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
+                 {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
+                 ...
+             ],
+             'figures': [
+                 {'figure_label': ..., 'figure_type': ..., 'figure_id': ..., 'figure_caption': ..., 'figure_data': ...},
+                 ...
+             ]
+         }
+     """
+     article_dict = {}
+     if article is not None:
+         title = article.find("title", attrs={"type": "main"})
+         title = title.text.strip() if title is not None else ""
+
+         article_dict["title"] = title
+         article_dict["authors"] = parse_authors(article)
+         article_dict["pub_date"] = parse_date(article)
+         article_dict["abstract"] = parse_abstract(article)
+         article_dict["sections"] = parse_sections(article, as_list=as_list)
+         article_dict["references"] = parse_references(article)
+         article_dict["figures"] = parse_figure_caption(article)
+         article_dict["formulas"] = parse_formulas(article)
+
+         doi = article.find("idno", attrs={"type": "DOI"})
+         doi = doi.text if doi is not None else ""
+         article_dict["doi"] = doi
+
+         return article_dict
+     else:
+         return None
+
+
+ def parse_pdf_to_dict(
+     pdf_path: str,
+     fulltext: bool = True,
+     soup: bool = True,
+     as_list: bool = False,
+     return_coordinates: bool = True,
+     grobid_url: str = GROBID_URL,
+     parse_figures: bool = True,
+ ):
+     """
+     Parse the given PDF and return a dictionary of the parsed article
+
+     Parameters
+     ==========
+     pdf_path: str, path to publication or article
+     fulltext: bool, whether to extract the fulltext or not
+     soup: bool, whether to return BeautifulSoup or not
+     as_list: bool, whether to return the list of sections or not
+     grobid_url: str, url to the grobid server, default is `GROBID_URL`
+         This could be changed to "https://kermitt2-grobid.hf.space" for the cloud service
+
+     Output
+     ======
+     article_dict: dict, dictionary of an article
+     """
+     parsed_article = parse_pdf(
+         pdf_path,
+         fulltext=fulltext,
+         soup=soup,
+         return_coordinates=return_coordinates,
+         grobid_url=grobid_url,
+     )
+     article_dict = convert_article_soup_to_dict(parsed_article, as_list=as_list)
+
+     return article_dict
+
+
+ def parse_figures(
+     pdf_folder: str,
+     jar_path: str = PDF_FIGURES_JAR_PATH,
+     resolution: int = 300,
+     output_folder: str = "figures",
+ ):
+     """
+     Parse figures from the given scientific PDFs using pdffigures2
+
+     Parameters
+     ==========
+     pdf_folder: str, path to a folder that contains PDF files. The folder must contain only PDF files
+     jar_path: str, default path to the pdffigures2-assembly-0.0.12-SNAPSHOT.jar file
+     resolution: int, resolution of the output figures
+     output_folder: str, path to the folder where we want to save the parsed data (related to figures) and the figures
+
+     Output
+     ======
+     folder: creates output_folder/data and output_folder/figures containing the parsed data and the figures, respectively
+     """
+     if not op.isdir(output_folder):
+         os.makedirs(output_folder)
+
+     # create ``data`` and ``figures`` subfolders within ``output_folder``
+     data_path = op.join(output_folder, "data")
+     figure_path = op.join(output_folder, "figures")
+     if not op.exists(data_path):
+         os.makedirs(data_path)
+     if not op.exists(figure_path):
+         os.makedirs(figure_path)
+
+     if op.isdir(data_path) and op.isdir(figure_path):
+         args = [
+             "java",
+             "-jar",
+             jar_path,
+             pdf_folder,
+             "-i",
+             str(resolution),
+             "-d",
+             op.join(op.abspath(data_path), ""),
+             "-m",
+             op.join(op.abspath(figure_path), ""),  # end path with "/"
+         ]
+         _ = subprocess.run(
+             args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20
+         )
+         print("Done parsing figures from PDFs!")
+     else:
+         print(
+             "You may have to check the ``data`` and ``figures`` folders in the output folder path."
+         )
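
Finally, a minimal usage sketch (not part of the commit) for the GROBID-based parser above; it assumes a GROBID server is reachable at the given URL, that ``src`` is on ``PYTHONPATH``, and that ``example.pdf`` exists locally.

from utils.scipdf.pdf.parse_pdf import parse_pdf_to_dict

article = parse_pdf_to_dict(
    "example.pdf",
    grobid_url="http://localhost:8070",  # override the hard-coded default above
)
if article is not None:
    print(article["title"])
    print(len(article["sections"]), "sections,", len(article["references"]), "references")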